def profile_check_make(self,domain): #smart profile management for scraper
	"""Ensure the on-disk profile folder for the current domain exists.

	Creates ``self.domainDataFolder`` (logging on success, warning when the
	profile is already there) and always guarantees the explore/visit
	tracking files exist afterwards.

	NOTE(review): the ``domain`` parameter is unused; the warning message
	reads ``self.domain`` instead — confirm callers set it before this runs.
	"""
	try: #make profile folder and create an domain
		os.makedirs(self.domainDataFolder)
		print('LOG: created path >>',self.domainDataFolder)
	except FileExistsError:
		# narrowed from a bare except: only "folder already exists" is the
		# expected failure here; real errors (permissions, bad path) now surface
		print(f"WARNING: profile already exist for >> {self.domain}")
	finally:
		mx.touch(self.domainExploreFile) #Create if file not exist
		mx.touch(self.domainVisitFile)
Beispiel #2
0
 def profile_check_make(self, domain):
     """Ensure the profile/domain/data directory tree and tracking files exist.

     Creates each missing level of the three-level profile tree (parent
     before child), then guarantees the explore/visit URL files exist.

     NOTE(review): the ``domain`` parameter is unused; all paths come from
     attributes on ``self`` — confirm they are set before this is called.
     """
     #smart profile management for scraper
     try:
         #make profile folder and create an domain — one mkdir per level,
         #parent first, skipping levels that already exist
         for path in (self.profileFolder, self.currentDomainPath,
                      self.domainDataFolder):
             if not os.path.exists(path):
                 os.mkdir(path)
     except OSError:
         # narrowed from a bare except: filesystem failures (including a
         # race where the dir appeared between the check and mkdir) are
         # reported; programming errors now propagate instead of hiding
         print(self.domain, "profile already exist")
     finally:
         mx.touch(self.domainExploreFile)  #Create if file not exist
         mx.touch(self.domainVisitFile)  #Create if file not exist
Beispiel #3
0
import os
from mxproxy import mx
import re


def get_body(url, bodyclass=''):
    """Fetch *url* and return the first element carrying class *bodyclass*.

    When *bodyclass* is empty, prints a usage hint and returns None
    instead of fetching anything.
    """
    # guard clause: without a class to search for, there is nothing to fetch
    if not bodyclass:
        print('define bodyclass as parameter of this function')
        return None
    page = mx.get_page(url)
    return page.find(class_=bodyclass)


if __name__ == '__main__':
    # Scrape every question page linked from the Sanfoundry DIP index into
    # one text file, remembering visited urls so reruns only fetch new pages.
    SUBJECTFOLDER = 'SEM-5/DIP/'
    url = 'https://www.sanfoundry.com/1000-digital-image-processing-questions-answers/'
    page = mx.get_page_soup(url)

    # all links inside the article body of the index page
    links = {x['href'] for x in page.select('.entry-content a')}

    # persistent record of already-scraped urls (created if missing)
    mx.touch(SUBJECTFOLDER + 'visited.url')
    visited = mx.setload(SUBJECTFOLDER + 'visited.url')

    pendinglinks = links - visited
    for x in pendinglinks:
        text = mx.get_page_soup(x).select_one('.entry-content').text
        # consistency fix: build the output path from SUBJECTFOLDER instead
        # of repeating the hard-coded 'SEM-5/DIP/' prefix (same resulting path)
        mx.fappend(SUBJECTFOLDER + 'BigData.txt', text)
        mx.fappend(SUBJECTFOLDER + 'visited.url', x)
Beispiel #4
0
from mxproxy import mx
import re,os,time
import multiprocessing as mp
Cache = mx.Cache

#PATHS-------------------------------------
# Backing files for the dictionary caches loaded below.
basedict_path = './Dictionary/dictionary.json'
lexico_index_path = './Dictionary/lexico_index.set'
lexico_dict_path = './Dictionary/lexico_dict.jsonl'

# Ensure each file exists before the cache layers try to read it.
mx.touch(basedict_path)
mx.touch(lexico_index_path)
mx.touch(lexico_dict_path)

#NLP---------------------------------------
def get_lemma(WORD):
	"""Return the lowercase lemma of the first token of WORD.

	Lazily builds a shared spaCy pipeline on first use and stores it on
	``Cache`` so subsequent calls skip the expensive model load.
	"""
	from spacy import load as spacy_load 
	# hasattr replaces the original bare try/except attribute probe:
	# only "engine not built yet" should trigger the load
	if not hasattr(Cache, 'nlpengine'):
		# one-time heavy load; parser/ner/tagger/textcat are not needed for lemmas
		Cache.nlpengine = spacy_load('en_core_web_sm',disable=['parser','ner', 'tagger', 'textcat'])
	w=Cache.nlpengine(WORD)[0]
	# '-PRON-' is spaCy's pronoun lemma sentinel — fall back to the raw lowercase form
	return w.lemma_.lower() if w.lemma_ != '-PRON-' else w.lower_

#CACHE LAYERS------------------------------
# Warm the process-wide Cache from the files touched above so lookups
# read from memory instead of disk.
# NOTE(review): mx.jload / mx.setload / mx.jloadlines appear to be
# json-, set-, and jsonl-file loaders — confirm against mxproxy.
Cache.base_dict=mx.jload(basedict_path)
Cache.lexico_index=mx.setload(lexico_index_path)
Cache.lexico_dict=mx.jloadlines(lexico_dict_path)

#CORE--------------------------------------
def get_definition(WORD):
	base_dict=Cache.base_dict
	defn=base_dict.get(WORD) if base_dict.get(WORD) else '' 
	finds=re.findall(r'[\d].*?;',defn)
	if finds:
Beispiel #5
0
from mxproxy import mx

Cache = mx.Cache

#INITIALIZATION_______________________
# Backing files for the synonym store: the jsonl payload and a .set
# index of the words already present.
lexico_syn_path = './Dictionary/lexico_syn.jsonl'
lexico_syn_index_path = './Dictionary/lexico_syn_index.set'

# Ensure both files exist, then warm the in-process caches from them.
mx.touch(lexico_syn_path)
mx.touch(lexico_syn_index_path)

Cache.lexico_syn_index = mx.setload(lexico_syn_index_path)
Cache.lexico_syn = mx.jloadlines(lexico_syn_path)


#-------------------------------------
class Helpers:
    @staticmethod  # no self/cls used; Helpers.get_lemma(word) calls are unchanged
    def get_lemma(WORD):
        """Return the lowercase lemma of the first token of WORD.

        Lazily builds a shared spaCy pipeline on first use and stores it
        on ``Cache`` so later calls skip the expensive model load.
        """
        from spacy import load as spacy_load
        # hasattr replaces the original bare try/except attribute probe:
        # only "engine not built yet" should trigger the load
        if not hasattr(Cache, 'nlpengine'):
            # one-time heavy load; parser/ner/tagger/textcat unneeded for lemmas
            Cache.nlpengine = spacy_load(
                'en_core_web_sm',
                disable=['parser', 'ner', 'tagger', 'textcat'])
        w = Cache.nlpengine(WORD)[0]
        # '-PRON-' is spaCy's pronoun lemma sentinel — fall back to raw lowercase
        return w.lemma_.lower() if w.lemma_ != '-PRON-' else w.lower_

    def db_sync(WORD, finalresult):
        if WORD not in Cache.lexico_syn_index:
            mx.fappend(lexico_syn_index_path, WORD)
            print(f'adding \'{WORD}\' to Thesaurus')