def profile_check_make(self, domain):  # smart profile management for the scraper
    try:
        # Make the profile folder tree for this domain
        os.makedirs(self.domainDataFolder)
        print('LOG: created path >>', self.domainDataFolder)
    except FileExistsError:
        print(f"WARNING: profile already exists for >> {self.domain}")
    finally:
        mx.touch(self.domainExploreFile)  # create the file if it does not exist
        mx.touch(self.domainVisitFile)    # create the file if it does not exist
def profile_check_make(self, domain):  # smart profile management for the scraper
    try:
        # Make the profile folder tree for this domain, one level at a time
        if not os.path.exists(self.profileFolder):
            os.mkdir(self.profileFolder)
        if not os.path.exists(self.currentDomainPath):
            os.mkdir(self.currentDomainPath)
        if not os.path.exists(self.domainDataFolder):
            os.mkdir(self.domainDataFolder)
    except FileExistsError:
        print(self.domain, "profile already exists")
    finally:
        mx.touch(self.domainExploreFile)  # create the file if it does not exist
        mx.touch(self.domainVisitFile)    # create the file if it does not exist
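# Aside: both variants above can collapse into one idempotent call. A minimal
# sketch, assuming the same path attributes and the project's mx.touch helper;
# os.makedirs(exist_ok=True) creates every missing parent directory and never
# raises on an existing one, so the try/except/finally scaffolding goes away.
import os
from mxproxy import mx

def profile_check_make(self, domain):
    os.makedirs(self.domainDataFolder, exist_ok=True)  # idempotent mkdir -p
    mx.touch(self.domainExploreFile)
    mx.touch(self.domainVisitFile)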
import os

from mxproxy import mx


def get_body(url, bodyclass=''):
    if bodyclass:
        return mx.get_page_soup(url).find(class_=bodyclass)
    print('define bodyclass as a parameter of this function')


if __name__ == '__main__':
    SUBJECTFOLDER = 'SEM-5/DIP/'
    url = 'https://www.sanfoundry.com/1000-digital-image-processing-questions-answers/'
    page = mx.get_page_soup(url)
    links = {x['href'] for x in page.select('.entry-content a')}

    mx.touch(SUBJECTFOLDER + 'visited.url')  # create the log on first run
    visited = mx.setload(SUBJECTFOLDER + 'visited.url')
    pendinglinks = links - visited           # skip pages already scraped

    for x in pendinglinks:
        text = mx.get_page_soup(x).select_one('.entry-content').text
        mx.fappend(SUBJECTFOLDER + 'BigData.txt', text)
        mx.fappend(SUBJECTFOLDER + 'visited.url', x)
        # print(text)
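# For readers without the mxproxy module: the same resumable-scrape pattern in
# plain requests + BeautifulSoup. This is a sketch under the assumption that
# mx.get_page_soup wraps requests + BeautifulSoup and that mx.setload and
# mx.fappend are a line-set loader and a line appender.
import requests
from bs4 import BeautifulSoup

def get_soup(url):
    # Stand-in for mx.get_page_soup: fetch and parse one page
    return BeautifulSoup(requests.get(url, timeout=30).text, 'html.parser')

def load_set(path):
    # Stand-in for mx.setload: one entry per line, missing file -> empty set
    try:
        with open(path) as f:
            return {line.strip() for line in f if line.strip()}
    except FileNotFoundError:
        return set()

def append_line(path, text):
    # Stand-in for mx.fappend
    with open(path, 'a') as f:
        f.write(text + '\n')

index = get_soup('https://www.sanfoundry.com/1000-digital-image-processing-questions-answers/')
links = {a['href'] for a in index.select('.entry-content a')}
visited = load_set('visited.url')
for link in links - visited:          # set difference = only unscraped pages
    body = get_soup(link).select_one('.entry-content').text
    append_line('BigData.txt', body)
    append_line('visited.url', link)  # record progress so a rerun resumes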
import multiprocessing as mp
import os
import re
import time

from mxproxy import mx

Cache = mx.Cache

# PATHS -------------------------------------
basedict_path = './Dictionary/dictionary.json';      mx.touch(basedict_path)
lexico_index_path = './Dictionary/lexico_index.set'; mx.touch(lexico_index_path)
lexico_dict_path = './Dictionary/lexico_dict.jsonl'; mx.touch(lexico_dict_path)

# NLP ---------------------------------------
def get_lemma(WORD):
    from spacy import load as spacy_load
    # Build the spaCy pipeline lazily, once, and keep it on the shared Cache
    if not hasattr(Cache, 'nlpengine'):
        Cache.nlpengine = spacy_load('en_core_web_sm',
                                     disable=['parser', 'ner', 'tagger', 'textcat'])
    w = Cache.nlpengine(WORD)[0]
    return w.lemma_.lower() if w.lemma_ != '-PRON-' else w.lower_

# CACHE LAYERS ------------------------------
Cache.base_dict = mx.jload(basedict_path)
Cache.lexico_index = mx.setload(lexico_index_path)
Cache.lexico_dict = mx.jloadlines(lexico_dict_path)

# CORE --------------------------------------
def get_definition(WORD):
    base_dict = Cache.base_dict
    defn = base_dict.get(WORD) or ''          # '' when the word is missing
    finds = re.findall(r'\d.*?;', defn)       # numbered, ;-terminated senses
    if finds:
        ...  # remainder of this function is not included in this excerpt
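# To make the sense-splitting regex concrete: assuming dictionary entries store
# numbered, semicolon-terminated senses (a format inferred from the pattern
# itself), re.findall pulls out each numbered sense. The sample entry below is
# illustrative, not taken from the real dictionary.json.
import re

defn = '1. a unit of meaning; 2. a solemn promise; 3. news or information;'
print(re.findall(r'\d.*?;', defn))
# -> ['1. a unit of meaning;', '2. a solemn promise;', '3. news or information;']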
from mxproxy import mx

Cache = mx.Cache

# INITIALIZATION ______________________
lexico_syn_path = './Dictionary/lexico_syn.jsonl'
mx.touch(lexico_syn_path)
lexico_syn_index_path = './Dictionary/lexico_syn_index.set'
mx.touch(lexico_syn_index_path)
Cache.lexico_syn_index = mx.setload(lexico_syn_index_path)
Cache.lexico_syn = mx.jloadlines(lexico_syn_path)
# -------------------------------------


class Helpers:
    @staticmethod
    def get_lemma(WORD):
        from spacy import load as spacy_load
        # Build the spaCy pipeline lazily, once, and keep it on the shared Cache
        if not hasattr(Cache, 'nlpengine'):
            Cache.nlpengine = spacy_load(
                'en_core_web_sm',
                disable=['parser', 'ner', 'tagger', 'textcat'])
        w = Cache.nlpengine(WORD)[0]
        return w.lemma_.lower() if w.lemma_ != '-PRON-' else w.lower_

    @staticmethod
    def db_sync(WORD, finalresult):
        # Record the word in the index so it is not fetched again next run
        if WORD not in Cache.lexico_syn_index:
            mx.fappend(lexico_syn_index_path, WORD)
            print(f"adding '{WORD}' to Thesaurus")
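# db_sync shows the storage scheme used throughout this project: a .set file as
# a fast membership index plus a .jsonl file holding one JSON record per line.
# A self-contained sketch of that pattern without the mx helpers (file names
# here are illustrative):
import json

INDEX_PATH = 'syn_index.set'   # one word per line; loaded into a set
DATA_PATH = 'syn_data.jsonl'   # one JSON object per line

def load_index(path):
    try:
        with open(path) as f:
            return {line.strip() for line in f if line.strip()}
    except FileNotFoundError:
        return set()

index = load_index(INDEX_PATH)

def db_sync(word, result):
    # Append-only writes: cheap, crash-tolerant, and trivial to reload
    if word not in index:
        with open(INDEX_PATH, 'a') as f:
            f.write(word + '\n')
        with open(DATA_PATH, 'a') as f:
            f.write(json.dumps({word: result}) + '\n')
        index.add(word)

db_sync('happy', ['glad', 'joyful', 'cheerful'])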