def __init__(self, url):
    """Set up a scraper profile for the site that *url* belongs to.

    Derives the base URL and domain from *url*, builds the on-disk profile
    paths for that domain, loads the explored/visited URL collections from
    those files and finally visits *url* as the initial seed.

    Args:
        url: Seed URL. It is split on '/' and the third piece is used as
            the domain, so it is expected to look like
            ``scheme://host/...`` (a URL with fewer than three pieces
            raises IndexError).
    """
    # SEED VARIABLES
    pieces = url.split('/')
    self.url = url
    self.baseurl = '/'.join(pieces[0:3])  # scheme + '//' + host
    self.domain = pieces[2]
    # Pattern for URLs that live under the same base URL.
    self.regex = self.baseurl + r'/.*'
    print(self.regex)

    # INIT AND SETUP VARIABLES
    self.programName = 'hyperScraper'
    self.profileFolder = self.programName + "_profiles"
    self.currentDomainPath = './' + self.profileFolder + '/' + self.domain + '/'
    self.domainDataFolder = self.currentDomainPath + "data/"
    self.domainExploreFile = self.currentDomainPath + 'explored.url'
    self.domainVisitFile = self.currentDomainPath + 'visited.url'
    # Make a profile for the domain before any of its files are loaded.
    self.profile_check_make(self.domain)

    # RUNTIME VARIABLES
    explored = mx.setload(self.domainExploreFile)
    visited = mx.setload(self.domainVisitFile)
    self.exploredURLMemory = explored
    self.visitedURLMemory = visited
    # Explored but not yet visited.
    # NOTE(review): relies on mx.setload returning set-like objects that
    # support '-' -- confirm against mxproxy.
    self.pendingURLMemory = list(explored - visited)

    # INITIAL SEED: visit the seed url and build links.
    self.visit(url)
def __init__(self, url, myregex=r'/.*'):
    """Set up a scraper profile for the site that *url* belongs to.

    Derives domain information from *url*, builds the on-disk profile paths
    for that domain, loads the explored/visited URL collections from those
    files and then visits *url* as the initial seed.

    Args:
        url: Seed URL. It is split on '/' and the third piece is used as
            the domain, e.g. ``https://www.asdasd.com/...``.
        myregex: Pattern suffix appended to the base URL to form
            ``self.regex``. Defaults to ``r'/.*'``.

    Raises:
        Exception: whatever ``self.visit(url)`` raises for the seed URL is
            logged and then re-raised.
    """
    # SEED VARIABLES
    url_parts = url.split('/')
    self.url = url  # same as input while initializing
    self.domain = url_parts[2]  # == www.asdasd.com
    # Everything after the first dot-separated label of the domain.
    domain_labels = self.domain.split('.')
    self.topLevelDomain = ".".join(domain_labels[1:])
    self.baseurl = '/'.join(url_parts[:3])  # == https://www.asdasd.com
    self.regex = self.baseurl + myregex
    print("LOG: regex url matching pattern is >>", self.regex)

    # INIT AND SETUP VARIABLES
    self.programName = 'hyperScraper'
    self.profileFolder = self.programName + "_profiles/"
    self.currentDomainPath = f'{self.profileFolder}{self.domain}/'
    self.domainDataFolder = self.currentDomainPath + "data/"
    self.domainExploreFile = self.currentDomainPath + 'explored.url'
    self.domainVisitFile = self.currentDomainPath + 'visited.url'
    # Make a profile for the domain before any of its files are loaded.
    self.profile_check_make(self.domain)

    # INIT RUNTIME VARIABLES
    explored = mx.setload(self.domainExploreFile)
    visited = mx.setload(self.domainVisitFile)
    self.exploredURLMemory = explored
    self.visitedURLMemory = visited
    # Explored but not yet visited.
    # NOTE(review): relies on mx.setload returning set-like objects that
    # support '-' -- confirm against mxproxy.
    self.pendingURLMemory = list(explored - visited)

    # INITIAL SEED: visit the seed url and build links.
    try:
        self.visit(url)
    except Exception as e:
        print(
            "ERROR: initial Seed encountered serious error change to {raise e} in this line to track it"
        )
        raise e
lexico_index_path='./Dictionary/lexico_index.set'; mx.touch(lexico_index_path) lexico_dict_path='./Dictionary/lexico_dict.jsonl'; mx.touch(lexico_dict_path) #NLP--------------------------------------- def get_lemma(WORD): from spacy import load as spacy_load try: Cache.nlpengine except: Cache.nlpengine = spacy_load('en_core_web_sm',disable=['parser','ner', 'tagger', 'textcat']) w=Cache.nlpengine(WORD)[0] return w.lemma_.lower() if w.lemma_ != '-PRON-' else w.lower_ #CACHE LAYERS------------------------------ Cache.base_dict=mx.jload(basedict_path) Cache.lexico_index=mx.setload(lexico_index_path) Cache.lexico_dict=mx.jloadlines(lexico_dict_path) #CORE-------------------------------------- def get_definition(WORD): base_dict=Cache.base_dict defn=base_dict.get(WORD) if base_dict.get(WORD) else '' finds=re.findall(r'[\d].*?;',defn) if finds: return finds else: return [defn] #-------------------------------------- def lexico_fetch(WORD): #lower Level WORD=get_lemma(WORD) while True:
def seed_dict_with_words():
    """Seed the dictionary with the standard most common words.

    Loads the entries of './Dictionary/3000common.set', lemmatizes each one
    (lower-cased first) via POOL, then passes the resulting set of lemmas to
    add_words_to_dictionary.
    """
    common_words = mx.setload('./Dictionary/3000common.set')

    # Submit every word first so the pool can work on them concurrently.
    pending = set()
    for raw_word in common_words:
        pending.add(POOL.apply_async(get_lemma, (raw_word.lower(),)))

    # Then collect the results; duplicates collapse in the set.
    lemmas = set()
    for job in pending:
        lemmas.add(job.get())

    add_words_to_dictionary(lemmas)
import os
import re

from mxproxy import mx


def get_body(url, bodyclass=''):
    """Fetch *url* and return the first element with class *bodyclass*.

    Args:
        url: Address of the page to fetch with ``mx.get_page``.
        bodyclass: CSS class name passed to ``.find(class_=...)`` on the
            fetched page.

    Returns:
        Whatever ``.find`` returns for that class, or None (after printing
        a hint) when *bodyclass* is empty.
    """
    if not bodyclass:
        print('define bodyclass as parameter of this function')
        return None
    return mx.get_page(url).find(class_=bodyclass)


if __name__ == '__main__':
    # Append the text of every not-yet-visited page linked from the index
    # page to one big text file, recording each processed link in
    # visited.url.
    SUBJECTFOLDER = 'SEM-5/DIP/'
    visited_file = SUBJECTFOLDER + 'visited.url'
    # Derived from SUBJECTFOLDER like the other paths (same resulting path
    # as the previously hard-coded 'SEM-5/DIP/BigData.txt').
    bigdata_file = SUBJECTFOLDER + 'BigData.txt'

    url = 'https://www.sanfoundry.com/1000-digital-image-processing-questions-answers/'
    page = mx.get_page_soup(url)

    # Anchors without a usable href are skipped; indexing ['href'] on them
    # would raise KeyError and abort the run.
    links = set()
    for anchor in page.select('.entry-content a'):
        href = anchor.get('href')
        if href:
            links.add(href)

    mx.touch(visited_file)
    visited = mx.setload(visited_file)
    pendinglinks = links - visited

    for x in pendinglinks:
        content = mx.get_page_soup(x).select_one('.entry-content')
        if content is None:
            # Page has no '.entry-content' element: report it and move on
            # instead of crashing on `.text`. It is not marked as visited.
            print('no .entry-content found, skipping:', x)
            continue
        mx.fappend(bigdata_file, content.text)
        mx.fappend(visited_file, x)