def update_exploredURLMemory(self, newLinks):
    difference = newLinks - self.exploredURLMemory  # save data IO by writing only the difference
    if difference:
        mx.fappend(self.domainExploreFile, "\n".join(difference) + '\n')
    self.exploredURLMemory.update(newLinks)  # add the just-discovered links
    if Logging.showDiscoverCount:
        print(f"Discovered {len(difference)} URLs")
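# A minimal sketch of how the memory sets used above could be primed when a
# worker starts. The attribute and file names match the methods in this
# section; treating both files as newline-delimited URL lists is an
# assumption inferred from how fappend/fread are used here.
def load_url_memory(self):  # hypothetical helper, not in the original code
    self.exploredURLMemory = set(mx.fread(self.domainExploreFile).split('\n'))
    self.visitedURLMemory = set(mx.fread(self.domainVisitFile).split('\n'))
    self.exploredURLMemory.discard('')  # drop blanks left by leading "\n" appends
    self.visitedURLMemory.discard('')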
def update_visitedURLMemory(self, current_visited_url):
    if Logging.showCurrentVisit:
        print(f'WORKER VISIT >> {current_visited_url}')
    if current_visited_url not in self.visitedURLMemory:
        mx.fappend(self.domainVisitFile, "\n" + current_visited_url)
        self.visitedURLMemory.add(current_visited_url)  # never use set.update() for only 1 item
def db_sync(WORD, finalresult):
    if WORD not in Cache.lexico_syn_index:
        mx.fappend(lexico_syn_index_path, WORD)
        print(f"adding '{WORD}' to Thesaurus")
        mx.fappend(lexico_syn_path, mx.json.dumps(finalresult))  # dump the new word
        Cache.lexico_syn_index.add(WORD)
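# mxproxy is not shown here; the snippets above rely on only a few small file
# helpers. The class below is a plausible minimal stand-in written from how
# the calls are used in this section (an assumption, not the real module).
import json


class mx:
    json = json  # mx.json.dumps(...) is used above

    @staticmethod
    def fappend(path, text):
        # append raw text to a file, creating it if needed
        with open(path, 'a', encoding='utf-8') as f:
            f.write(text)

    @staticmethod
    def fread(path):
        # return the whole file as one string
        with open(path, encoding='utf-8') as f:
            return f.read()

    @staticmethod
    def touch(path):
        # ensure the file exists without truncating it
        open(path, 'a', encoding='utf-8').close()

    @staticmethod
    def setload(path):
        # load a newline-delimited file into a set, dropping blank lines
        return {line for line in mx.fread(path).split('\n') if line}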
def update_exploredURLMemory(self, nowDiscovered):
    # reload the on-disk record so the in-memory set stays in sync
    self.exploredURLMemory = set(mx.fread(self.domainExploreFile).split('\n'))
    difference = list(set(nowDiscovered).difference(self.exploredURLMemory))
    self.exploredURLMemory.update(difference)
    if difference:
        mx.fappend(self.domainExploreFile, "\n".join(difference) + '\n')
    if Logging.showDiscoverCount:
        print(f"added {len(difference)} URL records\n")
def visit(self, url):
    # visit a page and gather info
    try:
        page_req = mx.get_page(url)  # requests.get response object
        if page_req.status_code != 200:  # log and bail out on a page error
            mx.fappend(self.currentDomainPath + 'errors.url',
                       f'\nPAGE ERROR : {page_req.status_code} {url}')
            return
        page_text = page_req.text  # page text cache
        Pool.apply(Processor.save_page_info, (self, page_text, url))  # async processing via the Processor module
        soup = mx.soup(page_text, features='html.parser')
        self.update_exploredURLMemory(self.get_all_links(soup))  # 1
        self.update_visitedURLMemory(url)                        # 2
        self.update_pendingURLMemory()                           # 3  don't change this order
        Memory.visitCount += 1
        if Logging.showVisitCount:
            print(f'SESSION VISITS | {Memory.visitCount}')
        if Logging.showUrlPerSecond:
            Logging.calculate_url_second()
    except Exception as e:
        print('ERROR =', url, e)
        raise e  # | DEPTH 2
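# visit() calls self.get_all_links(soup), which is not shown in this section.
# A minimal sketch of what it likely does, assuming BeautifulSoup markup and
# that relative hrefs should be resolved to absolute URLs; the
# self.currentDomain attribute is hypothetical.
from urllib.parse import urljoin


def get_all_links(self, soup):
    links = set()
    for a in soup.find_all('a', href=True):
        absolute = urljoin(self.currentDomain, a['href'])  # hypothetical attribute
        if absolute.startswith('http'):
            links.add(absolute.split('#')[0])  # drop fragments to avoid duplicate visits
    return links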
def update_visitedURLMemory(self, current_visited_url):
    if current_visited_url not in self.visitedURLMemory:
        mx.fappend(self.domainVisitFile, "\n" + current_visited_url)
        self.visitedURLMemory.add(current_visited_url)
def update_visitedURLMemory(self, current_visited_url):
    if current_visited_url not in self.visitedURLMemory:
        mx.fappend(self.domainVisitFile, "\n" + current_visited_url)
        self.visitedURLMemory.add(current_visited_url)  # never use set.update() for only 1 item
def update_lexico_db_and_index(WORD, finalresult):  # medium level
    if WORD not in Cache.lexico_index:
        mx.fappend(lexico_index_path, WORD)
        print(f'adding ({WORD}) to lexico_index')
        mx.fappend(lexico_dict_path, mx.json.dumps(finalresult))  # dump the new word
        Cache.lexico_index.add(WORD)  # keep the in-memory index in sync
import os
import re

from mxproxy import mx


def get_body(url, bodyclass=''):
    if bodyclass:
        return mx.get_page_soup(url).find(class_=bodyclass)  # get_page() returns a response, not soup
    else:
        print('pass bodyclass as a parameter to this function')


if __name__ == '__main__':
    SUBJECTFOLDER = 'SEM-5/DIP/'
    url = 'https://www.sanfoundry.com/1000-digital-image-processing-questions-answers/'
    page = mx.get_page_soup(url)
    links = {x['href'] for x in page.select('.entry-content a')}
    mx.touch(SUBJECTFOLDER + 'visited.url')
    visited = mx.setload(SUBJECTFOLDER + 'visited.url')
    pendinglinks = links - visited
    for x in pendinglinks:
        text = mx.get_page_soup(x).select_one('.entry-content').text
        mx.fappend(SUBJECTFOLDER + 'BigData.txt', text)
        mx.fappend(SUBJECTFOLDER + 'visited.url', x)
        # print(text)
def update_visitedURLMemory(self, nowVisited):  # update the file first, then rebuild memory from it
    Macros.homogenize(self, self.domainVisitFile)
    mx.fappend(self.domainVisitFile, "\n" + nowVisited)
    self.visitedURLMemory = mx.fread(self.domainVisitFile).split('\n')
def save_to_disk(self, label, data):
    mx.fappend(self.domainDataFolder + f'{label}.json', data)
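# Usage sketch (an assumed call site, not from the original code): because
# save_to_disk appends, repeated calls build a JSON-lines style log per label
# rather than one valid JSON document. 'worker' and the record fields are
# illustrative.
url, title = 'https://example.com/', 'Example'  # illustrative values
record = mx.json.dumps({'url': url, 'title': title}) + '\n'
worker.save_to_disk('pageinfo', record)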