コード例 #1
0
 def update_exploredURLMemory(self, newLinks):
     """Record freshly discovered links in memory and in the explore file.

     Only links that are not already in ``self.exploredURLMemory`` are
     appended to ``self.domainExploreFile``, so a visit that discovers
     nothing new performs no file write. ``newLinks`` must support ``-``
     against the in-memory collection.
     """
     unseen = newLinks - self.exploredURLMemory
     if unseen:
         # One URL per line, terminated by a newline.
         payload = "\n".join(unseen) + '\n'
         mx.fappend(self.domainExploreFile, payload)
         self.exploredURLMemory.update(newLinks)
     if Logging.showDiscoverCount:
         print(f"Discovered {len(unseen)} URLS")
コード例 #2
0
ファイル: hyperScraper.py プロジェクト: shirish-01/DataCake
 def update_visitedURLMemory(self, current_visited_url):
     """Mark a URL as visited, writing it to the visit file only once.

     Optionally logs the visit, appends the URL to ``self.domainVisitFile``
     when it is not yet in ``self.visitedURLMemory``, and then adds it to
     that in-memory collection.
     """
     if Logging.showCurrentVisit:
         print(f'WORKER VISIT >> {current_visited_url}')
     first_time = current_visited_url not in self.visitedURLMemory
     if first_time:
         # Entries are written with a leading newline.
         mx.fappend(self.domainVisitFile, "\n" + current_visited_url)
     # add() stores the URL as a single item; the original author warns
     # against using set.update() for one item.
     self.visitedURLMemory.add(current_visited_url)
コード例 #3
0
 def db_sync(WORD, finalresult):
     """Persist a word that is not yet cached, then add it to the cache.

     When ``WORD`` is absent from ``Cache.lexico_syn_index`` it is appended
     to the index file and ``finalresult`` is appended, JSON-encoded, to the
     synonym file. ``WORD`` is added to the cached index in every case.
     """
     is_new_word = WORD not in Cache.lexico_syn_index
     if is_new_word:
         mx.fappend(lexico_syn_index_path, WORD)
         print(f'adding \'{WORD}\' to Thesaurus')
         serialized = mx.json.dumps(finalresult)
         mx.fappend(lexico_syn_path, serialized)
     Cache.lexico_syn_index.add(WORD)
コード例 #4
0
		def update_exploredURLMemory(self,nowDiscovered):
			"""Merge newly discovered URLs into the explore file and memory.

			The explore file is re-read on every call so the comparison is made
			against what is actually on disk. Only URLs missing from the file are
			appended. When nothing new was discovered no write is issued at all
			(previously a lone newline was appended on every such call).
			"""
			self.exploredURLMemory= set(mx.fread(self.domainExploreFile).split('\n'))
			difference= list(set(nowDiscovered).difference(self.exploredURLMemory))
			if difference:
				self.exploredURLMemory.update(difference)
				# Same bytes as before for a non-empty batch: one URL per line,
				# followed by the trailing separator element.
				mx.fappend(self.domainExploreFile, "\n".join(difference+['\n']))
			if Logging.showDiscoverCount:
				print(f"added {len(difference)} URL records\n")
コード例 #5
0
ファイル: hyperScraper.py プロジェクト: shirish-01/DataCake
    def visit(self, url):
        """Visit a page, hand its text to the processor and update URL memory.

        A response whose status code is not 200 is recorded in ``errors.url``
        under ``self.currentDomainPath`` and the visit ends early. Any
        exception raised along the way is printed together with the URL and
        then re-raised to the caller.
        """
        try:
            response = mx.get_page(url)
            if response.status_code != 200:
                error_log = self.currentDomainPath + 'errors.url'
                mx.fappend(error_log,
                           f'\nPAGE ERROR :{response.status_code,url}')
                return

            html = response.text
            # The original author describes this call as async processing.
            # NOTE(review): if Pool is a multiprocessing pool, apply() blocks
            # until the task finishes — confirm.
            Pool.apply(Processor.save_page_info, (self, html, url))
            parsed = mx.soup(html, features='html.parser')
            discovered = self.get_all_links(parsed)

            # The three memory updates must stay in exactly this order.
            self.update_exploredURLMemory(discovered)
            self.update_visitedURLMemory(url)
            self.update_pendingURLMemory()

            Memory.visitCount += 1
            if Logging.showVisitCount:
                print(f'SESSION VISITS | { Memory.visitCount }')
            if Logging.showUrlPerSecond:
                Logging.calculate_url_second()

        except Exception as e:
            print('ERROR =', url, e)
            raise e
コード例 #6
0
 def update_visitedURLMemory(self, current_visited_url):
     """Remember a visited URL, appending it to the visit file if unseen.

     The URL is written to ``self.domainVisitFile`` (with a leading newline)
     only when it is not already in ``self.visitedURLMemory``; it is added to
     the in-memory collection in every case.
     """
     seen_before = current_visited_url in self.visitedURLMemory
     if not seen_before:
         entry = "\n" + current_visited_url
         mx.fappend(self.domainVisitFile, entry)
     self.visitedURLMemory.add(current_visited_url)
コード例 #7
0
	def update_visitedURLMemory(self,current_visited_url):
		"""Add a URL to the visited memory, persisting it on first sight.

		A URL not yet in ``self.visitedURLMemory`` is appended to
		``self.domainVisitFile`` prefixed with a newline. The URL is then
		added to the in-memory collection regardless.
		"""
		is_first_visit = current_visited_url not in self.visitedURLMemory
		if is_first_visit:
			mx.fappend(self.domainVisitFile, "\n"+current_visited_url)
		# add() inserts the URL as one item; the original author warns
		# against set.update() for a single item.
		self.visitedURLMemory.add(current_visited_url)
コード例 #8
0
ファイル: auto_dict.py プロジェクト: shirish-01/DataCake
def update_lexico_db_and_index(WORD,finalresult): #medium level
	"""Append WORD to the index file and, if uncached, its entry to the dictionary.

	``finalresult`` is JSON-encoded and appended to the dictionary file only
	when ``WORD`` is not in ``Cache.lexico_index``.
	"""
	# NOTE(review): the index append is unconditional while the dictionary
	# append below is guarded, so a repeated WORD is written to the index
	# again — confirm this is intended.
	mx.fappend(lexico_index_path, WORD)
	if WORD in Cache.lexico_index:
		return
	print(f'adding ({WORD}) to lexico_index')
	encoded_entry = mx.json.dumps(finalresult)
	mx.fappend(lexico_dict_path, encoded_entry)
コード例 #9
0
import os
from mxproxy import mx
import re


def get_body(url, bodyclass=''):
    """Fetch ``url`` and return ``find(class_=bodyclass)`` on the result.

    When ``bodyclass`` is empty, a usage hint is printed and ``None`` is
    returned without fetching anything.
    """
    if not bodyclass:
        print('define bodyclass as parameter of this function')
        return None
    # NOTE(review): relies on mx.get_page() returning an object that has a
    # find(class_=...) method — confirm; other code in this file uses
    # mx.get_page_soup() when it needs a parsed page.
    fetched = mx.get_page(url)
    return fetched.find(class_=bodyclass)


if __name__ == '__main__':
    # Scrape every not-yet-visited page linked from the index page, append
    # its text to one file, and record each finished URL so a later run
    # skips it.
    SUBJECTFOLDER = 'SEM-5/DIP/'
    VISITED_FILE = SUBJECTFOLDER + 'visited.url'
    url = 'https://www.sanfoundry.com/1000-digital-image-processing-questions-answers/'
    page = mx.get_page_soup(url)

    # `a[href]` only matches anchors that carry an href attribute; a bare
    # `a` selector would raise KeyError on anchors without one.
    links = {x['href'] for x in page.select('.entry-content a[href]')}

    mx.touch(VISITED_FILE)
    visited = mx.setload(VISITED_FILE)

    pendinglinks = links - visited
    for x in pendinglinks:
        text = mx.get_page_soup(x).select_one('.entry-content').text
        # Output path is derived from SUBJECTFOLDER rather than repeating
        # the folder as a second literal.
        mx.fappend(SUBJECTFOLDER + 'BigData.txt', text)
        mx.fappend(VISITED_FILE, x)
コード例 #10
0
		def update_visitedURLMemory(nowVisited):
			"""Append a visited URL to the visit file, then reload memory from it.

			``self`` is not a parameter; it comes from the enclosing scope. After
			the append, ``self.visitedURLMemory`` is replaced by the list of lines
			read back from the visit file.
			"""
			# The file is updated first (the original author marks this as important).
			Macros.homogenize(self,self.domainVisitFile)
			new_entry = "\n"+nowVisited
			mx.fappend(self.domainVisitFile, new_entry)
			file_text = mx.fread(self.domainVisitFile)
			self.visitedURLMemory = file_text.split('\n')
コード例 #11
0
		def save_to_disk(label,data):
			"""Append ``data`` to ``<label>.json`` under ``self.domainDataFolder``.

			``self`` is taken from the enclosing scope rather than passed in.
			"""
			target_path = self.domainDataFolder+f'{label}.json'
			mx.fappend(target_path, data)