Example #1
    def visit(self, url):  #visit page and gather info
        import scraper_utils as su
        import threading  # used below for the per-thread visit log

        def extract_info(soup, url=url):
            pageDict = {}
            pageDict['url'] = url
            pageDict['head'] = su.sanitize_text(soup.find('h1').get_text())
            pageDict['body'] = su.extract_body(soup)
            pageDict['time'] = su.time_stamper(delta='future')
            pageDict['author'] = su.get_author_info(soup)
            # return mx.jdumps(pageDict)
            return pageDict

        def save_to_disk(filename, data):
            mx.fwrite(self.domainDataFolder + f'{filename}.json', data)

        try:
            page = mx.get_page_soup(url)
            pageinfojson = extract_info(page)
            self.update_exploredURLMemory(self.get_all_links(page))
            self.update_visitedURLMemory(url)
            self.update_pendingURLMemory()
            save_to_disk(su.file_namer(pageinfojson['head']),
                         mx.jdumps(pageinfojson))
            Memory.visitCount += 1
            if Logging.showCurrentVisit:
                print(f'{threading.current_thread().ident:<6}>> {url}')
        except Exception as e:
            print('ERROR: while visiting >>', url, e)
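
Examples #1 and #4 import a local scraper_utils module that is not shown in this listing. Below is a minimal sketch of what those helpers might look like, inferred purely from the call sites (sanitize_text, extract_body, time_stamper, get_author_info, file_namer); the real module may behave differently.

# scraper_utils.py -- hypothetical stubs, inferred from the calls in Examples #1 and #4
import re
from datetime import datetime, timedelta, timezone

def sanitize_text(text):
    # collapse runs of whitespace and trim the result
    return re.sub(r'\s+', ' ', text or '').strip()

def extract_body(soup):
    # guess: prefer the <article> element, fall back to the whole page text
    node = soup.find('article') or soup.body or soup
    return sanitize_text(node.get_text(' '))

def time_stamper(delta='future'):
    # guess: an ISO timestamp, shifted forward a day when delta='future'
    now = datetime.now(timezone.utc)
    if delta == 'future':
        now += timedelta(days=1)
    return now.isoformat()

def get_author_info(soup):
    # guess: common byline selectors, 'UNKNOWN' when nothing matches
    tag = soup.select_one('[rel="author"], .author, .byline')
    return sanitize_text(tag.get_text()) if tag else 'UNKNOWN'

def file_namer(head):
    # guess: slugify the headline so it is safe to use as a filename
    return re.sub(r'[^a-z0-9]+', '-', (head or 'untitled').lower()).strip('-')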
Example #2
def get_lexico_syn(WORD):  #medium level
    if WORD in Cache.lexico_syn:
        print(f'syn: {WORD} found in db')
        return {WORD: Cache.lexico_syn[WORD]}
    else:
        WORD = Helpers.get_lemma(WORD.lower())
        synGroups = mx.get_page_soup(
            f'https://www.lexico.com/synonyms/{WORD}').select('.synGroup')
        # strip everything inside each group whose class does not mention 'syn';
        # the None check avoids a TypeError on tags that carry no class at all
        for group in synGroups:
            for waste in group.findAll(
                    class_=lambda z: z is not None and 'syn' not in z):
                waste.decompose()
        # whatever is left in a group is a comma-separated run of synonyms
        sortedWords = [[y.strip() for y in x.text.split(',')]
                       for x in synGroups]
        theresult = {WORD: sortedWords}
        Helpers.db_sync(WORD, theresult)
        return theresult
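
For reference, a call like the one below would be expected to return the lemmatized word mapped to its synonym groups. The result shape is only inferred from the function body, and lexico.com has since been retired, so the live request may no longer succeed.

# hypothetical usage of get_lexico_syn; the synonyms shown are illustrative only
result = get_lexico_syn('running')
# expected shape: {'run': [['sprint', 'dash', 'race'], ['operate', 'manage'], ...]}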
Example #3
    def visit(self, url):  #visit page and gather info
        def extract_info(soup, url=url):
            def soup_select(soup, bs_select):
                try:
                    return sanitize_text(soup.select_one(bs_select).text)
                except AttributeError:  # selector not present on this page
                    return 'UNKNOWN'

            bodyIdentifier = 'article'
            headIdentifier = 'h1'
            pageDict = {}
            pageDict['url'] = url
            pageDict['head'] = soup_select(soup, headIdentifier)
            pageDict['body'] = soup_select(soup, bodyIdentifier)
            # synthetic "N Days Ago" timestamp
            pageDict['time'] = str(random.randint(1, 90) / 2) + ' Days Ago'
            pageDict['imgs'] = str(soup.select(bodyIdentifier + ' img'))
            # return mx.jdumps(pageDict)
            return pageDict

        def save_to_disk(label, data):
            mx.fwrite(self.domainDataFolder + f'{label}.json', data)

        Macros.homogenize(
            self, self.domainVisitFile)  #remove duplicate entries in file

        try:
            page = mx.get_page_soup(url)
            pageinfojson = extract_info(page)
            self.update_exploredURLMemory(self.get_all_links(page))
            self.update_visitedURLMemory(url)
            self.update_pendingURLMemory()
            save_to_disk(pageinfojson['head'].lower(), mx.jdumps(pageinfojson))
            Memory.visitCount += 1
            if Logging.showCurrentVisit:
                print(f'{threading.current_thread().ident:<6}>> {url}')
        except Exception as e:
            print('ERROR: while visiting >>', url, e)
Example #4
    def visit(self, url):  #visit page and gather info
        import scraper_utils as su
        import time  # used below for the pages-per-second counter

        def extract_info(soup, url=url):
            # remove markup we never want in the archived text
            for tag in soup.findAll(['script', 'head', 'style', 'iframe']):
                tag.decompose()

            pageDict = {}
            pageDict['url'] = url
            pageDict['head'] = su.sanitize_text(soup.h1.text)
            pageDict['author'] = su.get_author_info(soup)
            pageDict['time'] = su.time_stamper(delta='future')
            pageDict['body'] = su.extract_body(soup)
            # return mx.jdumps(pageDict)
            return pageDict

        def save_to_disk(filename, data):
            mx.fwrite(self.domainDataFolder + f'{filename}.json', data)

        try:
            page = mx.get_page_soup(url)
            if page == 404:
                return
            self.update_exploredURLMemory(self.get_all_links(page))
            self.update_visitedURLMemory(url)
            self.update_pendingURLMemory()
            pageDict = extract_info(page)
            save_to_disk(su.file_namer(pageDict['head']), mx.jdumps(pageDict))
            Memory.visitCount += 1
            if Logging.showCurrentVisit:
                print(f'WORKER VISIT >> {url}')
            if Logging.showVisitCount:
                print(f'SESSION VISITS | {Memory.visitCount}')
            if Logging.showUrlPerSecond:
                if int(time.time()) % 5 == 0:
                    tdelta = time.time() - Memory.beginTime
                    print('Pages-Per-Second : ', Memory.visitCount / tdelta)
        except Exception as e:
            print('ERROR =', url, e)
            raise e  # | DEPTH 2
Example #5
from mxproxy import mx


def get_body(url, bodyclass=''):
    # return the element carrying the given CSS class from the fetched page
    if bodyclass:
        return mx.get_page(url).find(class_=bodyclass)
    else:
        print('pass bodyclass as a parameter to this function')


if __name__ == '__main__':
    SUBJECTFOLDER = 'SEM-5/DIP/'
    url = 'https://www.sanfoundry.com/1000-digital-image-processing-questions-answers/'
    page = mx.get_page_soup(url)

    # every question-set link on the index page
    links = {x['href'] for x in page.select('.entry-content a')}

    # make sure the visited-URL ledger exists, then load it as a set
    mx.touch(SUBJECTFOLDER + 'visited.url')
    visited = mx.setload(SUBJECTFOLDER + 'visited.url')

    # scrape only the links that have not been saved yet
    pendinglinks = links - visited
    for x in pendinglinks:
        text = mx.get_page_soup(x).select_one('.entry-content').text
        mx.fappend(SUBJECTFOLDER + 'BigData.txt', text)
        mx.fappend(SUBJECTFOLDER + 'visited.url', x)
        # print(text)
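
Every example above calls into mx from the author's mxproxy module, which is not included in this listing. The following is a rough, hypothetical sketch of the helpers used here, with semantics guessed from the call sites (for instance, Example #4 treats a plain 404 as a possible return value of get_page_soup); the real mxproxy may differ.

# mxproxy sketch -- hypothetical stand-ins for the mx helpers used in the examples
import json
import os
import requests
from bs4 import BeautifulSoup

class mx:
    @staticmethod
    def get_page_soup(url):
        # fetch a page and parse it; Example #4 suggests 404 is returned on a missing page
        resp = requests.get(url, timeout=15)
        if resp.status_code == 404:
            return 404
        return BeautifulSoup(resp.text, 'html.parser')

    # Example #5 also calls mx.get_page(url).find(...); it may simply alias get_page_soup

    @staticmethod
    def jdumps(obj):
        return json.dumps(obj, indent=2, ensure_ascii=False)

    @staticmethod
    def fwrite(path, data):
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            f.write(data)

    @staticmethod
    def fappend(path, data):
        with open(path, 'a', encoding='utf-8') as f:
            f.write(data + '\n')

    @staticmethod
    def touch(path):
        # create the file if it does not exist yet
        open(path, 'a', encoding='utf-8').close()

    @staticmethod
    def setload(path):
        # load a file as a set of non-empty, stripped lines
        with open(path, encoding='utf-8') as f:
            return {line.strip() for line in f if line.strip()}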