Example #1
0
    def __init__(self, url):
        """Prepare crawl state for the site ``url`` points at, then seed it.

        Splits ``url`` on ``/`` to obtain the base URL and domain, builds
        the per-domain profile paths, loads the explored and visited URL
        collections through ``mx.setload`` and finally visits ``url``.
        """
        # --- seed values ---
        self.url = url
        pieces = url.split('/')
        self.baseurl = '/'.join(pieces[:3])  # first three '/'-separated parts
        self.domain = pieces[2]
        self.regex = self.baseurl + r'/.*'
        print(self.regex)

        # --- profile layout on disk ---
        self.programName = 'hyperScraper'
        self.profileFolder = self.programName + "_profiles"
        self.currentDomainPath = '/'.join(
            ('.', self.profileFolder, self.domain, ''))
        domain_root = self.currentDomainPath
        self.domainDataFolder = domain_root + "data/"
        self.domainExploreFile = domain_root + 'explored.url'
        self.domainVisitFile = domain_root + 'visited.url'
        # Make sure a profile exists for this domain before loading from it.
        self.profile_check_make(self.domain)

        # --- runtime URL bookkeeping ---
        self.exploredURLMemory = mx.setload(self.domainExploreFile)
        self.visitedURLMemory = mx.setload(self.domainVisitFile)
        # Pending = explored but not yet visited.
        not_yet_visited = self.exploredURLMemory - self.visitedURLMemory
        self.pendingURLMemory = list(not_yet_visited)

        # --- kick off with the seed URL so its links get collected ---
        self.visit(url)
Example #2
0
 def __init__(self, url, myregex=r'/.*'):
     """Prepare crawl state for the site ``url`` points at, then seed it.

     ``myregex`` is appended to the base URL to form the URL matching
     pattern stored in ``self.regex``. Any exception raised while
     visiting the seed URL is reported on stdout and re-raised.
     """
     # --- seed values ---
     self.url = url  # kept exactly as passed in
     url_parts = url.split('/')
     host = url_parts[2]  # e.g. www.asdasd.com
     self.domain = host
     # Everything after the first dot-separated label, e.g. asdasd.com
     self.topLevelDomain = ".".join(host.split('.')[1:])
     self.baseurl = '/'.join(url_parts[:3])  # e.g. https://www.asdasd.com
     self.regex = self.baseurl + myregex
     print("LOG: regex url matching pattern is >>", self.regex)

     # --- profile layout on disk ---
     self.programName = 'hyperScraper'
     self.profileFolder = self.programName + "_profiles/"
     self.currentDomainPath = self.profileFolder + self.domain + '/'
     domain_root = self.currentDomainPath
     self.domainDataFolder = domain_root + "data/"
     self.domainExploreFile = domain_root + 'explored.url'
     self.domainVisitFile = domain_root + 'visited.url'
     # Make sure a profile exists for this domain before loading from it.
     self.profile_check_make(self.domain)

     # --- runtime URL bookkeeping ---
     self.exploredURLMemory = mx.setload(self.domainExploreFile)
     self.visitedURLMemory = mx.setload(self.domainVisitFile)
     # Pending = explored but not yet visited.
     not_yet_visited = self.exploredURLMemory - self.visitedURLMemory
     self.pendingURLMemory = list(not_yet_visited)

     # --- kick off with the seed URL so its links get collected ---
     try:
         self.visit(url)
     except Exception as err:
         print(
             "ERROR: initial Seed encountered serious error change to {raise e} in this line to track it"
         )
         raise err
Example #3
0
# Locations of the lexico cache files. Each path is passed to mx.touch as
# soon as it is defined (presumably creating the file when it is missing --
# confirm against mx.touch).
lexico_index_path = './Dictionary/lexico_index.set'
mx.touch(lexico_index_path)

lexico_dict_path = './Dictionary/lexico_dict.jsonl'
mx.touch(lexico_dict_path)

#NLP---------------------------------------
def get_lemma(WORD):
	"""Return the lower-cased lemma of the first token in ``WORD``.

	The spaCy pipeline is loaded on first use and kept on
	``Cache.nlpengine`` so later calls reuse it. Only the first token of
	the parsed text is considered. When the lemma is the placeholder
	``'-PRON-'``, the token's own lower-cased text is returned instead.
	"""
	# Only a missing attribute means "not loaded yet". A bare ``except``
	# would also swallow KeyboardInterrupt and hide unrelated failures.
	try:
		nlp = Cache.nlpengine
	except AttributeError:
		# Imported lazily: spaCy is only needed when the model must be loaded.
		from spacy import load as spacy_load
		nlp = Cache.nlpengine = spacy_load('en_core_web_sm',disable=['parser','ner', 'tagger', 'textcat'])
	token = nlp(WORD)[0]
	lemma = token.lemma_
	return lemma.lower() if lemma != '-PRON-' else token.lower_

#CACHE LAYERS------------------------------
# These three loads run as top-level statements and store their results as
# attributes on the shared Cache object.
# NOTE(review): basedict_path is not defined in the visible code -- confirm
# it is assigned before this point.
# base_dict is read by get_definition(), which calls .get(WORD) on it.
Cache.base_dict=mx.jload(basedict_path)
# Loaded from the lexico index file touched above.
Cache.lexico_index=mx.setload(lexico_index_path)
# Loaded from the lexico .jsonl file touched above (presumably one JSON
# record per line -- TODO confirm against mx.jloadlines).
Cache.lexico_dict=mx.jloadlines(lexico_dict_path)

#CORE--------------------------------------
def get_definition(WORD):
	"""Look ``WORD`` up in ``Cache.base_dict`` and split its definition.

	Returns the list of segments that start at a digit and run up to the
	next semicolon. If no such segment exists, returns a one-element list
	holding the whole definition, which is the empty string when the word
	is missing or its entry is falsy.
	"""
	definition = Cache.base_dict.get(WORD) or ''
	numbered_senses = re.findall(r'[\d].*?;', definition)
	return numbered_senses or [definition]
#--------------------------------------
def lexico_fetch(WORD): #lower Level
	WORD=get_lemma(WORD)
	while True:
Example #4
0
def seed_dict_with_words():
	"""Seed the dictionary with the lemmas of the standard common-word list.

	Every word from ``./Dictionary/3000common.set`` is lower-cased and
	lemmatised through ``POOL``; the distinct lemmas are then handed to
	``add_words_to_dictionary``.
	"""
	common_words = mx.setload('./Dictionary/3000common.set')
	# Submit every job first, then collect, so the pool works in parallel.
	pending_jobs = {POOL.apply_async(get_lemma, (word.lower(),)) for word in common_words}
	lemmas = {job.get() for job in pending_jobs}
	add_words_to_dictionary(lemmas)
Example #5
0
import os
from mxproxy import mx
import re


def get_body(url, bodyclass=''):
    """Fetch ``url`` and look up ``bodyclass`` in the fetched page.

    Returns whatever ``.find(class_=bodyclass)`` yields on the object
    returned by ``mx.get_page(url)``. When ``bodyclass`` is falsy nothing
    is fetched: a reminder is printed and ``None`` is returned.
    """
    if not bodyclass:
        print('define bodyclass as parameter of this function')
        return None
    page = mx.get_page(url)
    return page.find(class_=bodyclass)


if __name__ == '__main__':
    # Collect the text of every page linked from the index page's
    # '.entry-content' section into one file, recording each processed link
    # so that a later run only handles links not yet recorded.
    SUBJECTFOLDER = 'SEM-5/DIP/'
    VISITED_FILE = SUBJECTFOLDER + 'visited.url'
    # Built from SUBJECTFOLDER instead of repeating the folder literal.
    OUTPUT_FILE = SUBJECTFOLDER + 'BigData.txt'
    url = 'https://www.sanfoundry.com/1000-digital-image-processing-questions-answers/'
    page = mx.get_page_soup(url)

    # Anchors without an href would raise KeyError on a['href']; skip them
    # (and empty hrefs) rather than aborting the whole run.
    # Assumes the selected elements are BeautifulSoup tags -- TODO confirm.
    links = {
        a['href']
        for a in page.select('.entry-content a')
        if a.get('href')
    }

    mx.touch(VISITED_FILE)
    visited = mx.setload(VISITED_FILE)

    pendinglinks = links - visited
    for link in pendinglinks:
        content = mx.get_page_soup(link).select_one('.entry-content')
        if content is None:
            # No matching element on this page: previously this crashed on
            # `.text`. Leave the link unrecorded so a later run can retry.
            print('skipping (no .entry-content found):', link)
            continue
        mx.fappend(OUTPUT_FILE, content.text)
        mx.fappend(VISITED_FILE, link)