def test_find_urls(self):
    # Assert URL finder with common URL notations.
    for url in (
            "http://domain.co.uk",
            "https://domain.co.uk",
            "www.domain.cu.uk",
            "domain.com",
            "domain.org",
            "domain.net"):
        self.assertEqual(web.find_urls("(" + url + ".")[0], url)
    # Assert case-insensitive and <a href="">.
    # Assert several matches in string.
    self.assertEqual(
        web.find_urls("<a href=\"HTTP://domain.net\">")[0],
        "HTTP://domain.net")
    self.assertEqual(
        web.find_urls("domain.com, domain.net"),
        ["domain.com", "domain.net"])
    print "pattern.web.find_urls()"
def parseURL(url, force=False):
    "Parses the given url and saves it to the database."
    try:
        wr = WebResource.objects.get(url=url)
    except MultipleObjectsReturned:
        # Duplicate rows for the same url: drop them and recreate below.
        WebResource.objects.filter(url=url).delete()
    except:
        pass
    wr, created = WebResource.objects.get_or_create(url=url)
    if created or force:
        # print "Parsing and Caching {0}".format(url)
        try:
            a = newspaper.Article(url)
            a.download()
            a.parse()
            text = a.text
            title = a.title
            if 'books.google' in url:
                text = ''
            wr.text = str(text.encode('utf-8', 'replace').lower())
            wr.title = a.title
            # Store any URLs found in the extracted text, comma-separated.
            wr.urls = ",".join(
                find_urls(strip_between("<body*>", "</body", text)))
            wr.save()
            print " PARSED ", url
        except:
            print " Failed"
    return wr
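# A minimal usage sketch (hypothetical call site, not part of the original module):
# it assumes Django is configured with the WebResource model and that seed URLs
# come from pattern.web's find_urls, as used inside parseURL above.
seed_text = "See http://example.com and www.example.org for details."
for seed_url in find_urls(seed_text, unique=True):
    resource = parseURL(seed_url)              # downloaded, parsed and cached on first call
    print resource.title
    resource = parseURL(seed_url, force=True)  # force=True re-downloads and re-parses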
def gerar_urls_twitter(self, loops=2):
    # Collect tweets about self.termo and return the URLs found in them.
    if loops < 2:
        print "pelo menos 2"  # "at least 2" loops are expected
        pass
    urlsBase = []
    t = 1
    tweets = ""
    while t < loops:
        twitter = BuscaTwitter(self.termo)
        buscatwitter = twitter.formata_twitter(
            twitter.busca_twitter(tpp=100, p=t))
        if buscatwitter:
            for tweet in buscatwitter:
                tweets += " " + tweet["texto"]
        else:
            pass
        t += 1
        time.sleep(self.timer)
    urlsBase = find_urls(tweets)
    # "%s links found on Twitter for %s"
    print "%s links encontrados no Twitter para %s" % (len(urlsBase), self.termo)
    return urlsBase
from pattern.web import download

page_html = download('https://en.wikipedia.org/wiki/Artificial_intelligence', unicode=True)

from pattern.web import URL, extension

page_url = URL('https://upload.wikimedia.org/wikipedia/commons/f/f1/RougeOr_football.jpg')
file = open('football' + extension(page_url.page), 'wb')
file.write(page_url.download())
file.close()

# ### Finding URLs within Text
from pattern.web import find_urls

print(find_urls('To search anything, go to www.google.com', unique=True))

# ### Making Asynchronous Requests for Webpages
from pattern.web import asynchronous, time, Google

asyn_req = asynchronous(Google().search, 'artificial intelligence', timeout=4)
while not asyn_req.done:
    time.sleep(0.1)
    print('searching...')
print(asyn_req.value)
print(find_urls(asyn_req.value, unique=True))

# ### Getting Search Engine Results with APIs
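# The heading above has no code in this snippet. A minimal sketch follows, using
# pattern.web's Google search wrapper; license=None falls back to Pattern's shared
# demo key, which is rate-limited, so supplying your own API key is assumed in practice.
from pattern.web import Google, plaintext

search_engine = Google(license=None, language='en')
for result in search_engine.search('artificial intelligence', start=1, count=5, cached=True):
    print(result.url)
    print(result.title)
    print(plaintext(result.text))  # strip HTML markup from the result snippet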
from pattern.web import download

page_html = download('https://en.wikipedia.org/wiki/Artificial_intelligence', unicode=True)

# You can also download files from webpages, for example images, using the URL object:
from pattern.web import URL, extension

page_url = URL('https://upload.wikimedia.org/wikipedia/commons/f/f1/RougeOr_football.jpg')
file = open('football' + extension(page_url.page), 'wb')
file.write(page_url.download())
file.close()

#%% Finding URLs within Text
# You can use the find_urls function to extract URLs from text strings. Here is an example:
from pattern.web import find_urls

print(find_urls('To search anything, go to www.google.com', unique=True))

#%% Parsing PDF Documents
# The Pattern library contains a PDF object that can be used to parse a PDF document.
# PDF (Portable Document Format) is a cross-platform format that bundles text,
# images and fonts in a stand-alone document.
from pattern.web import URL, PDF

pdf_doc = URL('http://demo.clab.cs.cmu.edu/NLP/syllabus_f18.pdf').download()
pdf_doc  # raw bytes of the downloaded PDF
print(PDF(pdf_doc.decode('utf-8')))

#%% Clearing the Cache
from pattern.web import cache

cache.clear()
print 'path', path

# crawler parameters
#path = 'data/algsang'
#inlinks_min = 1
#depth = 7

dirList = os.listdir(path)
for fname in dirList[:]:
    pagelist = os.path.join(path, fname)
    print 'pagelist', pagelist
    try:
        url = web.URL(pagelist)
        chaine = url.download(cached=False)
        new_urls = map(lambda x: url_uniformer(x.split('">')[0]),
                       web.find_urls(chaine, unique=True))
        if 'Google Search' in pagelist:
            new_urls = map(lambda x: x.split("&")[0], new_urls)
        for new_url in new_urls[:]:
            if not check_forbidden((new_url, '')) and not new_url in pages:
                pages[new_url] = inlinks_min
    except:
        pass

print 'pages init', len(pages)
print 'pages', pages

db_name = os.path.join(result_path, query + '_crawl.db')
try:
    os.mkdir(result_path)
except OSError:
    pass  # result_path may already exist
unzip_file_into_dir(path, corpus_out)
path = corpus_out
print 'Path: ', path

if seeks_search == 1:
    print "Seeks search enabled. Creating Seeks file in %s" % path
    make_seeds(query, path, nb_results=nb_results)

dirList = os.listdir(path)
print 'List of files in path: ', dirList
for fname in dirList[:]:
    pagelist = os.path.join(path, fname)
    try:
        url = web.URL(pagelist)
        chaine = url.download(cached=False)
        new_urls = map(lambda x: url_uniformer(x.split('">')[0]),
                       web.find_urls(chaine, unique=True))
        if 'Google Search' in pagelist:
            new_urls = map(lambda x: x.split("&")[0], new_urls)
        for new_url in new_urls[:]:
            print "Checking for forbidden URL..."
            if not check_forbidden((new_url, '')) and not new_url in pages:
                pages[new_url] = inlinks_min
    except:
        pass

print 'Pages init: ', len(pages)
print 'Pages: ', pages

print "Naming database..."
db_name = os.path.join(result_path, query + '_crawl.db')