Example #1
 def test_find_urls(self):
     # Assert URL finder with common URL notations.
     for url in ("http://domain.co.uk", "https://domain.co.uk",
                 "www.domain.cu.uk", "domain.com", "domain.org",
                 "domain.net"):
         self.assertEqual(web.find_urls("(" + url + ".")[0], url)
     # Assert case-insensitive and <a href="">.
     # Assert several matches in string.
     self.assertEqual(
         web.find_urls("<a href=\"HTTP://domain.net\">")[0],
         "HTTP://domain.net")
     self.assertEqual(web.find_urls("domain.com, domain.net"),
                      ["domain.com", "domain.net"])
     print "pattern.web.find_urls()"
Example #2
 def test_find_urls(self):
     # Assert URL finder with common URL notations.
     for url in (
       "http://domain.co.uk",
       "https://domain.co.uk",
       "www.domain.cu.uk",
       "domain.com",
       "domain.org",
       "domain.net"):
         self.assertEqual(web.find_urls("("+url+".")[0], url)
     # Assert case-insensitive and <a href="">.
     # Assert several matches in string.
     self.assertEqual(web.find_urls("<a href=\"HTTP://domain.net\">")[0], "HTTP://domain.net")
     self.assertEqual(web.find_urls("domain.com, domain.net"), ["domain.com", "domain.net"])
     print "pattern.web.find_urls()"
Example #3
def parseURL(url, force=False):
    "Parses the given URL and saves it to the database."
    # Clean up duplicate rows for this URL so that get_or_create() below sees at most one.
    try:
        WebResource.objects.get(url=url)
    except MultipleObjectsReturned:
        WebResource.objects.filter(url=url).delete()
    except Exception:
        pass

    wr, created = WebResource.objects.get_or_create(url=url)
    if created or force:
        # print "Parsing and Caching {0}".format(url)
        try:
            # Download and parse the article with newspaper.
            a = newspaper.Article(url)
            a.download()
            a.parse()
            text = a.text
            title = a.title

            # Google Books pages yield no usable article text.
            if 'books.google' in url:
                text = ''

            wr.text = str(text.encode('utf-8', 'replace').lower())
            wr.title = title
            # Store the outgoing links found in the text, comma-separated.
            wr.urls = ",".join(
                find_urls(strip_between("<body*>", "</body", text)))
            wr.save()
            print "  PARSED ", url
        except Exception:
            print "  Failed"
    return wr
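# The only pattern.web-specific step in parseURL() is the link extraction. Below is a
# minimal standalone sketch of that combination, assuming strip_between(a, b, string)
# removes everything between (and including) its two delimiters, as its docstring in
# pattern.web describes; the HTML snippet is made up.
from pattern.web import find_urls, strip_between

html = ("<script>var ad = 'http://tracker.domain.org/x';</script>"
        "<p>See http://domain.com and www.domain.net for details.</p>")

# Drop the <script>...</script> span, then collect the URLs that remain.
clean = strip_between("<script", "</script>", html)
print(find_urls(clean, unique=True))
# Should yield something like: ['http://domain.com', 'www.domain.net']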
Example #4
 def gerar_urls_twitter(self, loops=2):
     if loops < 2:
         print "at least 2 loops are required"
     urlsBase = []
     t = 1
     tweets = ""
     # Collect the text of up to 100 tweets per result page for pages 1..loops-1.
     while t < loops:
         twitter = BuscaTwitter(self.termo)
         buscatwitter = twitter.formata_twitter(twitter.busca_twitter(tpp=100, p=t))
         if buscatwitter:
             for tweet in buscatwitter:
                 tweets += " " + tweet["texto"]
         t += 1
         time.sleep(self.timer)
     # Extract every URL mentioned in the collected tweet text.
     urlsBase = find_urls(tweets)
     print "%s links found on Twitter for %s" % (len(urlsBase), self.termo)
     return urlsBase
Example #5
 def gerar_urls_twitter(self, loops=2):
     if loops < 2:
         print 'at least 2 loops are required'
     urlsBase = []
     t = 1
     tweets = ''
     # Collect the text of up to 100 tweets per result page for pages 1..loops-1.
     while t < loops:
         twitter = BuscaTwitter(self.termo)
         buscatwitter = twitter.formata_twitter(
             twitter.busca_twitter(tpp=100, p=t))
         if buscatwitter:
             for tweet in buscatwitter:
                 tweets += ' ' + tweet['texto']
         t += 1
         time.sleep(self.timer)
     # Extract every URL mentioned in the collected tweet text.
     urlsBase = find_urls(tweets)
     print '%s links found on Twitter for %s' % (len(urlsBase), self.termo)
     return urlsBase
Example #6
from pattern.web import download
page_html = download('https://en.wikipedia.org/wiki/Artificial_intelligence',
                     unicode=True)

from pattern.web import URL, extension

page_url = URL(
    'https://upload.wikimedia.org/wikipedia/commons/f/f1/RougeOr_football.jpg')
file = open('football' + extension(page_url.page), 'wb')
file.write(page_url.download())
file.close()

# ### Finding URLs within Text

from pattern.web import find_urls

print(find_urls('To search anything, go to www.google.com', unique=True))

# ### Making Asynchronous Requests for Webpages

from pattern.web import asynchronous, time, Google

asyn_req = asynchronous(Google().search, 'artificial intelligence', timeout=4)
while not asyn_req.done:
    time.sleep(0.1)
    print('searching...')

print(asyn_req.value)

print(find_urls(asyn_req.value, unique=True))

# ### Getting Search Engine Results with APIs
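# The excerpt ends before the code for this section. A minimal sketch of what it covers,
# assuming the Google engine from pattern.web (already used in the asynchronous example
# above); pass your own API license key if the bundled default is rate-limited.
from pattern.web import Google, SEARCH

google = Google(license=None, language='en')
for result in google.search('artificial intelligence', type=SEARCH, count=5):
    print(result.url)
    print(result.text)
# Bing, Twitter and Wikipedia engines in pattern.web expose the same search() interface.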
Example #7
from pattern.web import download
page_html = download('https://en.wikipedia.org/wiki/Artificial_intelligence',
                     unicode=True)

# You can also download files from webpages, for example images, using the URL object:

from pattern.web import URL, extension
page_url = URL(
    'https://upload.wikimedia.org/wikipedia/commons/f/f1/RougeOr_football.jpg')
file = open('football' + extension(page_url.page), 'wb')
file.write(page_url.download())
file.close()

#%% Finding URLs within Text
# You can use the find_urls function to extract URLs from text strings. Here is an example:
from pattern.web import find_urls
print(find_urls('To search anything, go to www.google.com', unique=True))

#%% Parsing PDF Documents
# The Pattern library contains a PDF object that can be used to parse a PDF document. PDF (Portable Document Format) is a cross-platform file format that bundles text, images, and fonts in a stand-alone document.

from pattern.web import URL, PDF
pdf_doc = URL('http://demo.clab.cs.cmu.edu/NLP/syllabus_f18.pdf').download()
print(PDF(pdf_doc.decode('utf-8')))

#%% Clearing the Cache
from pattern.web import cache
cache.clear()
Example #8
print 'path', path
# Crawler parameters:
#path = 'data/algsang'
#inlinks_min=1
#depth=7


dirList = os.listdir(path)
for fname in dirList[:]:
    pagelist = os.path.join(path, fname)
    print 'pagelist', pagelist
    try:
        url = web.URL(pagelist)
        chaine = url.download(cached=False)
        # Keep only the part of each match before any '">' markup and normalize it.
        new_urls = map(lambda x: url_uniformer(x.split('">')[0]),
                       web.find_urls(chaine, unique=True))
        if 'Google Search' in pagelist:
            # Drop everything after the first '&amp;' in URLs taken from a Google results page.
            new_urls = map(lambda x: x.split("&amp;")[0], new_urls)
        for new_url in new_urls[:]:
            if not check_forbidden((new_url, '')) and new_url not in pages:
                pages[new_url] = inlinks_min
    except:
        pass
print 'pages init', len(pages)
print 'pages', pages

db_name = os.path.join(result_path, query + '_crawl.db')


try:
    os.mkdir(result_path)
Example #9
        unzip_file_into_dir(path, corpus_out)
        path = corpus_out
        print 'Path: ', path

if seeks_search == 1:
    print "Seeks search enabled. Creating Seeks file in %s" % path
    make_seeds(query, path, nb_results=nb_results)

dirList = os.listdir(path)
print 'List of files in path: ', dirList
for fname in dirList[:]:
    pagelist = os.path.join(path, fname)
    try:
        url = web.URL(pagelist)
        chaine = url.download(cached=False)
        # Keep only the part of each match before any '">' markup and normalize it.
        new_urls = map(lambda x: url_uniformer(x.split('">')[0]),
                       web.find_urls(chaine, unique=True))
        if 'Google Search' in pagelist:
            # Drop everything after the first '&amp;' in URLs taken from a Google results page.
            new_urls = map(lambda x: x.split("&amp;")[0], new_urls)
        for new_url in new_urls[:]:
            print "Checking for forbidden URL..."
            if not check_forbidden((new_url, '')) and new_url not in pages:
                pages[new_url] = inlinks_min
    except:
        pass
print 'Pages init: ', len(pages)
print 'Pages: ', pages

print "Naming database..."
db_name = os.path.join(result_path, query + '_crawl.db')
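# Examples #8 and #9 only build the seed dictionary 'pages' and name the crawl database;
# the crawl loop itself is not shown. For context, a minimal sketch of how such seeds
# could drive pattern.web's Crawler class; the SeedCrawler subclass and the seed URLs
# below are illustrative, not part of the original project.
from pattern.web import Crawler, DEPTH

class SeedCrawler(Crawler):
    def visit(self, link, source=None):
        # Called for each page that was successfully downloaded.
        print 'visited:', link.url, 'from:', link.referrer
    def fail(self, link):
        # Called for links that could not be downloaded.
        print 'failed:', link.url

seeds = ['http://domain.com/', 'http://domain.net/']  # e.g. the keys of 'pages' above
crawler = SeedCrawler(links=seeds, delay=5)
while not crawler.done:
    crawler.crawl(method=DEPTH, cached=False, throttle=5)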