def agregarInformacionDocumento(self, url, contenido):
    """Extract the different parts of a document and store them."""
    try:
        unaUrl = URL(url)
        if 'pdf' not in extension(unaUrl.page):
            html = contenido
            unElemento = Element(self.descargarContenidoHtml(url))
            body = self.getBody(unElemento)
            urlValues = self.getUrlValues(unElemento)
            titulo = self.getTitulo(unElemento)

            html = self.verificarContenidoVacio(html)
            body = self.verificarContenidoVacio(body)
            urlValues = self.verificarContenidoVacio(urlValues)
            titulo = self.verificarContenidoVacio(titulo)

            self.mongodb.setInformacionDocumento(html, url, titulo,
                                                 urlValues, body)
        else:
            # PDFs carry no separate title/body/URL metadata here.
            html = self.verificarContenidoVacio(contenido)
            body = ""
            urlValues = ""
            titulo = ""
            self.mongodb.setInformacionDocumento(html, url, titulo,
                                                 urlValues, body)
    except Exception as e:
        print(str(e))
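The verificarContenidoVacio helper is called on every field above but never shown. A minimal sketch of what such a guard might look like, assuming it substitutes a placeholder for empty content (the placeholder value is my assumption, not part of the original code):

def verificarContenidoVacio(self, contenido):
    # Hypothetical sketch: return a placeholder when the field is empty
    # or None, so MongoDB never receives a missing value.
    if not contenido:
        return "N/A"  # assumed placeholder
    return contenido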
Example #2
    def start(self, scraperLinks, progress, directorio, id_request, searchKey):
        unConfig = config()
        step = 0
        progress.set_totalScraping(len(scraperLinks))
        progress.set_scrapingState('Ejecutando')

        # Sort the links by document weight.
        self.rankear(scraperLinks, searchKey)

        scraperLinks = sorted(scraperLinks, key=lambda k: k['totalScore'])
        scraperLinks = self.unificarLista(scraperLinks)
        self.crearTop50(scraperLinks, directorio, unConfig)

        progress.totalNodes = len(scraperLinks)
        for link in scraperLinks:
            if not progress.get_stop():
                step += 1
                progress.set_scrapingProgress(step)
                url = URL(link['link'])
                fileNameJson = str(step).zfill(2) + "_" + url.domain + '.json'
                fileNameDocument = str(step).zfill(2) + "_" + url.domain
                if extension(url.page) == ".pdf":
                    fileNameDocument += ".pdf"
                else:
                    fileNameDocument += ".html"
                try:
                    self.fileGenerator.json(link, fileNameJson, fileNameDocument,
                                            link, id_request, directorio)
                except Exception as e:
                    print(str(e))
            else:
                progress.set_scrapingState('Detenido')
                print('Detenido')
                break
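start delegates to several helpers (rankear, crearTop50, unificarLista) that the example omits. A rough sketch of unificarLista, assuming it simply de-duplicates the ranked list by URL while preserving order:

def unificarLista(self, scraperLinks):
    # Hypothetical sketch: drop repeated links, keeping the first
    # (highest-ranked) occurrence of each URL.
    vistos = set()
    unicos = []
    for link in scraperLinks:
        if link['link'] not in vistos:
            vistos.add(link['link'])
            unicos.append(link)
    return unicos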
Example #3
def descargarContenidoHtml(self, url):
    """Download a document's raw content, dispatching PDFs separately."""
    try:
        unaUrl = URL(url)
        if "pdf" in extension(unaUrl.page):
            return self.descargarPDF(unaUrl)
        else:
            return unaUrl.download()
    except Exception:
        # Fall back to a plain urllib download if pattern.web fails.
        try:
            return self.urlLibDescarga(url)
        except Exception as e:
            print("except " + str(e))
            print(url)
Example #4
def descargarContenido(self, url):
    """Download the content of a web document, whether HTML page or PDF."""
    try:
        unaUrl = URL(url)
        if "pdf" in extension(unaUrl.page):
            return self.descargarPDF(unaUrl)
        else:
            return plaintext(unaUrl.download())
    except Exception:
        # Fall back to a plain urllib download if pattern.web fails.
        try:
            return plaintext(self.urlLibDescarga(url))
        except Exception as e:
            print("except " + str(e))
            print(url)
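Both downloaders fall back to urlLibDescarga, which is also not defined in these examples. A minimal sketch under the assumption that it is a thin urllib wrapper returning the decoded response body:

from urllib.request import urlopen

def urlLibDescarga(self, url):
    # Assumed fallback: fetch the raw bytes with urllib and decode them.
    with urlopen(url, timeout=10) as respuesta:
        return respuesta.read().decode('utf-8', errors='replace')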
Example #5
    def buscar_Flickr(self, texto):
        """Search Flickr for 6 images and save them locally."""
        engine = Flickr(license=None, throttle=0.5, language='es')
        i = 0
        for result in engine.search(texto,
                                    count=6,
                                    cached=True,
                                    copyright=False):
            self.espera(i)

            directorio = os.path.join('imagenes', 'busqueda',
                                      str(i) + extension(result.url))
            with open(directorio, 'wb') as f:
                f.write(result.download(timeout=10))
            i += 1
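The espera ("wait") helper is not shown; it presumably paces the downloads. A sketch under that assumption (the one-second delay is arbitrary):

import time

def espera(self, i):
    # Hypothetical pacing helper: pause between downloads so Flickr
    # is not hit with back-to-back requests.
    if i > 0:
        time.sleep(1)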
Example #6
from pattern.web import Flickr, extension
from pattern.web import RELEVANCY, LATEST, INTERESTING  # Image sort order.
from pattern.web import SMALL, MEDIUM, LARGE            # Image size.

# This example downloads an image from Flickr (http://flickr.com).
# Acquiring the image data takes three Flickr queries:
# - the first query with Flickr.search() retrieves a list of results,
# - the second query is executed behind the scenes in the FlickrResult.url property,
# - the third query downloads the actual image data using this URL.
# It is a good idea to cache results from Flickr locally,
# which is what the cached=True parameter does.

# You should obtain your own license key at:
# http://www.flickr.com/services/api/
# Otherwise you will be sharing the default key with all users of this module.
engine = Flickr(license=None)

q = "duracell bunny"
results = engine.search(q, size=MEDIUM, sort=RELEVANCY, cached=True)
for img in results:
    # print(img.url)  # Retrieving the actual image URL executes an additional query.
    print(img.description)
    print(img.author)
    print("")

# Download and save the image:
img = results[0]
data = img.download()
path = q.replace(" ", "_") + extension(img.url)
f = open(path, "wb")
f.write(data)
f.close()
print("Download: %s" % img.url)
print("Saved as: %s" % path)
Example #7
def test_extension(self):
    # Assert filename extension.
    v = web.extension(os.path.join("pattern", "test", "test-web.py.zip"))
    self.assertEqual(v, ".zip")
    print("pattern.web.extension()")
Example #8
# For macOS SSL issue when downloading file(s) from external sources
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

# ### Accessing Web Pages

from pattern.web import download

page_html = download('https://en.wikipedia.org/wiki/Artificial_intelligence',
                     unicode=True)

from pattern.web import URL, extension

page_url = URL(
    'https://upload.wikimedia.org/wikipedia/commons/f/f1/RougeOr_football.jpg')
file = open('football' + extension(page_url.page), 'wb')
file.write(page_url.download())
file.close()

# ### Finding URLs within Text

from pattern.web import find_urls

print(find_urls('To search anything, go to www.google.com', unique=True))

# ### Making Asynchronous Requests for Webpages

from pattern.web import asynchronous, time, Google

asyn_req = asynchronous(Google().search, 'artificial intelligence', timeout=4)
while not asyn_req.done:
    time.sleep(0.1)  # Poll until the background search finishes.
print(asyn_req.value)
Example #9
# This example downloads an image from Flickr (http://flickr.com).
# Acquiring the image data takes three Flickr queries:
# 1) Flickr.search() retrieves a list of results,
# 2) FlickrResult.url retrieves the image URL (behind the scenes),
# 3) FlickrResult.download() visits FlickrResult.url and downloads the content.

# It is a good idea to cache results from Flickr locally,
# which is what the cached=True parameter does.

# You should obtain your own license key at:
# http://www.flickr.com/services/api/
# Otherwise you will be sharing the default key with all users of pattern.web.
engine = Flickr(license=None)

q = "duracell bunny"
results = engine.search(q, size=MEDIUM, sort=RELEVANCY, cached=False)
for img in results:
    #print(img.url)  # Retrieving the actual image URL executes a query.
    print(img.text)
    print(img.author)
    print("")

# Download and save one of the images:
img = results[0]
data = img.download()
path = q.replace(" ", "_") + extension(img.url)
f = open(path, "wb")
f.write(data)
f.close()
print("Download: %s" % img.url)
print("Saved as: %s" % path)
Example #10
def save_image(url, figure):
    url = URL(url)
    with open('illustrations/' + figure + extension(url.page), 'wb') as f:
        f.write(url.download())
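For instance, reusing the Wikimedia image URL from Example #8 (the figure name is illustrative, and the illustrations/ directory must already exist):

save_image('https://upload.wikimedia.org/wikipedia/commons/f/f1/RougeOr_football.jpg',
           'football')  # writes illustrations/football.jpg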
Example #11
def isPDF(self, param):
    """Return 1 when the URL points at a PDF, 0 otherwise."""
    url = URL(param)
    return 1 if "pdf" in extension(url.page) else 0
Example #12
from pattern.web import URL, DOM, extension, MIMETYPE_IMAGE
from pattern.web import Element, download
import urllib.request
import datetime

# Libraries compared for downloading: urllib (legacy vs. modern), pattern, requests.
url = URL("http://www.dot.ca.gov/dist1/d1tmc/allcams.php")
dom = DOM(url.download(cached=True))
i = 0
try:
    for e in dom.by_tag('img'):
        if extension(e.attr['src']) == '.jpg':
            print(e.attr['src'])
            urllib.request.urlretrieve(e.attr['src'], "data/test/urllib{0}.jpg".format(i))
            #image = download(e.attr['src'], unicode=False, timeout=5)
            #f = open("data/test/pattern{0}.jpg".format(i), 'wb')
            #f.write(image)
            i += 1
except Exception as err:
    print("error: " + str(err))
"""
image = "http://www1.dot.ca.gov/cwwp2/data/d1/cctv/image/us101northofcushingcreeklookingsouth/us101northofcushingcreeklookingsouth.jpg"
url = URL(image)
print (url.mimetype in MIMETYPE_IMAGE)
urllib.request.urlretrieve(image, 'data/test2.jpg')
"""