Example n. 1
0
def run(limit = -1):
  if limit != -1 and store.count() > limit:
    print 'Limit: ' + str(limit) + ' Count: ' + str(store.count())
    print 'Voila, We are done !!'
    import sys
    sys.exit()
    return

  if swap.current.is_empty():
    if swap.next.is_empty() :
      print 'Somehow, We are done'
    else:
      # we exchange current and new, run again
      swap.exchange()
      return run(limit)
    

  #get the url from current-to be crawled list of urls
  url = swap.current.pop()
  
  if not url or not valid_url(url):
    print '"'+url+'" is not a valid url, giving up on it'
    return run(limit)

  f = Fetcher(url)

  next_urls = store.process(f.url(), f.child_urls())
  
  for url in next_urls:
    swap.next.push(url)

  return run(limit)
Example n. 2
0
  def __init__(self, url):
    """Remember *url* and reset all fetch state to empty.

    Raises Exception when *url* does not pass valid_url().
    """
    if not valid_url(url):
      raise Exception('Invalid url passed to fetcher: ' + url)

    self._url = url
    # All caches start out empty -- presumably populated lazily by the
    # accessor methods of this class (not visible in this chunk).
    self._parsed = self._origin = self._response = self._soup = None
    self._child_urls = []
Example n. 3
0
  def process_href(self, href):
    """Normalize a raw href scraped from the page at self.url().

    Returns a complete, valid url, or None when the href is empty,
    self-referential, or a same-page fragment link.

    Replaces the fragile `cond and a or b` ternary and the
    try/index/except prefix test with str.startswith -- equivalent
    behavior ('/#' contains '#', so the ValueError paths match),
    but expressed directly.
    """
    # Empty links and links pointing back at this very page carry no
    # new information.
    if len(href) == 0 or href == '/' or href == self.url():
      return None

    # Already a complete url: use it as-is.
    if valid_url(href):
      return href

    # Fragment links ('#...' or '/#...') stay on the same page; we don't
    # want the same url among the children.
    if href.startswith('#') or href.startswith('/#'):
      return None

    # href is a path segment relative to this page's origin: make it
    # absolute and re-validate through the recursive call.
    if not href.startswith('/'):
      href = '/' + href
    return self.process_href(self.origin() + href)
Example n. 4
0
def get_main_image_from_urls(urls, title=''):
    """Pick a representative image url from *urls*.

    urls: iterable of (image_url, alt_text) pairs.
    title: page title; an image whose alt text is similar enough to the
      title (> ALT_SIMILARITY) is returned immediately.
    Returns the chosen url, or '' when nothing qualifies.

    NOTE(review): the ``try:`` opened below has its handler beyond this
    chunk -- the function continues past the visible lines.
    """
    try:
        valid_urls = []
        # No candidates at all: nothing to choose from.
        if len(urls) == 0:
            return ''

        # First pass: keep only urls that util.valid_url accepts (it
        # appears to return '' for invalid input), and short-circuit on
        # an image whose alt text matches the page title.
        for u in urls:
            url = util.valid_url(u[0])

            if url != '':
                valid_urls.append(url)

                if similarity.get_similarity(u[1], title) > ALT_SIMILARITY:
                    return url
                    # NOTE(review): unreachable -- the return above exits first
                    break

        # Second pass: fetch each surviving candidate and accept the
        # first one that is large enough and not too elongated.
        for url in valid_urls:

            try:

                if url != '':
                    result = urlfetch.fetch(url)
                    if result.status_code == 200:
                        file = cStringIO.StringIO(result.content)
                        im = Image.open(file)
                        # Content-Length header may be missing or empty;
                        # normalized to 0 below.
                        size = result.headers["Content-Length"]
                        # NOTE(review): PIL's Image.size is (width, height);
                        # unpacking as (height, width) swaps the two, so
                        # get_size_ratio() receives them reversed -- confirm
                        # whether that is intended.
                        height, width = im.size
                        if not size or size == '':
                            size = 0
                        #print get_size_ratio(height, width)
                        #print url
                        if int(size) > IMAGE_SIZE and get_size_ratio(height, width) < IMAGE_SIZE_RATIO:
                            return url
                            # NOTE(review): unreachable -- the return above exits first
                            break

            except Exception, ex:
                # Best-effort: any bad fetch or unreadable image skips to
                # the next candidate.
                logging.error('get_main_image_from_urls: %s' % ex.message)
                continue

        return ''