Ejemplo n.º 1
0
 def request(self):
     """Fetch this feed's URL with the User-Agent header set to GOOGLE_BOT_UA.

     The headers produced by ``prepare_headers(self.feed)`` have their
     ``'User-Agent'`` entry overwritten before being handed to ``jarr_get``.

     Returns:
         Whatever ``jarr_get`` returns for the feed URL.
     """
     spoofed_headers = prepare_headers(self.feed)
     # Per the original author's note: presenting a Google bot header is a
     # workaround to get Tumblr to serve its RSS.
     spoofed_headers['User-Agent'] = GOOGLE_BOT_UA

     target = self.get_url()
     return jarr_get(
         target,
         timeout=conf.crawler.timeout,
         user_agent=conf.crawler.user_agent,
         headers=spoofed_headers,
     )
Ejemplo n.º 2
0
 def request(self):
     """Fetch this feed's URL while sending a Googlebot User-Agent header.

     The headers produced by ``prepare_headers(self.feed)`` have their
     ``'User-Agent'`` entry overwritten with a hard-coded Googlebot string
     before being handed to ``jarr_get``.

     Returns:
         Whatever ``jarr_get`` returns for the feed URL.
     """
     # Per the original author's note: presenting a Google bot header is a
     # workaround to get Tumblr to serve its RSS.
     googlebot_ua = ("Mozilla/5.0 (compatible; Googlebot/2.1; "
                     "+http://www.google.com/bot.html)")

     spoofed_headers = prepare_headers(self.feed)
     spoofed_headers['User-Agent'] = googlebot_ua

     target = self.get_url()
     return jarr_get(
         target,
         timeout=conf.crawler.timeout,
         user_agent=conf.crawler.user_agent,
         headers=spoofed_headers,
     )
Ejemplo n.º 3
0
def try_get_icon_url(url, *splits):
    """Probe candidate icon URLs and return the first one that looks valid.

    Each non-``None`` entry of ``splits`` is combined with ``url`` through
    ``rebuild_url`` and fetched with ``jarr_get``. A candidate is accepted
    when the request succeeds, its ``content-type`` header does not contain
    ``html`` and its body is non-empty.

    Args:
        url: Base URL passed to ``rebuild_url`` for every candidate.
        *splits: Candidate parts handed to ``rebuild_url``; ``None`` entries
            are skipped.

    Returns:
        The ``url`` attribute of the first accepted response, or ``None``
        when no candidate is accepted (including when no splits are given).
    """
    for candidate_split in splits:
        if candidate_split is None:
            continue

        candidate_url = rebuild_url(url, candidate_split)
        try:
            reply = jarr_get(candidate_url, conf.crawler.timeout,
                             conf.crawler.user_agent)
            reply.raise_for_status()
            mime = reply.headers.get('content-type', '')
        except Exception:
            # Best effort: log the failure and move on to the next candidate.
            logger.exception('something went wrong while fetching %r',
                             candidate_url)
            continue

        # An HTML content-type is assumed to be a fancy 404 page, not an icon.
        if not reply.ok or 'html' in mime or not reply.content:
            continue
        return reply.url

    return None
Ejemplo n.º 4
0
 def request(self):
     """Fetch this feed's URL using the crawler configuration.

     The timeout and user agent come from ``conf.crawler``; the headers are
     built by ``prepare_headers`` from ``self.feed``.

     Returns:
         Whatever ``jarr_get`` returns for the feed URL.
     """
     target = self.get_url()
     crawler_timeout = conf.crawler.timeout
     crawler_agent = conf.crawler.user_agent
     feed_headers = prepare_headers(self.feed)
     return jarr_get(
         target,
         timeout=crawler_timeout,
         user_agent=crawler_agent,
         headers=feed_headers,
     )
Ejemplo n.º 5
0
 def http_get(url):
     """Fetch ``url`` with ``jarr_get``, retrying once on a timeout.

     The first attempt uses ``jarr_get``'s defaults. If it raises
     ``ReadTimeout`` or ``TimeoutError``, a single second attempt is made
     with ``GOOGLE_BOT_UA`` as the user agent; any exception from that
     second attempt propagates to the caller.

     Returns:
         Whatever ``jarr_get`` returns for the successful attempt.
     """
     try:
         reply = jarr_get(url)
     except (ReadTimeout, TimeoutError):
         # Second and last chance, this time with the Google bot user agent.
         reply = jarr_get(url, user_agent=GOOGLE_BOT_UA)
     return reply