Code example #1
0
def _source_fetch(source):
    """Fetch and parse `source`, returning the first successful fetch result.

    Each candidate fetch function (from fetch_functions_for_source) may
    report a result synchronously via the got_result callback and/or
    register asynchronous RPCs via add_rpc; all RPCs are waited on before
    the results are inspected.

    Returns the first truthy result (with .brand populated), or None when
    the page could not be fetched or no method produced a result.
    """
    url = util.first_present([source.fetch_url_override, source.url])
    markup = url_fetch(url)
    if not markup:
        # Fetch failed entirely; nothing to parse.
        print("URL error fetching {0}".format(source.url))
        return None

    results = []
    rpcs = []

    def got_result(res):
        # Only keep truthy results; the first one wins below.
        if res:
            results.append(res)

    def add_rpc(rpc):
        rpcs.append(rpc)

    for fn in fetch_functions_for_source(source):
        fn(source, markup, url, add_rpc, got_result)

    # Drain the RPC queue; callbacks may enqueue more RPCs while we wait,
    # so iterate until the list is empty rather than snapshotting it.
    while rpcs:
        rpcs[0].wait()
        del rpcs[0]

    result = results[0] if results else None
    if result:
        debug("SF: Fetched {0} as {1} source with {2} entries".format(url, result.method, len(result.entries)))
        debug("SF: starting brand fetch")
        result.brand = extract_brand(markup, source.url)
        debug("SF: done with brand fetch")
    else:
        warning("SF: Couldn't fetch {0} using any method".format(url))
    return result
Code example #2
0
def detect_fetch_data(source):
    """Work out how `source` should be fetched.

    Returns a (fetch_data, feed) pair: fetch_data is a dict describing the
    fetch method (or None when no method was found); feed is a parsed feed
    when one was obtained as a side effect, so the caller can reuse it
    instead of re-fetching.
    """
    url = util.first_present([source.fetch_url_override, source.url])

    # Is the URL itself a twitter source?
    tw = twitter_source_fetch.twitter_fetch_data_from_url(url)
    if tw:
        return tw, None

    page = util.url_fetch(url)
    if not page:
        return None, None

    # The URL may point directly at an RSS feed:
    parsed = parse_as_feed(page)
    if parsed:
        return {"type": "rss", "url": url}, parsed

    # Otherwise look for an RSS feed linked from the page markup:
    doc = bs4.BeautifulSoup(page, 'lxml')
    linked = rss_tools.find_linked_rss(doc, url)
    if linked:
        return {"type": "rss", "url": linked}, None

    # WordPress convention: probe the ?feed=rss endpoint.
    wp_url = url + "/?feed=rss"
    parsed = parse_as_feed(util.url_fetch(wp_url))
    if parsed:
        return {"type": "rss", "url": wp_url}, parsed

    # Last resort: a twitter account linked from the page.
    tw = twitter_source_fetch.linked_twitter_fetch_data(doc)
    if tw:
        return tw, None

    return None, None
Code example #3
0
 def fetch_normal():
     """Fetch `url` (a closure variable) as HTML and populate the enclosing
     `article` and `content` objects from its markup.

     Returns True when the page was fetched and processed, False when the
     fetch failed or the response was not text/html.
     NOTE(review): relies on names from the enclosing scope (url, article,
     content, plus various helpers) -- not usable standalone.
     """
     response = url_fetch(url, return_response_obj=True)
     # print 'INFO', response.info()
     # Only accept text/html; strip any ";charset=..." suffix before comparing.
     if response and response.info().getheader('content-type', 'text/html').lower().split(';')[0].strip() == 'text/html':
         markup = response.read()
     else:
         print 'BAD MIME TYPE' if response else 'NO SUCCESSFUL RESPONSE'
         markup = None
 
     if markup:
         # process markup: pull OpenGraph metadata and <title> from the page.
         markup_soup = BeautifulSoup(markup, 'lxml')
         og_title = find_meta_value(markup_soup, 'og:title')
         og_image = find_meta_value(markup_soup, 'og:image')
         og_description = find_meta_value(markup_soup, 'og:description')
         title_field = find_title(markup_soup)
     
         article.site_name = find_meta_value(markup_soup, 'og:site_name')
     
         # find author:
         article.author = find_author(markup_soup)
     
         # parse and process article content:
         # NOTE(review): doc_soup appears unused in this block -- confirm
         # whether the enclosing scope reads it.
         content.html = article_extractor.extract(markup, article.url)
         doc_soup = BeautifulSoup(content.html, 'lxml')
     
         # Prefer og:title, then the <title> tag, then any pre-existing title.
         article.title = first_present([og_title, title_field, article.title])
         article.top_image = make_url_absolute(first_present([article.top_image, og_image]))
     
         populate_article_json(article, content)
     
         # compute description: prefer og:description, else the first ~40
         # words of the extracted article text; collapse whitespace runs.
         description = None
         if og_description and len(og_description.strip()):
             description = truncate(og_description.strip(), words=40)
         elif content.text and len(content.text.strip()) > 0:
             description = truncate(content.text, words=40)
         article.description = re.sub(r"[\r\n\t ]+", " ", description).strip() if description else None
             
         return True
     else:
         return False