Example #1
0
def detect_fetch_data(source):
    """Work out how the content behind ``source`` should be fetched.

    Strategies are tried in order and the first one that succeeds wins:

    1. ``twitter_source_fetch`` recognises the URL directly;
    2. the document at the URL parses as a feed itself;
    3. the page links to an RSS feed;
    4. a feed is served at ``<url>/?feed=rss`` (presumably the WordPress
       convention, going by the original variable name);
    5. the page links to a Twitter account.

    Args:
        source: object exposing ``fetch_url_override`` and ``url``; the two
            are passed to ``util.first_present`` to choose the URL to probe.

    Returns:
        A ``(fetch_data, feed)`` tuple. ``fetch_data`` is either whatever
        ``twitter_source_fetch`` returned or a ``{"type": "rss", "url": ...}``
        dict, and is ``None`` when nothing was detected. ``feed`` is the
        already-parsed feed when detection had to parse one anyway, otherwise
        ``None``.
    """
    url = util.first_present([source.fetch_url_override, source.url])

    twitter_data = twitter_source_fetch.twitter_fetch_data_from_url(url)
    if twitter_data:
        return twitter_data, None

    markup = util.url_fetch(url)
    if not markup:
        return None, None

    # Is this an rss feed itself?
    feed = parse_as_feed(markup)
    if feed:
        return {"type": "rss", "url": url}, feed

    # Try finding some linked rss.
    soup = bs4.BeautifulSoup(markup, 'lxml')
    feed_url = rss_tools.find_linked_rss(soup, url)
    if feed_url:
        return {"type": "rss", "url": feed_url}, None

    # Guess the "?feed=rss" endpoint. Strip trailing slashes first so a URL
    # that already ends in "/" does not produce "//?feed=rss".
    wp_rss_link = url.rstrip("/") + "/?feed=rss"
    wp_markup = util.url_fetch(wp_rss_link)
    # The guessed URL frequently does not exist; only parse when the fetch
    # actually produced something, as is done for the primary URL above.
    feed = parse_as_feed(wp_markup) if wp_markup else None
    if feed:
        return {"type": "rss", "url": wp_rss_link}, feed

    # Is there a twitter account linked?
    twitter_data = twitter_source_fetch.linked_twitter_fetch_data(soup)
    if twitter_data:
        return twitter_data, None

    return None, None
Example #2
0
def rss_fetch(data, feed_content):
    """Build a ``FetchResult`` from an RSS feed.

    Args:
        data: mapping with a ``'url'`` key holding the feed URL.
        feed_content: an already-parsed feed, or a falsy value to have the
            feed downloaded from ``data['url']`` and parsed here.

    Returns:
        ``FetchResult('rss', feed_title, entries)``, or ``None`` when no feed
        content could be obtained. Each entry is a dict with ``title``,
        ``url`` (resolved against the feed URL) and ``published`` (a naive
        ``datetime`` or ``None``), after being passed through the
        per-source entry processor. Entries lacking a link or a title are
        skipped.

    Note:
        The feed title is read with ``['feed']['title']``, so a parsed feed
        without a title raises on that lookup (``KeyError`` for dict-like
        feeds).
    """
    url = data['url']
    if not feed_content:
        markup = util.url_fetch(url)
        if markup:
            feed_content = parse_as_feed(markup)

    if not feed_content:
        return None

    source_entry_processor = create_source_entry_processor(url)
    feed_title = feed_content['feed']['title']
    entries = []
    for entry in feed_content['entries']:
        if 'link' not in entry or 'title' not in entry:
            continue

        link_url = urljoin(url, entry['link'].strip())

        # Use ``or`` rather than a .get() default so that a present-but-None
        # 'published_parsed' still falls back to 'updated_parsed'.
        pub_time = entry.get('published_parsed') or entry.get('updated_parsed')
        if pub_time:
            # Copy the calendar fields directly. Round-tripping through
            # mktime()/fromtimestamp() interprets the tuple as local time and
            # can shift the result by an hour around DST transitions.
            published = datetime.datetime(*pub_time[:6])
        else:
            published = None

        result_entry = {
            "title": entry['title'],
            "url": link_url,
            "published": published,
        }
        # The processor receives both the result dict and the raw entry and
        # may modify the result in place before it is collected.
        source_entry_processor(result_entry, entry)
        entries.append(result_entry)

    return FetchResult('rss', feed_title, entries)