Code Example #1
File: Scraper.py  Project: Fablr/Feed-Scraper
def serial_main(daemon_mode):
    cache = CorgiCache()

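    # In daemon mode, rescan the cache indefinitely; otherwise make a single pass.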
    while True:
        feeds = cache.get_all_feeds()

        for feed in feeds:
            scrap_feed(feed)

        if not daemon_mode:
            break
    return
Code Example #2
File: Scraper.py  Project: Fablr/Feed-Scraper
from multiprocessing import Pool  # stdlib; CorgiCache and scrap_feed are defined elsewhere in Scraper.py


def async_main(daemon_mode):
    cache = CorgiCache()

    while True:
        # Create a fresh pool on every pass: after close()/join() a Pool
        # cannot accept new work, so reusing one across daemon-mode
        # iterations would raise ValueError.
        pool = Pool()
        feeds = cache.get_all_feeds()

        for feed in feeds:
            # args must be an iterable of positional arguments; passing the
            # feed object bare would try to unpack it element by element.
            pool.apply_async(func=scrap_feed, args=(feed,))

        pool.close()
        pool.join()

        if not daemon_mode:
            break
    return
Code Example #3
File: Scraper.py  Project: Fablr/Feed-Scraper
import logging
from email.utils import formatdate  # RFC 2822 timestamp used for the CRAWLED field


def scrap_feed(feed):
    etag = ""
    last_crawled = ""

    cache = CorgiCache()
    tokens = cache.get_token(use='scraper')

    if 'URL' not in feed:
        logging.error("no URL for feed {0}".format(feed))
        return

    if 'ETAG' in feed:
        etag = feed['ETAG']

    if 'CRAWLED' in feed:
        last_crawled = feed['CRAWLED']

    url = feed['URL']

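    # The cached ETag and last crawl time let the parser skip feeds that have not changed.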
    parser = PodcastFeedParser(url=url, etag=etag, last_request=last_crawled)
    feed['CRAWLED'] = formatdate()

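    # A feed can announce a permanent new URL; if so, store it and re-parse from the new location.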
    try:
        if parser.has_new_feed():
            logging.info("new feed for {0}".format(url))

            url = parser.get_new_feed()
            logging.info("new feed is {0}".format(url))

            feed['URL'] = url
            feed.save()
            parser = PodcastFeedParser(url=url)
            feed['CRAWLED'] = formatdate()

        if parser.get_blocked():
            logging.warning("feed blocked for {0}".format(url))

        try:
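            # Upsert the publisher: look it up by name and create it when absent.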
            publisher = parser.get_owner()
            pub_filter = {'name': publisher['name']}
            data = get_data(table_name='publisher', data_filter=pub_filter, token=tokens['TOKEN'])
            data = data.json()
            if not data:
                data = post_data(table_name='publisher', data=publisher, token=tokens['TOKEN'])

                data = data.json()
                if 'id' not in data:
                    raise IOError("publisher POST returned no id")
            else:
                data = data[0]

            publisher_id = data['id']
            title = parser.get_title()
            author = parser.get_author()
            summary = parser.get_summary()
            category = parser.get_category()
            explicit = parser.get_explicit()
            link = parser.get_link()
            podcast_copyright = parser.get_copyright()
            blocked = parser.get_blocked()
            complete = parser.get_complete()
            keywords = parser.get_keywords()

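            # Upsert the podcast, keyed on (publisher, title).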
            pod_filter = {'publisher': publisher_id,
                          'title': title}

            data = get_data(table_name='podcast', data_filter=pod_filter, token=tokens['TOKEN'])
            data = data.json()
            if not data:
                podcast = {'publisher': publisher_id,
                           'title': title,
                           'author': author,
                           'summary': summary,
                           'category': category,
                           'explicit': explicit,
                           'link': link,
                           'copyright': podcast_copyright,
                           'blocked': blocked,
                           'complete': complete,
                           'keywords': keywords}

                data = post_data(table_name='podcast', data=podcast, token=tokens['TOKEN'])

                data = data.json()
                if 'id' not in data:
                    raise IOError("podcast POST returned no id")
            else:
                data = data[0]

            podcast_id = data['id']

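            # Only request episodes whose GUIDs have not been stored from earlier crawls.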
            guids = []
            if 'GUIDS' in feed:
                guids = feed['GUIDS']
                episodes = parser.get_new_episodes(guids)
            else:
                episodes = parser.get_all_episodes()

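            # POST each new episode under its podcast id; the guid is kept locally and stripped before the POST.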
            for episode in episodes:
                guids.append(episode['guid'])
                episode['podcast'] = podcast_id
                del episode['guid']
                post_data(table_name='episode', data=episode, token=tokens['TOKEN'])

            feed['GUIDS'] = guids

        except IOError:
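            # Any failed publisher/podcast/episode request abandons this feed without saving.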
            return

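        # Store the new ETag if the parser can provide one; failure here is non-fatal.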
        try:
            feed['ETAG'] = parser.get_etag()
        except IOError:
            pass

        feed.save()
        logging.info("finished scraping, {0}".format(url))
    except IOError:
        return

    return
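
Usage note: neither entry point is shown being invoked above. A minimal sketch of how they might be wired up follows; the argparse flags (--daemon, --serial) are assumptions for illustration, not part of the Fablr/Feed-Scraper project.

import argparse
import logging

if __name__ == '__main__':
    # Hypothetical wiring: the real Scraper.py may select a mode differently.
    arg_parser = argparse.ArgumentParser(description="Fablr feed scraper")
    arg_parser.add_argument('--daemon', action='store_true',
                            help="keep rescanning feeds instead of exiting after one pass")
    arg_parser.add_argument('--serial', action='store_true',
                            help="scrape feeds sequentially instead of in a process pool")
    args = arg_parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    if args.serial:
        serial_main(args.daemon)
    else:
        async_main(args.daemon)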