Example #1
def main():
    opts = parse_args()

    log_to_stderr(verbose=opts.verbose, quiet=opts.quiet)

    if opts.urls:
        all_urls = opts.urls
    elif environ.get('MORPH_URLS'):
        all_urls = set(filter(None, environ['MORPH_URLS'].split()))
    else:
        all_urls = set()

        for db_name in SOURCE_DBS:
            download_db(db_name)
            db = open_db(db_name)
            for table in show_tables(db):
                if table in SKIP_TABLES:
                    continue
                urls = select_urls(db, table)
                if urls:
                    log.info('read {} urls from {}.{}'.format(
                        len(urls), db_name, table))
                all_urls.update(urls)

    create_table_if_not_exists('url', with_scraper_id=False)

    dt = open_dt()
    failures = []  # list of (url, exception) tuples

    for i, url in enumerate(sorted(all_urls)):
        log.info('scraping {} ({} of {})'.format(
            url, i + 1, len(all_urls)))

        try:
            html = scrape(url)

            soup = BeautifulSoup(html, 'html.parser')
            row = dict(url=url, last_scraped=iso_now())
            row['twitter_handle'] = scrape_twitter_handle(
                soup, required=False)
            row['facebook_url'] = scrape_facebook_url(
                soup, required=False)

            log.debug('`url`: {}'.format(repr(row)))
            dt.upsert(row, 'url')
        except Exception as e:
            failures.append((url, e))
            print_exc()

    # show a summary of failures
    if failures:
        log.warn('Failed to scrape {} of {} URL{}:'.format(
            len(failures), len(all_urls),
            's' if len(failures) != 1 else ''))
        for url, e in failures:
            log.warn(u'  {}: {}'.format(url, repr(e)))

    if len(failures) > len(all_urls) * MAX_PROPORTION_FAILURES:
        raise Exception('too many failures')
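
Two helpers the loop relies on, iso_now() and select_urls(), are not defined in this example. A minimal sketch of what they might look like, assuming open_db() returns a plain sqlite3 connection and that source tables keep their URLs in a column literally named "url" (the real project may implement both differently):

import sqlite3
from datetime import datetime, timezone


def iso_now():
    # Current UTC time as an ISO 8601 string, e.g. for the last_scraped field.
    return datetime.now(timezone.utc).isoformat()


def select_urls(db, table):
    # Assumption: a source table stores its URLs in a column named "url";
    # tables without such a column contribute nothing.
    columns = {row[1] for row in db.execute(
        'PRAGMA table_info("{}")'.format(table))}
    if 'url' not in columns:
        return set()
    return {row[0] for row in db.execute(
        'SELECT url FROM "{}"'.format(table)) if row[0]}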
Example #2
def main():
    opts = parse_args()

    log_to_stderr(verbose=opts.verbose, quiet=opts.quiet)

    scraper_ids = opts.scraper_ids
    if not scraper_ids and environ.get('MORPH_COMPANY_SCRAPERS'):
        scraper_ids = environ['MORPH_COMPANY_SCRAPERS'].split(',')

    skip_scraper_ids = set(DISABLED_SCRAPERS)
    if environ.get('MORPH_SKIP_COMPANY_SCRAPERS'):
        skip_scraper_ids.update(
            environ['MORPH_SKIP_COMPANY_SCRAPERS'].split(','))

    use_decimal_type_in_sqlite()

    run_scrapers(get_records_from_company_scraper,
                 scraper_ids=scraper_ids,
                 skip_scraper_ids=skip_scraper_ids,
                 default_freq=DEFAULT_SCRAPE_FREQ)
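
use_decimal_type_in_sqlite() is likewise not shown above. A plausible sketch, assuming it registers sqlite3 adapter/converter functions so that decimal.Decimal values round-trip as text (connections would then need detect_types=sqlite3.PARSE_DECLTYPES and the relevant columns declared as DECIMAL; the real helper may use a different storage format):

import sqlite3
from decimal import Decimal


def use_decimal_type_in_sqlite():
    # Store Decimal values as their string form on write, and convert
    # DECIMAL-typed columns back to Decimal on read.
    sqlite3.register_adapter(Decimal, str)
    sqlite3.register_converter(
        'DECIMAL', lambda b: Decimal(b.decode('utf-8')))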