Code example #1

# Imports required by this excerpt (not shown in the original snippet);
# get_dates, get_urls, and get_file_name come from elsewhere in the module.
import datetime
import json
from sys import argv


if __name__ == '__main__':
    ''' This script should be called in the following way:
    $ python wsj_article_urls.py 'startdate' 'enddate'
    '''

    start_date, end_date = argv[1], argv[2]
    start_datetime = datetime.datetime.strptime(start_date, '%Y-%m-%d')
    end_datetime = datetime.datetime.strptime(end_date, '%Y-%m-%d')

    # Re-derive the date strings from the datetimes on each pass so that the
    # week-by-week decrement at the bottom of the loop actually widens the
    # scraped window (as originally written, every pass re-scraped the same range).
    while True:
        start_date = start_datetime.strftime('%Y-%m-%d')
        end_date = end_datetime.strftime('%Y-%m-%d')
        print('Scraping WSJ URLs from {0} to {1}'.format(start_date, end_date))

        # Get dates to search over
        dates = get_dates(start_date, end_date)

        urls = set()

        for date in dates:
            print(date)
            urls = get_urls(date, urls)

        # Convert urls set to a list and write to a txt file
        file_path = '../url_files/{0}'.format(
            get_file_name('wsj', start_date, end_date))
        with open(file_path, 'w') as f:
            # The with block closes the file; no explicit close() is needed
            f.write(json.dumps(list(urls)))

        # Slide the window start back one week before the next pass. The
        # excerpt ends here; as shown, the loop runs until interrupted.
        start_datetime = start_datetime - datetime.timedelta(days=7)
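
The script leans on three helpers that the excerpt does not show. Below is a minimal sketch of what get_dates and get_file_name might look like, assuming day-granularity ISO date strings; these bodies are illustrative guesses, not the original implementations (get_urls, which does the actual network work, is left out).

def get_dates(start_date, end_date):
    # Illustrative guess: one 'YYYY-MM-DD' string per day in the range,
    # endpoints inclusive.
    start = datetime.datetime.strptime(start_date, '%Y-%m-%d')
    end = datetime.datetime.strptime(end_date, '%Y-%m-%d')
    return [(start + datetime.timedelta(days=i)).strftime('%Y-%m-%d')
            for i in range((end - start).days + 1)]


def get_file_name(source, start_date, end_date):
    # Illustrative guess: a descriptive name for the URL dump file.
    return '{0}_urls_{1}_to_{2}.txt'.format(source, start_date, end_date)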
Code example #2
# Imports required by this excerpt (not shown in the original snippet);
# get_dates and get_keywords_2016 come from elsewhere in the module.
from sys import argv

from pymongo import MongoClient


def concurrent_scrape_npr(tab, keywords, dates):
    # Only the tail of this function appears in the excerpt; the thread
    # setup above this point is elided. Each worker is assumed to expose a
    # `result` attribute holding its count of failed extractions.
    # ...
    num_bad_extractions = 0
    for thread in threads:
        num_bad_extractions += thread.result
    return num_bad_extractions


if __name__ == '__main__':
    ''' This script should be called in the following way:
    $ python npr_scraper.py 'startdate' 'enddate' 'table (optional)'
    '''

    # Create MongoClient
    client = MongoClient()
    # Initialize the Database
    db = client['election_analysis']
    # Initialize the collection: use the provided name if one was passed on
    # the command line, otherwise default to 'articles'
    if len(argv) > 3:
        tab = db[argv[3]]
    else:
        tab = db['articles']

    start_date, end_date = argv[1], argv[2]
    print('Scraping NPR from {0} to {1}'.format(start_date, end_date))

    dates = get_dates(start_date, end_date)
    keywords = get_keywords_2016()

    num_bad_extractions = concurrent_scrape_npr(tab, keywords, dates)

    print('NPR Scraping Done...')
    print('Number of Bad Extractions = {0}'.format(num_bad_extractions))
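
For reference, here is one way the thread setup elided from concurrent_scrape_npr could be written with concurrent.futures. This is a hypothetical sketch, not the original code: scrape_npr_date is a placeholder worker invented for illustration, and the original appears to use objects with a `result` attribute rather than futures.

from concurrent.futures import ThreadPoolExecutor


def scrape_npr_date(tab, keywords, date):
    # Placeholder worker: fetch and store the articles for one date and
    # return the number of failed extractions. Real body elided.
    return 0


def concurrent_scrape_npr_sketch(tab, keywords, dates, max_workers=8):
    # Scrape each date on its own thread and sum the per-date failure
    # counts, mirroring the tail of the original function above.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(scrape_npr_date, tab, keywords, date)
                   for date in dates]
        return sum(future.result() for future in futures)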