return bool(tab.find({'url': url}).count()) if __name__=='__main__': ''' This script should be called in the following way: $ python gua_scraper.py 'startdate' 'enddate' 'table (optional)' ''' # Create MongoClient client = MongoClient() # Initialize the Database db = client['election_analysis'] # Initialize table # If a table name has been provided use that, otherwise initialize 'articles' table if len(argv) > 3: tab = db[argv[3]] else: tab = db['articles'] start_date, end_date = argv[1], argv[2] print 'Scraping the Guardian from {0} to {1}'.format(start_date, end_date) dates = get_week_tuples(start_date, end_date) searchterms = get_keywords_2016() for searchterm in searchterms: print searchterm # Guardian doesn't like the ' in o'malley if searchterm != "o'malley": scrape_guardian(tab, searchterm, dates)
# Parse the inclusive end of the scraping window; start_datetime is
# assigned the same way above this chunk.
end_datetime = datetime.datetime.strptime(end_date, '%Y-%m-%d')

# Create MongoClient
client = MongoClient()
# Initialize the Database
db = client['election_analysis']

# Walk backwards through time one week at a time, scraping each window.
# NOTE(review): the loop has no exit condition -- presumably meant to
# backfill until killed by the operator; confirm that is intended.
while True:
    # Initialize table.
    # If a table name has been provided use that, otherwise derive one
    # from the current window, e.g. 'gua_20160101_20160108'.
    if len(argv) > 3:
        tab = db[argv[3]]
    else:
        tab = db['gua_' + start_datetime.strftime('%Y%m%d') + '_'
                 + end_datetime.strftime('%Y%m%d')]

    # Single-argument parenthesized print is equivalent under Python 2
    # and also valid Python 3.
    print('Scraping the Guardian from {0} to {1}'.format(
        start_date, end_date))

    dates = get_week_tuples(start_date, end_date)
    searchterms = get_keywords_2016()

    for searchterm in searchterms:
        print(searchterm)
        # Guardian doesn't like the ' in o'malley
        if searchterm != "o'malley":
            scrape_guardian(tab, searchterm, dates)

    # Slide the whole window back one week and re-derive the string
    # forms used for table naming and logging.
    start_datetime = start_datetime - datetime.timedelta(days=7)
    end_datetime = end_datetime - datetime.timedelta(days=7)
    start_date = start_datetime.strftime('%Y-%m-%d')
    end_date = end_datetime.strftime('%Y-%m-%d')
print 'Done.' print '\n' def already_exists(tab, url): return bool(tab.find({'url': url}).count()) if __name__ == '__main__': ''' This script should be called in the following way: $ python nyt_scraper.py 'startdate' 'enddate' 'table (optional)' ''' # Create MongoClient client = MongoClient() # Initialize the Database db = client['election_analysis'] # Initialize table # If a table name has been provided use that, otherwise initialize 'articles' table if len(argv) > 3: tab = db[argv[3]] else: tab = db['articles'] keywords = get_keywords_2016() start_date, end_date = argv[1], argv[2] dates = get_week_tuples(start_date, end_date, strf='%Y%m%d') for searchterm in keywords: scrape_nyt(tab, searchterm, dates)