        $ python wsj_article_urls.py 'startdate' 'enddate'
    '''
    start_date, end_date = argv[1], argv[2]
    start_datetime = datetime.datetime.strptime(start_date, '%Y-%m-%d')
    end_datetime = datetime.datetime.strptime(end_date, '%Y-%m-%d')
    while True:
        print('Scraping WSJ URLs from {0} to {1}'.format(start_date, end_date))
        # Get dates to search over
        dates = get_dates(start_date, end_date)
        urls = set()
        for date in dates:
            print(date)
            urls = get_urls(date, urls)
        # Convert urls set to a list and write to a txt file
        file_path = '../url_files/{0}'.format(
            get_file_name('wsj', start_date, end_date))
        with open(file_path, 'w') as f:
            f.write(json.dumps(list(urls)))
        start_datetime = start_datetime - datetime.timedelta(days=7)
        end_datetime = end_datetime - datetime.timedelta(days=7)
        start_date = start_datetime.strftime('%Y-%m-%d')
        end_date = end_datetime.strftime('%Y-%m-%d')
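# NOTE: get_dates() and get_file_name() are project helpers imported from elsewhere
# and do not appear in this excerpt. The sketches below are hypothetical
# reconstructions based only on how they are called above -- a list of dates to
# iterate over, and a per-source filename for the url dump -- so the real helpers
# (and the exact filename/date formats) may differ.
import datetime


def get_dates(start_date, end_date):
    '''Return every date from start_date to end_date (inclusive) as 'YYYY-MM-DD' strings.'''
    start = datetime.datetime.strptime(start_date, '%Y-%m-%d')
    end = datetime.datetime.strptime(end_date, '%Y-%m-%d')
    return [(start + datetime.timedelta(days=i)).strftime('%Y-%m-%d')
            for i in range((end - start).days + 1)]


def get_file_name(source, start_date, end_date, bad=False):
    '''Build a url-file name such as 'wsj_2016-01-01_2016-01-07_urls.txt' (assumed convention).'''
    suffix = '_bad_urls.txt' if bad else '_urls.txt'
    return '{0}_{1}_{2}{3}'.format(source, start_date, end_date, suffix)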
if __name__ == '__main__':
    '''
    This script should be called in the following way:
        $ python breitbart_all_urls.py
    '''
    print('Scraping BB')
    # Collect the urls returned by the Breitbart search pages
    did_it_work, urls = get_urls_from_search(dates)
    print('BB Scraping Done...')
    print('There were a total of {0} urls'.format(len(urls)))
    if did_it_work:
        # Convert urls set to a list and write to a txt file
        file_path = '../url_files/{0}'.format(get_file_name('bb', start_date, end_date))
        with open(file_path, 'w') as f:
            f.write(json.dumps(list(urls)))
    else:
        print("Didn't work")
    start_datetime = start_datetime - datetime.timedelta(days=7)
    end_datetime = end_datetime - datetime.timedelta(days=7)
    start_date = start_datetime.strftime('%Y-%m-%d')
    end_date = end_datetime.strftime('%Y-%m-%d')
if __name__ == '__main__':
    '''
    This script should be called in the following way:
        $ python fox_scraper.py 'startdate' 'enddate' 'table (optional)'
    '''
    # Create MongoClient
    client = MongoClient()
    # Initialize the Database
    db = client['election_analysis']
    # Initialize table
    # If a table name has been provided use that, otherwise initialize the 'articles' table
    if len(argv) > 3:
        tab = db[argv[3]]
    else:
        tab = db['articles']
    start_date, end_date = argv[1], argv[2]
    print('Scraping FOX URLs from {0} to {1}'.format(start_date, end_date))
    file_path = '../url_files/{0}'.format(
        get_file_name('fox', start_date, end_date))
    urls = load_urls(file_path)
    bad_urls = []
    for url in urls:
        result = add_to_mongo(tab, url)
        if result:
            bad_urls.append(result)
    print('FOX Scraping Done...')
    print('Number of Bad Extractions = {0}'.format(len(bad_urls)))
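# NOTE: load_urls() and add_to_mongo() are also project helpers that are not part of
# this excerpt. The sketches below are assumptions inferred from their call sites:
# load_urls() reads the JSON list written by the url scripts, and add_to_mongo()
# extracts one article and inserts it into the given collection, returning the url
# on failure (which is why failures are collected above). The real implementations
# and stored field names may differ.
import json

import requests
from bs4 import BeautifulSoup


def load_urls(file_path):
    '''Read a JSON-encoded list of article urls from file_path.'''
    with open(file_path) as f:
        return json.loads(f.read())


def add_to_mongo(tab, url):
    '''Fetch url, pull out the paragraph text, and insert a document into collection tab.
    Returns the url if anything goes wrong so the caller can record it.'''
    try:
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')
        article_text = ' '.join(p.get_text() for p in soup.find_all('p'))
        tab.insert_one({'url': url, 'article_text': article_text})
        return None
    except Exception:
        return url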
print('There were a total of {0} failed searches'.format(len(bad_searches)))

# If there were any bad searches, make a few attempts to redo them in a non-threaded way
attempt = 0
while attempt < 3 and len(bad_searches) > 0:
    # This gives us a list of (searchterm, date) tuples to re-search over
    searchterms_and_dates = list(bad_searches)
    # Reset our bad_searches to an empty set
    bad_searches = set()
    # Create a Firefox driver
    driver = webdriver.Firefox()
    for searchterm, date in searchterms_and_dates:
        good_urls, bad_searches = get_urls(driver, searchterm, [date], good_urls, bad_searches)
    driver.close()
    attempt += 1
    print('Total of {0} failed searches after attempt {1}'.format(len(bad_searches), attempt))

# Convert good_urls set to a list and write to a txt file
file_path = '../url_files/{0}'.format(get_file_name('fox', start_date, end_date))
with open(file_path, 'w') as f:
    f.write(json.dumps(list(good_urls)))

# If there are any bad searches, print how many there were and write them to a file for review
print('Number of Bad Searches = {0}'.format(len(bad_searches)))
if len(bad_searches):
    file_path = '../url_files/{0}'.format(get_file_name('fox', start_date, end_date, bad=True))
    with open(file_path, 'w') as f:
        f.write(json.dumps(list(bad_searches)))
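# NOTE: get_urls() (the Selenium version being retried above) is defined elsewhere in
# this script. The sketch below only illustrates the contract the retry loop relies
# on: it returns the updated (good_urls, bad_searches) pair and records a
# (searchterm, date) tuple whenever a search page cannot be scraped. The search URL
# and CSS selector here are placeholders, not Fox News's real ones.
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By


def get_urls(driver, searchterm, dates, good_urls, bad_searches):
    for date in dates:
        try:
            # Placeholder search url; the real script builds the Fox search url here
            driver.get('https://example.com/search?q={0}&date={1}'.format(searchterm, date))
            for link in driver.find_elements(By.CSS_SELECTOR, 'a.article-link'):
                good_urls.add(link.get_attribute('href'))
        except WebDriverException:
            # Remember the failed (searchterm, date) pair so the caller can retry it
            bad_searches.add((searchterm, date))
    return good_urls, bad_searches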
                add_url = True
            elif keyword in article.text.lower():
                add_url = True
        if add_url:
            urls.add(article.find('a').get('href'))
    return urls


if __name__ == '__main__':
    '''
    This script should be called in the following way:
        $ python wsj_article_urls.py 'startdate' 'enddate'
    '''
    keywords = get_keywords_2016()
    start_date, end_date = argv[1], argv[2]
    print('Scraping WSJ URLs from {0} to {1}'.format(start_date, end_date))
    # Get dates to search over
    dates = get_dates(start_date, end_date)
    urls = set()
    for date in dates:
        urls = get_urls(date, keywords, urls)
    # Convert urls set to a list and write to a txt file
    file_path = '../url_files/{0}'.format(get_file_name('wsj', start_date, end_date))
    with open(file_path, 'w') as f:
        f.write(json.dumps(list(urls)))
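# NOTE: get_keywords_2016() is another helper that is not shown in this excerpt. The
# keyword check above compares against lowercased article text, so the sketch below
# returns lowercase terms; the values are illustrative placeholders, not the
# project's actual list.
def get_keywords_2016():
    '''Return lowercase keywords used to decide whether a WSJ article is election-related.'''
    return ['trump', 'clinton', 'election']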
# attempt = 0
# while attempt < 3 and len(bad_searches) > 0:
#     # This gives us a list of (searchterm, date) tuples to re-search over
#     searchterms_and_dates = list(bad_searches)
#     # Reset our bad_searches to an empty set
#     bad_searches = set()
#     # Create a Firefox driver
#     driver = webdriver.Firefox()
#     for searchterm, date in searchterms_and_dates:
#         good_urls, bad_searches = get_urls(driver, searchterm, [date], good_urls, bad_searches)
#     driver.close()
#     attempt += 1
#     print('Total of {0} failed searches after attempt {1}'.format(len(bad_searches), attempt))

# Convert good_urls set to a list and write to a txt file
file_path = '../url_files/{0}'.format(get_file_name('fox', start_date, end_date))
with open(file_path, 'w') as f:
    f.write(json.dumps(list(good_urls)))

# If there are any bad searches, print how many there were and write them to a file for review
print('Number of Bad Searches = {0}'.format(len(bad_searches)))
if len(bad_searches):
    file_path = '../url_files/{0}'.format(get_file_name('fox', start_date, end_date, bad=True))
    with open(file_path, 'w') as f:
        f.write(json.dumps(list(bad_searches)))

start_datetime = start_datetime - datetime.timedelta(days=7)
end_datetime = end_datetime - datetime.timedelta(days=7)
start_date = start_datetime.strftime('%Y-%m-%d')
end_date = end_datetime.strftime('%Y-%m-%d')
client = MongoClient()
# Initialize the Database
db = client['election_analysis']
while True:
    # Initialize table
    # If a table name has been provided use that, otherwise initialize a dated 'nyt_' table
    if len(argv) > 3:
        tab = db[argv[3]]
    else:
        tab = db['nyt_' + start_datetime.strftime('%Y%m%d') + '_' +
                 end_datetime.strftime('%Y%m%d')]
    print('Scraping NYT URLs from {0} to {1}'.format(start_date, end_date))
    file_path = '../url_files/{0}'.format(
        get_file_name('nyt', start_date, end_date))
    urls = load_urls(file_path)
    bad_urls = []
    for url in urls:
        result = add_to_mongo(tab, url)
        if result:
            bad_urls.append(result)
    print('NYT Scraping Done...')
    print('Number of Bad Extractions = {0}'.format(len(bad_urls)))
    start_datetime = start_datetime - datetime.timedelta(days=7)
    end_datetime = end_datetime - datetime.timedelta(days=7)
    start_date = start_datetime.strftime('%Y-%m-%d')
    end_date = end_datetime.strftime('%Y-%m-%d')
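# NOTE: a quick sanity check of what the scraper runs above leave behind in MongoDB.
# The collection names depend on the command-line arguments used, so this just lists
# whatever exists in the 'election_analysis' database.
from pymongo import MongoClient

client = MongoClient()
db = client['election_analysis']
for name in db.list_collection_names():
    # Print each collection (e.g. 'articles' or a dated 'nyt_...' table) and its document count
    print(name, db[name].count_documents({}))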