if __name__ == '__main__':
    ''' This script should be called in the following way:
    $ python wsj_article_urls.py 'startdate' 'enddate'
    '''

    start_date, end_date = argv[1], argv[2]
    start_datetime = datetime.datetime.strptime(start_date, '%Y-%m-%d')
    end_datetime = datetime.datetime.strptime(end_date, '%Y-%m-%d')

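    # Step backward through time one week per pass, scraping each window in turn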
    while True:
        print('Scraping WSJ URLs from {0} to {1}'.format(start_date, end_date))

        # Get dates to search over
        dates = get_dates(start_date, end_date)

        urls = set()

        for date in dates:
            print(date)
            urls = get_urls(date, urls)

        # Convert urls set to a list and write to a txt file
        file_path = '../url_files/{0}'.format(
            get_file_name('wsj', start_date, end_date))
        with open(file_path, 'w') as f:
            f.write(json.dumps(list(urls)))

        start_datetime = start_datetime - datetime.timedelta(days=7)
        end_datetime = end_datetime - datetime.timedelta(days=7)
        start_date = start_datetime.strftime('%Y-%m-%d')
        end_date = end_datetime.strftime('%Y-%m-%d')


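# get_dates is used above but not shown in these fragments. A minimal sketch,
# assuming it returns one 'YYYY-MM-DD' string per day in the window (it relies
# on the same datetime import the surrounding scripts already use):
def get_dates(start_date, end_date):
    start = datetime.datetime.strptime(start_date, '%Y-%m-%d')
    end = datetime.datetime.strptime(end_date, '%Y-%m-%d')
    dates = []
    while start <= end:
        dates.append(start.strftime('%Y-%m-%d'))
        start += datetime.timedelta(days=1)
    return dates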

if __name__ == '__main__':
    ''' This script should be called in the following way:
    $ python breitbart_all_urls.py
    '''

    print('Scraping BB')
    # get_urls_from_search returns a success flag and the set of scraped URLs
    did_it_work, urls = get_urls_from_search(dates)

    print('BB Scraping Done...')
    print('There were a total of {0} urls'.format(len(urls)))

    if did_it_work:
        # Convert good_urls set to a list and write to a txt file
        file_path = '../url_files/{0}'.format(get_file_name('bb', start_date, end_date))
        with open(file_path, 'w') as f:
            f.write(json.dumps(list(urls)))
    else:
        print("Didn't work")

    start_datetime = start_datetime - datetime.timedelta(days=7)
    end_datetime = end_datetime - datetime.timedelta(days=7)
    start_date = start_datetime.strftime('%Y-%m-%d')
    end_date = end_datetime.strftime('%Y-%m-%d')


if __name__ == '__main__':
    ''' This script should be called in the following way:
    $ python fox_scraper.py 'startdate' 'enddate' 'table (optional)'
    '''
    # Create MongoClient
    client = MongoClient()
    # Initialize the Database
    db = client['election_analysis']
    # Initialize table
    # If a table name has been provided use that, otherwise initialize 'articles' table
    if len(argv) > 3:
        tab = db[argv[3]]
    else:
        tab = db['articles']

    start_date, end_date = argv[1], argv[2]
    print('Scraping FOX URLs from {0} to {1}'.format(start_date, end_date))

    file_path = '../url_files/{0}'.format(
        get_file_name('fox', start_date, end_date))
    urls = load_urls(file_path)

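    # Collect any URLs whose article extraction/insert failed so they can be reviewed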
    bad_urls = []
    for url in urls:
        result = add_to_mongo(tab, url)
        if result:
            bad_urls.append(result)

    print('FOX Scraping Done...')
    print('Number of Bad Extractions = {0}'.format(len(bad_urls)))
    print('There were a total of {0} failed searches'.format(len(bad_searches)))

    # If there were any bad searches, make a few attempts to redo them in a non-threaded way
    attempt = 0
    while attempt < 3 and len(bad_searches) > 0:
        # This will give us a tuple of (searchterm, date tuple) to research over
        searchterms_and_dates = list(bad_searches)
        # Reset our bad_searches to an empty set
        bad_searches = set()
        # Create a Firefox driver
        driver = webdriver.Firefox()
        for searchterm, date in searchterms_and_dates:
            good_urls, bad_searches = get_urls(driver, searchterm, [date], good_urls, bad_searches)
        driver.close()
        attempt += 1
        print('Total of {0} failed searches after attempt {1}'.format(len(bad_searches), attempt))

    # Convert good_urls set to a list and write to a txt file
    file_path = '../url_files/{0}'.format(get_file_name('fox', start_date, end_date))
    with open(file_path, 'w') as f:
        f.write(json.dumps(list(good_urls)))

    # If there were any bad searches, record how many and write them to a file for review
    print('Number of Bad Searches = {0}'.format(len(bad_searches)))
    if len(bad_searches):
        file_path = '../url_files/{0}'.format(get_file_name('fox', start_date, end_date, bad=True))
        with open(file_path, 'w') as f:
            f.write(json.dumps(list(bad_searches)))
                add_url = True
            elif keyword in article.text.lower():
                add_url = True
        if add_url:
            urls.add(article.find('a').get('href'))
    return urls

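# get_file_name is used throughout but not shown in these fragments. A minimal
# sketch, assuming a naming scheme of '<source>_<start>_<end>.txt' with a
# '_bad' marker for failed searches; the real convention may differ:
def get_file_name(source, start_date, end_date, bad=False):
    suffix = '_bad' if bad else ''
    return '{0}_{1}_{2}{3}.txt'.format(source, start_date, end_date, suffix)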

if __name__ == '__main__':
    ''' This script should be called in the following way:
    $ python wsj_article_urls.py 'startdate' 'enddate'
    '''

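    # Election-cycle keywords used to decide whether an article URL is kept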
    keywords = get_keywords_2016()
    start_date, end_date = argv[1], argv[2]
    print('Scraping WSJ URLs from {0} to {1}'.format(start_date, end_date))

    # Get dates to search over
    dates = get_dates(start_date, end_date)

    urls = set()

    for date in dates:
        urls = get_urls(date, keywords, urls)

    # Convert urls set to a list and write to a txt file
    file_path = '../url_files/{0}'.format(get_file_name('wsj', start_date, end_date))
    with open(file_path, 'w') as f:
        f.write(json.dumps(list(urls)))
        # attempt = 0
        # while attempt < 3 and len(bad_searches) > 0:
        #     # This will give us a tuple of (searchterm, date tuple) to research over
        #     searchterms_and_dates = list(bad_searches)
        #     # Reset our bad_searches to an empty set
        #     bad_searches = set()
        #     # Create a Firefox driver
        #     driver = webdriver.Firefox()
        #     for searchterm, date in searchterms_and_dates:
        #         good_urls, bad_searches = get_urls(driver, searchterm, [date], good_urls, bad_searches)
        #     driver.close()
        #     attempt += 1
        #     print('Total of {0} failed searches after attempt {1}'.format(len(bad_searches), attempt))

        # Convert good_urls set to a list and write to a txt file
        file_path = '../url_files/{0}'.format(get_file_name('fox', start_date, end_date))
        with open(file_path, 'w') as f:
            f.write(json.dumps(list(good_urls)))

        # If there were any bad searches, record how many and write them to a file for review
        print('Number of Bad Searches = {0}'.format(len(bad_searches)))
        if len(bad_searches):
            file_path = '../url_files/{0}'.format(get_file_name('fox', start_date, end_date, bad=True))
            with open(file_path, 'w') as f:
                f.write(json.dumps(list(bad_searches)))

        start_datetime = start_datetime - datetime.timedelta(days=7)
        end_datetime = end_datetime - datetime.timedelta(days=7)
        start_date = start_datetime.strftime('%Y-%m-%d')
        end_date = end_datetime.strftime('%Y-%m-%d')


if __name__ == '__main__':
    start_date, end_date = argv[1], argv[2]
    start_datetime = datetime.datetime.strptime(start_date, '%Y-%m-%d')
    end_datetime = datetime.datetime.strptime(end_date, '%Y-%m-%d')

    # Create MongoClient
    client = MongoClient()
    # Initialize the Database
    db = client['election_analysis']
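    # Step back one week per pass; unless a table name was given, each window gets its own dated collection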
    while True:
        # Initialize table
        # If a table name has been provided use that, otherwise initialize 'articles' table
        if len(argv) > 3:
            tab = db[argv[3]]
        else:
            tab = db['nyt_' + start_datetime.strftime('%Y%m%d') + '_' +
                     end_datetime.strftime('%Y%m%d')]

        print('Scraping NYT URLs from {0} to {1}'.format(start_date, end_date))

        file_path = '../url_files/{0}'.format(
            get_file_name('nyt', start_date, end_date))
        urls = load_urls(file_path)

        bad_urls = []
        for url in urls:
            result = add_to_mongo(tab, url)
            if result:
                bad_urls.append(result)

        print('NYT Scraping Done...')
        print('Number of Bad Extractions = {0}'.format(len(bad_urls)))

        start_datetime = start_datetime - datetime.timedelta(days=7)
        end_datetime = end_datetime - datetime.timedelta(days=7)
        start_date = start_datetime.strftime('%Y-%m-%d')
        end_date = end_datetime.strftime('%Y-%m-%d')
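

# load_urls and add_to_mongo are used above but not shown in these fragments.
# Minimal sketches: load_urls reads back the JSON list of URLs written by the
# URL-collection scripts; add_to_mongo is assumed to download a page, store
# its raw HTML in the given pymongo collection, and return the URL on failure
# so callers can collect bad extractions. The real implementations likely do
# fuller article parsing.
import requests   # assumed available; not imported in the original fragments


def load_urls(file_path):
    with open(file_path) as f:
        return json.loads(f.read())


def add_to_mongo(tab, url):
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        tab.insert_one({'url': url, 'html': response.text})
        return None
    except Exception:
        # Signal a bad extraction by returning the offending URL
        return url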