# Example no. 1
# 0
def scrape_one_page_wayback(url, my_mongo):
    """Scrape a single archived snapshot of a page from the Wayback Machine.

    Parses the snapshot date out of the archive URL, fetches the page,
    extracts its outgoing links and stores them via ``insert_links``.

    Example:
        url = 'https://web.archive.org/web/20140210061922/http://www.datatau.com/'
        scrape_one_page_wayback(url, my_mongo)
    """
    archive_prefix = "https://web.archive.org"

    # The path segment after "/web/" begins with a YYYYMMDDhhmmss timestamp;
    # only the date portion (first 8 characters) is parsed here.
    stamp = url.split("/web/")[1]
    snapshot_date = datetime.datetime.strptime(stamp[:8], "%Y%m%d")

    page = single_query(url)
    page_links = extract_urls(page.text, archive_prefix)

    insert_links(my_mongo, page_links,
                 linksourceversion="", subsource="", dt_ref=snapshot_date)
if __name__ == '__main__':

    # Local directory holding previously downloaded newsletter pages.
    # (Only referenced by the commented-out offline-processing block below.)
    dir_name = '/Users/joyceduan/data/datascienceweekly/'

    # Connect to the local MongoDB instance and fetch the two collections
    # used by this scraper (links and articles).
    mongo_client = MongoClient()
    links_collection, articles_collection = get_mongodb_collections(
        mongo_client)

    # Get the relative URLs of the weekly-newsletter issues and their dates.
    urls_weekly, dates = get_links_to_weekly()
    url_prefix = 'http://www.datascienceweekly.org'
    print urls_weekly
    print dates

    # Scrape every issue except the first.
    # NOTE(review): because of the [1:] slice, dates[i] pairs issue i+1 with
    # the date at index i (off by one) -- confirm this is intentional.
    for i, url in enumerate(urls_weekly[1:]):
        print i, url, dates[i]
        response = single_query(url_prefix + url)
        if response:
            # NOTE(review): `exctract_urls_from_text` looks misspelled --
            # verify the helper is actually defined under this name.
            links = exctract_urls_from_text(response.text)
            # NOTE(review): `linksource` is not defined anywhere in the code
            # shown here -- presumably a module-level constant; verify, or
            # this line raises NameError at runtime.
            insert_links(
                links_collection, links, linksource=linksource, dt_submit=dates[i])
        # except:
        #	print '!!error %s' % url_prefix+url

    '''
	fnames = os.listdir(dir_name)
	for fname in fnames:
		#fname = '1.html'
		links = extract_urls(fname, dir_name)
		insert_links(links_collection, links, linksource = linksource)
	'''
    # Query for link documents whose HTML has not been fetched yet.
    # (Statement continues past the end of this chunk.)
    link_query = {'gothtml': {'$exists': 0},