def scrape_one_page_wayback(url, my_mongo):
    """Scrape one Wayback Machine snapshot of datatau.com and store its links.

    Example:
        url = 'https://web.archive.org/web/20140210061922/http://www.datatau.com/'
        scrape_one_page_wayback(url, my_mongo)
    """
    prefix_url = "https://web.archive.org"
    # The snapshot timestamp (YYYYMMDD...) follows '/web/' in a Wayback URL.
    dt_str = url.split("/web/")[1]
    dt_ref = datetime.datetime.strptime(dt_str[:8], "%Y%m%d")
    response = single_query(url)
    links = extract_urls(response.text, prefix_url)
    # print links
    insert_links(my_mongo, links, linksourceversion="", subsource="", dt_ref=dt_ref)
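
# Entry point: walk the Data Science Weekly issue list, fetch each issue page,
# extract its outbound links, and store them in MongoDB. The commented-out block
# below does the same for issue HTML already saved locally under dir_name.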
if __name__ == '__main__':
    dir_name = '/Users/joyceduan/data/datascienceweekly/'
    mongo_client = MongoClient()
    links_collection, articles_collection = get_mongodb_collections(mongo_client)

    urls_weekly, dates = get_links_to_weekly()
    url_prefix = 'http://www.datascienceweekly.org'
    print urls_weekly
    print dates

    # linksource was previously undefined here; 'datascienceweekly' is an assumed
    # tag for these links -- adjust it to whatever insert_links expects.
    linksource = 'datascienceweekly'
    for i, url in enumerate(urls_weekly[1:]):
        print i, url, dates[i]
        response = single_query(url_prefix + url)
        if response:
            links = exctract_urls_from_text(response.text)
            insert_links(
                links_collection, links,
                linksource=linksource, dt_submit=dates[i])
        # except:
        #     print '!!error %s' % url_prefix + url

    '''
    fnames = os.listdir(dir_name)
    for fname in fnames:
        # fname = '1.html'
        links = extract_urls(fname, dir_name)
        insert_links(links_collection, links, linksource=linksource)
    '''

    link_query = {'gothtml': {'$exists': 0},