def get_links_from_dir():
    """Feed every entry of the local ``kdnugget`` data directory to MongoDB.

    Each name returned by ``os.listdir`` is passed, together with the
    directory path, to ``extract_urls``; whatever that returns is handed to
    ``insert_links`` along with the links collection.

    NOTE(review): ``linksource`` is not defined inside this function; it is
    presumably a module-level global -- verify it is set before calling.
    """
    source_dir = '/Users/joyceduan/data/kdnugget/'

    client = MongoClient()
    # get_mongodb_collections returns a pair; only the first is used here.
    link_store, _articles = get_mongodb_collections(client)

    for page_name in os.listdir(source_dir):
        # To debug against a single page, override page_name here
        # (the original kept '1.html' around for that purpose).
        found = extract_urls(page_name, source_dir)
        insert_links(link_store, found, linksource=linksource)
def get_links_from_dir_linkedin():
    """Feed every entry of the local ``linkedin`` data directory to MongoDB.

    The directory listing is printed first.  Each entry is then passed with
    the directory path to ``extract_urls`` and the result is handed to
    ``insert_links`` with ``linksource="linkedin"`` and ``dt_submit=None``
    (no submission date is derived for these files).
    """
    source_dir = "/Users/joyceduan/data/linkedin/"

    client = MongoClient()
    # get_mongodb_collections returns a pair; only the first is used here.
    link_store, _articles = get_mongodb_collections(client)

    page_names = os.listdir(source_dir)
    print(page_names)

    # Slice page_names (e.g. [0:1]) for a quick test run.
    for page_name in page_names:
        found = extract_urls(page_name, source_dir)
        insert_links(link_store, found, linksource="linkedin",
                     dt_submit=None)
def _submit_date_from_fname(fname):
    """Return 'YYYY-MM-DD' parsed from a 'YYYY_MM_DD.<ext>' name, else None.

    The first two underscore-separated fields are the year and month; the
    third field, up to its first '.', is the day.  None is returned when the
    name has fewer than three fields, a field is not an integer, or the
    values do not form a valid date.
    """
    parts = fname.split("_")
    if len(parts) < 3:
        return None
    try:
        year = int(parts[0])
        month = int(parts[1])
        day = int(parts[2].split(".")[0])
        # Going through datetime both validates the date and zero-pads it.
        return datetime.datetime(year, month, day).strftime("%Y-%m-%d")
    except (ValueError, OverflowError):
        return None


def get_links_from_dir():
    """Feed dated files of the local ``gradient`` data directory to MongoDB.

    The directory listing is printed first.  For each entry whose name
    follows ``YYYY_MM_DD.<ext>``, the name and directory path are passed to
    ``extract_urls`` and the result is handed to ``insert_links`` together
    with the date from the file name as ``dt_submit`` ('YYYY-MM-DD').

    Entries whose names do not follow that pattern (for example ``.DS_Store``)
    are reported and skipped instead of aborting the whole run with an
    IndexError/ValueError.

    NOTE(review): ``linksource`` is not defined inside this function; it is
    presumably a module-level global -- verify it is set before calling.
    """
    mongo_client = MongoClient()
    # get_mongodb_collections returns a pair; only the first is used here.
    links_collection, _ = get_mongodb_collections(mongo_client)
    dir_name = "/Users/joyceduan/data/gradient/"
    fnames = os.listdir(dir_name)
    print(fnames)
    for fname in fnames:  # slice fnames (e.g. [0:1]) for a quick test run
        dt_submit = _submit_date_from_fname(fname)
        if dt_submit is None:
            print("skipping %s: name is not YYYY_MM_DD.<ext>" % fname)
            continue
        links = extract_urls(fname, dir_name)
        insert_links(links_collection, links, linksource=linksource,
                     dt_submit=dt_submit)
# NOTE(review): the next line is the orphaned tail of a loop body whose
# header is outside this chunk; it is reproduced verbatim and left untouched.
#fname = '1.html' links = extract_urls(fname, dir_name) insert_links(links_collection, links, linksource=linksource)


if __name__ == '__main__':
    # Walk the monthly kdnuggets.com section pages for 2014-01 .. 2015-06,
    # pull the links out of each page and hand them to insert_links, dated
    # to the first day of that month.
    #
    # NOTE(review): `linksource` is not assigned in this block; it is
    # presumably a module-level global -- verify.
    mongo_client = MongoClient()
    links_collection, articles_collection = get_mongodb_collections(
        mongo_client)

    sections = [
        'opinions-interviews.html',
        'meetings.html',
        'publications.html',
        'news-features.html',
    ]

    for y in (2014, 2015):
        # xrange's upper bound is exclusive: 13 covers all twelve months,
        # 7 stops 2015 after June.
        m_max = 7 if y == 2015 else 13
        for m in xrange(1, m_max):
            dt_submit = datetime.datetime(y, m, 1).strftime('%Y-%m-%d')
            str_m = "%02d" % m
            for section in sections:
                url_newsletter = 'http://www.kdnuggets.com/%s/%s/%s' % (
                    str(y), str_m, section)
                print(url_newsletter)

                raw_html = singel_query_raw_html_all_methods(url_newsletter)
                links = extract_urls_from_text(raw_html)
                print(links)

                insert_links(links_collection, links,
                             linksource=linksource, dt_submit=dt_submit)
                # NOTE(review): the source's indentation was lost; the pause
                # is placed after every page request -- confirm.
                time.sleep(2)