def get_links_from_dir():
    """Insert the links of every file in the kdnugget data directory.

    Each entry returned by ``os.listdir`` for the hard-coded directory is
    handed to ``extract_urls``; the result is passed on to ``insert_links``
    together with the links collection.

    NOTE(review): ``linksource`` is not assigned inside this function; it is
    expected to exist at module level -- confirm it is defined.
    NOTE(review): another function with this same name is defined later in
    this file and replaces this one when the module is loaded.
    """
    source_dir = '/Users/joyceduan/data/kdnugget/'
    client = MongoClient()
    # Only the links collection is used below.
    links_collection, _articles = get_mongodb_collections(client)
    for entry in os.listdir(source_dir):
        found = extract_urls(entry, source_dir)
        insert_links(links_collection, found, linksource=linksource)
def get_links_from_dir_linkedin():
    """Insert the links of every file in the linkedin data directory.

    The directory listing is printed first. Each entry is then passed to
    ``extract_urls`` and the result is given to ``insert_links`` with
    ``linksource="linkedin"`` and ``dt_submit=None``.
    """
    source_dir = "/Users/joyceduan/data/linkedin/"
    client = MongoClient()
    # Only the links collection is used below.
    links_collection, _articles = get_mongodb_collections(client)
    entries = os.listdir(source_dir)
    print(entries)
    for entry in entries:
        found = extract_urls(entry, source_dir)
        # The submission date is passed explicitly as None for this source.
        insert_links(
            links_collection,
            found,
            linksource="linkedin",
            dt_submit=None,
        )
def get_links_from_dir():
    """Insert the links of each dated file in the gradient data directory.

    File names are expected to look like ``<year>_<month>_<day>.<ext>``. The
    date is parsed from the name and passed to ``insert_links`` as
    ``dt_submit`` formatted ``YYYY-MM-DD``. Entries whose names do not fit
    that pattern (for example ``.DS_Store``) are reported and skipped
    instead of aborting the whole run.

    Each file is extracted and inserted exactly once, always with its
    ``dt_submit``; a leftover second extract/insert without the date has
    been removed.

    NOTE(review): ``linksource`` is not assigned inside this function; it is
    expected to exist at module level -- confirm it is defined.
    NOTE(review): this definition shares its name with an earlier function
    in this file and replaces it when the module is loaded.
    """
    mongo_client = MongoClient()
    # Only the links collection is used below.
    links_collection, _articles_collection = get_mongodb_collections(mongo_client)
    dir_name = "/Users/joyceduan/data/gradient/"
    fnames = os.listdir(dir_name)
    print(fnames)
    for fname in fnames:
        try:
            y, m, day_and_ext = fname.split("_")[:3]
            d = day_and_ext.split(".")[0]
            dt_submit = datetime.datetime(int(y), int(m), int(d)).strftime("%Y-%m-%d")
        except ValueError:
            # Raised for fewer than three "_"-separated parts, non-numeric
            # parts, or numbers that do not form a valid calendar date.
            print("skipping %s: name is not <year>_<month>_<day>.<ext>" % fname)
            continue

        links = extract_urls(fname, dir_name)
        insert_links(links_collection, links, linksource=linksource, dt_submit=dt_submit)

if __name__ == '__main__':
    # Walk the monthly KDnuggets section URLs for all of 2014 and for
    # January-June 2015, extract links from each one and insert them with
    # the first day of that month as dt_submit.
    # NOTE(review): `linksource` is not assigned in this block; it is
    # expected to be defined earlier at module level -- confirm.
    mongo_client = MongoClient()
    links_collection, articles_collection = get_mongodb_collections(mongo_client)
    sections = [
        'opinions-interviews.html',
        'meetings.html',
        'publications.html',
        'news-features.html',
    ]
    # Pairs of (year, exclusive upper bound for the month number).
    for y, m_max in [(2014, 13), (2015, 7)]:
        for m in range(1, m_max):
            dt_submit = datetime.datetime(y, m, 1).strftime('%Y-%m-%d')
            # Zero-padded month; the same for every section of this month.
            str_m = "%02d" % m
            for section in sections:
                url_newsletter = 'http://www.kdnuggets.com/%s/%s/%s' % (str(y), str_m, section)
                print(url_newsletter)

                raw_html = singel_query_raw_html_all_methods(url_newsletter)
                links = extract_urls_from_text(raw_html)
                print(links)
                insert_links(
                    links_collection,
                    links,
                    linksource=linksource,
                    dt_submit=dt_submit,
                )
                # Sleep two seconds before handling the next URL.
                time.sleep(2)