def test_get_content():
    """Fetch a known article page and check that enough HTML comes back.

    The page is retrieved with ``singel_query_raw_html_all_methods``, its
    length is printed, an ASCII-only copy is written to ``data/test.html``,
    and the test asserts that more than 100 characters were returned.
    """
    current_frame = inspect.stack()[0]
    # Index 3 of a frame record is the name of the running function.
    print('\nfunction: %s ' % current_frame[3])

    url = 'http://petewarden.com/2015/04/20/why-gemm-is-at-the-heart-of-deep-learning/'
    out_fname = 'data/test.html'
    min_length = 100

    raw_html = singel_query_raw_html_all_methods(url)
    print('length: %i' % len(raw_html))
    print('html written out to file %s' % out_fname)

    with open(out_fname, 'w') as out_fh:
        # Characters outside ASCII are dropped (not transliterated) so the
        # dump on disk is plain ASCII.
        out_fh.write(raw_html.encode('ascii', 'ignore'))

    n.assert_greater(len(raw_html), min_length)
    print('')
# NOTE(review): the next line is the tail of a definition whose header lies
# outside this view; its indentation cannot be recovered, so it is kept verbatim.
#fname = '1.html' links = extract_urls(fname, dir_name) insert_links(links_collection, links, linksource=linksource)


if __name__ == '__main__':
    # Script entry point: walk the monthly KDnuggets section pages for all of
    # 2014 and for January-June 2015, extract the links from each page and
    # hand them to insert_links stamped with the first day of that month.
    mongo_client = MongoClient()
    links_collection, articles_collection = get_mongodb_collections(
        mongo_client)

    sections = [
        'opinions-interviews.html',
        'meetings.html',
        'publications.html',
        'news-features.html',
    ]

    for year in [2014, 2015]:
        # The upper bound is exclusive: 13 covers months 1-12, 7 covers 1-6.
        month_stop = 7 if year == 2015 else 13
        for month in xrange(1, month_stop):
            first_of_month = datetime.datetime(year, month, 1)
            dt_submit = first_of_month.strftime('%Y-%m-%d')
            # Zero-padded month as used in the URL path.
            str_m = "%02d" % month
            for section in sections:
                url_newsletter = 'http://www.kdnuggets.com/%s/%s/%s' % (
                    str(year), str_m, section)
                print(url_newsletter)

                raw_html = singel_query_raw_html_all_methods(url_newsletter)
                links = extract_urls_from_text(raw_html)
                print(links)

                # NOTE(review): linksource is not assigned anywhere in this
                # block; presumably a module-level name. Confirm, otherwise
                # this call raises NameError.
                insert_links(
                    links_collection,
                    links,
                    linksource=linksource,
                    dt_submit=dt_submit)

                # Pause between page requests. Assumed to belong to the
                # innermost loop; the original indentation is lost, confirm.
                time.sleep(2)