# caixin.com: fetch the front page, follow every article link, and print
# what each extractor returns. `scraper` and the get_* helpers are assumed
# to be defined in this module's scope.
def test_all():
    main = scraper.get_soup("http://caixin.com/")
    links = get_links(main)
    num_links = len(links)
    i = 0
    for link in links:
        print link
        print i
        url, category, date = link
        print url
        print category
        print date
        article = scraper.get_soup(url)
        if article:
            meta = scraper.get_meta(article)
            print meta
            content = get_content(article)
            if content:
                for line in content:  # reuse the parsed content instead of re-fetching
                    print line
            else:
                print "PHOTOS ONLY"
            i += 1  # count only articles that were actually fetched
    print("SUCCESSFULLY RETRIEVED " + str(i) + "/" + str(num_links))
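# All five tests lean on a shared `scraper` module that is not shown in
# this section. The sketch below is a guess at its two helpers, assuming
# they wrap requests and BeautifulSoup; the names and behavior here are
# assumptions, not the module's actual implementation.
import requests
from bs4 import BeautifulSoup

def get_soup(url):
    # Fetch a page and parse it; return None on any HTTP failure, which
    # is the behavior the `if article:` / `if soup:` guards above expect.
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
    except requests.RequestException:
        return None
    return BeautifulSoup(resp.content, "html.parser")

def get_meta(soup):
    # Collect <meta> name/content pairs from a parsed page.
    meta = {}
    for tag in soup.find_all("meta"):
        if tag.get("name") and tag.get("content"):
            meta[tag["name"]] = tag["content"]
    return meta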
# nanzao.com: links come back as (relative_url, category) pairs, so the
# article URL is rebuilt from the site root before fetching. The whole
# body is guarded on the index page actually loading.
def test_all():
    root = "http://www.nanzao.com"
    main = scraper.get_soup(root)
    if main:
        print "Fetched index page"
        all_links = get_links(main)
        num_links = len(all_links)
        print "num_links", num_links
        i = 0
        for url, cat in all_links:
            print "URL AND CAT"
            print url, cat
            soup = scraper.get_soup(root + url)
            if soup:
                publishdate = get_publishdate(soup)
                print "PUBLISHDATE"
                print publishdate
                content = get_content(soup)
                print "CONTENT"
                print content
                meta = scraper.get_meta(soup)
                print "META"
                print meta
                print "ARTICLE NUM", i
                i += 1
        print "NUMBER OF SUCCESSFULLY FETCHED ARTICLES:", i
        print "TOTAL NUMBER OF ARTICLES", num_links
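# The nanzao test rebuilds article URLs with plain concatenation
# (root + url), which only works while every href is root-relative. If
# the site ever mixes absolute and relative links, the standard
# library's urljoin handles both cases; a small sketch (the example
# hrefs below are made up):
from urlparse import urljoin  # Python 2 path; urllib.parse in Python 3

root = "http://www.nanzao.com"
print urljoin(root, "/hk/article.html")          # relative href gets joined
print urljoin(root, "http://cdn.example.com/a")  # absolute href passes through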
# infzm.com: the simplest of the tests; get_links returns bare article URLs.
def test_all():
    main = scraper.get_soup("http://www.infzm.com/")
    links = get_links(main)
    num_links = len(links)
    i = 0
    for link in links:
        print i
        print link
        article = scraper.get_soup(link)
        if article:
            meta = scraper.get_meta(article)
            print meta
            content = get_content(article)
            print content
            i += 1  # count only articles that were actually fetched
    print("SUCCESSFULLY RETRIEVED " + str(i) + "/" + str(num_links))
# cyol.com: like the caixin test, links are (url, category, date) tuples;
# this site also has a per-article author extractor.
def test_all():
    main = scraper.get_soup("http://cyol.com")
    links = get_links(main)
    num_links = len(links)
    # import pdb; pdb.set_trace()
    i = 0
    for link in links:
        print i
        url, category, date = link
        print url
        print category
        print date
        article = scraper.get_soup(url)
        if article:
            meta = scraper.get_meta(article)
            print meta
            author = get_author(article)
            print author
            content = get_content(article)
            print content
            i += 1  # count only articles that were actually fetched
    print("SUCCESSFULLY RETRIEVED " + str(i) + "/" + str(num_links))
def test_all(): """Test scraper on all articles""" main = scraper.get_soup("http://www.huanqiu.com/") if main: print "got the main soup" all_links = get_links(main) num_links = len(all_links) i = 0 for link in all_links: # add this one line print link category = get_category(link) print category soup = scraper.get_soup(link) if soup: encoded_content = [line for line in get_content(soup)] all_meta = scraper.get_meta(soup) author = get_author(soup) publishdate = get_publishdate(soup) parsedate = time.strftime("%Y-%m-%d %H:%M:%S") #for testing if encoded_content: for p in encoded_content: print p print all_meta print author print publishdate print parsedate print "ARTICLE NUMBER", i i += 1 print "NUMBER OF SUCCESSFULLY FETCHED ARTICLES: ", i print "TOTAL NUMBER OF ARTICLES", num_links