def get_news_from_url():
    """Find all news items in MongoDB which do not yet have a full text,
    then download the full article text for each such item."""
    # Parser setup
    g = Goose({'browser_user_agent': 'Mozilla'})

    total_items = db.news_data.find(
        {"full_article_text": {"$exists": False}}).count()
    uf._printer_G('Total items to retrieve: %d' % (total_items))

    i = 1
    for d in db.news_data.find({"full_article_text": {"$exists": False}}):
        # print d
        # code.interact(local=locals())
        uf._printer_G('---%d of %d---' % (i, total_items))
        i += 1
        startTime = time.time()

        # TODO: only print _id and uuid. As we add more alert sources in
        # addition to google-alerts, we may or may not have titles from them;
        # possibly we will only have the urls.
        print '_id        :', str(d['_id'])
        print 'uuid       :', d['uuid']
        print 'Downloading:', d['url']
        print 'Alert on   :', d['alert_title']  # consider not printing this
        print 'news_id    :', d['news_id']      # consider not printing this

        # Default to empty fields so a failed download is still marked as
        # processed and is not retried on every run.
        new_data = {}
        new_data['full_article_title'] = ""
        new_data['full_article_text'] = ""
        new_data['full_article_domain'] = ""
        new_data['full_article_publish_date'] = ""

        try:
            with uf.Timeout(5):
                article = g.extract(url=d['url'])
                print 'article.title       :', article.title
                print 'article.domain      :', article.domain
                # print article.cleaned_text
                print 'article.publish_date:', article.publish_date
                new_data['full_article_title'] = article.title
                new_data['full_article_text'] = article.cleaned_text
                new_data['full_article_domain'] = article.domain
                new_data['full_article_publish_date'] = article.publish_date
                # code.interact(local=locals())
        except uf.Timeout.Timeout:
            print '[ERROR] Timeout. This item (uuid=%s) will be empty' % (
                d['uuid'])
        except:
            # Any other py-goose failure: leave the fields empty and move on.
            print 'py-goose retrieval failed!'

        db.news_data.find_one_and_update({"_id": d['_id']},
                                         {"$set": new_data})
        uf._printer_('Done in %4.2fs' % (time.time() - startTime))
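# Note: uf.Timeout above is a helper from the project's utility module (uf)
# and is not shown in this section. Below is a minimal sketch of what it is
# assumed to look like: a SIGALRM-based context manager (Unix-only) whose
# nested Timeout exception matches the `except uf.Timeout.Timeout` handler
# above. The actual helper may be implemented differently.
import signal


class Timeout:
    class Timeout(Exception):
        """Raised when the wrapped block runs longer than the allowed time."""
        pass

    def __init__(self, sec):
        self.sec = sec

    def __enter__(self):
        # Arm an alarm that fires after `sec` seconds.
        signal.signal(signal.SIGALRM, self._raise_timeout)
        signal.alarm(self.sec)

    def __exit__(self, exc_type, exc_value, traceback):
        # Cancel the pending alarm on normal or exceptional exit.
        signal.alarm(0)

    def _raise_timeout(self, signum, frame):
        raise Timeout.Timeout()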
input_html_file = 'google-alerts.html'
output_csv = input_html_file + '.csv'  # 'feed_list_ejcbsdhansbn.csv'
uf._printer_Y('Input File  : ' + input_html_file)
uf._printer_Y('Output File : ' + output_csv)

soup = BeautifulSoup(open(input_html_file).read(), "lxml")
alerts_soup = soup.find("div", {"id": "manage-alerts-div"})
if alerts_soup is None:
    uf._error("No div with id:manage-alerts-div.\nQuit---")
    quit()

all_li = alerts_soup.findAll("li")
if len(all_li) < 1:
    uf._error("No Alerts\nQuit-----")
    quit()

fp_out = open(output_csv, 'w')
for li in all_li:
    tag_text = li.find("div", {"class": "query_div"}).get_text().strip()
    rss_url = li.find("a")['href']
    uf._printer_('---')
    uf._printer_('    tag : ' + tag_text)
    uf._printer_G('rss_url : ' + rss_url)
    fp_out.write("%s,%s\n" % (tag_text, rss_url))
fp_out.close()
uf._printer_G("Done!")
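# For reference, the scraper above assumes the saved google-alerts.html page
# has roughly the structure sketched below: a div with id="manage-alerts-div"
# containing one <li> per alert, each holding a div.query_div with the alert
# query text and an <a> whose href is the RSS feed URL. The markup and the
# feed ids here are illustrative assumptions, not the actual Google Alerts
# page.
_sample_html = """
<div id="manage-alerts-div">
  <ul>
    <li>
      <div class="query_div">solar energy</div>
      <a href="https://www.google.com/alerts/feeds/01234567890/9876543210">RSS feed</a>
    </li>
  </ul>
</div>
"""
_sample_soup = BeautifulSoup(_sample_html, "lxml")
for _li in _sample_soup.find("div", {"id": "manage-alerts-div"}).findAll("li"):
    print _li.find("div", {"class": "query_div"}).get_text().strip()
    print _li.find("a")['href']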
ob.download_alerts()
ob.insert_into_db(db)

#
# More Alert Sources (future work)
# As we add more sources for alerts, they will go here. These will basically
# put urls into mongodb in db.sun_dance.news_data. Try to keep an interface
# similar to google-alerts.
#

#
# URL Loop - Loop over all the items in mongodb which do not yet have the
# full text of the article
#
uf._printer_G('+++++\n +++++ Start www crawl\n +++++')
get_news_from_url()

run_done_in = time.time() - startTimerun
uf._printer_('Run#%d completed in %4.2f sec on %s' %
             (run, run_done_in, str(datetime.now())))
run = run + 1

sleep_for = repeat_every_sec - run_done_in
if sleep_for > 0:
    uf._printer_('Sleeping....zZzz for %4.2f sec' % (sleep_for))
    time.sleep(sleep_for)

#
# Delete Raw files
#
uf._printer_('rm -rf %s' % (storage_folder))
shutil.rmtree(storage_folder)
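# The block above reads like the body of one pass of a periodic run loop: it
# uses run, startTimerun, repeat_every_sec and storage_folder, which are not
# defined in this excerpt. Below is a minimal, self-contained sketch of that
# scheduling pattern; the function name crawl_once and the 30-minute period
# are assumptions for illustration only.
import time
from datetime import datetime


def run_periodically(crawl_once, repeat_every_sec=30 * 60):
    run = 0
    while True:
        startTimerun = time.time()
        crawl_once()                              # one full download pass
        run_done_in = time.time() - startTimerun
        print 'Run#%d completed in %4.2f sec on %s' % (
            run, run_done_in, str(datetime.now()))
        run = run + 1
        # Sleep off whatever is left of the period before the next pass.
        sleep_for = repeat_every_sec - run_done_in
        if sleep_for > 0:
            time.sleep(sleep_for)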
def make_folder_if_not_exist(folder):
    if not os.path.exists(folder):
        print tcol.OKGREEN, 'Make Directory : ', folder, tcol.ENDC
        os.makedirs(folder)
    else:
        print tcol.WARNING, 'Directory already exists : Not creating :', folder, tcol.ENDC


uf._debug('Alerts DB : ' + ALERTS_DB)
uf._debug('Open file : ' + CSV_FILENAME)
make_folder_if_not_exist(ALERTS_DB)

csvReader = csv.reader(open(CSV_FILENAME))
for row_i, row in enumerate(csvReader):
    uf._printer_('---' + str(row_i))
    uf._debug('row : ' + str(row), lvl=2)
    tag = row[0]
    url = row[1]
    # The feed URL ends in .../<alert_user_id>/<alert_id>, so the ids are the
    # last two path components.
    alert_id = url.strip().split('/')[-1]
    alert_user_id = url.strip().split('/')[-2]
    uf._printer_G('alert_id=%s ; user=%s ; tag=%s' %
                  (alert_id, alert_user_id, tag))

    # Download the RSS feed for this alert
    uf._debug('URL:%s' % (url))
    startTime = time.time()
    response = urllib2.urlopen(url)
    html = response.read()
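# For illustration only: each row of the CSV produced by the google-alerts
# parser has the form "<tag>,<rss_url>". With a made-up feed URL of the same
# shape, the id extraction above behaves like this:
_example_row = ['solar energy',
                'https://www.google.com/alerts/feeds/01234567890/9876543210']
_example_url = _example_row[1]
print _example_url.strip().split('/')[-1]   # alert_id      -> '9876543210'
print _example_url.strip().split('/')[-2]   # alert_user_id -> '01234567890'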