def get_news_sentence(self, answer):
    """Return a one-shot news summary about *answer*.

    Builds a temporary sqlite-backed corpus of news articles found via
    Google News for the search term, runs ``summarize`` over each stored
    article body, and returns the first non-empty summary.  Returns ''
    when no article yields a usable summary.
    """
    # Create a database of news articles about the subject of the question
    cg = NewsCorpusGenerator('temp_news_corpus', 'sqlite')
    links = cg.google_news_search(answer, 'Standard', 5)
    cg.generate_corpus(links)

    conn = sqlite3.connect('temp_news_corpus/corpus.db')
    news_strings = []
    for row in conn.execute('SELECT body FROM articles'):
        # BUGFIX: each row is a 1-tuple; take the body column itself.
        # The original used str(row) — the tuple's repr — which produced
        # strings like '(u"..."...' and forced the fragile "(u\"" prefix
        # strip that is no longer needed below.
        body = row[0]
        news_strings.append(body.encode('ascii', 'ignore'))
    conn.close()  # release the sqlite handle before deleting the file
    os.remove('temp_news_corpus/corpus.db')  # Remove the database

    # NOTE(review): the original skipped news_strings[0]; kept as-is —
    # confirm whether the first article is deliberately excluded.
    for n in news_strings[1:]:
        summary = summarize(n)
        if summary != u"" and summary != []:
            return summary
    return ''
def trainModelFromTopics(s):
    """Crawl every configured topic, build a corpus, and persist it to MongoDB.

    Reads ``s.topics``, ``s.corpus_dir`` and ``s.domain``; logs progress
    when ``s.verbose`` is set.
    """
    if s.verbose:
        s.logger.info("trainModelFromTopics : Topics :" + str(s.topics))

    ncg = NewsCorpusGenerator(s.corpus_dir, 'mongo',
                              mongo_db_name='DomainModelCorpora',
                              domain=s.domain)

    # 1. crawl the topics
    article_links = [link
                     for topic in s.topics
                     for link in s.crawl_links(topic, s.domain)]

    if s.verbose:
        s.logger.info(("Total %d links to extract" % len(article_links))
                      + " links ==>" + str(article_links))

    # 2. store results in mongoDB
    ncg.generate_corpus(article_links)
    if s.verbose:
        s.logger.info("trainModelFromTopics : Stats:" + str(ncg.get_stats()))
# Commodities commodities_terms = ['silver','gold','commodities'] commo = get_links(commodities_terms,'Commodities') print len(commo) article_links.extend(commo) # Fraud & Insider Trading fraud_terms = ['insider trading','Ponzi Scheme','finance fraud'] fraud = get_links(fraud_terms,'Fraud') print len(fraud) article_links.extend(fraud) # Litigation lit_terms = ['company settlement','company litigation','company lawsuit'] lit = get_links(lit_terms,'Litigation') print len(lit) article_links.extend(lit) # Earning Reports er_terms = ['earning reports','quarterly results','financial statement'] er = get_links(er_terms,'Earning_Reports') print len(er) article_links.extend(er) # Extract Content & Create Corpus print "Total %d links to extract" % len(article_links) ex.generate_corpus(article_links) print ex.get_stats()
import os from news_corpus_builder import NewsCorpusGenerator from iab_cat_load import iab_tier2 # Location to save generated corpus news_corpus_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'news_data') # Save results to sqlite or files per article ex = NewsCorpusGenerator(news_corpus_dir) for subcategory, category in iab_tier2.iteritems(): print 'Getting search result for [' + subcategory + '] in [' + category + ']' # Retrieve 50 links related to the search term dogs and assign a category of Pet to the retrieved links links = ex.google_news_search(subcategory, category, 100) print 'saving...' # Generate and save corpus try: ex.generate_corpus(links) except: pass
import os from news_corpus_builder import NewsCorpusGenerator from iab_cat_load import iab_tier2 # Location to save generated corpus news_corpus_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'news_data') # Save results to sqlite or files per article ex = NewsCorpusGenerator(news_corpus_dir) for subcategory, category in iab_tier2.iteritems(): print 'Getting search result for [' + subcategory + '] in [' + category + ']' # Retrieve 50 links related to the search term dogs and assign a category of Pet to the retrieved links links = ex.google_news_search(subcategory, category, 100) print 'saving...' # Generate and save corpus try: ex.generate_corpus(links) except: pass