def get_news_sentence(self, answer):
    """Return a short summary sentence from news articles about *answer*.

    Builds a temporary sqlite corpus of up to 5 Google News results,
    summarizes each article body in turn, and returns the first
    non-empty summary.  Returns '' when nothing usable is found.
    """
    # Create a database of news articles about the subject of the question.
    cg = NewsCorpusGenerator('temp_news_corpus', 'sqlite')
    links = cg.google_news_search(answer, 'Standard', 5)
    cg.generate_corpus(links)

    conn = sqlite3.connect('temp_news_corpus/corpus.db')
    try:
        # Read the body column directly.  The previous str(row) round-trip
        # serialized the whole row tuple, which produced a leading '(u"'
        # that then had to be stripped heuristically before returning.
        # sqlite3 hands text back as unicode; drop non-ASCII as before.
        news_strings = [row[0].encode('ascii', 'ignore')
                        for row in conn.execute('SELECT body FROM articles')]
    finally:
        # Close the connection before deleting its backing file (the old
        # code removed the file while the connection was still open).
        conn.close()
    os.remove('temp_news_corpus/corpus.db')  # Remove the database

    # NOTE(review): the original iterated news_strings[1:], skipping the
    # first article; preserved here — confirm whether the skip is intentional.
    for n in news_strings[1:]:
        summary = summarize(n)
        if summary != u"" and summary != []:
            return summary
    return ''
def trainModelFromTopics(s):
    """Crawl every configured topic and persist the harvested articles
    as a corpus in the 'DomainModelCorpora' mongo database."""
    if s.verbose:
        s.logger.info("trainModelFromTopics : Topics :" + str(s.topics))

    corpus_gen = NewsCorpusGenerator(s.corpus_dir, 'mongo',
                                     mongo_db_name='DomainModelCorpora',
                                     domain=s.domain)

    # 1. crawl the topics
    article_links = [link
                     for topic in s.topics
                     for link in s.crawl_links(topic, s.domain)]

    if s.verbose:
        s.logger.info(("Total %d links to extract" % len(article_links)) +
                      " links ==>" + str(article_links))

    # 2. store results in mongoDB
    corpus_gen.generate_corpus(article_links)

    if s.verbose:
        s.logger.info("trainModelFromTopics : Stats:" + str(corpus_gen.get_stats()))
from news_corpus_builder import NewsCorpusGenerator from pprint import pprint import sys file_path = '/Users/skillachie/hand_selected_articles.txt' corpus_dir = '/Users/skillachie/finance_corpus' category_total = 300 article_links = [] ex = NewsCorpusGenerator(corpus_dir,'sqlite') # Add hand selected articles article_links.extend(ex.read_links_file(file_path)) def get_links(terms,category): category_articles = [] article_count = int(category_total/len(terms)) for term in terms: category_articles.extend(ex.google_news_search(term,category,article_count)) return category_articles # Policy Articles policy_terms = ['SEC','monetary','fed','fiscal'] policy = get_links(policy_terms,'Policy') print len(policy) article_links.extend(policy) # International Finance if_terms = ['global finance','imf','ECB','RMB devaluation','international finance']
import os from news_corpus_builder import NewsCorpusGenerator from iab_cat_load import iab_tier2 # Location to save generated corpus news_corpus_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'news_data') # Save results to sqlite or files per article ex = NewsCorpusGenerator(news_corpus_dir) for subcategory, category in iab_tier2.iteritems(): print 'Getting search result for [' + subcategory + '] in [' + category + ']' # Retrieve 50 links related to the search term dogs and assign a category of Pet to the retrieved links links = ex.google_news_search(subcategory, category, 100) print 'saving...' # Generate and save corpus try: ex.generate_corpus(links) except: pass
def __init__(s, topicsfile=None, pdfSource=None, textfile=None,
             mongohost='localhost', domainsourcefile=None, domain=None,
             modelStore=None, modelName='', word2vecflag=None,
             removeFlag=False, renameflag=False, newFlag=False,
             listFlag=False, addFlag=False, logger=None, verbose=False):
    """Initialize the domain-model trainer.

    Stores the configuration flags, derives the mongo collection name
    from *domain*, opens connections to the 'vizwiz'/'Domains' and
    'DomainModelCorpora' mongo collections plus a GridFS model store,
    and checks whether a model for this domain already exists.

    If *topicsfile* is given, the topics/config are read from that ini
    file; a failed read exits the process with status -1.
    """
    # --- plain configuration/flag storage ------------------------------
    s.topicsfile = topicsfile
    s.textfile = textfile
    s.removeFlag = removeFlag
    s.renameflag = renameflag
    s.newFlag = newFlag
    s.listFlag = listFlag
    s.addFlag = addFlag
    s.pdfSource = pdfSource
    s.mongohost = mongohost
    s.domainsourcefile = domainsourcefile
    s.domain = domain
    s.collectionname = ''
    # note the domain may have embedded spaces. These get replaced by the
    # '_' character.
    if (s.domain is not None):
        s.collectionname = s.domain.replace(' ', '_')
    else:
        # No domain: fall back to the generic 'articles' collection.
        s.collectionname = 'articles'
    # NOTE(review): this duplicates the replace() above — collectionname
    # and domain_external_name differ only in their no-domain fallback.
    if (s.domain is not None):
        s.domain_external_name = s.domain.replace(' ', '_')
    else:
        s.domain_external_name = ''
    s.modelStore = modelStore
    # NOTE(review): the modelName parameter is ignored here — modelName is
    # computed from the domain, and recomputed again by setModelName()
    # below.  Confirm the parameter is intentionally unused.
    s.modelName = s.internal2external(s.domain)
    s.corpus_dir = 'NewsCorpus'
    # s.article_links = []
    s.domain_total = 300      # target number of articles for the domain
    s.word2vecflag = word2vecflag
    s.topics = []
    s.logger = logger
    s.verbose = verbose
    # Optionally load topics/config from the ini file; a bad file is fatal.
    if s.topicsfile:
        retval = s.readConfigFromIniFile()
        if not retval:
            sys.exit(-1)
    if s.verbose:
        s.logger.info("__init__ : domain :" + str(s.domain) +
                      " external model :" + str(s.domain_external_name))
    # mongo instances
    s.maD = MongoAccessor(host=mongohost, port='27017', db='vizwiz',
                          collection='Domains', logger=s.logger)
    s.maC = MongoAccessor(host=mongohost, port='27017',
                          db='DomainModelCorpora',
                          collection=s.collectionname, logger=s.logger)
    # gridFs instance
    s.setModelName()
    s.gfs = GridFSModel(modelName=s.modelName, host=mongohost,
                        port='27017', logger=s.logger)
    # does the <domain>_mdl already exist?
    s.existingModel = s.getDomainModel()
    if s.verbose:
        s.logger.info("Existing Model in Mongo==>" + str(s.existingModel))
    # Training state, populated later by the training pipeline.
    s.lineSentence = None
    s.sentences = []
    # Only create the corpus generator when a domain was supplied.
    if s.domain:
        s.ncg = NewsCorpusGenerator(s.corpus_dir, 'mongo',
                                    mongo_db_name='DomainModelCorpora',
                                    domain=s.domain)