Example #1
0
 def get_news_sentence(self, answer):
     """Return a one-sentence news summary about *answer*.

     Builds a temporary sqlite-backed corpus of up to 5 Google News
     articles via NewsCorpusGenerator, summarizes each article body,
     and returns the first non-empty summary. Returns '' when no
     article produced a usable summary.
     """
     # Create a database of news articles about the subject of the question.
     cg = NewsCorpusGenerator('temp_news_corpus', 'sqlite')
     links = cg.google_news_search(answer, 'Standard', 5)
     cg.generate_corpus(links)
     conn = sqlite3.connect('temp_news_corpus/corpus.db')
     try:
         news_strings = []
         for row in conn.execute('SELECT body FROM articles'):
             # Python 2: row repr -> unicode -> ascii, dropping non-ascii chars.
             news_strings.append(
                 str(row).decode('unicode_escape').encode('ascii', 'ignore'))
     finally:
         # Bug fix: close the connection before deleting its database file;
         # the original leaked the handle (and on Windows the unlink would fail).
         conn.close()
     os.remove('temp_news_corpus/corpus.db')  # Remove the temporary database
     # NOTE(review): the first article is skipped — presumably junk/boilerplate;
     # confirm this is intentional.
     for n in news_strings[1:]:
         summary = summarize(n)
         if summary != u"" and summary != []:
             # Strip a stray '(u"' prefix left over from the repr-based extraction.
             if summary[0:3] == '(u"':
                 return summary[3:]
             return summary
     return ''
Example #2
0
    def trainModelFromTopics(s):
        """Crawl the configured topics and store the extracted articles in MongoDB.

        Note: this class uses ``s`` in place of the conventional ``self``.
        """
        if s.verbose:
            s.logger.info("trainModelFromTopics : Topics :" + str(s.topics))
        ncg = NewsCorpusGenerator(s.corpus_dir,
                                  'mongo',
                                  mongo_db_name='DomainModelCorpora',
                                  domain=s.domain)

        # 1. crawl the topics — collect every article link found per topic
        article_links = [link
                         for topic in s.topics
                         for link in s.crawl_links(topic, s.domain)]
        if s.verbose:
            s.logger.info(("Total %d links to extract" % len(article_links)) +
                          " links ==>" + str(article_links))
        # 2. store results in mongoDB
        ncg.generate_corpus(article_links)
        if s.verbose:
            s.logger.info("trainModelFromTopics : Stats:" +
                          str(ncg.get_stats()))
Example #3
0
from news_corpus_builder import NewsCorpusGenerator
from pprint import pprint
import sys
# Input list of hand-picked article links and the directory for the corpus.
file_path = '/Users/skillachie/hand_selected_articles.txt'
corpus_dir = '/Users/skillachie/finance_corpus'

# Target number of article links to collect per category (shared by get_links).
category_total = 300

article_links = []
# Persist the generated corpus to sqlite inside corpus_dir.
ex = NewsCorpusGenerator(corpus_dir,'sqlite')

# Add hand selected articles 
article_links.extend(ex.read_links_file(file_path))


def get_links(terms, category):
    """Collect Google News article links for every search term in *terms*.

    The global ``category_total`` budget is split evenly across the terms,
    and every returned link is tagged with *category*.
    """
    quota = int(category_total / len(terms))
    return [link
            for term in terms
            for link in ex.google_news_search(term, category, quota)]


# Policy Articles
policy_terms = ['SEC','monetary','fed','fiscal']
policy = get_links(policy_terms,'Policy')
# Python 2 print statement: report how many policy links were collected.
print len(policy)
article_links.extend(policy)

# International Finance 
# NOTE(review): snippet appears truncated — if_terms is never used below.
if_terms = ['global finance','imf','ECB','RMB devaluation','international finance']
import os
from news_corpus_builder import NewsCorpusGenerator
from iab_cat_load import iab_tier2


# Location to save generated corpus
news_corpus_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'news_data')

# Save results to sqlite or  files per article
ex = NewsCorpusGenerator(news_corpus_dir)


for subcategory, category in iab_tier2.iteritems():
    print 'Getting search result for [' + subcategory + '] in [' + category + ']'
    # Retrieve 50 links related to the search term dogs and assign a category of Pet to the retrieved links
    links = ex.google_news_search(subcategory, category, 100)
    print 'saving...'
    # Generate and save corpus
    try:
        ex.generate_corpus(links)
    except:
        pass
Example #5
0
import os
from news_corpus_builder import NewsCorpusGenerator
from iab_cat_load import iab_tier2

# Location to save generated corpus
news_corpus_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               'news_data')

# Save results to sqlite or  files per article
ex = NewsCorpusGenerator(news_corpus_dir)

for subcategory, category in iab_tier2.iteritems():
    print 'Getting search result for [' + subcategory + '] in [' + category + ']'
    # Retrieve 50 links related to the search term dogs and assign a category of Pet to the retrieved links
    links = ex.google_news_search(subcategory, category, 100)
    print 'saving...'
    # Generate and save corpus
    try:
        ex.generate_corpus(links)
    except:
        pass
Example #6
0
    def __init__(s,
                 topicsfile=None,
                 pdfSource=None,
                 textfile=None,
                 mongohost='localhost',
                 domainsourcefile=None,
                 domain=None,
                 modelStore=None,
                 modelName='',
                 word2vecflag=None,
                 removeFlag=False,
                 renameflag=False,
                 newFlag=False,
                 listFlag=False,
                 addFlag=False,
                 logger=None,
                 verbose=False):
        """Configure the trainer and open its MongoDB / GridFS handles.

        Note: this class uses ``s`` in place of the conventional ``self``.
        Reads topics from *topicsfile* (ini format) when given, connects to
        two Mongo collections and a GridFS model store on *mongohost*, and
        looks up any existing model for *domain*. Exits the process via
        sys.exit(-1) when the ini file cannot be read.
        """
        # Plain copies of the constructor arguments.
        s.topicsfile = topicsfile
        s.textfile = textfile
        s.removeFlag = removeFlag
        s.renameflag = renameflag
        s.newFlag = newFlag
        s.listFlag = listFlag
        s.addFlag = addFlag
        s.pdfSource = pdfSource
        s.mongohost = mongohost
        s.domainsourcefile = domainsourcefile
        s.domain = domain
        s.collectionname = ''

        # note the domain may have embedded spaces. These get replaced by the '_' character.
        if (s.domain is not None):
            s.collectionname = s.domain.replace(' ', '_')
        else:
            # No domain given: fall back to the generic 'articles' collection.
            s.collectionname = 'articles'
        if (s.domain is not None):
            s.domain_external_name = s.domain.replace(' ', '_')
        else:
            s.domain_external_name = ''

        s.modelStore = modelStore
        # NOTE(review): modelName argument is ignored here; the name is derived
        # from the domain (and recomputed by setModelName() below) — confirm.
        s.modelName = s.internal2external(s.domain)
        s.corpus_dir = 'NewsCorpus'
        # s.article_links = []
        # Target number of article links to collect for the domain.
        s.domain_total = 300

        s.word2vecflag = word2vecflag
        s.topics = []
        s.logger = logger
        s.verbose = verbose
        if s.topicsfile:
            # Populate s.topics (and related config) from the ini file;
            # a falsy return aborts the whole process.
            retval = s.readConfigFromIniFile()
            if not retval: sys.exit(-1)
        if s.verbose:
            s.logger.info("__init__ : domain :" + str(s.domain) +
                          " external model :" + str(s.domain_external_name))
        # mongo instances
        # Domain registry collection.
        s.maD = MongoAccessor(host=mongohost,
                              port='27017',
                              db='vizwiz',
                              collection='Domains',
                              logger=s.logger)
        # Per-domain corpus collection (name derived from the domain above).
        s.maC = MongoAccessor(host=mongohost,
                              port='27017',
                              db='DomainModelCorpora',
                              collection=s.collectionname,
                              logger=s.logger)
        # gridFs instance
        s.setModelName()
        s.gfs = GridFSModel(modelName=s.modelName,
                            host=mongohost,
                            port='27017',
                            logger=s.logger)
        # does the <domain>_mdl already exist?
        s.existingModel = s.getDomainModel()
        if s.verbose:
            s.logger.info("Existing Model in Mongo==>" + str(s.existingModel))
        s.lineSentence = None
        s.sentences = []

        # Corpus generator is only created when a domain is configured.
        if s.domain:
            s.ncg = NewsCorpusGenerator(s.corpus_dir,
                                        'mongo',
                                        mongo_db_name='DomainModelCorpora',
                                        domain=s.domain)