Example #1
0
 def _load_nonpubs(self):
     """Fill ``self._nonpubs`` from the file named by the 'NONPUBS' setting.

     Every line is stripped and lower-cased; lines that are empty after
     stripping are ignored.
     """
     # Create the attribute first so it exists before any file access.
     self._nonpubs = set()
     source = os.path.abspath(ConfigReader().get('NONPUBS'))
     with open(source) as handle:
         normalized = (raw.strip().lower() for raw in handle)
         self._nonpubs.update(entry for entry in normalized if entry)
Example #2
0
    def _load(self, company_id):
        """Load the text and metadata stored for ``company_id``.

        Both inputs live under the 'ROOT_ORIGINAL' setting joined with the
        'FORMATTED_PR' setting: the text path ends in ``str(company_id)``
        and the metadata path ends in the name returned by
        ``common.get_list_file_name``.  The loaded text is handed to
        ``self._load_meta`` together with the metadata path.
        """
        config = ConfigReader()
        base_dir = os.path.join(config.get('ROOT_ORIGINAL'),
                                config.get('FORMATTED_PR'))
        meta_name = common.get_list_file_name(company_id)

        text_path = os.path.join(base_dir, str(company_id))
        meta_path = os.path.join(base_dir, meta_name)

        self._load_meta(meta_path, self._load_text(text_path))
Example #3
0
 def get_linkpages(self):
     """Return a one-element list with the contents of 'microsoft.html'.

     The file is looked up in the 'ROOT_ORIGINAL' setting joined with the
     'PR_SOURCES' setting.
     """
     config = ConfigReader()
     source_dir = os.path.join(config.get('ROOT_ORIGINAL'),
                               config.get('PR_SOURCES'))
     html_path = os.path.join(source_dir, 'microsoft.html')
     # Single-argument parenthesised print emits the same text as the
     # print statement.
     print('collecting links from source file')
     with open(html_path) as handle:
         page = handle.read()
     return [page]
Example #4
0
    def write_tokens(self, company_id):
        """Tokenize every release of ``company_id`` and pickle the result.

        Releases come from ``ReleaseLoader(company_id).get_releases()``.  For
        each one, the title and body (separated by a blank line) are passed
        to ``self._add_tokens_to_dic`` along with the 'MARKER_BR' setting.
        The collected dictionary is then given to ``self._pickle`` with the
        path from ``common.get_pickled_pr_tokens_path``.
        """
        tokenizer = Tokenizer()
        line_break = ConfigReader().get('MARKER_BR')

        # Article variant of the loop below, kept disabled on purpose.
        # uncomment this block when done with press releases
        #        dic = {}
        #        articles = ArticleLoader(company_id).get_articles()
        #        for i, key in enumerate(articles):
        #            if i % 50 == 0:
        #                print 'pickling article {0} of {1} for company {2}'.format(i+1, len(articles), company_id)
        #            article = articles[key]
        #            text = article.headline() + '\n\n' + article.body()
        #            self._add_tokens_to_dic(tokenizer, dic, article.id(), text, br)
        #
        #        output_path = common.get_pickled_news_tokens_path(company_id)
        #        self._pickle(company_id, dic, output_path)

        token_dic = {}
        releases = ReleaseLoader(company_id).get_releases()
        progress = 'pickling release {0} of {1} for company {2}'
        for index, release_key in enumerate(releases):
            # Report progress on every 50th release.
            if index % 50 == 0:
                print(progress.format(index + 1, len(releases), company_id))
            release = releases[release_key]
            title = release.title()
            body = release.body()
            self._add_tokens_to_dic(tokenizer, token_dic, release.id(),
                                    title + '\n\n' + body, line_break)

        self._pickle(company_id, token_dic,
                     common.get_pickled_pr_tokens_path(company_id))
Example #5
0
 def __init__(self, company_id, matches_name):
     """Create the loaders and fetch releases and articles for a company.

     Stores the company id, a ``MatchLoader`` for ``matches_name``, a
     ``TokenLoader``, the results of ``get_releases()`` and
     ``get_articles()``, and the 'MARKER_BR' setting.
     """
     self._company_id = company_id
     self._matchloader = MatchLoader(company_id, matches_name)
     self._tokens = TokenLoader(company_id)

     release_loader = ReleaseLoader(company_id)
     self._releases = release_loader.get_releases()

     article_loader = ArticleLoader(company_id)
     self._articles = article_loader.get_articles()

     config = ConfigReader()
     self._br = config.get('MARKER_BR')
Example #6
0
    def write(self):
        """Write the token text of each release and article to its own file.

        Tokens are joined with single spaces and every occurrence of the
        'MARKER_BR' setting is replaced by a newline.  Releases are written
        first, then articles; target paths come from ``self._get_filepath``.
        """
        line_break = ConfigReader().get('MARKER_BR')

        def dump(tokens, doctype, doc_id):
            # Build the text first, then resolve the path and write it out.
            content = ' '.join(tokens).replace(line_break, '\n')
            with open(self._get_filepath(doctype, doc_id), 'w') as out:
                out.write(content)

        for release_id in self._release_ids:
            dump(self._tokens.get_release_tokens(release_id, False),
                 common.DOCTYPE_PR, release_id)

        for article_id in self._article_ids:
            dump(self._tokens.get_article_tokens(article_id, False),
                 common.DOCTYPE_NEWS, article_id)
Example #7
0
def get_sentiment_words_neg_path(company_id):
    """Return the 'SENTIMENT_WORDS_NEG' setting joined with the company id."""
    base_dir = ConfigReader().get('SENTIMENT_WORDS_NEG')
    return os.path.join(base_dir, str(company_id))
Example #8
0
def get_pickled_matches_path(company_id, subset_name):
    """Return the pickle path for a company's matches in a subset.

    The result is the 'PICKLED_MATCHES' setting joined with ``subset_name``
    and the file name ``<company_id>.pickle``.
    """
    root = ConfigReader().get('PICKLED_MATCHES')
    pickle_name = '{0}.pickle'.format(company_id)
    return os.path.join(root, subset_name, pickle_name)
Example #9
0
 def __init__(self, company_id):
     """Load the dictionaries for ``company_id`` and build the exclusion set.

     ``self._exclude_tokens`` holds every character of
     ``string.punctuation`` plus the 'MARKER_BR' setting.
     """
     self._load_dictionaries(company_id)
     excluded = set(string.punctuation)
     excluded.add(ConfigReader().get('MARKER_BR'))
     self._exclude_tokens = excluded
Example #10
0
def get_postags_path():
    """Return the value of the 'POSTAGS' setting."""
    return ConfigReader().get('POSTAGS')
Example #11
0
def get_pickled_pr_tokens_path(company_id):
    """Return the 'PICKLED_TOKENS_PR' setting joined with ``<company_id>.pickle``."""
    tokens_dir = ConfigReader().get('PICKLED_TOKENS_PR')
    return os.path.join(tokens_dir, '{0}.pickle'.format(company_id))
Example #12
0
 def __init__(self):
     """Store the 'MARKER_BR' setting on the instance as ``self._br``."""
     config = ConfigReader()
     self._br = config.get('MARKER_BR')
Example #13
0
 def _get_input_path(self):
     """Return the input path for this instance's company.

     The path is the 'ROOT_ORIGINAL' setting joined with the
     'DOWNLOADED_NEWS' setting and ``self._company_id``.
     """
     config = ConfigReader()
     news_dir = os.path.join(config.get('ROOT_ORIGINAL'),
                             config.get('DOWNLOADED_NEWS'))
     return os.path.join(news_dir, self._company_id)
Example #14
0
def get_blocks_path(blocks_name):
    """Return the 'BLOCKS' setting joined with ``blocks_name``."""
    blocks_dir = ConfigReader().get('BLOCKS')
    return os.path.join(blocks_dir, blocks_name)
Example #15
0
def get_pairs_path(pairs_name):
    """Return the 'PAIRS' setting joined with ``pairs_name``."""
    pairs_dir = ConfigReader().get('PAIRS')
    return os.path.join(pairs_dir, pairs_name)
Example #16
0
def get_art_duplicates_path(company_id):
    """Return the 'DUPLICATES_ART' setting joined with the company id."""
    duplicates_dir = ConfigReader().get('DUPLICATES_ART')
    return os.path.join(duplicates_dir, str(company_id))
Example #17
0
def get_text_path(text_name, company_id):
    """Return the 'TEXT' setting joined with ``text_name`` and the company id."""
    text_root = ConfigReader().get('TEXT')
    return os.path.join(text_root, text_name, str(company_id))
Example #18
0
def get_sents_path(sents_name, company_id):
    """Return the 'SENTS' setting joined with ``sents_name`` and the company id."""
    sents_root = ConfigReader().get('SENTS')
    return os.path.join(sents_root, sents_name, str(company_id))
Example #19
0
 def _load_path(self):
     """Set ``self._path_dir`` for this instance's company.

     The value is the 'ROOT_ORIGINAL' setting joined with the
     'DOWNLOADED_PR' setting and ``self._company_id``.
     """
     config = ConfigReader()
     downloads_dir = os.path.join(config.get('ROOT_ORIGINAL'),
                                  config.get('DOWNLOADED_PR'))
     self._path_dir = os.path.join(downloads_dir, self._company_id)
Example #20
0
def get_pickled_subset_path(subset_name, doctype):
    """Return the pickle path for one document type of a subset.

    The result is the 'PICKLED_SUBSETS' setting joined with ``subset_name``
    and the file name ``<doctype>.pickle``.
    """
    subsets_root = ConfigReader().get('PICKLED_SUBSETS')
    pickle_name = '{0}.pickle'.format(doctype)
    return os.path.join(subsets_root, subset_name, pickle_name)
Example #21
0
 def __init__(self, company_id, matches_name):
     """Create the match and token loaders and store the break marker.

     ``self._br`` is set to the 'MARKER_BR' setting.
     """
     self._matchloader = MatchLoader(company_id, matches_name)
     self._tokens = TokenLoader(company_id)
     config = ConfigReader()
     self._br = config.get('MARKER_BR')
Example #22
0
def get_sentiment_scores_path(company_id):
    """Return the 'SENTIMENT_SCORES' setting joined with the company id."""
    scores_dir = ConfigReader().get('SENTIMENT_SCORES')
    return os.path.join(scores_dir, str(company_id))
Example #23
0
def get_quotes_path(company_id):
    """Return the 'QUOTES' setting joined with the formatted company id."""
    quotes_dir = ConfigReader().get('QUOTES')
    return os.path.join(quotes_dir, '{0}'.format(company_id))
Example #24
0
def get_subjlexicon_path(stemmed=False):
    """Return the 'SUBJLEXICON-STEMMED' or 'SUBJLEXICON' setting.

    The stemmed key is used when ``stemmed`` is truthy, the plain key
    otherwise.
    """
    config = ConfigReader()
    key = 'SUBJLEXICON-STEMMED' if stemmed else 'SUBJLEXICON'
    return config.get(key)