def write_tokens(self, company_id): tokenizer = Tokenizer() br = ConfigReader().get('MARKER_BR') # uncomment this block when done with press releases # dic = {} # articles = ArticleLoader(company_id).get_articles() # for i, key in enumerate(articles): # if i % 50 == 0: # print 'pickling article {0} of {1} for company {2}'.format(i+1, len(articles), company_id) # article = articles[key] # text = article.headline() + '\n\n' + article.body() # self._add_tokens_to_dic(tokenizer, dic, article.id(), text, br) # # output_path = common.get_pickled_news_tokens_path(company_id) # self._pickle(company_id, dic, output_path) dic = {} releases = ReleaseLoader(company_id).get_releases() for i, key in enumerate(releases): if i % 50 == 0: print 'pickling release {0} of {1} for company {2}'.format( i + 1, len(releases), company_id) release = releases[key] text = release.title() + '\n\n' + release.body() self._add_tokens_to_dic(tokenizer, dic, release.id(), text, br) output_path = common.get_pickled_pr_tokens_path(company_id) self._pickle(company_id, dic, output_path)
def __init__(self, company_id, matches_name):
    """Wire up the loaders and markers needed to process one company's documents."""
    self._company_id = company_id
    # configured break marker used to separate text segments downstream
    self._br = ConfigReader().get('MARKER_BR')
    # data sources for this company
    self._matchloader = MatchLoader(company_id, matches_name)
    self._tokens = TokenLoader(company_id)
    self._releases = ReleaseLoader(company_id).get_releases()
    self._articles = ArticleLoader(company_id).get_articles()
def _load_nonpubs(self):
    """Load the non-publication name list into ``self._nonpubs``.

    Reads the file configured under the 'NONPUBS' key, one entry per line;
    entries are lowercased and stripped, blank lines are skipped.
    """
    path = os.path.abspath(ConfigReader().get('NONPUBS'))
    with open(path) as f:
        # stream the file instead of materializing it with readlines();
        # build the set in one pass (strip().lower() is non-empty exactly
        # when the stripped line is non-empty, so the filter is equivalent)
        self._nonpubs = set(
            line.strip().lower() for line in f if line.strip())
def get_linkpages(self): all_html = [] cfr = ConfigReader() root = cfr.get('ROOT_ORIGINAL') path1 = cfr.get('PR_SOURCES') path2 = os.path.join(root, path1) path = os.path.join(path2, 'microsoft.html') print 'collecting links from source file' with open(path) as f: lines = f.readlines() all_html.append(''.join(lines)) return all_html
def _load(self, company_id):
    """Load formatted press-release text and its metadata for one company."""
    cfr = ConfigReader()
    # formatted-PR directory lives under the original root
    formatted_dir = os.path.join(cfr.get('ROOT_ORIGINAL'), cfr.get('FORMATTED_PR'))
    path_text = os.path.join(formatted_dir, str(company_id))
    path_meta = os.path.join(formatted_dir, common.get_list_file_name(company_id))
    pr_text = self._load_text(path_text)
    self._load_meta(path_meta, pr_text)
def write(self):
    """Write the token text of every release and article to per-document files.

    Tokens are joined with spaces and the configured break marker is
    expanded back into newlines before writing.
    """
    br = ConfigReader().get('MARKER_BR')

    def dump(doc_ids, get_tokens, doctype):
        # shared loop body: the release and article paths were previously
        # duplicated verbatim; factored here to avoid drift
        for doc_id in doc_ids:
            tokens = get_tokens(doc_id, False)
            text = ' '.join(tokens).replace(br, '\n')
            path = self._get_filepath(doctype, doc_id)
            with open(path, 'w') as f:
                f.write(text)

    dump(self._release_ids, self._tokens.get_release_tokens, common.DOCTYPE_PR)
    dump(self._article_ids, self._tokens.get_article_tokens, common.DOCTYPE_NEWS)
def __init__(self, company_id):
    """Load dictionaries and build the set of tokens to exclude.

    Excluded tokens are all ASCII punctuation characters plus the
    configured break marker.
    """
    self._load_dictionaries(company_id)
    excluded = set(string.punctuation)
    excluded.add(ConfigReader().get('MARKER_BR'))
    self._exclude_tokens = excluded
def get_pickled_pr_tokens_path(company_id):
    """Return the pickle path for a company's press-release tokens."""
    filename = '{0}.pickle'.format(company_id)
    return os.path.join(ConfigReader().get('PICKLED_TOKENS_PR'), filename)
def get_pickled_matches_path(company_id, subset_name):
    """Return the pickle path for a company's matches within *subset_name*."""
    base = ConfigReader().get('PICKLED_MATCHES')
    return os.path.join(base, subset_name, '{0}.pickle'.format(company_id))
def _get_input_path(self):
    """Return this company's directory under the downloaded-news root."""
    cfr = ConfigReader()
    return os.path.join(
        cfr.get('ROOT_ORIGINAL'),
        cfr.get('DOWNLOADED_NEWS'),
        self._company_id)
def get_subjlexicon_path(stemmed=False):
    """Return the subjectivity-lexicon path (stemmed variant if requested)."""
    key = 'SUBJLEXICON-STEMMED' if stemmed else 'SUBJLEXICON'
    return ConfigReader().get(key)
def get_pairs_path(pairs_name):
    """Return the path of the named pairs file under the PAIRS directory."""
    return os.path.join(ConfigReader().get('PAIRS'), pairs_name)
def __init__(self):
    """Cache the configured break marker for later text processing."""
    cfg = ConfigReader()
    self._br = cfg.get('MARKER_BR')
def get_art_duplicates_path(company_id):
    """Return the per-company article-duplicates path."""
    return os.path.join(ConfigReader().get('DUPLICATES_ART'), str(company_id))
def get_blocks_path(blocks_name):
    """Return the path of the named blocks file under the BLOCKS directory."""
    return os.path.join(ConfigReader().get('BLOCKS'), blocks_name)
def get_text_path(text_name, company_id):
    """Return the per-company path inside the named TEXT subdirectory."""
    base = ConfigReader().get('TEXT')
    return os.path.join(base, text_name, str(company_id))
def get_sents_path(sents_name, company_id):
    """Return the per-company path inside the named SENTS subdirectory."""
    base = ConfigReader().get('SENTS')
    return os.path.join(base, sents_name, str(company_id))
def get_postags_path():
    """Return the configured POS-tags path."""
    return ConfigReader().get('POSTAGS')
def _load_path(self):
    """Resolve and cache this company's downloaded-PR directory."""
    cfr = ConfigReader()
    self._path_dir = os.path.join(
        cfr.get('ROOT_ORIGINAL'),
        cfr.get('DOWNLOADED_PR'),
        self._company_id)
def get_pickled_subset_path(subset_name, doctype):
    """Return the pickle path for *doctype* within the named subset."""
    base = ConfigReader().get('PICKLED_SUBSETS')
    return os.path.join(base, subset_name, '{0}.pickle'.format(doctype))
def __init__(self, company_id, matches_name):
    """Set up match and token loaders plus the configured break marker."""
    self._br = ConfigReader().get('MARKER_BR')
    self._matchloader = MatchLoader(company_id, matches_name)
    self._tokens = TokenLoader(company_id)
def get_sentiment_scores_path(company_id):
    """Return the per-company sentiment-scores path."""
    return os.path.join(ConfigReader().get('SENTIMENT_SCORES'), str(company_id))
def get_quotes_path(company_id):
    """Return the per-company quotes path.

    Uses str(company_id) directly — the previous '{0}'.format(company_id)
    produced the same string; this matches the sibling path getters
    (duplicates, sentiment scores/words).
    """
    cr = ConfigReader()
    dir_path = cr.get('QUOTES')
    return os.path.join(dir_path, str(company_id))
def get_sentiment_words_neg_path(company_id):
    """Return the per-company negative-sentiment-words path."""
    return os.path.join(ConfigReader().get('SENTIMENT_WORDS_NEG'), str(company_id))