def _load(self, company_id):
    """Locate and load the text and metadata files for ``company_id``.

    The base directory is the ``ROOT_ORIGINAL`` config value joined with the
    ``FORMATTED_PR`` config value.  Inside it, the text path is named
    ``str(company_id)`` and the metadata path is named by
    ``common.get_list_file_name(company_id)``.  The text path is passed to
    ``self._load_text``; its result is then handed, together with the
    metadata path, to ``self._load_meta``.
    """
    config = ConfigReader()
    root_dir = config.get('ROOT_ORIGINAL')
    formatted_dir = config.get('FORMATTED_PR')
    base_dir = os.path.join(root_dir, formatted_dir)
    meta_name = common.get_list_file_name(company_id)

    text_path = os.path.join(base_dir, str(company_id))
    meta_path = os.path.join(base_dir, meta_name)

    loaded_text = self._load_text(text_path)
    self._load_meta(meta_path, loaded_text)
def get_linkpages(self):
    """Return the content of the ``microsoft.html`` source file in a list.

    The file is looked up in the directory formed by joining the
    ``ROOT_ORIGINAL`` and ``PR_SOURCES`` config values.

    Returns:
        A one-element list whose only item is the full file content as a
        single string.
    """
    cfr = ConfigReader()
    root = cfr.get('ROOT_ORIGINAL')
    sources = cfr.get('PR_SOURCES')
    path = os.path.join(os.path.join(root, sources), 'microsoft.html')
    # The parenthesised single-argument form emits the same text under the
    # Python 2 print statement and the Python 3 print function.
    print('collecting links from source file')
    with open(path) as f:
        # One read() call yields the same string as joining readlines().
        return [f.read()]
def _load_nonpubs(self):
    """Populate ``self._nonpubs`` from the file named by ``NONPUBS``.

    The ``NONPUBS`` config value is resolved to an absolute path and read
    line by line.  Each line is stripped of surrounding whitespace and
    lower-cased; lines that are empty after stripping are skipped.  The
    remaining entries are collected into a set.
    """
    # Assigned before the file is opened so the attribute exists (as an
    # empty set) even when reading the file fails.
    self._nonpubs = set()
    cfr = ConfigReader()
    path = os.path.abspath(cfr.get('NONPUBS'))
    with open(path) as f:
        # Iterate the file object directly instead of materialising every
        # line up front with readlines().
        for raw_line in f:
            entry = raw_line.strip().lower()
            if entry:
                self._nonpubs.add(entry)
def _load_path(self):
    """Store this object's directory path in ``self._path_dir``.

    The path is the ``ROOT_ORIGINAL`` config value joined with the
    ``DOWNLOADED_PR`` config value and then with ``self._company_id``.
    """
    config = ConfigReader()
    root_dir = config.get('ROOT_ORIGINAL')
    downloaded_dir = config.get('DOWNLOADED_PR')
    base_dir = os.path.join(root_dir, downloaded_dir)
    self._path_dir = os.path.join(base_dir, self._company_id)
def get_pickled_matches_path(company_id, subset_name):
    """Return ``<PICKLED_MATCHES>/<subset_name>/<company_id>.pickle``.

    ``PICKLED_MATCHES`` is read from the configuration.
    """
    config = ConfigReader()
    subset_dir = os.path.join(config.get('PICKLED_MATCHES'), subset_name)
    pickle_name = '{0}.pickle'.format(company_id)
    return os.path.join(subset_dir, pickle_name)
def get_pickled_pr_tokens_path(company_id):
    """Return ``<PICKLED_TOKENS_PR>/<company_id>.pickle``.

    ``PICKLED_TOKENS_PR`` is read from the configuration.
    """
    tokens_dir = ConfigReader().get('PICKLED_TOKENS_PR')
    return os.path.join(tokens_dir, '{0}.pickle'.format(company_id))
def get_sentiment_scores_path(company_id):
    """Return ``<SENTIMENT_SCORES>/<company_id>``.

    ``SENTIMENT_SCORES`` is read from the configuration; ``company_id`` is
    converted with ``str`` before joining.
    """
    scores_dir = ConfigReader().get('SENTIMENT_SCORES')
    return os.path.join(scores_dir, str(company_id))
def get_subjlexicon_path(stemmed=False):
    """Return the configured value for the subjectivity-lexicon key.

    Args:
        stemmed: when truthy, the ``SUBJLEXICON-STEMMED`` key is looked up;
            otherwise ``SUBJLEXICON``.
    """
    config = ConfigReader()
    key = 'SUBJLEXICON-STEMMED' if stemmed else 'SUBJLEXICON'
    return config.get(key)
def get_quotes_path(company_id):
    """Return ``<QUOTES>/<company_id>``.

    ``QUOTES`` is read from the configuration; the file name is
    ``company_id`` rendered through ``'{0}'.format``.
    """
    quotes_dir = ConfigReader().get('QUOTES')
    return os.path.join(quotes_dir, '{0}'.format(company_id))
def get_pairs_path(pairs_name):
    """Return ``pairs_name`` joined onto the configured ``PAIRS`` value."""
    pairs_dir = ConfigReader().get('PAIRS')
    return os.path.join(pairs_dir, pairs_name)
def get_blocks_path(blocks_name):
    """Return ``blocks_name`` joined onto the configured ``BLOCKS`` value."""
    blocks_dir = ConfigReader().get('BLOCKS')
    return os.path.join(blocks_dir, blocks_name)
def get_art_duplicates_path(company_id):
    """Return ``<DUPLICATES_ART>/<company_id>``.

    ``DUPLICATES_ART`` is read from the configuration; ``company_id`` is
    converted with ``str`` before joining.
    """
    duplicates_dir = ConfigReader().get('DUPLICATES_ART')
    return os.path.join(duplicates_dir, str(company_id))
def get_text_path(text_name, company_id):
    """Return ``<TEXT>/<text_name>/<company_id>``.

    ``TEXT`` is read from the configuration; ``company_id`` is converted
    with ``str`` before joining.
    """
    text_root = ConfigReader().get('TEXT')
    return os.path.join(text_root, text_name, str(company_id))
def get_sents_path(sents_name, company_id):
    """Return ``<SENTS>/<sents_name>/<company_id>``.

    ``SENTS`` is read from the configuration; ``company_id`` is converted
    with ``str`` before joining.
    """
    sents_root = ConfigReader().get('SENTS')
    return os.path.join(sents_root, sents_name, str(company_id))
def get_postags_path():
    """Return the value stored under the ``POSTAGS`` configuration key."""
    return ConfigReader().get('POSTAGS')
def _get_input_path(self):
    """Return the input path for ``self._company_id``.

    The path is the ``ROOT_ORIGINAL`` config value joined with the
    ``DOWNLOADED_NEWS`` config value and then with ``self._company_id``.
    """
    config = ConfigReader()
    root_dir = config.get('ROOT_ORIGINAL')
    news_dir = config.get('DOWNLOADED_NEWS')
    base_dir = os.path.join(root_dir, news_dir)
    return os.path.join(base_dir, self._company_id)
def get_pickled_subset_path(subset_name, doctype):
    """Return ``<PICKLED_SUBSETS>/<subset_name>/<doctype>.pickle``.

    ``PICKLED_SUBSETS`` is read from the configuration.
    """
    config = ConfigReader()
    subset_dir = os.path.join(config.get('PICKLED_SUBSETS'), subset_name)
    pickle_name = '{0}.pickle'.format(doctype)
    return os.path.join(subset_dir, pickle_name)
def get_sentiment_words_neg_path(company_id):
    """Return ``<SENTIMENT_WORDS_NEG>/<company_id>``.

    ``SENTIMENT_WORDS_NEG`` is read from the configuration; ``company_id``
    is converted with ``str`` before joining.
    """
    words_dir = ConfigReader().get('SENTIMENT_WORDS_NEG')
    return os.path.join(words_dir, str(company_id))