def get_clean_doc(self, texts, field, selected_words):
    if self.clean_doc is not None:
        return
    self.clean_doc = {}
    name = "{}/{}_stem_dic.p".format(self.flags.data_path, self.name)
    self.stem_dic = load_pickle(self.stem_dic, name, {})
    # the stem dictionary must already have been built (see get_global_words_count)
    assert len(self.stem_dic)
    for text in texts:
        name = "{}/{}_clean_doc_{}.p".format(self.flags.data_path, self.name, text)
        if os.path.exists(name):
            with open(name, 'rb') as f:
                self.clean_doc[text] = pickle.load(f)
        else:
            print("gen", name)
            word_lists = []  # list of lists: one list of words per sample
            df_per_sample_word_lists(self.data[text], field, word_lists)  # fills word_lists in place
            clean_lists = []
            for c, word_list in enumerate(word_lists):
                word_list = rm_stop_words(word_list)
                word_list = rm_punctuation(word_list)
                word_list = stem(word_list, self.stem_dic)
                # keep only the pre-selected vocabulary; selected_words should
                # be a set for fast membership tests
                word_list = [word for word in word_list if word in selected_words]
                clean_lists.append(word_list)
                if c % 1000 == 0:
                    print("{} docs cleaned {}".format(c, word_list[:10]))
            with open(name, 'wb') as f:
                pickle.dump(clean_lists, f)
            self.clean_doc[text] = clean_lists
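
# ---------------------------------------------------------------------------
# rm_stop_words, rm_punctuation and stem are defined elsewhere in the repo.
# Below is a minimal sketch of what they might look like, assuming NLTK's
# English stop-word list and Porter stemmer, and treating stem_dic as a
# word -> stem cache that is pickled between runs. The actual
# implementations may differ.
# ---------------------------------------------------------------------------
import string
from nltk.corpus import stopwords          # requires nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer

_STOP_WORDS = set(stopwords.words('english'))
_STEMMER = PorterStemmer()

def rm_stop_words(word_list):
    """Drop common English stop words."""
    return [w for w in word_list if w.lower() not in _STOP_WORDS]

def rm_punctuation(word_list):
    """Strip punctuation characters; drop tokens that become empty."""
    table = str.maketrans('', '', string.punctuation)
    stripped = (w.translate(table) for w in word_list)
    return [w for w in stripped if w]

def stem(word_list, stem_dic):
    """Stem each word, memoizing word -> stem in stem_dic."""
    for w in word_list:
        if w not in stem_dic:
            stem_dic[w] = _STEMMER.stem(w)
    return [stem_dic[w] for w in word_list]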
def get_per_sample_words_count(self, texts, field, silent=0):
    """Each sample is a document.

    Input:
        texts: e.g. ["train", "test"]
    """
    if self.sample_words_count is not None:
        return
    self.sample_words_count = {}
    # ensures self.stem_dic is loaded/built; silent=1 suppresses its summary prints
    self.get_global_words_count(texts, [field], 1)
    for text in texts:
        name = "{}/{}_sample_count_{}.p".format(self.flags.data_path, self.name, text)
        if os.path.exists(name):
            with open(name, 'rb') as f:
                self.sample_words_count[text] = pickle.load(f)
        else:
            print("gen", name)
            word_lists = []  # list of lists: one list of words per sample
            df_per_sample_word_lists(self.data[text], field, word_lists)  # fills word_lists in place
            word_counts = []
            for word_list in word_lists:
                word_list = rm_stop_words(word_list)
                word_list = rm_punctuation(word_list)
                word_list = stem(word_list, self.stem_dic)
                word_counts.append(Counter(word_list))
            with open(name, 'wb') as f:
                pickle.dump(word_counts, f)
            self.sample_words_count[text] = word_counts
        if silent == 0:
            print("\n{} sample words count done".format(text))
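
# ---------------------------------------------------------------------------
# Hypothetical sketch of df_per_sample_word_lists, which is defined elsewhere
# in the repo. Assumes self.data[text] is a pandas DataFrame and that
# tokenization is plain whitespace splitting; the real version may differ.
# ---------------------------------------------------------------------------
def df_per_sample_word_lists(df, field, word_lists):
    """Append one token list per row of df[field]; fills word_lists in place."""
    for doc in df[field].astype(str):
        word_lists.append(doc.split())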
def get_global_words_count(self, texts, fields=["Text"], silent=0):
    """Build self.words_count: {"train": Counter, "test": Counter}

    Input:
        texts: e.g. ["train", "test"]
    """
    if self.words_count is not None:
        return
    self.words_count = {}
    name = "{}/{}_stem_dic.p".format(self.flags.data_path, self.name)
    self.stem_dic = load_pickle(self.stem_dic, name, {})
    for text in texts:
        name = "{}/{}_total_count_{}.p".format(self.flags.data_path, self.name, text)
        if os.path.exists(name):
            with open(name, 'rb') as f:
                self.words_count[text] = pickle.load(f)
        else:
            print("gen", name)
            word_list = []
            # "global word container" means words are collected for the entire
            # dataset, not per sample; fills word_list in place
            df_global_word_container(self.data[text], fields, word_list)
            word_list = rm_stop_words(word_list)
            word_list = rm_punctuation(word_list)
            word_list = stem(word_list, self.stem_dic)
            word_count = Counter(word_list)
            with open(name, 'wb') as f:
                pickle.dump(word_count, f)
            self.words_count[text] = word_count
        if silent == 0:
            print("\nnumber of different words in {}:".format(text), len(self.words_count[text]))
            k = 10
            print("Top {} most common words in {}".format(k, text), self.words_count[text].most_common(k))
    name = "{}/{}_stem_dic.p".format(self.flags.data_path, self.name)
    save_pickle(self.stem_dic, name)
    # merge the per-split counters into one global counter
    self.global_word_count = Counter()
    for counter in self.words_count.values():
        self.global_word_count += counter
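
# ---------------------------------------------------------------------------
# Hypothetical sketches of the remaining helpers referenced above, inferred
# from their call sites; the repo's actual versions may differ.
# load_pickle(obj, path, default) appears to mean: return obj if it is
# already built, else unpickle it from path, else fall back to default.
# ---------------------------------------------------------------------------
def df_global_word_container(df, fields, word_list):
    """Extend word_list (in place) with tokens from all fields of every row."""
    for field in fields:
        for doc in df[field].astype(str):
            word_list.extend(doc.split())

def load_pickle(obj, path, default):
    """Return obj if already loaded; else load it from path; else return default."""
    if obj is not None:
        return obj
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    return default

def save_pickle(obj, path):
    """Pickle obj to path."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)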