from gensim import corpora


def save_dictionary(dic: corpora.Dictionary, filename: str) -> None:
    dic.save(filename)
    print("saved dictionary: {} items to {}".format(
        len(dic.values()), filename))
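
# A minimal round-trip sketch (hypothetical file name and toy corpus):
# Dictionary.save() / Dictionary.load() preserve the token <-> id mapping.
docs = [["human", "machine", "interface"], ["survey", "user", "computer"]]
dic = corpora.Dictionary(docs)
save_dictionary(dic, "example.dict")
loaded = corpora.Dictionary.load("example.dict")
assert loaded.token2id == dic.token2id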
def test_dict_interface(self):
    """Test Python 2 dict-like interface in both Python 2 and 3."""
    d = Dictionary(self.texts)
    self.assertTrue(isinstance(d, Mapping))
    self.assertEqual(list(zip(d.keys(), d.values())), list(d.items()))

    # Even in Py3, we want the iter* members.
    self.assertEqual(list(d.items()), list(d.iteritems()))
    self.assertEqual(list(d.keys()), list(d.iterkeys()))
    self.assertEqual(list(d.values()), list(d.itervalues()))

    # XXX Do we want list results from the dict members in Py3 too?
    if not PY3:
        self.assertTrue(isinstance(d.items(), list))
        self.assertTrue(isinstance(d.keys(), list))
        self.assertTrue(isinstance(d.values(), list))
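
# A small standalone illustration of the dict-like behaviour the test above
# exercises: a Dictionary maps integer ids to tokens, and token2id holds the
# reverse mapping (toy corpus for illustration).
from gensim.corpora import Dictionary

d = Dictionary([["human", "interface", "computer"]])
some_id = d.token2id["human"]
assert d[some_id] == "human"            # id -> token via the Mapping interface
assert dict(d.items())[some_id] == "human"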
thing = d[x]["text"].lower().translate(
    str.maketrans('', '', string.punctuation))
tfiltered = list(filter(lambda w: w not in s, thing.split()))
# tfiltered = map(lambda x: lemmatizer.lemmatize(x), tfiltered)
# tfiltered = list(tfiltered)
corpus.append(tfiltered)

dct = Dictionary(corpus)
bow_corpus = [dct.doc2bow(line) for line in corpus]
term_doc_mat = corpus2csc(bow_corpus)

from collections import OrderedDict

document = corpus
names = list(dct.values())
occurrences = OrderedDict(
    (name, OrderedDict((name, 0) for name in names)) for name in names)

# Find the co-occurrences within `word_window` tokens on either side of each word:
for l in document:
    for i in range(len(l)):
        window = l[max(0, i - word_window):i] + l[i + 1:i + 1 + word_window]
        print(window)
        for item in window:
            occurrences[l[i]][item] += 1

# Print the matrix
wcounts = dict()
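
# A hedged follow-on sketch (assumes pandas is available as pd): the nested
# OrderedDict above can be viewed as a square co-occurrence matrix with one
# row and column per vocabulary token.
import pandas as pd

co_occurrence = pd.DataFrame(occurrences).T  # rows: focus word, columns: context word
print(co_occurrence.head())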
import pandas as pd
from gensim.corpora import Dictionary
from preprocessor import preprocess, flatten

metadata = pd.read_csv("..\\data\\absrecord.csv")
print(len(metadata['filename'].values))

fullvocab = []
for record in range(len(metadata)):
    # print(100 * record / len(metadata))
    fullvocab.append(preprocess(str(metadata.iloc[record]['body']))[0])
print(fullvocab)

maindict = Dictionary(fullvocab)

i = 0
fulldict = []
for document in fullvocab:
    temp = []
    print(100 * i / len(fullvocab))
    i += 1
    document = list(sorted(set(document)))
    for token in document:
        # look the token up directly in the dictionary's token -> id map
        token_id = maindict.token2id.get(token)
        if token_id is not None:
            temp.append({"id": token_id, "name": token})
            # print({"id": token_id, "name": token})
    fulldict.append(temp)

b = metadata['filename'].values
print(fulldict)

a = pd.DataFrame({'keywords': fulldict})
# join the keywords column onto the metadata before writing it out
metadata = pd.concat([metadata, a], axis=1)
metadata.to_csv("..\\data\\keywords.csv")
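
# An alternative sketch using Dictionary.doc2idx: it maps a token list straight
# to ids (-1 for out-of-vocabulary tokens), so the id/name pairs can be built
# without any per-token lookups (shown here on the first document only).
tokens = fullvocab[0]
ids = maindict.doc2idx(tokens)
pairs = [{"id": i, "name": t} for t, i in zip(tokens, ids) if i != -1]
print(pairs)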
def get_headers(df, attr):
    documents = df[attr]
    dictionary = Dictionary(documents)
    return list(dictionary.values())
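
# A minimal usage sketch (hypothetical DataFrame and column name): each row of
# the "tokens" column is a list of strings, and get_headers returns the unique
# vocabulary collected by the Dictionary.
import pandas as pd
from gensim.corpora import Dictionary

df = pd.DataFrame({"tokens": [["alpha", "beta"], ["beta", "gamma"]]})
print(get_headers(df, "tokens"))  # e.g. ['alpha', 'beta', 'gamma'] in id order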
class CMVCorpus(object):
    logger = logging.getLogger(__name__)

    def __init__(self, config):
        self.config = config
        self._path = config.data_dir[0]
        self.max_data_size = config.max_data_size
        self.max_utt_len = config.max_utt_len
        self.tokenize = get_chat_tokenize()
        self.train_corpus, self.test_corpus = self._read_file(
            os.path.join(self._path))
        self._build_vocab(config.max_vocab_cnt)
        print("Done loading corpus")

    def _process_dialog(self, data):
        new_dialog = []
        all_lens = []
        all_dialog_lens = []
        for raw_dialog in data:
            dialog = {
                "title": self.tokenize(raw_dialog['title'].lower()),
                "op": self.tokenize(raw_dialog["content"].lower()),
                "pos_conv_lst": [],
                "neg_conv_lst": []
            }
            for i, turns in enumerate(
                    raw_dialog['comments']):  # for each comment lst
                if turns["win"]:
                    conv_lst = dialog["pos_conv_lst"]
                else:
                    conv_lst = dialog["neg_conv_lst"]
                new_utt_lst = []
                for turn in turns["utt_lst"]:
                    argument = self.tokenize(turn.lower())
                    all_lens.append(len(argument))
                    new_utt_lst.append(argument)
                conv_lst.append(new_utt_lst)
                all_dialog_lens.append(len(new_utt_lst))
            new_dialog.append(dialog)
            # cut for the max data size
            if len(new_dialog) >= self.max_data_size:
                break
        print("Max utt len %d, mean utt len %.2f" %
              (np.max(all_lens), float(np.mean(all_lens))))
        print("Max dialog len %d, mean dialog len %.2f" %
              (np.max(all_dialog_lens), float(np.mean(all_dialog_lens))))
        return new_dialog

    def _build_vocab(self, max_vocab_cnt):
        all_words = []
        for dialog in self.train_corpus:
            all_words.append(dialog["op"] + dialog["title"])
            for turns in dialog["pos_conv_lst"] + dialog["neg_conv_lst"]:
                for turn in turns:
                    all_words.append(turn)
        self.vocab_bow = Dictionary(all_words)
        raw_vocab_size = len(self.vocab_bow)
        raw_wc = np.sum(list(self.vocab_bow.dfs.values()))
        # drop useless stopword-like vocab (e.g., very rare words, single ascii words, some punctuation ,."')
        self.vocab_bow.filter_extremes(no_below=10, keep_n=max_vocab_cnt)
        bad_ids = HTML_STOPWORDS + ['cmv']
        self.vocab_bow.filter_tokens(
            list(map(self.vocab_bow.token2id.get, bad_ids)))
        self.vocab_bow.compactify()

        self.vocab_seq = copy.deepcopy(self.vocab_bow)  # for sequence model
        self.vocab_seq.token2id[self.vocab_seq[0]] = len(self.vocab_seq)
        self.vocab_seq.token2id[PAD] = 0
        self.vocab_seq.token2id[UNK] = len(self.vocab_seq)
        self.vocab_seq.compactify()
        self.pad_wid = self.vocab_seq.token2id.get(PAD)

        len_1_words = list(
            filter(
                lambda w: len(w) == 1 and re.match(r"[\x00-\x7f]", w) and w
                not in ["[", "]", "$", "?", "!", "\"", "'", "i", "a"] and True
                or False, self.vocab_bow.values()))
        self.vocab_bow.filter_tokens(
            list(map(self.vocab_bow.token2id.get, len_1_words)))
        # some makeup words
        # makeup_lst = [PAD]
        # for w in makeup_lst:
        #     self.vocab_bow.token2id[w] = len(self.vocab_bow)
        # self.vocab_bow.compactify()
        # self.pad_wid = self.vocab_bow.token2id.get(PAD)

        # here we keep stopwords and some meaningful punctuation
        non_stopwords = filter(
            lambda w: re.match(r"^[\w\d_-]*$", w) and w not in STOPWORDS and
            True or False, self.vocab_bow.values())
        self.vocab_bow_stopwords = copy.deepcopy(self.vocab_bow)
        self.vocab_bow_stopwords.filter_tokens(
            map(self.vocab_bow_stopwords.token2id.get, non_stopwords))
        self.vocab_bow_stopwords.compactify()
        self.vocab_bow_non_stopwords = copy.deepcopy(self.vocab_bow)
        self.vocab_bow_non_stopwords.filter_tokens(
            map(self.vocab_bow_non_stopwords.token2id.get,
                self.vocab_bow_stopwords.values()))
        self.vocab_bow_non_stopwords.compactify()
        remain_wc = np.sum(list(self.vocab_bow.dfs.values()))
        min_count = np.min(list(self.vocab_bow.dfs.values()))
        # create vocabulary list sorted by count
        print("Load corpus with train size %d, "
              "test size %d raw vocab size %d vocab size %d at cut_off %d OOV rate %f"
              % (len(self.train_corpus), len(self.test_corpus),
                 raw_vocab_size, len(self.vocab_bow), min_count,
                 1 - float(remain_wc) / raw_wc))

    def _read_file(self, path):
        with open(path, 'r') as f:
            data = json.load(f)
        return self._process_dialog(data["train"]), self._process_dialog(
            data["test"])

    def _sent2id_seq(self, sent, vocab):
        return list(
            filter(lambda x: x is not None,
                   [vocab.token2id.get(t) for t in sent]))

    def _sent2id_bow(self, sent, vocab):
        if sent:
            return vocab.doc2bow(sent)
        else:
            return []

    def _to_id_corpus(self, data, vocab_seq, vocab_bow):
        results = []
        word_cnt = 0
        msg_cnt = 0
        for dialog in data:
            # convert utterance and feature into numeric numbers
            id_dialog = Pack(title=self._sent2id_seq(dialog["title"], vocab_seq),
                             op=self._sent2id_seq(dialog["op"], vocab_seq),
                             pos_conv_seq_lst=[],
                             pos_conv_bow_lst=[],
                             neg_conv_seq_lst=[],
                             neg_conv_bow_lst=[])
            for turns in dialog["pos_conv_lst"]:
                new_turns_bow = []
                new_turns_seq = []
                for turn in turns:
                    id_turn_seq = self._sent2id_seq(turn, vocab_seq)
                    id_turn_bow = self._sent2id_bow(turn, vocab_bow)
                    if id_turn_seq and id_turn_bow:  # filter empty utt
                        new_turns_bow.append(id_turn_bow)
                        new_turns_seq.append(id_turn_seq)
                        word_cnt += len(id_turn_seq)
                        msg_cnt += 1
                if new_turns_seq and new_turns_bow:
                    id_dialog["pos_conv_bow_lst"].append(new_turns_bow)
                    id_dialog["pos_conv_seq_lst"].append(new_turns_seq)
            for turns in dialog["neg_conv_lst"]:
                new_turns_bow = []
                new_turns_seq = []
                for turn in turns:
                    id_turn_seq = self._sent2id_seq(turn, vocab_seq)
                    id_turn_bow = self._sent2id_bow(turn, vocab_bow)
                    if id_turn_seq and id_turn_bow:  # filter empty utt
                        new_turns_bow.append(id_turn_bow)
                        new_turns_seq.append(id_turn_seq)
                        word_cnt += len(id_turn_seq)
                        msg_cnt += 1
                if new_turns_seq and new_turns_bow:
                    id_dialog["neg_conv_bow_lst"].append(new_turns_bow)
                    id_dialog["neg_conv_seq_lst"].append(new_turns_seq)
            if id_dialog.pos_conv_bow_lst and id_dialog.neg_conv_bow_lst:
                results.append(id_dialog)
        print("Load seq with %d msgs, %d words" % (msg_cnt, word_cnt))
        return results, msg_cnt, word_cnt

    def _to_id_corpus_bow(self, data, vocab):
        results = []
        word_cnt = 0
        msg_cnt = 0
        for dialog in data:
            # convert utterance and feature into numeric numbers
            id_dialog = Pack(title=self._sent2id_bow(dialog["title"], vocab),
                             op=self._sent2id_bow(dialog["op"], vocab),
                             pos_conv_bow_lst=[],
                             neg_conv_bow_lst=[])
            for turns in dialog["pos_conv_lst"]:
                new_turns = []
                for turn in turns:
                    id_turn = self._sent2id_bow(turn, vocab)
                    if id_turn:  # filter empty utt
                        new_turns.append(id_turn)
                        word_cnt += np.sum([j for i, j in id_turn])
                        msg_cnt += 1
                if new_turns:
                    id_dialog["pos_conv_bow_lst"].append(new_turns)
            for turns in dialog["neg_conv_lst"]:
                new_turns = []
                for turn in turns:
                    id_turn = self._sent2id_bow(turn, vocab)
                    if id_turn:  # filter empty utt
                        new_turns.append(id_turn)
                        word_cnt += np.sum([j for i, j in id_turn])
                        msg_cnt += 1
                if new_turns:
                    id_dialog["neg_conv_bow_lst"].append(new_turns)
            if id_dialog.pos_conv_bow_lst and id_dialog.neg_conv_bow_lst:
                results.append(id_dialog)
        print("Load bow with %d msgs, %d words" % (msg_cnt, word_cnt))
        return results, msg_cnt, word_cnt

    def get_corpus_bow(self, keep_stopwords=True):
        if keep_stopwords:
            vocab = self.vocab_bow
        else:
            vocab = self.vocab_bow_non_stopwords
        id_train = self._to_id_corpus_bow(self.train_corpus, vocab)
        id_test = self._to_id_corpus_bow(self.test_corpus, vocab)
        return Pack(train=id_train, test=id_test, vocab_size=len(vocab))

    def get_corpus_seq(self):
        vocab = self.vocab_seq
        id_train = self._to_id_corpus_seq(self.train_corpus, vocab)
        id_test = self._to_id_corpus_seq(self.test_corpus, vocab)
        return Pack(train=id_train, test=id_test, vocab_size=len(vocab))

    def get_corpus(self):
        id_train = self._to_id_corpus(self.train_corpus, self.vocab_seq,
                                      self.vocab_bow)
        id_test = self._to_id_corpus(self.test_corpus, self.vocab_seq,
                                     self.vocab_bow)
        # id_valid = self._to_id_corpus(self.valid_corpus, self.vocab_seq, self.vocab_bow)
        return Pack(train=id_train, test=id_test,
                    vocab_size=len(self.vocab_bow))
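
# A minimal standalone sketch of the two id encodings used by _sent2id_seq and
# _sent2id_bow above (toy vocabulary for illustration): the sequence form keeps
# token order via token2id, while doc2bow collapses a turn to (id, count) pairs.
from gensim.corpora import Dictionary

vocab = Dictionary([["we", "should", "ban", "cars", "we"]])
sent = ["we", "should", "ban", "cars", "we", "unknownword"]
seq_ids = [vocab.token2id[t] for t in sent if t in vocab.token2id]  # ordered ids
bow_ids = vocab.doc2bow(sent)                                       # [(id, count), ...]
print(seq_ids, bow_ids)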
df['lemmatized_text'] = list(
    map(lambda sentence: list(map(lemm.lemmatize, sentence)),
        df.stopwords_removed))

p_stemmer = nltk.stem.porter.PorterStemmer()
df['stemmed_text'] = list(
    map(lambda sentence: list(map(p_stemmer.stem, sentence)),
        df.lemmatized_text))

stem_words(train_data)

# Vectorize words
dictionary = Dictionary(documents=train_data.stemmed_text.values)
dictionary.save('model/dictionary.txtdic')
print("Found {} words.".format(len(dictionary.values())))

# dictionary.filter_extremes(no_above=0.8, no_below=3)
dictionary.compactify()  # Reindexes the remaining words after filtering
print("Left with {} words.".format(len(dictionary.values())))


# Make a BOW (Bag of Words) for every document
def document_to_bow(df):
    df['bow'] = list(
        map(lambda doc: dictionary.doc2bow(doc), df.stemmed_text))


document_to_bow(train_data)


# we make a function such that later on when we make the submission,
# we don't need to write duplicate code
def lda_preprocessing(df):
class FasttextTfIdfTransformer:
    def __init__(self,
                 model=None,
                 dictionary=None,
                 corpus_file=None,
                 size=256,
                 window=7,
                 min_count=4,
                 iter=30,
                 min_n=4,
                 max_n=5,
                 word_ngrams=1,
                 no_above=0.5,
                 filter_n_most_frequent=100,
                 do_filter_tokens=True,
                 workers=multiprocessing.cpu_count() - 1,
                 ft_prefix="ft_",
                 token_column=None,
                 inplace=True,
                 store_train_data=False,
                 skip_fit=False,
                 skip_transform=False,
                 normalize_word_vectors=True):
        self.size = size
        self.window = window
        self.min_count = min_count
        self.iter = iter
        self.min_n = min_n
        self.max_n = max_n
        self.word_ngrams = word_ngrams
        self.workers = workers
        self.token_column = token_column
        self.model = None
        assert type(self.token_column) == str
        self.ft_prefix = ft_prefix
        self.skip_fit = skip_fit
        self.skip_transform = skip_transform
        self.inplace = inplace
        self.normalize_word_vectors = normalize_word_vectors
        self.store_train_data = store_train_data
        self.train = None
        self.model = model
        self.no_above = no_above
        self.word_set = None
        self.filter_n_most_frequent = filter_n_most_frequent
        self.do_filter_tokens = do_filter_tokens
        self.dictionary = dictionary

        if model is None and corpus_file is not None:
            self.dictionary = Dictionary(
                map(lambda s: s.split(), load_list_per_line(corpus_file)))
            print("Total Unique Tokens = %s" % (len(self.dictionary)))
            self.dictionary.filter_extremes(no_below=self.min_count,
                                            no_above=self.no_above,
                                            keep_n=1000000)
            self.dictionary.filter_n_most_frequent(self.filter_n_most_frequent)
            print("Total Unique Tokens after filtering = %s" %
                  (len(self.dictionary)))
            self.word_set = set(self.dictionary.values())
            self.model = FastText(corpus_file=corpus_file,
                                  size=self.size,
                                  window=self.window,
                                  min_count=self.min_count,
                                  iter=self.iter,
                                  min_n=self.min_n,
                                  max_n=self.max_n,
                                  word_ngrams=self.word_ngrams,
                                  workers=self.workers,
                                  bucket=8000000,
                                  alpha=0.03,
                                  negative=10,
                                  ns_exponent=0.5)
        if (model is None or dictionary is None) and corpus_file is None:
            raise ValueError("No data given to initialise FastText Model")
        assert self.dictionary is not None and self.model is not None

    def fit(self, X, y='ignored'):
        gc.collect()
        if self.store_train_data:
            self.train = (X, y)
        if self.skip_fit:
            return self
        if type(X) == pd.DataFrame:
            X = X[self.token_column].values
        else:
            raise ValueError()
        assert self.dictionary is not None and self.model is not None
        self.dictionary.add_documents(X)
        dct = self.dictionary
        print("Total Unique Tokens = %s" % (len(dct)))
        dct.filter_extremes(no_below=self.min_count,
                            no_above=self.no_above,
                            keep_n=1000000)
        dct.filter_n_most_frequent(self.filter_n_most_frequent)
        print("Total Unique Tokens after filtering = %s" % (len(dct)))
        self.word_set = set(dct.values())
        print("FastText Modelling Started at %s" % (str(pd.datetime.now())))
        self.model.build_vocab(X, update=True)
        self.model.train(X,
                         total_examples=self.model.corpus_count,
                         epochs=self.model.epochs)
        print("FastText Modelling done at %s" % (str(pd.datetime.now())))
        print("FastText Vocab Length = %s, Ngrams length = %s" %
              (len(self.model.wv.vectors_vocab),
               len(self.model.wv.vectors_ngrams)))
        gc.collect()
        return self

    def fit_stored(self):
        X, y = self.train
        return self.fit(X, y)

    def partial_fit(self, X, y=None):
        self.fit(X, y='ignored')

    def transform_one(self, token_array):
        tokens2vec = [
            self.model.wv[token] if token in self.model.wv else np.full(
                self.size, 0) for token in token_array
        ]
        if np.sum(tokens2vec) == 0:
            return np.full(self.size, 0)
        return np.average(tokens2vec, axis=0)

    def transform(self, X, y='ignored'):
        print("Fasttext Transforms start at: %s" % (str(pd.datetime.now())))
        if self.skip_transform:
            return X
        if type(X) == pd.DataFrame:
            Input = X[self.token_column].values
        else:
            raise ValueError()
        if not self.inplace:
            X = X.copy()
        uniq_tokens = set(more_itertools.flatten(Input))
        print("Number of Unique Test Tokens for Fasttext transform %s" %
              len(uniq_tokens))
        if self.do_filter_tokens:
            uniq_tokens = uniq_tokens.intersection(self.word_set)
            print(
                "Number of Unique Test Tokens after filtering for Fasttext transform %s"
                % len(uniq_tokens))
        empty = np.full(self.size, 0)
        token2vec = {
            k: self.model.wv[k] if k in self.model.wv else empty
            for k in uniq_tokens
        }
        # guard against dividing the all-zero (out-of-vocabulary) vector by its zero norm
        token2vec = {
            k: v / np.linalg.norm(v) if np.linalg.norm(v) > 0 else v
            for k, v in token2vec.items()
        }

        def tokens2vec(token_array):
            empty = np.full(self.size, 0)
            if len(token_array) == 0:
                return empty
            return [
                token2vec[token] if token in uniq_tokens else empty
                for token in token_array
            ]

        ft_vecs = list(map(tokens2vec, Input))
        results = list(
            map(
                lambda x: np.average(x, axis=0)
                if np.sum(x) != 0 else np.full(self.size, 0), ft_vecs))
        text_df = pd.DataFrame(list(map(list, results)))
        text_df.columns = [
            self.ft_prefix + str(i) for i in range(0, self.size)
        ]
        text_df.index = X.index
        X[list(text_df.columns)] = text_df
        gc.collect()
        print("Fasttext Transforms done at: %s" % (str(pd.datetime.now())))
        return X

    def inverse_transform(self, X, copy=None):
        raise NotImplementedError()

    def fit_transform(self, X, y='ignored'):
        self.fit(X)
        return self.transform(X)
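
# A hedged usage sketch (hypothetical file and column names; assumes the
# pre-4.0 gensim FastText keyword arguments used above and a whitespace-tokenised
# corpus file): fit on a DataFrame whose "tokens" column holds token lists, then
# transform to append ft_0 ... ft_{size-1} embedding columns.
ft = FasttextTfIdfTransformer(corpus_file="corpus.txt",
                              token_column="tokens",
                              size=64,
                              iter=5)
train_df = ft.fit(train_df).transform(train_df)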
# coherencemodel2 = CoherenceModel(
#     model=lda, texts=data, dictionary=id2word, coherence='c_v')
# coherence_arr.append(coherencemodel2.get_coherence())

f2 = open('models_online.pkl', 'wb')
count = 0
# The loop simulates arrival of new documents from Google Alerts in batches of STEP_SIZE
for i in range(INITIAL_DOC_SIZE, len(data_lemmatized) - STEP_SIZE, STEP_SIZE):
    # new_docs is the list of STEP_SIZE new documents which have arrived
    new_docs = data_lemmatized[i:i + STEP_SIZE]
    # keep only tokens that are already present in the dictionary
    pruned_docs = []
    for doc in new_docs:
        pruned_data = [x for x in doc if x in id2word.token2id]
        pruned_docs.append(pruned_data)
    new_docs = pruned_docs
    print('Pruning Done')

    # Updating Dictionary
    # id2word.add_documents(new_docs)
    # id2word.filter_extremes(no_below=5, no_above=0.95,
    #                         keep_n=1800)

    prev_corpus = copy.deepcopy(corpus)
    # Converting documents to doc2bow format so that they can be fed to the models
    corpus = [id2word.doc2bow(doc) for doc in new_docs]
    count += 1
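    # A hedged sketch of the update step that would typically follow here, still
    # inside the batch loop (assumes `lda` is a gensim LdaModel already trained on
    # the initial corpus, and that `pickle` is imported): LdaModel.update() folds
    # the new batch of doc2bow vectors into the existing model online.
    lda.update(corpus)
    pickle.dump(lda, f2)  # hypothetical: persist the updated model to f2 each batch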
p_stemmer = nltk.stem.porter.PorterStemmer()
df_fon['stemmed_text'] = list(
    map(lambda sentence: list(map(p_stemmer.stem, sentence)),
        df_fon.lemmatized_text))

stem_words(df_Moive_train)

from gensim.corpora import Dictionary

# Vectorize the words
dictionary = Dictionary(documents=df_Moive_train.stemmed_text.values)
print("Words found: {}".format(len(dictionary.values())))

dictionary.filter_extremes(no_above=0.8, no_below=3)
dictionary.compactify()  # Reindexes the remaining words after filtering
print("Words remaining: {}".format(len(dictionary.values())))


# One BOW (bag of words) per document
def document_to_bow(df_fon):
    df_fon['bow'] = list(
        map(lambda doc: dictionary.doc2bow(doc), df_fon.stemmed_text))


document_to_bow(df_Moive_train)
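
# A small toy illustration (hypothetical thresholds) of what filter_extremes
# does above: drop tokens that appear in fewer than no_below documents or in
# more than the no_above fraction of all documents.
toy = Dictionary([["the", "cat", "sat"], ["the", "dog", "sat"], ["the", "bird"]])
toy.filter_extremes(no_below=2, no_above=0.9)
print(list(toy.values()))  # -> ['sat']: "cat"/"dog"/"bird" are too rare, "the" too frequent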