def create_dictionary(model=None, text=None):
    if model is not None and text is not None:
        gensim_dic = Dictionary()
        gensim_dic.doc2bow(model.wv.vocab.keys(), allow_update=True)
        w2index = {v: k + 1 for k, v in gensim_dic.items()}
        w2vec = {word: model[word] for word in w2index.keys()}

        def word2id(text):
            # Convert the tokenized text into dictionary-index form, then pad.
            data = []
            for te in text:
                word_2_id = []
                for word in te:
                    try:
                        word_2_id.append(w2index[word])
                    except KeyError:
                        # Tokens missing from the dictionary get index 0.
                        word_2_id.append(0)
                data.append(word_2_id)
            return data

        text = word2id(text)
        text = sequence.pad_sequences(text, maxlen=100)
        return w2index, w2vec, text
    else:
        print('data is None')
def test_dictionaries():
    dictionary = Dictionary(TOKEN_SETS)
    # it maps tokens to numeric indices:
    assert list(dictionary.items()) == [
        (0, 'all'), (1, 'kings'), (2, 'men'), (3, 'the'), (4, 'ate'),
        (5, 'hens'), (6, 'and'), (7, 'got'), (8, 'sleep'), (9, 'they'),
        (10, 'tired'), (11, 'to'), (12, 'until'), (13, 'went'), (14, 'zzz')
    ]
    assert dictionary.token2id == {
        'all': 0, 'kings': 1, 'men': 2, 'the': 3, 'ate': 4,
        'hens': 5, 'and': 6, 'got': 7, 'sleep': 8, 'they': 9,
        'tired': 10, 'to': 11, 'until': 12, 'went': 13, 'zzz': 14
    }
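# --- Hypothetical fixture (not shown in the original source) ---
# `TOKEN_SETS` is referenced above but never defined here. One definition
# consistent with the asserted id assignments (gensim adds new tokens in
# sorted order per document) and with the filter_extremes() test below:
TOKEN_SETS = [
    ["all", "the", "kings", "men"],
    ["ate", "all", "the", "kings", "hens"],
    ["until", "they", "all", "got", "tired", "and", "went", "to", "sleep", "zzz"],
]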
def create_dictionaries(p_model):
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(p_model.wv.vocab.keys(), allow_update=True)
    w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # word indices, numbered from 1
    w2vec = {word: p_model[word] for word in w2indx.keys()}  # word vectors
    return w2indx, w2vec
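# --- Illustrative sketch (not from the original source) ---
# Why the recurring `{v: k + 1 ...}` shift: index 0 is reserved downstream
# for padding / out-of-vocabulary tokens, so real words must start at 1.
from gensim.corpora import Dictionary

d = Dictionary()
d.doc2bow(['cat', 'dog'], allow_update=True)   # assigns ids 0 and 1
w2indx = {v: k + 1 for k, v in d.items()}      # {'cat': 1, 'dog': 2}
print(w2indx.get('bird', 0))                   # unknown word -> 0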
def create_dictionaries(model=None, combined=None):
    if (model is not None) and (combined is not None):
        gensim_dic = Dictionary()
        gensim_dic.doc2bow(model.wv.vocab.keys(), allow_update=True)
        w2index = {v: k + 1 for k, v in gensim_dic.items()}  # word -> index
        w2vec = {word: model[word] for word in w2index.keys()}  # word -> vector

        def parse_dataset(combined):
            '''Words become integers'''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2index[word])
                    except KeyError:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        # Inputs must have uniform length, so sentences are truncated to the
        # same length and zero-padded when shorter than the maximum.
        combined = sequence.pad_sequences(combined, maxlen=100)
        return w2index, w2vec, combined
    else:
        print('data is None')
def create_dictionaries(model=None, combined=None):
    '''
    Function does a number of jobs:
    1- Creates a word-to-index mapping
    2- Creates a word-to-vector mapping
    3- Transforms the training and testing dictionaries
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # Words below the frequency cutoff map to 0, hence the k + 1 offset.
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # indices of all words with frequency above 10, (k->v) => (v->k)
        w2vec = {word: model[word] for word in w2indx.keys()}  # vectors of all words with frequency above 10, (word -> model(word))

        def parse_dataset(combined):  # closure, used only locally
            '''Words become integers'''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)  # out-of-vocabulary word -> 0
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)  # word => index
        # Each sentence becomes its words' indices; words with frequency
        # below 10 get index 0.
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
def create_dictionaries(model=None, combined=None):
    '''
    Function does a number of jobs:
    1- Creates a word-to-index mapping
    2- Creates a word-to-vector mapping
    3- Transforms the training and testing dictionaries
    '''
    maxlen = 100
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # indices of all words with frequency above 10
        w2vec = {word: model[word] for word in w2indx.keys()}  # vectors of all words with frequency above 10

        def parse_dataset(combined):
            '''Words become integers'''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        return w2indx, w2vec, combined
    else:
        print('No data provided')
def create_dictionaries(model=None, sen_lst=None):
    '''
    Function does a number of jobs:
    1- Creates a word-to-index mapping
    2- Creates a word-to-vector mapping
    3- Transforms the training and testing dictionaries
    '''
    if (sen_lst is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # indices of all words with frequency above 10
        w2vec = {word: model[word] for word in w2indx.keys()}  # vectors of all words with frequency above 10

        def parse_dataset(sen_lst):
            '''Words become integers'''
            data = []
            for sentence in sen_lst:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(sen_lst)
        # Each sentence becomes its words' indices; words with frequency
        # below 10 get index 0.
        combined = sequence.pad_sequences(combined, maxlen=MAX_LEN)
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
def create_dictionaries(data=None, model=None):
    if (data is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(dataset):
            '''Words become integers'''
            for key in dataset.keys():
                txt = dataset[key].lower().replace('\n', '').split()
                new_txt = []
                for word in txt:
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)
                dataset[key] = new_txt
            return dataset

        data = parse_dataset(data)
        return w2indx, w2vec, data
    else:
        print('No data provided...')
def create_dictionaries(model=None, combined=None):
    """Return the word index, the word-vector mapping, and sentences with
    uniform length and indices."""
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # Words that have a word vector get a nonzero index.
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        # Collect the corresponding vector for every indexed word.
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(combined):
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        # Unify the sentence lengths with Keras's pad_sequences.
        combined = sequence.pad_sequences(combined, maxlen=max_len)
        return w2indx, w2vec, combined
    else:
        logging.warning('No data provided...')
def test_streaming():
    generator = token_stream(NOVELS_DIRPATH)
    # it can be constructed via a generator:
    dictionary = Dictionary(generator)
    token_items = list(dictionary.items())
    assert len(token_items) == 1969
    assert token_items[0:4] == [(0, 'a'), (1, 'about'), (2, 'accommodate'), (3, 'admire')]
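# --- Hypothetical helper (not shown in the original source) ---
# A sketch of what `token_stream` could look like: a generator that yields
# one token list per file, so Dictionary never holds the whole corpus in
# memory. The tokenization here is an assumption.
import os

def token_stream(dirpath):
    for filename in sorted(os.listdir(dirpath)):
        with open(os.path.join(dirpath, filename)) as f:
            yield f.read().lower().split()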
def test_statistical_trimming():
    dictionary = Dictionary(TOKEN_SETS)
    # no_below and no_above are like min_df and max_df, except...
    # + no_below: absolute number of documents
    # + no_above: percentage of documents
    dictionary.filter_extremes(no_below=2, no_above=0.99)
    # it excludes terms not meeting the filter conditions:
    assert list(dictionary.items()) == [(0, 'kings'), (1, 'the')]
    assert dictionary.token2id == {'kings': 0, 'the': 1}
def test_dict_interface(self):
    """Test Python 2 dict-like interface in both Python 2 and 3."""
    d = Dictionary(self.texts)
    self.assertTrue(isinstance(d, Mapping))
    self.assertEqual(list(zip(d.keys(), d.values())), list(d.items()))
    # Even in Py3, we want the iter* members.
    self.assertEqual(list(d.items()), list(d.iteritems()))
    self.assertEqual(list(d.keys()), list(d.iterkeys()))
    self.assertEqual(list(d.values()), list(d.itervalues()))
    # XXX Do we want list results from the dict members in Py3 too?
    if not PY3:
        self.assertTrue(isinstance(d.items(), list))
        self.assertTrue(isinstance(d.keys(), list))
        self.assertTrue(isinstance(d.values(), list))
def word2vec_init(model=None):
    # Load the word-vector model and build the corresponding embedding matrix.
    global word2vec, word2idx, embed_weight
    model = Word2Vec.load('./model/word2vec') if not model else model  # load the word-vector model
    dic = Dictionary()
    dic.doc2bow(model.wv.vocab.keys(), allow_update=True)
    word2idx = {token: idx + 1 for idx, token in dic.items()}
    word2vec = {word: model[word] for word in dic.values()}
    embed_weight = zeros((len(word2idx) + 1, embed_dim))
    for word, idx in word2idx.items():
        embed_weight[idx, :] = word2vec[word]  # embedding matrix; row 0 stays the zero vector
def create_dictionaries(model=None, text=None):
    """Function does a number of jobs:
    1- Creates a word-to-index mapping
    2- Creates a word-to-vector mapping
    3- Transforms the training and testing dictionaries
    """
    maxlen = 100  # truncation length for the index sequences
    if (text is not None) and (model is not None):
        # Behaves like a Python dict: keys are the words, values are each
        # word's unique integer id.
        gensim_dict = Dictionary()
        # doc2bow() counts the occurrences of each distinct word, converts
        # words to integer ids, and returns the result as a sparse vector.
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # Index of each word (k -> id, v -> word): all words with frequency above 10.
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        # Vector of each word with frequency above 10.
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(text):
            """Words become integers"""
            data = []
            for sentence in text:  # each sentence is already tokenized
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])  # known token -> its index
                    except KeyError:
                        new_txt.append(0)  # unknown token -> 0
                data.append(new_txt)
            return data

        text = parse_dataset(text)
        # Truncate or pad all sequences to the same length.
        text = sequence.pad_sequences(text, maxlen=maxlen)
        return w2indx, w2vec, text
    else:
        print('No data provided')
def _tfidf_gensim(table, input_col, output_col_name="sparse_vectors",
                  tf_weighing='n', df_weighing='t', document_normalization='c'):
    out_table = table.copy()
    tokens = out_table[input_col]
    smartirs = tf_weighing + df_weighing + document_normalization
    dictionary = Dictionary(tokens)
    word_count_vector_list = [dictionary.doc2bow(text) for text in tokens]
    tfidf_model = TfidfModel(word_count_vector_list, smartirs=smartirs)
    tfidf_vector_list = [*tfidf_model[word_count_vector_list]]
    sparse_matrix = corpus2csc(tfidf_vector_list,
                               num_terms=len(dictionary.token2id)).T
    rb = BrtcReprBuilder()
    dictionary_data = [[index, word, tfidf_model.dfs[index], tfidf_model.idfs[index]]
                       for index, word in dictionary.items()]
    dictionary_table = pd.DataFrame(data=dictionary_data,
                                    columns=['index', 'word', 'count', 'idf'])
    dictionary_table = dictionary_table.sort_values(["count"], ascending=[False])
    rb.addMD(
        strip_margin("""
        | ## TFIDF Result
        | ### Dictionary
        | {table1}
        """.format(table1=pandasDF2MD(dictionary_table))))
    out_table[output_col_name] = csr_matrix_to_sparse_vector_json_list(sparse_matrix)
    model = _model_dict('tfidf_model')
    model['dictionary_table'] = dictionary_table
    model['dictionary'] = dictionary
    model['tfidf_model'] = tfidf_model
    model['input_col'] = input_col
    model['output_col_name'] = output_col_name
    model['_repr_brtc_'] = rb.get()
    return {'out_table': out_table, 'model': model}
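# --- Illustrative sketch (hypothetical data, not from the original) ---
# The smartirs string follows SMART notation. With the defaults above:
# 'n' = natural term frequency, 't' = idf document weighting,
# 'c' = cosine normalization, i.e. smartirs='ntc'.
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

docs = [["cat", "sat", "cat"], ["dog", "sat"]]
dct = Dictionary(docs)
bows = [dct.doc2bow(d) for d in docs]
tfidf = TfidfModel(bows, smartirs="ntc")
print(tfidf[bows[0]])  # sparse (token_id, weight) pairs for the first document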
def generate_id2wec(model_path):
    """
    :param model_path: path to the saved word2vec model
    :return: w2id (word -> index dict) and embedding_weights (2-D word-vector matrix)
    """
    model = Word2Vec.load(model_path)
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
    w2id = {v: k + 1 for k, v in gensim_dict.items()}  # word indices, numbered from 1
    w2vec = {word: model[word] for word in w2id.keys()}  # word vectors
    n_vocabs = len(w2id) + 1
    embedding_weights = np.zeros((n_vocabs, 100))
    for w, index in w2id.items():
        # Fill the matrix starting at index 1; row 0 stays zero for padding.
        embedding_weights[index, :] = w2vec[w]
    return w2id, embedding_weights
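# --- Hypothetical usage (not from the original source) ---
# Wiring the returned matrix into a Keras Embedding layer. The model path
# and the frozen-weights choice are assumptions; the 100-dim width matches
# the hard-coded value above.
from keras.models import Sequential
from keras.layers import Embedding

w2id, embedding_weights = generate_id2wec('./model/word2vec')  # assumed path
net = Sequential()
net.add(Embedding(input_dim=embedding_weights.shape[0],   # vocab size + 1 (padding row)
                  output_dim=embedding_weights.shape[1],  # 100
                  weights=[embedding_weights],
                  trainable=False))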
def load_data_and_to_vector(window_size=10, n_dims=100, pad_max_length=200):
    '''
    :param window_size: word2vec context window size
    :param n_dims: word-vector dimensionality
    :return:
    '''
    words = np.load('../../data/company/tokenized.npy')
    y = np.load('../../data/company/label.npy')
    test = np.load('../../data/company/test.npy')
    total = np.concatenate((words, test))
    w2v = Word2Vec(size=n_dims, window=window_size, workers=4, min_count=1)
    w2v.build_vocab(total)
    w2v.train(total, total_examples=w2v.corpus_count, epochs=w2v.iter)
    # Build the dictionary from the word2vec vocabulary.
    _dict = Dictionary()
    _dict.doc2bow(w2v.wv.vocab.keys(), allow_update=True)
    # w2index is a dict of {word: index} and w2vector is a dict of {word: vector (np.array)}
    w2index = {v: k + 1 for k, v in _dict.items()}  # word indices
    w2vector = {word: w2v[word] for word in w2index.keys()}  # word vectors
    # Convert the token sequences to index sequences.
    _sequence = []
    for _s in words:
        _sequence.append([w2index[w] for w in _s])
    _tests = []
    for _s in test:
        _tests.append([w2index[w] for w in _s])
    padded_words = sequence.pad_sequences(_sequence, maxlen=pad_max_length)
    padded_test = sequence.pad_sequences(_tests, maxlen=pad_max_length)
    # ------- embed start -------
    n_symbols = len(w2index) + 1  # +1 because index 0 is reserved for padding
    # Every index maps to an n_dims-dimensional vector.
    embedding_weights = np.zeros((n_symbols, n_dims))
    # Fill in the vectors.
    for word, index in w2index.items():
        embedding_weights[index, :] = w2vector[word]
    # -------- embed end -------
    return n_symbols, embedding_weights, pad_max_length, padded_words, y, padded_test
def get_word2idx(corpus, w2i_path, keep_tokens, token_limit):
    if iom.check_exists(w2i_path):
        logger.info("Found dictionary! Loading...")
        word2id = iom.load_pickle(w2i_path)
    else:
        logger.info("Dictionary not found! Creating...")
        id2word = Dictionary(corpus, prune_at=2000000)
        # filter out too freq/infreq words
        id2word.filter_extremes(keep_n=token_limit, no_below=2, keep_tokens=keep_tokens)
        word2id = {v: k for k, v in id2word.items()}
        iom.save_pickle(word2id, w2i_path)
    return word2id
def create_dictionaries(model=None, X=None):
    """Create the word dictionary; return each word's index, its vector,
    and the word-index sequence for every sentence.

    Function does a number of jobs:
    1- Creates a word-to-index mapping
    2- Creates a word-to-vector mapping
    3- Transforms the training and testing dictionaries
    """
    if (X is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        w2v_ind = {v: k + 1 for k, v in gensim_dict.items()}  # indices of all words with frequency above 10
        w2vec = {word: model[word] for word in w2v_ind.keys()}  # vectors of all words with frequency above 10
        # Each sentence becomes its words' indices; words with frequency
        # below 10 get index 0.
        X = parse_dataset(X, w2v_ind)
        X = sequence.pad_sequences(X, maxlen=setting.VOCABULARY_MAXLEN)
        return w2v_ind, w2vec, X
    else:
        print('No data provided...')
def word2vector(X_train):
    """Train word vectors."""
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    wv_model = Word2Vec(X_train, size=wv_size, window=6, sg=1, min_count=5,
                        workers=multiprocessing.cpu_count(), iter=10)
    gensim_dict = Dictionary()  # build the word dictionary
    gensim_dict.doc2bow(wv_model.wv.vocab.keys(), allow_update=True)
    w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # word indices, numbered from 1
    w2vec = {word: wv_model[word] for word in w2indx.keys()}  # word vectors
    return w2indx, w2vec
def create_eta(topic_defs: [TopicDef], etadict: corpora.Dictionary,
               ntopics: int) -> np.ndarray:
    # Create an (ntopics, nterms) matrix and fill it with 1.
    eta = np.full(shape=(ntopics, len(etadict)), fill_value=1)
    for topic_idx, topic_def in enumerate(topic_defs):
        # For each word in the list of priors:
        for word in topic_def.words:
            # Find the word in the dictionary.
            keyindex = [index for index, term in etadict.items() if term == word]
            if len(keyindex) > 0:  # if it's in the dictionary
                eta[topic_idx, keyindex[0]] = 1e10  # put a large number in there
            else:
                print(f'create_eta: word "{word}" of topic {topic_def.name} '
                      f'not found in dictionary')
    # Normalize so the probabilities sum to 1 over all topics.
    eta = np.divide(eta, eta.sum(axis=0))
    return eta
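# --- Hypothetical usage (not from the original source) ---
# Seeding gensim LDA with the asymmetric prior built above; LdaModel
# accepts an (num_topics, num_terms) array for `eta`. The names
# `topic_defs` and `bow_corpus` are assumed to exist in the caller.
from gensim.models import LdaModel

eta = create_eta(topic_defs, etadict, ntopics)
lda = LdaModel(corpus=bow_corpus, id2word=etadict, num_topics=ntopics, eta=eta)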
    if i > 0 and (i % 10000) == 0:
        print(i)
    # If you wanna test something..
    if 0 < maxdoc <= i:
        break
    sentence = [line.split()]
    dictionary.add_documents(sentence)
    i += 1

dictionary.filter_extremes()

print("Extracting terms...")
with open(path + 'terms.csv', 'wb') as out:
    csvw = csv.writer(out)
    for item in dictionary.items():
        row = list()
        row.append(str(item[0]))
        row.append(item[1].encode('utf-8'))
        csvw.writerow(row)

print("Writing word-sentence Matrix ... ")
with open(path + 'bow.imat.txt', 'wb') as out:
    with open(corpus_path, "r") as corpus_file:
        csvw = csv.writer(out)
        i = 0
        for line in corpus_file:
            sentence = line.split()
            bow = dictionary.doc2bow(sentence)
            for word in bow:
lda = LdaModel(common_corpus, num_topics=50, passes=100)

#%%
aaa = CoherenceModel(lda, texts=datagensim, dictionary=dct,
                     coherence='c_npmi', window_size=40, topn=5)
aaa.get_coherence()

#%% Building from https://github.com/akashgit/autoencoding_vi_for_topic_models
import pickle as pk

dataAkash = np.load(r'C:\Users\Matteo\Desktop\autoencoding_vi_for_topic_models-master\autoencoding_vi_for_topic_models-master\data\20news_clean\train.txt.npy',
                    encoding="bytes")
dataAkashTest = np.load(r'C:\Users\Matteo\Desktop\autoencoding_vi_for_topic_models-master\autoencoding_vi_for_topic_models-master\data\20news_clean\test.txt.npy',
                        encoding="bytes")
dct = pk.load(open(r'C:\Users\Matteo\Desktop\autoencoding_vi_for_topic_models-master\autoencoding_vi_for_topic_models-master\data\20news_clean\vocab.pkl', "rb"))
inv_dct = {v: k for k, v in dct.items()}

# build text documents
dataAkashText = []
i = 0
for d in dataAkash:
    tmp = []
    for w in d:
        tmp += [inv_dct[w]]
    i += 1
    dataAkashText += [tmp]

dataAkashTextTest = []
i = 0
for d in dataAkashTest:
    tmp = []
                          if not token.is_punct and not nlp.vocab[str(token)].is_stop
                          and ((not str(token).startswith('ne_') and len(str(token)) >= min_word_char_num) or
                               (str(token).startswith('ne_') and len(str(token)) >= min_word_char_num + 3))]
    processed_news_list.append(processed_news)

# processed_news_list = [news.split() for news in news_with_NE]
dictionary = Dictionary(processed_news_list)
# Remove words with too low a document frequency.
dictionary.filter_extremes(no_below=min_doc_tf)
bow_news = [dictionary.doc2bow(doc) for doc in processed_news_list]
bow_news = [news for news in bow_news if len(news) > 0]

# Find the ids of the named entities.
dict_token2id = dictionary.token2id
dict_id2token = dict(dictionary.items())
tokens = list(dict_token2id.keys())
ne_tokens = [token for token in tokens if token.startswith('ne_')]
ne_token_ids = [dict_token2id[token] for token in ne_tokens]
ne_token_ids = set(ne_token_ids)

# Named-entity term weighting: append each document's max token-frequency
# tuple, then the relation weights, then boost entity counts accordingly.
bow_news = [news + [(-1, max([t[1] for t in news]))] for news in bow_news]
bow_news = [news + [relationNum(news, ne_token_ids, dict_id2token)] for news in bow_news]
bow_news = [[(t[0], t[1] + news[-2][1] * news[-1][t[0]]) if t[0] in ne_token_ids else (t[0], t[1])
             for t in news[:-2]] for news in bow_news]
# dictionary.save(os.path.join(data_dir, 'ne8_%s_%s_%s_weighting.dict' % (topn_concepts, gamma, lambd)))
def train_val_test(dataset: pd.DataFrame, dictionary: Dictionary,
                   test_size: float, val_size: float) -> Dict[str, Any]:
    # Make train/val/test indices.
    num_docs = len(dataset)
    vaSize = int(np.floor(val_size * num_docs))
    tsSize = int(np.floor(test_size * num_docs))
    trSize = int(num_docs - vaSize - tsSize)
    idx_permute = np.random.permutation(num_docs).astype(int)
    print('Reading data....')

    # Make sure our text column is of type list.
    dataset['text'] = dataset['text'].apply(lambda x: x.split(' '))

    word2id = dict([(w, j) for j, w in dictionary.items()])
    id2word = dict([(j, w) for j, w in dictionary.items()])

    # Remove words not in train_data.
    print('Starting vocabulary : {}'.format(len(dictionary)))
    vocab = list(dictionary)

    docs_tr = [[word2id[w] for w in dataset['text'][idx_permute[idx_d]] if w in word2id]
               for idx_d in range(trSize)]
    timestamps_tr = pd.DataFrame(dataset['timeslice'][idx_permute[range(trSize)]])
    idx_tr = idx_permute[range(trSize)]

    docs_ts = [[word2id[w] for w in dataset['text'][idx_permute[idx_d + trSize]] if w in word2id]
               for idx_d in range(tsSize)]
    timestamps_ts = pd.DataFrame(dataset['timeslice'][idx_permute[range(trSize, trSize + tsSize)]])
    idx_ts = idx_permute[range(trSize, trSize + tsSize)]

    docs_va = [[word2id[w] for w in dataset['text'][idx_permute[idx_d + trSize + tsSize]] if w in word2id]
               for idx_d in range(vaSize)]
    timestamps_va = pd.DataFrame(dataset['timeslice'][idx_permute[range(tsSize + trSize, num_docs)]])
    idx_va = idx_permute[range(tsSize + trSize, num_docs)]

    print('  Number of documents in train set      : {} [this should be equal to {} and {}]'
          .format(len(docs_tr), trSize, len(timestamps_tr)))
    print('  Number of documents in test set       : {} [this should be equal to {} and {}]'
          .format(len(docs_ts), tsSize, len(timestamps_ts)))
    print('  Number of documents in validation set : {} [this should be equal to {} and {}]'
          .format(len(docs_va), vaSize, len(timestamps_va)))

    # Split the test set into two halves: the first containing the first half
    # of the words in each document, the second containing the second half.
    # Will be used to gather test completion perplexity.
    print('Splitting test documents in 2 halves...')
    docs_ts_h1 = [[w for i, w in enumerate(doc) if i <= len(doc) / 2.0 - 1]
                  for doc in docs_ts]
    docs_ts_h2 = [[w for i, w in enumerate(doc) if i > len(doc) / 2.0 - 1]
                  for doc in docs_ts]

    print('Creating lists of words...')
    words_tr = create_list_words(docs_tr)
    words_ts = create_list_words(docs_ts)
    words_ts_h1 = create_list_words(docs_ts_h1)
    words_ts_h2 = create_list_words(docs_ts_h2)
    words_va = create_list_words(docs_va)

    print('  Total number of words used in train set : ', len(words_tr))
    print('  Total number of words used in test set  : ', len(words_ts))
    print('  Total number of words used in test h1 set (first half of document words)  : ', len(words_ts_h1))
    print('  Total number of words used in test h2 set (second half of document words) : ', len(words_ts_h2))
    print('  Total number of words used in val set   : ', len(words_va))

    n_docs_tr = len(docs_tr)
    n_docs_ts = len(docs_ts)
    n_docs_ts_h1 = len(docs_ts_h1)
    n_docs_ts_h2 = len(docs_ts_h2)
    n_docs_va = len(docs_va)

    # Get doc indices.
    print('Getting doc indices...')
    doc_indices_tr = create_doc_indices(docs_tr)
    doc_indices_ts = create_doc_indices(docs_ts)
    doc_indices_ts_h1 = create_doc_indices(docs_ts_h1)
    doc_indices_ts_h2 = create_doc_indices(docs_ts_h2)
    doc_indices_va = create_doc_indices(docs_va)

    print('Creating bow representation...')
    bow_tr = create_bow(doc_indices_tr, words_tr, n_docs_tr, len(vocab))
    bow_ts = create_bow(doc_indices_ts, words_ts, n_docs_ts, len(vocab))
    bow_ts_h1 = create_bow(doc_indices_ts_h1, words_ts_h1, n_docs_ts_h1, len(vocab))
    bow_ts_h2 = create_bow(doc_indices_ts_h2, words_ts_h2, n_docs_ts_h2, len(vocab))
    bow_va = create_bow(doc_indices_va, words_va, n_docs_va, len(vocab))

    print('  Train bag of words shape      : {}'.format(bow_tr.shape))
    print('  Test bag of words shape       : {}'.format(bow_ts.shape))
    print('  Test set 1 bag of words shape : {}'.format(bow_ts_h1.shape))
    print('  Test set 2 bag of words shape : {}'.format(bow_ts_h2.shape))
    print('  Val bag of words shape        : {}'.format(bow_va.shape))

    print('\nMost important words in train BOW : \n')
    print(get_most_important_words(bow_tr, id2word))
    print('\nMost important words in val BOW : \n')
    print(get_most_important_words(bow_va, id2word))
    print('\nMost important words in test BOW : \n')
    print(get_most_important_words(bow_ts, id2word))
    print('\nDone splitting data.')

    return dict(BOW_train=bow_tr,
                BOW_test=bow_ts,
                BOW_test_h1=bow_ts_h1,
                BOW_test_h2=bow_ts_h2,
                BOW_val=bow_va,
                timestamps_train=timestamps_tr,
                timestamps_test=timestamps_ts,
                timestamps_val=timestamps_va,
                train_vocab_size=len(vocab),
                train_num_times=len(np.unique(timestamps_tr['timeslice'])),
                idx_train=idx_tr,
                idx_test=idx_ts,
                idx_val=idx_va)
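# --- Hypothetical sketches (not shown in the original source) of the
# helpers referenced above, assuming the usual ETM-style preprocessing:
# flatten each split into a word-id list, a parallel doc-index list, and
# a sparse doc-term count matrix.
from scipy.sparse import coo_matrix

def create_list_words(docs):
    # All word ids of a split, concatenated document by document.
    return [w for doc in docs for w in doc]

def create_doc_indices(docs):
    # The document index of every entry in create_list_words(docs).
    return [j for j, doc in enumerate(docs) for _ in doc]

def create_bow(doc_indices, words, n_docs, vocab_size):
    # Duplicate (doc, word) pairs are summed into counts by coo_matrix.
    return coo_matrix(([1] * len(doc_indices), (doc_indices, words)),
                      shape=(n_docs, vocab_size)).tocsr()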
metadata = pd.read_csv("..\\data\\absrecord.csv")
print(len(metadata['filename'].values))

fullvocab = []
from preprocessor import preprocess, flatten

for record in range(len(metadata)):
    # print(100 * record / len(metadata))
    fullvocab.append(preprocess(str(metadata.iloc[record]['body']))[0])
print(fullvocab)

maindict = Dictionary(fullvocab)

i = 0
fulldict = []
for document in fullvocab:
    temp = []
    print(100 * i / len(fullvocab))
    i += 1
    document = list(sorted(set(document)))
    for token in document:
        if token in list(maindict.values()):
            for key, value in list(maindict.items()):
                if token == value:
                    temp.append({"id": key, "name": token})
                    # print({"id": key, "name": token})
    fulldict.append(temp)

b = metadata['filename'].values
print(fulldict)
# Attach the keywords as a new column. (DataFrame.append returns a new
# frame and would discard the result, so assign the column directly.)
metadata['keywords'] = fulldict
metadata.to_csv("..\\data\\keywords.csv")
if __name__ == '__main__':
    common_texts = [
        ['human', 'interface', 'computer'],
        ['survey', 'user', 'computer', 'system', 'response', 'time'],
        ['eps', 'user', 'interface', 'system'],
        ['system', 'human', 'system', 'eps'],
        ['user', 'response', 'time'],
        ['trees'],
        ['graph', 'trees'],
        ['graph', 'minors', 'trees'],
        ['graph', 'minors', 'survey'],
    ]
    common_dictionary = Dictionary(common_texts)
    common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

    # This step populates id2token: iterating items() forces the lazy
    # id -> token mapping to be built.
    for k, v in common_dictionary.items():
        pass
    id2word = common_dictionary.id2token

    ctm = CTMModel(common_corpus, num_topics=3, id2word=id2word)
    print("done")

    ## Larger test
    do_process = True
    if do_process:
        import nltk
        nltk.download('wordnet')
        medical_df = get_transcription_data()
        docs = numpy.array(medical_df['transcription'])
        # Use LDA to preprocess - later make a base class and refactor.
        lda = LDAAnalysis(docs)