def main():
    parser = argparse.ArgumentParser(description="take text feature")
    parser.add_argument("-t", "--type", type=str, choices=("db", "file"),
                        default="file", help="db/file")
    parser.add_argument("-s", "--source", type=str, help="file path/sql script")
    parser.add_argument("-n", "--name", type=str, help="output file name")
    parser.add_argument("-k", "--topk", type=int, default=500, help="top k words")
    parser.add_argument("-w", "--word_category",
                        default="v,vd,vn,vf,a,ad,an,ag,al", type=str,
                        help="word category")
    args = parser.parse_args()

    source_from = args.type
    source = args.source
    name = args.name
    k_num = args.topk
    word_category = args.word_category.split(",")
    print(word_category)

    if source_from == "db":
        comments_df = preprocess.get_data_from_db(source)
    elif source_from == "file":
        comments_df = preprocess.read_comment_from_file(source)
    else:
        return

    comments_list = list(comments_df["comment"].values)
    cutted, word_category_list = utils.word_cut(comments_list)
    word_weight_flag = utils.tfidf(cutted, word_category_list, "tfidf_" + name)
    key_word = utils.get_topK(word_weight_flag, "top_k_" + name, k=k_num,
                              category_list=word_category)

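# --- Hedged sketch (not the project's utils.tfidf / utils.get_topK) ---
# The snippet above delegates to utils.word_cut, utils.tfidf and utils.get_topK,
# whose internals are not shown. A minimal illustration of the same idea --
# weight already-segmented tokens with TF-IDF and keep the top-k terms -- using
# scikit-learn only. The helper name `top_k_terms` is hypothetical.
from sklearn.feature_extraction.text import TfidfVectorizer


def top_k_terms(pre_tokenized_docs, k=500):
    """Return the k terms with the highest mean TF-IDF weight."""
    # Documents are assumed to be segmented already (e.g. by jieba),
    # so each document is just its tokens joined with spaces.
    docs = [" ".join(tokens) for tokens in pre_tokenized_docs]
    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\S+")
    matrix = vectorizer.fit_transform(docs)   # shape: (n_docs, n_terms)
    mean_weights = matrix.mean(axis=0).A1     # average TF-IDF weight per term
    terms = vectorizer.get_feature_names_out()
    ranked = sorted(zip(terms, mean_weights), key=lambda t: t[1], reverse=True)
    return ranked[:k]
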
def prune_on_keywords(self, thres):
    data_words, note_ids, id2word, corpus = utils.preprocess(
        self.df, 5, ['NOUN', 'VERB'], STOP_WORDS, 'tokens_phrases')
    tfidf_matrix, tf_dicts, post_appear_dict = utils.tfidf(data_words)
    keywords = {
        i: utils.get_tfidfs_thres(tfidf_matrix[i], thres)
        for i, m in enumerate(tfidf_matrix)
    }
    word2id = {v: k for k, v in id2word.items()}
    # Iterate over the per-post keyword dicts (values), not the integer keys.
    tfidf_corpus = [[(word2id[pair[0]], pair[1]) for pair in post.items()]
                    for post in keywords.values()]

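# --- Hedged sketch (assumed semantics of utils.get_tfidfs_thres) ---
# utils.get_tfidfs_thres is not shown above; judging by how its result is used
# (a dict later turned into (word, weight) pairs), it plausibly keeps only the
# tokens whose TF-IDF weight exceeds `thres`. A minimal stand-in with that
# behaviour, offered purely as an illustration:
def get_tfidfs_thres(post_tfidf, thres):
    """Keep only tokens whose TF-IDF weight is above the threshold.

    post_tfidf: dict mapping token -> TF-IDF weight for one post.
    """
    return {token: weight for token, weight in post_tfidf.items() if weight > thres}


# Example: get_tfidfs_thres({"lsi": 0.42, "the": 0.01}, thres=0.1) -> {"lsi": 0.42}
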
def load_data(data_name):
    timer = utils.timer(name='main')
    data_path = './data/' + data_name
    user_pref_file = data_path + '/U_BPR.npy'
    item_pref_file = data_path + '/V_BPR.npy'
    item_content_file = data_path + '/item_features.txt'
    train_file = data_path + '/train.csv'
    test_file = data_path + '/test.csv'
    vali_file = data_path + '/vali.csv'
    dat = {}

    # load preference data
    timer.tic()
    dat['u_pref'] = np.load(user_pref_file)
    dat['v_pref'] = np.load(item_pref_file)
    timer.toc('loaded U:%s,V:%s' % (str(dat['u_pref'].shape),
                                    str(dat['v_pref'].shape))).tic()

    # pre-process preference data
    _, dat['u_pref'] = utils.standardize(dat['u_pref'])
    _, dat['v_pref'] = utils.standardize_2(dat['v_pref'])
    timer.toc('standardized U,V').tic()

    # load item (article) content data
    # load_svmlight_file(file): reads a file in svmlight format, stored as
    #   <label> <feature-id>:<feature-value> <feature-id>:<feature-value> ...
    # If zero_based is False, all indices are decremented by 1.
    # Returns (X, y), where X is a scipy.sparse matrix and y is a numpy.ndarray.
    item_content, _ = datasets.load_svmlight_file(item_content_file,
                                                  zero_based=True,
                                                  dtype=np.float32)
    # TF-IDF text featurization
    item_content = tfidf(item_content)
    # SVD dimensionality reduction
    u, s, _ = randomized_svd(item_content, n_components=300, n_iter=5)
    item_content = u * s
    # feature standardization
    _, item_content = utils.standardize(item_content)
    dat['item_content'] = item_content
    timer.toc('loaded item feature sparse matrix: %s' %
              (str(item_content.shape))).tic()

    # load split
    train = pd.read_csv(train_file, dtype=np.int32)
    dat['user_list'] = train['uid'].values
    dat['item_list'] = train['iid'].values
    timer.toc('read train triplets %s' % str(train.shape))

    dat['test_eval'] = data.load_eval_data(test_file)
    dat['vali_eval'] = data.load_eval_data(vali_file)
    return dat

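# --- Hedged sketch of the item-content pipeline above ---
# `tfidf` above is a project helper applied to a sparse count matrix; the same
# transform can be expressed with scikit-learn's TfidfTransformer, followed by
# the same randomized SVD reduction to 300 dimensions. This is an equivalent
# illustration (the helper name `load_item_content` is hypothetical), not the
# project's code.
import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.utils.extmath import randomized_svd


def load_item_content(path, n_components=300):
    counts, _ = load_svmlight_file(path, zero_based=True, dtype=np.float32)
    weighted = TfidfTransformer().fit_transform(counts)  # TF-IDF re-weighting
    u, s, _ = randomized_svd(weighted, n_components=n_components, n_iter=5)
    return u * s  # dense (n_items, n_components) feature matrix
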
def get_nps(data):
    def parse_np(index):
        np = ''
        closing = 0
        for elem in tree[index:]:
            if elem[0] == '(':
                closing += 1
            else:
                match = re.findall(r"\)", elem)
                np += elem.replace(')', '').strip() + ' '
                closing -= len(match)
                if closing <= 0:
                    break
        return np.replace('-LRB- ', '(').replace(' -RRB-', ')').replace(
            '-LRB-', '(').replace('-RRB-', ')').strip().lower()

    nps = []
    nps_condition = {}
    for guess in data:
        tree = guess['parse'].split()
        for i, elem in enumerate(tree):
            if elem == '(NP':
                np = parse_np(i)
                nps.append(np)
                if guess['condition'] not in nps_condition:
                    nps_condition[guess['condition']] = []
                nps_condition[guess['condition']].append(np)

    # print('Most frequent descriptions')
    # print(10 * '-')
    nps = nltk.FreqDist(nps)
    # v = sorted(freq.items(), key=operator.itemgetter(1), reverse=True)[:30]
    # for np in v:
    #     print(np[0], np[1])

    print(10 * '-')
    print('Most distinctive descriptions per condition')
    nps_condition = utils.tfidf(nps_condition, 10)
    for condition in nps_condition:
        print('Condition: ', condition)
        print(10 * '-')
        for np in nps_condition[condition]:
            print(np[0], np[1], nps[np[0]])
        print(10 * '-')

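# --- Hedged sketch: the same NP extraction with nltk.Tree ---
# parse_np above walks the bracketed parse string by hand. When the full parse
# string is available, nltk can do the bracket matching instead; this is an
# illustrative alternative (the helper name `extract_nps` is hypothetical),
# not the code used above.
import nltk


def extract_nps(parse_string):
    """Return the lower-cased text of every NP in a bracketed parse."""
    tree = nltk.Tree.fromstring(parse_string)
    nps = []
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
        text = ' '.join(subtree.leaves())
        nps.append(text.replace('-LRB-', '(').replace('-RRB-', ')').lower())
    return nps


# Example: extract_nps("(ROOT (S (NP (DT the) (NN man)) (VP (VBZ smiles))))")
# -> ['the man']
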
def get_nps_attractiveness(faces):
    def parse_np(index):
        np = ''
        closing = 0
        for elem in tree[index:]:
            if elem[0] == '(':
                closing += 1
            else:
                match = re.findall(r"\)", elem)
                np += elem.replace(')', '').strip() + ' '
                closing -= len(match)
                if closing <= 0:
                    break
        return np.replace('-LRB- ', '(').replace(' -RRB-', ')').replace(
            '-LRB-', '(').replace('-RRB-', ')').strip().lower()

    nps = []
    nps_attractiveness = {}
    for face in faces:
        tree = face['parse'].split()
        for i, elem in enumerate(tree):
            if elem == '(NP':
                np = parse_np(i)
                nps.append(np)
                attract = face['responses']['attractive'].lower()
                if attract not in nps_attractiveness:
                    nps_attractiveness[attract] = []
                nps_attractiveness[attract].append(np)

    print(10 * '-')
    print('Most frequent descriptions per attractiveness')
    nps_attractiveness = utils.tfidf(nps_attractiveness, 10)
    nps = nltk.FreqDist(nps)
    for attract in nps_attractiveness:
        print('Attractiveness: ', attract)
        print(10 * '-')
        for np in nps_attractiveness[attract]:
            print(np[0], np[1], nps[np[0]])
        print(10 * '-')

def common_words_faces_typicality(data):
    typicality = set(map(lambda face: face['responses']['typical'], data))
    voc, gvoc = {}, []
    for typical in typicality:
        voc[typical] = []
        for face in filter(lambda face: face['responses']['typical'] == typical, data):
            for i, word in enumerate(face['tokens']):
                if 'NN' in face['pos_tag'][i] or 'JJ' in face['pos_tag'][i]:
                    voc[typical].append(word.lower())
                    gvoc.append(word.lower())

    tfidf = utils.tfidf(voc, 10)
    gvoc = nltk.FreqDist(gvoc)
    print('Most common words in Faces per typicality:')
    for typical in typicality:
        print(typical)
        for word in tfidf[typical]:
            print(word, gvoc[word[0]])
        print(20 * '-')

def common_words_faces_attractiveness(data):
    attractiveness = set(map(lambda face: face['responses']['attractive'], data))
    voc, gvoc = {}, []
    for attract in attractiveness:
        voc[attract] = []
        for face in filter(lambda face: face['responses']['attractive'] == attract, data):
            for i, word in enumerate(face['tokens']):
                if 'NN' in face['pos_tag'][i] or 'JJ' in face['pos_tag'][i]:
                    voc[attract].append(word.lower())
                    gvoc.append(word.lower())

    tfidf = utils.tfidf(voc, 10)
    gvoc = nltk.FreqDist(gvoc)
    print('Most common words in Faces per attractiveness:')
    for attract in attractiveness:
        print(attract)
        for word in tfidf[attract]:
            print(word, gvoc[word[0]])
        print(20 * '-')

def common_words_guesses(data):
    conditions = map(lambda x: x['condition'], data)
    voc, gvoc = {}, []
    for condition in set(conditions):
        voc[condition] = []
        f = filter(lambda guess: guess['condition'] == condition, data)
        for guess in f:
            for i, word in enumerate(guess['tokens']):
                if 'NN' in guess['pos_tag'][i]:
                    voc[condition].append(word.lower())
                    gvoc.append(word.lower())
        # voc[condition] = nltk.FreqDist(voc[condition])

    tfidf = utils.tfidf(voc, 10)
    gvoc = nltk.FreqDist(gvoc)
    print('Most common words in Guesses per condition:')
    for condition in tfidf:
        print('Condition:', condition)
        for word in tfidf[condition]:
            print(word, gvoc[word[0]])
        print(20 * '-')

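# --- Hedged sketch (assumed semantics of utils.tfidf(voc, n) above) ---
# In the three functions above, utils.tfidf receives a dict mapping a category
# to its list of words plus an integer n, and its result is iterated as the n
# most distinctive (word, score) pairs per category. A minimal stand-in that
# treats each category's word list as one document; `tfidf_per_category` is a
# hypothetical name, not the project's helper.
import math
from collections import Counter


def tfidf_per_category(voc, n):
    """Return the n highest TF-IDF (word, score) pairs for each category."""
    n_docs = len(voc)
    # document frequency: in how many categories does each word appear?
    df = Counter()
    for words in voc.values():
        df.update(set(words))
    scores = {}
    for category, words in voc.items():
        tf = Counter(words)
        total = max(len(words), 1)
        weighted = {
            word: (count / total) * math.log(n_docs / df[word])
            for word, count in tf.items()
        }
        scores[category] = sorted(weighted.items(),
                                  key=lambda kv: kv[1],
                                  reverse=True)[:n]
    return scores
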
# add ratio features
data['ratio_title'] = data['word_in_title'] / data['len_of_query']
data['ratio_description'] = data['word_in_description'] / data['len_of_query']
data['attribute'] = data['search_term'] + "\t" + data['brand']
data['ratio_brand'] = data['word_in_brand'] / data['len_of_query']
data['ratio_attr'] = data['word_in_attr'] / data['len_of_query']
data['ratio_attr_title'] = data['word_in_attr_title'] / data['len_of_query']
data['brand_ratio'] = data['word_in_brand'] / data['len_of_brand']
data['attr_ratio'] = data['word_in_attr'] / data['len_of_attr']
data['attr_title_ratio'] = data['word_in_attr_title'] / data['len_of_attr_title']
data['title_ratio'] = data['word_in_title'] / data['len_of_title']
data['description_ratio'] = data['word_in_description'] / data['len_of_description']

# add bm25 features
desc_tf, desc_idf, desc_length, desc_ave_length = utils.tfidf(data, 'product_description')
data['desc_BM25'] = data.apply(lambda x: utils.BM25_score(
    x, desc_tf, desc_idf, desc_length, desc_ave_length), axis=1)

attr_tf, attr_idf, attr_length, attr_ave_length = utils.tfidf(data, 'attr')
data['attr_BM25'] = data.apply(lambda x: utils.BM25_score(
    x, attr_tf, attr_idf, attr_length, attr_ave_length), axis=1)

title_tf, title_idf, title_length, title_ave_length = utils.tfidf(data, 'product_title')
data['title_BM25'] = data.apply(lambda x: utils.BM25_score(
    x, title_tf, title_idf, title_length, title_ave_length), axis=1)

attr_title_tf, attr_title_idf, attr_title_length, attr_title_ave_length = utils.tfidf(data, 'attr_title')
data['attr_title_BM25'] = data.apply(lambda x: utils.BM25_score(
    x, attr_title_tf, attr_title_idf, attr_title_length, attr_title_ave_length), axis=1)

brand_tf, brand_idf, brand_length, brand_ave_length = utils.tfidf(data, 'brand')
data['brand_BM25'] = data.apply(lambda x: utils.BM25_score(
    x, brand_tf, brand_idf, brand_length, brand_ave_length), axis=1)

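# --- Hedged sketch of a BM25 scorer (not the project's utils.BM25_score) ---
# utils.tfidf here appears to return per-field term frequencies, inverse
# document frequencies, document lengths and the average document length,
# which utils.BM25_score then combines per (query, document) row. The classic
# Okapi BM25 formula those statistics feed is sketched below; `bm25_score` and
# its parameters are illustrative, not the repository's implementation.
import math


def bm25_score(query_terms, doc_tf, idf, doc_len, avg_doc_len, k1=1.5, b=0.75):
    """Okapi BM25 of one document for one query.

    query_terms : iterable of query tokens
    doc_tf      : dict term -> frequency in this document
    idf         : dict term -> inverse document frequency over the corpus
    """
    score = 0.0
    for term in query_terms:
        tf = doc_tf.get(term, 0)
        if tf == 0:
            continue
        norm = tf + k1 * (1 - b + b * doc_len / avg_doc_len)
        score += idf.get(term, 0.0) * tf * (k1 + 1) / norm
    return score
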
def get_data():
    node_information = pd.read_csv(
        'node_information.csv',
        header=None,
        names=['ID', 'Year', 'Title', 'Authors', 'Journal', 'Abstract'])
    training_set = pd.read_csv('training_set.txt',
                               header=None,
                               names=['Target', 'Source', 'Edge'],
                               delim_whitespace=True)
    #testing_set = pd.read_csv('testing_set.txt', header=None, names=['Target', 'Source'], delim_whitespace=True)

    print("Get valid IDs")
    valid_ids = set()
    for element in training_set.values:
        valid_ids.add(element[0])
        valid_ids.add(element[1])

    print("Select valid indices from valid IDs")
    index_valid = [
        i for i, element in enumerate(node_information.values)
        if element[0] in valid_ids
    ]
    node_info = node_information.iloc[index_valid]

    print("Get index for nodes")
    IDs = []
    ID_pos = {}
    for element in node_info.values:
        ID_pos[element[0]] = len(IDs)
        IDs.append(element[0])

    print("Add ID column for merging")
    training_set['Target_ID'] = training_set.apply(lambda row: ID_pos[row[0]], axis=1)
    training_set['Source_ID'] = training_set.apply(lambda row: ID_pos[row[1]], axis=1)

    print("Merge")
    train = pd.merge(training_set,
                     node_information,
                     how='left',
                     left_on='Target_ID',
                     right_index=True)
    train = pd.merge(train,
                     node_information,
                     how='left',
                     left_on='Source_ID',
                     right_index=True,
                     suffixes=['_target', '_source'])
    #train.to_csv('train_blank.csv', index=False)
    #train = pd.read_csv('train_blank.csv')
    #train.to_csv('train.csv', index=False)

    t = time()
    print("Add overlapping titles")
    train['Overlap_title'] = train.apply(lambda row: overlap(row, 'Title'), axis=1)
    print("Add common_authors")
    train['Common_authors'] = train.apply(lambda row: common(row, 'Authors'), axis=1)
    print("Add overlapping abstract")
    train['Overlap_abstract'] = train.apply(
        lambda row: overlap(row, 'Abstract'), axis=1)
    print("Date difference")
    train['Date_diff'] = (train['Year_source'] - train['Year_target']).abs()
    print(time() - t)
    #train.to_csv('train_basic.csv', index=False)
    #print("Loading set")
    #train = pd.read_csv('train_basic.csv')
    #print("Loaded")

    t = time()
    print("Tfidf")
    tfidf_vect = TfidfVectorizer(stop_words="english")
    abstracts_source = train['Abstract_source'].values
    abstracts_target = train['Abstract_target'].values
    all_abstracts = np.concatenate((abstracts_source, abstracts_target))
    tfidf_vect.fit(all_abstracts)
    print("tf_idf fitted")
    vect_source = tfidf_vect.transform(abstracts_source)
    print("source transformed")
    vect_target = tfidf_vect.transform(abstracts_target)
    print("target transformed")
    train['Tfidf_cosine_abstracts_nolim'] = tfidf(vect_source, vect_target)
    print(time() - t)
    #train.to_csv('train_basic_tfidf.csv', index=False)
    #train = pd.read_csv('train_basic_tfidf.csv')

    t = time()
    print("Tfidf")
    tfidf_vect = TfidfVectorizer(stop_words="english")
    titles_source = train['Title_source'].values
    titles_target = train['Title_target'].values
    all_abstracts = np.concatenate((titles_source, titles_target))
    tfidf_vect.fit(all_abstracts)
    print("tf_idf fitted")
    vect_source = tfidf_vect.transform(titles_source)
    print("source transformed")
    vect_target = tfidf_vect.transform(titles_target)
    print("target transformed")
    train['Tfidf_cosine_titles'] = tfidf(vect_source, vect_target)
    print(time() - t)
    #train.to_csv('train_basic_tfidf_title.csv', index=False)
    #train = pd.read_csv('train_basic_tfidf_title.csv')

    return train

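# --- Hedged sketch (assumed behaviour of tfidf(vect_source, vect_target) above) ---
# The local tfidf helper is called with two row-aligned sparse TF-IDF matrices
# and its result is stored as one feature per training pair, so it plausibly
# computes the cosine similarity of row i of the source matrix with row i of
# the target matrix. A minimal equivalent; `rowwise_cosine` is a hypothetical
# name.
import numpy as np
from sklearn.preprocessing import normalize


def rowwise_cosine(vect_source, vect_target):
    """Cosine similarity between corresponding rows of two sparse matrices."""
    src = normalize(vect_source)  # L2-normalise each row
    tgt = normalize(vect_target)
    return np.asarray(src.multiply(tgt).sum(axis=1)).ravel()
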
def embed_posts(self, min_count, allowed_pos, stopwords, preprocess_option):
    if self.option == 'tfidf+lsi':
        logging.info('CBF - Using TFIDF vectors, LSI for dimension reduction')
        data_words, note_ids, id2word, corpus = utils.preprocess(
            self.all_note_contents, min_count, allowed_pos, stopwords,
            preprocess_option)
        #self.post_bows = pd.DataFrame(data={'NoteID':note_ids,'BoW':data_words}).set_index('NoteID')
        logging.debug('[CBF] - %d non-empty posts', len(corpus))
        logging.debug('[CBF] - %s extracted %d tokens/phrases',
                      preprocess_option, len(id2word))
        tfidf_matrix, tf_dicts, post_appear_dict = utils.tfidf(data_words)
        word2id = {v: k for k, v in id2word.items()}
        tfidf_corpus = [[(word2id[pair[0]], pair[1]) for pair in post.items()]
                        for post in tfidf_matrix]
        model = LsiModel(tfidf_corpus,
                         num_topics=self.feature_size,
                         id2word=id2word)
        for i, post_tfidf in enumerate(tfidf_corpus):
            note_id = note_ids[i]
            if note_id not in self.items:
                post_repr = model[post_tfidf]
                self.items[note_id] = [
                    p[1] for p in post_repr
                    if len(post_repr) == self.feature_size
                ]
        self.model = model
        return True
    elif self.option == 'tfidf+keywords+lsi':
        logging.info('CBF - Using TFIDF vectors on only 1/3 keywords of each '
                     'post, LSI for dimension reduction')
        data_words, note_ids, id2word, corpus = utils.preprocess(
            self.all_note_contents, min_count, allowed_pos, stopwords,
            preprocess_option)
        #self.post_bows = pd.DataFrame(data={'NoteID':note_ids,'BoW':data_words}).set_index('NoteID')
        print('CBF - %d non-empty posts' % len(corpus))
        print('CBF - %s BoW extracted %d tokens/phrases' %
              (preprocess_option, len(id2word)))
        tfidf_matrix, tf_dicts, post_appear_dict = utils.tfidf(data_words)
        keywords = {
            i: utils.get_top_tfidfs(
                tfidf_matrix[i],
                len(tfidf_matrix[i]) // 3)  # TODO: have over-threshold phrases as the keyword
            for i, m in enumerate(tfidf_matrix)
        }
        word2id = {v: k for k, v in id2word.items()}
        # Iterate over the per-post keyword dicts (values), not the integer keys.
        tfidf_corpus = [[(word2id[pair[0]], pair[1]) for pair in post.items()]
                        for post in keywords.values()]
        model = LsiModel(tfidf_corpus,
                         num_topics=self.feature_size,
                         id2word=id2word)
        for i, post_tfidf in enumerate(tfidf_corpus):
            note_id = note_ids[i]
            self.items[note_id] = model[post_tfidf]
        self.model = model
        return True
    elif self.option == 'KCB':
        from gensim import models
        data_words, note_ids, id2word, corpus = utils.preprocess(
            self.all_note_contents, min_count, allowed_pos, stopwords,
            preprocess_option)
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        corpus_tfidf_lst = []
        for doc in corpus_tfidf:
            doc.sort(key=operator.itemgetter(1))
            doc = doc[-len(doc) // self.top_ratio:]
            corpus_tfidf_lst.append(doc)
        # print('kt',corpus_tfidf_lst)
        lsi_model = models.LsiModel(
            corpus_tfidf_lst,
            id2word=id2word,
            num_topics=self.feature_size)  # initialize an LSI transformation
        corpus_lsi = lsi_model[corpus_tfidf_lst]
        for i, post_repr in enumerate(corpus_lsi):
            note_id = note_ids[i]
            self.items[note_id] = [
                p[1] for p in post_repr
                if len(post_repr) == self.feature_size
            ]
        self.model = lsi_model
        return True
    elif self.option == 'word_emb+wmd':
        # Load pretrained FastText embeddings
        self.model = FastText.load('cleaned_data/all_notes_model')
        # print('using model:',self.model)
        # Cannot get post embeddings from word embeddings;
        # it cannot stand alone as a CBF method.
        return False
    elif self.option == 'keyword+word_emb+wmd':
        # Load pretrained FastText embeddings
        self.model = FastText.load('cleaned_data/all_notes_model')
        # print('using model:',self.model)
        # Cannot get post embeddings from word embeddings;
        # it cannot stand alone as a CBF method.
        return False
    elif self.option == 'keyword+ft_word_emb+sif':
        # Using SIF on keywords --> sentence embedding
        data_words, note_ids, id2word, corpus = utils.preprocess(
            self.all_note_contents, min_count, allowed_pos, stopwords,
            preprocess_option)
        self.post_bows = pd.DataFrame(data={
            'NoteID': note_ids,
            'BoW': data_words
        }).set_index('NoteID')
        logging.debug('CBF - %d non-empty posts', len(corpus))
        logging.debug('CBF - %s BoW extracted %d tokens/phrases',
                      preprocess_option, len(id2word))
        sentence_list = []
        note_ids_lookup = []
        for note_id, post in self.post_bows.iterrows():
            word_list = []
            for word in post['BoW']:
                word_emd = self.model[word]
                word_list.append(Word(word, word_emd))
            if len(word_list) > 0:  # did we find any words (not an empty set)
                sentence_list.append(Sentence(word_list))
                # in case there are some posts of 0 length, thus not included in this
                note_ids_lookup.append(note_id)
        sentence_embs = {}
        sentence_vectors = sentence_to_vec(
            sentence_list, self.feature_size)  # all vectors converted together
        if len(sentence_vectors) == len(sentence_list):
            for i in range(len(sentence_vectors)):
                # map: note_id -> vector
                sentence_embs[note_ids_lookup[i]] = sentence_vectors[i]
        self.items = sentence_embs
        return True
    elif self.option == 'ft_word_emb+sif':
        # Using SIF on the whole text --> sentence embedding
        data_words, note_ids, id2word, corpus = utils.preprocess_raw(
            self.all_note_contents)
        self.post_bows = pd.DataFrame(data={
            'NoteID': note_ids,
            'BoW': data_words
        }).set_index('NoteID')
        sentence_list = []
        note_ids_lookup = []
        for note_id, post in self.post_bows.iterrows():
            word_list = []
            for word in post['BoW']:
                word_emd = self.model[word]
                word_list.append(Word(word, word_emd))
            if len(word_list) > 0:  # did we find any words (not an empty set)
                sentence_list.append(Sentence(word_list))
                # in case there are some posts of 0 length, thus not included in this
                note_ids_lookup.append(note_id)
        sentence_embs = {}
        sentence_vectors = sentence_to_vec(
            sentence_list, self.feature_size)  # all vectors converted together
        if len(sentence_vectors) == len(sentence_list):
            for i in range(len(sentence_vectors)):
                # map: note_id -> vector
                sentence_embs[note_ids_lookup[i]] = sentence_vectors[i]
        self.items = sentence_embs
        return True
    elif self.option == 'bert_word_emb+sif':
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                  max_len=512)
        #nids = [nid for nid in self.all_note_contents.NoteID.values if nid not in self.items.keys()]
        note_ids = self.all_note_contents.NoteID.to_list()
        MAX_LEN = 512
        tokenized_texts_list = []
        indexed_tokens_list = []
        attention_masks = []
        for text in self.all_note_contents.Contents.values:
            marked_text = "[CLS] " + text + " [SEP]"
            tokenized_text = tokenizer.tokenize(marked_text)
            tokenized_texts_list.append(tokenized_text)
            indexed_tokens_list.append(
                tokenizer.convert_tokens_to_ids(tokenized_text))
        input_ids_list = pad_sequences(indexed_tokens_list,
                                       maxlen=MAX_LEN,
                                       dtype="long",
                                       truncating="post",
                                       padding="post")
        for seq in input_ids_list:
            seq_mask = [int(float(i > 0)) for i in seq]
            attention_masks.append(seq_mask)
        # Convert inputs to PyTorch tensors
        tokens_tensor = torch.tensor(input_ids_list)
        segments_tensors = torch.tensor(attention_masks)
        # Put the model in "evaluation" mode, i.e. feed-forward operation only.
        self.model.eval()
        with torch.no_grad():
            encoded_layers, _ = self.model(tokens_tensor, segments_tensors)
        emb_layers = encoded_layers[-4:]
        sum_layers = torch.stack(emb_layers, dim=0).sum(dim=0)
        sentence_word_embs = {}
        for i in range(len(tokenized_texts_list)):
            sentence_word_embs[
                note_ids[i]] = sum_layers[i][:len(tokenized_texts_list[i])]
        tokenized_texts_ = {
            nid: tokenized_texts_list[i]
            for i, nid in enumerate(note_ids)
        }
        sentence_list = []
        note_ids_lookup = []
        for note_id in note_ids:
            #print(note_id)
            word_list = []
            for j in range(len(sentence_word_embs[note_id])):
                word_emb = sentence_word_embs[note_id][j]
                # Add here if to use only keywords
                word_text = tokenized_texts_[note_id][j]
                word_list.append(Word(word_text, word_emb.numpy()))
            if len(word_list) > 0:  # did we find any words (not an empty set)
                sentence_list.append(Sentence(word_list))
                # in case there are some posts of 0 length, thus not included in this
                note_ids_lookup.append(note_id)
            #print('wordlist',len(word_list))
        sentence_embs = {}
        sentence_vectors = sentence_to_vec(
            sentence_list, self.feature_size)  # all vectors converted together
        if len(sentence_vectors) == len(sentence_list):
            for i in range(len(sentence_vectors)):
                # map: note_id -> vector
                sentence_embs[note_ids_lookup[i]] = sentence_vectors[i]
        self.items = sentence_embs
        return True
    elif self.option == 'sentence_emb':
        note_ids = self.all_note_contents.NoteID.to_list()
        all_note_contents = self.all_note_contents['Contents'].to_list()
        sentence_embs = {}
        sentence_vectors = self.model[all_note_contents]
        if len(sentence_vectors) == len(all_note_contents):
            for i in range(len(sentence_vectors)):
                # map: note_id -> vector
                sentence_embs[note_ids[i]] = sentence_vectors[i].numpy()
        self.items = sentence_embs
        return True
    elif self.option == 'sentence_emb_precomputed':
        return True

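# --- Hedged sketch of the SIF idea used by sentence_to_vec above ---
# Word, Sentence and sentence_to_vec come from an external SIF (smooth inverse
# frequency) implementation that is not shown here. The underlying technique,
# following Arora et al., is a frequency-weighted average of word vectors with
# the first principal component removed. A compact numpy/sklearn illustration;
# `sif_embeddings` and its parameters are hypothetical names.
import numpy as np
from sklearn.decomposition import TruncatedSVD


def sif_embeddings(token_lists, word_vectors, word_freq, a=1e-3):
    """token_lists: list of token lists; word_vectors: token -> np.ndarray;
    word_freq: token -> relative frequency in the training corpus."""
    embs = []
    for tokens in token_lists:
        vecs = [a / (a + word_freq.get(t, 0.0)) * word_vectors[t]
                for t in tokens if t in word_vectors]
        embs.append(np.mean(vecs, axis=0) if vecs else None)
    matrix = np.vstack([e for e in embs if e is not None])
    # remove the common component (first singular vector)
    svd = TruncatedSVD(n_components=1, n_iter=7)
    svd.fit(matrix)
    pc = svd.components_[0]
    return [None if e is None else e - np.dot(e, pc) * pc for e in embs]
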
def precompute_similarity(self, model_path, option, feature_size):
    all_note_contents = self.all_note_contents.Contents.to_list()
    all_note_nids = self.all_note_contents.NoteID.to_list()
    similarity_matrix = {}

    if option == 'ft_word_emd+sif':  # 'cleaned_data/ft_model_incr'
        # Load pretrained FastText embeddings
        model = FastText.load(model_path)
        logging.info('[Preprocessor] Using model: %s', str(model))
        similarity_matrix = {}
        nlp = spacy.load("en_core_web_sm")
        note_ids = self.all_note_contents['NoteID'].values
        contents = self.all_note_contents['Contents'].values
        data_words = [[token.text for token in nlp(content)]
                      for note_id, content in zip(note_ids, contents)]
        post_tokens = pd.DataFrame(data={
            'NoteID': note_ids,
            'Tokens': data_words
        }).set_index('NoteID')
        sentence_list = []
        sentence_embs = {}
        for note_id, post in post_tokens.iterrows():
            word_list = []
            for word in post.values[0]:
                word_emd = model[word]
                word_list.append(Word(word, word_emd))
            if len(word_list) > 0:  # did we find any words (not an empty set)
                sentence_list.append(Sentence(word_list))
                sentence_embs[note_id] = sentence_to_vec(sentence_list, feature_size)

        # Compute post-wise cosine similarities
        for note_id1, emb1 in sentence_embs.items():
            for note_id2, emb2 in sentence_embs.items():
                if note_id1 != note_id2 and (
                        note_id2, note_id1) not in similarity_matrix:
                    # apply l2-distance
                    #utils.l2_sim()
                    # apply cosine distance
                    sim = utils.cosine_sim(emb1[0], emb2[0])
                    similarity_matrix[(note_id1, note_id2)] = sim
                    similarity_matrix[(note_id2, note_id1)] = sim
        return similarity_matrix

    elif option == 'bert_word_emb+sif':
        # for BERT
        import torch
        from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
        from keras.preprocessing.sequence import pad_sequences

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                  max_len=128)
        MAX_LEN = 512
        tokenized_texts_list = []
        indexed_tokens_list = []
        attention_masks = []
        for text in self.all_note_contents.Contents.values:
            marked_text = "[CLS] " + text + " [SEP]"
            tokenized_text = tokenizer.tokenize(marked_text)
            tokenized_texts_list.append(tokenized_text)
            indexed_tokens_list.append(
                tokenizer.convert_tokens_to_ids(tokenized_text))
        input_ids_list = pad_sequences(indexed_tokens_list,
                                       maxlen=MAX_LEN,
                                       dtype="long",
                                       truncating="post",
                                       padding="post")
        for seq in input_ids_list:
            seq_mask = [int(float(i > 0)) for i in seq]
            attention_masks.append(seq_mask)
        # Convert inputs to PyTorch tensors
        tokens_tensor = torch.tensor(input_ids_list)
        segments_tensors = torch.tensor(attention_masks)
        # Load pre-trained model (weights)
        model = BertModel.from_pretrained('bert-base-uncased')
        # Put the model in "evaluation" mode, i.e. feed-forward operation only.
        model.eval()
        with torch.no_grad():
            encoded_layers, _ = model(tokens_tensor, segments_tensors)
        emb_layers = encoded_layers[-4:]
        sum_layers = torch.stack(emb_layers, dim=0).sum(dim=0)  # 434*512*768
        sentence_word_embs = {}
        for i in range(len(tokenized_texts_list)):
            sentence_word_embs[
                all_note_nids[i]] = sum_layers[i][:len(tokenized_texts_list[i])]
        # Keep a look up dictionary [note id] --> text content
        tokenized_texts_ = {
            nid: tokenized_texts_list[i]
            for i, nid in enumerate(all_note_nids)
        }
        embedding_size = feature_size  # Set the shape of the sentence/post embeddings
        sentence_list = []
        note_ids_lookup = []
        for note_id in all_note_nids:
            #print(note_id)
            word_list = []
            for j in range(len(sentence_word_embs[note_id])):
                word_emb = sentence_word_embs[note_id][j]
                # Add here if to use only keywords
                word_text = tokenized_texts_[note_id][j]
                word_list.append(Word(word_text, word_emb.numpy()))
            if len(word_list) > 0:
                sentence_list.append(Sentence(word_list))
                # in case there are some posts of 0 length, thus not included in this
                note_ids_lookup.append(note_id)
        # Encode sentences/posts with embeddings
        sentence_embs = {}
        sentence_vectors = sentence_to_vec(
            sentence_list, embedding_size)  # all vectors converted together
        if len(sentence_vectors) == len(sentence_list):
            for i in range(len(sentence_vectors)):
                # map: note_id -> vector
                sentence_embs[note_ids_lookup[i]] = sentence_vectors[i]
        # Compute post-wise cosine similarities
        for note_id1, emb1 in sentence_embs.items():
            for note_id2, emb2 in sentence_embs.items():
                if note_id1 != note_id2 and (
                        note_id2, note_id1) not in similarity_matrix:
                    # apply l2-distance
                    #utils.l2_sim()
                    # apply cosine distance
                    sim = utils.cosine_sim(emb1[0], emb2[0])
                    similarity_matrix[(note_id1, note_id2)] = sim
                    similarity_matrix[(note_id2, note_id1)] = sim
        return similarity_matrix, sentence_embs

    elif option == 'sentence_emb':
        import tensorflow as tf
        import tensorflow_hub as hub

        embed = hub.load(model_path)
        logging.info('[Preprocessor] using model: universal-sentence-encoder-1')
        sentence_embs = {}
        sentence_vectors = embed(all_note_contents)
        if len(sentence_vectors) == len(all_note_contents):
            for i in range(len(sentence_vectors)):
                # map: note_id -> vector
                sentence_embs[
                    all_note_nids[i]] = sentence_vectors[i].numpy()
        #corr = np.inner(sentence_vectors, sentence_vectors)
        #cosine_similarities = tf.reduce_sum(tf.multiply(sentence_vectors, sentence_vectors), axis=1)
        #clip_cosine_similarities = tf.clip_by_value(cosine_similarities, -1.0, 1.0)
        #sim_scores = 1.0 - tf.acos(clip_cosine_similarities)
        #print(sim_scores)
        #for i, sims in enumerate(sim_scores):
        #    for j, sim in enumerate(sims):
        ##       note_id1 = all_note_nids[i]
        #        note_id2 = all_note_nids[j]
        #        if not note_id1==note_id2:
        #            similarity_matrix[(note_id1, note_id2)] = sim
        # Compute post-wise cosine similarities
        for note_id1, emb1 in sentence_embs.items():
            for note_id2, emb2 in sentence_embs.items():
                if note_id1 != note_id2 and (
                        note_id2, note_id1) not in similarity_matrix:
                    # apply l2-distance
                    #utils.l2_sim()
                    # apply cosine distance
                    sim = utils.cosine_sim(emb1, emb2)
                    similarity_matrix[(note_id1, note_id2)] = sim
                    similarity_matrix[(note_id2, note_id1)] = sim
        return similarity_matrix, sentence_embs

    elif option == 'tfidf+lsi':
        logging.info(
            '[Preprocessor] using TFIDF vectors, LSI for dimension reduction')
        data_words, note_ids, id2word, corpus = utils.preprocess(
            self.all_note_contents, 10, ['NOUN', 'VERB'], STOP_WORDS,
            'tokens_phrases')
        #self.post_bows = pd.DataFrame(data={'NoteID':note_ids,'BoW':data_words}).set_index('NoteID')
        logging.debug('[Preprocessor] - %d non-empty posts', len(corpus))
        logging.debug('[Preprocessor] - %s extracted %d tokens/phrases',
                      'tokens_phrases', len(id2word))
        tfidf_matrix, tf_dicts, post_appear_dict = utils.tfidf(data_words)
        word2id = {v: k for k, v in id2word.items()}
        tfidf_corpus = [[(word2id[pair[0]], pair[1]) for pair in post.items()]
                        for post in tfidf_matrix]
        model = LsiModel(tfidf_corpus,
                         num_topics=feature_size,
                         id2word=id2word)
        sentence_embs = {}
        for i, post_tfidf in enumerate(tfidf_corpus):
            note_id = note_ids[i]
            if note_id not in sentence_embs:
                post_repr = model[post_tfidf]
                #print(post_repr)
                #print(i)
                sentence_embs[note_id] = np.array([
                    p[1] for p in post_repr
                    if len(post_repr) == feature_size
                ])
        # Compute post-wise cosine similarities
        for note_id1, emb1 in sentence_embs.items():
            for note_id2, emb2 in sentence_embs.items():
                if note_id1 != note_id2 and (
                        note_id2, note_id1) not in similarity_matrix:
                    if len(emb1) and len(emb2):
                        # apply l2-distance
                        #utils.l2_sim()
                        # apply cosine distance
                        sim = utils.cosine_sim(emb1, emb2)
                        similarity_matrix[(note_id1, note_id2)] = sim
                        similarity_matrix[(note_id2, note_id1)] = sim
        return similarity_matrix, sentence_embs

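# --- Hedged sketch: the same TFIDF+LSI representation with stock gensim ---
# The tfidf+lsi branches above build the (token-id, weight) corpus by hand from
# utils.tfidf's per-post dicts. With gensim's own models, the equivalent
# pipeline is the following; it is an illustration (the helper name
# `lsi_vectors` is hypothetical), not the class's code.
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LsiModel


def lsi_vectors(data_words, feature_size=100):
    """data_words: list of token lists, one per post."""
    id2word = Dictionary(data_words)
    corpus = [id2word.doc2bow(tokens) for tokens in data_words]
    tfidf = TfidfModel(corpus)        # (id, count) -> (id, TF-IDF weight)
    lsi = LsiModel(tfidf[corpus], id2word=id2word, num_topics=feature_size)
    return [lsi[doc] for doc in tfidf[corpus]], lsi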