def get_new_wsd_tokens(wsd_tokens, tokenizer):
    """Expand each WSD token into its BERT word pieces, copying metadata.

    Every piece produced by ``tokenizer.tokenize`` inherits the source
    token's id, POS and lemma, so target positions can still be located
    after word-piece splitting.  A token that tokenizes to a single piece
    is copied through with its ORIGINAL surface text (not the tokenizer's
    output), matching the previous behaviour.
    """
    expanded = []
    for original in wsd_tokens:
        pieces = tokenizer.tokenize(original.text)
        if len(pieces) > 1:
            expanded.extend(
                Token(token_id=original.token_id, text=piece,
                      pos=original.pos, lemma=original.lemma)
                for piece in pieces)
        else:
            expanded.append(
                Token(token_id=original.token_id, text=original.text,
                      pos=original.pos, lemma=original.lemma))
    return expanded
def __init__(self, corpus_path, tokenizer, logger):
    """Build fixed-length BERT inputs and target-index arrays from a corpus.

    Reads a meaning -> sentences mapping from *corpus_path*, converts every
    sentence into padded input ids / masks / segment ids (max length 73) and
    records, per example, the word-piece positions of the target token
    (padded with -1) and the gold meaning label.
    """
    self.input_ids = []
    self.input_masks = []
    self.segment_ids = []
    self.target_indexes = []
    self.meanings = []
    self.logger = logger
    self.corpus_lines = 0
    meaning_to_sentence = get_meaning_to_sentence(path_to_corpus=corpus_path)
    for meaning, sentence_dicts in meaning_to_sentence.items():
        for sentence_index, sentence_dict in enumerate(sentence_dicts):
            words = sentence_dict["sentence"]
            target_position = sentence_dict["target_index"]
            # Mark only the target word; POS/lemma are placeholders here.
            wsd_tokens = [
                Token(token_id="target" if position == target_position else "unknown",
                      text=word, pos='n', lemma='unknown')
                for position, word in enumerate(words)
            ]
            wsd_tokens.insert(0, Token(text='[CLS]', token_id='unknown'))
            wsd_tokens.append(Token(text='[SEP]', token_id='unknown'))
            pieces = get_new_wsd_tokens(wsd_tokens=wsd_tokens, tokenizer=tokenizer)
            target_index_list = get_target_indexes(
                target_token_id="target",
                wsd_tokens=pieces,
                df_index=meaning + "_" + str(sentence_index))
            tokenized_sentence = tokenizer.tokenize(' '.join(words))
            example = InputExample(guid=self.corpus_lines,
                                   tokens_a=tokenized_sentence)
            feature = convert_example_to_feature(example=example,
                                                 tokenizer=tokenizer,
                                                 max_seq_length=73,
                                                 logger=self.logger)
            # Pad the target positions to the fixed length with -1 sentinels.
            target_index_list = pad_sequences([target_index_list],
                                              padding="post",
                                              value=-1,
                                              maxlen=73)
            self.input_ids.append(feature.input_ids)
            self.input_masks.append(feature.input_mask)
            self.segment_ids.append(feature.segment_ids)
            self.target_indexes.append(target_index_list)
            self.meanings.append(meaning)
            self.corpus_lines += 1
    self.segment_ids = np.asarray(self.segment_ids)
    self.input_masks = np.asarray(self.input_masks)
    self.input_ids = np.asarray(self.input_ids)
    self.target_indexes = np.asarray(self.target_indexes)
    self.meanings = np.asarray(self.meanings)
def perform_wsd_on_test(test_dataframe, meanings, model, tokenizer, layer_indexes, use_context_embeddings, without_stop_words, target_word_embeddings_only):
    """Disambiguate every row of *test_dataframe* against precomputed sense vectors.

    For each instance a BERT vector is built (context embedding or
    target-word embedding, depending on the flags) and compared by cosine
    similarity to every candidate sense present in *meanings*.  The chosen
    sense, strategy ("bert", "mfs_fallback" or "monosemous") and confidences
    are written back into the dataframe, which is returned.
    """
    n_rows = len(test_dataframe)
    for column in ("bert_output", "meaning2confidence",
                   "wsd_strategy", "chosen_meaning_confidence"):
        test_dataframe[column] = [None] * n_rows
    stop_words = set(stopwords.words('english'))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    for index, instance in test_dataframe.iterrows():
        sentence = copy.deepcopy(instance.sentence)
        sentence_tokens = copy.deepcopy(instance.sentence_tokens)
        target_index = get_target_index(instance.token_ids[0],
                                        sentence_tokens, index)
        if without_stop_words:
            # Drop stop words, but never the target token itself.
            sentence_tokens = copy.deepcopy([
                token for position, token in enumerate(sentence_tokens)
                if token.text not in stop_words or position == target_index
            ])
        sentence_tokens.insert(0, Token(text='[CLS]', token_id='unknown'))
        sentence_tokens.append(Token(text='[SEP]', token_id='unknown'))
        new_sentence_tokens = get_new_wsd_tokens(wsd_tokens=sentence_tokens,
                                                 tokenizer=tokenizer)
        tokenized_sentence = tokenizer.tokenize(sentence)
        tokenized_sentence.insert(0, '[CLS]')
        tokenized_sentence.append('[SEP]')
        target_indexes = get_target_indexes(instance.token_ids[0],
                                            new_sentence_tokens, index)
        if target_word_embeddings_only:
            context_vector = get_targetword_embedding_per_sentence(
                tokenized_sentence=tokenized_sentence,
                tokenizer=tokenizer,
                model=model,
                target_index_list=target_indexes,
                layer_index_list=layer_indexes,
                device=device)
        else:
            context_vector = get_context_vector_per_sentence(
                tokenized_sentence=tokenized_sentence,
                tokenizer=tokenizer,
                model=model,
                target_index_list=target_indexes,
                layer_index_list=layer_indexes,
                is_context_embedding=use_context_embeddings)
        candidate_meanings = copy.deepcopy(instance.candidate_meanings)
        found_meaning = False
        meaning_similarities = dict()
        for candidate in candidate_meanings:
            if candidate in meanings:
                found_meaning = True
                score = 1 - spatial.distance.cosine(meanings[candidate],
                                                    context_vector)
                meaning_similarities[candidate] = [score]
            else:
                # Unknown sense: no similarity available, score it zero.
                meaning_similarities[candidate] = float(0)
        wsd_strategy = "bert"
        # Collapse each similarity list to its best (largest) score.
        for meaning, scores in meaning_similarities.items():
            if isinstance(scores, list):
                meaning_similarities[meaning] = max(scores)
        if found_meaning:
            ranked = sorted(meaning_similarities.items(),
                            key=operator.itemgetter(1), reverse=True)
            top_score = ranked[0][1]
            tied = [i for i, pair in enumerate(ranked) if pair[1] == top_score]
            if len(tied) > 1 and 0 in tied:
                # Break ties by the sense's rank in the candidate list
                # (candidate order encodes sense frequency).
                rank_of = {ranked[i][0]: candidate_meanings.index(ranked[i][0])
                           for i in tied}
                bert_output = sorted(rank_of.items(),
                                     key=operator.itemgetter(1))[0][0]
            else:
                bert_output = ranked[0][0]
            # All tied entries share the top score, so this holds either way.
            chosen_meaning_confidence = ranked[0][1]
        else:
            bert_output = candidate_meanings[0]
            chosen_meaning_confidence = meaning_similarities[candidate_meanings[0]]
            wsd_strategy = "mfs_fallback"
        if len(candidate_meanings) == 1:
            bert_output = candidate_meanings[0]
            wsd_strategy = "monosemous"
            chosen_meaning_confidence = meaning_similarities[candidate_meanings[0]]
        test_dataframe.at[index, 'bert_output'] = bert_output
        test_dataframe.at[index, 'wsd_strategy'] = wsd_strategy
        test_dataframe.at[index, "meaning2confidence"] = meaning_similarities
        test_dataframe.at[index, "chosen_meaning_confidence"] = chosen_meaning_confidence
    return test_dataframe
def create_context_embeddings_from_dataframe(dataframe, tokenizer, model):
    """Create one averaged context embedding per sense occurrence in *dataframe*.

    Each sentence is encoded once; the last four hidden layers are summed per
    word piece, and the vectors of all tokens EXCEPT [CLS], [SEP] and the
    target word pieces are averaged into a single context vector that is
    appended under every gold sense (``source_wn_engs``) of the row.

    :return: dict mapping sense id -> list of context vectors
    """
    layer_indexes = [-1, -2, -3, -4]
    meanings = {}
    for index, instance in dataframe.iterrows():
        sentence = copy.deepcopy(instance.sentence)
        source_wn_engs = copy.deepcopy(instance.source_wn_engs)
        original_sentence_tokens = copy.deepcopy(instance.sentence_tokens)
        original_sentence_tokens.insert(0, Token(token_id="unknown", text="[CLS]"))
        original_sentence_tokens.append(Token(token_id="unknown", text="[SEP]"))
        n_wsd_tokens = get_new_wsd_tokens(wsd_tokens=original_sentence_tokens,
                                          tokenizer=tokenizer)
        # BUG FIX: the original called get_target_index (whose result is used
        # with a scalar `==` comparison elsewhere in this module), yet below
        # the value is consumed with `feature_index in target_indexes`.  Use
        # the plural helper that returns the list of target piece positions,
        # as the other functions in this file do.
        target_indexes = get_target_indexes(instance.token_ids[0],
                                            n_wsd_tokens, index)
        tokenized_sentence = tokenizer.tokenize(sentence)
        tokenized_sentence.insert(0, '[CLS]')
        tokenized_sentence.append('[SEP]')
        input_ids = np.asarray(
            tokenizer.convert_tokens_to_ids(tokenized_sentence)).reshape(
                1, len(tokenized_sentence))
        input_mask = np.asarray([1] * len(tokenized_sentence)).reshape(
            1, len(tokenized_sentence))
        input_ids = torch.tensor(input_ids, dtype=torch.long)
        input_mask = torch.tensor(input_mask, dtype=torch.long)
        with torch.no_grad():
            all_encoder_layers, _ = model(input_ids,
                                          token_type_ids=None,
                                          attention_mask=input_mask)
        # Collect, per word piece, the values of the selected hidden layers.
        all_out_features = []
        for i, token in enumerate(tokenized_sentence):
            all_layers = []
            for layer_index in layer_indexes:
                layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
                layers = collections.OrderedDict()
                layers["index"] = layer_index
                layers["values"] = [round(x.item(), 6) for x in layer_output[0][i]]
                all_layers.append(layers)
            out_features = collections.OrderedDict()
            out_features["token"] = token
            out_features["layers"] = all_layers
            all_out_features.append(out_features)
        token_average_list = list()
        for feature_index, feature in enumerate(all_out_features):
            token = feature['token']
            # Context embedding: special tokens and the target itself are excluded.
            if token == '[CLS]' or token == '[SEP]' or (feature_index in target_indexes):
                continue
            layer_values = [layer['values'] for layer in feature["layers"]]
            token_average_list.append(np.sum(layer_values, axis=0))
        context_vector = np.average(token_average_list, axis=0)
        for source_wn_eng in source_wn_engs:
            if source_wn_eng in meanings:
                meanings[source_wn_eng].append(context_vector)
            else:
                meanings[source_wn_eng] = [context_vector]
    return meanings
def create_target_word_embeddings_from_dataframe(
        path_to_dataframe,
        tokenizer,
        model,
        target_word_vector_method="average",
        final_vector_method="full_list"):
    """Build target-word BERT embeddings for every gold sense in a dataframe.

    Each sentence is encoded once; the last four hidden layers are summed per
    word piece, and the pieces belonging to the target token are combined
    ("average" or "sum") into one vector stored under every gold sense of
    the row.

    :param path_to_dataframe: pickle file containing the annotated dataframe
    :param tokenizer: BERT tokenizer
    :param model: BERT model returning (all_encoder_layers, pooled_output)
    :param target_word_vector_method: "average" or "sum" over target pieces
    :param final_vector_method: "full_list" keeps a list of vectors per
        sense; "average" collapses each list into a single mean vector
    :return: dict mapping sense id -> list of vectors (or one mean vector)
    """
    assert target_word_vector_method == "average" or target_word_vector_method == "sum", \
        "You can only choose between summing the target token word pieces or averaging them!"
    assert final_vector_method == "full_list" or final_vector_method == "average", \
        "You can either choose to leave the target token embeddings " \
        "for a meaning as a list or choose 'average' to create a " \
        "one-to-one mapping between a mapping and its vector"
    dataframe = pd.read_pickle(path_to_dataframe)
    layer_indexes = [-1, -2, -3, -4]
    meanings_to_vec = {}
    total_length = len(dataframe)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with tqdm(total=total_length, desc="Creating context embeddings") as pbar:
        for index, instance in dataframe.iterrows():
            sentence = copy.deepcopy(instance.sentence)
            sentence_tokens = copy.deepcopy(instance.sentence_tokens)
            sentence_tokens.insert(0, Token(text='[CLS]', token_id='unknown'))
            sentence_tokens.append(Token(text='[SEP]', token_id='unknown'))
            new_wsd_tokens = get_new_wsd_tokens(sentence_tokens, tokenizer)
            target_indexes = get_target_indexes(instance.token_ids[0],
                                                new_wsd_tokens, 0)
            gold_meanings = copy.deepcopy(instance.source_wn_engs)
            tokenized_sentence = tokenizer.tokenize(sentence)
            tokenized_sentence.insert(0, '[CLS]')
            tokenized_sentence.append('[SEP]')
            input_ids = np.asarray(
                tokenizer.convert_tokens_to_ids(tokenized_sentence)) \
                .reshape(1, len(tokenized_sentence))
            input_mask = np.asarray([1] * len(tokenized_sentence)).reshape(
                1, len(tokenized_sentence))
            input_ids = torch.tensor(input_ids, dtype=torch.long).to(device)
            input_mask = torch.tensor(input_mask, dtype=torch.long).to(device)
            with torch.no_grad():
                all_encoder_layers, _ = model(input_ids,
                                              token_type_ids=None,
                                              attention_mask=input_mask)
            # Collect, per word piece, the values of the selected layers.
            all_out_features = []
            for i, token in enumerate(tokenized_sentence):
                all_layers = []
                for layer_index in layer_indexes:
                    layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
                    layers = collections.OrderedDict()
                    layers["index"] = layer_index
                    layers["values"] = [round(x.item(), 6) for x in layer_output[0][i]]
                    all_layers.append(layers)
                out_features = collections.OrderedDict()
                out_features["token"] = token
                out_features["layers"] = all_layers
                all_out_features.append(out_features)
            # Sum the selected layers per word piece.
            token_average_list = list()
            for feature in all_out_features:
                layer_values = [layer['values'] for layer in feature["layers"]]
                token_average_list.append(np.sum(layer_values, axis=0))
            # Keep only the word pieces belonging to the target token.
            temp_list = [vec for token_index, vec in enumerate(token_average_list)
                         if token_index in target_indexes]
            for meaning in gold_meanings:
                assert len(temp_list) > 0, "Temp list is empty at {}".format(
                    meaning + "_" + str(index))
                for item in temp_list:
                    assert isinstance(item, np.ndarray), \
                        "Temp list has nan vector(s) for {}".format(
                            meaning + "_" + str(index))
                if target_word_vector_method == "average":
                    context_vector = np.average(temp_list, axis=0)
                elif target_word_vector_method == "sum":
                    context_vector = np.sum(temp_list, axis=0)
                if meaning in meanings_to_vec:
                    meanings_to_vec[meaning].append(context_vector)
                else:
                    meanings_to_vec[meaning] = [context_vector]
            pbar.update(1)
    # BUG FIX: the original compared against "averaging", a value the assert
    # above forbids (dead branch, so the documented "average" option did
    # nothing), and iterated the dict without .items(), which raises
    # ValueError when unpacking the keys.
    if final_vector_method == "average":
        for meaning, vec_list in meanings_to_vec.items():
            meanings_to_vec[meaning] = np.average(vec_list, axis=0)
    return meanings_to_vec
def create_context_embeddings_from_textfile(path_to_file, tokenizer, model,
                                            get_target_word_embedding_only,
                                            is_context_embedding,
                                            vector_method="full_list"):
    """Build sense embeddings from a meaning->sentences text corpus.

    Three modes, per the flags:
      * target-word only: average the target token's word-piece vectors;
      * context embedding: average all pieces except [CLS]/[SEP]/target;
      * CLS embedding: use the [CLS] vector alone.
    Each vector (sum of the last four hidden layers per piece) is appended
    under its meaning; with ``vector_method`` "average"/"averaging" the lists
    are collapsed to their mean at the end.

    :return: dict mapping meaning -> list of vectors (or one mean vector)
    """
    if get_target_word_embedding_only:
        print(
            "Please note that since only the target word embedding will be used to represent a meaning, "
            "then the paramater 'is_context_embedding' will be ignored ")
    meaning_to_sentence = get_meaning_to_sentence(path_to_corpus=path_to_file)
    layer_indexes = [-1, -2, -3, -4]
    meanings_to_vec = {}
    total_length = 0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    for meaning, sentence_target_dict_list in meaning_to_sentence.items():
        total_length += len(sentence_target_dict_list)
    with tqdm(total=total_length, desc="Creating context embeddings") as pbar:
        for meaning, sentence_target_dict_list in meaning_to_sentence.items():
            for sentence_target_dict in sentence_target_dict_list:
                sentence = sentence_target_dict["sentence"]
                target_index = sentence_target_dict["target_index"]
                sentence_tokens = []
                for tok_index, tok in enumerate(sentence):
                    token_id = "target" if tok_index == target_index else "unknown"
                    sentence_tokens.append(Token(token_id=token_id, text=tok,
                                                 pos='n', lemma='unknown'))
                sentence_tokens.insert(0, Token(text='[CLS]', token_id='unknown'))
                sentence_tokens.append(Token(text='[SEP]', token_id='unknown'))
                new_wsd_tokens = get_new_wsd_tokens(sentence_tokens, tokenizer)
                # BUG FIX: use the plural helper; its result is consumed with
                # `in` below, whereas get_target_index is compared as a
                # scalar elsewhere in this module.
                target_indexes = get_target_indexes("target", new_wsd_tokens, 0)
                tokenized_sentence = tokenizer.tokenize(' '.join(sentence))
                tokenized_sentence.insert(0, '[CLS]')
                tokenized_sentence.append('[SEP]')
                input_ids = np.asarray(
                    tokenizer.convert_tokens_to_ids(tokenized_sentence)) \
                    .reshape(1, len(tokenized_sentence))
                input_mask = np.asarray([1] * len(tokenized_sentence)).reshape(
                    1, len(tokenized_sentence))
                input_ids = torch.tensor(input_ids, dtype=torch.long).to(device)
                input_mask = torch.tensor(input_mask, dtype=torch.long).to(device)
                with torch.no_grad():
                    all_encoder_layers, _ = model(input_ids,
                                                  token_type_ids=None,
                                                  attention_mask=input_mask)
                # Collect, per word piece, the values of the selected layers.
                all_out_features = []
                for i, token in enumerate(tokenized_sentence):
                    all_layers = []
                    for layer_index in layer_indexes:
                        layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
                        layers = collections.OrderedDict()
                        layers["index"] = layer_index
                        layers["values"] = [round(x.item(), 6)
                                            for x in layer_output[0][i]]
                        all_layers.append(layers)
                    out_features = collections.OrderedDict()
                    out_features["token"] = token
                    out_features["layers"] = all_layers
                    all_out_features.append(out_features)
                token_average_list = list()
                for feature_index, feature in enumerate(all_out_features):
                    token = feature['token']
                    # BUG FIX: `and` binds tighter than `or`, so the original
                    # condition always skipped [CLS]/[SEP] — making the
                    # CLS-only `break` below unreachable and shifting the
                    # positions checked against target_indexes in
                    # target-word mode.  Special tokens and target pieces
                    # are excluded only when building a context embedding.
                    if (is_context_embedding and not get_target_word_embedding_only
                            and (token == '[CLS]' or token == '[SEP]'
                                 or feature_index in target_indexes)):
                        continue
                    layer_values = [layer['values'] for layer in feature["layers"]]
                    token_average_list.append(np.sum(layer_values, axis=0))
                    if (not is_context_embedding
                            and not get_target_word_embedding_only
                            and token == '[CLS]'):
                        # CLS-only mode: the first vector is all we need.
                        break
                if is_context_embedding and not get_target_word_embedding_only:
                    context_vector = np.average(token_average_list, axis=0)
                elif not is_context_embedding and not get_target_word_embedding_only:
                    context_vector = token_average_list[0]
                elif get_target_word_embedding_only:
                    temp_list = [vec for token_index, vec
                                 in enumerate(token_average_list)
                                 if token_index in target_indexes]
                    context_vector = np.average(temp_list, axis=0)
                if not isinstance(context_vector, np.ndarray):
                    # Was a bare debug print(); surface the actual problem.
                    print("Warning: no embedding produced for meaning {}".format(meaning))
                if meaning in meanings_to_vec:
                    meanings_to_vec[meaning].append(context_vector)
                else:
                    meanings_to_vec[meaning] = [context_vector]
                pbar.update(1)
    # BUG FIX: accept both spellings used in this codebase ("average" in the
    # sibling dataframe function, "averaging" here) and iterate with
    # .items(); the original `for meaning, vec_list in meanings_to_vec`
    # raises ValueError when unpacking the dict's keys.
    if vector_method in ("average", "averaging"):
        for meaning, vec_list in meanings_to_vec.items():
            meanings_to_vec[meaning] = np.average(vec_list, axis=0)
    return meanings_to_vec