Code Example #1
File: bert_input_helper.py  Project: cltl/BERT-WSD
def get_new_wsd_tokens(wsd_tokens, tokenizer):
    # Re-tokenize each WSD token with the wordpiece tokenizer; a token that splits
    # into several pieces is expanded into one Token per piece, each keeping the
    # original token_id, pos and lemma so the target word can still be located.
    new_wsd_tokens = []
    for tok in wsd_tokens:
        split_text = tokenizer.tokenize(tok.text)
        if len(split_text) > 1:
            for text in split_text:
                new_wsd_tokens.append(Token(token_id=tok.token_id, text=text, pos=tok.pos, lemma=tok.lemma))
        else:
            new_wsd_tokens.append(Token(token_id=tok.token_id, text=tok.text, pos=tok.pos, lemma=tok.lemma))
    return new_wsd_tokens
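A minimal usage sketch of the helper above (the Token stand-in, the pytorch_pretrained_bert tokenizer and the example word are assumptions for illustration, not taken from the project):

# Hypothetical illustration; Token is approximated by a namedtuple here.
from collections import namedtuple
from pytorch_pretrained_bert import BertTokenizer

Token = namedtuple('Token', ['token_id', 'text', 'pos', 'lemma'])

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
wsd_tokens = [Token('target', 'embeddings', 'n', 'embedding')]

# A word that splits into several wordpieces is expanded into one Token per piece,
# all sharing token_id='target', so the target word can still be located later.
print([t.text for t in get_new_wsd_tokens(wsd_tokens, tokenizer)])
# e.g. ['em', '##bed', '##ding', '##s'] (the exact split depends on the vocabulary)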
Code Example #2
File: bert_input_helper.py  Project: cltl/BERT-WSD
    def __init__(self, corpus_path, tokenizer, logger):
        self.input_ids = []
        self.input_masks = []
        self.segment_ids = []
        self.target_indexes = []
        self.meanings = []
        self.logger = logger
        self.corpus_lines = 0

        meaning_to_sentence = get_meaning_to_sentence(path_to_corpus=corpus_path)

        for meaning, sentence_target_dict_list in meaning_to_sentence.items():
            for sentence_index, sentence_target_dict in enumerate(sentence_target_dict_list):
                sentence = sentence_target_dict["sentence"]
                sentence_tokens = []
                target_index = sentence_target_dict["target_index"]
                for tok_index, tok in enumerate(sentence):
                    token_text = tok
                    token_pos = 'n'
                    token_lemma = 'unknown'
                    if tok_index == target_index:
                        token_id = "target"
                    else:
                        token_id = "unknown"
                    sentence_tokens.append(Token(token_id=token_id, text=token_text, pos=token_pos, lemma=token_lemma))
                sentence_tokens.insert(0, Token(text='[CLS]', token_id='unknown'))
                sentence_tokens.append(Token(text='[SEP]', token_id='unknown'))
                new_wsd_tokens = get_new_wsd_tokens(wsd_tokens=sentence_tokens, tokenizer=tokenizer)
                target_index_list = get_target_indexes(target_token_id="target", wsd_tokens=new_wsd_tokens,
                                                       df_index=meaning+"_"+str(sentence_index))
                tokenized_sentence = tokenizer.tokenize(' '.join(sentence))
                example = InputExample(guid=self.corpus_lines, tokens_a=tokenized_sentence)
                feature = convert_example_to_feature(example=example, tokenizer=tokenizer,
                                                     max_seq_length=73, logger=self.logger)

                # pad the target wordpiece positions with -1 to the fixed length (73);
                # note that keras pad_sequences returns a 2-D array of shape (1, 73)
                target_index_list = pad_sequences([target_index_list], padding="post", value=-1, maxlen=73)

                self.input_ids.append(feature.input_ids)
                self.input_masks.append(feature.input_mask)
                self.segment_ids.append(feature.segment_ids)
                self.target_indexes.append(target_index_list)
                self.meanings.append(meaning)
                self.corpus_lines += 1

        self.segment_ids = np.asarray(self.segment_ids)
        self.input_masks = np.asarray(self.input_masks)
        self.input_ids = np.asarray(self.input_ids)
        self.target_indexes = np.asarray(self.target_indexes)
        self.meanings = np.asarray(self.meanings)
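The padding step above can be checked in isolation; pad_sequences is assumed to come from Keras (the example does not show its import):

# Standalone illustration of the padding used above (import path is an assumption).
from keras.preprocessing.sequence import pad_sequences

target_index_list = [4, 5, 6]   # wordpiece positions of the target word
padded = pad_sequences([target_index_list], padding="post", value=-1, maxlen=73)
print(padded.shape)             # (1, 73): note the extra leading axis
print(padded[0][:8])            # [ 4  5  6 -1 -1 -1 -1 -1]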
Code Example #3
def perform_wsd_on_test(test_dataframe, meanings, model, tokenizer,
                        layer_indexes, use_context_embeddings,
                        without_stop_words, target_word_embeddings_only):
    test_dataframe["bert_output"] = [None for _ in range(len(test_dataframe))]
    test_dataframe["meaning2confidence"] = [
        None for _ in range(len(test_dataframe))
    ]
    test_dataframe["wsd_strategy"] = [None for _ in range(len(test_dataframe))]
    test_dataframe['chosen_meaning_confidence'] = [
        None for _ in range(len(test_dataframe))
    ]
    stop_words = set(stopwords.words('english'))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    for index, instance in test_dataframe.iterrows():
        sentence = copy.deepcopy(instance.sentence)
        sentence_tokens = copy.deepcopy(instance.sentence_tokens)
        target_index = get_target_index(instance.token_ids[0], sentence_tokens,
                                        index)
        if without_stop_words:
            temp_sentence_tokens = []
            for token_index, token in enumerate(sentence_tokens):
                if not (token.text
                        in stop_words) or token_index == target_index:
                    temp_sentence_tokens.append(token)
            sentence_tokens = copy.deepcopy(temp_sentence_tokens)

        sentence_tokens.insert(0, Token(text='[CLS]', token_id='unknown'))
        sentence_tokens.append(Token(text='[SEP]', token_id='unknown'))
        new_sentence_tokens = get_new_wsd_tokens(wsd_tokens=sentence_tokens,
                                                 tokenizer=tokenizer)
        tokenized_sentence = tokenizer.tokenize(sentence)

        tokenized_sentence.insert(0, '[CLS]')
        tokenized_sentence.append('[SEP]')

        target_indexes = get_target_indexes(instance.token_ids[0],
                                            new_sentence_tokens, index)

        if not target_word_embeddings_only:
            context_vector = get_context_vector_per_sentence(
                tokenized_sentence=tokenized_sentence,
                tokenizer=tokenizer,
                model=model,
                target_index_list=target_indexes,
                layer_index_list=layer_indexes,
                is_context_embedding=use_context_embeddings)
        else:
            context_vector = get_targetword_embedding_per_sentence(
                tokenized_sentence=tokenized_sentence,
                tokenizer=tokenizer,
                model=model,
                target_index_list=target_indexes,
                layer_index_list=layer_indexes,
                device=device)

        candidate_meanings = copy.deepcopy(instance.candidate_meanings)
        found_meaning = False
        meaning_similarities = dict()
        for candidate_meaning in candidate_meanings:
            meaning_similarities[candidate_meaning] = []
            if candidate_meaning in meanings:
                found_meaning = True
                similarity = 1 - spatial.distance.cosine(
                    meanings[candidate_meaning], context_vector)
                meaning_similarities[candidate_meaning].append(similarity)
            else:
                meaning_similarities[candidate_meaning] = float(0)

        wsd_strategy = "bert"

        for meaning, similarity_list in meaning_similarities.items():
            if isinstance(similarity_list, list):
                similarity_list.sort(reverse=True)
                meaning_similarities[meaning] = similarity_list[0]

        if found_meaning:
            sorted_meanings = sorted(meaning_similarities.items(),
                                     key=operator.itemgetter(1),
                                     reverse=True)
            same_confidence = [
                i for i, v in enumerate(sorted_meanings)
                if v[1] == sorted_meanings[0][1]
            ]
            if len(same_confidence) > 1 and 0 in same_confidence:
                temp_list = list()
                for idx in same_confidence:
                    temp_list.append(sorted_meanings[idx][0])
                temp_dict = {}
                for m in temp_list:
                    sense_rank = candidate_meanings.index(m)
                    temp_dict[m] = sense_rank
                sorted_ranks = sorted(temp_dict.items(),
                                      key=operator.itemgetter(1))
                bert_output = sorted_ranks[0][0]
            else:
                bert_output = sorted_meanings[0][0]
            chosen_meaning_confidence = sorted_meanings[0][1]
        else:
            bert_output = candidate_meanings[0]
            chosen_meaning_confidence = meaning_similarities[
                candidate_meanings[0]]
            wsd_strategy = "mfs_fallback"

        if len(candidate_meanings) == 1:
            bert_output = candidate_meanings[0]
            wsd_strategy = "monosemous"
            chosen_meaning_confidence = meaning_similarities[
                candidate_meanings[0]]

        test_dataframe.at[index, 'bert_output'] = bert_output
        test_dataframe.at[index, 'wsd_strategy'] = wsd_strategy
        test_dataframe.at[index, "meaning2confidence"] = meaning_similarities
        test_dataframe.at[
            index, "chosen_meaning_confidence"] = chosen_meaning_confidence

    return test_dataframe
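The core of the selection above is a cosine-similarity ranking with a tie-break on sense order; below is a condensed sketch for the case where at least one candidate meaning has an embedding (that candidate_meanings is ordered by sense frequency is an assumption suggested by the "mfs_fallback" strategy name, not stated explicitly in the code):

# Minimal sketch of the ranking and tie-breaking logic used above.
import operator
from scipy import spatial

def pick_meaning(context_vector, candidate_meanings, meanings):
    # similarity of the sentence's context vector to each candidate meaning vector
    sims = {m: (1 - spatial.distance.cosine(meanings[m], context_vector))
            if m in meanings else 0.0
            for m in candidate_meanings}
    ranked = sorted(sims.items(), key=operator.itemgetter(1), reverse=True)
    top_score = ranked[0][1]
    tied = [m for m, s in ranked if s == top_score]
    # ties go to the sense listed first in candidate_meanings
    return min(tied, key=candidate_meanings.index), top_score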
Code Example #4
def create_context_embeddings_from_dataframe(dataframe, tokenizer, model):
    layer_indexes = [-1, -2, -3, -4]
    meanings = {}
    for index, instance in dataframe.iterrows():
        sentence = copy.deepcopy(instance.sentence)
        source_wn_engs = copy.deepcopy(instance.source_wn_engs)
        original_sentence_tokens = copy.deepcopy(instance.sentence_tokens)
        original_sentence_tokens.insert(
            0, Token(token_id="unknown", text="[CLS]"))
        original_sentence_tokens.append(Token(token_id="unknown",
                                              text="[SEP]"))
        n_wsd_tokens = get_new_wsd_tokens(wsd_tokens=original_sentence_tokens,
                                          tokenizer=tokenizer)
        target_indexes = get_target_index(instance.token_ids[0], n_wsd_tokens,
                                          index)

        tokenized_sentence = tokenizer.tokenize(sentence)
        tokenized_sentence.insert(0, '[CLS]')
        tokenized_sentence.append('[SEP]')
        input_ids = np.asarray(
            tokenizer.convert_tokens_to_ids(tokenized_sentence)).reshape(
                1, len(tokenized_sentence))
        input_mask = [1] * len(tokenized_sentence)
        input_mask = np.asarray(input_mask).reshape(1, len(tokenized_sentence))
        input_ids = torch.tensor(input_ids, dtype=torch.long)
        input_mask = torch.tensor(input_mask, dtype=torch.long)

        # run BERT without gradients; the model is expected to return the hidden
        # states of every encoder layer together with the pooled output
        with torch.no_grad():
            all_encoder_layers, _ = model(input_ids,
                                          token_type_ids=None,
                                          attention_mask=input_mask)

        all_out_features = []
        for i, token in enumerate(tokenized_sentence):
            all_layers = []
            for j, layer_index in enumerate(layer_indexes):
                layer_output = all_encoder_layers[int(
                    layer_index)].detach().cpu().numpy()
                layers = collections.OrderedDict()
                layers["index"] = layer_index
                layers["values"] = [
                    round(x.item(), 6) for x in layer_output[0][i]
                ]
                all_layers.append(layers)
            out_features = collections.OrderedDict()
            out_features["token"] = token
            out_features["layers"] = all_layers
            all_out_features.append(out_features)

        token_average_list = list()
        for feature_index, feature in enumerate(all_out_features):
            token = feature['token']

            # skip the special tokens and the target word itself so the context
            # vector reflects only the surrounding words
            if token in ('[CLS]', '[SEP]') or feature_index in target_indexes:
                continue

            layers = feature["layers"]
            layer_values = []
            for layer in layers:
                values = layer['values']
                layer_values.append(values)

            summed_values = np.sum(layer_values, axis=0)
            token_average_list.append(summed_values)

        context_vector = np.average(token_average_list, axis=0)
        for source_wn_eng in source_wn_engs:
            if source_wn_eng in meanings:
                meanings[source_wn_eng].append(context_vector)
            else:
                meanings[source_wn_eng] = [context_vector]
    return meanings
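The pooling in the loop above amounts to summing each token's four selected layer vectors and then averaging over the remaining tokens; a toy check with numpy (shapes are illustrative only):

# Toy illustration of the pooling: (layers, tokens, hidden_size) -> (hidden_size,)
import numpy as np

hidden = np.random.rand(4, 3, 5)          # four layers, three tokens, hidden size 5
per_token = hidden.sum(axis=0)            # sum over the selected layers -> (3, 5)
context_vector = per_token.mean(axis=0)   # average over the kept tokens -> (5,)
print(context_vector.shape)               # (5,)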
Code Example #5
def create_target_word_embeddings_from_dataframe(
        path_to_dataframe,
        tokenizer,
        model,
        target_word_vector_method="average",
        final_vector_method="full_list"):

    assert target_word_vector_method == "average" or target_word_vector_method == "sum", \
        "You can only choose between summing the target token word pieces or averaging them!"

    assert final_vector_method == "full_list" or final_vector_method == "average", \
        "You can either choose to leave the target token embeddings " \
        "for a meaning as a list or choose 'average' to create a " \
        "one-to-one mapping between a mapping and its vector"

    dataframe = pd.read_pickle(path_to_dataframe)
    layer_indexes = [-1, -2, -3, -4]
    meanings_to_vec = {}
    total_length = len(dataframe)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    with tqdm(total=total_length, desc="Creating context embeddings") as pbar:
        for index, instance in dataframe.iterrows():
            sentence = copy.deepcopy(instance.sentence)
            sentence_tokens = copy.deepcopy(instance.sentence_tokens)
            sentence_tokens.insert(0, Token(text='[CLS]', token_id='unknown'))
            sentence_tokens.append(Token(text='[SEP]', token_id='unknown'))
            new_wsd_tokens = get_new_wsd_tokens(sentence_tokens, tokenizer)
            target_indexes = get_target_indexes(instance.token_ids[0],
                                                new_wsd_tokens, 0)
            gold_meanings = copy.deepcopy(instance.source_wn_engs)

            tokenized_sentence = tokenizer.tokenize(sentence)

            tokenized_sentence.insert(0, '[CLS]')
            tokenized_sentence.append('[SEP]')
            input_ids = np.asarray(tokenizer.convert_tokens_to_ids(tokenized_sentence)) \
                .reshape(1, len(tokenized_sentence))

            input_mask = [1] * len(tokenized_sentence)
            input_mask = np.asarray(input_mask).reshape(
                1, len(tokenized_sentence))
            input_ids = torch.tensor(input_ids, dtype=torch.long).to(device)
            input_mask = torch.tensor(input_mask, dtype=torch.long).to(device)

            with torch.no_grad():
                all_encoder_layers, _ = model(input_ids,
                                              token_type_ids=None,
                                              attention_mask=input_mask)

            all_out_features = []
            for i, token in enumerate(tokenized_sentence):
                all_layers = []
                for j, layer_index in enumerate(layer_indexes):
                    layer_output = all_encoder_layers[int(
                        layer_index)].detach().cpu().numpy()
                    layers = collections.OrderedDict()
                    layers["index"] = layer_index
                    layers["values"] = [
                        round(x.item(), 6) for x in layer_output[0][i]
                    ]
                    all_layers.append(layers)
                out_features = collections.OrderedDict()
                out_features["token"] = token
                out_features["layers"] = all_layers
                all_out_features.append(out_features)

            token_average_list = list()
            for feature_index, feature in enumerate(all_out_features):
                layers = feature["layers"]
                layer_values = []
                for layer in layers:
                    values = layer['values']
                    layer_values.append(values)

                context_vector_values = np.sum(layer_values, axis=0)
                token_average_list.append(context_vector_values)

            temp_list = []
            for token_index, token_vector in enumerate(token_average_list):
                if token_index in target_indexes:
                    temp_list.append(token_vector)

            for meaning in gold_meanings:
                assert len(temp_list) > 0, "Temp list is empty at {}".format(
                    meaning + "_" + str(index))

                for item in temp_list:
                    assert isinstance(
                        item, np.ndarray
                    ), "Temp list has nan vector(s) for {}".format(meaning +
                                                                   "_" +
                                                                   str(index))

                if target_word_vector_method == "average":
                    context_vector = np.average(temp_list, axis=0)
                elif target_word_vector_method == "sum":
                    context_vector = np.sum(temp_list, axis=0)

                if meaning in meanings_to_vec:
                    meanings_to_vec[meaning].append(context_vector)
                else:
                    meanings_to_vec[meaning] = [context_vector]

            pbar.update(1)

    if final_vector_method == "averaging":
        for meaning, vec_list in meanings_to_vec:
            meanings_to_vec[meaning] = np.average(vec_list, axis=0)

    return meanings_to_vec
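A hedged call sketch; the checkpoint name, the pickle path and the choice of pytorch_pretrained_bert are assumptions (the unpacking of the model output into per-layer hidden states suggests that library, but it is not shown here):

# Hypothetical usage; names below are assumptions, not taken from the source.
import torch
from pytorch_pretrained_bert import BertModel, BertTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased").to(device)
model.eval()

meanings_to_vec = create_target_word_embeddings_from_dataframe(
    path_to_dataframe="annotated_sentences.pkl",   # hypothetical pickled DataFrame
    tokenizer=tokenizer,
    model=model,
    target_word_vector_method="average",
    final_vector_method="average")                  # one vector per meaning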
Code Example #6
def create_context_embeddings_from_textfile(path_to_file,
                                            tokenizer,
                                            model,
                                            get_target_word_embedding_only,
                                            is_context_embedding,
                                            vector_method="full_list"):

    if get_target_word_embedding_only:
        print("Please note that since only the target word embedding will be used "
              "to represent a meaning, the parameter 'is_context_embedding' will be ignored.")

    meaning_to_sentence = get_meaning_to_sentence(path_to_corpus=path_to_file)
    layer_indexes = [-1, -2, -3, -4]
    meanings_to_vec = {}
    total_length = 0

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    for meaning, sentence_target_dict_list in meaning_to_sentence.items():
        total_length += len(sentence_target_dict_list)

    with tqdm(total=total_length, desc="Creating context embeddings") as pbar:
        for meaning, sentence_target_dict_list in meaning_to_sentence.items():
            for sentence_target_dict in sentence_target_dict_list:
                sentence = sentence_target_dict["sentence"]
                sentence_tokens = []
                target_index = sentence_target_dict["target_index"]
                for tok_index, tok in enumerate(sentence):
                    token_text = tok
                    token_pos = 'n'
                    token_lemma = 'unknown'
                    if tok_index == target_index:
                        token_id = "target"
                    else:
                        token_id = "unknown"
                    sentence_tokens.append(
                        Token(token_id=token_id,
                              text=token_text,
                              pos=token_pos,
                              lemma=token_lemma))
                sentence_tokens.insert(0,
                                       Token(text='[CLS]', token_id='unknown'))
                sentence_tokens.append(Token(text='[SEP]', token_id='unknown'))
                new_wsd_tokens = get_new_wsd_tokens(sentence_tokens, tokenizer)
                target_indexes = get_target_index("target", new_wsd_tokens, 0)

                tokenized_sentence = tokenizer.tokenize(' '.join(sentence))
                tokenized_sentence.insert(0, '[CLS]')
                tokenized_sentence.append('[SEP]')
                input_ids = np.asarray(tokenizer.convert_tokens_to_ids(tokenized_sentence))\
                    .reshape(1, len(tokenized_sentence))
                input_mask = [1] * len(tokenized_sentence)
                input_mask = np.asarray(input_mask).reshape(
                    1, len(tokenized_sentence))
                input_ids = torch.tensor(input_ids,
                                         dtype=torch.long).to(device)
                input_mask = torch.tensor(input_mask,
                                          dtype=torch.long).to(device)

                with torch.no_grad():
                    all_encoder_layers, _ = model(input_ids,
                                                  token_type_ids=None,
                                                  attention_mask=input_mask)

                all_out_features = []
                for i, token in enumerate(tokenized_sentence):
                    all_layers = []
                    for j, layer_index in enumerate(layer_indexes):
                        layer_output = all_encoder_layers[int(
                            layer_index)].detach().cpu().numpy()
                        layers = collections.OrderedDict()
                        layers["index"] = layer_index
                        layers["values"] = [
                            round(x.item(), 6) for x in layer_output[0][i]
                        ]
                        all_layers.append(layers)
                    out_features = collections.OrderedDict()
                    out_features["token"] = token
                    out_features["layers"] = all_layers
                    all_out_features.append(out_features)

                token_average_list = list()
                for feature_index, feature in enumerate(all_out_features):
                    token = feature['token']

                    # only in context-embedding mode are the special tokens and the
                    # target word itself excluded from the pooled representation
                    if (token == '[CLS]' or token == '[SEP]' or feature_index in target_indexes) \
                            and is_context_embedding and not get_target_word_embedding_only:
                        continue

                    layers = feature["layers"]
                    layer_values = []
                    for layer in layers:
                        values = layer['values']
                        layer_values.append(values)

                    context_vector_values = np.sum(layer_values, axis=0)
                    token_average_list.append(context_vector_values)

                    # in [CLS]-embedding mode only the first token's vector is needed
                    if not is_context_embedding and not get_target_word_embedding_only and token == '[CLS]':
                        break

                if is_context_embedding and not get_target_word_embedding_only:
                    context_vector = np.average(token_average_list, axis=0)
                elif not is_context_embedding and not get_target_word_embedding_only:
                    context_vector = token_average_list[0]
                elif get_target_word_embedding_only:
                    temp_list = []
                    for token_index, token_vector in enumerate(
                            token_average_list):
                        if token_index in target_indexes:
                            temp_list.append(token_vector)
                    context_vector = np.average(temp_list, axis=0)
                if not isinstance(context_vector, np.ndarray):
                    print("Warning: no valid context vector was built for meaning {}".format(meaning))
                if meaning in meanings_to_vec:
                    meanings_to_vec[meaning].append(context_vector)
                else:
                    meanings_to_vec[meaning] = [context_vector]

                pbar.update(1)

    if vector_method == "averaging":
        for meaning, vec_list in meanings_to_vec.items():
            meanings_to_vec[meaning] = np.average(vec_list, axis=0)

    return meanings_to_vec