예제 #1
0
class DocumentOracleDerivation(object):
    """Derive extractive oracle labels for a document with a ROUGE beam search.

    Given the sentences (or other segments) of a document and a reference
    summary, the class looks for the sentence combinations whose concatenated
    tokens score best under ``get_rouge_ready_to_use`` (the mean of a
    unigram-set and a bigram F-measure).
    """

    def __init__(self,
                 mixed_combination: bool,
                 min_combination_num: int = 1,
                 max_combination_num: int = 8,
                 rm_stop_word: bool = True,
                 stem: bool = False,
                 morphy: bool = False,
                 tokenization: bool = True,
                 beam_sz: int = 5,
                 prune_candidate_percent: float = 0.4):
        """Store the search configuration and build the optional helpers.

        Args:
            mixed_combination: Stored on the instance; no method of this
                class reads it.
            min_combination_num: Smallest combination size that is searched.
            max_combination_num: Exclusive upper bound on the combination
                size (the search loop uses ``range(min, max)``).
            rm_stop_word: When true, English stop words and punctuation are
                excluded from the unigram sets used for scoring.
            stem: When true ``self.stemmer`` is a Porter stemmer, otherwise
                the identity function. No method of this class calls it.
            morphy: Stored on the instance; no method of this class reads it.
            tokenization: When true, text is tokenized with AllenNLP's
                ``WordTokenizer``; otherwise it is split on single spaces.
            beam_sz: Beam width of the search and maximum number of
                combinations returned by ``derive_doc_oracle``.
            prune_candidate_percent: Fraction of the sentences kept by
                ``pre_prune`` before the beam search.
        """
        self.mixed_combination = mixed_combination
        self.min_combination_num = min_combination_num
        self.max_combination_num = max_combination_num
        self.rm_stop_word = rm_stop_word
        self.stem = stem
        self.tokenization = tokenization
        self.beam_sz = beam_sz
        self.prune_candidate_percent = prune_candidate_percent
        if self.stem:
            # ``stem`` is the public single-word entry point of NLTK's
            # PorterStemmer; ``stem_word`` is not available in current NLTK.
            self.stemmer = PorterStemmer().stem
        else:
            self.stemmer = lambda x: x

        self.morphy = morphy

        if self.tokenization:
            from allennlp.data.tokenizers.word_tokenizer import WordTokenizer
            self.tokenizer = WordTokenizer()
        if self.rm_stop_word:
            self.stop_words = (list(set(stopwords.words('english'))) +
                               list(string.punctuation) + ['``', '\'\''])
        else:
            self.stop_words = []

    def derive_doc_oracle(
        self,
        doc_list: List[str],
        ref_sum: str,
        prefix_summary: str = "",
    ):
        """Search for the best-scoring sentence combinations of a document.

        Args:
            doc_list: The document, one string per sentence.
            ref_sum: The reference summary.
            prefix_summary: Summary text that is already covered; every
                reference token that also occurs in it is dropped from the
                scoring target.

        Returns:
            A dict with at most ``beam_sz`` entries, ordered by descending
            score. Each key is a combination's score and each value is the
            sorted list of indices into ``doc_list`` forming that combination.
        """
        processed_doc_list = [self._rouge_clean(x) for x in doc_list]
        processed_ref_sum_str = self._rouge_clean(ref_sum)
        processed_prefix_sum_str = self._rouge_clean(prefix_summary)
        if self.tokenization:
            token_doc_list = self.tokenizer.batch_tokenize(processed_doc_list)
            processed_doc_list = [[word.text for word in doc]
                                  for doc in token_doc_list]
            processed_ref_sum_list = [
                w.text for w in self.tokenizer.tokenize(processed_ref_sum_str)
            ]
            processed_prefix_sum_list = [
                w.text
                for w in self.tokenizer.tokenize(processed_prefix_sum_str)
            ]
        else:
            processed_doc_list = [d.split(" ") for d in processed_doc_list]
            processed_ref_sum_list = processed_ref_sum_str.split(" ")
            processed_prefix_sum_list = processed_prefix_sum_str.split(" ")

        # Scoring is case-insensitive, so everything must be lower-cased.
        processed_doc_list = [[x.lower() for x in sent]
                              for sent in processed_doc_list]
        processed_ref_sum_list = [x.lower() for x in processed_ref_sum_list]
        # A set gives O(1) membership tests for the filter below.
        prefix_tokens = {x.lower() for x in processed_prefix_sum_list}

        target_ref_sum_list = [
            x for x in processed_ref_sum_list if x not in prefix_tokens
        ]

        # The pairwise ``iter_rouge`` matrix used to be computed here but its
        # result was never read; the quadratic call has been dropped.

        # preprocessing finished
        filtered_doc_list, map_from_new_to_ori_idx = self.pre_prune(
            processed_doc_list, target_ref_sum_list)
        combination_data_dict = {}
        for num_sent_in_combination in range(self.min_combination_num,
                                             self.max_combination_num):
            combination_data = self.comp_num_seg_out_of_p_sent_beam(
                _filtered_doc_list=filtered_doc_list,
                num_sent_in_combination=num_sent_in_combination,
                target_ref_sum_list=target_ref_sum_list,
                map_from_new_to_ori_idx=map_from_new_to_ori_idx)
            if combination_data['best'] is None:
                break
            best_rouge_of_this_batch = combination_data['best']['R1']
            # Stop growing the combination once the bag is full and the new
            # size cannot beat the weakest entry already kept.
            if (len(combination_data_dict) >= self.beam_sz
                    and best_rouge_of_this_batch < min(
                        float(k) for k in combination_data_dict)):
                break

            merged = {**combination_data_dict, **combination_data['data']}
            ranked = sorted(merged.items(), reverse=True)
            combination_data_dict = collections.OrderedDict(
                islice(ranked, self.beam_sz))

        # prepare return data
        return {k: v['label'] for k, v in combination_data_dict.items()}

    def iter_rouge(self, list_of_doc, ref_sum):
        """Score every sentence and every ordered sentence pair.

        Args:
            list_of_doc: Tokenized sentences.
            ref_sum: Tokenized reference summary.

        Returns:
            A tuple ``(f_score_list, score_matrix)`` where ``f_score_list[i]``
            is the score of sentence ``i`` and ``score_matrix[i][j]`` is the
            score of sentence ``i`` concatenated with sentence ``j``.
        """
        f_score_list = [
            self.get_rouge_ready_to_use(ref_sum, x) for x in list_of_doc
        ]
        score_matrix = [[
            self.get_rouge_ready_to_use(ref_sum, x + y) for y in list_of_doc
        ] for x in list_of_doc]
        return f_score_list, score_matrix

    def comp_num_seg_out_of_p_sent_beam(self, _filtered_doc_list,
                                        num_sent_in_combination,
                                        target_ref_sum_list,
                                        map_from_new_to_ori_idx) -> dict:
        """Beam-search the best combinations of a fixed number of sentences.

        Args:
            _filtered_doc_list: Tokenized candidate sentences (after pruning).
            num_sent_in_combination: Number of sentences per combination.
            target_ref_sum_list: Tokenized reference to score against.
            map_from_new_to_ori_idx: Maps a position in ``_filtered_doc_list``
                to the sentence's index in the original document.

        Returns:
            A dict with the keys ``"nlabel"`` (the combination size),
            ``"data"`` (score -> ``{"label", "R1", "nlabel"}``; combinations
            with an identical score overwrite each other) and ``"best"`` (the
            entry with the highest score, or ``None`` when there is none).
        """
        if len(_filtered_doc_list) < num_sent_in_combination:
            return {
                "nlabel": num_sent_in_combination,
                "data": {},
                "best": None
            }

        beam: List[dict] = [{
            "in": [],
            "todo": list(range(len(_filtered_doc_list))),
            "val": 0
        }]
        for _ in range(num_sent_in_combination):
            # Combinations already proposed in this step, as "i_j_k" keys.
            seen_patterns = set()
            global_board = []
            for b in beam:
                already_in_beam = b['in']
                todo = b['todo']

                # Score every possible one-sentence extension of this entry.
                leaderboard = {}
                for to_add in todo:
                    after_add = already_in_beam + [to_add]
                    candidate_doc_list = list(
                        itertools.chain.from_iterable(
                            _filtered_doc_list[i] for i in after_add))
                    leaderboard[to_add] = self.get_rouge_ready_to_use(
                        gold_tokens=target_ref_sum_list,
                        pred_tokens=candidate_doc_list)

                for to_add in sorted(leaderboard,
                                     key=leaderboard.get,
                                     reverse=True):
                    new_in = sorted(already_in_beam + [to_add])
                    pattern = '_'.join(str(x) for x in new_in)
                    if pattern in seen_patterns:
                        continue
                    seen_patterns.add(pattern)
                    new_todo = todo.copy()
                    new_todo.remove(to_add)
                    global_board.append({
                        "in": new_in,
                        "todo": new_todo,
                        "val": leaderboard[to_add]
                    })

            # merge and get the top beam_sz among all
            sorted_global_board = sorted(global_board,
                                         key=lambda x: x["val"],
                                         reverse=True)
            kept_keys = set()
            beam_waitlist = []
            for it in sorted_global_board:
                key = '_'.join(str(x) for x in sorted(it['in']))
                if key in kept_keys:
                    continue
                beam_waitlist.append(it)
                kept_keys.add(key)
                if len(beam_waitlist) >= self.beam_sz:
                    break
            beam = beam_waitlist

        _comb_bag = {}
        for it in beam:
            n_comb = sorted(it['in'])
            # Plain ints keep the labels JSON-serialisable.
            n_comb_original = sorted(
                int(map_from_new_to_ori_idx[a]) for a in n_comb)
            candidate_doc_list = list(
                itertools.chain.from_iterable(
                    _filtered_doc_list[i] for i in n_comb))
            f1 = self.get_rouge_ready_to_use(target_ref_sum_list,
                                             candidate_doc_list)
            _comb_bag[f1] = {
                "label": n_comb_original,
                "R1": f1,
                "nlabel": num_sent_in_combination
            }

        if not _comb_bag:
            return {
                "nlabel": num_sent_in_combination,
                "data": {},
                "best": None
            }
        return {
            "nlabel": num_sent_in_combination,
            "data": _comb_bag,
            "best": _comb_bag[max(_comb_bag)]
        }

    @staticmethod
    def _rouge_clean(s):
        """Remove every character that is not an ASCII letter, digit or space."""
        return re.sub(r'[^a-zA-Z0-9 ]', '', s)

    def get_rouge_ready_to_use_w_index(self, gold_tokens: List[str],
                                       pred_tokens: List[str], idx, jdx):
        """Return ``(score, idx, jdx)`` so a result can be matched to its cell."""
        return self.get_rouge_ready_to_use(gold_tokens, pred_tokens), idx, jdx

    # Standard version without synonym matching

    def get_rouge_ready_to_use(self, gold_tokens: List[str],
                               pred_tokens: List[str]):
        """Return the mean of the unigram and bigram ``cal_rouge`` F-scores.

        Unigrams are compared as sets, optionally with stop words removed;
        the token counts passed to ``cal_rouge`` are the unfiltered lengths.

        Args:
            gold_tokens: Tokens of the reference.
            pred_tokens: Tokens of the candidate.
        """
        len_gold = len(gold_tokens)
        len_pred = len(pred_tokens)

        gold_bigram = _get_ngrams(2, gold_tokens)
        pred_bigram = _get_ngrams(2, pred_tokens)

        if self.rm_stop_word:
            gold_unigram = {
                x for x in gold_tokens if x not in self.stop_words
            }
            pred_unigram = {
                x for x in pred_tokens if x not in self.stop_words
            }
        else:
            gold_unigram = set(gold_tokens)
            pred_unigram = set(pred_tokens)

        rouge_1 = cal_rouge(pred_unigram, gold_unigram, len_pred,
                            len_gold)['f']
        rouge_2 = cal_rouge(pred_bigram, gold_bigram, len_pred, len_gold)['f']
        return (rouge_1 + rouge_2) / 2

    def pre_prune(self, list_of_doc: List[List[str]], ref_sum: List[str]):
        """Keep only the sentences that individually score best.

        Args:
            list_of_doc: Tokenized sentences of the document.
            ref_sum: Tokenized reference to score against.

        Returns:
            A tuple ``(filtered_doc_list, map_from_new_to_ori_idx)``: the kept
            sentences in ascending score order, and for each of them its index
            in ``list_of_doc``. ``ceil(len * prune_candidate_percent)``
            sentences are kept.
        """
        keep_candidate_num = math.ceil(
            len(list_of_doc) * self.prune_candidate_percent)
        if keep_candidate_num <= 0:
            # ``scores[-0:]`` would select *every* sentence, so the
            # "keep nothing" case has to be handled explicitly.
            return [], []

        f_score_list = [
            self.get_rouge_ready_to_use(ref_sum, x) for x in list_of_doc
        ]
        top_p_sent_idx = numpy.argsort(f_score_list)[-keep_candidate_num:]

        map_from_new_to_ori_idx = [int(i) for i in top_p_sent_idx]
        filtered_doc_list = [list_of_doc[i] for i in map_from_new_to_ori_idx]
        return filtered_doc_list, map_from_new_to_ori_idx
예제 #2
0
def main():
    """Train a seq2seq translation model and print sample predictions.

    Reads the Tatoeba ``eng_cmn`` train/dev TSV files, builds a vocabulary
    and a ``SimpleSeq2Seq`` model (self-attention encoder, dot-product
    attention), then runs 50 rounds of one-epoch training. After each round
    the first ten validation instances are decoded and printed next to their
    source and gold tokens.
    """
    source_tokenizer = WordTokenizer()
    target_tokenizer = CharacterTokenizer()
    source_indexers = {'tokens': SingleIdTokenIndexer()}
    target_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='target_tokens')
    }
    reader = Seq2SeqDatasetReader(source_tokenizer=source_tokenizer,
                                  target_tokenizer=target_tokenizer,
                                  source_token_indexers=source_indexers,
                                  target_token_indexers=target_indexers)
    train_dataset = reader.read('data/tatoeba/tatoeba.eng_cmn.train.tsv')
    validation_dataset = reader.read('data/tatoeba/tatoeba.eng_cmn.dev.tsv')

    # Tokens seen fewer than three times are left out of both vocabularies.
    min_token_counts = {'tokens': 3, 'target_tokens': 3}
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                      min_count=min_token_counts)

    source_vocab_size = vocab.get_vocab_size('tokens')
    en_embedding = Embedding(num_embeddings=source_vocab_size,
                             embedding_dim=EN_EMBEDDING_DIM)
    # A single self-attention layer; an LSTM wrapper is the alternative.
    encoder = StackedSelfAttentionEncoder(input_dim=EN_EMBEDDING_DIM,
                                          hidden_dim=HIDDEN_DIM,
                                          projection_dim=128,
                                          feedforward_hidden_dim=128,
                                          num_layers=1,
                                          num_attention_heads=8)
    source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})

    # Linear or bilinear attention could be substituted here.
    attention = DotProductAttention()

    max_decoding_steps = 20  # TODO: make this variable
    model = SimpleSeq2Seq(vocab=vocab,
                          source_embedder=source_embedder,
                          encoder=encoder,
                          max_decoding_steps=max_decoding_steps,
                          target_embedding_dim=ZH_EMBEDDING_DIM,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True)
    optimizer = optim.Adam(model.parameters())

    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=1,
                      cuda_device=CUDA_DEVICE)

    # Each trainer.train() call runs one epoch, so this loop is the real
    # epoch loop and lets us inspect predictions between epochs.
    for epoch in range(50):
        print('Epoch: {}'.format(epoch))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)

        for sample in itertools.islice(validation_dataset, 10):
            fields = sample.fields
            print('SOURCE:', fields['source_tokens'].tokens)
            print('GOLD:', fields['target_tokens'].tokens)
            prediction = predictor.predict_instance(sample)
            print('PRED:', prediction['predicted_tokens'])
예제 #3
0
 def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
     """Initialise the parent class and attach a tokenizer to the instance."""
     super().__init__(model, dataset_reader)
     # NOTE(review): JustSpacesWordSplitter presumably splits on spaces only,
     # i.e. the input is expected to be pre-tokenized -- confirm.
     space_splitter = JustSpacesWordSplitter()
     self._tokenizer = WordTokenizer(word_splitter=space_splitter)
예제 #4
0
def main(args):
    """Build a balanced paraphrase dataset from NLI pairs and mirror predictions.

    A pair is a positive (paraphrase) sample when the original pair is
    labelled ``entailment`` and the mirrored pair (premise and hypothesis
    swapped) is predicted ``entailment`` with at least
    ``args.confidence_threshold`` confidence. All other pairs are negatives,
    which are down-sampled to the number of positives. The result is
    tokenized and written as a tab-separated file with an MRPC-style header.

    Args:
        args: Namespace providing ``original`` (JSON-lines file of original
            pairs), ``mirror`` (JSON-lines file of mirrored pairs),
            ``prediction`` (JSON-lines file with ``label`` and
            ``label_probs`` per mirrored pair), ``confidence_threshold`` and
            ``output`` (path of the file to write).

    Raises:
        AssertionError: If the original and mirror data differ in size or a
            mirrored pair is not the swap of its original pair.
    """
    print('Reading original dataset...')
    original_data = []
    # First pass only counts lines so tqdm can show a total.
    with open(args.original) as f:
        total = sum(1 for _ in f)
    with open(args.original) as f:
        for line in tqdm(f, total=total):
            sample = json.loads(line)
            # Pairs without a gold label ('-') are skipped.
            if sample['gold_label'] != '-':
                original_data.append({
                    'sentence1': sample['sentence1'],
                    'sentence2': sample['sentence2'],
                    'gold_label': sample['gold_label']
                })

    print(f'Read {len(original_data)} original instances.')
    print('-' * 100)
    print('Reading mirror instance...')
    mirror_data = []
    with open(args.mirror) as mf:
        total = sum(1 for _ in mf)
    with open(args.mirror) as mf, open(args.prediction) as pf:
        for instance, prediction in tqdm(zip(mf, pf), total=total):
            ins = json.loads(instance)
            pred = json.loads(prediction)
            # The predicted label stands in for the mirror pair's gold label.
            mirror_data.append({
                'sentence1': ins['sentence1'],
                'sentence2': ins['sentence2'],
                'gold_label': pred['label'],
                'confidence': max(pred['label_probs'])
            })
    print(f'From {total} mirror instances.')

    print('-' * 100)
    print('Finding paraphrase samples...')
    assert len(original_data) == len(mirror_data),\
        'original dataset size != mirror dataset size'
    positive_samples, negative_samples = [], []

    for original, mirror in tqdm(zip(original_data, mirror_data),
                                 total=len(original_data)):
        assert original['sentence1'] == mirror['sentence2']
        assert original['sentence2'] == mirror['sentence1']
        is_paraphrase = (
            original['gold_label'] == 'entailment'
            and mirror['gold_label'] == 'entailment'
            and mirror['confidence'] >= args.confidence_threshold)
        bucket = positive_samples if is_paraphrase else negative_samples
        bucket.append({
            'sentence1': original['sentence1'],
            'sentence2': original['sentence2'],
            'label': 1 if is_paraphrase else 0
        })

    print('-' * 100)
    print('Tokenize and write into output')
    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the request at the number of negatives.
    num_negative = min(len(positive_samples), len(negative_samples))
    negative_samples = random.sample(negative_samples, num_negative)
    samples = positive_samples + negative_samples
    random.shuffle(samples)

    tokenizer = WordTokenizer()
    with open(args.output, 'w') as outf:
        # MRPC format
        outf.write('Quality\t#1 ID\t#2 ID\t#1 String\t#2 String\n')

        for sample in tqdm(samples, total=len(samples)):
            label = sample['label']
            sentence1, sentence2 = sample['sentence1'], sample['sentence2']
            s1_tokens = ' '.join(
                t.text for t in tokenizer.tokenize(sentence1))
            s2_tokens = ' '.join(
                t.text for t in tokenizer.tokenize(sentence2))
            # The two ID columns carry the literal placeholders
            # 'sentence1' / 'sentence2'.
            outf.write(
                f'{label}\tsentence1\tsentence2\t{s1_tokens}\t{s2_tokens}\n')

    print(f'Written {len(samples)} pairs of paraphrase into {args.output}')
예제 #5
0
from allennlp.data.tokenizers.word_tokenizer import WordTokenizer
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
import json
import argparse
from collections import Counter
import random
from copy import deepcopy
from tqdm import tqdm

# Module-level tokenizer; the spaCy word splitter is created with POS tagging
# and NER switched on.
tokenizer = WordTokenizer(
    word_splitter=SpacyWordSplitter(pos_tags=True, ner=True))

# Command-line options. NOTE: parsing happens at import time, so importing
# this module from another program consumes that program's sys.argv.
parser = argparse.ArgumentParser()
parser.add_argument('--input_file')
parser.add_argument('--output_file')
parser.add_argument('--switch_rate', type=float, default=0.4)
args = parser.parse_args()

# Lower-case pronouns and demonstrative/locative words.
# NOTE(review): how they are used is not visible in this part of the file.
pronouns = [
    'he', 'him', 'his', 'she', 'her', 'it', 'its', 'they', 'them', 'their',
    'that', 'those', 'this', 'these', 'there', 'here'
]
# Tuning constants.
# NOTE(review): units and meaning are not visible here -- presumably token
# counts and tag-count thresholds; confirm against the code that reads them.
cut_length = 12
min_length = 5
ner_tag_cnt = 3
pos_tag_cnt = 3


def question_label(question):
    """Tokenize ``question`` with the module-level tokenizer.

    NOTE(review): only the tokenization step is present; the code that
    derives the label described below appears to be missing, so as written
    the function returns ``None``.
    """
    # True for <START>, False for others
    q_tokens = tokenizer.tokenize(question)
예제 #6
0
    def __init__(self):
        """Build the reader, datasets, vocabulary, model and trainer.

        All settings are read from ``conf['seq2seq_allen']``: the data
        directory and file names, the source/target embedding sizes and the
        hidden size. The first CUDA device is used when one is available,
        otherwise everything stays on the CPU.

        Sets ``self.reader``, ``self.train_dataset``, ``self.valid_dataset``,
        ``self.model`` and ``self.trainer``.
        """
        config = conf['seq2seq_allen']
        prefix = config['processed_data_prefix']
        train_file = config['train_data']
        valid_file = config['valid_data']
        src_embedding_dim = config['src_embedding_dim']
        trg_embedding_dim = config['trg_embedding_dim']
        hidden_dim = config['hidden_dim']

        # -1 is the "run on CPU" device id passed on to the Trainer.
        cuda_device = 0 if torch.cuda.is_available() else -1

        self.reader = Seq2SeqDatasetReader(
            source_tokenizer=WordTokenizer(),
            target_tokenizer=WordTokenizer(),
            source_token_indexers={'tokens': SingleIdTokenIndexer()},
            target_token_indexers={
                'tokens': SingleIdTokenIndexer(namespace='target_tokens')
            })

        self.train_dataset = self.reader.read(os.path.join(prefix, train_file))
        self.valid_dataset = self.reader.read(os.path.join(prefix, valid_file))

        # Tokens seen fewer than three times are left out of the vocabulary.
        vocab = Vocabulary.from_instances(self.train_dataset +
                                          self.valid_dataset,
                                          min_count={
                                              'tokens': 3,
                                              'target_tokens': 3
                                          })

        src_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=src_embedding_dim)

        encoder = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(src_embedding_dim, hidden_dim, batch_first=True))

        source_embedder = BasicTextFieldEmbedder({"tokens": src_embedding})

        attention = LinearAttention(hidden_dim,
                                    hidden_dim,
                                    activation=Activation.by_name('tanh')())

        self.model = SimpleSeq2Seq(
            vocab=vocab,
            source_embedder=source_embedder,
            encoder=encoder,
            max_decoding_steps=20,
            target_embedding_dim=trg_embedding_dim,
            target_namespace='target_tokens',
            attention=attention,  # pass attention
            use_bleu=True)

        optimizer = optim.Adam(self.model.parameters())
        iterator = BucketIterator(batch_size=32,
                                  sorting_keys=[("source_tokens", "num_tokens")
                                                ])
        # The iterator needs the vocab so it can index the data during
        # training.
        iterator.index_with(vocab)

        # Only move the model when a GPU was selected: calling .cuda() with
        # the CPU sentinel -1 fails on a machine without CUDA, which is
        # exactly the case in which -1 is chosen above.
        if cuda_device >= 0:
            self.model.cuda(cuda_device)

        # NOTE(review): the model is built with use_bleu=True; confirm that
        # it actually reports an "accuracy" metric for early stopping.
        self.trainer = Trainer(model=self.model,
                               optimizer=optimizer,
                               iterator=iterator,
                               patience=10,
                               validation_metric="+accuracy",
                               train_dataset=self.train_dataset,
                               validation_dataset=self.valid_dataset,
                               num_epochs=1,
                               cuda_device=cuda_device)
from allennlp.models.encoder_decoders.simple_seq2seq import SimpleSeq2Seq
from allennlp.modules.attention import DotProductAttention
from allennlp.modules.seq2seq_encoders import StackedSelfAttentionEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.predictors import SimpleSeq2SeqPredictor

# Embedding sizes for the source ("tokens") and target vocabularies, and the
# encoder's hidden size.
EN_EMBEDDING_DIM = 256
ZH_EMBEDDING_DIM = 256
HIDDEN_DIM = 256

# NOTE(review): presumably -1 selects the CPU, as is AllenNLP's convention --
# confirm where this constant is consumed.
CUDA_DEVICE = -1

# Rebuild the reader, vocabulary, embedding and encoder at import time.
# NOTE(review): these presumably have to match the configuration used when
# the checkpoint below was trained.
reader = Seq2SeqDatasetReader(
    source_tokenizer=WordTokenizer(),
    target_tokenizer=CharacterTokenizer(),
    source_token_indexers={'tokens': SingleIdTokenIndexer()},
    target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')},
    lazy=True)

# NOTE(review): absolute, machine-specific path; this only runs where that
# directory exists.
vocab = Vocabulary.from_files('/home/earendil/NLP/neural_machine_translation/checkpoint_vocab_epoch_13')

# Source-side embedding sized to the loaded 'tokens' vocabulary.
en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                         embedding_dim=EN_EMBEDDING_DIM)

# Single-layer self-attention encoder with 8 attention heads.
encoder = StackedSelfAttentionEncoder(input_dim=EN_EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, projection_dim=128,
                                      feedforward_hidden_dim=128, num_layers=1, num_attention_heads=8)

source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})
import re
from .regex_expressions import *

from allennlp.data.tokenizers.word_tokenizer import WordTokenizer

# Module-level tokenizer used by Preprocessing.
# NOTE(review): '<EOS>' is presumably appended to every tokenized text --
# confirm against WordTokenizer's end_tokens behaviour.
tokenizer = WordTokenizer(end_tokens=['<EOS>'])


# TODO - extend settings and add emoji and emoticon processing
class Preprocessing(object):
    """
    Module for text pre-processing
    """
    def __init__(self, **kwargs):
        self.char_clean = kwargs.get('char_cleaning', True)
        self.char_normalize = kwargs.get('char_normalize', True)
        self.word_normalize = kwargs.get('word_normalization', True)
        self.expand = kwargs.get('expand', True)
        self.escape_punctuation = kwargs.get('escape_punctuation', True)
        self.negation = kwargs.get('negation', True)

    def split_text(self, text):
        return text.split()

    def tokenize(self, text):
        """Tokenize ``text`` with the module-level tokenizer.

        Returns the ``text`` attribute of every token, in order.
        """
        words = []
        for token in tokenizer.tokenize(text):
            words.append(token.text)
        return words

    def process_text(self, text):

        tokens = tokenizer.tokenize(text)