import collections
import itertools
import math
import re
import string
from itertools import islice
from typing import List

import numpy
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# `_get_ngrams` and `cal_rouge` are project-local ROUGE helpers used below;
# they are assumed to be defined or imported elsewhere in this module.


class DocumentOracleDerivation(object):
    """Derive extractive oracle labels for a document by beam-searching over
    sentence combinations and keeping those with the highest average
    ROUGE-1/ROUGE-2 F1 against the (remaining) reference summary."""

    def __init__(self,
                 mixed_combination: bool,
                 min_combination_num: int = 1,
                 max_combination_num: int = 8,
                 rm_stop_word: bool = True,
                 stem: bool = False,
                 morphy: bool = False,
                 tokenization: bool = True,
                 beam_sz: int = 5,
                 prune_candidate_percent: float = 0.4):
        self.mixed_combination = mixed_combination
        self.min_combination_num = min_combination_num
        self.max_combination_num = max_combination_num
        self.rm_stop_word = rm_stop_word
        self.stem = stem
        self.tokenization = tokenization
        self.beam_sz = beam_sz
        self.prune_candidate_percent = prune_candidate_percent
        if self.stem:
            # NOTE: `stem_word` is the older NLTK API; recent NLTK releases
            # expose this as `PorterStemmer().stem`.
            self.stemmer = PorterStemmer().stem_word
        else:
            self.stemmer = lambda x: x
        self.morphy = morphy
        if self.tokenization:
            from allennlp.data.tokenizers.word_tokenizer import WordTokenizer
            self.tokenizer = WordTokenizer()
        if self.rm_stop_word:
            self.stop_words = list(set(stopwords.words('english'))) + [
                x for x in string.punctuation
            ] + ['``', '\'\'']
        else:
            self.stop_words = []

    def derive_doc_oracle(
            self,
            doc_list: List[str],
            ref_sum: str,
            prefix_summary: str = "",
    ):
        # Returns a dict keyed by ROUGE F1 score; each value is the list of
        # selected sentence indices into doc_list (the 0/1 vector of size
        # len(doc_list) is left commented out below).
        # processed_doc_list, processed_ref_sum_str, processed_prefix_sum_str = [], '', ''
        len_of_doc = len(doc_list)  # only used by the commented-out 0/1 output
        processed_doc_list = [self._rouge_clean(x) for x in doc_list]
        processed_ref_sum_str = self._rouge_clean(ref_sum)
        processed_prefix_sum_str = self._rouge_clean(prefix_summary)
        if self.tokenization:
            new_processed_doc_list = []
            token_doc_list = self.tokenizer.batch_tokenize(processed_doc_list)
            for doc in token_doc_list:
                new_processed_doc_list.append([word.text for word in doc])
            processed_doc_list = new_processed_doc_list
            processed_ref_sum_list = [
                w.text for w in self.tokenizer.tokenize(processed_ref_sum_str)
            ]
            processed_prefix_sum_list = [
                w.text
                for w in self.tokenizer.tokenize(processed_prefix_sum_str)
            ]
        else:
            processed_doc_list = [d.split(" ") for d in processed_doc_list]
            processed_ref_sum_list = processed_ref_sum_str.split(" ")
            processed_prefix_sum_list = processed_prefix_sum_str.split(" ")
        # must lowercase everything
        processed_doc_list = [[x.lower() for x in sent]
                              for sent in processed_doc_list]
        processed_ref_sum_list = [x.lower() for x in processed_ref_sum_list]
        processed_prefix_sum_list = [
            x.lower() for x in processed_prefix_sum_list
        ]
        # if self.rm_stop_word:
        #     processed_doc_list = [[x for x in sent if x not in self.stop_words] for sent in processed_doc_list]
        #     processed_ref_sum_list = [x for x in processed_ref_sum_list if x not in self.stop_words]
        #     processed_prefix_sum_list = [x for x in processed_prefix_sum_list if x not in self.stop_words]

        # Target only the reference tokens not already covered by the prefix.
        target_ref_sum_list = [
            x for x in processed_ref_sum_list
            if x not in processed_prefix_sum_list
        ]
        # TODO
        f_score_list, score_matrix = self.iter_rouge(processed_doc_list,
                                                     target_ref_sum_list)
        # preprocessing finished
        filtered_doc_list, map_from_new_to_ori_idx = self.pre_prune(
            processed_doc_list, target_ref_sum_list)
        combination_data_dict = {}
        # note: max_combination_num is exclusive here
        for num_sent_in_combination in range(self.min_combination_num,
                                             self.max_combination_num):
            combination_data = self.comp_num_seg_out_of_p_sent_beam(
                _filtered_doc_list=filtered_doc_list,
                num_sent_in_combination=num_sent_in_combination,
                target_ref_sum_list=target_ref_sum_list,
                map_from_new_to_ori_idx=map_from_new_to_ori_idx)
            if combination_data['best'] is None:
                break
            best_rouge_of_this_batch = combination_data['best']['R1']
            # Stop growing the combination size once the beam is full and the
            # new batch cannot beat the weakest entry already kept.
            if len(combination_data_dict) >= self.beam_sz:
                rouge_in_bag = [
                    float(k) for k, v in combination_data_dict.items()
                ]
                if best_rouge_of_this_batch < min(rouge_in_bag):
                    break
            combination_data_dict = {
                **combination_data_dict,
                **combination_data['data']
            }
            combination_data_dict = collections.OrderedDict(
                sorted(combination_data_dict.items(), reverse=True))
            sliced = islice(combination_data_dict.items(), self.beam_sz)
            combination_data_dict = collections.OrderedDict(sliced)
            # combination_data_dict[num_sent_in_combination] = combination_data

        # prepare return data
        return_dict = {}
        for k, v in combination_data_dict.items():
            # tmp_list = [0 for _ in range(len_of_doc)]
            # for i in v['label']:
            #     tmp_list[i] = 1
            return_dict[k] = v['label']
        return return_dict

    def iter_rouge(self, list_of_doc, ref_sum):
        # ROUGE of every single sentence and of every sentence pair against
        # the reference; used for analysis/debugging.
        f_score_list = [
            self.get_rouge_ready_to_use(ref_sum, x) for x in list_of_doc
        ]
        # score_matrix_delta = [[0 for _ in range(len(list_of_doc))] for _ in range(len(list_of_doc))]
        score_matrix = [[0 for _ in range(len(list_of_doc))]
                        for _ in range(len(list_of_doc))]
        input = []  # only consumed by the commented-out multiprocessing path
        for idx, x in enumerate(list_of_doc):
            for jdx, y in enumerate(list_of_doc):
                input.append((idx, jdx, ref_sum, x + y))
                s = self.get_rouge_ready_to_use(ref_sum, x + y)
                score_matrix[idx][jdx] = s
                # if f_score_list[idx] < 0.01:
                #     # score_matrix_delta[idx][jdx] = 0
                # else:
                #     score_matrix_delta[idx][jdx] = min(s / (f_score_list[idx] + 0.001), 2)
        # import numpy as np
        # np.set_printoptions(precision=2)
        # import seaborn as sns
        # sns.set()
        # f_score_list = np.asarray([f_score_list, f_score_list])
        # bx = sns.heatmap(f_score_list)
        # fig = bx.get_figure()
        # fig.savefig("individual_output.png")
        # print('-' * 30)
        # print(np.asarray(score_matrix))
        # score_matrix_delta = np.asarray(score_matrix_delta)
        # ax = sns.heatmap(score_matrix_delta)
        # fig = ax.get_figure()
        # fig.savefig("output.png")
        # ncpu = multiprocessing.cpu_count()
        # pool = multiprocessing.Pool(processes=ncpu)
        # results = pool.starmap(self.get_rouge_ready_to_use, input)
        # for r in results:
        #     score, idx, jdx = r
        #     score_matrix[idx][jdx] = score
        return f_score_list, score_matrix

    def comp_num_seg_out_of_p_sent_beam(self, _filtered_doc_list,
                                        num_sent_in_combination,
                                        target_ref_sum_list,
                                        map_from_new_to_ori_idx) -> dict:
        beam: List[dict] = []
        if len(_filtered_doc_list) < num_sent_in_combination:
            return {
                "nlabel": num_sent_in_combination,
                "data": {},
                "best": None
            }
        combs = list(range(0, len(_filtered_doc_list)))  # _num_edu seq_len
        cur_beam = {"in": [], "todo": combs, "val": 0}
        beam.append(cur_beam)
        for t in range(num_sent_in_combination):
            dict_pattern = {}
            # compute top beam_sz for every beam
            global_board = []
            for b in beam:
                already_in_beam = b['in']
                todo = b['todo']
                leaderboard = {}
                for to_add in todo:
                    after_add = already_in_beam + [to_add]
                    candidate_doc_list = list(
                        itertools.chain.from_iterable(
                            [_filtered_doc_list[i] for i in after_add]))
                    # average_f_score = self.get_approximate_rouge(target_ref_sum_list, candidate_doc_list)
                    average_f_score = self.get_rouge_ready_to_use(
                        gold_tokens=target_ref_sum_list,
                        pred_tokens=candidate_doc_list)
                    leaderboard[to_add] = average_f_score
                sorted_beam = [(k, leaderboard[k]) for k in sorted(
                    leaderboard, key=leaderboard.get, reverse=True)]
                for it in sorted_beam:
                    new_in = already_in_beam + [it[0]]
                    new_in.sort()
                    str_new_in = [str(x) for x in new_in]
                    if '_'.join(str_new_in) in dict_pattern:
                        continue
                    else:
                        dict_pattern['_'.join(str_new_in)] = True
                    new_list = todo.copy()
                    new_list.remove(it[0])
                    _beam = {"in": new_in, "todo": new_list, "val": it[1]}
                    global_board.append(_beam)
            # merge and get the top beam_sz among all
            sorted_global_board = sorted(global_board,
                                         key=lambda x: x["val"],
                                         reverse=True)
            _cnt = 0
            check_dict = []
            beam_waitlist = []
            for it in sorted_global_board:
                str_in = sorted(it['in'])
                str_in = [str(x) for x in str_in]
                _tmp_key = '_'.join(str_in)
                if _tmp_key in check_dict:
                    continue
                else:
                    beam_waitlist.append(it)
                    check_dict.append(_tmp_key)
                _cnt += 1
                if _cnt >= self.beam_sz:
                    break
            beam = beam_waitlist
        # if len(beam) < 2:
        #     print(len(_filtered_doc_list))
        #     print(_num_edu)

        # Write oracle to a string like: 0.4 0.3 0.4
        _comb_bag = {}
        for it in beam:
            n_comb = it['in']
            n_comb.sort()
            n_comb_original = [map_from_new_to_ori_idx[a] for a in n_comb]
            n_comb_original.sort()
            # json label
            n_comb_original = [int(x) for x in n_comb_original]
            candidate_doc_list = list(
                itertools.chain.from_iterable(
                    [_filtered_doc_list[i] for i in n_comb]))
            # f1 = self.get_approximate_rouge(target_ref_sum_list, candidate_doc_list)
            f1 = self.get_rouge_ready_to_use(target_ref_sum_list,
                                             candidate_doc_list)
            # f_avg = (f1 + f2 + fl) / 3
            _comb_bag[f1] = {
                "label": n_comb_original,
                "R1": f1,
                "nlabel": num_sent_in_combination
            }
        # print(len(_comb_bag))
        if len(_comb_bag) == 0:
            return {
                "nlabel": num_sent_in_combination,
                "data": {},
                "best": None
            }
        else:
            best_key = sorted(_comb_bag.keys(), reverse=True)[0]
            rt_dict = {
                "nlabel": num_sent_in_combination,
                "data": _comb_bag,
                "best": _comb_bag[best_key]
            }
            return rt_dict

    @staticmethod
    def _rouge_clean(s):
        return re.sub(r'[^a-zA-Z0-9 ]', '', s)

    def get_rouge_ready_to_use_w_index(self, gold_tokens: List[str],
                                       pred_tokens: List[str], idx, jdx):
        return self.get_rouge_ready_to_use(gold_tokens, pred_tokens), idx, jdx

    # No synonym expansion; standard version.
    def get_rouge_ready_to_use(self, gold_tokens: List[str],
                               pred_tokens: List[str]):
        # Returns the average of ROUGE-1 and ROUGE-2 F1.
        len_gold = len(gold_tokens)
        len_pred = len(pred_tokens)
        gold_bigram = _get_ngrams(2, gold_tokens)
        pred_bigram = _get_ngrams(2, pred_tokens)
        if self.rm_stop_word:
            gold_unigram = set(
                [x for x in gold_tokens if x not in self.stop_words])
            pred_unigram = set(
                [x for x in pred_tokens if x not in self.stop_words])
        else:
            gold_unigram = set(gold_tokens)
            pred_unigram = set(pred_tokens)
        rouge_1 = cal_rouge(pred_unigram, gold_unigram, len_pred,
                            len_gold)['f']
        rouge_2 = cal_rouge(pred_bigram, gold_bigram, len_pred,
                            len_gold)['f']
        rouge_score = (rouge_1 + rouge_2) / 2
        return rouge_score

    def pre_prune(self, list_of_doc: List[List[str]], ref_sum: List[str]):
        # Keep only the top prune_candidate_percent sentences by individual
        # ROUGE score to shrink the beam-search space.
        keep_candidate_num = math.ceil(
            len(list_of_doc) * self.prune_candidate_percent)
        # f_score_list = [self.get_approximate_rouge(ref_sum, x) for x in list_of_doc]
        f_score_list = [
            self.get_rouge_ready_to_use(ref_sum, x) for x in list_of_doc
        ]
        top_p_sent_idx = numpy.argsort(f_score_list)[-keep_candidate_num:]
        map_from_new_to_ori_idx = []
        # filter
        filtered_doc_list = []
        for i in range(len(top_p_sent_idx)):
            filtered_doc_list.append(list_of_doc[top_p_sent_idx[i]])
            map_from_new_to_ori_idx.append(top_p_sent_idx[i])
        return filtered_doc_list, map_from_new_to_ori_idx
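# A minimal usage sketch of DocumentOracleDerivation, assuming the
# project-local `_get_ngrams` / `cal_rouge` helpers are importable. The toy
# document and reference summary are illustrative only; real inputs would be
# the sentence list of one article and its abstract.
if __name__ == '__main__':
    oracle_derivator = DocumentOracleDerivation(mixed_combination=False,
                                                max_combination_num=3)
    doc_sentences = [
        'The cat sat on the mat.',
        'A dog barked at the mailman.',
        'The cat chased a mouse in the garden.',
    ]
    reference = 'A cat chased a mouse on the mat.'
    oracles = oracle_derivator.derive_doc_oracle(doc_list=doc_sentences,
                                                 ref_sum=reference)
    # Keys are averaged ROUGE-1/2 F1 scores of the kept beam entries; values
    # are the selected sentence indices into doc_sentences.
    for rouge_f1, label in oracles.items():
        print(rouge_f1, label)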
def main():
    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=CharacterTokenizer(),
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={
            'tokens': SingleIdTokenIndexer(namespace='target_tokens')
        })
    train_dataset = reader.read('data/tatoeba/tatoeba.eng_cmn.train.tsv')
    validation_dataset = reader.read('data/tatoeba/tatoeba.eng_cmn.dev.tsv')

    vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                      min_count={
                                          'tokens': 3,
                                          'target_tokens': 3
                                      })

    en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=EN_EMBEDDING_DIM)
    # encoder = PytorchSeq2SeqWrapper(
    #     torch.nn.LSTM(EN_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    encoder = StackedSelfAttentionEncoder(input_dim=EN_EMBEDDING_DIM,
                                          hidden_dim=HIDDEN_DIM,
                                          projection_dim=128,
                                          feedforward_hidden_dim=128,
                                          num_layers=1,
                                          num_attention_heads=8)
    source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})

    # attention = LinearAttention(HIDDEN_DIM, HIDDEN_DIM, activation=Activation.by_name('tanh')())
    # attention = BilinearAttention(HIDDEN_DIM, HIDDEN_DIM)
    attention = DotProductAttention()

    max_decoding_steps = 20  # TODO: make this variable
    model = SimpleSeq2Seq(vocab,
                          source_embedder,
                          encoder,
                          max_decoding_steps,
                          target_embedding_dim=ZH_EMBEDDING_DIM,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True)
    optimizer = optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=1,
                      cuda_device=CUDA_DEVICE)

    for i in range(50):
        print('Epoch: {}'.format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)

        for instance in itertools.islice(validation_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:',
                  predictor.predict_instance(instance)['predicted_tokens'])
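# A minimal sketch of the module-level setup that main() above relies on:
# imports, hyperparameter constants, and an entry point. The AllenNLP 0.x
# import paths are the standard ones for these classes; the constant values
# mirror the ones used elsewhere in this codebase and are otherwise an
# assumption.
import itertools

import torch.optim as optim
from allennlp.data.dataset_readers.seq2seq import Seq2SeqDatasetReader
from allennlp.data.iterators import BucketIterator
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import CharacterTokenizer, WordTokenizer
from allennlp.data.vocabulary import Vocabulary
from allennlp.models.encoder_decoders.simple_seq2seq import SimpleSeq2Seq
from allennlp.modules.attention import DotProductAttention
from allennlp.modules.seq2seq_encoders import StackedSelfAttentionEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.predictors import SimpleSeq2SeqPredictor
from allennlp.training.trainer import Trainer

EN_EMBEDDING_DIM = 256
ZH_EMBEDDING_DIM = 256
HIDDEN_DIM = 256
CUDA_DEVICE = -1  # -1 = CPU; set to a GPU id when CUDA is available

if __name__ == '__main__':
    main()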
def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
    super().__init__(model, dataset_reader)
    self._tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())
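# A hedged sketch of a companion method that such an AllenNLP Predictor
# subclass typically implements. `_json_to_instance`, the 'sentence' JSON key,
# and the reader's text_to_instance(tokens) signature are illustrative
# assumptions (JsonDict and Instance would also need to be imported); only the
# whitespace tokenizer comes from the code above.
def _json_to_instance(self, json_dict: JsonDict) -> Instance:
    # Split the raw sentence on whitespace with self._tokenizer and let the
    # dataset reader turn the tokens into an Instance.
    tokens = self._tokenizer.tokenize(json_dict['sentence'])
    return self._dataset_reader.text_to_instance(tokens)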
def main(args):
    print('Reading original dataset...')
    original_data = []
    with open(args.original) as f:
        total = sum((1 for _ in f))
    with open(args.original) as f:
        for line in tqdm(f, total=total):
            sample = json.loads(line)
            if sample['gold_label'] != '-':
                original_data.append({
                    'sentence1': sample['sentence1'],
                    'sentence2': sample['sentence2'],
                    'gold_label': sample['gold_label']
                })
    print(f'Read {len(original_data)} original instances.')
    print('-' * 100)

    print('Reading mirror instances...')
    # mirror data has sentence1/sentence2 swapped relative to the original
    mirror_data = []
    count = 0
    with open(args.mirror) as mf:
        total = sum((1 for _ in mf))
    with open(args.mirror) as mf, open(args.prediction) as pf:
        for instance, prediction in tqdm(zip(mf, pf), total=total):
            ins = json.loads(instance)
            pred = json.loads(prediction)
            mirror_data.append({
                'sentence1': ins['sentence1'],
                'sentence2': ins['sentence2'],
                'gold_label': pred['label'],
                'confidence': max(pred['label_probs'])
            })
            count += 1
    print(f'Read {count} mirror instances.')
    print('-' * 100)

    print('Finding paraphrase samples...')
    assert len(original_data) == len(mirror_data), \
        'original dataset size != mirror dataset size'
    positive_samples, negative_samples = [], []
    for original, mirror in tqdm(zip(original_data, mirror_data),
                                 total=len(original_data)):
        assert original['sentence1'] == mirror['sentence2']
        assert original['sentence2'] == mirror['sentence1']
        if original['gold_label'] == 'entailment' and mirror['gold_label'] == 'entailment' \
                and mirror['confidence'] >= args.confidence_threshold:
            positive_samples.append({
                'sentence1': original['sentence1'],
                'sentence2': original['sentence2'],
                'label': 1
            })
        else:
            negative_samples.append({
                'sentence1': original['sentence1'],
                'sentence2': original['sentence2'],
                'label': 0
            })
    print('-' * 100)

    print('Tokenizing and writing output...')
    negative_samples = random.sample(negative_samples, len(positive_samples))
    samples = positive_samples + negative_samples
    random.shuffle(samples)
    tokenizer = WordTokenizer()
    with open(args.output, 'w') as outf:
        # MRPC format
        outf.write('Quality\t#1 ID\t#2 ID\t#1 String\t#2 String\n')
        for sample in tqdm(samples, total=len(samples)):
            label = sample['label']
            sentence1, sentence2 = sample['sentence1'], sample['sentence2']
            s1_tokens = ' '.join(
                (t.text for t in tokenizer.tokenize(sentence1)))
            s2_tokens = ' '.join(
                (t.text for t in tokenizer.tokenize(sentence2)))
            outf.write(
                f'{label}\tsentence1\tsentence2\t{s1_tokens}\t{s2_tokens}\n')
    print(f'Wrote {len(samples)} paraphrase pairs to {args.output}')
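# A minimal sketch of the CLI wiring this script appears to expect. The flag
# names mirror the attributes read in main() above (args.original, args.mirror,
# args.prediction, args.confidence_threshold, args.output); the default
# threshold value and help strings are illustrative assumptions.
import argparse
import json
import random

from tqdm import tqdm
from allennlp.data.tokenizers.word_tokenizer import WordTokenizer

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Mine paraphrase pairs from an NLI dataset and the '
                    'predictions on its mirrored (sentence-swapped) version.')
    parser.add_argument('--original', help='original NLI data (jsonl)')
    parser.add_argument('--mirror', help='mirrored NLI data (jsonl)')
    parser.add_argument('--prediction',
                        help='model predictions for the mirrored data (jsonl)')
    parser.add_argument('--confidence_threshold', type=float, default=0.9)
    parser.add_argument('--output', help='output file in MRPC format')
    main(parser.parse_args())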
from allennlp.data.tokenizers.word_tokenizer import WordTokenizer
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
import json
import argparse
from collections import Counter
import random
from copy import deepcopy
from tqdm import tqdm

tokenizer = WordTokenizer(
    word_splitter=SpacyWordSplitter(pos_tags=True, ner=True))

parser = argparse.ArgumentParser()
parser.add_argument('--input_file')
parser.add_argument('--output_file')
parser.add_argument('--switch_rate', type=float, default=0.4)
args = parser.parse_args()

pronouns = [
    'he', 'him', 'his', 'she', 'her', 'it', 'its', 'they', 'them', 'their',
    'that', 'those', 'this', 'these', 'there', 'here'
]
cut_length = 12
min_length = 5
ner_tag_cnt = 3
pos_tag_cnt = 3


def question_label(question):
    # True for <START>, False for others
    q_tokens = tokenizer.tokenize(question)
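# A hedged illustration of the token annotations available to question_label()
# above: with pos_tags=True and ner=True, SpacyWordSplitter fills in
# spaCy-style fields on each Token (the example sentence is made up):
#
#     for tok in tokenizer.tokenize('Where did Ada Lovelace work ?'):
#         print(tok.text, tok.tag_, tok.ent_type_)
#
# `tag_` holds the fine-grained part-of-speech tag and `ent_type_` the
# named-entity label ('' for tokens outside any entity).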
def __init__(self):
    config = conf['seq2seq_allen']
    prefix = config['processed_data_prefix']
    train_file = config['train_data']
    valid_file = config['valid_data']
    src_embedding_dim = config['src_embedding_dim']
    trg_embedding_dim = config['trg_embedding_dim']
    hidden_dim = config['hidden_dim']

    if torch.cuda.is_available():
        cuda_device = 0
    else:
        cuda_device = -1

    self.reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={
            'tokens': SingleIdTokenIndexer(namespace='target_tokens')
        })

    self.train_dataset = self.reader.read(os.path.join(prefix, train_file))
    self.valid_dataset = self.reader.read(os.path.join(prefix, valid_file))

    vocab = Vocabulary.from_instances(self.train_dataset + self.valid_dataset,
                                      min_count={
                                          'tokens': 3,
                                          'target_tokens': 3
                                      })

    src_embedding = Embedding(
        num_embeddings=vocab.get_vocab_size('tokens'),
        embedding_dim=src_embedding_dim)

    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(src_embedding_dim, hidden_dim, batch_first=True))

    source_embedder = BasicTextFieldEmbedder({"tokens": src_embedding})

    attention = LinearAttention(hidden_dim,
                                hidden_dim,
                                activation=Activation.by_name('tanh')())

    self.model = SimpleSeq2Seq(
        vocab=vocab,
        source_embedder=source_embedder,
        encoder=encoder,
        max_decoding_steps=20,
        target_embedding_dim=trg_embedding_dim,
        target_namespace='target_tokens',
        attention=attention,  # pass attention
        use_bleu=True)

    optimizer = optim.Adam(self.model.parameters())
    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("source_tokens", "num_tokens")])
    # The iterator needs the vocab so it can index the data during training.
    iterator.index_with(vocab)

    # Only move the model to GPU when one is actually available;
    # calling .cuda(-1) would fail on CPU-only machines.
    if cuda_device >= 0:
        self.model.cuda(cuda_device)

    self.trainer = Trainer(model=self.model,
                           optimizer=optimizer,
                           iterator=iterator,
                           patience=10,
                           validation_metric="+accuracy",
                           train_dataset=self.train_dataset,
                           validation_dataset=self.valid_dataset,
                           num_epochs=1,
                           cuda_device=cuda_device)
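# A hedged usage sketch, assuming the __init__ above belongs to a seq2seq
# wrapper class (called `Seq2SeqAllen` here purely for illustration) and that
# training simply delegates to the AllenNLP Trainer built above:
#
#     wrapper = Seq2SeqAllen()
#     wrapper.trainer.train()  # trains with patience-based early stopping
#
#     # Decoding after training, using the standard predictor:
#     from allennlp.predictors import SimpleSeq2SeqPredictor
#     predictor = SimpleSeq2SeqPredictor(wrapper.model, wrapper.reader)
#     print(predictor.predict("your source sentence here")['predicted_tokens'])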
from allennlp.data.dataset_readers.seq2seq import Seq2SeqDatasetReader
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import CharacterTokenizer, WordTokenizer
from allennlp.data.vocabulary import Vocabulary
from allennlp.models.encoder_decoders.simple_seq2seq import SimpleSeq2Seq
from allennlp.modules.attention import DotProductAttention
from allennlp.modules.seq2seq_encoders import StackedSelfAttentionEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.predictors import SimpleSeq2SeqPredictor

EN_EMBEDDING_DIM = 256
ZH_EMBEDDING_DIM = 256
HIDDEN_DIM = 256
CUDA_DEVICE = -1

# Loading the reader, vocab, embeddings and model structure
reader = Seq2SeqDatasetReader(
    source_tokenizer=WordTokenizer(),
    target_tokenizer=CharacterTokenizer(),
    source_token_indexers={'tokens': SingleIdTokenIndexer()},
    target_token_indexers={
        'tokens': SingleIdTokenIndexer(namespace='target_tokens')
    },
    lazy=True)
vocab = Vocabulary.from_files(
    '/home/earendil/NLP/neural_machine_translation/checkpoint_vocab_epoch_13')

en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                         embedding_dim=EN_EMBEDDING_DIM)
encoder = StackedSelfAttentionEncoder(input_dim=EN_EMBEDDING_DIM,
                                      hidden_dim=HIDDEN_DIM,
                                      projection_dim=128,
                                      feedforward_hidden_dim=128,
                                      num_layers=1,
                                      num_attention_heads=8)
source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})
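# A hedged sketch of how this checkpoint-loading script presumably continues:
# rebuild the same SimpleSeq2Seq model, restore trained weights, and wrap it
# in a predictor. The attention choice mirrors the DotProductAttention import
# above; `model_state_path`, `max_decoding_steps`, and `beam_size` are
# illustrative assumptions, not values taken from this script.
import torch

attention = DotProductAttention()
model = SimpleSeq2Seq(vocab,
                      source_embedder,
                      encoder,
                      max_decoding_steps=20,
                      target_embedding_dim=ZH_EMBEDDING_DIM,
                      target_namespace='target_tokens',
                      attention=attention,
                      beam_size=8,
                      use_bleu=True)

model_state_path = 'checkpoint_model_epoch_13.th'  # hypothetical filename
with open(model_state_path, 'rb') as f:
    model.load_state_dict(torch.load(f, map_location='cpu'))
model.eval()

predictor = SimpleSeq2SeqPredictor(model, reader)
# print(predictor.predict("Hello, how are you?")['predicted_tokens'])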
import re

from .regex_expressions import *
from allennlp.data.tokenizers.word_tokenizer import WordTokenizer

tokenizer = WordTokenizer(end_tokens=['<EOS>'])


# TODO - extend settings and add emoji and emoticon processing
class Preprocessing(object):
    """
    Module for text pre-processing
    """

    def __init__(self, **kwargs):
        self.char_clean = kwargs.get('char_cleaning', True)
        self.char_normalize = kwargs.get('char_normalize', True)
        self.word_normalize = kwargs.get('word_normalization', True)
        self.expand = kwargs.get('expand', True)
        self.escape_punctuation = kwargs.get('escape_punctuation', True)
        self.negation = kwargs.get('negation', True)

    def split_text(self, text):
        return text.split()

    def tokenize(self, text):
        tokens = tokenizer.tokenize(text)
        return [t.text for t in tokens]

    def process_text(self, text):
        tokens = tokenizer.tokenize(text)
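# A hedged usage sketch of the Preprocessing wrapper above; the sample text
# and the keyword toggle are illustrative only.
if __name__ == '__main__':
    pp = Preprocessing(negation=False)
    print(pp.tokenize("I don't like rainy days :("))
    # Prints spaCy-style word tokens followed by the '<EOS>' end token.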