class SentenceTaggerPredictorNER(Predictor):
    """
    Predictor for any model that maps a sentence to a single sequence of
    tags — e.g. the [`CrfTagger`](../models/crf_tagger.md) and
    [`SimpleTagger`](../models/simple_tagger.md) models.

    Note: input text is split on whitespace only, via a ``WordTokenizer``
    wrapping ``JustSpacesWordSplitter``.
    """

    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        # Whitespace-only splitting: callers are expected to supply
        # pre-tokenized (space-separated) text.
        self._tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())

    def predict(self, sentence: str) -> JsonDict:
        """Convenience wrapper: predict tags for a raw sentence string."""
        return self.predict_json({"sentence": sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}`` and converts it
        to an ``Instance`` via the dataset reader.
        """
        raw_text = json_dict["sentence"]
        token_list = self._tokenizer.tokenize(raw_text)
        return self._dataset_reader.text_to_instance(token_list)
def test_passes_through_correctly(self):
    """WordTokenizer should split punctuation into separate tokens while
    passing the words themselves through unchanged."""
    word_processor = WordTokenizer()
    sentence = "this (sentence) has 'crazy' \"punctuation\"."
    tokens = word_processor.tokenize(sentence)
    expected_tokens = [
        "this", "(", "sentence", ")", "has", "'", "crazy", "'", "\"",
        "punctuation", "\"", "."
    ]
    # tokenize() returns Token objects, not strings; compare their surface
    # text — comparing Token instances directly against str values would
    # make this assertion fail regardless of tokenizer behavior.
    assert [t.text for t in tokens] == expected_tokens
def _json_to_instance(self, json_dict: JsonDict) -> Instance:
    """
    Expects JSON that looks like ``{"sentence": "..."}``.
    Runs the underlying model, and adds the ``"label"`` to the output.
    """
    text = json_dict["sentence"]
    reader = self._dataset_reader
    # If the dataset reader carries no tokenizer of its own (neither a
    # public `tokenizer` nor a private `_tokenizer` attribute), pre-split
    # the sentence here so the reader receives a list of token strings.
    reader_has_tokenizer = (hasattr(reader, "tokenizer")
                            or hasattr(reader, "_tokenizer"))
    if not reader_has_tokenizer:
        text = [str(token) for token in WordTokenizer().tokenize(text)]
    return reader.text_to_instance(text)
def _read(self, file_path: str) -> Iterator[Instance]:
    """
    Reads a JSON corpus file and yields one Instance per document,
    tokenizing the concatenation of title + abstractText.
    """
    # Keys used per document: pmid, title, abstractText.
    word_tokenizer = WordTokenizer(
        word_splitter=SpacyWordSplitter('en_core_web_sm', True, True, True))
    with open(file_path, 'r') as fp:
        corpus = json.load(fp)
    for document in corpus['documents']:
        combined_text = document['title'] + " " + document['abstractText']
        token_seq = word_tokenizer.tokenize(combined_text)
        yield self.text_to_instance(document['pmid'], token_seq)
def _read(self, file_path: str) -> Iterator[Instance]:
    """
    Reads a SemEval-style XML file of sentences annotated with aspect terms
    and yields one tagged ``Instance`` per sentence.

    Tagging scheme (IOB1-like, as implemented below): the first token of an
    annotation gets 'I', or 'B' when it immediately follows another
    annotation; subsequent tokens inside an annotation get 'I'; all other
    tokens get 'O'.
    """
    splitter = SpacyWordSplitter('en_core_web_sm', True, True, True)
    tokenizer = WordTokenizer(word_splitter=splitter)
    root = ElementTree.parse(file_path).getroot()
    xml_sents = root.findall("./sentence")
    for xml_sent in tqdm(xml_sents):
        text = xml_sent.find("text").text
        annotations = xml_sent.find('aspectTerms')
        if annotations is not None:
            annotations = annotations.findall("aspectTerm")
        else:
            annotations = []
        # Sorts the annotations by start character.
        annotations.sort(key=lambda x: int(x.get('from')))
        # Tokenizes the sentence.
        tokens = tokenizer.tokenize(text)
        # Assigns tags based on annotations.  `ann_idx` points at the next
        # annotation not yet entered; `current` is the annotation whose span
        # we are currently inside.  (Renamed from `next`, which shadowed the
        # builtin of the same name.)
        tags = []
        ann_idx = 0
        current = None
        for token in tokens:
            # Checks if the next annotation begins somewhere in this token:
            # the annotation's start offset falls within [idx, idx + len).
            start_entity = ann_idx < len(annotations)
            start_entity = start_entity and token.idx <= int(
                annotations[ann_idx].get('from'))
            start_entity = start_entity and token.idx + len(
                token.text) > int(annotations[ann_idx].get('from'))
            if start_entity:
                tags.append('I' if current is None else 'B')
                current = annotations[ann_idx]
                ann_idx += 1
            elif current is not None:
                if token.idx < int(current.get('to')):
                    tags.append('I')
                else:
                    tags.append('O')
                    current = None
            else:
                tags.append('O')
        yield self.text_to_instance(xml_sent.get('id'), tokens, tags)
class DocumentOracleDerivation(object):
    """
    Derives extractive "oracle" sentence combinations for a document via
    beam search over sentence subsets, scored by an annotation-aware
    ROUGE-1 variant that credits synonyms and morphological variants.

    NOTE(review): relies on module-level helpers not visible in this chunk:
    ``cache_for_th`` (synonym cache dict), ``th`` (thesaurus API), ``ps``
    (a stemmer), ``flatten``, ``replace_w_morphy``, ``remove_duplicate_tok``
    — verify they exist before using this class standalone.
    """

    def __init__(self,
                 min_combination_num: int = 3,
                 max_combination_num: int = 5,
                 rm_stop_word: bool = True,
                 synonyms: bool = True,
                 stem: bool = False,
                 tokenization: bool = True,
                 beam_sz: int = 5,
                 candidate_percent: float = 1.0):
        # Bounds on how many sentences a candidate combination may contain.
        self.min_combination_num = min_combination_num
        self.max_combination_num = max_combination_num
        self.rm_stop_word = rm_stop_word
        self.stem = stem
        self.tokenization = tokenization
        self.beam_sz = beam_sz
        # Fraction of sentences kept by pre_prune before beam search.
        self.candidate_percent = candidate_percent
        if self.stem:
            self.stemmer = PorterStemmer().stem_word
        else:
            self.stemmer = lambda x: x
        self.synonyms = synonyms
        if self.tokenization:
            # Lazy import: allennlp is only needed when tokenization is on.
            from allennlp.data.tokenizers.word_tokenizer import WordTokenizer
            self.tokenizer = WordTokenizer()
        if self.rm_stop_word:
            self.stop_words = list(set(stopwords.words('english'))) + [
                x for x in string.punctuation] + ['``', '\'\'']
        else:
            self.stop_words = []

    def get_rouge_w_annotation_ready_to_use(self, gold_tokens: List[str],
                                            pred_tokens: List[str]):
        """
        Scores `pred_tokens` against `gold_tokens` with a unigram-overlap
        recall/precision that also matches stems and (fractionally weighted)
        thesaurus synonyms.

        Returns ``(customed_recall, f1, key_index)`` where `key_index` lists
        the indices of matched prediction tokens.
        """
        # Gold side: lowercase, dedupe, drop stop words, normalize morphology.
        gold_lower = list(set([x.lower() for x in gold_tokens]))
        gold_wo_stop = [x for x in gold_lower if x not in self.stop_words]
        # change of index
        gold_wo_stop = replace_w_morphy(gold_wo_stop)
        gold_stem = [ps.stem(x) for x in gold_wo_stop]
        # Prediction side: lowercase, morphy-normalize, dedupe, stem.
        pred_lower = list([x.lower() for x in pred_tokens])
        pred_lower = replace_w_morphy(pred_lower)
        pred_lower = remove_duplicate_tok(pred_lower)
        pred_stem = [ps.stem(x) for x in pred_lower]
        pred_stem = remove_duplicate_tok(pred_stem)
        size_of_gold = len(gold_stem)
        size_of_pred = len(pred_stem)
        # gold_key / gold_value: matchable surface forms and their credit.
        # A direct or stem match earns weight 1; a synonym group splits
        # weight 1 evenly across its members.
        gold_key, gold_value = [], []
        for idx, word in enumerate(gold_wo_stop):
            # for one gold word, we have a minigroup
            _tmp = []
            if word in pred_lower:
                _tmp.append(word)
            elif word in pred_stem:
                _tmp.append(word)
            elif gold_stem[idx] in pred_lower:
                _tmp.append(gold_stem[idx])
            elif gold_stem[idx] in pred_stem:
                _tmp.append(gold_stem[idx])
            # if word or stm word could match, we don't need to search syn
            if _tmp != []:
                _tmp = _tmp[0]
                gold_key.append(_tmp)
                gold_value.append(1)
            else:
                # Fall back to thesaurus synonyms, memoized in cache_for_th.
                # NOTE(review): bare except silently maps lookup failures to
                # an empty synonym list — presumably deliberate best-effort.
                if word not in cache_for_th:
                    try:
                        cache_for_th[word] = flatten(th.Word(word).synonyms('all', relevance=[3]))
                    except:
                        cache_for_th[word] = []
                if gold_stem[idx] not in cache_for_th:
                    try:
                        cache_for_th[gold_stem[idx]] = flatten(
                            th.Word(gold_stem[idx]).synonyms('all', relevance=[3]))
                    except:
                        cache_for_th[gold_stem[idx]] = []
                syn = cache_for_th[word]
                syn_stem = cache_for_th[gold_stem[idx]]
                syn = list(set(syn + syn_stem))
                # print(syn)
                l_syn = len(syn)
                if l_syn != 0:
                    gold_key += syn
                    gold_value += [float(1 / l_syn)] * l_syn
        # NOTE(review): the parameter is reassigned here — from this point
        # `gold_tokens` means "stemmed gold keys", not the original argument.
        gold_tokens = [ps.stem(x) for x in gold_key]
        # pred_set = set(pred)
        # comp intersection
        vs = 0
        key_index = []
        for p_idx in range(len(pred_lower)):
            p_word = pred_lower[p_idx]
            p_stem_word = pred_stem[p_idx]
            if p_word in gold_key:
                idx = gold_key.index(p_word)
                v = gold_value[idx]
                vs += v
                key_index.append(p_idx)
            elif p_stem_word in gold_tokens:
                idx = gold_tokens.index(p_stem_word)
                v = gold_value[idx]
                vs += v
                key_index.append(p_idx)
        rouge_recall_1 = 0
        if size_of_gold != 0:
            rouge_recall_1 = vs / float(size_of_gold)
        rouge_pre_1 = 0
        if size_of_pred != 0:
            rouge_pre_1 = vs / float(size_of_pred)
        # print(rouge_recall_1, rouge_pre_1)
        # assert rouge_recall_1 <= 1
        # assert rouge_pre_1 <= 1
        # Occasional sampled debug print (~1 in 100k calls).
        if random.random() < 0.00001:
            print("Recall: {}\tPre: {}".format(rouge_recall_1, rouge_pre_1))
            print(pred_tokens)
        # Recall-dominant score with a tiny precision tie-breaker.
        customed_recall = rouge_recall_1 + rouge_pre_1 * 0.01 - 0.01
        f1 = 0 if (rouge_recall_1 + rouge_pre_1 == 0) else 2 * (rouge_recall_1 * rouge_pre_1) / (
            rouge_recall_1 + rouge_pre_1)
        return customed_recall, f1, key_index
        # f1 = 0 if (rouge_recall_1 + rouge_pre_1 == 0) else 2 * (rouge_recall_1 * rouge_pre_1) / (
        #         rouge_recall_1 + rouge_pre_1)
        # f1 = rouge_recall_1 * 5 + rouge_pre_1

    def comp_num_seg_out_of_p_sent_beam(self, _filtered_doc_list,
                                        num_sent_in_combination,
                                        target_ref_sum_list,
                                        map_from_new_to_ori_idx) -> dict:
        """
        Beam search for the best combination of exactly
        `num_sent_in_combination` sentences out of `_filtered_doc_list`,
        scored by F1 from get_rouge_w_annotation_ready_to_use.

        Returns ``{"nlabel": ..., "data": {f1: {...}}, "best": {...}|None}``;
        `data` keys are F1 scores, values carry the original sentence
        indices under "label".
        """
        beam: List[dict] = []
        # Not enough sentences to form a combination of this size.
        if len(_filtered_doc_list) < num_sent_in_combination:
            return {"nlabel": num_sent_in_combination,
                    "data": {},
                    "best": None
                    }
        combs = list(range(0, len(_filtered_doc_list)))
        # _num_edu seq_len
        cur_beam = {
            "in": [],       # indices already chosen
            "todo": combs,  # indices still available
            "val": 0        # score of the partial combination
        }
        beam.append(cur_beam)
        for t in range(num_sent_in_combination):
            dict_pattern = {}
            # compute top beam_sz for every beam
            global_board = []
            for b in beam:
                already_in_beam = b['in']
                todo = b['todo']
                leaderboard = {}
                for to_add in todo:
                    after_add = already_in_beam + [to_add]
                    candidate_doc_list = list(itertools.chain.from_iterable(
                        [_filtered_doc_list[i] for i in after_add]))
                    # average_f_score = self.get_approximate_rouge(target_ref_sum_list, candidate_doc_list)
                    _, average_f_score, _ = self.get_rouge_w_annotation_ready_to_use(
                        gold_tokens=target_ref_sum_list,
                        pred_tokens=candidate_doc_list)
                    leaderboard[to_add] = average_f_score
                sorted_beam = [(k, leaderboard[k]) for k in sorted(
                    leaderboard, key=leaderboard.get, reverse=True)]
                # Expand each candidate, deduping identical index sets via
                # their sorted '_'-joined string key.
                for it in sorted_beam:
                    new_in = already_in_beam + [it[0]]
                    new_in.sort()
                    str_new_in = [str(x) for x in new_in]
                    if '_'.join(str_new_in) in dict_pattern:
                        continue
                    else:
                        dict_pattern['_'.join(str_new_in)] = True
                    new_list = todo.copy()
                    new_list.remove(it[0])
                    _beam = {
                        "in": new_in,
                        "todo": new_list,
                        "val": it[1]
                    }
                    global_board.append(_beam)
            # merge and get the top beam_sz among all
            sorted_global_board = sorted(global_board,
                                         key=lambda x: x["val"],
                                         reverse=True)
            _cnt = 0
            check_dict = []
            beam_waitlist = []
            for it in sorted_global_board:
                str_in = sorted(it['in'])
                str_in = [str(x) for x in str_in]
                _tmp_key = '_'.join(str_in)
                if _tmp_key in check_dict:
                    continue
                else:
                    beam_waitlist.append(it)
                    check_dict.append(_tmp_key)
                    _cnt += 1
                if _cnt >= self.beam_sz:
                    break
            beam = beam_waitlist
        # if len(beam) < 2:
        #     print(len(_filtered_doc_list))
        #     print(_num_edu)
        # Write oracle to a string like: 0.4 0.3 0.4
        # Re-score each surviving combination and map its indices back to
        # original document positions.  NOTE(review): keying by f1 collapses
        # combinations with identical scores.
        _comb_bag = {}
        for it in beam:
            n_comb = it['in']
            n_comb.sort()
            n_comb_original = [map_from_new_to_ori_idx[a] for a in n_comb]
            n_comb_original.sort()
            # json label
            n_comb_original = [int(x) for x in n_comb_original]
            candidate_doc_list = list(itertools.chain.from_iterable(
                [_filtered_doc_list[i] for i in n_comb]))
            # f1 = self.get_approximate_rouge(target_ref_sum_list, candidate_doc_list)
            _, f1, _ = self.get_rouge_w_annotation_ready_to_use(
                target_ref_sum_list, candidate_doc_list)
            # f_avg = (f1 + f2 + fl) / 3
            _comb_bag[f1] = {"label": n_comb_original,
                             "R1": f1,
                             "nlabel": num_sent_in_combination}
        # print(len(_comb_bag))
        if len(_comb_bag) == 0:
            return {"nlabel": num_sent_in_combination,
                    "data": {},
                    "best": None
                    }
        else:
            best_key = sorted(_comb_bag.keys(), reverse=True)[0]
            rt_dict = {"nlabel": num_sent_in_combination,
                       "data": _comb_bag,
                       "best": _comb_bag[best_key]
                       }
            return rt_dict

    def derive_doc_oracle(self,
                          doc_list: List[str],
                          ref_sum: str,
                          prefix_summary: str = ""
                          ):
        """
        Full pipeline: tokenize/lower/stop-word-filter the document sentences
        and reference summary, drop reference tokens already covered by
        `prefix_summary`, prune candidates, then run beam search for each
        combination size in [min_combination_num, max_combination_num).

        Returns a dict mixing two key types: F1-score keys from each round's
        "data" plus one integer key per combination size holding that round's
        full result.
        """
        processed_doc_list, processed_ref_sum_str, processed_prefix_sum_str = [], '', ''
        if self.tokenization:
            token_doc_list = self.tokenizer.batch_tokenize(doc_list)
            for doc in token_doc_list:
                processed_doc_list.append([word.text for word in doc])
            processed_ref_sum_list = [w.text for w in self.tokenizer.tokenize(ref_sum)]
            processed_prefix_sum_list = [w.text for w in self.tokenizer.tokenize(prefix_summary)]
        else:
            # No tokenizer: fall back to whitespace splitting.
            processed_doc_list = [d.split(" ") for d in doc_list]
            processed_ref_sum_list = ref_sum.split(" ")
            processed_prefix_sum_list = prefix_summary.split(" ")
        processed_doc_list = [[x.lower() for x in sent] for sent in processed_doc_list]
        processed_ref_sum_list = [x.lower() for x in processed_ref_sum_list]
        processed_prefix_sum_list = [x.lower() for x in processed_prefix_sum_list]
        if self.rm_stop_word:
            processed_doc_list = [[x for x in sent if x not in self.stop_words] for sent in processed_doc_list]
            processed_ref_sum_list = [x for x in processed_ref_sum_list if x not in self.stop_words]
            processed_prefix_sum_list = [x for x in processed_prefix_sum_list if x not in self.stop_words]
        # Target = reference tokens not already produced in prefix_summary.
        target_ref_sum_list = [x for x in processed_ref_sum_list if x not in processed_prefix_sum_list]
        # preprocessing finished
        filtered_doc_list, map_from_new_to_ori_idx = self.pre_prune(processed_doc_list, target_ref_sum_list)
        combination_data_dict = {}
        for num_sent_in_combination in range(self.min_combination_num, self.max_combination_num):
            combination_data = self.comp_num_seg_out_of_p_sent_beam(
                _filtered_doc_list=filtered_doc_list,
                num_sent_in_combination=num_sent_in_combination,
                target_ref_sum_list=target_ref_sum_list,
                map_from_new_to_ori_idx=map_from_new_to_ori_idx)
            combination_data_dict = {**combination_data_dict, **combination_data['data']}
            combination_data_dict[num_sent_in_combination] = combination_data
        return combination_data_dict

    def pre_prune(self,
                  list_of_doc: List[List[str]],
                  ref_sum: List[str]
                  ):
        """
        Keeps only the top `candidate_percent` fraction of sentences by
        individual F1 against the reference, returning the kept sentences
        and a map from new (pruned) index to original index.
        """
        keep_candidate_num = math.ceil(len(list_of_doc) * self.candidate_percent)
        # f_score_list = [self.get_approximate_rouge(ref_sum, x) for x in list_of_doc]
        f_score_list = [self.get_rouge_w_annotation_ready_to_use(ref_sum, x)[1] for x in list_of_doc]
        # argsort ascending; take the last (highest-scoring) k indices.
        top_p_sent_idx = numpy.argsort(f_score_list)[-keep_candidate_num:]
        map_from_new_to_ori_idx = []
        # filter
        filtered_doc_list = []
        for i in range(len(top_p_sent_idx)):
            filtered_doc_list.append(list_of_doc[top_p_sent_idx[i]])
            map_from_new_to_ori_idx.append(top_p_sent_idx[i])
        return filtered_doc_list, map_from_new_to_ori_idx
# Script-level smoke test: index and embed a sample sentence with a
# pretrained BERT vocabulary/weights (paths relative to a sibling
# TransformerCoqa checkout).
token_indexer = bert_indexer.PretrainedBertIndexer(
    '../TransformerCoqa/bert-base-uncased-vocab.txt',
    # NOTE(review): vocab file is "uncased" but do_lowercase is False —
    # confirm this mismatch is intended.
    do_lowercase=False,
    max_pieces=8,   # very small wordpiece window; forces doc striding
    doc_stride=3)
token_embedder = PretrainedBertEmbedder(
    '../TransformerCoqa/bert-base-uncased.tar.gz')
# with open(args.input_file, 'w') as f:
#     data = json.load(f)['data']
#
#     for article in data:
#         story = article['story']
a = "the man went to the store and bought a gallon of milk"
# NOTE(review): `tokenizer` is not defined in this chunk — presumably
# created earlier in the file; verify before running standalone.
b = tokenizer.tokenize(a)
print(b)
bert_vocab = Vocabulary()
# Convert tokens to wordpiece id windows under the 'bert' namespace.
c = token_indexer.tokens_to_indices(b, bert_vocab, 'bert')
print(c)
input_ids = c['bert']
# Decode each id window back to wordpiece strings for inspection.
for input_id in input_ids:
    tokens = [
        bert_vocab.get_token_from_index(index=idx, namespace='bert')
        for idx in input_id
    ]
    print(tokens)
# Embed the id windows with the pretrained BERT weights.
d = token_embedder(torch.LongTensor(c['bert']))
class DocumentOracleDerivation(object):
    """
    Derives extractive "oracle" sentence combinations via beam search,
    scored by the mean of ROUGE-1 and ROUGE-2 F1 (no synonym matching).

    NOTE(review): this chunk defines two classes with this name; at import
    time this later definition shadows the earlier one.  Relies on
    module-level helpers not visible here: ``_get_ngrams`` and ``cal_rouge``.
    """

    def __init__(self,
                 mixed_combination: bool,
                 min_combination_num: int = 1,
                 max_combination_num: int = 8,
                 rm_stop_word: bool = True,
                 stem: bool = False,
                 morphy: bool = False,
                 tokenization: bool = True,
                 beam_sz: int = 5,
                 prune_candidate_percent: float = 0.4):
        self.mixed_combination = mixed_combination
        # Bounds on how many sentences a candidate combination may contain.
        self.min_combination_num = min_combination_num
        self.max_combination_num = max_combination_num
        self.rm_stop_word = rm_stop_word
        self.stem = stem
        self.tokenization = tokenization
        self.beam_sz = beam_sz
        # Fraction of sentences kept by pre_prune before beam search.
        self.prune_candidate_percent = prune_candidate_percent
        if self.stem:
            self.stemmer = PorterStemmer().stem_word
        else:
            self.stemmer = lambda x: x
        self.morphy = morphy
        if self.tokenization:
            # Lazy import: allennlp only needed when tokenization is on.
            from allennlp.data.tokenizers.word_tokenizer import WordTokenizer
            self.tokenizer = WordTokenizer()
        if self.rm_stop_word:
            self.stop_words = list(set(stopwords.words('english'))) + [
                x for x in string.punctuation
            ] + ['``', '\'\'']
        else:
            self.stop_words = []

    def derive_doc_oracle(
            self,
            doc_list: List[str],
            ref_sum: str,
            prefix_summary: str = "",
    ):
        """
        Full pipeline: clean/tokenize/lowercase the inputs, drop reference
        tokens already covered by `prefix_summary`, prune candidates, then
        beam-search combination sizes until scores stop improving.

        Returns a dict mapping ROUGE F1 scores to lists of selected original
        sentence indices (at most `beam_sz` entries, best first).
        """
        # return a dict where key=rouge-f1 and value= [0,0,0,1,0,1,0,...] same size as doc_list
        # processed_doc_list, processed_ref_sum_str, processed_prefix_sum_str = [], '', ''
        len_of_doc = len(doc_list)
        # Strip everything except alphanumerics and spaces.
        processed_doc_list = [self._rouge_clean(x) for x in doc_list]
        processed_ref_sum_str = self._rouge_clean(ref_sum)
        processed_prefix_sum_str = self._rouge_clean(prefix_summary)
        if self.tokenization:
            new_processed_doc_list = []
            token_doc_list = self.tokenizer.batch_tokenize(processed_doc_list)
            for doc in token_doc_list:
                new_processed_doc_list.append([word.text for word in doc])
            processed_doc_list = new_processed_doc_list
            processed_ref_sum_list = [
                w.text for w in self.tokenizer.tokenize(processed_ref_sum_str)
            ]
            processed_prefix_sum_list = [
                w.text for w in self.tokenizer.tokenize(processed_prefix_sum_str)
            ]
        else:
            # No tokenizer: fall back to whitespace splitting.
            processed_doc_list = [d.split(" ") for d in processed_doc_list]
            processed_ref_sum_list = processed_ref_sum_str.split(" ")
            processed_prefix_sum_list = processed_prefix_sum_str.split(" ")
        # must do lower
        processed_doc_list = [[x.lower() for x in sent]
                              for sent in processed_doc_list]
        processed_ref_sum_list = [x.lower() for x in processed_ref_sum_list]
        processed_prefix_sum_list = [
            x.lower() for x in processed_prefix_sum_list
        ]
        # Stop-word removal deliberately disabled here (handled inside
        # get_rouge_ready_to_use instead).
        # if self.rm_stop_word:
        #     processed_doc_list = [[x for x in sent if x not in self.stop_words] for sent in processed_doc_list]
        #     processed_ref_sum_list = [x for x in processed_ref_sum_list if x not in self.stop_words]
        #     processed_prefix_sum_list = [x for x in processed_prefix_sum_list if x not in self.stop_words]
        # Target = reference tokens not already produced in prefix_summary.
        target_ref_sum_list = [
            x for x in processed_ref_sum_list
            if x not in processed_prefix_sum_list
        ]
        # TODO
        f_score_list, score_matrix = self.iter_rouge(processed_doc_list,
                                                     target_ref_sum_list)
        # preprocessing finished
        filtered_doc_list, map_from_new_to_ori_idx = self.pre_prune(
            processed_doc_list, target_ref_sum_list)
        combination_data_dict = {}
        for num_sent_in_combination in range(self.min_combination_num,
                                             self.max_combination_num):
            combination_data = self.comp_num_seg_out_of_p_sent_beam(
                _filtered_doc_list=filtered_doc_list,
                num_sent_in_combination=num_sent_in_combination,
                target_ref_sum_list=target_ref_sum_list,
                map_from_new_to_ori_idx=map_from_new_to_ori_idx)
            if combination_data['best'] is None:
                break
            best_rouge_of_this_batch = combination_data['best']['R1']
            # Early stop: once the bag is full, quit when larger combinations
            # can no longer beat the current worst kept score.
            if len(combination_data_dict) >= self.beam_sz:
                rouge_in_bag = [
                    float(k) for k, v in combination_data_dict.items()
                ]
                if best_rouge_of_this_batch < min(rouge_in_bag):
                    break
            combination_data_dict = {
                **combination_data_dict,
                **combination_data['data']
            }
            # Keep only the top beam_sz entries, sorted by score descending.
            combination_data_dict = collections.OrderedDict(
                sorted(combination_data_dict.items(), reverse=True))
            sliced = islice(combination_data_dict.items(), self.beam_sz)
            combination_data_dict = collections.OrderedDict(sliced)
            # combination_data_dict[num_sent_in_combination] = combination_data
        # prepare return data
        return_dict = {}
        for k, v in combination_data_dict.items():
            # tmp_list = [0 for _ in range(len_of_doc)]
            # for i in v['label']:
            #     tmp_list[i] = 1
            return_dict[k] = v['label']
        return return_dict

    def iter_rouge(self, list_of_doc, ref_sum):
        """
        Scores every sentence (f_score_list) and every ordered sentence pair
        (score_matrix, via simple concatenation x + y) against the reference.
        The `input` tuples are accumulated but only used by the commented-out
        multiprocessing variant below.
        """
        f_score_list = [
            self.get_rouge_ready_to_use(ref_sum, x) for x in list_of_doc
        ]
        # score_matrix_delta = [[0 for _ in range(len(list_of_doc))] for _ in range(len(list_of_doc))]
        score_matrix = [[0 for _ in range(len(list_of_doc))]
                        for _ in range(len(list_of_doc))]
        input = []
        for idx, x in enumerate(list_of_doc):
            for jdx, y in enumerate(list_of_doc):
                input.append((idx, jdx, ref_sum, x + y))
                s = self.get_rouge_ready_to_use(ref_sum, x + y)
                score_matrix[idx][jdx] = s
                # if f_score_list[idx] < 0.01:
                #     # score_matrix_delta[idx][jdx] = 0
                # else:
                #     score_matrix_delta[idx][jdx] = min(s / (f_score_list[idx] + 0.001), 2)
        # import numpy as np
        # np.set_printoptions(precision=2)
        # import seaborn as sns
        # sns.set()
        # f_score_list = np.asarray([f_score_list, f_score_list])
        # bx = sns.heatmap(f_score_list)
        # fig = bx.get_figure()
        # fig.savefig("individual_output.png")
        # print('-' * 30)
        # print(np.asarray(score_matrix))
        # score_matrix_delta = np.asarray(score_matrix_delta)
        # ax = sns.heatmap(score_matrix_delta)
        # fig = ax.get_figure()
        # fig.savefig("output.png")
        # ncpu=multiprocessing.cpu_count()
        # pool = multiprocessing.Pool(processes=ncpu)
        # results = pool.starmap(self.get_rouge_ready_to_use, input)
        # for r in results:
        #     score, idx,jdx = r
        #     score_matrix[idx][jdx] = score
        return f_score_list, score_matrix

    def comp_num_seg_out_of_p_sent_beam(self, _filtered_doc_list,
                                        num_sent_in_combination,
                                        target_ref_sum_list,
                                        map_from_new_to_ori_idx) -> dict:
        """
        Beam search for the best combination of exactly
        `num_sent_in_combination` sentences, scored by get_rouge_ready_to_use.

        Returns ``{"nlabel": ..., "data": {f1: {...}}, "best": {...}|None}``;
        `data` keys are F1 scores, values carry the original sentence indices
        under "label".
        """
        beam: List[dict] = []
        # Not enough sentences to form a combination of this size.
        if len(_filtered_doc_list) < num_sent_in_combination:
            return {
                "nlabel": num_sent_in_combination,
                "data": {},
                "best": None
            }
        combs = list(range(0, len(_filtered_doc_list)))
        # _num_edu seq_len
        # "in": chosen indices; "todo": remaining; "val": partial score.
        cur_beam = {"in": [], "todo": combs, "val": 0}
        beam.append(cur_beam)
        for t in range(num_sent_in_combination):
            dict_pattern = {}
            # compute top beam_sz for every beam
            global_board = []
            for b in beam:
                already_in_beam = b['in']
                todo = b['todo']
                leaderboard = {}
                for to_add in todo:
                    after_add = already_in_beam + [to_add]
                    candidate_doc_list = list(
                        itertools.chain.from_iterable(
                            [_filtered_doc_list[i] for i in after_add]))
                    # average_f_score = self.get_approximate_rouge(target_ref_sum_list, candidate_doc_list)
                    average_f_score = self.get_rouge_ready_to_use(
                        gold_tokens=target_ref_sum_list,
                        pred_tokens=candidate_doc_list)
                    leaderboard[to_add] = average_f_score
                sorted_beam = [(k, leaderboard[k]) for k in sorted(
                    leaderboard, key=leaderboard.get, reverse=True)]
                # Expand candidates, deduping identical index sets via their
                # sorted '_'-joined string key.
                for it in sorted_beam:
                    new_in = already_in_beam + [it[0]]
                    new_in.sort()
                    str_new_in = [str(x) for x in new_in]
                    if '_'.join(str_new_in) in dict_pattern:
                        continue
                    else:
                        dict_pattern['_'.join(str_new_in)] = True
                    new_list = todo.copy()
                    new_list.remove(it[0])
                    _beam = {"in": new_in, "todo": new_list, "val": it[1]}
                    global_board.append(_beam)
            # merge and get the top beam_sz among all
            sorted_global_board = sorted(global_board,
                                         key=lambda x: x["val"],
                                         reverse=True)
            _cnt = 0
            check_dict = []
            beam_waitlist = []
            for it in sorted_global_board:
                str_in = sorted(it['in'])
                str_in = [str(x) for x in str_in]
                _tmp_key = '_'.join(str_in)
                if _tmp_key in check_dict:
                    continue
                else:
                    beam_waitlist.append(it)
                    check_dict.append(_tmp_key)
                    _cnt += 1
                if _cnt >= self.beam_sz:
                    break
            beam = beam_waitlist
        # if len(beam) < 2:
        #     print(len(_filtered_doc_list))
        #     print(_num_edu)
        # Write oracle to a string like: 0.4 0.3 0.4
        # Re-score survivors and map indices back to original positions.
        # NOTE(review): keying by f1 collapses equal-scoring combinations.
        _comb_bag = {}
        for it in beam:
            n_comb = it['in']
            n_comb.sort()
            n_comb_original = [map_from_new_to_ori_idx[a] for a in n_comb]
            n_comb_original.sort()
            # json label
            n_comb_original = [int(x) for x in n_comb_original]
            candidate_doc_list = list(
                itertools.chain.from_iterable(
                    [_filtered_doc_list[i] for i in n_comb]))
            # f1 = self.get_approximate_rouge(target_ref_sum_list, candidate_doc_list)
            f1 = self.get_rouge_ready_to_use(target_ref_sum_list,
                                             candidate_doc_list)
            # f_avg = (f1 + f2 + fl) / 3
            _comb_bag[f1] = {
                "label": n_comb_original,
                "R1": f1,
                "nlabel": num_sent_in_combination
            }
        # print(len(_comb_bag))
        if len(_comb_bag) == 0:
            return {
                "nlabel": num_sent_in_combination,
                "data": {},
                "best": None
            }
        else:
            best_key = sorted(_comb_bag.keys(), reverse=True)[0]
            rt_dict = {
                "nlabel": num_sent_in_combination,
                "data": _comb_bag,
                "best": _comb_bag[best_key]
            }
            return rt_dict

    @staticmethod
    def _rouge_clean(s):
        """Strips every character except letters, digits, and spaces."""
        return re.sub(r'[^a-zA-Z0-9 ]', '', s)

    def get_rouge_ready_to_use_w_index(self, gold_tokens: List[str],
                                       pred_tokens: List[str], idx, jdx):
        """Scoring wrapper that passes (idx, jdx) through — shaped for use
        with pool.starmap in the commented multiprocessing path."""
        return self.get_rouge_ready_to_use(gold_tokens, pred_tokens), idx, jdx

    # No synomous standard version
    def get_rouge_ready_to_use(self, gold_tokens: List[str],
                               pred_tokens: List[str]):
        """
        Returns the mean of ROUGE-1 and ROUGE-2 F1 of `pred_tokens` against
        `gold_tokens`.  Stop words are removed from unigrams only (when
        rm_stop_word is set); bigrams always use the full token sequences.
        Depends on module-level `_get_ngrams` and `cal_rouge`.
        """
        len_gold = len(gold_tokens)
        len_pred = len(pred_tokens)
        gold_bigram = _get_ngrams(2, gold_tokens)
        pred_bigram = _get_ngrams(2, pred_tokens)
        if self.rm_stop_word:
            gold_unigram = set(
                [x for x in gold_tokens if x not in self.stop_words])
            pred_unigram = set(
                [x for x in pred_tokens if x not in self.stop_words])
        else:
            gold_unigram = set(gold_tokens)
            pred_unigram = set(pred_tokens)
        rouge_1 = cal_rouge(pred_unigram, gold_unigram, len_pred,
                            len_gold)['f']
        rouge_2 = cal_rouge(pred_bigram, gold_bigram, len_pred,
                            len_gold)['f']
        rouge_score = (rouge_1 + rouge_2) / 2
        return rouge_score

    def pre_prune(self, list_of_doc: List[List[str]], ref_sum: List[str]):
        """
        Keeps only the top `prune_candidate_percent` fraction of sentences by
        individual ROUGE F1 against the reference, returning the kept
        sentences and a map from new (pruned) index to original index.
        """
        keep_candidate_num = math.ceil(
            len(list_of_doc) * self.prune_candidate_percent)
        # f_score_list = [self.get_approximate_rouge(ref_sum, x) for x in list_of_doc]
        f_score_list = [
            self.get_rouge_ready_to_use(ref_sum, x) for x in list_of_doc
        ]
        # argsort ascending; take the last (highest-scoring) k indices.
        top_p_sent_idx = numpy.argsort(f_score_list)[-keep_candidate_num:]
        map_from_new_to_ori_idx = []
        # filter
        filtered_doc_list = []
        for i in range(len(top_p_sent_idx)):
            filtered_doc_list.append(list_of_doc[top_p_sent_idx[i]])
            map_from_new_to_ori_idx.append(top_p_sent_idx[i])
        return filtered_doc_list, map_from_new_to_ori_idx
def main(args):
    """
    Builds an MRPC-style paraphrase dataset from an original SNLI-format
    file, its mirrored (sentence-swapped) counterpart, and model predictions
    on the mirrored file.

    A pair is positive (label 1) when the original gold label is
    'entailment', the mirror prediction is 'entailment', and the mirror
    prediction is confident enough; otherwise negative (label 0).  Negatives
    are downsampled to the positive count and the tokenized pairs are
    written as TSV to ``args.output``.
    """
    print('Reading original dataset...')
    original_data = []
    # First pass just counts lines so tqdm can show progress.
    with open(args.original) as f:
        total = sum((1 for _ in f))
    with open(args.original) as f:
        for line in tqdm(f, total=total):
            sample = json.loads(line)
            # '-' marks SNLI items with no annotator consensus; skip them.
            if sample['gold_label'] != '-':
                original_data.append({
                    'sentence1': sample['sentence1'],
                    'sentence2': sample['sentence2'],
                    'gold_label': sample['gold_label']
                })
    print(f'Read {len(original_data)} original instances.')
    print('-' * 100)
    print('Reading mirror instance...')
    mirror_data = []
    count = 0
    with open(args.mirror) as mf:
        total = sum((1 for _ in mf))
    # Mirror file and prediction file are aligned line-by-line.
    with open(args.mirror) as mf, open(args.prediction) as pf:
        for instance, prediction in tqdm(zip(mf, pf), total=total):
            ins = json.loads(instance)
            pred = json.loads(prediction)
            mirror_data.append({
                'sentence1': ins['sentence1'],
                'sentence2': ins['sentence2'],
                'gold_label': pred['label'],
                'confidence': max(pred['label_probs'])
            })
            count += 1
    print(f'From {total} mirror instances.')
    print('-' * 100)
    print('Finding paraphrase samples...')
    # NOTE(review): original_data was filtered on gold_label != '-' but
    # mirror_data was not — this assert presumes the mirror file was built
    # from the already-filtered originals; verify upstream.
    assert len(original_data) == len(mirror_data),\
        'original dataset size != mirror dataset size'
    positive_samples, negative_samples = [], []
    for original, mirror in tqdm(zip(original_data, mirror_data),
                                 total=len(original_data)):
        # Sanity-check the mirroring: sentences must be swapped.
        assert original['sentence1'] == mirror['sentence2']
        assert original['sentence2'] == mirror['sentence1']
        # Positive only when entailment holds in both directions and the
        # mirror prediction clears the confidence threshold.
        if original['gold_label'] == 'entailment' and mirror['gold_label'] == 'entailment'\
                and mirror['confidence'] >= args.confidence_threshold:
            positive_samples.append({
                'sentence1': original['sentence1'],
                'sentence2': original['sentence2'],
                'label': 1
            })
        else:
            negative_samples.append({
                'sentence1': original['sentence1'],
                'sentence2': original['sentence2'],
                'label': 0
            })
    print('-' * 100)
    print('Tokenize and write into output')
    # Balance classes by downsampling negatives, then shuffle.
    negative_samples = random.sample(negative_samples, len(positive_samples))
    samples = positive_samples + negative_samples
    random.shuffle(samples)
    tokenizer = WordTokenizer()
    with open(args.output, 'w') as outf:
        # MRPC format
        outf.write(f'Quality\t#1 ID\t#2 ID\t#1 String\t#2 String\n')
        for sample in tqdm(samples, total=len(samples)):
            label = sample['label']
            sentence1, sentence2 = sample['sentence1'], sample['sentence2']
            s1_tokens = ' '.join(
                (t.text for t in tokenizer.tokenize(sentence1)))
            s2_tokens = ' '.join(
                (t.text for t in tokenizer.tokenize(sentence2)))
            # The literal strings 'sentence1'/'sentence2' fill the MRPC ID
            # columns — placeholder IDs, same for every row.
            outf.write(
                f'{label}\tsentence1\tsentence2\t{s1_tokens}\t{s2_tokens}\n')
    print(f'Written {len(samples)} pairs of paraphrase into {args.output}')