def modified_recall(references, hypothesis, n):
    """
    Calculate modified ngram recall.

    :param references: A list of (reference, weights) pairs, where ``reference``
        is a reference translation and ``weights`` maps tokens to their weights.
    :type references: list(tuple(list(str), dict))
    :param hypothesis: A hypothesis translation.
    :type hypothesis: list(str)
    :param n: The ngram order.
    :type n: int
    :return: The numerator and denominator of the modified recall for the
        nth order ngram.
    :rtype: tuple(int, int)
    """
    # Extract all ngrams in the hypothesis.
    # Set an empty Counter if the hypothesis is empty.
    numerator = 0
    denominator = 0
    counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter()
    for reference_and_weights in references:
        reference = reference_and_weights[0]
        weights = reference_and_weights[1]
        reference_counts = (
            Counter(ngrams(reference, n)) if len(reference) >= n else Counter()
        )
        # Clip each reference count by the corresponding hypothesis count.
        clipped_counts = {
            ngram: min(count, counts[ngram])
            for ngram, count in reference_counts.items()
        }
        # Reweight unigram counts when per-token weights are provided.
        if n == 1 and len(weights) == len(reference_counts):

            def weighted_sum(weights, counts):
                sum_counts = 0
                for ngram, count in counts.items():
                    sum_counts += count * weights.get(ngram[0], 1)
                return sum_counts

            numerator += weighted_sum(weights, clipped_counts)
            denominator += max(1, weighted_sum(weights, reference_counts))
        else:
            numerator += sum(clipped_counts.values())
            denominator += max(1, sum(reference_counts.values()))
    return numerator, denominator
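# A minimal usage sketch for `modified_recall` above. It assumes `Counter` is
# collections.Counter and `ngrams` behaves like nltk.util.ngrams; the
# (reference, weights) pairing and the example sentences are illustrative only.
from collections import Counter
from fractions import Fraction
from nltk.util import ngrams

hypothesis = 'the cat sat on the mat'.split()
reference = 'the cat is on the mat'.split()
weights = {}  # empty dict -> falls back to unweighted counting

num, den = modified_recall([(reference, weights)], hypothesis, n=1)
print(Fraction(num, den))  # 5/7: five of the seven reference unigrams are matched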
def test_ngrams_padding_left():
    text = 'hello'
    assert [('h',), ('e',), ('l',), ('l',), ('o',)] == \
        utils.ngrams(text, 1, padding_left=True)
    assert [(utils.START, 'h'), ('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o')] == \
        utils.ngrams(text, 2, padding_left=True)
    assert [(utils.START, utils.START, 'h'), (utils.START, 'h', 'e'),
            ('h', 'e', 'l'), ('e', 'l', 'l'), ('l', 'l', 'o')] == \
        utils.ngrams(text, 3, padding_left=True)
def test_ngrams_padding_right():
    text = 'hello'
    assert [('h',), ('e',), ('l',), ('l',), ('o',)] == \
        utils.ngrams(text, 1, padding_right=True)
    assert [('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o'), ('o', utils.END)] == \
        utils.ngrams(text, 2, padding_right=True)
    assert [('h', 'e', 'l'), ('e', 'l', 'l'), ('l', 'l', 'o'),
            ('l', 'o', utils.END), ('o', utils.END, utils.END)] == \
        utils.ngrams(text, 3, padding_right=True)
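# The padding tests above assume a `utils.ngrams` with START/END sentinels and
# optional left/right padding. A minimal sketch of that behaviour (the sentinel
# values are assumptions; the real `utils` module may use different ones):
START = '<s>'
END = '</s>'

def ngrams(sequence, n, padding_left=False, padding_right=False):
    """Return the list of n-grams (as tuples) over `sequence`."""
    items = list(sequence)
    if padding_left:
        items = [START] * (n - 1) + items
    if padding_right:
        items = items + [END] * (n - 1)
    return [tuple(items[i:i + n]) for i in range(len(items) - n + 1)]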
def generate_batch_text_grams(self):
    start = self.start
    end = self.end
    self.texts_train = np.array([])
    self.labels_train = np.array([])
    data_split = self.ids[start:end]
    for i in range(len(data_split)):
        ids_index = data_split[i][0].split(" ")
        id = int(ids_index[0])
        index = int(ids_index[1])
        labels = self.labels[index][0]
        split_labels = labels.split(" ")
        labels_temp = np.zeros(config.label_size)
        for j in range(1, len(split_labels)):
            try:
                label_index = utils.find_label_index(split_labels[j])
                labels_temp[label_index] = 1.0
            except ValueError:
                print("Missing label:", split_labels[j])
        self.labels_train = np.append(self.labels_train, labels_temp)
        text_name = str(id) + "newsML.xml"
        reuters = et.parse(
            "data/rcv1-2/train-text/" + text_name,
            et.XMLParser(encoding='ISO-8859-1'),
        ).getroot()
        # Concatenate the title and all body paragraphs of the article.
        temp_text = ""
        for text in reuters.findall("title"):
            temp_text = temp_text + text.text
        for text in reuters.findall("text"):
            for p in text.findall("p"):
                temp_text = temp_text + p.text
        temp_text = utils.ngrams(temp_text)
        self.texts_train = np.append(self.texts_train, temp_text)
def compute_counts(self, sentences, targets):
    """
    Estimate emission, transition and tag-unigram counts for a bigram HMM.

    :param sentences: list of token lists
    :param targets: list of tag lists, aligned with ``sentences``
    """
    for i in range(len(sentences)):
        words = sentences[i]
        tags = targets[i]
        # Update emission counts for each (word, tag) pair.
        for j in range(len(tags)):
            self.emission_counts[(words[j], tags[j])] = self.emission_counts.get(
                (words[j], tags[j]), 0.0) + 1
        # Prepend <START> so we also estimate P(t_i | START), usually denoted pi.
        # `ngrams(..., r=2, pad=False)` is expected to yield grams of order 1 and 2.
        pos_bigrams = ngrams(['<START>'] + tags, r=2, pad=False)
        for gram in pos_bigrams.values():
            if len(gram) == 1:
                # Unigram counts.
                self.pos_unigram_counts[gram[0]] = self.pos_unigram_counts.get(
                    gram[0], 0.0) + 1
            else:
                # Bigram transition counts.
                self.transition_counts[gram] = self.transition_counts.get(
                    gram, 0.0) + 1
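# A sketch (an assumption, not part of the snippet) of how the raw counts
# collected above could be turned into a maximum-likelihood HMM parameter:
def transition_prob(self, prev_tag, tag):
    # P(tag | prev_tag) = count(prev_tag, tag) / count(prev_tag)
    bigram = self.transition_counts.get((prev_tag, tag), 0.0)
    unigram = self.pos_unigram_counts.get(prev_tag, 0.0)
    return bigram / unigram if unigram else 0.0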
def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
    """
    Smoothing method 6:
    Interpolates the maximum likelihood estimate of the precision *p_n* with
    a prior estimate *pi0*. The prior is estimated by assuming that the ratio
    between p_n and p_{n-1} will be the same as that between p_{n-1} and
    p_{n-2}; from Gao and He (2013) Training MRF-Based Phrase Translation
    Models using Gradient Ascent. In NAACL.
    """
    hyp_len = hyp_len if hyp_len else len(hypothesis)
    # This smoothing only works when p_1 and p_2 are non-zero.
    # Raise an error with an appropriate message when the input is too short
    # to use this smoothing technique.
    assert p_n[2], "This smoothing method requires non-zero precision for bigrams."
    for i, p_i in enumerate(p_n):
        if i in [0, 1]:  # Skips the first 2 orders of ngrams.
            continue
        else:
            pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2]
            # No. of ngrams in translation that matches the reference.
            m = p_i.numerator
            # No. of ngrams in translation.
            l = sum(1 for _ in ngrams(hypothesis, i + 1))
            # Calculates the interpolated precision.
            p_n[i] = (m + self.alpha * pi0) / (l + self.alpha)
    return p_n
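# A small worked illustration of the interpolation above, with hypothetical
# precisions and a hypothetical smoothing weight `alpha`:
p1, p2 = 0.9, 0.6                     # unigram and bigram precisions
pi0 = p2 ** 2 / p1                    # prior for the trigram precision: 0.4
m, l, alpha = 2, 10, 5                # matched trigrams, total trigrams, weight
p3 = (m + alpha * pi0) / (l + alpha)  # (2 + 2.0) / 15 = 0.2666...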
def test_ngrams_simple():
    text = 'hello'
    assert [('h',), ('e',), ('l',), ('l',), ('o',)] == utils.ngrams(text, 1)
    assert [('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o')] == \
        utils.ngrams(text, 2)
    assert [('h', 'e', 'l'), ('e', 'l', 'l'), ('l', 'l', 'o')] == \
        utils.ngrams(text, 3)
    text2 = 'hello world'
    assert [('h',), ('e',), ('l',), ('l',), ('o',), (' ',),
            ('w',), ('o',), ('r',), ('l',), ('d',)] == \
        utils.ngrams(text2, 1)
    assert [('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o'), ('o', ' '),
            (' ', 'w'), ('w', 'o'), ('o', 'r'), ('r', 'l'), ('l', 'd')] == \
        utils.ngrams(text2, 2)
def greedy_search(abs_list, doc_list, budget=3):
    def _rouge_clean(s):
        return re.sub(r'[^a-zA-Z0-9 ]', '', s)

    max_rouge = 0.0
    abs_tokens = _rouge_clean(' '.join(flatten(abs_list))).split()
    sents = [_rouge_clean(' '.join(sent)).split() for sent in doc_list]
    hyp_unigrams = [ngrams(sent, 1) for sent in sents]
    hyp_bigrams = [ngrams(sent, 2) for sent in sents]
    ref_unigrams = ngrams(abs_tokens, 1)
    ref_bigrams = ngrams(abs_tokens, 2)

    selected_idxs = []
    for _ in range(budget):
        curr_max_rouge = max_rouge
        curr_id = -1
        for (i, sent) in enumerate(sents):
            if i in selected_idxs:
                continue
            candidate_idxs = selected_idxs + [i]
            candidate_unigrams = set.union(
                *[set(hyp_unigrams[idx]) for idx in candidate_idxs])
            candidate_bigrams = set.union(
                *[set(hyp_bigrams[idx]) for idx in candidate_idxs])
            rouge1 = approx_rouge(candidate_unigrams, ref_unigrams)
            rouge2 = approx_rouge(candidate_bigrams, ref_bigrams)
            rouge_score = rouge1 + rouge2
            if rouge_score > curr_max_rouge:
                curr_max_rouge = rouge_score
                curr_id = i
        if curr_id == -1:
            return (list(sorted(selected_idxs)), max_rouge)
        selected_idxs.append(curr_id)
        max_rouge = curr_max_rouge
    return (list(sorted(selected_idxs)), max_rouge)
def preprocess_data(dataset, embeddings_path, topn, min_cos_sim):
    path = './' + dataset.dataset_name + '/examples/'
    embeddings = load_embeddings(embeddings_path)

    # Training part
    examples = set((ngram, ngram[1]) for sentence in dataset.get_train_sentences
                   for ngram in ngrams(sentence) if ngram[1] in embeddings)
    save_examples(examples, path, 'examples')
    # examples = load_examples(path+'examples.pkl')
    na_dataset = PerClassDataset(examples)

    # Validation part
    valid_examples = set((ngram, ngram[1]) for sentence in dataset.get_valid_sentences
                         for ngram in ngrams(sentence)
                         if ngram[1] not in na_dataset and ngram[1] in embeddings)
    save_examples(valid_examples, path, 'valid_examples')
    valid_dataset = PerClassDataset(valid_examples)
    tr_val_dataset = na_dataset | valid_dataset

    # Test part
    test_examples = set((ngram, ngram[1]) for sentence in dataset.get_test_sentences
                        for ngram in ngrams(sentence)
                        if ngram[1] not in tr_val_dataset and ngram[1] in embeddings)
    save_examples(test_examples, path, 'test_examples')
    test_dataset = PerClassDataset(test_examples)
    # valid_examples = load_examples(path+'valid_examples.pkl')
    # test_examples = load_examples(path+'test_examples.pkl')
    save_examples(test_examples | valid_examples, path, 'valid_test_examples')

    # OOV part
    all_sentences = (dataset.get_train_sentences + dataset.get_valid_sentences
                     + dataset.get_test_sentences)
    oov_examples = set((ngram, ngram[1]) for sentence in all_sentences
                       for ngram in ngrams(sentence) if ngram[1] not in embeddings)
    save_examples(oov_examples, path, 'oov_examples')

    # Augmented part
    all_dataset = tr_val_dataset | test_dataset
    filter_cond = lambda label: label not in all_dataset
    augmented_examples = augment_data(examples, embeddings_path,
                                      filter_cond=filter_cond, topn=topn,
                                      min_cos_sim=min_cos_sim)
    augmented_examples |= examples  # Union
    save_examples(augmented_examples, path,
                  'augmented_examples_topn{topn}_cos_sim{cs}'.format(
                      topn=topn, cs=min_cos_sim))
def get_mentions(dbr, question):
    tris = str_to_trigrams_dict(dbr.lower())
    sets = []
    best_guess = MinStore()
    current_mention = MentionSet()
    for index, word in enumerate(question.split(' ')):
        word_trigrams = ngrams(word.lower())
        prob = overlap_trigrams_score(tris, word_trigrams)
        if not current_mention.append(word, index, prob):
            mention = MentionSet(mention=[word], indexes=[index], prob=prob)
            best_guess.store(mention, prob)
            if not current_mention.is_empty():
                sets.append(current_mention)
            current_mention = MentionSet()
    if not current_mention.is_empty():
        sets.append(current_mention)
    if len(sets) == 0:
        # No match found, fall back to the best guess.
        best_guess = best_guess.get_item()
        sets = [best_guess]
    return sets
def character_based_4gram(text):
    return utils.ngrams(text, 4, padding_left=True, padding_right=True)
def character_based_4gram(text, n=4, padding=True):
    # `n` and `padding` were undefined free variables in the original; expose
    # them as keyword arguments so the helper is self-contained.
    return utils.ngrams(text, n, padding_left=padding, padding_right=padding)
def trigram_overlap(x, y):
    x_tri = ngrams(x, 3)
    y_tri = ngrams(y, 3)
    return any(_y_tri in x_tri for _y_tri in y_tri)
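# Example usage of `trigram_overlap`, assuming `ngrams` returns a list (or set)
# of n-gram tuples rather than a one-shot generator:
x = 'the quick brown fox jumps'.split()
y = 'a quick brown fox sleeps'.split()
print(trigram_overlap(x, y))  # True: both share ('quick', 'brown', 'fox')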
def test(test_dataset):
    model.load_state_dict(torch.load(args.ckpt_path))
    model.eval()
    test_loader = tqdm(
        load(test_dataset, args.batch_size, shuffle=False),
        ncols=100,
    )
    true_list = []
    pred_list = []
    avgs = []
    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            batch_probs = F.softmax(model(batch), 1)
            batch_labels = batch['label_ids']
            for j in range(batch_probs.size(0)):
                probs = unpack(batch_probs[j, 1])
                labels = unpack(batch_labels[j])
                _probs = []
                for (prob, label) in zip(probs, labels):
                    if label != -1:
                        true_list.append(label)
                        pred_list.append(prob)
                        _probs.append(prob)
                abs_list = test_dataset.abs_lists[i][0]
                doc_list = test_dataset.doc_lists[i][0]
                node_list = test_dataset.node_lists[i]
                pred_label_list = [0] * len(doc_list)
                opt_nodes = []
                for (node, prob) in zip(node_list, _probs):
                    if prob > 0.7:
                        for opt_node in node_list:
                            if opt_node['group'] == node['group']:
                                opt_nodes.append(opt_node)
                        for kk in range(node['start_index'], node['end_index']):
                            pred_label_list[kk] = 1
                for node in opt_nodes:
                    for kk in range(node['start_index'], node['end_index']):
                        pred_label_list[kk] = 1
                words = [
                    word for (word, lab) in zip(doc_list, pred_label_list)
                    if lab == 0
                ]
                abs_ngrams = set(ngrams(abs_list, 1))
                doc_ngrams = set(ngrams(words, 1))
                rouge = compute_rouge(abs_ngrams, doc_ngrams)
                avgs.append(rouge)
    results_dict = {'F1': np.mean(avgs)}
    return results_dict
def modified_precision(references, hypothesis, n):
    """
    Calculate modified ngram precision.

    The normal precision method may lead to some wrong translations with
    high-precision, e.g., the translation, in which a word of reference
    repeats several times, has very high precision.

    This function only returns the Fraction object that contains the numerator
    and denominator necessary to calculate the corpus-level precision.
    To calculate the modified precision for a single pair of hypothesis and
    references, cast the Fraction object into a float.

    The famous "the the the ... " example shows that you can get BLEU precision
    by duplicating high frequency words.

    >>> reference1 = 'the cat is on the mat'.split()
    >>> reference2 = 'there is a cat on the mat'.split()
    >>> hypothesis1 = 'the the the the the the the'.split()
    >>> references = [reference1, reference2]
    >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
    0.2857...

    In the modified n-gram precision, a reference word will be considered
    exhausted after a matching hypothesis word is identified, e.g.

    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...               'ensures', 'that', 'the', 'military', 'will',
    ...               'forever', 'heed', 'Party', 'commands']
    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...               'guarantees', 'the', 'military', 'forces', 'always',
    ...               'being', 'under', 'the', 'command', 'of', 'the',
    ...               'Party']
    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
    ...               'of', 'the', 'party']
    >>> hypothesis = 'of the'.split()
    >>> references = [reference1, reference2, reference3]
    >>> float(modified_precision(references, hypothesis, n=1))
    1.0
    >>> float(modified_precision(references, hypothesis, n=2))
    1.0

    An example of a normal machine translation hypothesis:

    >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...                'ensures', 'that', 'the', 'military', 'always',
    ...                'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
    ...                'forever', 'hearing', 'the', 'activity', 'guidebook',
    ...                'that', 'party', 'direct']
    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...               'ensures', 'that', 'the', 'military', 'will',
    ...               'forever', 'heed', 'Party', 'commands']
    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...               'guarantees', 'the', 'military', 'forces', 'always',
    ...               'being', 'under', 'the', 'command', 'of', 'the',
    ...               'Party']
    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
    ...               'of', 'the', 'party']
    >>> references = [reference1, reference2, reference3]
    >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
    0.9444...
    >>> float(modified_precision(references, hypothesis2, n=1)) # doctest: +ELLIPSIS
    0.5714...
    >>> float(modified_precision(references, hypothesis1, n=2)) # doctest: +ELLIPSIS
    0.5882352941176471
    >>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS
    0.07692...

    :param references: A list of reference translations.
    :type references: list(list(str))
    :param hypothesis: A hypothesis translation.
    :type hypothesis: list(str)
    :param n: The ngram order.
    :type n: int
    :return: BLEU's modified precision for the nth order ngram.
    :rtype: Fraction
    """
    # Extracts all ngrams in hypothesis
    # Set an empty Counter if hypothesis is empty.
    counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter()
    # Extract a union of references' counts.
    # max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references])
    max_counts = {}
    for reference in references:
        reference_counts = (
            Counter(ngrams(reference, n)) if len(reference) >= n else Counter()
        )
        for ngram in counts:
            max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])

    # Assigns the intersection between hypothesis and references' counts.
    clipped_counts = {
        ngram: min(count, max_counts[ngram]) for ngram, count in counts.items()
    }

    numerator = sum(clipped_counts.values())
    # Ensures that denominator is minimum 1 to avoid ZeroDivisionError.
    # Usually this happens when the ngram order is > len(reference).
    denominator = max(1, sum(counts.values()))

    return Fraction(numerator, denominator, _normalize=False)
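# A sketch of how the unnormalised Fractions returned above can be pooled into
# a corpus-level precision. `corpus_precision` is a hypothetical helper; it
# relies on `_normalize=False` preserving the raw numerator/denominator counts.
from fractions import Fraction

def corpus_precision(list_of_references, hypotheses, n):
    numerator, denominator = 0, 0
    for references, hypothesis in zip(list_of_references, hypotheses):
        p_i = modified_precision(references, hypothesis, n)
        numerator += p_i.numerator
        denominator += p_i.denominator
    return Fraction(numerator, max(1, denominator))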
def eval_compressions(abs_list, doc_list, doc_compressions):
    def _apply_compression(node, labels):
        _labels = copy.deepcopy(labels)
        for _i in range(node.start_index, node.end_index):
            assert 0 <= node.sent_index <= len(_labels)
            assert 0 <= _i <= len(_labels[node.sent_index])
            _labels[node.sent_index][_i] = 1  # Delete
        return _labels

    def _preprocess(tokens):
        # `_rouge_clean` is assumed to be a module-level helper that strips
        # non-alphanumeric characters (see `greedy_search` above).
        return [_rouge_clean(token.lower()) for token in tokens]

    def _score_compression(abs_list, doc_list, hyp_labels):
        abs_tokens = flatten(abs_list)
        doc_tokens = []
        assert len(doc_list) == len(hyp_labels)
        for (sent, mask) in zip(doc_list, hyp_labels):
            assert len(sent) == len(mask)
            for (token, label) in zip(sent, mask):
                if label == 0:
                    doc_tokens.append(token)
        doc_tokens = _preprocess(doc_tokens)
        doc_uni = set(ngrams(doc_tokens, 1))
        doc_bi = set(ngrams(doc_tokens, 2))
        rouge1 = compute_rouge(doc_uni, _abs_uni)
        rouge2 = compute_rouge(doc_bi, _abs_bi)
        return (rouge1, rouge2)

    _abs_tokens = _preprocess(flatten(abs_list))
    _abs_uni = set(ngrams(_abs_tokens, 1))
    _abs_bi = set(ngrams(_abs_tokens, 2))
    compressions_list = []
    doc_labels = [[0 for _ in range(len(sent))] for sent in doc_list]
    base_rouge1, base_rouge2 = _score_compression(abs_list, doc_list, doc_labels)
    base_rouge = base_rouge1 + base_rouge2
    for sent_compressions in doc_compressions:
        for i, node in enumerate(sent_compressions):
            if [node.sent_index, node.node_index] in compressions_list:
                continue
            hyp_labels = _apply_compression(node, doc_labels)
            mod_rouge1, mod_rouge2 = _score_compression(
                abs_list, doc_list, hyp_labels)
            mod_rouge = mod_rouge1 + mod_rouge2
            if mod_rouge > base_rouge:
                compressions_list.append([node.sent_index, node.node_index])
                # If a parent constituent gets deleted, then by definition, all
                # child constituents must also be deleted.
                for child_node in sent_compressions[i + 1:]:
                    if (node.start_index <= child_node.start_index
                            and child_node.end_index <= node.end_index):
                        compressions_list.append(
                            [child_node.sent_index, child_node.node_index])
    return compressions_list
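# `compute_rouge` (used in the snippets above) is not shown. A plausible,
# order-symmetric sketch (F1 over two n-gram sets), offered only as an
# assumption about its behaviour:
def compute_rouge(ngrams_a, ngrams_b):
    set_a, set_b = set(ngrams_a), set(ngrams_b)
    if not set_a or not set_b:
        return 0.0
    overlap = len(set_a & set_b)
    if overlap == 0:
        return 0.0
    precision = overlap / len(set_a)
    recall = overlap / len(set_b)
    return 2 * precision * recall / (precision + recall)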