def str_common_grams(str1, str2, min_len=3, max_len=4):
    '''Return how many times the ngrams (of length min_len to max_len)
    of str1 appear in str2.'''
    grams1 = list(everygrams(str1, min_len, max_len))
    grams2 = list(everygrams(str2, min_len, max_len))
    return sum(grams2.count(gram) for gram in grams1)
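# Usage sketch (illustrative, not from the original source): requires
# `from nltk.util import everygrams`. Because a plain string is iterated
# character by character, the shared n-grams here are character n-grams.
from nltk.util import everygrams

print(str_common_grams("banana", "bandana"))  # number of shared 3-/4-grams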
def setup_class(self):
    text = [list("abcd"), list("egdbe")]
    self.trigram_counter = NgramCounter(
        everygrams(sent, max_len=3) for sent in text)
    self.bigram_counter = NgramCounter(
        everygrams(sent, max_len=2) for sent in text)
    self.case = unittest.TestCase()
def setUpClass(cls):
    text = [list("abcd"), list("egdbe")]
    cls.trigram_counter = NgramCounter(
        (everygrams(sent, max_len=3) for sent in text))
    cls.bigram_counter = NgramCounter(
        (everygrams(sent, max_len=2) for sent in text))
def custom_sentence_gleu(references, hypothesis, min_len=1, max_len=4):
    from collections import Counter
    from nltk.util import everygrams

    assert len(references) == 1
    hyp_ngrams = Counter(everygrams(hypothesis, min_len, max_len))
    tpfp = sum(hyp_ngrams.values())  # True positives + False positives.

    reference = references[0]
    ref_ngrams = Counter(everygrams(reference, min_len, max_len))
    tpfn = sum(ref_ngrams.values())  # True positives + False negatives.

    overlap_ngrams = ref_ngrams & hyp_ngrams
    tp = sum(overlap_ngrams.values())  # True positives.

    n_all = max(tpfp, tpfn)
    n_match = tp if n_all > 0 else 0

    # corner case: empty corpus or empty references---don't divide by zero!
    if n_all == 0:
        gleu_score = 0.0
    else:
        gleu_score = n_match / n_all

    return gleu_score, n_match, tpfp, tpfn
def abstraction(chaine, ancestor):
    if all(isinstance(x, Variable) for x in chaine):
        # print(Type(chaine, ancestor))
        # print()
        # print()
        return
    else:
        if isinstance(chaine, ChaineConcrete):
            # print(Type(chaine, chaine))
            # print()
            spans = everygrams([x for x, y in enumerate(chaine)])
            lignee = [variabilise(chaine, span) for span in list(spans)]
        else:
            # print(Type(chaine, ancestor))
            # print()
            lignee = []
            for i, element in enumerate(chaine):
                debut = [chaine[:i]]
                fin = [chaine[i + 1:]]
                if not isinstance(element, Variable):
                    spans = everygrams([x for x, y in enumerate(element)])
                    for span in spans:
                        chainex = variabilise(element, span)
                        print(chainex, fin)
                        if i == 0:
                            lignee.append(chainex + fin)
                        elif i == len(chaine) - 1:
                            lignee.append(debut + chainex)
                        else:
                            lignee.append(debut + chainex + fin)
        for descendant in lignee:
            abstraction(descendant, chaine)
def corpus_chrf(list_of_references, hypotheses, min_len=1, max_len=6, beta=3.0):
    """
    Calculates the corpus level CHRF (Character n-gram F-score), it is the
    micro-averaged value of the sentence/segment level CHRF score.
    CHRF only supports a single reference.

    >>> ref1 = str('It is a guide to action that ensures that the military '
    ...            'will forever heed Party commands').split()
    >>> ref2 = str('It is the guiding principle which guarantees the military '
    ...            'forces always being under the command of the Party').split()
    >>>
    >>> hyp1 = str('It is a guide to action which ensures that the military '
    ...            'always obeys the commands of the party').split()
    >>> hyp2 = str('It is to insure the troops forever hearing the activity '
    ...            'guidebook that party direct')
    >>> corpus_chrf([ref1, ref2, ref1, ref2], [hyp1, hyp2, hyp2, hyp1]) # doctest: +ELLIPSIS
    0.4915...

    :param references: a corpus of list of reference sentences, w.r.t. hypotheses
    :type references: list(list(str)) / list(str)
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str)) / list(str)
    :param min_len: The minimum order of n-gram this function should extract.
    :type min_len: int
    :param max_len: The maximum order of n-gram this function should extract.
    :type max_len: int
    :param beta: the parameter to assign more importance to recall over precision
    :type beta: float
    :return: the sentence level CHRF score.
    :rtype: float
    """
    assert len(list_of_references) == len(
        hypotheses
    ), "The number of hypotheses and their references should be the same"

    # Iterate through each hypothesis and their corresponding references.
    for reference, hypothesis in zip(list_of_references, hypotheses):
        # Cheating condition to allow users to input strings instead of tokens.
        if type(reference) and type(hypothesis) != str:
            reference, hypothesis = ' '.join(reference), ' '.join(hypothesis)
        # For each order of ngram, calculate the no. of ngram matches and
        # keep track of no. of ngram in references.
        ref_ngrams = Counter(everygrams(reference, min_len, max_len))
        hyp_ngrams = Counter(everygrams(hypothesis, min_len, max_len))
        overlap_ngrams = ref_ngrams & hyp_ngrams
        tp = sum(overlap_ngrams.values())  # True positives.
        tpfp = sum(hyp_ngrams.values())  # True positives + False positives.
        tffn = sum(ref_ngrams.values())  # True positives + False negatives.
        precision = tp / tpfp
        recall = tp / tffn
        factor = beta ** 2
        score = (1 + factor) * (precision * recall) / (factor * precision + recall)
    return score
def setUpClass(cls):
    text = [list("abcd"), list("egdbe")]
    cls.trigram_counter = NgramCounter(
        (everygrams(sent, max_len=3) for sent in text)
    )
    cls.bigram_counter = NgramCounter(
        (everygrams(sent, max_len=2) for sent in text)
    )
def grams(text):
    # Character grams
    for i in list(
            everygrams(''.join([c for c in text if c != ' ']),
                       min_len=1, max_len=4)):
        yield i
    # Word grams
    for i in list(everygrams(text.split(' '), min_len=1, max_len=3)):
        yield i
def kmers(self, content):
    try:
        # A single size such as "3": fixed-length k-mers via ngrams.
        size = int(self.size)
        ngramG = ngrams(content, size)
        return [''.join(i) for i in list(ngramG)]
    except ValueError:
        # A range such as "3to5": variable-length k-mers via everygrams.
        size = self.size.replace('to', ' ').split(' ')
        minsize = int(size[0])
        maxsize = int(size[1])
        return [''.join(i) for i in everygrams(content, minsize, maxsize)]
def custom_corpus_gleu(list_of_references, hypotheses, min_len=1, max_len=4):
    """
    Copy of the GLEU implementation in NLTK that also returns n_match and n_all

    :param list_of_references:
    :param hypotheses:
    :param min_len:
    :param max_len:
    :return:
    """
    # sanity check
    assert len(list_of_references) == len(
        hypotheses
    ), "The number of hypotheses and their reference(s) should be the same"

    # sum matches and max-token-lengths over all sentences
    corpus_n_match = 0
    corpus_n_all = 0

    for references, hypothesis in zip(list_of_references, hypotheses):
        hyp_ngrams = Counter(everygrams(hypothesis, min_len, max_len))
        tpfp = sum(hyp_ngrams.values())  # True positives + False positives.

        hyp_counts = []
        for reference in references:
            ref_ngrams = Counter(everygrams(reference, min_len, max_len))
            tpfn = sum(ref_ngrams.values())  # True positives + False negatives.

            overlap_ngrams = ref_ngrams & hyp_ngrams
            tp = sum(overlap_ngrams.values())  # True positives.

            # While GLEU is defined as the minimum of precision and
            # recall, we can reduce the number of division operations by one by
            # instead finding the maximum of the denominators for the precision
            # and recall formulae, since the numerators are the same:
            #     precision = tp / tpfp
            #     recall = tp / tpfn
            #     gleu_score = min(precision, recall) == tp / max(tpfp, tpfn)
            n_all = max(tpfp, tpfn)

            if n_all > 0:
                hyp_counts.append((tp, n_all))

        # use the reference yielding the highest score
        if hyp_counts:
            n_match, n_all = max(hyp_counts, key=lambda hc: hc[0] / hc[1])
            corpus_n_match += n_match
            corpus_n_all += n_all

    # corner case: empty corpus or empty references---don't divide by zero!
    if corpus_n_all == 0:
        gleu_score = 0.0
    else:
        gleu_score = corpus_n_match / corpus_n_all

    return gleu_score, corpus_n_match, corpus_n_all
def corpus_chrf(list_of_references, hypotheses, min_len=1, max_len=6, beta=3.0):
    """
    Calculates the corpus level CHRF (Character n-gram F-score), it is the
    micro-averaged value of the sentence/segment level CHRF score.
    CHRF only supports a single reference.

    >>> ref1 = str('It is a guide to action that ensures that the military '
    ...            'will forever heed Party commands').split()
    >>> ref2 = str('It is the guiding principle which guarantees the military '
    ...            'forces always being under the command of the Party').split()
    >>>
    >>> hyp1 = str('It is a guide to action which ensures that the military '
    ...            'always obeys the commands of the party').split()
    >>> hyp2 = str('It is to insure the troops forever hearing the activity '
    ...            'guidebook that party direct')
    >>> corpus_chrf([ref1, ref2, ref1, ref2], [hyp1, hyp2, hyp2, hyp1]) # doctest: +ELLIPSIS
    0.4915...

    :param references: a corpus of list of reference sentences, w.r.t. hypotheses
    :type references: list(list(str)) / list(str)
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str)) / list(str)
    :param min_len: The minimum order of n-gram this function should extract.
    :type min_len: int
    :param max_len: The maximum order of n-gram this function should extract.
    :type max_len: int
    :param beta: the parameter to assign more importance to recall over precision
    :type beta: float
    :return: the sentence level CHRF score.
    :rtype: float
    """
    assert len(list_of_references) == len(hypotheses), \
        "The number of hypotheses and their references should be the same"

    # Iterate through each hypothesis and their corresponding references.
    for reference, hypothesis in zip(list_of_references, hypotheses):
        # Cheating condition to allow users to input strings instead of tokens.
        if type(reference) and type(hypothesis) != str:
            reference, hypothesis = ' '.join(reference), ' '.join(hypothesis)
        # For each order of ngram, calculate the no. of ngram matches and
        # keep track of no. of ngram in references.
        ref_ngrams = Counter(everygrams(reference, min_len, max_len))
        hyp_ngrams = Counter(everygrams(hypothesis, min_len, max_len))
        overlap_ngrams = ref_ngrams & hyp_ngrams
        tp = sum(overlap_ngrams.values())  # True positives.
        tpfp = sum(hyp_ngrams.values())  # True positives + False positives.
        tffn = sum(ref_ngrams.values())  # True positives + False negatives.
        precision = tp / tpfp
        recall = tp / tffn
        factor = beta ** 2
        score = (1 + factor) * (precision * recall) / (factor * precision + recall)
    return score
def _compute_rouge(self, words_t, words_c):
    ngrams_t = set(everygrams(words_t, max_len=self.max_gram))
    ngrams_c = everygrams(words_c, max_len=self.max_gram)
    match_count = 0
    total_count = 0
    for ngram_c in ngrams_c:
        total_count += 1
        if ngram_c in ngrams_t:
            match_count += 1
    if total_count == 0:
        warnings.warn('empty template for title:{}'.format(' '.join(words_t)))
        return 0
    score = match_count / total_count
    return score
def ngram_counts(tokens, min_len=1, max_len=None, transform=" ".join,
                 in_vocabulary=lambda _: True):
    """
    Compute n-gram counts using toolz and Counter

    :param tokens: Iterable[str]
    :param min_len: int Minimum N-Gram size
    :param max_len: int Maximum N-Gram size
    :param transform: Callable[[Tuple[str, ...]], str] Function transforming ngram tuple into key
    :param in_vocabulary: Callable[[str], bool] Should token be preserved
    :return: Dict[str, int]
    """
    tokens = list(tokens)
    wc = len(tokens)
    max_len = (max_len if max_len else wc) + 1
    return (
        wc,
        pipe(
            everygrams(tokens, min_len=min_len, max_len=max_len),
            map(transform),
            filter(in_vocabulary),
            frequencies,
        ),
    )
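# Usage sketch (illustrative, not part of the original source): ngram_counts
# expects the curried map/filter plus pipe and frequencies from toolz, and
# everygrams from nltk, to be available at module level.
from toolz import frequencies
from toolz.curried import pipe, map, filter
from nltk.util import everygrams

wc, counts = ngram_counts("the cat sat on the mat".split(), max_len=2)
print(wc)             # number of tokens
print(counts["the"])  # unigram count; joined bigram keys look like "the cat"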
def evaluate_context(self, sentence, word):
    sentTokens = sentence.split()
    if word in sentTokens:
        index = sentTokens.index(word)
    else:
        return False

    if index == 0:
        left = index
        right = min(len(sentTokens), index + self.left + self.right)
    elif index == len(sentTokens) - 1:
        right = index
        left = max(0, index - self.right - self.left)
    else:
        left = max(0, index - self.left)
        right = min(len(sentTokens) - 1, index + self.right)

    scores = []
    tokens = sentTokens[left:right + 1]
    nGrams = list(everygrams(tokens))
    possibleNgrams = [n for n in nGrams if word in n]
    wordScores = []
    for n in possibleNgrams:
        # print("N gram is ", n)
        bosFlag = (sentTokens.index(n[0]) == 0 and left == 0)
        eosFlag = (sentTokens.index(n[len(n) - 1]) == len(sentTokens) - 1
                   and right == len(sentTokens) - 1)
        # print(" Eos flag ", eosFlag, " BOS flag", bosFlag)
        wordScores.append(
            self.langModel.score(" ".join(n), eos=eosFlag, bos=bosFlag))
    # print("Candidate : ", candidate)
    # print(candidateScores)
    scores.append(np.average(wordScores))
    return np.average(wordScores)
def get_ngrams(max_len, tokens, delimiter=" "):
    """Get ngrams (sequences of consecutive tokens) from tokens.

    ngrams are sequences of consecutive tokens. Return an iterator of all
    ngrams of length at most max_len represented as the concatenation of the
    constituent tokens, delimited by delimiter.

    Args:
        max_len (int): Max length of ngram to consider.
        tokens (iterable of str): Token iterator.
        delimiter (str, optional): Separator to use between tokens in an ngram.

    Returns:
        iterable of str: String representations of each ngram.

    Examples:
        To use `get_ngrams` directly on an iterable of tokens:

        >>> list(get_ngrams(2, ["a", "b", "c"]))
        ['a', 'b', 'c', 'a b', 'b c']

        To use `get_ngrams` on a stream of token iterables:

        >>> tokens_gen = iter([["a", "b", "c"],
        ...                    ["d", "e", "f"]])
        >>> from functools import partial
        >>> ngrams_gen = map(partial(get_ngrams, 2), tokens_gen)
        >>> from twitter_analysis_tools.utils import listify_nested_iterables
        >>> listify_nested_iterables(ngrams_gen)
        [['a', 'b', 'c', 'a b', 'b c'], ['d', 'e', 'f', 'd e', 'e f']]
    """
    # Gotcha: everygrams doesn't work with iterables. Only lists.
    ngrams = everygrams(list(tokens), max_len=max_len)
    return map(delimiter.join, ngrams)
def test_everygrams_min_len(everygram_input):
    expected_output = [
        ("a", "b"),
        ("a", "b", "c"),
        ("b", "c"),
    ]
    output = list(everygrams(everygram_input, min_len=2))
    assert output == expected_output
def test_everygrams_min_len(self):
    expected_output = [
        ('a', 'b'),
        ('b', 'c'),
        ('a', 'b', 'c'),
    ]
    output = everygrams(self.test_data, min_len=2)
    self.assertCountEqual(output, expected_output)
def extract_features(document):
    words = word_tokenize(document)
    lemmas = [str(lemmatizer.lemmatize(w)) for w in words
              if w not in stopwords_eng and w not in punctuation]
    document = " ".join(lemmas)
    document = document.lower()
    document = re.sub(r'[^a-zA-Z0-9\s]', ' ', document)
    words = [w for w in document.split(" ")
             if w != "" and w not in stopwords_eng and w not in punctuation]
    return [str('_'.join(ngram)) for ngram in list(everygrams(words, max_len=3))]
def findScoreForAlignmentUsingnGrams(srcSent, tgtList, transDict):
    """Find alignment score for a source sentence with a list of target sentences."""
    wordsInSourceSent = word_tokenize(srcSent.lower())
    wordsInSrc = len(wordsInSourceSent)
    sourceNgrams = list(everygrams(wordsInSourceSent, max_len=2))
    sourceNgrams = create_string_ngrams(sourceNgrams)
    srcDict = Counter(sourceNgrams)
    scores = list()
    tgtDicts, allTgtWords = list(), list()
    for tgt in tgtList:
        wordsInTgt = word_tokenize(tgt.lower())
        allTgtWords.append(wordsInTgt)
        tgtNgrams = list(everygrams(wordsInTgt, max_len=2))
        tgtNgrams = create_string_ngrams(tgtNgrams)
        tgtDicts.append(Counter(tgtNgrams))
    print(len(transDict))
    for index, tgtDict in enumerate(tgtDicts):
        count = 0
        matchedNgrams = list()
        for src in srcDict:
            if re.search(r'\d+(\.\d+)?', src):
                if src in tgtDict:
                    matchedNgrams.append((src, src))
                    count += 1
            elif src in transDict:
                foundTgt = transDict[src]
                for ngrm in foundTgt:
                    if ngrm in tgtDict and tgtDict[ngrm] == srcDict[src]:
                        matchedNgrams.append((src, ngrm))
                        count += tgtDict[ngrm]
                        break
                    elif ngrm in tgtDict and tgtDict[ngrm] != srcDict[src]:
                        break
        wordsInTgt = len(allTgtWords[index])
        if abs(wordsInSrc - wordsInTgt) in range(5):
            lengthValue = 0.2
        else:
            lengthValue = 1 / abs(wordsInSrc - wordsInTgt)
        print(count)
        print(matchedNgrams)
        if count == 0.:
            scores.append((1e-5, lengthValue))
        else:
            scores.append((count / len(sourceNgrams), lengthValue))
    print(scores)
    return np.array(scores)
def get_ngram_word_dict(emotion_line):
    words = nltk.word_tokenize(emotion_line)
    word_ngram = everygrams(words, min_len=1, max_len=3)
    word_feats = {}
    for w in word_ngram:
        if w not in word_feats:
            word_feats[w] = "feature_word"
    return word_feats
def test_everygrams_without_padding(everygram_input):
    expected_output = [
        ("a",),
        ("a", "b"),
        ("a", "b", "c"),
        ("b",),
        ("b", "c"),
        ("c",),
    ]
    output = list(everygrams(everygram_input))
    assert output == expected_output
def test_everygrams_without_padding(self):
    expected_output = [
        ('a',),
        ('a', 'b'),
        ('a', 'b', 'c'),
        ('b',),
        ('b', 'c'),
        ('c',),
    ]
    output = everygrams(self.test_data)
    self.assertCountEqual(output, expected_output)
def encode_sentences(txt):
    feature_set = np.zeros((len(txt), len(word_set) + 1), dtype=int)
    tnum = 0
    for t in txt:
        s_words = t[1:] + list(set(list(everygrams(t[1:], min_len=2, max_len=2))))
        for w in s_words:
            idx = word_idx[w]
            feature_set[tnum][idx] = 1
        feature_set[tnum][-1] = t[0]
        tnum += 1
    return feature_set
def initialiseCorrespondances(self):
    for couple in self.corpus:
        (langue1, langue2) = couple
        for correspond1 in self.correspondancesLangue1:
            if correspond1 in [" ".join(x) for x in everygrams(langue1.split(' '))]:
                if couple not in self.correspondancesLangue1[correspond1]:
                    self.correspondancesLangue1[correspond1].append(couple)
        for correspond2 in self.correspondancesLangue2:
            if correspond2 in langue2:
                if couple not in self.correspondancesLangue2[correspond2]:
                    self.correspondancesLangue2[correspond2].append(couple)
def get_score(self, sentTokens, word, candidate):
    # print(word, " ", candidate)
    if len(word.split()) > 1:
        sentence = " ".join(sentTokens)
        sentence = re.sub(r'[^a-zA-Z0-9\s]', ' ', sentence)
        sentence = sentence.replace(word, "#")
        sentTokens = sentence.split()
        if "#" not in sentTokens:
            print("ANOTHER PROBLEM WITH THE", self.left, "MODEL")
            print(sentence)
            print(word, " ", candidate)
            index = 0
        else:
            index = sentTokens.index("#")
            sentTokens[index] = word

    if word in sentTokens:
        index = sentTokens.index(word)
    else:
        print("BIG PROBLEM WITH THE", self.left, "MODEL")
        print(word, "###", candidate, sentTokens)
        index = 0

    # Get window words
    if index == 0:
        left = index
        right = min(len(sentTokens), index + self.left + self.right)
    elif index == len(sentTokens) - 1:
        right = index
        left = max(0, index - self.right - self.left)
    else:
        left = max(0, index - self.left)
        right = min(len(sentTokens) - 1, index + self.right)

    scores = []
    tokens = sentTokens[left:right + 1]
    # print(tokens)
    sentTokens[index] = candidate  # Put candidate in sentence to test its score
    tokens = sentTokens[left:right + 1]
    nGrams = list(everygrams(tokens))
    possibleNgrams = [n for n in nGrams if candidate in n]
    candidateScores = []
    for n in possibleNgrams:
        # print("N gram is ", n)
        bosFlag = (sentTokens.index(n[0]) == 0 and left == 0)
        eosFlag = (sentTokens.index(n[len(n) - 1]) == len(sentTokens) - 1
                   and right == len(sentTokens) - 1)
        # print(" Eos flag ", eosFlag, " BOS flag", bosFlag)
        candidateScores.append(
            self.langModel.score(" ".join(n), eos=eosFlag, bos=bosFlag))
    # print("Candidate : ", candidate)
    # print(candidateScores)
    scores.append(np.average(candidateScores))
    return np.average(candidateScores)
def read_trigrams():
    training = []
    vocab = []
    text = []
    with open('trigrams.pkl', 'rb') as f:
        data = pickle.load(f)
        for d in data:
            trigrams = list(everygrams(d, max_len=3))
            training.append(trigrams)
            for word in d:
                vocab.append(word)
            text.append(d[0])
    return training, set(vocab), text
def convert_sentence_to_ngrams(inp_sentence: str, n_param=3,
                               add_unknown=False) -> List[str]:
    '''
    Convert the input sentence to n-grams of order n_param (trigrams by default)
    using a tokenizer.
    '''
    tokenized_input = wordpunct_tokenize(inp_sentence)
    # Optionally prepend an unknown token to the tokenized sentence.
    if add_unknown:
        tokenized_input = ['<UNK>'] + tokenized_input
    return everygrams(tokenized_input, min_len=n_param, max_len=n_param)
def EstimateNgrams(self, training_set):
    for sent in training_set:
        sent = list(sent)
        for ngram in everygrams(sent, max_len=self.max_n):
            n = len(ngram)
            self.model_map[n][ngram] += 1
    for n in self.model_map:
        for ngram in self.model_map[n]:
            if n == 1:
                pass
            else:
                self.log_probs[ngram] = log(
                    self.model_map[n][ngram] / self.model_map[n - 1][ngram[:-1]]
                )
def create_ngrams(input, min_len, max_len):
    """
    Create N-grams.

    min_len is the minimum length of the N-grams
    max_len is the maximum length of the N-grams

    :param input: iterable of sentences (strings)
    :param min_len: minimum N-gram length
    :param max_len: maximum N-gram length
    :return: list of N-gram lists, one per sentence
    """
    result = []
    for sent in input:
        sent_split = sent.split()
        result.append(list(everygrams(sent_split, min_len=min_len, max_len=max_len)))
    return result
def model_iterator(n):
    perp = []
    n = n + 1
    for n in range(1, n):
        print(n)
        train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)
        # model = MLE(n)
        # model = Laplace(n)  # only add-one smoothing here
        # model = Lidstone(0.1, n)  # Lidstone's second number is Gamma/Alpha/Delta
        # model = WittenBellInterpolated(n)
        model = KneserNeyInterpolated(n, discount=0.88)  # only order and discount needed, WB only order
        print(n, model)
        model.fit(train_data, padded_sents)
        print(model.vocab)

        vocab_list = []
        for word in model.vocab:
            vocab_list.append(word)
        # print(vocab_list)
        print("value", model.score('<UNK>'))
        # print(generate_sent_text_seed(model, 30, random_seed=['thicc']))
        # print(generate_sent(model, 50, random_seed=30))

        entropy_fin = 0
        lense = 1000
        i = 0
        for z in range(lense):
            # print(contents[i])
            tokenized_test = [list(map(str.lower, word_tokenize(contents[i])))]
            if len(tokenized_test[0]) > 0:
                for g in range(len(tokenized_test[0])):
                    if tokenized_test[0][g] not in vocab_list:
                        tokenized_test[0][g] = '<UNK>'
                test_text_pad = list(flatten(pad_both_ends(sent, n) for sent in tokenized_test))
                test_text_everygram = list(everygrams(test_text_pad, max_len=n))
                # print(test_text_everygram)
                # test_data, padded_sents_test = padded_everygram_pipeline(n, tokenized_test)
                # print(i)
                # print(model.entropy(test_text_bigram))
                # print(model.entropy(test_text_everygram))
                entropy_fin += model.entropy(test_text_everygram)
            i += 1
        print(entropy_fin)
        avg_entr = entropy_fin / lense
        print("perplexity", 2 ** avg_entr)
        perp.append([n, 2 ** avg_entr])

    import pandas as pd
    DF = pd.DataFrame(perp)
    return DF
def train_texts(train_files, exclude, extension, n_ngram):
    # Training data file
    # train_data_file = "./train/treino.txt"

    # read training data
    # train_data_files = glob.glob('./train/*' + extension)
    train_data_files = train_files.copy()

    if exclude:
        print("Files in the training directory before removing the test item: ",
              train_data_files)
        train_data_files.remove(exclude)

    print("Files used for training: ", train_data_files)

    train_texts = ''

    for train_data_file in train_data_files:
        try:
            # path_file_train =
            with open(os.path.join("./train", train_data_file), encoding='utf-8') as f:
                train_text = f.read().lower()
        except:
            print("Could not access the training files with the ." + extension +
                  " extension in the train directory.")

        # apply preprocessing (remove text inside square and curly brackets and rem punc)
        train_text = re.sub(r"\[.*\]|\{.*\}", "", train_text)
        train_text = re.sub(r'[^\w\s]', "", train_text)
        train_texts += train_text

    # pad the text and tokenize
    training_data = list(pad_sequence(word_tokenize(train_texts), n_ngram,
                                      pad_left=True, left_pad_symbol="<s>"))
    print("training_data", training_data)

    # generate ngrams
    ngrams = list(everygrams(training_data, max_len=n_ngram))
    print("Number of ngrams:", len(ngrams))

    # build ngram language models
    model = WittenBellInterpolated(n_ngram)
    model.fit([ngrams], vocabulary_text=training_data)
    print(model.vocab)

    return model
def test_everygrams_pad_left(everygram_input):
    expected_output = [
        (None,),
        (None, None),
        (None, None, "a"),
        (None,),
        (None, "a"),
        (None, "a", "b"),
        ("a",),
        ("a", "b"),
        ("a", "b", "c"),
        ("b",),
        ("b", "c"),
        ("c",),
    ]
    output = list(everygrams(everygram_input, max_len=3, pad_left=True))
    assert output == expected_output
def padded_everygram_pipeline(order, text):
    """Default preprocessing for a sequence of sentences.

    Creates two iterators:

    - sentences padded and turned into sequences of `nltk.util.everygrams`
    - sentences padded as above and chained together for a flat stream of words

    :param order: Largest ngram length produced by `everygrams`.
    :param text: Text to iterate over. Expected to be an iterable of sentences:
        Iterable[Iterable[str]]
    :return: iterator over text as ngrams, iterator over text as vocabulary data
    """
    padding_fn = partial(pad_both_ends, n=order)
    return (
        (everygrams(list(padding_fn(sent)), max_len=order) for sent in text),
        flatten(map(padding_fn, text)),
    )
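# Usage sketch (illustrative, not part of the original source): the two
# iterators returned by this helper plug directly into an nltk.lm model.
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline

text = [["a", "b", "c"], ["a", "c", "d", "c"]]
train_data, vocab_data = padded_everygram_pipeline(2, text)
lm = MLE(2)
lm.fit(train_data, vocab_data)
print(lm.score("b", ["a"]))  # P(b | a) under the bigram MLE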
def test_everygrams_pad_left(self):
    expected_output = [
        (None,),
        (None, None),
        (None, None, 'a'),
        (None,),
        (None, 'a'),
        (None, 'a', 'b'),
        ('a',),
        ('a', 'b'),
        ('a', 'b', 'c'),
        ('b',),
        ('b', 'c'),
        ('c',),
    ]
    output = everygrams(self.test_data, max_len=3, pad_left=True)
    self.assertCountEqual(output, expected_output)
def build_one_chat_LM(fb_dict, max_len=3):
    """
    input: one chat room
    output: LM of that chatroom
    """
    final_list = []
    for line in fb_dict['msgs']:
        line = line[2]
        final_list += list(everygrams(line, max_len=max_len,
                                      pad_left=True, pad_right=True,
                                      left_pad_symbol='<s>',
                                      right_pad_symbol='<\\s>'))
    Counter_LM = Counter(final_list)
    total_count = sum(Counter_LM.values())
    total_count = float(total_count)
    for key in Counter_LM:
        Counter_LM[key] /= total_count
    return Counter_LM
def corpus_gleu(list_of_references, hypotheses, min_len=1, max_len=4):
    """
    Calculate a single corpus-level GLEU score (aka. system-level GLEU) for all
    the hypotheses and their respective references.

    Instead of averaging the sentence level GLEU scores (i.e. macro-average
    precision), Wu et al. (2016) sum up the matching tokens and the max of
    hypothesis and reference tokens for each sentence, then compute using the
    aggregate values.

    From Mike Schuster (via email):
        "For the corpus, we just add up the two statistics n_match and
        n_all = max(n_all_output, n_all_target) for all sentences, then
        calculate gleu_score = n_match / n_all, so it is not just a mean of
        the sentence gleu scores (in our case, longer sentences count more,
        which I think makes sense as they are more difficult to translate)."

    >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...         'ensures', 'that', 'the', 'military', 'always',
    ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...          'heed', 'Party', 'commands']
    >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...          'guarantees', 'the', 'military', 'forces', 'always',
    ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
    >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...          'army', 'always', 'to', 'heed', 'the', 'directions',
    ...          'of', 'the', 'party']

    >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
    ...         'interested', 'in', 'world', 'history']
    >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
    ...          'because', 'he', 'read', 'the', 'book']

    >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
    >>> hypotheses = [hyp1, hyp2]
    >>> corpus_gleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
    0.5673...

    The example below shows that corpus_gleu() is different from averaging
    sentence_gleu() for hypotheses

    >>> score1 = sentence_gleu([ref1a], hyp1)
    >>> score2 = sentence_gleu([ref2a], hyp2)
    >>> (score1 + score2) / 2 # doctest: +ELLIPSIS
    0.6144...

    :param list_of_references: a list of reference sentences, w.r.t. hypotheses
    :type list_of_references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param min_len: The minimum order of n-gram this function should extract.
    :type min_len: int
    :param max_len: The maximum order of n-gram this function should extract.
    :type max_len: int
    :return: The corpus-level GLEU score.
    :rtype: float
    """
    # sanity check
    assert len(list_of_references) == len(
        hypotheses
    ), "The number of hypotheses and their reference(s) should be the same"

    # sum matches and max-token-lengths over all sentences
    corpus_n_match = 0
    corpus_n_all = 0

    for references, hypothesis in zip(list_of_references, hypotheses):
        hyp_ngrams = Counter(everygrams(hypothesis, min_len, max_len))
        tpfp = sum(hyp_ngrams.values())  # True positives + False positives.

        hyp_counts = []
        for reference in references:
            ref_ngrams = Counter(everygrams(reference, min_len, max_len))
            tpfn = sum(ref_ngrams.values())  # True positives + False negatives.

            overlap_ngrams = ref_ngrams & hyp_ngrams
            tp = sum(overlap_ngrams.values())  # True positives.

            # While GLEU is defined as the minimum of precision and
            # recall, we can reduce the number of division operations by one by
            # instead finding the maximum of the denominators for the precision
            # and recall formulae, since the numerators are the same:
            #     precision = tp / tpfp
            #     recall = tp / tpfn
            #     gleu_score = min(precision, recall) == tp / max(tpfp, tpfn)
            n_all = max(tpfp, tpfn)

            if n_all > 0:
                hyp_counts.append((tp, n_all))

        # use the reference yielding the highest score
        if hyp_counts:
            n_match, n_all = max(hyp_counts, key=lambda hc: hc[0] / hc[1])
            corpus_n_match += n_match
            corpus_n_all += n_all

    # corner case: empty corpus or empty references---don't divide by zero!
    if corpus_n_all == 0:
        gleu_score = 0.0
    else:
        gleu_score = corpus_n_match / corpus_n_all

    return gleu_score
def sentence_gleu(reference, hypothesis, min_len=1, max_len=4):
    """
    Calculates the sentence level GLEU (Google-BLEU) score described in

        Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V. Le, Mohammad Norouzi,
        Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey,
        Jeff Klingner, Apurva Shah, Melvin Johnson, Xiaobing Liu, Lukasz Kaiser,
        Stephan Gouws, Yoshikiyo Kato, Taku Kudo, Hideto Kazawa, Keith Stevens,
        George Kurian, Nishant Patil, Wei Wang, Cliff Young, Jason Smith,
        Jason Riesa, Alex Rudnick, Oriol Vinyals, Greg Corrado, Macduff Hughes,
        Jeffrey Dean. (2016) Google's Neural Machine Translation System:
        Bridging the Gap between Human and Machine Translation.
        eprint arXiv:1609.08144. https://arxiv.org/pdf/1609.08144v2.pdf
        Retrieved on 27 Oct 2016.

    From Wu et al. (2016):
        "The BLEU score has some undesirable properties when used for single
        sentences, as it was designed to be a corpus measure. We therefore
        use a slightly different score for our RL experiments which we call
        the 'GLEU score'. For the GLEU score, we record all sub-sequences of
        1, 2, 3 or 4 tokens in output and target sequence (n-grams). We then
        compute a recall, which is the ratio of the number of matching n-grams
        to the number of total n-grams in the target (ground truth) sequence,
        and a precision, which is the ratio of the number of matching n-grams
        to the number of total n-grams in the generated output sequence. Then
        GLEU score is simply the minimum of recall and precision. This GLEU
        score's range is always between 0 (no matches) and 1 (all match) and
        it is symmetrical when switching output and target. According to
        our experiments, GLEU score correlates quite well with the BLEU
        metric on a corpus level but does not have its drawbacks for our per
        sentence reward objective."

    Note: The GLEU score is designed for sentence based evaluation thus there is
          no corpus based scores implemented in NLTK.

    The infamous "the the the ... " example

    >>> ref = 'the cat is on the mat'.split()
    >>> hyp = 'the the the the the the the'.split()
    >>> sentence_gleu(ref, hyp) # doctest: +ELLIPSIS
    0.0909...

    An example to evaluate normal machine translation outputs

    >>> ref1 = str('It is a guide to action that ensures that the military '
    ...            'will forever heed Party commands').split()
    >>> hyp1 = str('It is a guide to action which ensures that the military '
    ...            'always obeys the commands of the party').split()
    >>> hyp2 = str('It is to insure the troops forever hearing the activity '
    ...            'guidebook that party direct').split()
    >>> sentence_gleu(ref1, hyp1) # doctest: +ELLIPSIS
    0.4393...
    >>> sentence_gleu(ref1, hyp2) # doctest: +ELLIPSIS
    0.1206...

    :param references: reference sentence
    :type references: list(str)
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str)
    :param min_len: The minimum order of n-gram this function should extract.
    :type min_len: int
    :param max_len: The maximum order of n-gram this function should extract.
    :type max_len: int
    :return: the sentence level GLEU score.
    :rtype: float
    """
    # For each order of ngram, calculate the no. of ngram matches and
    # keep track of no. of ngram in references.
    ref_ngrams = Counter(everygrams(reference, min_len, max_len))
    hyp_ngrams = Counter(everygrams(hypothesis, min_len, max_len))
    overlap_ngrams = ref_ngrams & hyp_ngrams
    tp = sum(overlap_ngrams.values())  # True positives.
    tpfp = sum(hyp_ngrams.values())  # True positives + False positives.
    tffn = sum(ref_ngrams.values())  # True positives + False negatives.

    precision = tp / tpfp
    recall = tp / tffn

    return min(precision, recall)
def remplirLexique(self):
    for phrase in self.corpusLangue1:
        self.lexiqueLangue1 |= set([" ".join(x) for x in everygrams(phrase.split(' '))])
    for phras in self.corpusLangue2:
        self.lexiqueLangue2 |= set(["".join(x) for x in everygrams(list(phras))])
def padded_everygrams(order, sentence):
    """Helper with some useful defaults.

    Applies pad_both_ends to sentence and follows it up with everygrams.
    """
    return everygrams(list(pad_both_ends(sentence, n=order)), max_len=order)
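# Minimal usage sketch (illustrative, not part of the original source); the
# imports make the names used by padded_everygrams resolvable at module level.
from nltk.lm.preprocessing import pad_both_ends
from nltk.util import everygrams

print(list(padded_everygrams(2, ["a", "b"])))
# Every 1- and 2-gram of ['<s>', 'a', 'b', '</s>']; the exact ordering of the
# output depends on the nltk version.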
def firstPassGrouping():
    words = []
    stemmed = []
    features = {}
    tokenizer = RegexpTokenizer(r'\s+', gaps=True)
    clean = re.compile(r"[()\/']")
    split = re.compile("[/]")
    grams = []
    with open('data/features.txt', 'r') as featureIn:
        for line in map(cleanFeatures, featureIn):
            ws = []
            for w in tokenizer.tokenize(clean.sub(' ', line[1])):
                if w not in engStop:
                    stemmed.append((eng.stem(w).lower(), line[1]))
                    words.append((w.lower(), line[1]))
                    ws.append(w.lower())
            grams.append((list(everygrams(ws, min_len=2, max_len=2)), line[1]))
            features[line[0]] = line[1]

    # cuisine, style, price, atmosphere, and occasion
    noGrams = set(map(lambda x: x[1], filter(lambda x: len(x[0]) == 0, grams)))
    grams = list(filter(lambda x: len(x[0]) > 0, grams))
    groupedw = seq(grams) \
        .flat_map(lambda x: set([(w, x[1]) for w in seq(x[0]).flat_map(lambda y: list(y)).to_list()])) \
        .group_by(lambda w: w[0]) \
        .map(lambda x: (x[0], list(map(lambda y: y[1], x[1])))) \
        .to_dict()

    noGramsId = {}
    for g in noGrams:
        noGramsId[g] = g

    simGrouped = {}
    simular = set()
    for k, v in sorted(groupedw.items(), key=lambda x: x[0]):
        # print(k, v)
        nl = v.copy()
        match = noGramsId.get(k, None)
        for nk in noGramsId.keys():
            if len(nk) > 1:
                if nk in v:
                    nl.append(nk)
                    simular.add(nk)
                for vv in v:
                    if nk in vv:
                        nl.append(nk)
                        simular.add(nk)
        if match is not None:
            nl.append(match)
            simGrouped[k] = list(set(nl))
            simular.add(match)
        else:
            if len(k) > 1:
                simGrouped[k] = v

    noSim = noGrams - simular
    # nationalities = gazetteers.words()
    featureNationality = []
    for nosim in noSim:
        didConvert = convert(nosim)
        if didConvert is not None:
            if didConvert in nationalities:
                featureNationality.append(nosim)
        else:
            if nosim in nationalities:
                featureNationality.append(nosim)
            else:
                split = nosim.split('-')
                for sp in split:
                    if sp in nationalities:
                        featureNationality.append(nosim)
    # print("-----------------")
    noSim = noSim - set(featureNationality)

    # occasions = ['monday']
    # cuisine, style, price, atmosphere, and occasion
    for k, v in sorted(simGrouped.items(), key=lambda x: x[0]):
        # print(k, v)
        if k in nationalities:
            featureNationality.append(k)
            featureNationality.extend(v)
            simGrouped.pop(k)
        didConvert = convert(k)
        if didConvert is not None:
            if didConvert in nationalities:
                simGrouped.pop(k)
                featureNationality.append(k)
                featureNationality.extend(v)

    with open('q1/noSim.json', 'w+') as nsOut:
        nsOut.write(json.dumps(list(noSim), indent=2, sort_keys=True))
    with open('q1/featureNationality.json', 'w+') as nsOut:
        nsOut.write(json.dumps(featureNationality, indent=2, sort_keys=True))
    with open('q1/grouped.json', 'w+') as nsOut:
        nsOut.write(json.dumps(simGrouped, indent=2, sort_keys=True))