class KneserNeyInterpolatedTrigramTests(unittest.TestCase):
    def setUp(self):
        vocab, training_text = _prepare_test_data(3)
        self.model = KneserNeyInterpolated(3, vocabulary=vocab)
        self.model.fit(training_text)

    score_tests = [
        # For unigrams, scores revert to uniform.
        # Vocab size: 8
        # count('c'): 1
        ("c", None, 1.0 / 8),
        # in vocabulary but unseen, still uses uniform
        ("z", None, 1 / 8),
        # out of vocabulary should use "UNK" score, i.e. again uniform
        ("y", None, 1.0 / 8),
        # alpha = count('bc') - discount = 1 - 0.1 = 0.9
        # gamma(['b']) = discount * number of unique words that follow ['b'] = 0.1 * 2
        # normalizer = total number of bigrams with this context = 2
        # the final score should be: (alpha + gamma * unigram_score("c")) / normalizer
        ("c", ["b"], (0.9 + 0.2 * (1 / 8)) / 2),
        # building on that, let's try 'a b c' as the trigram
        # alpha = count('abc') - discount = 1 - 0.1 = 0.9
        # gamma(['a', 'b']) = 0.1 * 1
        # normalizer = total number of trigrams with prefix "ab" = 1 => we can ignore it!
        ("c", ["a", "b"], 0.9 + 0.1 * ((0.9 + 0.2 * (1 / 8)) / 2)),
    ]
class NGramLM:
    def __init__(self, n: int = 3):
        self.n = n
        self.model = KneserNeyInterpolated(n)
        self.model_path = LM_PATH / f"{n}_gram.model"

    def train(self, dataset: NGramDataset, save=True):
        self.model.fit(dataset.train, dataset.vocab)
        if save:
            pickle.dump(self.model, self.model_path.open("wb"))
            logging.info(f"ngram model saved at: {self.model_path}")

    def load_model(self, path: Path = None):
        if path:
            if path.is_file():
                self.model_path = path
            else:
                logging.exception(f"ngram model doesn't exist at: {path}")
        self.model = pickle.load(self.model_path.open("rb"))
        # Restore the order from the loaded model.
        self.n = self.model.order

    def evaluate_sentence(self, sentence: str):
        """Compute the probability of `sentence` under the model."""
        tokens = NGramDataset.preprocess_sentence(sentence, self.n)
        ngrams = list(get_ngrams(tokens, self.n))
        total_logscore = 0
        for ngram in ngrams:
            cur_logscore = self.model.logscore(ngram[-1], ngram[:-1])
            total_logscore += cur_logscore
        # logscore is a base-2 log, so invert it with 2 ** x rather than math.exp.
        return 2 ** total_logscore

    def predict_next(self):
        pass
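# Note on evaluate_sentence above: nltk.lm's logscore is a base-2 logarithm, so
# the sentence probability is 2 ** total_logscore, not math.exp(total_logscore).
# A small self-contained check of that identity; the toy corpus and variable
# names below are illustrative, not taken from the snippet above.
import math

from nltk.lm import KneserNeyInterpolated
from nltk.lm.preprocessing import pad_both_ends, padded_everygram_pipeline
from nltk.util import ngrams

corpus = [["a", "b", "c"], ["b", "c", "a"]]
train, vocab = padded_everygram_pipeline(3, corpus)
model = KneserNeyInterpolated(3)
model.fit(train, vocab)

sentence = ["a", "b", "c"]
trigrams = list(ngrams(pad_both_ends(sentence, n=3), 3))

total_logscore = sum(model.logscore(g[-1], list(g[:-1])) for g in trigrams)
prob_from_logs = 2 ** total_logscore  # invert the base-2 log
prob_direct = math.prod(model.score(g[-1], list(g[:-1])) for g in trigrams)
assert abs(prob_from_logs - prob_direct) < 1e-9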
def build_ngram_lm(self, train):
    if not train:
        return None
    n = 5  # up to 5-gram language model
    train, vocab = padded_everygram_pipeline(n, train)
    model = KneserNeyInterpolated(n)
    model.fit(train, vocab)
    return model
class TestKneserNeyInterpolatedTrigram(metaclass=ParametrizedTests):
    @classmethod
    def setup_method(self):
        vocab, training_text = _prepare_test_data(3)
        self.model = KneserNeyInterpolated(3, discount=0.75, vocabulary=vocab)
        self.model.fit(training_text)

    score_tests = [
        # P(c) = count('*c') / unique('**')
        #      = 1 / 14
        ("c", None, 1.0 / 14),
        # P(z) = count('*z') / unique('**')
        #      = 0 / 14
        # 'z' is in the vocabulary, but it was not seen during training.
        ("z", None, 0.0 / 14),
        # P(y)
        # Out of vocabulary should use the "UNK" score.
        # P(y) = P(UNK) = count('*UNK') / unique('**')
        ("y", None, 3 / 14),
        # We start with P(c|b).
        # P(c|b) = alpha('bc') + gamma('b') * P(c)
        # alpha('bc') = max(unique('*bc') - discount, 0) / unique('*b*')
        #             = max(1 - 0.75, 0) / 2
        #             = 0.125
        # gamma('b') = discount * unique('b*') / unique('*b*')
        #            = (0.75 * 2) / 2
        #            = 0.75
        ("c", ["b"], (0.125 + 0.75 * (1 / 14))),
        # Building on that, let's try P(c|ab).
        # P(c|ab) = alpha('abc') + gamma('ab') * P(c|b)
        # alpha('abc') = max(count('abc') - discount, 0) / count('ab*')
        #              = max(1 - 0.75, 0) / 1
        #              = 0.25
        # gamma('ab') = (discount * unique('ab*')) / count('ab*')
        #             = 0.75 * 1 / 1
        ("c", ["a", "b"], 0.25 + 0.75 * (0.125 + 0.75 * (1 / 14))),
        # P(c|zb)
        # The ngram 'zbc' was not seen, so we use P(c|b). See issue #2332.
        ("c", ["z", "b"], (0.125 + 0.75 * (1 / 14))),
    ]
class TestKneserNeyInterpolatedTrigram(metaclass=ParametrizedTests):
    @classmethod
    def setup_method(self):
        vocab, training_text = _prepare_test_data(3)
        self.model = KneserNeyInterpolated(3, vocabulary=vocab)
        self.model.fit(training_text)

    score_tests = [
        # # of bigrams ending with c = 1
        # total # of unique bigrams = 14
        ("c", None, 1.0 / 14),
        # in vocabulary but unseen
        # # of bigrams ending with z = 0
        ("z", None, 0.0 / 14),
        # out of vocabulary should use "UNK" score
        # # of bigrams ending with <UNK> = 3
        ("y", None, 3 / 14),
        # alpha = max(count('bc') - discount, 0) / # of bigrams starting with 'b'
        #       = (1 - 0.75) / 2 = 0.125
        # gamma(['b']) = (discount * number of unique continuations after ['b']) / # of bigrams starting with 'b'
        #              = (0.75 * 2) / 2 = 0.75
        # the final score should be: (alpha + gamma * unigram_score("c"))
        ("c", ["b"], (0.125 + 0.75 * (1 / 14))),
        # building on that, let's try 'a b c' as the trigram
        # alpha = max(count('abc') - discount, 0) / # of trigrams starting with "ab"
        #       = max(1 - 0.75, 0) / 1 = 0.25
        # gamma(['a', 'b']) = (discount * number of unique continuations after ['ab']) / # of trigrams starting with 'ab'
        #                   = 0.75 * 1 / 1
        # final: alpha + gamma * P(c|b)
        # alpha of P(c|b) = max(# of trigrams ending in "bc" - discount, 0) / # of unique trigram continuations with 'b' in the middle
        #                 = (1 - 0.75) / 2 = 0.125
        # gamma of P(c|b) = (discount * # of unique continuations after 'b') / # of unique bigram continuations with 'b' in the middle
        #                 = 0.75 * 2 / 2
        ("c", ["a", "b"], 0.25 + 0.75 * (0.125 + 0.75 * (1 / 14))),
        # The ngram 'z b c' was not seen, so we should simply revert to
        # the score of the ngram 'b c'. See issue #2332.
        ("c", ["z", "b"], (0.125 + 0.75 * (1 / 14))),
    ]
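# A minimal sketch that re-derives the interpolated scores asserted above by
# hand. The discount and the continuation counts are taken directly from the
# test comments (they come from NLTK's test fixture); nothing here queries the
# model itself.
d = 0.75

# Unigram continuation probability: unique bigrams ending in 'c' / unique bigrams.
p_c = 1 / 14

# P(c|b) = alpha('bc') + gamma('b') * P(c)
alpha_bc = max(1 - d, 0) / 2   # unique '*bc' = 1, bigrams starting with 'b' = 2
gamma_b = d * 2 / 2            # unique continuations after 'b' = 2
p_c_given_b = alpha_bc + gamma_b * p_c

# P(c|ab) = alpha('abc') + gamma('ab') * P(c|b)
alpha_abc = max(1 - d, 0) / 1  # count('abc') = 1, trigrams starting with 'ab' = 1
gamma_ab = d * 1 / 1           # unique continuations after 'ab' = 1
p_c_given_ab = alpha_abc + gamma_ab * p_c_given_b

assert abs(p_c_given_b - (0.125 + 0.75 * (1 / 14))) < 1e-12
assert abs(p_c_given_ab - (0.25 + 0.75 * (0.125 + 0.75 * (1 / 14)))) < 1e-12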
def trigram_model(tokenized_text, test_sentences, sentence_count):
    n = 3
    average_perplexity = 0.0
    train_data, padded_vocab = padded_everygram_pipeline(n, tokenized_text)
    model = KneserNeyInterpolated(n)
    model.fit(train_data, padded_vocab)
    tokenized_text = [
        list(map(str.lower, nltk.tokenize.word_tokenize(sent)))
        for sent in test_sentences
    ]
    test_data, _ = padded_everygram_pipeline(n, tokenized_text)
    for test in list(test_data):
        ngrams = list(test)
        if model.perplexity(ngrams) != float('inf'):
            average_perplexity += model.perplexity(ngrams)
    average_perplexity /= sentence_count
    print(
        f"Average Perplexity for Trigram model on Test tweets: {round(average_perplexity, 4)}"
    )
        temp = word_tokenize(sent)
        for idx, word in enumerate(temp):
            if word not in vnword:
                temp[idx] = 'unknown'
        result.append(temp)
    print('tokenize done')
    return result


if __name__ == '__main__':
    arg = get_arg()
    # get train data and tokenize
    with open(arg.doc_dir, 'r', encoding='utf-8') as fin:
        doc = fin.readlines()
    corpus = tokenize(doc)
    del doc

    vi_model = KneserNeyInterpolated(arg.ngram)
    train_data, padded_sent = padded_everygram_pipeline(arg.ngram, corpus)
    del corpus

    start_time = time.time()
    vi_model.fit(train_data, padded_sent)
    print('train %s-gram model in %d s' % (arg.ngram, time.time() - start_time))
    print('length of vocab = %s' % (len(vi_model.vocab)))

    with open(arg.model_dir, 'wb') as fout:
        pickle.dump(vi_model, fout)
    print('saved model successfully!')
class SynonymParaphraser:
    def __init__(self, model=None, ngram=3):
        if True:
            stanza.download('sv')  # download Swedish model
        self.nlp = stanza.Pipeline('sv')  # initialize Swedish neural pipeline
        self.base_url = 'https://www.synonymer.se/sv-syn/'

        # Build Language Model from corpus
        if model is None:
            with open('kneyser_lm.pkl', 'rb') as f:
                self.model = pickle.load(f)
        else:
            self.model = KneserNeyInterpolated(ngram)
            sentences = np.loadtxt(corpus_file, dtype='U', delimiter='\n')
            text = [
                list(map(str.lower, word_tokenize(sent))) for sent in sentences
            ]
            train_data, padded_sents = padded_everygram_pipeline(ngram, text)
            self.model.fit(train_data, padded_sents)

    def generate_paraphrases(self, source_file):
        # Read data and make a copy to store edited paraphrases
        source_data = pd.read_csv(source_file)['question1']
        paraphrases = source_data.copy()

        for i in range(1688, len(source_data)):
            # Clean source sentences and generate dependency parse tree
            source_data[i] = clean_str(source_data[i])
            doc = self.nlp(source_data[i])
            print(doc)

            # Iterate all words to find potential words to replace with synonyms
            candidate_words = []
            for word in doc.sentences[0].words:
                if word.upos in ["ADJ", "ADV", "NOUN", "VERB"] and word.feats:
                    candidate_word = {
                        'word': word.text,
                        'index': word.id - 1,
                        'POS': word.upos
                    }
                    valid_candidate = True
                    features = [
                        feature.split('=') for feature in word.feats.split('|')
                    ]
                    for feature in features:
                        if feature[0] == 'VerbForm' and feature[1] == 'Part':
                            valid_candidate = False
                            break
                        candidate_word[feature[0]] = feature[1]
                    if valid_candidate:
                        candidate_words.append(candidate_word)

            replacements = 0
            best_candidate = {'word': '', 'index': 0, 'diff': -np.inf}
            for j, candidate in enumerate(candidate_words):
                candidate_synonyms = self.get_synonyms(candidate['word'])
                if candidate_synonyms is None:
                    continue
                original = (candidate['word'],
                            self.get_score(candidate['word'],
                                           candidate['index'], source_data[i]))
                best_synonym = original
                synonym_count = 0
                for synonym in candidate_synonyms:
                    synonym = self.get_inflection(candidate, synonym)
                    if synonym is None:
                        continue
                    synonym_count += 1
                    # Calculate score for the synonym and compare to the current best
                    score = self.get_score(synonym, candidate['index'],
                                           source_data[i])
                    if score > best_synonym[1]:
                        best_synonym = (synonym, score)
                        diff = score - original[1]
                        if best_candidate['diff'] < diff:
                            best_candidate['word'] = synonym
                            best_candidate['index'] = candidate['index']
                            best_candidate['diff'] = diff
                            print(f'New best candidate: {synonym} with score {diff}')

                # Build paraphrase sentence
                if best_synonym[0] != candidate['word']:
                    new_sentence = ''
                    for (k, w) in enumerate(source_data[i].split()):
                        if k == candidate['index'] and best_synonym[0] != w:
                            new_sentence += best_synonym[0]
                            replacements += 1
                            print(f'Replaced word {w} with {best_synonym[0]}')
                        else:
                            new_sentence += w
                        if k < len(doc.sentences[0].words) - 1:
                            new_sentence += ' '
                    source_data[i] = new_sentence

            # Assure at least one word is replaced with a synonym
            if replacements == 0 and best_candidate['word'] != '':
                print(best_candidate.items())
                new_sentence = ''
                for (k, w) in enumerate(source_data[i].split()):
                    if k == best_candidate['index']:
                        new_sentence += best_candidate['word']
                    else:
                        new_sentence += w
                    if k < len(doc.sentences[0].words) - 1:
                        new_sentence += ' '
                source_data[i] = new_sentence

            print(f'{i} sentences done')
            print(source_data[i])
            print(paraphrases[i])
            print('\n')
            with open('synonym_samples_final.txt', 'a') as f:
                f.write(source_data[i] + '\n')

        return source_data

    def get_inflection(self, word, synonym):
        pos = POS[word['POS']]
        url = f"https://ws.spraakbanken.gu.se/ws/karp/v4/query?q=extended||and|pos|equals|{POS[word['POS']]}||and|wf|equals|{synonym}&resource=saldom"
        response = requests.get(url).json()['hits']
        if response['total'] == 0:
            return None
        msd = self.word_grammar(word)
        for i in range(len(response['hits'])):
            if response['hits'][i]['_source']['FormRepresentations'][0]['baseform'] in synonym:
                word_forms = response['hits'][i]['_source']['WordForms']
                for j in range(len(word_forms)):
                    if word_forms[j]['msd'] == msd:
                        if word['POS'] == 'NOUN' and 'Gender' in word.keys():
                            inherent = 'n' if word['Gender'] == 'Neut' else 'u'
                            if inherent != response['hits'][i]['_source']['FormRepresentations'][0]['inherent']:
                                return None
                        return word_forms[j]['writtenForm']

    def get_synonyms(self, word):
        synonyms = set()
        url = self.base_url + word
        html_doc = requests.get(url).text
        soup = BeautifulSoup(html_doc, 'html.parser')
        soup = soup.find("div", {"id": "dict-default"})
        if soup is None:
            return None
        else:
            soup = soup.find("div", {"body"}).ul
            for synset in soup.find_all('li'):
                for syns in synset.find_all('ol', class_=lambda x: not x):
                    for synonym in syns.find_all('a'):
                        if len(synonym.text.split()) > 1:
                            continue
                        synonyms.add(synonym.text)
        return synonyms

    def get_score(self, word, j, source_sentence):
        scores = []
        sentence_len = len(source_sentence.split())
        if sentence_len >= 3:
            if j >= 2:
                scores.append(
                    self.model.logscore(
                        word, source_sentence.split()[(j - 2):(j - 1)]))
            if j < sentence_len - 2:
                scores.append(
                    self.model.logscore(
                        source_sentence.split()[j + 2],
                        [word, source_sentence.split()[j + 1]]))
            if j >= 1 and j < sentence_len - 1:
                scores.append(
                    self.model.logscore(
                        source_sentence.split()[j - 1],
                        [source_sentence.split()[j + 1], word]))
        else:
            if j == 0:
                scores.append(
                    self.model.logscore(source_sentence.split()[1], [word]))
            else:
                scores.append(
                    self.model.logscore(word, [source_sentence.split()[0]]))
        score = sum(scores) / len(scores)
        return score

    def word_grammar(self, word):
        grammar = None
        if word['POS'] == 'ADJ':
            if 'Degree' not in word:
                return None
            if word['Degree'] == 'Pos':
                grammar = 'pos'
            elif word['Degree'] == 'Cmp':
                grammar = 'komp'
                if 'Case' in word.keys() and word['Case'] == 'Nom':
                    grammar = grammar + ' nom'
                else:
                    grammar = grammar + ' gen'
                return grammar
            elif word['Degree'] == 'Sup':
                grammar = 'super'
                if 'Case' in word.keys() and word['Case'] == 'Nom':
                    grammar = grammar + ' nom'
                else:
                    grammar = grammar + ' gen'
                return grammar
            if 'Definite' not in word:
                return None
            if word['Definite'] == 'Ind':
                grammar = grammar + ' indef'
            elif word['Definite'] == 'Def':
                grammar = grammar + ' def'
            if 'Number' in word.keys():
                if word['Number'] == 'Sing':
                    grammar = grammar + ' sg'
                elif word['Number'] == 'Plur':
                    grammar = grammar + ' pl'
            if 'Gender' in word.keys() and word['Gender'] == 'Neut':
                grammar = grammar + ' n nom'
            else:
                grammar = grammar + ' u nom'
        elif word['POS'] == 'ADV':
            if 'Degree' not in word:
                return None
            else:
                if word['Degree'] == 'Pos':
                    grammar = 'pos'
                elif word['Degree'] == 'Cmp':
                    grammar = 'komp'
                elif word['Degree'] == 'Sup':
                    grammar = 'super'
        elif word['POS'] == 'VERB':
            if word['VerbForm'] == 'Inf':
                grammar = 'inf'
            elif word['VerbForm'] == 'Sup':
                grammar = 'sup'
            elif 'Tense' in word.keys() and word['Tense'] == 'Past':
                grammar = 'pret ind'
            elif word['Mood'] == 'Ind':
                grammar = 'pres ind'
            elif word['Mood'] == 'Imp':
                grammar = 'imper'
                return grammar
            if 'Voice' in word.keys() and word['Voice'] == 'Act':
                grammar = grammar + ' aktiv'
            else:
                grammar = grammar + ' s-form'
            # if
        elif word['POS'] == 'NOUN':
            if 'Number' not in word.keys():
                return None
            if word['Number'] == 'Sing':
                grammar = 'sg'
            elif word['Number'] == 'Plur':
                grammar = 'pl'
            if 'Definite' not in word.keys():
                return None
            elif word['Definite'] == 'Ind':
                grammar = grammar + ' indef'
            elif word['Definite'] == 'Def':
                grammar = grammar + ' def'
            if word['Case'] == 'Gen':
                grammar = grammar + ' gen'
            else:
                grammar = grammar + ' nom'
        return grammar
def kneserney_bigram_model(bigram_training_data, vocabulary):
    model = KneserNeyInterpolated(order=2, vocabulary=vocabulary)
    model.fit(bigram_training_data)
    return model
def kneserney_trigram_model(trigram_training_data, vocabulary):
    model = KneserNeyInterpolated(order=3, discount=0.75, vocabulary=vocabulary)
    model.fit(trigram_training_data)
    return model
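# Usage sketch for the helpers above: because the vocabulary is passed to the
# constructor, fit() only needs the padded n-gram training data. The corpus,
# padding, and variable names here are illustrative assumptions, not part of
# the original snippet.
from nltk.lm import Vocabulary
from nltk.lm.preprocessing import pad_both_ends
from nltk.util import everygrams

sentences = [["the", "cat", "sat"], ["the", "dog", "sat"]]

# Count every padded token into the vocabulary (unk_cutoff=1 keeps them all).
vocab = Vocabulary(
    (w for sent in sentences for w in pad_both_ends(sent, n=3)), unk_cutoff=1
)

# Padded everygrams up to trigrams, one generator per sentence.
trigram_training_data = [
    everygrams(list(pad_both_ends(sent, n=3)), max_len=3) for sent in sentences
]

model = kneserney_trigram_model(trigram_training_data, vocab)
print(model.score("sat", ["the", "cat"]))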
print("corpus read") tokens = nltk.word_tokenize(raw) sents = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(raw)] voc = Voc(corpus) print(voc) for s in sents: for w in s: voc.addWord(w) print(voc) sents = [[str(SOS_token)] + [str(voc.word2index[w]) for w in s] + [str(EOS_token)] for s in sents] print(sents[0]) # train, vocab = my_padded_everygram_pipeline(N, sents, left_pad_symbol="SOS", right_pad_symbol="EOS") train, vocab = my_padded_everygram_pipeline(N, sents, left_pad_symbol="0", right_pad_symbol="0") print("preprocessing ready") LM.fit(train, vocab) print("LM ready") # LM.generate() # print("how") # print(LM.score("How")) # print("are you") # print(LM.score("you", ["are"])) out_file = f"token_KneserNey_{N}_lm.pkl" with open(out_file, "wb") as f: pickle.dump(LM, f) print(LM.score("6")) print(LM.score("4", ["8"]))
sentences_strings_ted.extend(
    sent for sent in re.split('[。?!]', m.groupdict()['postcolon']) if sent)

del input_text_noparens, input_text
sentences_strings_ted = [re.sub(r'[^\w\s]', '', sent) for sent in sentences_strings_ted]
sentences_strings_ted = [re.sub(r'[a-zA-Z0-9]', '', sent) for sent in sentences_strings_ted]
sentences_strings_ted = filter(None, sentences_strings_ted)
data = ' '.join([re.sub(r'\s', '', sent) for sent in sentences_strings_ted]).split(' ')
datax = [' '.join(sent).split(' ') for sent in data]
del sentences_strings_ted, data

# Train a 5-gram language model.
lm = KneserNeyInterpolated(5)
train, vocab = padded_everygram_pipeline(5, datax)
lm.fit(train, vocab)
del train, vocab, datax

# Perplexity test.
test = '我想带你们体验一下,我们所要实现的“信任”的感觉。'
sent_list = re.sub(r'[^\w\s]', '', test)
sent_list = ','.join(sent_list).split(',')
text = list(ngrams(pad_both_ends(sent_list, 5), 5))
entropy = lm.entropy(text)        # cross-entropy
perplexity = lm.perplexity(text)  # perplexity
print('cross-entropy: %f' % entropy, 'perplexity: %f' % perplexity)

# Save the model ... the code below runs out of memory locally; run it on
# Colaboratory or Kaggle to use Google's servers instead.
joblib.dump(lm, 'panti_gram.pkl')

# In[]
# Test the saved model