Example 1
class KneserNeyInterpolatedTrigramTests(unittest.TestCase):
    def setUp(self):
        vocab, training_text = _prepare_test_data(3)
        self.model = KneserNeyInterpolated(3, vocabulary=vocab)
        self.model.fit(training_text)

    score_tests = [
        # For unigram scores revert to uniform
        # Vocab size: 8
        # count('c'): 1
        ("c", None, 1.0 / 8),
        # in vocabulary but unseen, still uses uniform
        ("z", None, 1 / 8),
        # out of vocabulary should use "UNK" score, i.e. again uniform
        ("y", None, 1.0 / 8),
        # alpha = count('bc') - discount = 1 - 0.1 = 0.9
        # gamma(['b']) = discount * number of unique words that follow ['b'] = 0.1 * 2
        # normalizer = total number of bigrams with this context = 2
        # the final should be: (alpha + gamma * unigram_score("c")) / normalizer
        ("c", ["b"], (0.9 + 0.2 * (1 / 8)) / 2),
        # building on that, let's try 'a b c' as the trigram
        # alpha = count('abc') - discount = 1 - 0.1 = 0.9
        # gamma(['a', 'b']) = 0.1 * 1
        # normalizer = total number of trigrams with prefix "ab" = 1 => we can ignore it!
        ("c", ["a", "b"], 0.9 + 0.1 * ((0.9 + 0.2 * (1 / 8)) / 2)),
    ]
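# A minimal standalone sketch of the scoring calls that the parametrized tests above
# exercise. The toy corpus below is made up (it is not _prepare_test_data), so the
# printed values will not match the expected scores in the comments.
from nltk.lm import KneserNeyInterpolated
from nltk.lm.preprocessing import padded_everygram_pipeline

toy_corpus = [["a", "b", "c"], ["a", "c", "d", "c"]]  # hypothetical stand-in corpus
train, vocab = padded_everygram_pipeline(3, toy_corpus)

model = KneserNeyInterpolated(3, discount=0.1)  # the discount the comments above assume
model.fit(train, vocab)

print(model.score("c"))              # unigram score
print(model.score("c", ["b"]))       # interpolated score with a one-word context
print(model.score("c", ["a", "b"]))  # interpolated score with a two-word context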
Example 3
class NGramLM:
    def __init__(self, n: int = 3):
        self.n = n
        self.model = KneserNeyInterpolated(n)
        self.model_path = LM_PATH / f"{n}_gram.model"

    def train(self, dataset: NGramDataset, save=True):
        self.model.fit(dataset.train, dataset.vocab)
        if save:
            with self.model_path.open("wb") as fout:
                pickle.dump(self.model, fout)
            logging.info(f"ngram model saved at: {self.model_path}")

    def load_model(self, path: Path = None):
        if path:
            if path.is_file():
                self.model_path = path
            else:
                logging.error(f"ngram model doesn't exist at: {path}")
        with self.model_path.open("rb") as fin:
            self.model = pickle.load(fin)
        # keep self.n in sync with the loaded model
        self.n = self.model.order

    def evaluate_sentence(self, sentence: str):
        """check the prob. of `sentence`"""
        tokens = NGramDataset.preprocess_sentence(sentence, self.n)
        ngrams = list(get_ngrams(tokens, self.n))
        total_logscore = 0
        for ngram in ngrams:
            cur_logscore = self.model.logscore(ngram[-1], ngram[:-1])
            total_logscore += cur_logscore
        # logscore is log base 2 in NLTK, so invert with 2 ** (...) rather than math.exp
        return 2 ** total_logscore

    def predict_next(self):
        pass
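# Note on evaluate_sentence: NLTK's logscore is a base-2 logarithm, so the summed
# log-scores are inverted with 2 ** total_logscore rather than math.exp. A tiny
# self-contained check (MLE is used only to keep the sketch short; the point is the
# log base, not the smoother):
import math
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline

train, vocab = padded_everygram_pipeline(2, [["a", "b"]])
lm = MLE(2)
lm.fit(train, vocab)

# logscore(w, ctx) == log2(score(w, ctx)), so 2 ** logscore recovers the probability
assert math.isclose(2 ** lm.logscore("b", ["a"]), lm.score("b", ["a"]))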
Example 4
    def build_ngram_lm(self, train):
        if not train:
            return None

        n = 5  # up to 5 gram language model
        train, vocab = padded_everygram_pipeline(n, train)
        model = KneserNeyInterpolated(n)
        model.fit(train, vocab)
        return model
Example 5
class TestKneserNeyInterpolatedTrigram(metaclass=ParametrizedTests):
    @classmethod
    def setup_method(self):
        vocab, training_text = _prepare_test_data(3)
        self.model = KneserNeyInterpolated(3, discount=0.75, vocabulary=vocab)
        self.model.fit(training_text)

    score_tests = [
        # P(c) = count('*c') / unique('**')
        #      = 1 / 14
        ("c", None, 1.0 / 14),
        # P(z) = count('*z') / unique('**')
        #      = 0 / 14
        # 'z' is in the vocabulary, but it was not seen during training.
        ("z", None, 0.0 / 14),
        # P(y)
        # Out of vocabulary should use "UNK" score.
        # P(y) = P(UNK) = count('*UNK') / unique('**')
        ("y", None, 3 / 14),
        # We start with P(c|b)
        # P(c|b) = alpha('bc') + gamma('b') * P(c)
        # alpha('bc') = max(unique('*bc') - discount, 0) / unique('*b*')
        #             = max(1 - 0.75, 0) / 2
        #             = 0.125
        # gamma('b')  = discount * unique('b*') / unique('*b*')
        #             = (0.75 * 2) / 2
        #             = 0.75
        ("c", ["b"], (0.125 + 0.75 * (1 / 14))),
        # Building on that, let's try P(c|ab).
        # P(c|ab) = alpha('abc') + gamma('ab') * P(c|b)
        # alpha('abc') = max(count('abc') - discount, 0) / count('ab*')
        #              = max(1 - 0.75, 0) / 1
        #              = 0.25
        # gamma('ab')  = (discount * unique('ab*')) / count('ab*')
        #              = 0.75 * 1 / 1
        ("c", ["a", "b"], 0.25 + 0.75 * (0.125 + 0.75 * (1 / 14))),
        # P(c|zb)
        # The ngram 'zbc' was not seen, so we use P(c|b). See issue #2332.
        ("c", ["z", "b"], (0.125 + 0.75 * (1 / 14))),
    ]
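# The continuation counts used in the comments above (unique('*c'), unique('**'),
# unique('*b*'), ...) can all be read off the fitted model's bigram counter. A hedged
# sketch on a hypothetical toy corpus (not _prepare_test_data, so the concrete numbers
# differ from 1/14 etc.):
from nltk.lm import KneserNeyInterpolated
from nltk.lm.preprocessing import padded_everygram_pipeline

text = [["a", "b", "c"], ["a", "c", "d", "c"]]  # made-up training text
train, vocab = padded_everygram_pipeline(3, text)
model = KneserNeyInterpolated(3, discount=0.75)
model.fit(train, vocab)

bigrams = model.counts[2]  # ConditionalFreqDist: context -> counts of continuations
unique_bigrams = sum(len(bigrams[ctx]) for ctx in bigrams.conditions())        # unique('**')
ending_in_c = sum(1 for ctx in bigrams.conditions() if bigrams[ctx]["c"] > 0)  # unique('*c')
print(ending_in_c, unique_bigrams)  # numerator and denominator of the unigram P(c)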
Example 6
class TestKneserNeyInterpolatedTrigram(metaclass=ParametrizedTests):
    @classmethod
    def setup_method(self):
        vocab, training_text = _prepare_test_data(3)
        self.model = KneserNeyInterpolated(3, discount=0.75, vocabulary=vocab)
        self.model.fit(training_text)

    score_tests = [
        # # of bigrams ending with c = 1
        # total # of unique bigrams = 14
        ("c", None, 1.0 / 14),
        # in vocabulary but unseen
        # # of bigrams ending with z = 0
        ("z", None, 0.0 / 14),
        # out of vocabulary should use "UNK" score
        # # of bigrams ending with <UNK> = 3
        ("y", None, 3 / 14),
        # alpha = max(count('bc') - discount,0)/# of bigrams starting 'b'
        # = (1 - 0.75)/2 = 0.125
        # gamma(['b']) = (discount * number of unique continuations after ['b'])/ # of bigrams starting 'b'
        # = (0.75 * 2)/2 = 0.75
        # the final should be: (alpha + gamma * unigram_score("c"))
        ("c", ["b"], (0.125 + 0.75 * (1 / 14))),
        # building on that, let's try 'a b c' as the trigram
        # alpha = max(count('abc') - discount,0)/# of trigrams starting "ab"
        # = max(1 - 0.75, 0)/1 = 0.25
        # gamma(['a', 'b']) = (discount * number of unique continuations after 'ab')/ # of trigrams starting with 'ab'
        # = 0.75 * 1/1
        # final: alpha + gamma*(P(c|b))
        # alpha of P(c|b) = max(# of unique trigrams ending in "bc" - discount, 0)/# of unique trigrams with 'b' in the middle
        # = (1 - 0.75)/2 = 0.125
        # gamma of P(c|b) = (discount * # of unique continuations after 'b')/# of unique trigrams with 'b' in the middle
        # = 0.75 * 2/2
        ("c", ["a", "b"], 0.25 + 0.75 * (0.125 + 0.75 * (1 / 14))),
        # The ngram 'z b c' was not seen, so we should simply revert to
        # the score of the ngram 'b c'. See issue #2332.
        ("c", ["z", "b"], (0.125 + 0.75 * (1 / 14))),
    ]
Example 7
def trigram_model(tokenized_text, test_sentences, sentence_count):
    n = 3
    average_perplexity = 0.0
    train_data, padded_vocab = padded_everygram_pipeline(n, tokenized_text)
    model = KneserNeyInterpolated(n)
    model.fit(train_data, padded_vocab)

    tokenized_text = [
        list(map(str.lower, nltk.tokenize.word_tokenize(sent)))
        for sent in test_sentences
    ]

    test_data, _ = padded_everygram_pipeline(n, tokenized_text)

    for test in test_data:
        ngrams = list(test)
        perplexity = model.perplexity(ngrams)
        if perplexity != float('inf'):
            average_perplexity += perplexity

    average_perplexity /= sentence_count
    print(
        f"Average Perplexity for Trigram model on Test tweets: {round(average_perplexity, 4)}"
    )
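# Hypothetical invocation of trigram_model; the token lists and test tweets below are
# made up, and the real inputs come from the surrounding script (word_tokenize also
# needs NLTK's punkt data to be installed).
train_tokens = [["good", "morning", "twitter"], ["good", "night", "twitter"]]
test_tweets = ["good morning twitter", "good night everyone"]
trigram_model(train_tokens, test_tweets, sentence_count=len(test_tweets))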
Example 8
        temp = word_tokenize(sent)
        for idx, word in enumerate(temp):
            if word not in vnword:
                temp[idx] = 'unknown'
        result.append(temp)
    print('tokenize done')
    return result


if __name__ == '__main__':
    arg = get_arg()

    # get train data and tokenize
    with open(arg.doc_dir, 'r', encoding='utf-8') as fin:
        doc = fin.readlines()
    corpus = tokenize(doc)
    del doc

    vi_model = KneserNeyInterpolated(arg.ngram)
    train_data, padded_sent = padded_everygram_pipeline(arg.ngram, corpus)
    del corpus
    start_time = time.time()
    vi_model.fit(train_data, padded_sent)
    print('train %s-gram model in %d s' %
          (arg.ngram, time.time() - start_time))
    print('length of vocab = %s' % (len(vi_model.vocab)))

    with open(arg.model_dir, 'wb') as fout:
        pickle.dump(vi_model, fout)
    print('save model successfully!')
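# A hedged counterpart to the training script above: reload the pickled model and
# query it. arg.model_dir and the 'unknown' placeholder are the names the script
# itself uses.
with open(arg.model_dir, 'rb') as fin:
    vi_model = pickle.load(fin)

print(len(vi_model.vocab))        # same vocabulary size reported after training
print(vi_model.score('unknown'))  # unigram score of the OOV placeholder token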
Example 9
class SynonymParaphraser:
    def __init__(self, model=None, ngram=3):
        stanza.download('sv')  # download Swedish model
        self.nlp = stanza.Pipeline('sv')  # initialize Swedish neural pipeline
        self.base_url = 'https://www.synonymer.se/sv-syn/'

        # Build Language Model from corpus
        if model is None:
            with open('kneyser_lm.pkl', 'rb') as f:
                self.model = pickle.load(f)

        else:
            self.model = KneserNeyInterpolated(ngram)
            # assumption: `model` holds the path to a plain-text corpus file
            sentences = np.loadtxt(model, dtype='U', delimiter='\n')
            text = [
                list(map(str.lower, word_tokenize(sent))) for sent in sentences
            ]
            train_data, padded_sents = padded_everygram_pipeline(ngram, text)
            self.model.fit(train_data, padded_sents)

    def generate_paraphrases(self, source_file):
        # Read data and make a copy to store edited paraphrases
        source_data = pd.read_csv(source_file)['question1']
        paraphrases = source_data.copy()

        for i in range(1688, len(source_data)):
            # Clean the source sentence and generate its dependency parse tree
            source_data[i] = clean_str(source_data[i])
            doc = self.nlp(source_data[i])
            print(doc)

            # Iterate all words to find potential words to replace with synonyms
            candidate_words = []
            for word in doc.sentences[0].words:
                if word.upos in ["ADJ", "ADV", "NOUN", "VERB"] and word.feats:
                    candidate_word = {
                        'word': word.text,
                        'index': word.id - 1,
                        'POS': word.upos
                    }
                    valid_candidate = True
                    features = [
                        feature.split('=') for feature in word.feats.split('|')
                    ]
                    for feature in features:
                        if feature[0] == 'VerbForm' and feature[1] == 'Part':
                            valid_candidate = False
                            break
                        candidate_word[feature[0]] = feature[1]
                    if valid_candidate:
                        candidate_words.append(candidate_word)

            replacements = 0
            best_candidate = {'word': '', 'index': 0, 'diff': -np.inf}
            for j, candidate in enumerate(candidate_words):
                candidate_synonyms = self.get_synonyms(candidate['word'])

                if candidate_synonyms is None:
                    continue
                original = (candidate['word'],
                            self.get_score(candidate['word'],
                                           candidate['index'], source_data[i]))
                best_synonym = original
                synonym_count = 0
                for synonym in candidate_synonyms:
                    synonym = self.get_inflection(candidate, synonym)
                    if synonym is None:
                        continue
                    synonym_count += 1
                    # Calculate score for the synonym and compare to the current best
                    score = self.get_score(synonym, candidate['index'],
                                           source_data[i])
                    if score > best_synonym[1]:
                        best_synonym = (synonym, score)

                    diff = score - original[1]

                    if best_candidate['diff'] < diff:
                        best_candidate['word'] = synonym
                        best_candidate['index'] = candidate['index']
                        best_candidate['diff'] = diff
                        print(
                            f'New best candidate: {synonym} with score {diff}')

                # Build paraphrase sentence
                if best_synonym[0] != candidate['word']:
                    new_sentence = ''
                    for (k, w) in enumerate(source_data[i].split()):
                        if k == candidate['index'] and best_synonym[0] != w:
                            new_sentence += best_synonym[0]
                            replacements += 1
                            print(f'Replaced word {w} with {best_synonym[0]}')
                        else:
                            new_sentence += w
                        if k < len(doc.sentences[0].words) - 1:
                            new_sentence += ' '
                    source_data[i] = new_sentence

            # Ensure at least one word is replaced with a synonym
            if replacements == 0 and best_candidate['word'] != '':
                print(best_candidate.items())
                new_sentence = ''
                for (k, w) in enumerate(source_data[i].split()):
                    if k == best_candidate['index']:
                        new_sentence += best_candidate['word']
                    else:
                        new_sentence += w
                    if k < len(doc.sentences[0].words) - 1:
                        new_sentence += ' '
                source_data[i] = new_sentence

            print(f'{i} sentences done')
            print(source_data[i])
            print(paraphrases[i])
            print('\n')
            with open('synonym_samples_final.txt', 'a') as f:
                f.write(source_data[i] + '\n')

        return source_data

    def get_inflection(self, word, synonym):
        pos = POS[word['POS']]
        url = f"https://ws.spraakbanken.gu.se/ws/karp/v4/query?q=extended||and|pos|equals|{pos}||and|wf|equals|{synonym}&resource=saldom"
        response = requests.get(url).json()['hits']

        if response['total'] == 0:
            return None

        msd = self.word_grammar(word)
        for i in range(len(response['hits'])):
            if response['hits'][i]['_source']['FormRepresentations'][0][
                    'baseform'] in synonym:
                word_forms = response['hits'][i]['_source']['WordForms']

                for j in range(len(word_forms)):
                    if word_forms[j]['msd'] == msd:
                        if word['POS'] == 'NOUN' and 'Gender' in word.keys():
                            inherent = 'n' if word['Gender'] == 'Neut' else 'u'
                            if inherent != response['hits'][i]['_source'][
                                    'FormRepresentations'][0]['inherent']:
                                return None
                        return word_forms[j]['writtenForm']

    def get_synonyms(self, word):
        synonyms = set()

        url = self.base_url + word
        html_doc = requests.get(url).text
        soup = BeautifulSoup(html_doc, 'html.parser')
        soup = soup.find("div", {"id": "dict-default"})
        if soup is None:
            return None
        else:
            soup = soup.find("div", {"body"}).ul
        for synset in soup.find_all('li'):
            for syns in synset.find_all('ol', class_=lambda x: not x):
                for synonym in syns.find_all('a'):
                    if len(synonym.text.split()) > 1:
                        continue
                    synonyms.add(synonym.text)
        return synonyms

    def get_score(self, word, j, source_sentence):
        scores = []
        sentence_len = len(source_sentence.split())
        if sentence_len >= 3:
            if j >= 2:
                scores.append(
                    self.model.logscore(
                        word,
                        # assumption: the intended context is the two preceding words
                        source_sentence.split()[(j - 2):j]))
            if j < sentence_len - 2:
                scores.append(
                    self.model.logscore(
                        source_sentence.split()[j + 2],
                        [word, source_sentence.split()[j + 1]]))
            if j >= 1 and j < sentence_len - 1:
                scores.append(
                    self.model.logscore(
                        source_sentence.split()[j - 1],
                        [source_sentence.split()[j + 1], word]))
        else:
            if j == 0:
                scores.append(
                    self.model.logscore(source_sentence.split()[1], [word]))
            else:
                scores.append(
                    self.model.logscore(word, [source_sentence.split()[0]]))
        score = sum(scores) / len(scores)
        return score

    def word_grammar(self, word):
        grammar = None
        if word['POS'] == 'ADJ':
            if 'Degree' not in word:
                return None
            if word['Degree'] == 'Pos':
                grammar = 'pos'
            elif word['Degree'] == 'Cmp':
                grammar = 'komp'
                if 'Case' in word.keys() and word['Case'] == 'Nom':
                    grammar = grammar + ' nom'
                else:
                    grammar = grammar + ' gen'
                return grammar
            elif word['Degree'] == 'Sup':
                grammar = 'super'
                if 'Case' in word.keys() and word['Case'] == 'Nom':
                    grammar = grammar + ' nom'
                else:
                    grammar = grammar + ' gen'
                return grammar

            if 'Definite' not in word:
                return None
            if word['Definite'] == 'Ind':
                grammar = grammar + ' indef'
            elif word['Definite'] == 'Def':
                grammar = grammar + ' def'

            if 'Number' in word.keys():
                if word['Number'] == 'Sing':
                    grammar = grammar + ' sg'
                elif word['Number'] == 'Plur':
                    grammar = grammar + ' pl'

            if 'Gender' in word.keys() and word['Gender'] == 'Neut':
                grammar = grammar + ' n nom'
            else:
                grammar = grammar + ' u nom'

        elif word['POS'] == 'ADV':
            if 'Degree' not in word:
                return None
            else:
                if word['Degree'] == 'Pos':
                    grammar = 'pos'
                elif word['Degree'] == 'Cmp':
                    grammar = 'komp'
                elif word['Degree'] == 'Sup':
                    grammar = 'super'

        elif word['POS'] == 'VERB':
            if word.get('VerbForm') == 'Inf':
                grammar = 'inf'
            elif word.get('VerbForm') == 'Sup':
                grammar = 'sup'
            elif 'Tense' in word.keys() and word['Tense'] == 'Past':
                grammar = 'pret ind'
            elif word.get('Mood') == 'Ind':
                grammar = 'pres ind'
            elif word.get('Mood') == 'Imp':
                grammar = 'imper'
                return grammar
            if grammar is None:
                return None

            if 'Voice' in word.keys() and word['Voice'] == 'Act':
                grammar = grammar + ' aktiv'
            else:
                grammar = grammar + ' s-form'

        elif word['POS'] == 'NOUN':
            if 'Number' not in word.keys():
                return None
            if word['Number'] == 'Sing':
                grammar = 'sg'
            elif word['Number'] == 'Plur':
                grammar = 'pl'

            if 'Definite' not in word.keys():
                return None
            elif word['Definite'] == 'Ind':
                grammar = grammar + ' indef'
            elif word['Definite'] == 'Def':
                grammar = grammar + ' def'

            if word.get('Case') == 'Gen':
                grammar = grammar + ' gen'
            else:
                grammar = grammar + ' nom'

        return grammar
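# Hypothetical driver for the paraphraser above; 'questions.csv' stands in for a
# Quora-style file with a 'question1' column of Swedish sentences, and the default
# constructor expects a pre-trained 'kneyser_lm.pkl' next to the script.
paraphraser = SynonymParaphraser()  # loads the pickled Kneser-Ney language model
paraphrases = paraphraser.generate_paraphrases('questions.csv')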
Example 10
def kneserney_bigram_model(bigram_training_data, vocabulary):
    model = KneserNeyInterpolated(order=2, vocabulary=vocabulary)
    model.fit(bigram_training_data)
    return model
Example 11
def kneserney_trigram_model(trigram_training_data, vocabulary):
    model = KneserNeyInterpolated(order=3, discount=0.75, vocabulary=vocabulary)
    model.fit(trigram_training_data)
    return model
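# These two builders mirror pytest-style fixtures: they expect training data already
# expanded into padded everygrams plus an explicit Vocabulary. A self-contained sketch
# with a made-up corpus (the resulting scores will not match the test expectations
# earlier in this file):
from nltk.lm import Vocabulary
from nltk.lm.preprocessing import pad_both_ends
from nltk.util import everygrams

sents = [["a", "b", "c", "d"], ["a", "c", "d", "c"]]  # hypothetical corpus
vocab = Vocabulary([w for s in sents for w in s] + ["<s>", "</s>"], unk_cutoff=1)

bigram_data = [list(everygrams(list(pad_both_ends(s, 2)), max_len=2)) for s in sents]
trigram_data = [list(everygrams(list(pad_both_ends(s, 3)), max_len=3)) for s in sents]

print(kneserney_bigram_model(bigram_data, vocab).score("c", ["b"]))
print(kneserney_trigram_model(trigram_data, vocab).score("c", ["a", "b"]))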
print("corpus read")
tokens = nltk.word_tokenize(raw)
sents = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(raw)]

voc = Voc(corpus)
print(voc)
for s in sents:
    for w in s:
        voc.addWord(w)
print(voc)
sents = [[str(SOS_token)] + [str(voc.word2index[w]) for w in s] + [str(EOS_token)] for s in sents]
print(sents[0])

# train, vocab = my_padded_everygram_pipeline(N, sents, left_pad_symbol="SOS", right_pad_symbol="EOS")
train, vocab = my_padded_everygram_pipeline(N, sents, left_pad_symbol="0", right_pad_symbol="0")
print("preprocessing ready")
LM.fit(train, vocab)
print("LM ready")
# LM.generate()
# print("how")
# print(LM.score("How"))
# print("are you")
# print(LM.score("you", ["are"]))

out_file = f"token_KneserNey_{N}_lm.pkl"
with open(out_file, "wb") as f:
    pickle.dump(LM, f)

print(LM.score("6"))
print(LM.score("4", ["8"]))
Example 13
    sentences_strings_ted.extend(sent for sent in re.split('[。?!]', m.groupdict()['postcolon']) if sent)

del input_text_noparens, input_text

sentences_strings_ted = [re.sub(r'[^\w\s]', '', sent) for sent in sentences_strings_ted]
sentences_strings_ted = [re.sub(r'[a-zA-Z0-9]', '', sent) for sent in sentences_strings_ted]
sentences_strings_ted = filter(None, sentences_strings_ted)
data = ' '.join([re.sub(r'\s', '', sent) for sent in sentences_strings_ted]).split(' ')
datax = [' '.join(sent).split(' ') for sent in data]

del sentences_strings_ted, data

# train a 5-gram model
lm = KneserNeyInterpolated(5)
train, vocab = padded_everygram_pipeline(5, datax)
lm.fit(train, vocab)

del train, vocab, datax
# perplexity test
test = '我想带你们体验一下,我们所要实现的“信任”的感觉。'
sent_list = re.sub(r'[^\w\s]', '', test)
sent_list = ','.join(sent_list).split(',')
text = list(ngrams(pad_both_ends(sent_list, 5), 5))

entropy = lm.entropy(text)  # cross-entropy
perplexity = lm.perplexity(text)  # perplexity
print('cross entropy: %f' % entropy, 'perplexity: %f' % perplexity)
# Save the model ... the part below runs out of memory locally; run it on Colaboratory or Kaggle to use Google's servers
joblib.dump(lm, 'panti_gram.pkl')
# In[]
# test the saved model
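# A minimal sketch of that step: reload the joblib dump and recompute perplexity on
# the same padded 5-gram test sequence prepared above.
lm_loaded = joblib.load('panti_gram.pkl')
print(lm_loaded.perplexity(text))  # should match the perplexity printed before saving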