Example #1
def train_LM_model(corpus, model, n, gamma=None, unk_cutoff=2):
    """
    Trains an NLTK n-gram language model of class `model` on the corpus.

    :param corpus: list(list(str)), a tokenized corpus
    :param model: one of (MLE, Lidstone, Laplace)
    :param n: int, the order of the model
    :param gamma: float or None, the gamma parameter (for `model=Lidstone` only). If model=Lidstone, this
    argument must be provided
    :param unk_cutoff: the count threshold below which a word is treated as unknown and replaced by <UNK>
    :return: a trained model
    """
    lm = None
    ngrams, words = padded_everygram_pipeline(n, corpus)
    vocab = Vocabulary(words, unk_cutoff=unk_cutoff)
    if model == "MLE":
        lm = MLE(n, vocabulary=vocab)
        lm.fit(ngrams)
    elif model == "Lidstone":
        if gamma is None:
            raise Exception('Please enter a value for gamma')
        else:
            lm = Lidstone(gamma, order=n, vocabulary=vocab)
            lm.fit(ngrams)
    elif model == "Laplace":
        lm = Laplace(order=n, vocabulary=vocab)
        lm.fit(ngrams)
    else:
        raise Exception('Wrong model in train_LM_model')
    return lm
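A minimal usage sketch for the function above; the toy corpus, the order n=2 and the unk_cutoff value are assumptions for illustration, and the call relies on the usual NLTK imports (MLE, Lidstone, Laplace, Vocabulary from nltk.lm and padded_everygram_pipeline from nltk.lm.preprocessing):

corpus = [["the", "cat", "sat"], ["the", "dog", "sat"]]

# MLE bigram model; unk_cutoff=1 keeps every word in the vocabulary
lm = train_LM_model(corpus, "MLE", n=2, unk_cutoff=1)
print(lm.score("sat", ["cat"]))          # P(sat | cat)

# Lidstone smoothing requires an explicit gamma
lid = train_LM_model(corpus, "Lidstone", n=2, gamma=0.1, unk_cutoff=1)
print(lid.perplexity([("the", "cat"), ("cat", "sat")]))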
Example #2
def create_language_model(doc_ids: List[str], n: int = 3) -> MLE:
    sentences = []

    # Process each doc_id in turn
    for doc_id in doc_ids:
        # Fetch the tokens attached to this doc_id
        all_tokens = datastore.get_annotation(doc_id, "token")

        # Fetch the sentences attached to this doc_id
        # Use find_xs_in_y to keep only the tokens contained in each sentence and append them to sentences
        for sentence in datastore.get_annotation(doc_id, "sentence"):
            tokens = find_xs_in_y(all_tokens, sentence)

            sentences.append(["__BOS__"] +
                             [token['lemma']
                              for token in tokens] + ["__EOS__"])

    # Build the vocabulary
    vocab = Vocabulary([word for sentence in sentences for word in sentence])

    # Build the n-grams: groups of n consecutive words per sentence
    ngram = [ngrams(sentence, n) for sentence in sentences]

    # Build the language model with MLE
    lm = MLE(order=n, vocabulary=vocab)
    lm.fit(ngram)

    return lm
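The function above depends on a project-specific datastore and a find_xs_in_y helper, so the snippet below is only a hypothetical sketch of how the returned model could be queried (the document ids and lemmas are invented):

lm = create_language_model(["doc-001", "doc-002"], n=3)
# Probability of a lemma given the two preceding lemmas; the model was fit
# on trigrams only, so the context must have exactly n-1 items
print(lm.score("cat", ("__BOS__", "the")))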
Example #3
    def __init__(self,
                 training_set: List[str],
                 n_param: int = 3,
                 max_predict=4):
        '''
        Initialize the completions for the test phrase
        '''
        super().__init__()

        # convert sentence to list[words] using tokenizer
        # self.tokenizer = ToktokTokenizer()

        training_ngrams, padded_sentences = padded_everygram_pipeline(
            n_param,
            #list(map(self.tokenizer.tokenize, training_set)),
            list(map(wordpunct_tokenize, training_set)),
        )

        # print(len(training_ngrams))
        # temp = list(training_ngrams)
        # for i in range(10):
        #     print(list(temp[i]))

        self.model_obj = MLE(order=n_param)
        self.model_obj.fit(training_ngrams, padded_sentences)
        print('Vocab length: {}'.format(len(self.model_obj.vocab)))
        print('Counts: ', self.model_obj.counts)

        self.max_predict = max_predict
Example #4
class NGrams(BaseModel):
    '''
    NGram language model based on sentence completion

    '''
    def __init__(self,
                 training_set: List[str],
                 n_param: int = 3,
                 max_predict=4):
        '''
        Initialize the completions for the test phrase
        '''
        super().__init__()

        # convert sentence to list[words] using tokenizer
        # self.tokenizer = ToktokTokenizer()

        training_ngrams, padded_sentences = padded_everygram_pipeline(
            n_param,
            #list(map(self.tokenizer.tokenize, training_set)),
            list(map(wordpunct_tokenize, training_set)),
        )

        # print(len(training_ngrams))
        # temp = list(training_ngrams)
        # for i in range(10):
        #     print(list(temp[i]))

        self.model_obj = MLE(order=n_param)
        self.model_obj.fit(training_ngrams, padded_sentences)
        print('Vocab length: {}'.format(len(self.model_obj.vocab)))
        print('Counts: ', self.model_obj.counts)

        self.max_predict = max_predict

    def train(self, train_sentence: List[str]):
        raise NotImplementedError(
            'Train is not implemented for the NGrams model. Pass the training set in evaluate'
        )

    def complete(self, phrase: str):
        # input_tokens = self.tokenizer.tokenize(phrase.lower())
        input_tokens = wordpunct_tokenize(phrase.lower())

        results = []
        for res_len in range(self.max_predict):
            temp = self.model_obj.generate(res_len + 1, text_seed=input_tokens)

            if type(temp) == str:
                temp = [temp]

            # filter out the stop words
            temp = list(filter(lambda x: x != '</s>', temp))

            if len(temp) == 1:
                results.append(phrase + ' ' + temp[0])
            elif len(temp) > 1:
                results.append(phrase + ' ' + ' '.join(temp))

        return results
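A short usage sketch for the NGrams class; it assumes BaseModel and wordpunct_tokenize are importable in the surrounding project, and the training sentences are invented:

sentences = ["the cat sat on the mat", "the dog sat on the rug"]
model = NGrams(sentences, n_param=3, max_predict=3)
# Returns up to max_predict candidate continuations of the phrase
print(model.complete("the cat sat"))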
Example #5
def train_LM_model(corpus, model, n, gamma=None, unk_cutoff=2):
    """
    Trains an NLTK n-gram language model of class `model` on the corpus.

    :param corpus: list(list(str)), a tokenized corpus
    :param model: one of (MLE, Lidstone, Laplace)
    :param n: int, the order of the model
    :param gamma: float or None, the gamma parameter (for `model=Lidstone` only). If model=Lidstone, this
    argument must be provided
    :param unk_cutoff: the count threshold below which a word is treated as unknown and replaced by <UNK>
    :return: a trained model
    """
    # Flatten the corpus into a single list of words so it can be fed to Vocabulary
    flat_corpus = []
    for sentence in corpus:
        for word in sentence:
            flat_corpus.append(word)

    vocab = Vocabulary(flat_corpus, unk_cutoff=unk_cutoff)

    ngram_corpus = mnm.extract_ngrams(corpus, n)

    if model == MLE:
        model_res = MLE(n, vocabulary=vocab)
    elif model == Lidstone:
        model_res = Lidstone(gamma, n, vocabulary=vocab)
    elif model == Laplace:
        model_res = Laplace(n, vocabulary=vocab)
    else:
        raise ValueError("Wrong model in train_LM_model")
    model_res.fit(ngram_corpus)

    return model_res
Example #6
def train_LM_model(corpus, model, n, gamma=None, unk_cutoff=2):
    """
    Trains an NLTK n-gram language model of class `model` on the corpus.

    :param corpus: list(list(str)), a tokenized corpus
    :param model: one of (MLE, Lidstone, Laplace)
    :param n: int, the order of the model
    :param gamma: float or None, the gamma parameter (for `model=Lidstone` only). If model=Lidstone, this
    argument must be provided
    :param unk_cutoff: the count threshold below which a word is treated as unknown and replaced by <UNK>
    :return: a trained model
    """
    ngrams, words = padded_everygram_pipeline(n, corpus)
    vocab = Vocabulary(words, unk_cutoff=unk_cutoff)

    if model == MLE:
        model = MLE(n, vocab)
        model.fit(ngrams)
    elif model == Lidstone and gamma is not None:
        model = Lidstone(gamma, n, vocab)
        model.fit(ngrams)
    elif model == Laplace:
        model = Laplace(n, vocab)
        model.fit(ngrams)

    return model
Example #7
def create_language_model(doc_ids, N=3):
    sents = []
    for doc_id in doc_ids:
        all_tokens = datastore.get_annotation(doc_id, 'token')
        for sent in datastore.get_annotation(doc_id, 'sentence'):
            tokens = find_xs_in_y(all_tokens, sent)
            sents.append(['__BOS__'] + [token['lemma']
                                        for token in tokens] + ['__EOS__'])
    vocab = Vocabulary([word for sent in sents for word in sent])
    text_ngrams = [ngrams(sent, N) for sent in sents]
    lm = MLE(order=N, vocabulary=vocab)
    lm.fit(text_ngrams)
    return lm
Example #8
def train_MLE_model(corpus, n):
    """
    Trains an NLTK MLE n-gram language model on the corpus.

    :param corpus: list(list(str)), a tokenized corpus
    :param n: the order of the model
    :return: a trained model
    """
    ngrams, words = padded_everygram_pipeline(n, corpus)

    mle_model = MLE(n)
    mle_model.fit(ngrams, words)

    return mle_model
Example #9
def train_MLE_model(corpus, n):
    """
    Trains an NLTK MLE n-gram language model on the corpus.

    :param corpus: list(list(str)), a tokenized corpus
    :param n: the order of the model
    :return: a trained model
    """
    
    ngrams, words = padded_everygram_pipeline(n, corpus)
    vocab = Vocabulary(words, unk_cutoff=1)
    lm = MLE(n, vocabulary=vocab)
    lm.fit(ngrams)
    
    return lm
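Either train_MLE_model variant can be exercised the same way; the toy corpus below is an assumption for illustration:

corpus = [["a", "b", "a"], ["a", "c", "b"]]
lm = train_MLE_model(corpus, 2)
print(lm.counts[["a"]]["b"])   # count of the bigram ("a", "b")
print(lm.score("b", ["a"]))    # P(b | a)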
Example #10
def calc_prob(lm: MLE, lemmas, n: int = 3) -> float:
    probability = 1.0

    for ngram in ngrams(lemmas, n):
        prob = lm.score(lm.vocab.lookup(ngram[-1]),
                        lm.vocab.lookup(ngram[:-1]))

        prob = max(prob, 1e-8)

        probability *= prob

    return probability
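A sketch of how calc_prob might be called, assuming lm was trained with create_language_model above; the lemma sequence is invented, and the 1e-8 floor in the function keeps unseen n-grams from zeroing out the product:

lemmas = ["__BOS__", "the", "cat", "sit", "__EOS__"]
print(calc_prob(lm, lemmas, n=3))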
Example #11
def train_MLE_model(corpus, n):
    """
    Trains an NLTK MLE n-gram language model on the corpus.

    :param corpus: list(list(str)), a tokenized corpus
    :param n: the order of the model
    :return: a trained model
    """
    # Creation of the vocabulary from the given corpus
    flat_corpus = []
    for document in corpus:
        for word in document:
            flat_corpus.append(word)
    vocab = Vocabulary(flat_corpus, unk_cutoff=2)
    # Extraction of the n-grams
    n_grams = mnm.extract_ngrams(corpus, n)
    # Creation and training of the model on the corpus
    model = MLE(n, vocab)
    model.fit(n_grams)
    print("Modèle d'ordre", n, "généré par nltk.lm")
    return model