def test_d2_1_gp():
    global food_corpus, natr_corpus

    food_corpus_tk = train.tokenize_corpus(food_corpus)
    natr_corpus_tk = train.tokenize_corpus(natr_corpus)

    food_train, food_vocab = padded_everygram_pipeline(
        3, food_corpus_tk[:int(0.8 * len(food_corpus_tk))])
    natr_train, natr_vocab = padded_everygram_pipeline(
        3, natr_corpus_tk[:int(0.8 * len(natr_corpus_tk))])

    food_test = sum([['<s>'] + x + ['</s>']
                     for x in food_corpus_tk[int(0.8 * len(food_corpus_tk)):]],
                    [])
    natr_test = sum([['<s>'] + x + ['</s>']
                     for x in natr_corpus_tk[int(0.8 * len(natr_corpus_tk)):]],
                    [])

    food_lm = Laplace(3)
    natr_lm = Laplace(3)

    food_lm.fit(food_train, food_vocab)
    natr_lm.fit(natr_train, natr_vocab)

    eq_(int(evaluate.get_perplexity(food_lm, food_test[:2500])), 7318)
    eq_(int(evaluate.get_perplexity(food_lm, natr_test[:2500])), 7309)
    eq_(int(evaluate.get_perplexity(natr_lm, natr_test[:2500])), 5222)
    eq_(int(evaluate.get_perplexity(natr_lm, food_test[:2500])), 5354)
Example #2
def main(argv):
    """Trains an nltk language model.

    Loads in files of normalized text, partitions them into a train partition
    (3/4 of data) and a test partition (last 1/4 of data). Uses Laplace
    smoothing for unseen ngrams.
    """
    if len(argv) > 1:
        raise app.UsageError("Too many command-line arguments.")

    normalized_data = load_normalized_data(FLAGS.language, FLAGS.data_source,
                                           FLAGS.pass_valid, FLAGS.experiment)
    train_partition, test_partition = partition_data(normalized_data)
    train_ngrams, vocab = padded_everygram_pipeline(2, train_partition)
    test_ngrams, _ = padded_everygram_pipeline(2, test_partition)
    language_model = Laplace(2)
    language_model.fit(train_ngrams, vocab)

    avg_perp, count = compute_avg_perplexity(test_ngrams, language_model)
    print("\n----------------------------\n"
          "Language Model Parameters:\n"
          f"\tLanguage={FLAGS.language}\n"
          f"\tData Sources={FLAGS.data_source}\n"
          f"\tPass Valid={FLAGS.pass_valid}\n"
          f"\tExperiment={FLAGS.experiment}\n"
          "----------------------------\n")
    print(f"Average perplexity across {count} ngrams:\t{avg_perp}")
Example #3
 def test_d2_1_gp(self):
     nltk.download('punkt')
     food_corpus_tk = lab3.tokenize_corpus(self.food_corpus)
     natr_corpus_tk = lab3.tokenize_corpus(self.natr_corpus)
     food_train, food_vocab = padded_everygram_pipeline(
         3, food_corpus_tk[:int(0.8 * len(food_corpus_tk))])
     natr_train, natr_vocab = padded_everygram_pipeline(
         3, natr_corpus_tk[:int(0.8 * len(natr_corpus_tk))])
     food_test = sum(
         [['<s>'] + x + ['</s>']
          for x in food_corpus_tk[int(0.8 * len(food_corpus_tk)):]], [])
     natr_test = sum(
         [['<s>'] + x + ['</s>']
          for x in natr_corpus_tk[int(0.8 * len(natr_corpus_tk)):]], [])
     food_lm = Laplace(3)
     natr_lm = Laplace(3)
     food_lm.fit(food_train, food_vocab)
     natr_lm.fit(natr_train, natr_vocab)
     self.assertEqual(int(lab3.get_perplexity(food_lm, food_test[:2500])),
                      7318)
     self.assertEqual(int(lab3.get_perplexity(food_lm, natr_test[:2500])),
                      7309)
     self.assertEqual(int(lab3.get_perplexity(natr_lm, natr_test[:2500])),
                      5222)
     self.assertEqual(int(lab3.get_perplexity(natr_lm, food_test[:2500])),
                      5354)
Example #4
def calculate_word_ngrams(data):
    text_bigrams, text_unigrams = {}, {}
    for news_type in data.keys():
        all_news_type_texts = []
        for news in data[news_type]:
            all_news_texts = []
            for sent in news:
                all_news_texts.extend(sent)
            all_news_type_texts.append(all_news_texts)
        train_bi, vocab_bi = padded_everygram_pipeline(2, all_news_type_texts)
        text_bigrams[news_type] = {'train': train_bi, 'vocab': vocab_bi}
        train_uni, vocab_uni = padded_everygram_pipeline(1, all_news_type_texts)
        text_unigrams[news_type] = {'train': train_uni, 'vocab': vocab_uni}
    return text_unigrams, text_bigrams
Example #5
    def compute_pp(self, n, tokenized_train, tokenized_test):
        train_data, padded_sents = padded_everygram_pipeline(
            n, tokenized_train)
        test_data, padded_sents = padded_everygram_pipeline(n, tokenized_test)
        model = Laplace(1)
        model.fit(train_data, padded_sents)

        s = 0
        for i, test in enumerate(test_data):
            p = model.perplexity(test)
            s += p

        perplexity = s / (i + 1)
        return perplexity
Example #6
def train_ngram_lm(tokenized_text, models, n=3, a=0.0015, unk_cutoff=10, discount=0.1):
    training_ngrams, padded_sents = padded_everygram_pipeline(n, tokenized_text)
    vocab = Vocabulary(padded_sents, unk_cutoff=unk_cutoff)
    lms = []
    for model in models:
        training_ngrams, padded_sents = padded_everygram_pipeline(n, tokenized_text)
        if model == 'Kneser Ney':
            lm = MKneserNeyInterpolated(order=n, discount=discount, vocabulary=vocab)
        elif model == 'WBI':
            lm = MWittenBellInterpolated(order=n, vocabulary=vocab)
        elif model == 'Lidstone':
            lm = MLidstone(gamma=a, order=n, vocabulary=vocab)
        lm.fit(training_ngrams)
        lms += [lm]
    return lms
Example #7
def train_ngram_model(src_dict: dict, ngram_order=N_GRAM_ORDER):
    print(f"Training {ngram_order}-gram model on train dataset...")
    train_data, padded_sents = padded_everygram_pipeline(
        ngram_order, src_dict["train"])
    model = MLE(ngram_order)
    model.fit(train_data, padded_sents)
    return model
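A hedged usage sketch for train_ngram_model above; the toy src_dict and the held-out check are illustrative assumptions, not part of the source:

toy_src = {
    "train": [["the", "cat", "sat"], ["the", "dog", "sat"]],
    "test": [["the", "cat", "sat"]],
}
lm = train_ngram_model(toy_src, ngram_order=2)
test_ngrams, _ = padded_everygram_pipeline(2, toy_src["test"])
for sent_grams in test_ngrams:
    # Finite here because every test ngram occurs in training; MLE gives inf otherwise.
    print(lm.perplexity(list(sent_grams)))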
Example #8
    def fit(self, dirty_df: pd.DataFrame, col):
        tokenized_text = [
            word_tokenize(value) for value in dirty_df[col].values
        ]

        train_data, padded_sents = padded_everygram_pipeline(2, tokenized_text)
        self.model.fit(train_data, padded_sents)
Example #9
    def __init__(self,
                 training_set: List[str],
                 n_param: int = 3,
                 max_predict=4):
        '''
        Initialize the completions for the test phrase.
        '''
        super().__init__()

        # convert sentence to list[words] using tokenizer
        # self.tokenizer = ToktokTokenizer()

        training_ngrams, padded_sentences = padded_everygram_pipeline(
            n_param,
            #list(map(self.tokenizer.tokenize, training_set)),
            list(map(wordpunct_tokenize, training_set)),
        )

        # print(len(training_ngrams))
        # temp = list(training_ngrams)
        # for i in range(10):
        #     print(list(temp[i]))

        self.model_obj = MLE(order=n_param)
        self.model_obj.fit(training_ngrams, padded_sentences)
        print('Vocab length: {}'.format(len(self.model_obj.vocab)))
        print('Counts: ', self.model_obj.counts)

        self.max_predict = max_predict
Example #10
def generate_sentence(LM3_MLE, text):
    min_per = 10 ** 22  # large sentinel for "best perplexity so far"
    min_text = ""
    for i in range(5):
        starting_text = ["<s>", text]
        generated = generate_sent(LM3_MLE, starting_text)
        test_tokenized_text = [list(map(str.lower, word_tokenize(sent))) for sent in sent_tokenize(generated)]
        test_data, _ = padded_everygram_pipeline(LM3_MLE.order, test_tokenized_text)

        sentences = []
        for test in test_data:
            sentences.extend(test)

        # keep only full-order ngrams that neither start nor end with a padding symbol
        ngram_list = [gram for gram in sentences
                      if len(gram) == LM3_MLE.order and gram[0] != "<s>" and gram[-1] != "</s>"]

        if len(ngram_list) > 0:
            perplexity = LM3_MLE.perplexity(ngram_list)
            if perplexity < min_per:
                min_per = perplexity
                min_text = generated
            elif perplexity == min_per and len(generated) > len(min_text):
                min_text = generated
    return text + " " + min_text, min_per
Example #11
def train_LM_model(corpus, model, n, gamma=None, unk_cutoff=1):
    """
    Entraîne un modèle de langue n-gramme NLTK de la classe `model` sur le corpus.

    :param corpus: list(list(str)), un corpus tokenizé
    :param model: un des éléments de (MLE, Lidstone, Laplace)
    :param n: int, l'ordre du modèle
    :param gamma: float or None, le paramètre gamma (pour `model=Lidstone` uniquement). Si model=Lidstone, alors cet
    argument doit être renseigné
    :param unk_cutoff: le seuil au-dessous duquel un mot est considéré comme inconnu et remplacé par <UNK>
    :return: un modèle entraîné
    """

    train, words = padded_everygram_pipeline(n, corpus.copy())
    vocab = Vocabulary(words, unk_cutoff)

    if (model == Lidstone) and (gamma is not None):
        model = Lidstone(gamma, n, vocab)
        model.fit(train)
    elif model == MLE:
        model = mle.train_MLE_model(corpus, n)
    elif model == Laplace:
        model = Laplace(n, vocab)
        model.fit(train)

    return model
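An illustrative call for the function above (the toy corpus and the parameter values are assumptions, not from the source):

toy_corpus = [["the", "cat", "sleeps"], ["the", "dog", "sleeps"]]
lidstone_lm = train_LM_model(toy_corpus, Lidstone, n=2, gamma=0.1)
print(lidstone_lm.score("sleeps", ["cat"]))        # P(sleeps | cat) with add-gamma smoothing
print(lidstone_lm.perplexity([("cat", "sleeps")]))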
Example #12
def calculate_characters_ngrams(data):
    text_bigrams, text_unigrams = {}, {}
    for news_type in data.keys():
        all_news_type_texts = []
        for news in data[news_type]:
            all_news_texts = []
            for sent in news:
                for word in sent:
                    all_chars = [c for c in word]+[' ']
                    all_news_texts.extend(all_chars)
            all_news_type_texts.append(all_news_texts)
        train_bi, vocab_bi = padded_everygram_pipeline(2, all_news_type_texts)
        text_bigrams[news_type] = {'train': train_bi, 'vocab': vocab_bi}
        train_uni, vocab_uni = padded_everygram_pipeline(1, all_news_type_texts)
        text_unigrams[news_type] = {'train': train_uni, 'vocab': vocab_uni}
    return text_unigrams, text_bigrams
Example #13
def makeModel():
    #sentences = webtext.raw()+brown.raw()+reuters.raw()
    sentences = webtext.raw() + reuters.raw()
    # Tokenize the sentences
    try:  # Use the default NLTK tokenizer.
        from nltk import word_tokenize, sent_tokenize
        # Testing whether it works.
        # Sometimes it doesn't work on some machines because of setup issues.
        word_tokenize(
            sent_tokenize("This is a foobar sentence. Yes it is.")[0])

    except:  # Use a naive sentence tokenizer and toktok.
        import re
        from nltk.tokenize import ToktokTokenizer
        # See https://stackoverflow.com/a/25736515/610569
        sent_tokenize = lambda x: re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', x)
        # Use the toktok tokenizer that requires no dependencies.
        toktok = ToktokTokenizer()
        word_tokenize = toktok.tokenize

    tokenized_text = [
        list(map(str.lower, word_tokenize(sent)))
        for sent in sent_tokenize(sentences)
    ]

    # Make it ready for building 5-grams
    n = 5
    train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

    model = MLE(n)  # Train a 5-gram MLE model (n is set to 5 above)

    model.fit(train_data, padded_sents)
    #print(model.vocab)

    return model
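A hedged usage sketch for makeModel (assumes the webtext and reuters corpora are downloaded; the seed words and the count queried below are arbitrary choices):

lm = makeModel()
print(len(lm.vocab))                                       # vocabulary size
print(lm.counts[['the']]['market'])                        # bigram count for "the market"
print(lm.generate(10, text_seed=['the'], random_seed=42))  # sample 10 tokens after "the"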
Example #14
def create_LanguageModel(docs, model_type="MLE", ngram=3):
    global _ngram
    _ngram = ngram
    tokenized_text = []

    new_docs = preprocess(docs)

    for d in new_docs:
        text = sent_tokenize(d, language="turkish")
        for sent in text:
            temp = []
            for i in word_tokenize(sent, language="turkish"):
                temp.append(i.lower())
            tokenized_text.append(temp)

    training_ngrams, vocab = padded_everygram_pipeline(ngram, tokenized_text)

    if model_type == "MLE":
        model = MLE(ngram)  #, vocabulary=Vocabulary(vocab))
        model.fit(training_ngrams, vocab)
        # print(model.vocab)
        return model
    elif model_type == "KneserNeyInterpolated":
        model = KneserNeyInterpolated(ngram)
        model.fit(training_ngrams, vocab)  # padded_sents)
        # print(model.vocab)
        return model
    else:
        print("Unkown Model Type")
        return 0
Example #15
 def _parallel_load_genre_to_datadict(self, genre):
     """ DOESN'T WORK """
     scripts = self.all_scripts_for_genre(self.df, genre)
     # print("processing :", genre, len(scripts))
     tokenized = self.tokenize_scripts(scripts, genre)
     ngrams, vocab = padded_everygram_pipeline(self.n, tokenized)
     self.data_dict[genre] = (ngrams, vocab)
Example #16
 def fit(self, sequences: List[List]):
     train, vocab = padded_everygram_pipeline(self.config.GRAM_SIZE, sequences)
     model = MLE(self.config.GRAM_SIZE)
     model.fit(train, vocab)
     self.model = model
     if self.config.SAVE_PATH:
         self.save_model(self.config.SAVE_PATH)
Example #17
def train_LM_model(corpus, model, n, gamma=None, unk_cutoff=2):
    """
    Entraîne un modèle de langue n-gramme NLTK de la classe `model` sur le corpus.

    :param corpus: list(list(str)), un corpus tokenizé
    :param model: un des éléments de (MLE, Lidstone, Laplace)
    :param n: int, l'ordre du modèle
    :param gamma: float or None, le paramètre gamma (pour `model=Lidstone` uniquement). Si model=Lidstone, alors cet
    argument doit être renseigné
    :param unk_cutoff: le seuil au-dessous duquel un mot est considéré comme inconnu et remplacé par <UNK>
    :return: un modèle entraîné
    """
    lm = None
    ngrams, words = padded_everygram_pipeline(n, corpus)
    vocab = Vocabulary(words, unk_cutoff=unk_cutoff)
    if model == MLE:
        lm = model(n, vocabulary=vocab)
        lm.fit(ngrams)
    elif model == Lidstone:
        if gamma is None:
            raise Exception('Please enter a value for gamma')
        else:
            lm = Lidstone(gamma, order=n, vocabulary=vocab)
            lm.fit(ngrams)
    elif model == Laplace:
        lm = Laplace(order=n, vocabulary=vocab)
        lm.fit(ngrams)
    else:
        raise Exception('Wrong model in train_LM_model')
    return lm
Example #18
def train_LM_model(corpus, model, n, gamma=None, unk_cutoff=2):
    """
    Entraîne un modèle de langue n-gramme NLTK de la classe `model` sur le corpus.

    :param corpus: list(list(str)), un corpus tokenizé
    :param model: un des éléments de (MLE, Lidstone, Laplace)
    :param n: int, l'ordre du modèle
    :param gamma: float or None, le paramètre gamma (pour `model=Lidstone` uniquement). Si model=Lidstone, alors cet
    argument doit être renseigné
    :param unk_cutoff: le seuil au-dessous duquel un mot est considéré comme inconnu et remplacé par <UNK>
    :return: un modèle entraîné
    """
    if model not in [MLE, Laplace, Lidstone]:
        raise TypeError("Unkown model type! supported types: (MLE, Lidstone, Laplace)")

    ngrams, words = padded_everygram_pipeline(n, corpus)
    vocab = Vocabulary(words, unk_cutoff=unk_cutoff)

    params = {
        "order": n,
        "vocabulary": vocab,
    }
    if model == Lidstone:
        params["gamma"] = gamma
    ist_model = model(**params)
    ist_model.fit(ngrams)
    
    return ist_model
Example #19
    def tokenize_text(self, text):
        tokenized_text = [
            list(word_tokenize(sent)) for sent in sent_tokenize(text)
        ]

        train_data, padded_sents = padded_everygram_pipeline(
            NGRAM, tokenized_text)
        return padded_sents
Example #20
    def train_P(self):
        n = 3
        train_data, padded_sents = padded_everygram_pipeline(n, self.X_train)

        language_model = MLE(n)
        language_model.fit(train_data, padded_sents)
        # language_model.vocab is a Vocabulary object, not a callable
        print(language_model.vocab)
        return language_model
Example #21
    def build_ngram_lm(self, train):
        if not train: return None

        n = 5  # up to 5 gram language model
        train, vocab = padded_everygram_pipeline(n, train)
        model = KneserNeyInterpolated(n)
        model.fit(train, vocab)
        return model
Example #22
def train_ngram_lm(dataset, data, ngram=3, gamma=0.5):
    print(f'[!] max {ngram}-gram, Lidstone smoothing with gamma {gamma}')
    train, vocab = padded_everygram_pipeline(ngram, data)
    lm = Lidstone(gamma, ngram)
    lm.fit(train, vocab)
    with open(f'./data/{dataset}/lm.pkl', 'wb') as f:
        pickle.dump(lm, f)
    print(f'[!] ngram language model saved into ./data/{dataset}/lm.pkl')
Example #23
    def score(self, patient_id):

        # txt="Cancer refers to any one of a large number of diseases characterized by the development of abnormal cells that divide uncontrollably and have the ability to infiltrate and destroy normal body tissue. Cancer often has the ability to spread throughout your body. Cancer is the second-leading cause of death in the world."

        # ===================================
        import json
        if False:
            hits = self.es.search_list("patient_id", [patient_id])
            with open("txt", "w") as f:
                json.dump(hits, f)
        else:
            with open("txt") as f:
                hits = json.load(f)

        seq = []
        for hit in hits:
            for page in hit["_source"]["doc_pages"]:
                doc = self.nlp(page["page_contents"])
                seq += [tuple(token.text for token in sent) for sent in doc.sents]

        m_names = list(self.models)
        scores = []

        test_data, _ = padded_everygram_pipeline(2, seq)

        seq = []
        for i, test in enumerate(test_data):
            seq.append(tuple(test))

        for mname in self.models:
            # print(mname)
            # int_scores=[]
            # for i, test in enumerate(test_data):
            #     pp=self.models[mname].perplexity(test)
            #     scores.append(pp)
            #     int_scores.append(pp)
            #
            # print(min(int_scores))
            # scores.append(min(int_scores))

            start = timeit.default_timer()

            # the winning model is the model with the lowest perplexity
            pp = self.models[mname].perplexity(seq)
            scores.append(pp)
            print(mname)
            print(pp)
            print("----")
            print('Time: ', timeit.default_timer() - start)

        m = min(scores)

        pred_class = []
        for idx in range(len(scores)):
            if scores[idx] == m:
                pred_class.append(m_names[idx])

        return pred_class
Example #24
def getEveryModel(n: int, text: List, ngrams):
    """ get mixed-n model """
    lm = MLE(n)

    train, vocab = padded_everygram_pipeline(n, text)

    lm.fit(train, vocab)

    return lm
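An illustrative call for getEveryModel (the toy text is an assumption; the unused ngrams argument is simply passed as None):

toy_text = [["a", "b", "c"], ["a", "c", "b"]]
lm = getEveryModel(2, toy_text, None)
print(lm.score("b", ["a"]))             # P(b | a) under MLE, 0.5 for this toy text
print(lm.perplexity([("a", "b"), ("b", "c")]))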
Example #25
    def create_model_as_dict(self, corpus_name):
        # USAGE: dict(model["vale", "la"])['revancha']
        #                   previous_words    word

        # Create a placeholder for model
        # model = defaultdict(lambda: defaultdict(lambda: 0))

        print('reading corpus')
        reader = PlaintextCorpusReader(CORPUS_DIR, corpus_name)
        print('corpus read')
        train, vocab = padded_everygram_pipeline(self.ngram, reader.sents())
        print('everygram completed')

        model = dict()
        appearences = dict()

        # Each element of list(train) is the list with all the ngrams (1, 2, 3, ...) of each sentence
        i = 1
        print('initializing')
        for everygram in train:
            # print('i:', i)
            # i += 1
            # j = 1
            for gram in everygram:
                # print('j:', j)
                # j += 1  (left commented out: j is never initialized)
                if len(gram) == 1:
                    if gram[0] not in appearences:
                        appearences[gram[0]] = 1
                    else:
                        appearences[gram[0]] += 1
                elif len(gram) == 2:
                    if not gram[0] in model:
                        model[gram[0]] = dict()
                    if gram[1] not in model[gram[0]]:
                        model[gram[0]][gram[1]] = 1
                    else:
                        model[gram[0]][gram[1]] += 1
                elif len(gram) == 3:
                    if not (gram[0], gram[1]) in model:
                        model[(gram[0], gram[1])] = dict()
                    if gram[2] not in model[(gram[0], gram[1])]:
                        model[(gram[0], gram[1])][gram[2]] = 1
                    else:
                        model[(gram[0], gram[1])][gram[2]] += 1

        # print('counted everything', model)
        for w1 in model:
            # print('summing', w1)
            total_count = float(sum(model[w1].values()))
            for w3 in model[w1]:
                # print('computing', w3)
                model[w1][w3] /= total_count

        print('converted counts to probabilities')

        return appearences, model
Example #26
 def test_padded_everygram_pipeline(self):
     expected_train = [[("<s>", ), ("<s>", "a"), ("a", ), ("a", "b"),
                        ("b", ), ("b", "c"), ("c", ), ("c", "</s>"),
                        ("</s>", )]]
     expected_vocab = ["<s>", "a", "b", "c", "</s>"]
     train_data, vocab_data = padded_everygram_pipeline(
         2, [["a", "b", "c"]])
     self.assertEqual([list(sent) for sent in train_data], expected_train)
     self.assertEqual(list(vocab_data), expected_vocab)
Example #27
    def fit(self, text: List[str], order: int):
        self.model = Laplace(order)
        self.order = order
        train_data, padded_sents = padded_everygram_pipeline(order, text)

        print('Fitting n-gram model', file=sys.stderr)
        self.model.fit(train_data, padded_sents)
        print(f'Vocabulary size: {len(self.model.vocab)}', file=sys.stderr)
        return self
Example #28
def create_LanguageModel(Docs, model_type, ngram):
    text = " ".join(Docs)
    text = text.replace("\\n"," ")
    tokenized_text = [list(map(str.lower, word_tokenize(sent))) for sent in sent_tokenize(text)]
    train_data, padded_sents = padded_everygram_pipeline(ngram, tokenized_text)
    model = MLE(ngram)
    if model_type != "MLE":
        model = KneserNeyInterpolated(ngram) 
    model.fit(train_data, padded_sents)
    return model
Example #29
def create_and_fit_model(corpus):
    # Receives a corpus tokenized by sentence and by word.
    train_data, padded_sents = padded_everygram_pipeline(NGRAM, corpus)

    # Create the model
    model = MLE(NGRAM)

    # Fit it to the data
    model.fit(train_data, padded_sents)

    return model
Example #30
 def create_model(self, model_nm):
     self.model = {
         "lidstone": Lidstone(0.5, self.ngram_order),
         "kneserney": KneserNeyInterpolated(self.ngram_order),
         "wittenbell": WittenBellInterpolated(self.ngram_order)
     }[model_nm]
     train, vocab = padded_everygram_pipeline(self.ngram_order, self.text)
     vocab = Vocabulary(vocab, unk_cutoff=2, unk_label="<UNK>")
     print("Creating ngram...")
     self.model.fit(train, vocab)
     print("done")
Example #31
 def test_padded_everygram_pipeline(self):
     expected_train = [
         [
             ("<s>",),
             ("a",),
             ("b",),
             ("c",),
             ("</s>",),
             ("<s>", "a"),
             ("a", "b"),
             ("b", "c"),
             ("c", "</s>"),
         ]
     ]
     expected_vocab = ["<s>", "a", "b", "c", "</s>"]
     train_data, vocab_data = padded_everygram_pipeline(2, [["a", "b", "c"]])
     self.assertEqual([list(sent) for sent in train_data], expected_train)
     self.assertEqual(list(vocab_data), expected_vocab)