def __next__(self):
    # reached end of file
    if self.curr_line + 1 == self.num_lines:
        raise StopIteration
    curr_index = self.curr_index
    curr_line = self.curr_line
    sentences = self.sentences
    # tokenize each sentence lazily, the first time it is visited
    if curr_index == 0:
        sentences[curr_line] = tokenization(sentences[curr_line])
    tok1 = split_token(sentences[curr_line][curr_index])
    tok2 = split_token(sentences[curr_line][curr_index + 1])
    self.curr_index += 1
    # advance to the next sentence once its last token pair has been emitted
    end_of_sentence = False
    if self.curr_index + 1 == len(self.sentences[curr_line]):
        self.curr_index = 0
        self.curr_line += 1
        end_of_sentence = True
    # return (word1, tag1), (word2, tag2), end_of_sentence
    return (tok1[0], tok1[1]), (tok2[0], tok2[1]), end_of_sentence
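# A minimal driving loop for the iterator above. Only __next__ appears in the
# snippet, so the class name TaggedBigramReader and its constructor are
# hypothetical; the loop relies solely on next() and StopIteration.
reader = TaggedBigramReader('train.txt')  # hypothetical constructor
while True:
    try:
        (w1, t1), (w2, t2), end_of_sentence = next(reader)
    except StopIteration:
        break
    # each step yields an adjacent (word, tag) pair plus a boundary flag
    print(w1, t1, '->', w2, t2, end_of_sentence)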
def min_perplexity(self):
    dev = open(dev_file, 'r')
    perplexities = []
    sentences = sentence_segmentation(dev)
    lam_values = [0.1, 0.3, 0.5, 0.7, 0.9]
    for lam in lam_values:
        sigma = 0
        dev_len = 0
        for sen in sentences:
            tokens = tokenization(start_sym + ' ' + sen + ' ' + end_sym)
            x = start_sym
            for y in tokens[1:]:
                if y == '':
                    continue
                y = y.lower()
                inter_pol = self.dev_interpolation(x, y, lam)
                if inter_pol != 0:
                    sigma += log(inter_pol, 2)
                dev_len += 1
                x = y
        perplexities.append((pow(2, (-1 / float(dev_len)) * sigma), lam))
    # returns the (perplexity, lambda) tuple with the lowest perplexity
    return min(perplexities)
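# dev_interpolation is not shown above; a sketch under the usual linear
# interpolation scheme, p(y|x) = lam * p_bigram(y|x) + (1 - lam) * p_unigram(y).
# The attribute names (words, bigrams, total_tokens) mirror the Ngram
# constructor in the training script below, but are otherwise assumptions.
def dev_interpolation(self, x, y, lam):
    count_x = self.words.get(x, 0)
    p_bi = self.bigrams.get((x, y), 0) / float(count_x) if count_x else 0.0
    p_uni = self.words.get(y, 0) / float(self.total_tokens)
    return lam * p_bi + (1 - lam) * p_uni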
import pandas as pd

import preprocess


def testtokenize():
    df = pd.read_csv("testPreprocessfile.csv")
    df1 = preprocess.tokenization(df, "posts")
    df1.to_csv("testPreprocessfile.csv")
    assert df1['title'][0] == ['Any', 'ongoing', 'good', 'deals', 'on', 'XPS', '15']
    assert df1['description'][0] == ['Hello', 'am', 'there']
tags = data['tags']
model_state = data["model_state"]

# rebuild the network and load the trained weights
model = NeuralNetwork(input_size, hidden_size, output_size).to(device)
model.load_state_dict(model_state)
model.eval()

bot_name = "Aapka Dost"
print("Let's chat! (type 'quit' to exit)")
while True:
    # sentence = "do you use credit cards?"
    sentence = input("You: ")
    if sentence == "quit":
        break

    # turn the user input into a bag-of-words tensor of shape (1, vocab_size)
    sentence = tokenization(sentence)
    X = bagofWords(sentence, all_words)
    X = X.reshape(1, X.shape[0])
    X = torch.from_numpy(X).to(device)

    # forward pass: pick the highest-scoring tag
    output = model(X)
    _, predicted = torch.max(output, dim=1)
    tag = tags[predicted.item()]

    # only answer when the softmax confidence is high enough
    probs = torch.softmax(output, dim=1)
    prob = probs[0][predicted.item()]
    if prob.item() > 0.8:
        for intent in intents['intents']:
            if tag == intent["tag"]:
                print(f"{bot_name}: {random.choice(intent['responses'])}")
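# Hypothetical sketch of the bagofWords helper used above; the real
# implementation is not shown. It is assumed to map a tokenized sentence
# onto a fixed-size vector indexed by the sorted vocabulary. float32 is
# chosen so torch.from_numpy yields a tensor the Linear layers accept.
import numpy as np

def bagofWords(tokenized_sentence, all_words):
    bag = np.zeros(len(all_words), dtype=np.float32)
    for idx, w in enumerate(all_words):
        if w in tokenized_sentence:
            bag[idx] = 1.0
    return bag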
from preprocess.make_lower_case import *
from preprocess.eliminate_stop_words import *
from preprocess.replace_negation_words import *
from preprocess.tokenization import *
from preprocess.one_hot_encode import *
from preprocess.embed_200 import *
from preprocess.spellingcheck import *
from preprocess.extract_redundant_words import *

make_lower_case = make_lower_case(0, "make_lower_case", 1)
eliminate_stop_words = eliminate_stop_words(-5, "eliminate_stop_words", 2)
replace_negation_words = replace_negation_words(5, "replace_negation_words", 3)
tokenization = tokenization(0, "tokenization", 4)
one_hot_encode = one_hot_encode(-100, "one_hot_encode", 5)
spellingcheck = spellingcheck(50, "spellingcheck", 6)
embed_200 = embed_200(0, "embed_200", 7)
import json

from model import NeuralNetwork
# tokenization, refineWords, stemmed_output and bagofWords are project
# helpers; their imports are not shown in the original snippet.

with open('intents.json', 'r') as file:
    intents = json.load(file)

# Create a list of all words for the bag-of-words model.
all_words = []
tags = []
matcher = []

# Extracting tags and sentences from intents
for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        w = tokenization(pattern)
        all_words.extend(w)
        matcher.append((w, tag))

all_words = refineWords(all_words)
all_words = stemmed_output(all_words)

# Unique words and tags
all_words = sorted(set(all_words))
tags = sorted(set(tags))

# Creating training data
X_train = []
y_train = []
for (sentence, tag) in matcher:
    # loop body assumed: the original snippet is cut off at this line
    X_train.append(bagofWords(sentence, all_words))
    y_train.append(tags.index(tag))  # class index for the tag label
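# A common next step, assumed and not part of the original snippet: wrap the
# training pairs in a PyTorch Dataset so a DataLoader can batch them.
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset

class ChatDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(np.array(X, dtype=np.float32))
        self.y = torch.from_numpy(np.array(y, dtype=np.int64))

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

loader = DataLoader(ChatDataset(X_train, y_train), batch_size=8, shuffle=True)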
from ngram import Ngram
from preprocess import sentence_segmentation, tokenization
from constants import start_sym, end_sym, train_file

# Parses the train file, builds a dictionary mapping words to occurrence
# counts, and a second dictionary mapping bigrams to occurrence counts.
if __name__ == '__main__':
    train = open(train_file, 'r')
    bigrams = dict()
    words = dict()
    total_tokens = 0
    for sen in sentence_segmentation(train):
        if sen == '':
            continue
        tokens = tokenization("{} {} {}".format(start_sym, sen, end_sym))
        x = tokens[0]
        words[x] = words.get(x, 0) + 1
        for y in tokens[1:]:
            if y == '':
                continue
            y = y.lower()
            bigrams[(x, y)] = bigrams.get((x, y), 0) + 1
            words[y] = words.get(y, 0) + 1
            x = y
        total_tokens += len(tokens)

    training_gram = Ngram(total_tokens, words, bigrams)
    # creates bigram.lm file
    training_gram.create_bigram_lm()
    # creates unigram.lm file
    training_gram.create_unigram_lm()  # call assumed; the snippet ends at the comment
from math import log
from sys import argv

from ngram import Ngram
from preprocess import sentence_segmentation, tokenization
from constants import start_sym
# load_unigram and load_bigram are project helpers; their imports are not
# shown in the original snippet.

if __name__ == '__main__':
    if len(argv) < 4:
        print('Please run with the following format: '
              'python perplexity.py <path_to_bigram_lm> <path_to_unigram_lm> <path_to_test_file>')
        exit()

    # load unigram
    uni = load_unigram(open(argv[2]))
    words = uni['words']
    total_tokens = uni['total_tokens']

    # load bigram
    bigrams = load_bigram(open(argv[1]))
    training_gram = Ngram(total_tokens, words, bigrams)

    # collect the lowercased test tokens
    test_words = []
    sentences = sentence_segmentation(open(argv[3]))
    for sen in sentences:
        tokens = tokenization(start_sym + ' ' + sen + ' ' + end_sym)
        for tok in tokens:
            if tok == '':
                continue
            tok = tok.lower()
            test_words.append(tok)
    test_size = len(test_words)

    # calculating perplexities
    bi_perplexity = 0
    inter_perplexity = 0
    uni_perplexity = 0
    x = start_sym
    # calculate summation
    uni_perplexity += log(training_gram.laplace_unigram(x), 2)
    for y in test_words[1:]:
        bi_perplexity += log(training_gram.laplace_bigram(x, y), 2)
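# Hypothetical sketch of the add-one (Laplace) estimates the Ngram methods
# above are expected to return; the real class is not shown, and these are
# written as methods of it. With log-base-2 sums accumulated as in the loop,
# the perplexity itself would then be 2 ** ((-1.0 / test_size) * summation).
def laplace_unigram(self, w):
    V = len(self.words)  # vocabulary size
    return (self.words.get(w, 0) + 1) / float(self.total_tokens + V)

def laplace_bigram(self, x, y):
    V = len(self.words)
    return (self.bigrams.get((x, y), 0) + 1) / float(self.words.get(x, 0) + V)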