Code Example #1
    def __init__(self, intents_file_path, model_name='data'):
        self.model_name = model_name
        with open(intents_file_path, 'r') as f:
            intents = json.load(f)

        self.all_words = []
        self.tags = []
        self.xy = []
        self.x_train = []
        self.y_train = []
        # loop through each sentence in intents patterns
        for intent in intents['intents']:
            tag = intent['tag']
            self.tags.append(tag)
            for pattern in intent['patterns']:
                # tokenize each word in the sentence
                w = tokenize(pattern)
                # add to our words list
                self.all_words.extend(w)
                # add to xy pair
                self.xy.append((w, tag))

        # stem and lower each word
        ignore_words = ['?', '.', '!']
        self.all_words = [
            stem(w) for w in self.all_words if w not in ignore_words
        ]
        # remove duplicates and sort
        self.all_words = sorted(set(self.all_words))
        self.tags = sorted(set(self.tags))
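Nearly all of the snippets on this page parse an intents file with the same shape: a top-level "intents" list whose entries carry a tag, example patterns, and canned responses. As a hedged illustration only (each project ships its own file), the assumed structure expressed as a Python literal looks roughly like this:

# Assumed intents structure; field names follow how the snippets index the data.
example_intents = {
    "intents": [
        {
            "tag": "greeting",
            "patterns": ["Hi", "Hello", "Hey there"],
            "responses": ["Hello!", "Hi, how can I help you?"]
        },
        {
            "tag": "goodbye",
            "patterns": ["Bye", "See you later"],
            "responses": ["Goodbye!", "Talk to you soon."]
        }
    ]
}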
Code Example #2
def chat():
    
    user_responses = []
    bot_responses = []
    sentence1 = request.form['user_input']
    user_responses.append(sentence1)
    sentence = tokenize(sentence1)
    stemmed_words = [ stem(w) for w in sentence]
    no_of_pizza=0
    order_id = 0
    sts = ['Your food is being prepared', 'Our executive is out for delivery', 'Sorry for the inconvenience, you will get your order within 10 mins']
    order_sts = sts[random.randint(0,2)]
    for w in stemmed_words:
        if w == 'order' or w =='want' or w == 'need':
            for wrd in stemmed_words:
                if re.match('^[0-9]*$', wrd):
                    no_of_pizza = int(wrd)
                    choices = list(range(100))
                    random.shuffle(choices)
                    order_id = choices.pop()
                    sts = ['Your food is being prepared', 'Our executive is out for delivery', 'Sorry for the inconvenience, you will get your order within 10 mins']
                    order_sts = sts[random.randint(0,2)]
                    order_details = {'_id': order_id , 'Address':'none', 'Status': order_sts}
                    collection.insert_one(order_details)
                    bot = "your order has been recorded and your order id is {order_id}, kindly provide us delivery details"
                    return render_template('index.html', user_input=sentence1, bot_response = bot )
        elif w == 'address':
            result = collection.update_one({"_id": order_id}, {"$set": {"Address": sentence1}})
            bot = f"Your delivery details are recorded. Status of your order: {order_sts}"
            return render_template('index.html', user_input=sentence1, bot_response = bot )
        elif w == 'status':
            results = collection.find_one({"_id":order_id})
            if results is None:
                bot = 'No order found with this id'
                return render_template('index.html', user_input=sentence1, bot_response = bot )
            else:
                bot = results.get('Status')
                return render_template('index.html', user_input=sentence1, bot_response = bot )
                
    X = bag_of_words(sentence, all_words)
    X = X.reshape(1, X.shape[0])
    X = torch.from_numpy(X)
    
    output = model(X)
    _, predicted = torch.max(output, dim=1)
    tag = tags[predicted.item()]
    
    probs = torch.softmax(output, dim =1)
    prob = probs[0][predicted.item()] 
    
       
    for intent in intents['intents']:
        if tag == intent['tag']:
            bot = random.choice(intent["responses"])
            return render_template('index.html', user_input=sentence1, bot_response=bot)
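The snippets above (and most below) call tokenize, stem, and bag_of_words helpers that this page never shows; example #3 below imports them from an nltk_utils module. A minimal sketch of what such helpers typically look like, assuming NLTK's word tokenizer and Porter stemmer (the real helpers in each project may differ):

import numpy as np
import nltk
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

def tokenize(sentence):
    # split a sentence into word and punctuation tokens
    return nltk.word_tokenize(sentence)

def stem(word):
    # reduce a word to its lowercase root form
    return stemmer.stem(word.lower())

def bag_of_words(tokenized_sentence, words):
    # 1.0 at position i if words[i] appears in the stemmed sentence, else 0.0
    sentence_words = [stem(w) for w in tokenized_sentence]
    bag = np.zeros(len(words), dtype=np.float32)
    for idx, w in enumerate(words):
        if w in sentence_words:
            bag[idx] = 1.0
    return bag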
Code Example #3
def prepare_train_data(): 
    import numpy as np
    import json
    from nltk.corpus import stopwords
    from nltk_utils import tokenize, stem, convert_and_pad

    with open( "intents.json", "r" ) as f:
        data = json.load( f )

    all_words = [ ]
    tags = [ ]
    x_y = [ ]

    for d_point in data[ "intents" ]:
        tag = d_point[ 'tag' ]
        tags.append( tag )
        
        for pattern in d_point[ "patterns" ]:
            word = tokenize( pattern )
            word = [ s_w.lower() for s_w in word ]
            all_words.extend( word )
            x_y.append( (word, tag) )

    signs = [ '!', '?', ',', '.']
    all_words = [ stem( w ) for w in all_words if w not in signs ]
    all_words = sorted( set(all_words) )
    tags = sorted( set(tags) )

    X_train = []
    y_train = []

    for ( pattern_sentence, tag ) in x_y:
        bag_of_words = convert_and_pad( all_words, pattern_sentence )
        X_train.append( bag_of_words )

        label = tags.index( tag )
        y_train.append( float(label) )
    y_train = np.array( y_train )
    
    ## Testing function
    # sents = ['hi', 'hello', 'i', 'you', 'bye', 'thank', 'cool']
    # ss = ['hello', 'how', 'are', 'you']
    # print(convert_and_pad(sents, ss))

    return X_train, y_train, [ all_words, tags, x_y ]
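prepare_train_data() relies on a convert_and_pad helper that is not shown. Inferring only from the call site and the commented-out test above, a hypothetical implementation could map each token to its position in the vocabulary and pad to a fixed length; the real helper may work differently:

def convert_and_pad(vocabulary, tokenized_sentence, pad_length=20):
    # index 0 is reserved for padding and unknown words (assumed convention)
    indices = []
    for word in tokenized_sentence:
        stemmed = stem(word)
        indices.append(vocabulary.index(stemmed) + 1 if stemmed in vocabulary else 0)
    # truncate or right-pad with zeros so every pattern has the same length
    indices = indices[:pad_length]
    indices += [0] * (pad_length - len(indices))
    return indices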
Code Example #4
def process():
    if 'username' in session:
        if request.method == 'POST':
            sentence = request.form['user_input']
            username = session['username']
            print(username)
            tag_result = request.form.get('tags')
            tage_value = str(tag_result)
            question = sentence
            #sentence = tokenize(sentence)

            sentence = tokenize(sentence.lower())
            stopsets = [
                'a', 'an', 'the', 'i', 'you', 'one', 'do', 'of', 'in', 'like',
                'for', 'from', 'to', 'as', 'by', 'about', 'off', 'did', 'am',
                'is', 'are', 'was', 'were', 'if', 'on', 'what', 'when',
                'where', 'which', 'and', 'tell', 'me', 'my', 'must', 'can',
                'could', 'would', 'that', 'or', 'anyone', 'any', 'many',
                'there'
            ]
            stopX = [stem(w) for w in sentence if w not in stopsets]
            print("\n>>>[", stopX)

            #X = bag_of_words(sentence, all_words)
            X = bag_of_words(stopX, all_words)
            X = X.reshape(1, X.shape[0])
            X = torch.from_numpy(X).to(device)
            output = model(X)
            _, predicted = torch.max(output, dim=1)

            tag = tags[predicted.item()]

            probs = torch.softmax(output, dim=1)
            prob = probs[0][predicted.item()]
            if prob.item() > 0.75:
                for intent in intents['intents']:
                    if tag == intent["tag"]:
                        if tag_result == 'Select':
                            return render_template(
                                'index.html',
                                user_input=question,
                                bot_response=random.choice(
                                    intent['responses']),
                                error=
                                'Please select any option from the dropdown')
                        else:
                            # f = open("file.txt", "a")
                            # f.write(tage_value + '\n' + question + '\n' + random.choice(intent['responses']) + '\n')
                            print(username)
                            print(tage_value)
                            print(question)
                            print(random.choice(intent['responses']))
                            chatTable = mongo.db.Chat
                            chatData = list(chatTable.find({'name': username}))  # materialize the cursor so the "if chatData" check below is meaningful
                            all_questions = []
                            all_answers = []
                            all_yes_reviews = []
                            all_no_reviews = []
                            if chatData:
                                for x in chatData:
                                    questions = x['details']['question']
                                    answers = x['details']['answer']
                                    reviews = x['details']['review']
                                    all_questions.append(questions)
                                    all_answers.append(answers)
                                    if reviews == 'Yes':
                                        all_yes_reviews.append(reviews)
                                    elif reviews == 'No':
                                        all_no_reviews.append(reviews)
                                all_questions.reverse()
                                all_answers.reverse()
                                yes_count = len(all_yes_reviews)
                                no_count = len(all_no_reviews)
                                efficiency = (
                                    int(yes_count) /
                                    (int(yes_count) + int(no_count))) * 100
                                efficiency = round(efficiency, 2)

                                return render_template(
                                    'review.html',
                                    message=session['username'],
                                    user_names=username,
                                    tag_value=tage_value,
                                    user_input=question,
                                    bot_response=random.choice(
                                        intent['responses']),
                                    all_history=zip(all_questions,
                                                    all_answers),
                                    yes_count=yes_count,
                                    no_count=no_count,
                                    efficiency=efficiency)
                            else:
                                return render_template(
                                    'review.html',
                                    message=session['username'],
                                    user_names=username,
                                    tag_value=tage_value,
                                    user_input=question,
                                    bot_response=random.choice(
                                        intent['responses']),
                                    all_history=zip(all_questions,
                                                    all_answers))
                        # print(f"{bot_name}: {random.choice(intent['responses'])}")
            else:
                print("hello")
                if tag_result == 'Select':
                    return render_template(
                        'index.html',
                        user_input=question,
                        bot_response="I don't understand",
                        error='Please select any option from the dropdown')
                else:
                    # f = open("file.txt", "a")
                    # f.write(tage_value + '\n' + question + '\n' + 'I donot understand\n')
                    bot_response = web_scraping(question)
                    chatTable = mongo.db.Chat
                    chatData = list(chatTable.find({'name': username}))  # materialize the cursor so the "if chatData" check below is meaningful
                    all_questions = []
                    all_answers = []
                    all_yes_reviews = []
                    all_no_reviews = []
                    if chatData:
                        for x in chatData:
                            questions = x['details']['question']
                            answers = x['details']['answer']
                            reviews = x['details']['review']
                            all_questions.append(questions)
                            all_answers.append(answers)
                            if reviews == 'Yes':
                                all_yes_reviews.append(reviews)
                            elif reviews == 'No':
                                all_no_reviews.append(reviews)
                        all_questions.reverse()
                        all_answers.reverse()
                        yes_count = len(all_yes_reviews)
                        no_count = len(all_no_reviews)
                        efficiency = (int(yes_count) /
                                      (int(yes_count) + int(no_count))) * 100
                        efficiency = round(efficiency, 2)

                        return render_template('review.html',
                                               message=session['username'],
                                               user_names=username,
                                               tag_value=tage_value,
                                               user_input=question,
                                               bot_response=bot_response,
                                               all_history=zip(
                                                   all_questions, all_answers),
                                               yes_count=yes_count,
                                               no_count=no_count,
                                               efficiency=efficiency)
                    else:
                        return render_template('review.html',
                                               message=session['username'],
                                               user_names=username,
                                               tag_value=tage_value,
                                               user_input=question,
                                               bot_response=bot_response,
                                               all_history=zip(
                                                   all_questions, all_answers))
    return render_template('mainindex.html')
Code Example #5
# collect all tags
tags = []

#this will hold all patterns and tags
xy = []

for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        w = tokenize(pattern)
        all_words.extend(w)
        xy.append((w, tag))


all_words = [stem(w) for w in all_words if w not in punctuation]
all_words = sorted(set(all_words))
tags = sorted(set(tags))


X_train = []
y_train = []

for (pattern_sentence, tag) in xy:
    bag = bag_of_words(pattern_sentence, all_words)
    X_train.append(bag)
    
    label = tags.index(tag)
    y_train.append(label)  # class index, not one-hot (CrossEntropyLoss expects class labels)

X_train = np.array(X_train)
Code Example #6
File: train.py, Project: vikaskbm/TestBot
    intents = json.load(f)

# loop through each sentence in our intents patterns
for intent in intents['intents']:
    tag = intent['tag']
    # add to tag list
    tags.append(tag)
    for pattern in intent['patterns']:
        # tokenize each word in the sentence
        w = tokenize(pattern)
        # add to our words list
        all_strings.extend(w)
        # add to ls pair
        ls.append((w, tag))

all_strings = [stem(w) for w in all_strings if w not in ignore_words]
all_strings = sorted(set(all_strings))
tags = sorted(set(tags))

for (pattern_sentence, tag) in ls:
    # X: bag of words for each pattern_sentence
    bag = bag_of_words(pattern_sentence, all_strings)
    x_train.append(bag)
    # y: PyTorch CrossEntropyLoss needs only class labels, not one-hot
    label = tags.index(tag)
    y_train.append(label)

x_train = np.array(x_train)
y_train = np.array(y_train)

Code Example #7
File: train.py, Project: luckydipper/SW_festival
all_words = []
tags = []
xy = []

for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        w = tokenize(pattern)
        all_words.extend(w)
        xy.append((w, tag))

ignore_word = [",", ".", "'", '"', "?", "!", "^", "@", "#", "_",
               "-"]  #we need, regular expression
all_words = [stem(w) for w in all_words
             if w not in ignore_word]  #this is better than using map
all_words = sorted(set(all_words))
tags = sorted(set(tags))  # for order

print(all_words)
# print(tags)
# print(xy)

X_train = []
Y_train = []
for (pattern_sentence, tag) in xy:
    bag = bag_of_words(pattern_sentence, all_words)
    X_train.append(bag)

    label = tags.index(tag)
Code Example #8
File: train.py, Project: yixuanwang/mchacks-backend
        'intents']:  # Iterate through each intent which consists of a tag, pattern of text, bot response
    tags.append(intent['tag'])  # Get all tags
    for pattern in intent['patterns']:  # Iterate through every user input possibility
        tkn = tokenize(pattern)  # tokenize the user input possibilities
        all_words.extend(tkn)  # use extend instead of append so we don't end up with a list of lists
        xy.append((tkn, intent['tag']))  # (x, y) pair with input tokens and their respective tags

to_ignore = [
    '?', '!', '.', ',', 'the', 'is', 'as', 'a', 'are', 'in', 'this', 'that'
]  # Some stop words and useless characters
all_words = sorted(
    set([stem(wrd) for wrd in all_words if wrd not in to_ignore
         ]))  # Apply stemming to all words and remove duplicates
tags = sorted(set(tags))  # In case we add duplicate tags; not necessary but good practice

X_train, y_train = [], []
for (pattern_sentence, tag) in xy:
    bag = bag_of_words(pattern_sentence, all_words)
    X_train.append(bag)
    label = tags.index(tag)
    y_train.append(label)

X_train = np.array(X_train)
y_train = np.array(y_train)

# Hyper-parameters
Code Example #9
File: train.py, Project: AnjalyG/ChatbotExperiment
    intents = json.load(f)

all_words = []
tags = []
xy = []

for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        w = tokenize(pattern)
        all_words.extend(w)
        xy.append((w, tag))

ignore_words = {'?', '!', '.', ','}
all_words = sorted(set([stem(w) for w in all_words if w not in ignore_words]))
tags = sorted(set(tags))

x_train, y_train = [], []

for (pattern_sentence, tag) in xy:
    bag = bag_of_words(pattern_sentence, all_words)
    x_train.append(bag)

    label = tags.index(tag)
    y_train.append(label)  # we are using cross-entropy loss, therefore one-hot encoding is not needed

x_train = np.array(x_train)
y_train = np.array(y_train)
Code Example #10
all_words = []

tags = []

xy = []

for item in intents['intents']:
    tag = item['tag']
    tags.append(tag)
    for pat in item['patterns']:
        w = tokenize(pat)
        all_words.extend(w)
        xy.append((w, tag))

stop_words = ['?', '!', '.', ',']
all_words = [stem(w) for w in all_words if w not in stop_words]
all_words = sorted(set(all_words))

tags = sorted(set(tags))

x_train = []
y_train = []

for (pat_sent, tag) in xy:
    bag = bag_of_words(pat_sent, all_words)
    x_train.append(bag)

    label = tags.index(tag)
    y_train.append(label)  # CrossEntropyLoss

x_train = np.array(x_train)
Code Example #11
all_words = []
tags = []
xy = []  # will hold tokenized patterns and tags

# loop over intents
for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        tokenized_pattern = tokenize(pattern)
        all_words.extend(tokenized_pattern)  # extend with array of words
        xy.append((tokenized_pattern, tag))
ignored_characters = ['?', "!", '.', ',']

# apply stemming to the tokenized all_words list, excluding ignored_characters
all_words = [stem(word) for word in all_words if word not in ignored_characters]
# remove duplicates and sort the list
all_words = sorted(set(all_words))

# create bag of words
x_train = []  # for bag of words
y_train = []  # associated number for each tag

# iterate over (patterns, tags)
for (tokenized_pattern, tag) in xy:
    bag = bag_of_words(tokenized_pattern, all_words)
    x_train.append(bag)

    # get index of each tag (label them) and append to y_train
    label = tags.index(tag)
    y_train.append(label)  # we want only the class labels (cross-entropy loss)
Code Example #12
def main(read_dir="high.json", write_dir="high.pth"):

    base_json_dir = "resource/jsonFile/" + read_dir
    base_pth_dir = "resource/pthFile/" + write_dir

    with open(base_json_dir, "r", encoding="UTF-8") as file:
        intents = json.load(file)

    all_words = []
    tags = []
    xy = []

    for intent in intents['intents']:
        tag = intent['tag']
        tags.append(tag)
        for pattern in intent['patterns']:
            w = tokenize(pattern)
            all_words.extend(w)
            xy.append((w, tag))

    ignore_word = [",", ".", "'", '"', "?", "!", "^", "@", "#", "_", "-",
                   "~"]  #we need, regular expression
    all_words = [stem(w) for w in all_words
                 if w not in ignore_word]  #this is better than using map
    all_words = sorted(set(all_words))
    tags = sorted(set(tags))  # for order

    X_train = []
    Y_train = []
    for (pattern_sentence, tag) in xy:
        bag = bag_of_words(pattern_sentence, all_words)
        X_train.append(bag)

        label = tags.index(tag)
        Y_train.append(label)

    X_train = np.array(X_train)
    Y_train = np.array(Y_train)

    # Hyper-parameters
    num_epochs = 1000
    batch_size = 8
    learning_rate = 0.001
    input_size = len(X_train[0])
    hidden_size = 8
    output_size = len(tags)

    class ChatDataset(Dataset):
        def __init__(self):
            self.n_samples = len(X_train)
            self.x_data = X_train
            self.y_data = Y_train

        # support indexing such that dataset[i] can be used to get i-th sample
        def __getitem__(self, index):
            return self.x_data[index], self.y_data[index]

        # we can call len(dataset) to return the size
        def __len__(self):
            return self.n_samples

    dataset = ChatDataset()
    train_loader = DataLoader(dataset=dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=0)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = NeuralNet(input_size, hidden_size, output_size).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Train the model
    for epoch in range(num_epochs):
        for (words, labels) in train_loader:
            words = words.to(device)
            labels = labels.to(dtype=torch.long).to(device)

            # Forward pass
            outputs = model(words)
            # if y were one-hot, we would have to apply
            # labels = torch.max(labels, 1)[1]
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if (epoch + 1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

    print(f'final loss: {loss.item():.4f}')

    data = {
        "model_state": model.state_dict(),
        "input_size": input_size,
        "hidden_size": hidden_size,
        "output_size": output_size,
        "all_words": all_words,
        "tags": tags
    }

    torch.save(data, base_pth_dir)

    print(f'Training complete. Model saved to {base_pth_dir}')
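This training script (and several others below) instantiates NeuralNet(input_size, hidden_size, output_size) without showing the class. A minimal sketch of a compatible model, assuming the common two-hidden-layer feed-forward classifier (each project's model.py may differ):

import torch.nn as nn

class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.l1 = nn.Linear(input_size, hidden_size)
        self.l2 = nn.Linear(hidden_size, hidden_size)
        self.l3 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.relu(self.l1(x))
        out = self.relu(self.l2(out))
        # no softmax here: nn.CrossEntropyLoss applies log-softmax internally
        return self.l3(out)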
Code Example #13
tags = []
xy = []
ignoreWords = ['?', "!", '.', ',']

for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)

    for pattern in intent['patterns']:
        word = tokenize(pattern)
        allWords.extend(word)

        xy.append((word, tag))

# stem the words in the allWords list while disregarding the ignoreWords list
allWords = [stem(word) for word in allWords if word not in ignoreWords]

# remove duplicates and sort the words
allWords = sorted(set(allWords))
tags = sorted(set(tags))

XTrain = []
YTrain = []

for (patternSentence, tag) in xy:
    bag = bagOfWords(patternSentence, allWords)

    XTrain.append(bag)

    label = tags.index(tag)
    # CrossEntropyLoss
Code Example #14
File: train.py, Project: SkuldProgrammer/Terra-AI
def train(num_epochs=500, learning_rate=0.001):
    global intents
    # Read the training data from a JSON file
    with open("intents.json", "r", encoding="UTF-8") as f:
        intents = json.load(f)

    # Will hold every word to tokenize and stem them
    all_words = []

    # Will hold every tag to classify the words
    tags = []

    # Will hold patterns and tags
    xy = []

    # the JSON-file is treated like a dictionary, therefore we have to use a key for the loop
    for intent in intents["intents"]:
        tag = intent["tag"]
        tags.append(tag)
        for pattern in intent["patterns"]:
            w = tokenize(pattern)
            # We don't want nested lists inside all_words, therefore we extend instead of append
            all_words.extend(w)
            # to be able to link the words to the different tags
            xy.append((w, tag))

    # setting up the excluded characters
    ignore_words = ["?", "!", ".", ","]
    all_words = [stem(w) for w in all_words if w not in ignore_words]

    # get an alphabetically sorted list without duplicate words (set removes duplicates)
    all_words = sorted(set(all_words))
    tags = sorted(set(tags))

    X_train = []
    Y_train = []

    for pattern_sentence, tag in xy:
        bag = bag_of_words(pattern_sentence, all_words)
        X_train.append(bag)

        # Get the index of the tag of the tags-list
        label = tags.index(tag)
        Y_train.append(label)  # CrossEntropyLoss

    # Convert the training lists into NumPy arrays
    X_train = np.array(X_train)
    Y_train = np.array(Y_train)

    # Dataset-Class to train it easily
    class ChatDataSet(Dataset):
        def __init__(self):
            self.n_samples = len(X_train)
            self.x_data = X_train
            self.y_data = Y_train

        def __getitem__(self, index):
            return self.x_data[index], self.y_data[index]

        def __len__(self):
            return self.n_samples

    # Hyperparameters
    batch_size = 8
    hidden_size = 80
    output_size = len(tags)
    input_size = len(all_words)

    # Creating a custom data-set to feed into the neural network
    dataset = ChatDataSet()
    train_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

    # Run on the CPU (switch to "cuda" here if a GPU is available)
    device = torch.device("cpu")

    # Defining the model and using it for training
    model = NeuralNet(input_size, hidden_size, output_size).to(device)

    # loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        for words, labels in train_loader:
            words = words.to(device)
            labels = labels.to(device, torch.int64)

            # forward
            outputs = model(words)
            loss = criterion(outputs, labels)

            # backward and optimizer step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        if epoch % 100 == 0:
            print("Epoch " + str(epoch) + " finished! " + f"loss={loss.item():.4}" + "\n " + str(num_epochs - epoch)
                  + " remaining!")

    data = {
        "model_state": model.state_dict(),
        "input_size": input_size,
        "output_size": output_size,
        "hidden_size": hidden_size,
        "all_words": all_words,
        "tags": tags
    }

    FILE = "Terra-Speak.pth"
    torch.save(data, FILE)

    print(f"Training complete! Model named {FILE} saved.")
Code Example #15
xy=[]

for intent in data:
    tag = intent['tags']
    tags.append(tag)
    
    
    for pattern in intent['patterns']:
        w = tokenize(pattern)
        all_words.extend(w)
        xy.append((w,tag))


miss_words = ['?','!','.',',']

all_words = [stem(w) for w in all_words if w not in miss_words]
all_words = sorted(list(set(all_words)))
tags= sorted(tags)
print(tags)

x_train = []
y_train = []

for (pattern_sentence, tag) in xy:
    bag = bag_of_words(pattern_sentence,all_words)
    x_train.append(bag)
    label = tags.index(tag)
    y_train.append(label)
   

x_train = np.array(x_train)
Code Example #16
def train():
    nodes = retrieve('nodes')

    all_words = []
    ids = []
    xy = []
    # loop through each sentence in our node patterns
    for node in nodes:
        # add to id list
        ids.append(node['id'])
        for pattern in node['patterns']:
            # tokenize each word in the sentence
            w = tokenize(pattern)
            # add to our words list
            all_words.extend(w)
            # add to xy pair
            xy.append((w, node['id']))

    # stem and lower each word and remove stop words
    ignore_words = ['?', '.', '!', '(', ')']
    stop_words = retrieve('stop_words')
    all_words = [w for w in all_words if not w.lower() in stop_words]
    all_words = [stem(w) for w in all_words if w not in ignore_words]

    # remove duplicates and sort
    all_words = sorted(set(all_words))
    ids = sorted(set(ids))

    # create training data
    x_train = []
    y_train = []
    for (pattern_sentence, id) in xy:
        # X: bag of words for each pattern_sentence
        bag = bag_of_words(pattern_sentence, all_words)
        x_train.append(bag)

        # y: PyTorch CrossEntropyLoss needs only class labels, not one-hot
        y_train.append(ids.index(id))

    x_train = np.array(x_train)
    y_train = np.array(y_train)

    # Hyper-parameters
    num_epochs = 1000
    batch_size = 8
    learning_rate = 0.001
    input_size = len(x_train[0])
    hidden_size = 8
    output_size = len(ids)

    class ChatDataset(Dataset):
        def __init__(self):
            self.n_samples = len(x_train)
            self.x_data = x_train
            self.y_data = y_train

        # support indexing such that dataset[i] can be used to get i-th sample
        def __getitem__(self, index):
            return self.x_data[index], self.y_data[index]

        # we can call len(dataset) to return the size
        def __len__(self):
            return self.n_samples

    dataset = ChatDataset()
    train_loader = DataLoader(dataset=dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=0)

    device = torch.device('cpu')

    model = NeuralNet(input_size, hidden_size, output_size).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Train the model
    for epoch in range(num_epochs):
        for (words, labels) in train_loader:
            words = words.to(device)
            labels = labels.to(dtype=torch.long).to(device)

            # Forward pass
            outputs = model(words)
            # if y were one-hot, we would have to apply
            # labels = torch.max(labels, 1)[1]
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    data = {
        "model_state": model.state_dict(),
        "input_size": input_size,
        "hidden_size": hidden_size,
        "output_size": output_size,
        "all_words": all_words,
        "ids": ids
    }

    torch.save(data, "data.pth")
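The dictionary saved to data.pth above contains everything needed at inference time. A hedged sketch of how a chat script could load it back and classify a user sentence, mirroring the inference code in examples #2 and #4 (assumed usage; the project's own script may differ):

import torch

data = torch.load("data.pth")
model = NeuralNet(data["input_size"], data["hidden_size"], data["output_size"])
model.load_state_dict(data["model_state"])
model.eval()

def predict_id(sentence, threshold=0.75):
    # build the same bag-of-words representation used during training
    tokens = tokenize(sentence)
    x = torch.from_numpy(bag_of_words(tokens, data["all_words"])).unsqueeze(0)
    with torch.no_grad():
        output = model(x)
    probs = torch.softmax(output, dim=1)
    prob, predicted = torch.max(probs, dim=1)
    # only trust the prediction when the model is reasonably confident
    if prob.item() > threshold:
        return data["ids"][predicted.item()]
    return None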
Code Example #17
inFile = open('intentions.json', 'r')
intents = json.load(inFile)

words = []
tags = []
combo = []

for i in intents['intents']:
    tag = i["tag"]
    tags.append(tag)  # list.append returns None, so keep the tag in its own variable
    for w in i["patterns"]:
        token = makeToken(w)
        words.extend(token)
        combo.append((w, tag))

ignore = ['?', '.', ',', '!']
words = [stem(w) for w in words if w not in ignore]
words = sorted(set(words))
tags = sorted(tags)

train_data1 = []
train_data2 = []
for (pattern, tag) in combo:
    bag = makeBag(pattern, words)
    train_data1.append(bag)

    label = tags.index(tag)
    train_data2.append(label)

np_data1 = np.array(train_data1)
np_data2 = np.array(train_data2)
Code Example #18
# loop through each sentence in our intent patterns
for inten in intenciones['intenciones']:
    etiqueta = inten['etiqueta']
    # add to the tag list
    etiquetas.append(etiqueta)
    for patron in inten['patrones']:
        # tokenize each word in the sentence
        w = tokenize(patron)
        # add to our word list
        todas_las_palabras.extend(w)
        # add to the xy pair
        xy.append((w, etiqueta))

# words to ignore
ignorar_palabras = ['?', '.', '!']
todas_las_palabras = [stem(w) for w in todas_las_palabras if w not in ignorar_palabras]
# remove duplicates and sort
todas_las_palabras = sorted(set(todas_las_palabras))
etiquetas = sorted(set(etiquetas))

print(len(xy), "patterns")
print(len(etiquetas), "tags:", etiquetas)
print(len(todas_las_palabras), "unique stemmed words:", todas_las_palabras)

# create training data
X_cola = []
Y_cola = []
for (patron_de_oracion, etiqueta) in xy:
    # X: bag of words for each pattern sentence
    bolsa = bag_of_words(patron_de_oracion, todas_las_palabras)
    X_cola.append(bolsa)
Code Example #19
]

#Lemmatization-----------------
t_lem = ''
for w in all_words:
    if w not in ignore_words:
        t_lem = t_lem + ' ' + w

lemma = lemmatize(t_lem)
all_words_lemma = []
for w in lemma:
    all_words_lemma.append(w[2])
#------------------------------

#Stem--------------------------
all_words = [stem(w) for w in all_words_lemma if w not in ignore_words]
#------------------------------

all_words = sorted(set(all_words))
tags = sorted(set(tags))

x_train = []
y_train = []
for (pattern_sentence, tag) in xy:
    bag = bag_of_words(pattern_sentence, all_words)
    x_train.append(bag)

    label = tags.index(tag)
    y_train.append(label)

x_train = np.array(x_train)
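The lemmatize helper used above is not defined on this page; from the way the snippet reads w[2], it appears to return one (token, tag, lemma) triple per word. A hypothetical stand-in built on NLTK (the original project may use TreeTagger or another tagger entirely):

import nltk
from nltk.stem import WordNetLemmatizer

_lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    triples = []
    for token, pos in nltk.pos_tag(nltk.word_tokenize(text)):
        # map the Penn Treebank tag to a coarse WordNet POS (default: noun)
        wn_pos = {"J": "a", "V": "v", "R": "r"}.get(pos[0], "n")
        triples.append((token, pos, _lemmatizer.lemmatize(token.lower(), wn_pos)))
    return triples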
Code Example #20
File: train.py, Project: vamsigutta/chatbot-pytorch
with open('intents.json', 'r') as f:
    intents = json.load(f)

tags = []
all_words = []
xy = []
for intent in intents['intents']:
    tags.append(intent['tag'])
    for pattern in intent['patterns']:
        w = tokenize(pattern)
        all_words.extend([word.lower() for word in w])
        xy.append((w,intent['tag']))

ignore_words = string.punctuation
all_words = [stem(word) for word in all_words if word not in ignore_words]
tags = sorted(set(tags))
all_words = sorted(set(all_words))


x_train = []
y_train = []
for (sentences, tag) in xy:
    word_num = bag_of_words(sentences, all_words)
    label = tags.index(tag)
    x_train.append(word_num)
    y_train.append(label)

x_train = np.array(x_train)
y_train = np.array(y_train)
Code Example #21
File: train.py, Project: kevinyu609866/ChatBot
    
word_bank = []
tags = []
xy = []

for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        w = tokenize(pattern)
        word_bank.extend(w)
        xy.append((w, tag))


exclude_punctuation = ['?', '!', '.', ',']
word_bank = [stem(w) for w in word_bank if w not in exclude_punctuation]
word_bank = sorted(set(word_bank))
tags = sorted(set(tags))
print(tags)

X_train = []
Y_train = []

for (pattern_sent, tag) in xy:
    bag = bag_of_words(pattern_sent, word_bank)
    X_train.append(bag)
    
    labels = tags.index(tag)
    Y_train.append(labels)
    
X_train = np.array(X_train)
Code Example #22
with open('intents.json', 'r') as f:
    intents = json.load(f)

all_words = []
tags = []
xy = []
for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        w = tokenize(pattern)
        all_words.extend(w)
        xy.append((w, tag))

ignore_words = ['?', '!', '.', ',']
all_words = [stem(w) for w in all_words if w not in ignore_words]
all_words = sorted(set(all_words))
tags = sorted(set(tags))

X_train = []
y_train = []
for (pattern_sentence, tag) in xy:
    bag = bag_of_words(pattern_sentence, all_words)
    X_train.append(bag)

    label = tags.index(tag)
    y_train.append(label)

X_train = np.array(X_train)
y_train = np.array(y_train)
Code Example #23
        # tokenize each word in the sentence
        w = tokenize(pattern)
        # add to our words list
        all_words.extend(w)
        # add to xy pair
        xy.append((w, tag))

# stem and lower each word
ignore_words = ['?', '.', '!', '-', ',', '0-9', '(', ')']
stopset = [
    'a', 'an', 'the', 'i', 'you', 'one', 'of', 'in', 'for', 'to', 'by',
    'about', 'off', 'did', 'am', 'is', 'are', 'was', 'were', 'if', 'on',
    'what', 'why', 'when', 'where', 'which', 'and', 'how', 'tell', 'me', 'my',
    'must', 'could', 'that', 'or', 'anyone', 'any', 'many', 'there'
]
# filter punctuation and stop words first, then stem each remaining word once
all_word = [w for w in all_words if w not in ignore_words]
all_words = [stem(w) for w in all_word if w not in stopset]
# remove duplicates and sort
all_words = sorted(set(all_words))
tags = sorted(set(tags))

print(">>", len(xy), "patterns")
print(">>", len(tags), "tags:", tags)
print(len(all_words), "unique stemmed words:", all_words)

# create training data
X_train = []
y_train = []
for (pattern_sentence, tag) in xy: