def __init__(self, intents_file_path, model_name='data'):
    self.model_name = model_name
    with open(intents_file_path, 'r') as f:
        intents = json.load(f)

    self.all_words = []
    self.tags = []
    self.xy = []
    self.x_train = []
    self.y_train = []

    # loop through each sentence in the intents patterns
    for intent in intents['intents']:
        tag = intent['tag']
        self.tags.append(tag)
        for pattern in intent['patterns']:
            # tokenize each word in the sentence
            w = tokenize(pattern)
            # add to our words list
            self.all_words.extend(w)
            # add to xy pair
            self.xy.append((w, tag))

    # stem and lower each word
    ignore_words = ['?', '.', '!']
    self.all_words = [stem(w) for w in self.all_words if w not in ignore_words]

    # remove duplicates and sort
    self.all_words = sorted(set(self.all_words))
    self.tags = sorted(set(self.tags))
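# A minimal sketch of the tokenize/stem helpers that these snippets import (usually
# from an nltk_utils module). NLTK's punkt tokenizer and PorterStemmer are assumptions
# about the implementation, not something the snippets above confirm.
import nltk
from nltk.stem.porter import PorterStemmer

# nltk.download('punkt')  # uncomment on first run if the tokenizer data is missing
stemmer = PorterStemmer()

def tokenize(sentence):
    # split a sentence into word/punctuation tokens
    return nltk.word_tokenize(sentence)

def stem(word):
    # reduce a word to its lowercase root form, e.g. "Organizes" -> "organ"
    return stemmer.stem(word.lower())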
def chat():
    user_responses = []
    bot_responses = []
    sentence1 = request.form['user_input']
    user_responses.append(sentence1)
    sentence = tokenize(sentence1)
    stemmed_words = [stem(w) for w in sentence]
    no_of_pizza = 0
    order_id = 0
    sts = ['Your food is being prepared',
           'Our executive is out for delivery',
           'Sorry for the inconvenience.. you will get your order within 10 mins']
    order_sts = sts[random.randint(0, 2)]

    for w in stemmed_words:
        if w == 'order' or w == 'want' or w == 'need':
            for wrd in stemmed_words:
                if re.match('^[0-9]+$', wrd):
                    no_of_pizza = int(wrd)
            choices = list(range(100))
            random.shuffle(choices)
            order_id = choices.pop()
            order_sts = sts[random.randint(0, 2)]
            order_details = {'_id': order_id, 'Address': 'none', 'Status': order_sts}
            collection.insert_one(order_details)
            bot = f"Your order has been recorded and your order id is {order_id}, kindly provide us delivery details"
            return render_template('index.html', user_input=sentence1, bot_response=bot)
        elif w == 'address':
            result = collection.update_one({"_id": order_id}, {"$set": {"Address": sentence1}})
            bot = f"Your delivery details are recorded. Status of your order: {order_sts}"
            return render_template('index.html', user_input=sentence1, bot_response=bot)
        elif w == 'status':
            results = collection.find_one({"_id": order_id})
            if results is None:
                bot = 'No order found with this id'
                return render_template('index.html', user_input=sentence1, bot_response=bot)
            else:
                bot = results.get('Status')
                return render_template('index.html', user_input=sentence1, bot_response=bot)

    X = bag_of_words(sentence, all_words)
    X = X.reshape(1, X.shape[0])
    X = torch.from_numpy(X)

    output = model(X)
    _, predicted = torch.max(output, dim=1)
    tag = tags[predicted.item()]
    probs = torch.softmax(output, dim=1)
    prob = probs[0][predicted.item()]

    for intent in intents['intents']:
        if tag == intent['tag']:
            bot = random.choice(intent["responses"])
    return render_template('index.html', user_input=sentence1, bot_response=bot)
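# A minimal sketch of the module-level setup that chat() appears to rely on: the
# trained model, vocabulary, tags, intents, and the MongoDB collection. The file
# name 'data.pth', the NeuralNet class location, and the database/collection names
# are assumptions inferred from how chat() uses them, not confirmed by the source.
import json
import random
import re

import torch
from flask import Flask, request, render_template
from pymongo import MongoClient

from model import NeuralNet                           # assumed module layout
from nltk_utils import tokenize, stem, bag_of_words   # assumed module layout

app = Flask(__name__)

with open('intents.json', 'r') as f:
    intents = json.load(f)

data = torch.load('data.pth')                         # assumed checkpoint name
model = NeuralNet(data['input_size'], data['hidden_size'], data['output_size'])
model.load_state_dict(data['model_state'])
model.eval()
all_words, tags = data['all_words'], data['tags']

client = MongoClient('mongodb://localhost:27017/')    # assumed connection string
collection = client['pizza_bot']['orders']            # assumed db/collection names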
def prepare_train_data():
    import numpy as np
    import json
    from nltk.corpus import stopwords
    from nltk_utils import tokenize, stem, convert_and_pad

    with open("intents.json", "r") as f:
        data = json.load(f)

    all_words = []
    tags = []
    x_y = []
    for d_point in data["intents"]:
        tag = d_point['tag']
        tags.append(tag)
        for pattern in d_point["patterns"]:
            word = tokenize(pattern)
            word = [s_w.lower() for s_w in word]
            all_words.extend(word)
            x_y.append((word, tag))

    signs = ['!', '?', ',', '.']
    all_words = [stem(w) for w in all_words if w not in signs]
    all_words = sorted(set(all_words))
    tags = sorted(set(tags))

    X_train = []
    y_train = []
    for (pattern_sentence, tag) in x_y:
        bag_of_words = convert_and_pad(all_words, pattern_sentence)
        X_train.append(bag_of_words)
        label = tags.index(tag)
        y_train.append(float(label))
    y_train = np.array(y_train)

    # Testing the helper:
    # sents = ['hi', 'hello', 'i', 'you', 'bye', 'thank', 'cool']
    # ss = ['hello', 'how', 'are', 'you']
    # print(convert_and_pad(sents, ss))

    return X_train, y_train, [all_words, tags, x_y]
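# Example call, assuming this function sits next to intents.json; the unpacking
# mirrors the return value above (train features, labels, and vocab/tags/pairs).
X_train, y_train, (all_words, tags, x_y) = prepare_train_data()
print(len(X_train), "training samples,", len(tags), "tags")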
def process():
    if 'username' in session:
        if request.method == 'POST':
            sentence = request.form['user_input']
            username = session['username']
            print(username)
            tag_result = request.form.get('tags')
            tage_value = str(tag_result)
            question = sentence
            # sentence = tokenize(sentence)
            sentence = tokenize(sentence.lower())
            stopsets = [
                'a', 'an', 'the', 'i', 'you', 'one', 'do', 'of', 'in', 'like',
                'for', 'from', 'to', 'as', 'by', 'about', 'off', 'did', 'am',
                'is', 'are', 'was', 'were', 'if', 'on', 'what', 'when', 'where',
                'which', 'and', 'tell', 'me', 'my', 'must', 'can', 'could',
                'would', 'that', 'or', 'anyone', 'any', 'many', 'there'
            ]
            stopX = [stem(w) for w in sentence if w not in stopsets]
            print("\n>>>[", stopX)

            # X = bag_of_words(sentence, all_words)
            X = bag_of_words(stopX, all_words)
            X = X.reshape(1, X.shape[0])
            X = torch.from_numpy(X).to(device)

            output = model(X)
            _, predicted = torch.max(output, dim=1)
            tag = tags[predicted.item()]
            probs = torch.softmax(output, dim=1)
            prob = probs[0][predicted.item()]

            if prob.item() > 0.75:
                for intent in intents['intents']:
                    if tag == intent["tag"]:
                        if tag_result == 'Select':
                            return render_template(
                                'index.html',
                                user_input=question,
                                bot_response=random.choice(intent['responses']),
                                error='Please select an option from the dropdown')
                        else:
                            # f = open("file.txt", "a")
                            # f.write(tage_value + '\n' + question + '\n' + random.choice(intent['responses']) + '\n')
                            print(username)
                            print(tage_value)
                            print(question)
                            print(random.choice(intent['responses']))

                            chatTable = mongo.db.Chat
                            chatData = chatTable.find({'name': username})
                            all_questions = []
                            all_answers = []
                            all_yes_reviews = []
                            all_no_reviews = []
                            if chatData:
                                for x in chatData:
                                    questions = x['details']['question']
                                    answers = x['details']['answer']
                                    reviews = x['details']['review']
                                    all_questions.append(questions)
                                    all_answers.append(answers)
                                    if reviews == 'Yes':
                                        all_yes_reviews.append(reviews)
                                    elif reviews == 'No':
                                        all_no_reviews.append(reviews)
                                all_questions.reverse()
                                all_answers.reverse()
                                yes_count = len(all_yes_reviews)
                                no_count = len(all_no_reviews)
                                efficiency = (int(yes_count) /
                                              (int(yes_count) + int(no_count))) * 100
                                efficiency = round(efficiency, 2)
                                return render_template(
                                    'review.html',
                                    message=session['username'],
                                    user_names=username,
                                    tag_value=tage_value,
                                    user_input=question,
                                    bot_response=random.choice(intent['responses']),
                                    all_history=zip(all_questions, all_answers),
                                    yes_count=yes_count,
                                    no_count=no_count,
                                    efficiency=efficiency)
                            else:
                                return render_template(
                                    'review.html',
                                    message=session['username'],
                                    user_names=username,
                                    tag_value=tage_value,
                                    user_input=question,
                                    bot_response=random.choice(intent['responses']),
                                    all_history=zip(all_questions, all_answers))
                        # print(f"{bot_name}: {random.choice(intent['responses'])}")
            else:
                print("hello")
                if tag_result == 'Select':
                    return render_template(
                        'index.html',
                        user_input=question,
                        bot_response="I don't understand",
                        error='Please select an option from the dropdown')
                else:
                    # f = open("file.txt", "a")
                    # f.write(tage_value + '\n' + question + '\n' + 'I do not understand\n')
                    bot_response = web_scraping(question)
                    chatTable = mongo.db.Chat
                    chatData = chatTable.find({'name': username})
                    all_questions = []
                    all_answers = []
                    all_yes_reviews = []
                    all_no_reviews = []
                    if chatData:
                        for x in chatData:
                            questions = x['details']['question']
                            answers = x['details']['answer']
                            reviews = x['details']['review']
                            all_questions.append(questions)
                            all_answers.append(answers)
                            if reviews == 'Yes':
                                all_yes_reviews.append(reviews)
                            elif reviews == 'No':
                                all_no_reviews.append(reviews)
                        all_questions.reverse()
                        all_answers.reverse()
                        yes_count = len(all_yes_reviews)
                        no_count = len(all_no_reviews)
                        efficiency = (int(yes_count) /
                                      (int(yes_count) + int(no_count))) * 100
                        efficiency = round(efficiency, 2)
                        return render_template('review.html',
                                               message=session['username'],
                                               user_names=username,
                                               tag_value=tage_value,
                                               user_input=question,
                                               bot_response=bot_response,
                                               all_history=zip(all_questions, all_answers),
                                               yes_count=yes_count,
                                               no_count=no_count,
                                               efficiency=efficiency)
                    else:
                        return render_template('review.html',
                                               message=session['username'],
                                               user_names=username,
                                               tag_value=tage_value,
                                               user_input=question,
                                               bot_response=bot_response,
                                               all_history=zip(all_questions, all_answers))
    return render_template('mainindex.html')
# collect patterns and their tags
tags = []
# this will hold all (pattern, tag) pairs
xy = []

for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        w = tokenize(pattern)
        all_words.extend(w)
        xy.append((w, tag))

all_words = [stem(w) for w in all_words if w not in punctuation]
all_words = sorted(set(all_words))
tags = sorted(set(tags))

X_train = []
y_train = []
for (pattern_sentence, tag) in xy:
    bag = bag_of_words(pattern_sentence, all_words)
    X_train.append(bag)
    label = tags.index(tag)
    y_train.append(label)  # class index only; CrossEntropyLoss needs no one-hot encoding

X_train = np.array(X_train)
intents = json.load(f)

# loop through each sentence in our intents patterns
for intent in intents['intents']:
    tag = intent['tag']
    # add to tag list
    tags.append(tag)
    for pattern in intent['patterns']:
        # tokenize each word in the sentence
        w = tokenize(pattern)
        # add to our words list
        all_strings.extend(w)
        # add to ls pair
        ls.append((w, tag))

all_strings = [stem(w) for w in all_strings if w not in ignore_words]
all_strings = sorted(set(all_strings))
tags = sorted(set(tags))

for (pattern_sentence, tag) in ls:
    # X: bag of words for each pattern_sentence
    bag = bag_of_words(pattern_sentence, all_strings)
    x_train.append(bag)
    # y: PyTorch CrossEntropyLoss needs only class labels, not one-hot
    label = tags.index(tag)
    y_train.append(label)

x_train = np.array(x_train)
y_train = np.array(y_train)
all_words = []
tags = []
xy = []

for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        w = tokenize(pattern)
        all_words.extend(w)
        xy.append((w, tag))

ignore_word = [",", ".", "'", '"', "?", "!", "^", "@", "#", "_", "-"]  # a regular expression would cover these more generally
all_words = [stem(w) for w in all_words if w not in ignore_word]  # a list comprehension reads better than map here
all_words = sorted(set(all_words))
tags = sorted(set(tags))  # sorted for a stable label order
print(all_words)
# print(tags)
# print(xy)

X_train = []
Y_train = []
for (pattern_sentence, tag) in xy:
    bag = bag_of_words(pattern_sentence, all_words)
    X_train.append(bag)
    label = tags.index(tag)
    Y_train.append(label)
for intent in intents['intents']:  # iterate through each intent, which consists of a tag, text patterns, and bot responses
    tags.append(intent['tag'])  # collect all tags
    for pattern in intent['patterns']:  # iterate through every possible user input
        tkn = tokenize(pattern)  # tokenize the user input possibilities
        all_words.extend(tkn)  # extend instead of append to avoid a list of lists (no flattening needed later)
        xy.append((tkn, intent['tag']))  # (x, y) pair of input tokens and their tag

to_ignore = ['?', '!', '.', ',', 'the', 'is', 'as', 'a', 'are', 'in', 'this', 'that']  # some stop words and punctuation
all_words = sorted(set([stem(wrd) for wrd in all_words if wrd not in to_ignore]))  # stem all words and remove duplicates
tags = sorted(set(tags))  # in case duplicate tags were added; not strictly necessary but good practice

X_train, y_train = [], []
for (pattern_sentence, tag) in xy:
    bag = bag_of_words(pattern_sentence, all_words)
    X_train.append(bag)
    label = tags.index(tag)
    y_train.append(label)

X_train = np.array(X_train)
y_train = np.array(y_train)

# Hyper-parameters
intents = json.load(f)

all_words = []
tags = []
xy = []
for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        w = tokenize(pattern)
        all_words.extend(w)
        xy.append((w, tag))

ignore_words = {'?', '!', '.', ','}
all_words = sorted(set([stem(w) for w in all_words if w not in ignore_words]))
tags = sorted(set(tags))

x_train, y_train = [], []
for (pattern_sentence, tag) in xy:
    bag = bag_of_words(pattern_sentence, all_words)
    x_train.append(bag)
    label = tags.index(tag)
    y_train.append(label)  # we are using cross-entropy loss, so one-hot encoding is not needed

x_train = np.array(x_train)
y_train = np.array(y_train)
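# A minimal sketch of the bag_of_words helper these snippets import; its real
# implementation is not shown in the source, so this is an assumption about its
# behavior: a float32 vector with 1.0 at every vocabulary position whose stemmed
# word occurs in the tokenized sentence. It reuses stem() from the helper sketch above.
import numpy as np

def bag_of_words(tokenized_sentence, words):
    sentence_words = [stem(w) for w in tokenized_sentence]  # stem() as sketched earlier
    bag = np.zeros(len(words), dtype=np.float32)            # float32 so torch.from_numpy works directly
    for idx, w in enumerate(words):
        if w in sentence_words:
            bag[idx] = 1.0
    return bag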
all_words = []
tags = []
xy = []
for item in intents['intents']:
    tag = item['tag']
    tags.append(tag)
    for pat in item['patterns']:
        w = tokenize(pat)
        all_words.extend(w)
        xy.append((w, tag))

stop_words = ['?', '!', '.', ',']
all_words = [stem(w) for w in all_words if w not in stop_words]
all_words = sorted(set(all_words))
tags = sorted(set(tags))

x_train = []
y_train = []
for (pat_sent, tag) in xy:
    bag = bag_of_words(pat_sent, all_words)
    x_train.append(bag)
    label = tags.index(tag)
    y_train.append(label)  # CrossEntropyLoss expects class indices

x_train = np.array(x_train)
all_words = []
tags = []
xy = []  # will hold (tokenized pattern, tag) pairs

# loop over intents
for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        tokenized_pattern = tokenize(pattern)
        all_words.extend(tokenized_pattern)  # extend with the list of words
        xy.append((tokenized_pattern, tag))

ignored_characters = ['?', '!', '.', ',']
# apply stemming to the tokenized all_words list, excluding ignored characters
all_words = [stem(word) for word in all_words if word not in ignored_characters]
# remove duplicates and sort the list
all_words = sorted(set(all_words))

# create the bag-of-words training data
x_train = []  # bag-of-words vectors
y_train = []  # numeric label for each tag

# iterate over (pattern, tag) pairs
for (tokenized_pattern, tag) in xy:
    bag = bag_of_words(tokenized_pattern, all_words)
    x_train.append(bag)
    # label each pattern with the index of its tag
    label = tags.index(tag)
    y_train.append(label)  # class labels only (cross-entropy loss)
def main(read_dir="high.json", write_dir="high.pth"):
    base_json_dir = "resource/jsonFile/" + read_dir
    base_pth_dir = "resource/pthFile/" + write_dir

    with open(base_json_dir, "r", encoding="UTF-8") as file:
        intents = json.load(file)

    all_words = []
    tags = []
    xy = []
    for intent in intents['intents']:
        tag = intent['tag']
        tags.append(tag)
        for pattern in intent['patterns']:
            w = tokenize(pattern)
            all_words.extend(w)
            xy.append((w, tag))

    ignore_word = [",", ".", "'", '"', "?", "!", "^", "@", "#", "_", "-", "~"]  # a regular expression would cover these more generally
    all_words = [stem(w) for w in all_words if w not in ignore_word]  # a list comprehension reads better than map here
    all_words = sorted(set(all_words))
    tags = sorted(set(tags))  # sorted for a stable label order

    X_train = []
    Y_train = []
    for (pattern_sentence, tag) in xy:
        bag = bag_of_words(pattern_sentence, all_words)
        X_train.append(bag)
        label = tags.index(tag)
        Y_train.append(label)
    X_train = np.array(X_train)
    Y_train = np.array(Y_train)

    # Hyper-parameters
    num_epochs = 1000
    batch_size = 8
    learning_rate = 0.001
    input_size = len(X_train[0])
    hidden_size = 8
    output_size = len(tags)

    class ChatDataset(Dataset):
        def __init__(self):
            self.n_samples = len(X_train)
            self.x_data = X_train
            self.y_data = Y_train

        # support indexing such that dataset[i] can be used to get the i-th sample
        def __getitem__(self, index):
            return self.x_data[index], self.y_data[index]

        # we can call len(dataset) to return the size
        def __len__(self):
            return self.n_samples

    dataset = ChatDataset()
    train_loader = DataLoader(dataset=dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=0)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = NeuralNet(input_size, hidden_size, output_size).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Train the model
    for epoch in range(num_epochs):
        for (words, labels) in train_loader:
            words = words.to(device)
            labels = labels.to(dtype=torch.long).to(device)

            # Forward pass
            outputs = model(words)
            # if y were one-hot, we would need: labels = torch.max(labels, 1)[1]
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if (epoch + 1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

    print(f'final loss: {loss.item():.4f}')

    data = {
        "model_state": model.state_dict(),
        "input_size": input_size,
        "hidden_size": hidden_size,
        "output_size": output_size,
        "all_words": all_words,
        "tags": tags
    }
    torch.save(data, base_pth_dir)
    print(f'training complete. model saved to {base_pth_dir}')
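# A minimal sketch of the NeuralNet model that main() instantiates; the actual class
# is defined elsewhere in the source (typically a model.py), so the two hidden ReLU
# layers shown here are an assumption about its architecture, not a confirmed fact.
import torch.nn as nn

class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.l1 = nn.Linear(input_size, hidden_size)
        self.l2 = nn.Linear(hidden_size, hidden_size)
        self.l3 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.relu(self.l1(x))
        out = self.relu(self.l2(out))
        # no softmax here: nn.CrossEntropyLoss applies log-softmax internally
        return self.l3(out)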
tags = []
xy = []
ignoreWords = ['?', '!', '.', ',']

for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        word = tokenize(pattern)
        allWords.extend(word)
        xy.append((word, tag))

# stem the words in the allWords list while disregarding the ignoreWords list
allWords = [stem(word) for word in allWords if word not in ignoreWords]
# remove duplicates and sort the words
allWords = sorted(set(allWords))
tags = sorted(set(tags))

XTrain = []
YTrain = []
for (patternSentence, tag) in xy:
    bag = bagOfWords(patternSentence, allWords)
    XTrain.append(bag)
    label = tags.index(tag)  # CrossEntropyLoss expects class indices
    YTrain.append(label)
def train(num_epochs=500, learning_rate=0.001):
    global intents
    # Read the training data from a JSON file
    with open("intents.json", "r", encoding="UTF-8") as f:
        intents = json.load(f)

    # Will hold every word so we can tokenize and stem them
    all_words = []
    # Will hold every tag used to classify the patterns
    tags = []
    # Will hold (pattern, tag) pairs
    xy = []

    # the JSON file is treated like a dictionary, so we loop over its "intents" key
    for intent in intents["intents"]:
        tag = intent["tag"]
        tags.append(tag)
        for pattern in intent["patterns"]:
            w = tokenize(pattern)
            # We don't want nested lists in all_words, so we extend instead of append
            all_words.extend(w)
            # keep the link between the words and their tag
            xy.append((w, tag))

    # characters to exclude
    ignore_words = ["?", "!", ".", ","]
    all_words = [stem(w) for w in all_words if w not in ignore_words]
    # an alphabetically sorted list without duplicate words (set removes duplicates)
    all_words = sorted(set(all_words))
    tags = sorted(set(tags))

    X_train = []
    Y_train = []
    for pattern_sentence, tag in xy:
        bag = bag_of_words(pattern_sentence, all_words)
        X_train.append(bag)
        # the index of the tag in the tags list is the class label
        label = tags.index(tag)
        Y_train.append(label)  # CrossEntropyLoss expects class indices

    # Convert to numpy arrays so they can back the Dataset
    X_train = np.array(X_train)
    Y_train = np.array(Y_train)

    # Dataset class so the data can be fed to a DataLoader
    class ChatDataSet(Dataset):
        def __init__(self):
            self.n_samples = len(X_train)
            self.x_data = X_train
            self.y_data = Y_train

        def __getitem__(self, index):
            return self.x_data[index], self.y_data[index]

        def __len__(self):
            return self.n_samples

    # Hyperparameters
    batch_size = 8
    hidden_size = 80
    output_size = len(tags)
    input_size = len(all_words)

    # Create the dataset and loader to feed the neural network
    dataset = ChatDataSet()
    train_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

    # Training runs on the CPU here; switch to 'cuda' if a GPU is available
    device = torch.device("cpu")

    # Define the model used for training
    model = NeuralNet(input_size, hidden_size, output_size).to(device)

    # loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        for words, labels in train_loader:
            words = words.to(device)
            labels = labels.to(device, torch.int64)

            # forward
            outputs = model(words)
            loss = criterion(outputs, labels)

            # backward and optimizer step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if epoch % 100 == 0:
            print("Epoch " + str(epoch) + " finished! " +
                  f"loss={loss.item():.4}" + "\n " +
                  str(num_epochs - epoch) + " remaining!")

    data = {
        "model_state": model.state_dict(),
        "input_size": input_size,
        "output_size": output_size,
        "hidden_size": hidden_size,
        "all_words": all_words,
        "tags": tags
    }

    FILE = "Terra-Speak.pth"
    torch.save(data, FILE)
    print(f"Training complete! Model named {FILE} saved.")
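# A minimal sketch of how the checkpoint saved by train() could be loaded back for
# inference. The reply logic (0.75 confidence threshold, random choice among the
# intent's responses) is an assumption about how the rest of the source uses the
# model; train() itself only saves the checkpoint. Relies on NeuralNet, tokenize,
# bag_of_words, and the global intents defined elsewhere in these snippets.
import random
import torch

def load_and_reply(sentence, file="Terra-Speak.pth"):
    data = torch.load(file)
    model = NeuralNet(data["input_size"], data["hidden_size"], data["output_size"])
    model.load_state_dict(data["model_state"])
    model.eval()

    x = bag_of_words(tokenize(sentence), data["all_words"])
    x = torch.from_numpy(x).unsqueeze(0)          # shape (1, input_size)
    output = model(x)
    prob, predicted = torch.max(torch.softmax(output, dim=1), dim=1)
    if prob.item() < 0.75:                        # assumed confidence threshold
        return "I do not understand..."
    tag = data["tags"][predicted.item()]
    for intent in intents["intents"]:
        if intent["tag"] == tag:
            return random.choice(intent["responses"])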
xy = []
for intent in data:
    tag = intent['tags']
    tags.append(tag)
    for pattern in intent['patterns']:
        w = tokenize(pattern)
        all_words.extend(w)
        xy.append((w, tag))

miss_words = ['?', '!', '.', ',']
all_words = [stem(w) for w in all_words if w not in miss_words]
all_words = sorted(list(set(all_words)))
tags = sorted(set(tags))  # set() removes duplicate tags so each class index is unique
print(tags)

x_train = []
y_train = []
for (pattern_sentence, tag) in xy:
    bag = bag_of_words(pattern_sentence, all_words)
    x_train.append(bag)
    label = tags.index(tag)
    y_train.append(label)

x_train = np.array(x_train)
def train():
    nodes = retrieve('nodes')

    all_words = []
    ids = []
    xy = []
    # loop through each sentence in our node patterns
    for node in nodes:
        # add to id list
        ids.append(node['id'])
        for pattern in node['patterns']:
            # tokenize each word in the sentence
            w = tokenize(pattern)
            # add to our words list
            all_words.extend(w)
            # add to xy pair
            xy.append((w, node['id']))

    # stem and lower each word and remove stop words
    ignore_words = ['?', '.', '!', '(', ')']
    stop_words = retrieve('stop_words')
    all_words = [w for w in all_words if not w.lower() in stop_words]
    all_words = [stem(w) for w in all_words if w not in ignore_words]
    # remove duplicates and sort
    all_words = sorted(set(all_words))
    ids = sorted(set(ids))

    # create training data
    x_train = []
    y_train = []
    for (pattern_sentence, id) in xy:
        # X: bag of words for each pattern_sentence
        bag = bag_of_words(pattern_sentence, all_words)
        x_train.append(bag)
        # y: PyTorch CrossEntropyLoss needs only class labels, not one-hot
        y_train.append(ids.index(id))

    x_train = np.array(x_train)
    y_train = np.array(y_train)

    # Hyper-parameters
    num_epochs = 1000
    batch_size = 8
    learning_rate = 0.001
    input_size = len(x_train[0])
    hidden_size = 8
    output_size = len(ids)

    class ChatDataset(Dataset):
        def __init__(self):
            self.n_samples = len(x_train)
            self.x_data = x_train
            self.y_data = y_train

        # support indexing such that dataset[i] can be used to get the i-th sample
        def __getitem__(self, index):
            return self.x_data[index], self.y_data[index]

        # we can call len(dataset) to return the size
        def __len__(self):
            return self.n_samples

    dataset = ChatDataset()
    train_loader = DataLoader(dataset=dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=0)

    device = torch.device('cpu')
    model = NeuralNet(input_size, hidden_size, output_size).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Train the model
    for epoch in range(num_epochs):
        for (words, labels) in train_loader:
            words = words.to(device)
            labels = labels.to(dtype=torch.long).to(device)

            # Forward pass
            outputs = model(words)
            # if y were one-hot, we would need: labels = torch.max(labels, 1)[1]
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    data = {
        "model_state": model.state_dict(),
        "input_size": input_size,
        "hidden_size": hidden_size,
        "output_size": output_size,
        "all_words": all_words,
        "ids": ids
    }
    torch.save(data, "data.pth")
inFile = open('intentions.json', 'r')
intents = json.load(inFile)

words = []
tags = []
combo = []
for i in intents['intents']:
    tag = i["tag"]
    tags.append(tag)
    for w in i["patterns"]:
        token = makeToken(w)
        words.extend(token)
        combo.append((token, tag))

ignore = ['?', '.', ',', '!']
words = [stem(w) for w in words if w not in ignore]
words = sorted(set(words))
tags = sorted(tags)

train_data1 = []
train_data2 = []
for (pattern, tag) in combo:
    bag = makeBag(pattern, words)
    train_data1.append(bag)
    label = tags.index(tag)
    train_data2.append(label)

np_data1 = np.array(train_data1)
np_data2 = np.array(train_data2)
# loop through each sentence in our intent patterns
for inten in intenciones['intenciones']:
    etiqueta = inten['etiqueta']
    # add to the list of tags
    etiquetas.append(etiqueta)
    for patron in inten['patrones']:
        # tokenize each word in the sentence
        w = tokenize(patron)
        # add to our word list
        todas_las_palabras.extend(w)
        # add to the xy pair
        xy.append((w, etiqueta))

# words to ignore
ignorar_palabras = ['?', '.', '!']
todas_las_palabras = [stem(w) for w in todas_las_palabras if w not in ignorar_palabras]
# remove duplicates and sort
todas_las_palabras = sorted(set(todas_las_palabras))
etiquetas = sorted(set(etiquetas))

print(len(xy), "patterns")
print(len(etiquetas), "tags:", etiquetas)
print(len(todas_las_palabras), "unique stemmed words:", todas_las_palabras)

# create training data
X_cola = []
Y_cola = []
for (patron_de_oracion, etiqueta) in xy:
    # X: bag of words for each pattern sentence
    bolsa = bag_of_words(patron_de_oracion, todas_las_palabras)
    X_cola.append(bolsa)
]

# Lemmatization -----------------
t_lem = ''
for w in all_words:
    if w not in ignore_words:
        t_lem = t_lem + ' ' + w
lemma = lemmatize(t_lem)

all_words_lemma = []
for w in lemma:
    all_words_lemma.append(w[2])
# -------------------------------

# Stemming ----------------------
all_words = [stem(w) for w in all_words_lemma if w not in ignore_words]
# -------------------------------

all_words = sorted(set(all_words))
tags = sorted(set(tags))

x_train = []
y_train = []
for (pattern_sentence, tag) in xy:
    bag = bag_of_words(pattern_sentence, all_words)
    x_train.append(bag)
    label = tags.index(tag)
    y_train.append(label)

x_train = np.array(x_train)
with open('intents.json', 'r') as f:
    intents = json.load(f)

tags = []
all_words = []
xy = []
for intent in intents['intents']:
    tags.append(intent['tag'])
    for pattern in intent['patterns']:
        w = tokenize(pattern)
        all_words.extend([word.lower() for word in w])
        xy.append((w, intent['tag']))

ignore_words = string.punctuation
all_words = [stem(word) for word in all_words if word not in ignore_words]
tags = sorted(set(tags))
all_words = sorted(set(all_words))

x_train = []
y_train = []
for (sentences, tag) in xy:
    word_num = bag_of_words(sentences, all_words)
    label = tags.index(tag)
    x_train.append(word_num)
    y_train.append(label)

x_train = np.array(x_train)
y_train = np.array(y_train)
word_bank = []
tags = []
xy = []
for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        w = tokenize(pattern)
        word_bank.extend(w)
        xy.append((w, tag))

exclude_punctuation = ['?', '!', '.', ',']
word_bank = [stem(w) for w in word_bank if w not in exclude_punctuation]
word_bank = sorted(set(word_bank))
tags = sorted(set(tags))
print(tags)

X_train = []
Y_train = []
for (pattern_sent, tag) in xy:
    bag = bag_of_words(pattern_sent, word_bank)
    X_train.append(bag)
    labels = tags.index(tag)
    Y_train.append(labels)

X_train = np.array(X_train)
with open('intents.json', 'r') as f:
    intents = json.load(f)

all_words = []
tags = []
xy = []
for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        w = tokenize(pattern)
        all_words.extend(w)
        xy.append((w, tag))

ignore_words = ['?', '!', '.', ',']
all_words = [stem(w) for w in all_words if w not in ignore_words]
all_words = sorted(set(all_words))
tags = sorted(set(tags))

X_train = []
y_train = []
for (pattern_sentence, tag) in xy:
    bag = bag_of_words(pattern_sentence, all_words)
    X_train.append(bag)
    label = tags.index(tag)
    y_train.append(label)

X_train = np.array(X_train)
y_train = np.array(y_train)
        # tokenize each word in the sentence
        w = tokenize(pattern)
        # add to our words list
        all_words.extend(w)
        # add to xy pair
        xy.append((w, tag))

# stem and lower each word
# note: '0-9' only matches the literal string "0-9"; filtering digits would need a regex
ignore_words = ['?', '.', '!', '-', ',', '0-9', '(', ')']
stopset = [
    'a', 'an', 'the', 'i', 'you', 'one', 'of', 'in', 'for', 'to', 'by',
    'about', 'off', 'did', 'am', 'is', 'are', 'was', 'were', 'if', 'on',
    'what', 'why', 'when', 'where', 'which', 'and', 'how', 'tell', 'me',
    'my', 'must', 'could', 'that', 'or', 'anyone', 'any', 'many', 'there'
]
all_word = [stem(w) for w in all_words if w not in ignore_words]
# all_word = [tokenize(w) for w in all_words if w not in ignore_words]
all_words = [stem(w) for w in all_word if w not in stopset]
# all_words = [tokenize(w) for w in all_word if w not in stopset]

# remove duplicates and sort
all_words = sorted(set(all_words))
tags = sorted(set(tags))

print(">>", len(xy), "patterns")
print(">>", len(tags), "tags:", tags)
print(len(all_words), "unique stemmed words:", all_words)

# create training data
X_train = []
y_train = []
for (pattern_sentence, tag) in xy: