def __next__(self):
    # reached end of file
    if self.curr_line + 1 == self.num_lines:
        raise StopIteration
    curr_index = self.curr_index
    curr_line = self.curr_line
    sentences = self.sentences
    # tokenize each sentence lazily, the first time it is visited
    if curr_index == 0:
        sentences[curr_line] = tokenization(sentences[curr_line])
    tok1 = split_token(sentences[curr_line][curr_index])
    tok2 = split_token(sentences[curr_line][curr_index + 1])
    self.curr_index += 1
    # advance to the next sentence once its last token pair has been emitted
    end_of_sentence = False
    if self.curr_index + 1 == len(self.sentences[curr_line]):
        self.curr_index = 0
        self.curr_line += 1
        end_of_sentence = True
    # return (word1, tag1), (word2, tag2), end_of_sentence
    return (tok1[0], tok1[1]), (tok2[0], tok2[1]), end_of_sentence
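# A minimal driving loop for the iterator above. Only __next__ appears in the
# snippet, so the class name TaggedBigramReader and its constructor are
# hypothetical; the loop relies solely on next() and StopIteration.
reader = TaggedBigramReader('train.txt')  # hypothetical constructor
while True:
    try:
        (w1, t1), (w2, t2), end_of_sentence = next(reader)
    except StopIteration:
        break
    # each step yields an adjacent (word, tag) pair plus a boundary flag
    print(w1, t1, '->', w2, t2, end_of_sentence)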
def min_perplexity(self):
    dev = open(dev_file, 'r')
    perplexities = []
    sentences = sentence_segmentation(dev)
    lam_values = [0.1, 0.3, 0.5, 0.7, 0.9]
    for lam in lam_values:
        sigma = 0
        dev_len = 0
        for sen in sentences:
            tokens = tokenization(start_sym + ' ' + sen + ' ' + end_sym)
            x = start_sym
            for y in tokens[1:]:
                if y == '':
                    continue
                y = y.lower()
                inter_pol = self.dev_interpolation(x, y, lam)
                if inter_pol != 0:
                    sigma += log(inter_pol, 2)
                dev_len += 1
                x = y
        perplexities.append((pow(2, (-1 / float(dev_len)) * sigma), lam))
    # returns the (perplexity, lambda) tuple with the lowest perplexity
    return min(perplexities)
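# dev_interpolation is not shown above; a sketch under the usual linear
# interpolation scheme, p(y|x) = lam * p_bigram(y|x) + (1 - lam) * p_unigram(y).
# The attribute names (words, bigrams, total_tokens) mirror the Ngram
# constructor in the training script below, but are otherwise assumptions.
def dev_interpolation(self, x, y, lam):
    count_x = self.words.get(x, 0)
    p_bi = self.bigrams.get((x, y), 0) / float(count_x) if count_x else 0.0
    p_uni = self.words.get(y, 0) / float(self.total_tokens)
    return lam * p_bi + (1 - lam) * p_uni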
import pandas as pd

import preprocess


def testtokenize():
    df = pd.read_csv("testPreprocessfile.csv")
    df1 = preprocess.tokenization(df, "posts")
    df1.to_csv("testPreprocessfile.csv")
    assert df1['title'][0] == ['Any', 'ongoing', 'good', 'deals', 'on', 'XPS', '15']
    assert df1['description'][0] == ['Hello', 'am', 'there']
tags = data['tags']
model_state = data["model_state"]

# rebuild the network and load the trained weights
model = NeuralNetwork(input_size, hidden_size, output_size).to(device)
model.load_state_dict(model_state)
model.eval()

bot_name = "Aapka Dost"
print("Let's chat! (type 'quit' to exit)")
while True:
    # sentence = "do you use credit cards?"
    sentence = input("You: ")
    if sentence == "quit":
        break

    # turn the user input into a bag-of-words tensor of shape (1, vocab_size)
    sentence = tokenization(sentence)
    X = bagofWords(sentence, all_words)
    X = X.reshape(1, X.shape[0])
    X = torch.from_numpy(X).to(device)

    # forward pass: pick the highest-scoring tag
    output = model(X)
    _, predicted = torch.max(output, dim=1)
    tag = tags[predicted.item()]

    # only answer when the softmax confidence is high enough
    probs = torch.softmax(output, dim=1)
    prob = probs[0][predicted.item()]
    if prob.item() > 0.8:
        for intent in intents['intents']:
            if tag == intent["tag"]:
                print(f"{bot_name}: {random.choice(intent['responses'])}")
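# Hypothetical sketch of the bagofWords helper used above; the real
# implementation is not shown. It is assumed to map a tokenized sentence
# onto a fixed-size vector indexed by the sorted vocabulary. float32 is
# chosen so torch.from_numpy yields a tensor the Linear layers accept.
import numpy as np

def bagofWords(tokenized_sentence, all_words):
    bag = np.zeros(len(all_words), dtype=np.float32)
    for idx, w in enumerate(all_words):
        if w in tokenized_sentence:
            bag[idx] = 1.0
    return bag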
from preprocess.make_lower_case import *
from preprocess.eliminate_stop_words import *
from preprocess.replace_negation_words import *
from preprocess.tokenization import *
from preprocess.one_hot_encode import *
from preprocess.embed_200 import *
from preprocess.spellingcheck import *
from preprocess.extract_redundant_words import *

make_lower_case = make_lower_case(0, "make_lower_case", 1)
eliminate_stop_words = eliminate_stop_words(-5, "eliminate_stop_words", 2)
replace_negation_words = replace_negation_words(5, "replace_negation_words", 3)
tokenization = tokenization(0, "tokenization", 4)
one_hot_encode = one_hot_encode(-100, "one_hot_encode", 5)
spellingcheck = spellingcheck(50, "spellingcheck", 6)
embed_200 = embed_200(0, "embed_200", 7)
import json

from model import NeuralNetwork
# tokenization, refineWords, stemmed_output and bagofWords are project
# helpers; their imports are not shown in the original snippet.

with open('intents.json', 'r') as file:
    intents = json.load(file)

# Create a list of all words for the bag-of-words model.
all_words = []
tags = []
matcher = []

# Extracting tags and sentences from intents
for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        w = tokenization(pattern)
        all_words.extend(w)
        matcher.append((w, tag))

all_words = refineWords(all_words)
all_words = stemmed_output(all_words)

# Unique words and tags
all_words = sorted(set(all_words))
tags = sorted(set(tags))

# Creating training data
X_train = []
y_train = []
for (sentence, tag) in matcher:
    # loop body assumed: the original snippet is cut off at this line
    X_train.append(bagofWords(sentence, all_words))
    y_train.append(tags.index(tag))  # class index for the tag label
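# A common next step, assumed and not part of the original snippet: wrap the
# training pairs in a PyTorch Dataset so a DataLoader can batch them.
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset

class ChatDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(np.array(X, dtype=np.float32))
        self.y = torch.from_numpy(np.array(y, dtype=np.int64))

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

loader = DataLoader(ChatDataset(X_train, y_train), batch_size=8, shuffle=True)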
from ngram import Ngram
from preprocess import sentence_segmentation, tokenization
from constants import start_sym, end_sym, train_file

# Parses the train file, builds a dictionary mapping words to occurrence
# counts, and a second dictionary mapping bigrams to occurrence counts.
if __name__ == '__main__':
    train = open(train_file, 'r')
    bigrams = dict()
    words = dict()
    total_tokens = 0
    for sen in sentence_segmentation(train):
        if sen == '':
            continue
        tokens = tokenization("{} {} {}".format(start_sym, sen, end_sym))
        x = tokens[0]
        words[x] = words.get(x, 0) + 1
        for y in tokens[1:]:
            if y == '':
                continue
            y = y.lower()
            bigrams[(x, y)] = bigrams.get((x, y), 0) + 1
            words[y] = words.get(y, 0) + 1
            x = y
        total_tokens += len(tokens)

    training_gram = Ngram(total_tokens, words, bigrams)
    # creates bigram.lm file
    training_gram.create_bigram_lm()
    # creates unigram.lm file
    training_gram.create_unigram_lm()  # call assumed; the snippet ends at the comment
from math import log
from sys import argv

from ngram import Ngram
from preprocess import sentence_segmentation, tokenization
from constants import start_sym
# load_unigram and load_bigram are project helpers; their imports are not
# shown in the original snippet.

if __name__ == '__main__':
    if len(argv) < 4:
        print('Please run with the following format: '
              'python perplexity.py <path_to_bigram_lm> <path_to_unigram_lm> <path_to_test_file>')
        exit()

    # load unigram
    uni = load_unigram(open(argv[2]))
    words = uni['words']
    total_tokens = uni['total_tokens']

    # load bigram
    bigrams = load_bigram(open(argv[1]))
    training_gram = Ngram(total_tokens, words, bigrams)

    # collect the lowercased test tokens
    test_words = []
    sentences = sentence_segmentation(open(argv[3]))
    for sen in sentences:
        tokens = tokenization(start_sym + ' ' + sen + ' ' + end_sym)
        for tok in tokens:
            if tok == '':
                continue
            tok = tok.lower()
            test_words.append(tok)
    test_size = len(test_words)

    # calculating perplexities
    bi_perplexity = 0
    inter_perplexity = 0
    uni_perplexity = 0
    x = start_sym
    # calculate summation
    uni_perplexity += log(training_gram.laplace_unigram(x), 2)
    for y in test_words[1:]:
        bi_perplexity += log(training_gram.laplace_bigram(x, y), 2)
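# Hypothetical sketch of the add-one (Laplace) estimates the Ngram methods
# above are expected to return; the real class is not shown, and these are
# written as methods of it. With log-base-2 sums accumulated as in the loop,
# the perplexity itself would then be 2 ** ((-1.0 / test_size) * summation).
def laplace_unigram(self, w):
    V = len(self.words)  # vocabulary size
    return (self.words.get(w, 0) + 1) / float(self.total_tokens + V)

def laplace_bigram(self, x, y):
    V = len(self.words)
    return (self.bigrams.get((x, y), 0) + 1) / float(self.words.get(x, 0) + V)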