def thread(self):
    if not self.isRunning():
        self.log.delete("1.0", END)
        if self.id == '2':
            self.initialTimes = int(self.variable.get()) - 1
            self.t1 = Glove.GloveRun(account=self.account.get(), password=self.password.get(),
                                     suffix=self.suffix.get(), times=self.variable.get(),
                                     canvas_draw=self.canvas, fig=self.fig, log=self.log,
                                     file=self.file, writingTimes=self.writingTimes,
                                     maxtimes=self.maxtimes, ax=self.ax, input=self.input,
                                     data_path=self.data_path)
        else:
            self.log.delete("1.0", END)
            self.initialTimes = int(self.variable.get()) - 1
            self.t1 = LeapMotion.LeapRun(account=self.account.get(), password=self.password.get(),
                                         suffix=self.suffix.get(), times=self.variable.get(),
                                         ax1=self.ax1, ax2=self.ax2, canvas_draw=self.canvas,
                                         fig=self.fig, log=self.log, file=self.file,
                                         writingTimes=self.writingTimes, maxtimes=self.maxtimes,
                                         killAll=self.kill_all, data_path=self.data_path)
        self.t1.setDaemon(True)
        self.t1.start()
def experiment1(glove_file="../data/glove_vectors/glove.6B.100d.txt",
                question_dir="../data/all_sat/seven_sat_raw.txt"):
    # Look at text before blank
    def getBeforeBlankText(sentence):
        return sentence[:sentence.find("____")]

    # Look at text after blank
    def getAfterBlankText(sentence):
        return sentence[sentence.find("____") + len("____"):]

    print "Loading Questions"
    questions = loadQuestions(question_dir)
    print "num questions: ", len(questions)

    print "Loading Glove None"
    glove = Glove(glove_file, delimiter=" ", header=False, quoting=csv.QUOTE_NONE, v=False)

    print "Experimenting on 100 percent of questions"
    # Change the 1 to a decimal to reduce the number of questions considered
    for i in range(int(math.floor(len(questions) * 1))):
        question = questions[i]

        # Only want single blanks for now
        if len(re.findall('____(.*?)____', question.text, re.DOTALL)) != 0:
            continue

        answer_words = getStrippedAnswerWords(question.getCorrectWord())
        answer_vec = glove.getVec(answer_words[0])
        total_vec = glove.getAverageVec(filter(lambda x: x not in stopwords.words('english'),
                                               question.getSentence()))
        before_vec = glove.getAverageVec(filter(lambda x: x not in stopwords.words('english'),
                                                getBeforeBlankText(question.text)))
        after_vec = glove.getAverageVec(filter(lambda x: x not in stopwords.words('english'),
                                               getAfterBlankText(question.text)))

        # Prints the question when the sentence model (average of all the sentence's VSM vectors)
        # is NOT closer to the answer than both the before-blank and after-blank models
        total_distance = cosine(answer_vec, total_vec)
        before_distance = cosine(answer_vec, before_vec) if len(before_vec) > 2 else 2
        after_distance = cosine(answer_vec, after_vec) if len(after_vec) > 2 else 2
        if total_distance < before_distance and total_distance < after_distance:
            continue

        # Comment this out to print for every question
        print question.text, answer_words[0]
        print "total distance:", total_distance
        print "before distance: ", before_distance
        print "after distance: ", after_distance
        print "\n\n"
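# A minimal, self-contained sketch of the distance comparison experiment1 performs above,
# assuming `cosine` is scipy.spatial.distance.cosine (or an equivalent helper) and that
# word vectors are plain numpy arrays; the real Glove class and helpers may differ.
import numpy as np
from scipy.spatial.distance import cosine as cosine_distance

def average_vec(vectors):
    # Average a list of word vectors into a single sentence-level vector
    return np.mean(np.array(vectors), axis=0)

def sentence_model_wins(answer_vec, sentence_vecs, before_vecs, after_vecs):
    # True when the whole-sentence average is strictly closer to the answer vector
    # than both the before-blank and after-blank averages (the case experiment1 skips)
    total = cosine_distance(answer_vec, average_vec(sentence_vecs))
    before = cosine_distance(answer_vec, average_vec(before_vecs)) if before_vecs else 2
    after = cosine_distance(answer_vec, average_vec(after_vecs)) if after_vecs else 2
    return total < before and total < after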
def createFeatureExtractorForAll(examples, unigrams, bigrams, glove_file):
    print "Loading Glove None"
    glove = Glove(glove_file, delimiter=" ", header=False, quoting=csv.QUOTE_NONE, v=False)

    all_features = []
    for i in range(len(examples) * 5):
        all_features.append([])
    all_ys = []

    low_ranks = [None, "pmi", "ppmi", "tfidf"]
    #low_ranks = [None]

    print "Calculating VSM Methods"
    # Get Glove-based models
    for lr in low_ranks:
        if lr != None:
            print "Loading Glove %s" % (lr)
            glove = Glove(glove_file, delimiter=" ", header=False, quoting=csv.QUOTE_NONE,
                          weighting=lr, v=False)
            glove.lsa(250)
        count = 0
        for example in examples:
            for a in example.answers:
                data = (example, a)
                features = createSingleExtractorVSM(data, glove, unigrams)
                all_features[count] += features
                count += 1

    print "Calculating N-Grams + Interactions"
    # Get answers + unigram/bigram features + add in interactions
    index = 0
    for example in examples:
        for i, word in enumerate(example.answers):
            if i == example.correctAnswer:
                all_ys.append(1)
            else:
                all_ys.append(0)
            unigram_d = unigramModel(unigrams, example, word)
            bigram_d = bigramModel(bigrams, example, word)
            all_features[index].append(unigram_d)
            all_features[index].append(bigram_d)

            # Bias term
            all_features[index].append(1)

            # Interaction terms: pairwise products of every non-bias feature
            num_feats = len(all_features[index])
            for j in range(num_feats - 1):
                for k in range(j + 1, num_feats - 1):
                    all_features[index].append(all_features[index][j] * all_features[index][k])
            index += 1

    print "Done"
    return (all_features, all_ys)
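# The (all_features, all_ys) pair returned above is a flat per-answer design matrix and label
# vector (five rows per question). A hedged sketch of how it could feed a downstream classifier,
# assuming scikit-learn is available; the project's actual training code may use a different
# model or setup.
import numpy as np
from sklearn.linear_model import LogisticRegression

def train_answer_classifier(all_features, all_ys):
    # Fit a simple logistic regression over the per-answer feature rows
    X = np.array(all_features)
    y = np.array(all_ys)
    clf = LogisticRegression()
    clf.fit(X, y)
    return clf

def predict_answer(clf, question_features):
    # question_features: the five feature rows for one question's answer choices;
    # returns the index of the choice the classifier scores as most likely correct
    probs = clf.predict_proba(np.array(question_features))[:, 1]
    return int(np.argmax(probs))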
def experiment2(glove_file="../data/glove_vectors/glove.6B.50d.txt",
                question_dir="../data/all_sat/seven_sat_raw.txt"):
    print "Loading Questions"
    questions = loadQuestions(question_dir)
    print "num questions: ", len(questions)

    print "Loading Glove None"
    glove = Glove(glove_file, delimiter=" ", header=False, quoting=csv.QUOTE_NONE, v=False)

    print "Training N Grams"
    #unigrams, bigrams, cgrams = getGrams(path="../data/Holmes_Training_Data/norvig.txt")

    # Count how many double blanks there are
    singles = []
    doubles = []
    for q in questions:
        if q.getSentence().count('____') > 1:
            doubles.append(q)
        else:
            singles.append(q)

    # Maps the number of models that answered a question right to the number of
    # questions for which that happened
    print "Looking at every question for every model"
    num_right = {}
    for q in doubles:
        models_right = []
        for name, model in vsm_models:
            answer, d = model(glove, q)
            if answer == -1 or answer == None:
                continue
            if answer == q.getCorrectWord():
                models_right.append(name)
        if len(models_right) in num_right:
            num_right[len(models_right)] += 1
        else:
            num_right[len(models_right)] = 1
        print models_right
    print num_right
def getQuestionClassifications(questions, unigrams, bigrams, glove_file):
    model_classes = getModelClassifications()  # Mapping of types of models/parameters to integers

    # Map of question index to a list of (model name, distance) tuples for the models
    # that correctly predicted the answer
    prelim_mapping_array = [None] * len(questions)

    # First check if the prelim mapping is in a pickle
    if len(getRecursiveFiles("../data/ml_data/sentence_train_prelim", filter_fn=lambda a: ".pickle" in a)) > 0:
        print "Found Saved Preliminary Mappings"
        prelim_mapping_array = loadPickle("../data/ml_data/sentence_train_prelim/com_triandev_prelimmaparray.pickle")
    else:
        print "Finding Preliminary Mapping"

        # Do unigram + bigram first
        for i, question in enumerate(questions):
            u_answer = unigramModel(unigrams, question)
            b_answer = bigramModel(bigrams, question)
            if u_answer[0] == question.getCorrectWord():
                tups = ("Unigram", 2)  # TODO: change
                if prelim_mapping_array[i] != None:
                    prelim_mapping_array[i].append(tups)
                else:
                    prelim_mapping_array[i] = [tups]
            if b_answer[0] == question.getCorrectWord():
                tups = ("Bigram", 2)  # TODO: change
                if prelim_mapping_array[i] != None:
                    prelim_mapping_array[i].append(tups)
                else:
                    prelim_mapping_array[i] = [tups]

        # Do glove-based models now
        for lr in low_ranks:
            print "Loading Glove %s" % (lr)
            glove = Glove(glove_file, delimiter=" ", header=False, quoting=csv.QUOTE_NONE,
                          weighting=lr, v=False)
            glove.lsa(250)  # TODO: change to 250
            for model_name, model_form in param_models:
                for d_form, d_name in distances:
                    whole_name = lr + model_name + d_name
                    for i, q in enumerate(questions):
                        answer = None
                        if model_name == "Weighted VSM":
                            answer = model_form(glove, unigrams, q, threshold=.95)
                        else:
                            answer = model_form(glove, q, threshold=.95)
                        if answer[0] != None and answer[0] != -1 and answer[0] == q.getCorrectWord():
                            tups = (whole_name, answer[1])  # (Name, Distance)
                            if prelim_mapping_array[i] != None:
                                prelim_mapping_array[i].append(tups)
                            else:
                                prelim_mapping_array[i] = [tups]

        print "saving preliminary mapping"
        savePickle(prelim_mapping_array, "../data/ml_data/sentence_train_prelim/com_triandev_prelimmaparray.pickle")

    #print prelim_mapping_array

    # Classify each question now + return
    # Out of the models that got a question right, pick the one with the lowest distance
    real_mapping = [None] * len(questions)
    for i, q in enumerate(questions):
        if prelim_mapping_array[i] != None:
            #best_model = random.choice(prelim_mapping_array[i])
            best_model = min(prelim_mapping_array[i], key=lambda x: x[1])[0]  # Name of the model with the lowest distance
            real_mapping[i] = model_classes[best_model]
        else:
            real_mapping[i] = model_classes["None"]
    #print real_mapping
    return real_mapping
import numpy as np
import Glove
import json
import tensorflow as tf
from tensorflow import keras
import pickle
import pandas as pd
import matplotlib.pyplot as plt

model = keras.models.load_model("CardColorCategorizer.h5")
glove_model = Glove.loadGloveModel()

text = []
newText = "flash enchant creature or vehicle enchanted creature gets minus three minus zero"
text.append(newText.split())

inputs = []
outputs = []
for input in text:
    keywordVector = []
    finalKeywordVector = []
    for word in input:
        if word in glove_model:
            keywordVector.append(glove_model[word])
    # Pad/truncate to 50 words, each represented by a 50-dimensional GloVe vector
    for i in range(50):
        if len(keywordVector) > i:
            finalKeywordVector.extend(keywordVector[i])
        else:
            finalKeywordVector.extend(np.zeros(50).tolist())
    finalKeywordVector = np.array(finalKeywordVector)
    inputs.append(finalKeywordVector)
inputs = np.array(inputs)
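# The script above builds the padded 50x50 GloVe input but never runs the loaded model.
# A minimal sketch of producing a prediction from it; how the output maps to card colors
# is an assumption, since the real CardColorCategorizer's output encoding is not shown here.
predictions = model.predict(inputs)
predicted_index = int(np.argmax(predictions[0]))  # index of the highest-scoring class
print("predicted class index:", predicted_index)
print("class scores:", predictions[0])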
start = time.time()
if (v): print "\tLoading questions..."
questions = loadQuestions(directory="../data/dev_set/") if train else loadQuestions(directory="../data/test/")

# Initialize global variables
global backoff
global tagger

if (v):
    if (save):
        print "\tTraining Language Models..."
    else:
        print "\tLoading Language Models..."
unigrams, bigrams, backoff = getGrams(path=f)

if (v): print "\tLoading Glove Vectors..."
glove = Glove(g, delimiter=" ", header=False, quoting=csv.QUOTE_NONE, v=False)

if (v): print "\tInitializing Part-Of-Speech Classifier"
#tagger = POSTagger(
#    'stanford-postagger/models/english-bidirectional-distsim.tagger',
#    'stanford-postagger/stanford-postagger.jar',
#    'utf-8'
#)

if (v): print "Finished loading all external data in " + str(int(time.time() - start)) + " seconds!"
if (v): print "Starting program now..."

# Main Method
main(questions, glove)

# Finished main execution
def score(question_dir="../data/cayman_all_training.txt",
          glove_file="../data/glove_vectors/glove.6B.300d.txt",
          ngram_path="../data/Holmes_Training_Data/norvig.txt", dev=True):
    print "Training N-Grams"
    # Load/Generate N-grams
    unigrams, bigrams, cgrams = getGrams(path=ngram_path)

    print "Loading Questions"
    # Load questions
    questions = loadQuestions(question_dir)

    # Holds questions to be evaluated
    eval_qs = None
    if dev:
        # Split into train/dev
        split = len(questions) - len(questions) / 10
        inform("Splitting Data: " + str(split) + " questions in training and " +
               str(len(questions) - split) + " in dev...")
        train_questions, eval_qs = questions[:split], questions[split:]
    else:
        eval_qs = questions

    print "Loading Glove"
    # Loads Glove vectors
    glove = Glove(glove_file, delimiter=" ", header=False, quoting=csv.QUOTE_NONE, v=False)

    # For every VSM model
    for name, model in vsm_models:
        # We get the model's score
        print "Scoring ", name
        answer_guess_pairs = []
        for question in eval_qs:
            guess = None
            # Weighted VSM has an extra parameter
            if name == "Weighted VSM":
                guess = model(glove, question, unigrams)[0]
            else:
                guess = model(glove, question)[0]
            # Get the correct answer
            answer = question.getCorrectWord()
            # Add tuple of guessed and GOLD answers
            answer_guess_pairs.append((guess, answer))
        print "\n\n"
        scoring.score_model(answer_guess_pairs, verbose=True, modelname=name)

    # Now score language models the same way
    for name, model in language_models:
        print "Scoring ", name
        answer_guess_pairs = []
        # For every question
        for question in eval_qs:
            # Generate guess from model
            guess = model(unigrams, bigrams, question)[0]
            # Find GOLD answer (correct answer)
            answer = question.getCorrectWord()
            # Add tuple for scoring
            answer_guess_pairs.append((guess, answer))
        print "\n\n"
        scoring.score_model(answer_guess_pairs, verbose=True, modelname=name)
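# scoring.score_model is defined outside this file; the sketch below only illustrates what a
# scorer over (guess, answer) pairs typically computes, and the real scoring.score_model may
# report different statistics.
def score_pairs(answer_guess_pairs, modelname="model"):
    # Count only the questions the model actually attempted (guess not None/-1)
    answered = [(g, a) for g, a in answer_guess_pairs if g is not None and g != -1]
    correct = sum(1 for g, a in answered if g == a)
    total = len(answer_guess_pairs)
    print "%s: answered %d/%d, %d correct (%.1f%% accuracy on answered)" % (
        modelname, len(answered), total, correct,
        100.0 * correct / max(len(answered), 1))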
def main():
    if (v): print "Loading passages..."
    passages = loadPassages(f)

    # Initialize all the external data
    if (v): print "Loading all external data..."
    tfidf_array, allWords = computeTFIDFArray(passages)
    unigrams, bigrams, trigrams = getGrams()
    glove = Glove(g, delimiter=" ", header=False, quoting=csv.QUOTE_NONE)
    cooccurrences = cooccurrence()

    if (v): print "Running models..."
    # Initialize arrays to keep answers
    rand, nn, sent, tfidf, gram, syn, wdn, cc, an = [], [], [], [], [], [], [], [], []

    # Loop through all the questions
    for passage in passages:
        for question in passage.questions:
            # Find relevant word
            targetword = re.findall(
                "[\xe2\x80\x9c\u2019\"\']([A-Za-z\s]+)[\xe2\x80\x9c\u2019\"\']",
                question.text)[0].lower()

            # Tokenize relevant sentence
            sentence = passage.text.split("\n")[int(re.findall("[0-9]+", question.text)[0]) - 1]
            sentence = re.split("[^A-Za-z0-9]", sentence)
            sentence = filter(lambda x: len(x) > 0, sentence)
            sentence = map(lambda x: x.strip().lower(), sentence)

            # Get correct answer
            correctAnswer = question.answers[question.correctAnswer]

            # Get answers
            randAnswer = randomModel(question.answers)
            nnAnswer = nearestNeighborModel(targetword, question.answers, glove, threshold=.48)
            sentAnswer = sentenceModel(sentence, question.answers, glove, threshold=.44)
            tfidfAnswer = tfidfModel(sentence, question.answers, tfidf_array, allWords, glove, threshold=.44)
            gramAnswer = gramModel(sentence, question.answers, targetword, unigrams, bigrams, trigrams, glove, threshold=.42)
            wdnvec, wdnAnswer = wordnetModel(targetword, sentence, question.answers, glove, threshold=.46)
            synAnswer = synonymModel(targetword, wdnvec, sentence, question.answers, bigrams, trigrams, glove, threshold=.34)
            ccAnswer = cooccurrenceModel(targetword, sentence, question.answers, cooccurrences, glove)
            anAnswer = analogyModel(targetword, sentence, question.answers, cooccurrences, glove)

            # Guess the word if we can answer it
            rand.append((randAnswer, correctAnswer))
            nn.append((nnAnswer, correctAnswer))
            sent.append((sentAnswer, correctAnswer))
            tfidf.append((tfidfAnswer, correctAnswer))
            gram.append((gramAnswer, correctAnswer))
            wdn.append((wdnAnswer, correctAnswer))
            syn.append((synAnswer, correctAnswer))
            cc.append((ccAnswer, correctAnswer))
            an.append((anAnswer, correctAnswer))

    score_model(rand, verbose=True, modelname="Random Model")
    score_model(nn, verbose=True, modelname="Nearest Neighbor Model")
    score_model(sent, verbose=True, modelname="Sentence-Based Model")
    score_model(tfidf, verbose=True, modelname="TFIDF Model")
    score_model(gram, verbose=True, modelname="Gram Model")
    score_model(wdn, verbose=True, modelname="WordNet Model")
    score_model(syn, verbose=True, modelname="Synonym Model")
    score_model(cc, verbose=True, modelname="Cooccurrence Model")
    score_model(an, verbose=True, modelname="Analogy Model")
def main():
    if (v): print "Loading passages..."
    passages = loadPassages(f)

    # Initialize all the external data
    if (v): print "Loading all external data..."
    tfidf_array, allWords = computeTFIDFArray(passages)
    unigrams, bigrams, trigrams = getGrams()
    glove = Glove(g, delimiter=" ", header=False, quoting=csv.QUOTE_NONE)
    cooccurrences = cooccurrence()

    if (v): print "Running models..."
    # Initialize arrays to keep answers
    rand, nn, sent, tfidf, gram, syn, wdn, cc, an = [], [], [], [], [], [], [], [], []

    # Loop through all the questions
    for passage in passages:
        for question in passage.questions:
            # Find relevant word
            targetword = re.findall(
                "[\xe2\x80\x9c\u2019\"\']([A-Za-z\s]+)[\xe2\x80\x9c\u2019\"\']",
                question.text)[0].lower()

            # Tokenize relevant sentence
            sentence = passage.text.split("\n")[int(re.findall("[0-9]+", question.text)[0]) - 1]
            sentence = re.split("[^A-Za-z0-9]", sentence)
            sentence = filter(lambda x: len(x) > 0, sentence)
            sentence = map(lambda x: x.strip().lower(), sentence)

            # Get correct answer
            correctAnswer = question.answers[question.correctAnswer]

            # Get answers
            randAnswer = randomModel(question.answers)
            nnAnswer = nearestNeighborModel(targetword, question.answers, glove)
            sentAnswer = sentenceModel(sentence, question.answers, glove)
            tfidfAnswer = tfidfModel(sentence, question.answers, tfidf_array, allWords, glove)
            gramAnswer = gramModel(sentence, question.answers, targetword, unigrams, bigrams, trigrams, glove)
            synAnswer = synonymModel(targetword, sentence, question.answers, bigrams, trigrams, glove)
            wdnAnswer = wordnetModel(targetword, sentence, question.answers, glove, threshold=0.3)
            ccAnswer = cooccurrenceModel(targetword, sentence, question.answers, cooccurrences, glove)
            anAnswer = analogyModel(targetword, sentence, question.answers, cooccurrences, glove)

            # Guess the word if we can answer it
            rand.append((randAnswer, correctAnswer))
            nn.append((nnAnswer, correctAnswer))
            sent.append((sentAnswer, correctAnswer))
            tfidf.append((tfidfAnswer, correctAnswer))
            gram.append((gramAnswer, correctAnswer))
            syn.append((synAnswer, correctAnswer))
            wdn.append((wdnAnswer, correctAnswer))
            cc.append((ccAnswer, correctAnswer))
            an.append((anAnswer, correctAnswer))

    print "NN: ", percentWrong(nn)
    print "Sent: ", percentWrong(sent)
    print "gram: ", percentWrong(gram)
    print "tfidf: ", percentWrong(tfidf)
    print "syn: ", percentWrong(syn)
    print "wdn: ", percentWrong(wdn)
    print "cc: ", percentWrong(cc)
    print "an: ", percentWrong(an)

    # names = ["NN", "sent", "gram", "tfidf", "syn", "wdn", "cc", "an"]
    # for i, m1 in enumerate(zip(names, [nn, sent, gram, tfidf, syn, wdn, cc, an])):
    #     for j, m2 in enumerate(zip(names, [nn, sent, gram, tfidf, syn, wdn, cc, an])):
    #         if (i > j):
    #             print m1[0], m2[0], percentWrong(combineModels(m1[1], m2[1])), len(combineModels(m1[1], m2[1]))

    score_model(rand, verbose=True, modelname="Random Model")
    score_model(nn, verbose=True, modelname="Nearest Neighbor Model")
    score_model(sent, verbose=True, modelname="Sentence-Based Model")
    score_model(tfidf, verbose=True, modelname="TFIDF Model")
    score_model(gram, verbose=True, modelname="Gram Model")
    score_model(syn, verbose=True, modelname="Synonym Model")
    score_model(wdn, verbose=True, modelname="WordNet Model")
    score_model(cc, verbose=True, modelname="Cooccurrence Model")
    score_model(an, verbose=True, modelname="Analogy Model")