Example #1
def thread(self):
    if not self.isRunning():
        self.log.delete("1.0", END)
        self.initialTimes = int(self.variable.get()) - 1
        if self.id == '2':
            self.t1 = Glove.GloveRun(account=self.account.get(), password=self.password.get(),
                                     suffix=self.suffix.get(),
                                     times=self.variable.get(),
                                     canvas_draw=self.canvas,
                                     fig=self.fig, log=self.log,
                                     file=self.file, writingTimes=self.writingTimes,
                                     maxtimes=self.maxtimes, ax=self.ax, input=self.input, data_path=self.data_path)
        else:
            self.t1 = LeapMotion.LeapRun(account=self.account.get(), password=self.password.get(),
                                         suffix=self.suffix.get(),
                                         times=self.variable.get(),
                                         ax1=self.ax1, ax2=self.ax2,
                                         canvas_draw=self.canvas,
                                         fig=self.fig, log=self.log,
                                         file=self.file, writingTimes=self.writingTimes,
                                         maxtimes=self.maxtimes, killAll=self.kill_all, data_path=self.data_path)
        self.t1.daemon = True
        self.t1.start()
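
Note: thread() above is a method on a Tkinter controller; isRunning(), Glove.GloveRun, and LeapMotion.LeapRun are project-specific. A minimal self-contained sketch of the same pattern (all names below are illustrative assumptions, not the project's code):

try:
    import tkinter as Tk  # Python 3
except ImportError:
    import Tkinter as Tk  # Python 2, the version this listing targets
import threading
import time

class App(object):
    def __init__(self, root):
        self.t1 = None
        Tk.Button(root, text="Start", command=self.thread).pack()

    def isRunning(self):
        # Stand-in for the snippet's isRunning(): is the worker still alive?
        return self.t1 is not None and self.t1.is_alive()

    def worker(self):
        time.sleep(1)  # stand-in for the GloveRun/LeapRun workload

    def thread(self):
        if not self.isRunning():
            self.t1 = threading.Thread(target=self.worker)
            self.t1.daemon = True  # worker dies with the GUI process
            self.t1.start()

root = Tk.Tk()
App(root)
root.mainloop()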
Example #2
def experiment1(glove_file="../data/glove_vectors/glove.6B.100d.txt", question_dir="../data/all_sat/seven_sat_raw.txt"):

    # Text before the blank
    def getBeforeBlankText(sentence):
        return sentence[:sentence.find("____")]

    # Text after the blank
    def getAfterBlankText(sentence):
        return sentence[sentence.find("____") + len("____"):]

    print "Loading Questions"
    questions = loadQuestions(question_dir)

    print "num questions:", len(questions)

    print "Loading Glove None"
    glove = Glove(glove_file, delimiter=" ", header=False, quoting=csv.QUOTE_NONE, v=False)

    print "Experimenting on 100 percent of questions"
    for i in range(int(math.floor(len(questions) * 1))):  # change 1 to a decimal to use fewer questions
        question = questions[i]

        # only want single blanks for now
        if len(re.findall('____(.*?)____', question.text, re.DOTALL)) != 0:
            continue

        answer_words = getStrippedAnswerWords(question.getCorrectWord())
        answer_vec = glove.getVec(answer_words[0])

        total_vec = glove.getAverageVec(filter(lambda x: x not in stopwords.words('english'), question.getSentence()))
        before_vec = glove.getAverageVec(filter(lambda x: x not in stopwords.words('english'), getBeforeBlankText(question.text)))
        after_vec = glove.getAverageVec(filter(lambda x: x not in stopwords.words('english'), getAfterBlankText(question.text)))

        # Print the question when the whole-sentence average vector is NOT strictly
        # closer to the answer than both the before-blank and after-blank averages
        total_distance = cosine(answer_vec, total_vec)
        before_distance = cosine(answer_vec, before_vec) if len(before_vec) > 2 else 2
        after_distance = cosine(answer_vec, after_vec) if len(after_vec) > 2 else 2
        if total_distance < before_distance and total_distance < after_distance:
            continue  # comment this out to print for every question
        print question.text, answer_words[0]
        print "total distance:", total_distance
        print "before distance:", before_distance
        print "after distance:", after_distance
        print "\n\n"
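
Note: cosine above is presumably scipy.spatial.distance.cosine (a distance, not a similarity). A quick self-contained illustration of the quantity being compared:

import numpy as np
from scipy.spatial.distance import cosine

a = np.array([1.0, 0.0, 1.0])
b = np.array([0.5, 0.5, 1.0])
d = cosine(a, b)  # 1 - cosine similarity; 0 means same direction, up to 2 for opposite
assert 0.0 <= d <= 2.0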
Example #3
def experiment1(glove_file="../data/glove_vectors/glove.6B.100d.txt",
                question_dir="../data/all_sat/seven_sat_raw.txt"):

    # Look at text before blank
    def getBeforeBlankText(sentence):
        return sentence[:sentence.find("____")]

    # Look at text after blank
    def getAfterBlankText(sentence):
        return sentence[sentence.find("____") + len("____"):]

    print "Loading Questions"
    questions = loadQuestions(question_dir)

    print "num questions: ", len(questions)

    print "Loading Glove None"
    glove = Glove(glove_file,
                  delimiter=" ",
                  header=False,
                  quoting=csv.QUOTE_NONE,
                  v=False)

    print "Experimenting on 100 percent of questions"
    for i in range(int(math.floor(
            len(questions) *
            1))):  #change 1 to decimal to reduce amount of questions
        question = questions[i]

        #only want single blanks for now
        if len(re.findall('____(.*?)____', question.text, re.DOTALL)) != 0:
            continue

        answer_words = getStrippedAnswerWords(question.getCorrectWord())
        answer_vec = glove.getVec(answer_words[0])

        total_vec = glove.getAverageVec(
            filter(lambda x: x not in stopwords.words('english'),
                   question.getSentence()))
        before_vec = glove.getAverageVec(
            filter(lambda x: x not in stopwords.words('english'),
                   getBeforeBlankText(question.text)))
        after_vec = glove.getAverageVec(
            filter(lambda x: x not in stopwords.words('english'),
                   getAfterBlankText(question.text)))

        # Print the question when the whole-sentence average vector is NOT strictly
        # closer to the answer than both the before-blank and after-blank averages
        total_distance = cosine(answer_vec, total_vec)
        before_distance = cosine(answer_vec,
                                 before_vec) if len(before_vec) > 2 else 2
        after_distance = cosine(answer_vec,
                                after_vec) if len(after_vec) > 2 else 2
        if total_distance < before_distance and total_distance < after_distance:
            continue  # comment this out to print for every question
        print question.text, answer_words[0]
        print "total distance:", total_distance
        print "before distance: ", before_distance
        print "after distance: ", after_distance
        print "\n\n"
Example #4
def createFeatureExtractorForAll(examples, unigrams, bigrams, glove_file):
    print "Loading Glove None"
    glove = Glove(glove_file, delimiter=" ", header=False, quoting=csv.QUOTE_NONE, v=False)
    all_features = []
    for i in range(len(examples) * 5):  # five answer choices per question
        all_features.append([])
    all_ys = []
    low_ranks = [None, "pmi", "ppmi", "tfidf"]
    #low_ranks = [None]
    print "Calculating VSM Methods"
    # Get Glove-based models, one per weighting scheme
    for lr in low_ranks:
        if lr is not None:
            print "Loading Glove %s" % lr
            glove = Glove(glove_file, delimiter=" ", header=False, quoting=csv.QUOTE_NONE, weighting=lr, v=False)
        glove.lsa(250)
        count = 0
        for example in examples:
            for a in example.answers:
                data = (example, a)
                features = createSingleExtractorVSM(data, glove, unigrams)
                all_features[count] += features
                count += 1
    print "Calculating N-Grams + Interactions"

    # Get answers + unigram/bigram features, then add in interactions
    index = 0
    for example in examples:
        for i, word in enumerate(example.answers):
            all_ys.append(1 if i == example.correctAnswer else 0)

            unigram_d = unigramModel(unigrams, example, word)
            bigram_d = bigramModel(bigrams, example, word)

            all_features[index].append(unigram_d)
            all_features[index].append(bigram_d)

            # Bias term
            all_features[index].append(1)

            # Interaction terms: products of every pair of existing features
            num_feats = len(all_features[index])
            for p in range(num_feats - 1):
                for q in range(p + 1, num_feats - 1):
                    all_features[index].append(all_features[index][p] * all_features[index][q])
            index += 1
    print "Done"
    return (all_features, all_ys)
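
Note: the (all_features, all_ys) pair returned above is ready for a standard classifier. A hedged sketch of one way to consume it (scikit-learn here is an illustrative choice, not part of the original project):

import numpy as np
from sklearn.linear_model import LogisticRegression

def train_on_features(all_features, all_ys):
    # Every row has the same feature count, so a dense matrix works
    X = np.array(all_features, dtype=float)
    y = np.array(all_ys)
    clf = LogisticRegression()
    clf.fit(X, y)
    return clf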
Example #5
def experiment2(glove_file="../data/glove_vectors/glove.6B.50d.txt",
                question_dir="../data/all_sat/seven_sat_raw.txt"):
    print "Loading Questions"
    questions = loadQuestions(question_dir)

    print "num questions: ", len(questions)

    print "Loading Glove None"
    glove = Glove(glove_file,
                  delimiter=" ",
                  header=False,
                  quoting=csv.QUOTE_NONE,
                  v=False)

    print "Training N Grams"
    #unigrams, bigrams, cgrams = getGrams(path="../data/Holmes_Training_Data/norvig.txt")

    # Count how many double blanks
    singles = []
    doubles = []
    for q in questions:
        if q.getSentence().count('____') > 1:
            doubles.append(q)
        else:
            singles.append(q)

    # num_right maps the number of models that answered a question correctly
    # to how many questions had exactly that many models right
    print "Looking at every question for every model"
    num_right = {}
    for q in doubles:
        models_right = []
        for name, model in vsm_models:
            answer, d = model(glove, q)
            if answer == -1 or answer is None: continue
            if answer == q.getCorrectWord():
                models_right.append(name)
        if len(models_right) in num_right:
            num_right[len(models_right)] += 1
        else:
            num_right[len(models_right)] = 1
        print models_right
    print num_right
Example #6
def getQuestionClassifications(questions, unigrams, bigrams, glove_file):
    model_classes = getModelClassifications()  # mapping of model/parameter types to integer classes
    prelim_mapping_array = [None] * len(questions)  # question index -> list of (model name, distance) pairs that predicted correctly
    # First, check whether the preliminary mapping was already pickled

    if len(getRecursiveFiles("../data/ml_data/sentence_train_prelim", filter_fn=lambda a: ".pickle" in a)) > 0:
        print "Found saved preliminary mappings"
        prelim_mapping_array = loadPickle("../data/ml_data/sentence_train_prelim/com_triandev_prelimmaparray.pickle")
    else:
        print "Finding Preliminary Mapping"
        # Do unigram + bigram first
        for i, question in enumerate(questions):
            u_answer = unigramModel(unigrams, question)
            b_answer = bigramModel(bigrams, question)
            if u_answer[0] == question.getCorrectWord():
                tups = ("Unigram", 2)  # TODO: change
                if prelim_mapping_array[i] is not None:
                    prelim_mapping_array[i].append(tups)
                else:
                    prelim_mapping_array[i] = [tups]
            if b_answer[0] == question.getCorrectWord():
                tups = ("Bigram", 2)  # TODO: change
                if prelim_mapping_array[i] is not None:
                    prelim_mapping_array[i].append(tups)
                else:
                    prelim_mapping_array[i] = [tups]

        # Do glove based now
        for lr in low_ranks:
            print "Loading Glove %s" % lr
            glove = Glove(glove_file, delimiter=" ", header=False, quoting=csv.QUOTE_NONE, weighting=lr, v=False)
            glove.lsa(250)  # TODO: change to 250
            for model_name, model_form in param_models:
                for d_form, d_name in distances:
                    whole_name = lr + model_name + d_name
                    for i, q in enumerate(questions):
                        answer = None
                        if model_name == "Weighted VSM":
                            answer = model_form(glove, unigrams, q, threshold=.95)
                        else:
                            answer = model_form(glove, q, threshold=.95)
                        if answer[0] is not None and answer[0] != -1 and answer[0] == q.getCorrectWord():
                            tups = (whole_name, answer[1])  # (name, distance)
                            if prelim_mapping_array[i] is not None:
                                prelim_mapping_array[i].append(tups)
                            else:
                                prelim_mapping_array[i] = [tups]
        print "saving preliminary mapping"
        savePickle(prelim_mapping_array, "../data/ml_data/sentence_train_prelim/com_triandev_prelimmaparray.pickle")
    #print prelim_mapping_array 

    # Classify each question and return; when several models answered it
    # correctly, pick the model whose answer had the lowest distance
    real_mapping = [None] * len(questions)
    for i, q in enumerate(questions):
        if prelim_mapping_array[i] is not None:
            #best_model = random.choice(prelim_mapping_array[i])
            best_model = min(prelim_mapping_array[i], key=lambda x: x[1])[0]
            real_mapping[i] = model_classes[best_model]
        else:
            real_mapping[i] = model_classes["None"]
    #print real_mapping
    return real_mapping
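
Note: loadPickle and savePickle are project helpers that are not shown in this listing; a plausible minimal implementation (an assumption, not the project's actual code) would be:

import pickle

def savePickle(obj, path):
    with open(path, "wb") as f:
        pickle.dump(obj, f, 2)  # protocol 2 stays loadable from Python 2

def loadPickle(path):
    with open(path, "rb") as f:
        return pickle.load(f)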
Example #7
import numpy as np
import Glove
import json
import tensorflow as tf
from tensorflow import keras
import pickle
import pandas as pd
import matplotlib.pyplot as plt

model = keras.models.load_model("CardColorCategorizer.h5")
glove_model = Glove.loadGloveModel()
text = []
newText = "flash enchant creature or vehicle enchanted creature gets minus three minus zero"
text.append(newText.split())

inputs = []
for tokens in text:  # renamed from `input` to avoid shadowing the built-in
    keywordVector = []
    finalKeywordVector = []
    for word in tokens:
        if word in glove_model:
            keywordVector.append(glove_model[word])
    # Pad or truncate to exactly 50 tokens of 50-dim vectors (2500 values)
    for i in range(50):
        if len(keywordVector) > i:
            finalKeywordVector.extend(keywordVector[i])
        else:
            finalKeywordVector.extend(np.zeros(50).tolist())
    inputs.append(np.array(finalKeywordVector))
inputs = np.array(inputs)
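
Note: the script stops after encoding the text; presumably the next step is to run the categorizer on it. A hedged sketch of that continuation (the class-index interpretation is an assumption):

predictions = model.predict(inputs)  # one row of class scores per encoded text
best_class = int(np.argmax(predictions[0]))  # index of the most likely color class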
Example #8
    start = time.time()  # renamed from `t`: the elapsed-time print below reads `start`

    if v: print "\tLoading passages..."
    questions = loadQuestions(directory="../data/dev_set/") if train else loadQuestions(directory="../data/test/")

    # Initialize global variables
    global backoff
    global tagger

    if v:
        if save: print "\tTraining Language Models..."
        else: print "\tLoading Language Models..."
    unigrams, bigrams, backoff = getGrams(path=f)

    if v: print "\tLoading Glove Vectors..."
    glove = Glove(g, delimiter=" ", header=False, quoting=csv.QUOTE_NONE, v=False)

    if v: print "\tInitializing Part-Of-Speech Classifier"
    #tagger = POSTagger(
    #        'stanford-postagger/models/english-bidirectional-distsim.tagger',
    #        'stanford-postagger/stanford-postagger.jar',
    #        'utf-8'
    #    )

    if v: print "Finished loading all external data in " + str(int(time.time() - start)) + " seconds!"
    if v: print "Starting program now..."

    # Main Method
    main(questions, glove)

    # Finished main execution
Example #9
File: score.py, Project: lxrogers/CS221SAT
def score(question_dir="../data/cayman_all_training.txt",
          glove_file="../data/glove_vectors/glove.6B.300d.txt",
          ngram_path="../data/Holmes_Training_Data/norvig.txt",
          dev=True):
    print "Training N-Grams"  # Load/Generate N-grams
    unigrams, bigrams, cgrams = getGrams(path=ngram_path)

    print "Loading Questions"  # Load questions
    questions = loadQuestions(question_dir)

    # Holds questions to be evaluated
    eval_qs = None

    if dev:
        # Split into train/dev
        split = len(questions) - len(questions) / 10
        inform("Splitting Data: " + str(split) +
               " questions in training and " + str(len(questions) - split) +
               " in dev...")
        train_questions, eval_qs = questions[:split], questions[split:]
    else:
        eval_qs = questions

    print "Loading Glove"  # Loads Glove vectors
    glove = Glove(glove_file,
                  delimiter=" ",
                  header=False,
                  quoting=csv.QUOTE_NONE,
                  v=False)

    # For every VSM model
    for name, model in vsm_models:

        # We get the model's score
        print "Scoring ", name
        answer_guess_pairs = []
        for question in eval_qs:
            guess = None

            # Weighted VSM has an extra parameter
            if name == "Weighted VSM":
                guess = model(glove, question, unigrams)[0]
            else:
                guess = model(glove, question)[0]

            # Get the correct answer
            answer = question.getCorrectWord()

            # Add to tuple GOLD and guessed answers
            answer_guess_pairs.append((guess, answer))

        print "\n\n"
        scoring.score_model(answer_guess_pairs, verbose=True, modelname=name)

    # Now score Language models
    # For every Language model
    for name, model in language_models:

        # Do the same thing as before
        print "Scoring ", name
        answer_guess_pairs = []

        # For every question
        for question in eval_qs:

            # Generate guess from model
            guess = model(unigrams, bigrams, question)[0]

            # Find GOLD answer (correct answer)
            answer = question.getCorrectWord()

            # Add tuple for scoring
            answer_guess_pairs.append((guess, answer))

        print "\n\n"
        scoring.score_model(answer_guess_pairs, verbose=True, modelname=name)
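
Note: scoring.score_model is a project module not shown here; as a rough stand-in, the accuracy it reports over the (guess, answer) pairs could be computed like this (a sketch, not the project's actual scorer):

def accuracy(answer_guess_pairs):
    # Each pair is (guess, gold answer); missing guesses count as wrong
    correct = sum(1 for guess, answer in answer_guess_pairs if guess == answer)
    return float(correct) / max(len(answer_guess_pairs), 1)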
Example #10
def main():
    if (v): print "Loading passages..."
    passages = loadPassages(f)

    # Initialize all the external data
    if (v): print "Loading all external data..."
    tfidf_array, allWords = computeTFIDFArray(passages)
    unigrams, bigrams, trigrams = getGrams()
    glove = Glove(g, delimiter=" ", header=False, quoting=csv.QUOTE_NONE)
    cooccurrences = cooccurrence()

    if (v): print "Running models..."
    # Initialize arrays to keep answers
    rand, nn, sent, tfidf, gram, syn, wdn, cc, an = [], [], [], [], [], [], [], [], []

    # Loop through all the questions
    for passage in passages:
        for question in passage.questions:

            # Find relevant word
            targetword = re.findall(
                "[\xe2\x80\x9c\u2019\"\']([A-Za-z\s]+)[\xe2\x80\x9c\u2019\"\']",
                question.text)[0].lower()

            # Tokenize relevant sentence
            sentence = passage.text.split("\n")[
                int(re.findall("[0-9]+", question.text)[0]) - 1]
            sentence = re.split("[^A-Za-z0-9]", sentence)
            sentence = filter(lambda x: len(x) > 0, sentence)
            sentence = map(lambda x: x.strip().lower(), sentence)

            # Get correct answer
            correctAnswer = question.answers[question.correctAnswer]

            # Get answers
            randAnswer = randomModel(question.answers)
            nnAnswer = nearestNeighborModel(targetword,
                                            question.answers,
                                            glove,
                                            threshold=.48)
            sentAnswer = sentenceModel(sentence,
                                       question.answers,
                                       glove,
                                       threshold=.44)
            tfidfAnswer = tfidfModel(sentence,
                                     question.answers,
                                     tfidf_array,
                                     allWords,
                                     glove,
                                     threshold=.44)
            gramAnswer = gramModel(sentence,
                                   question.answers,
                                   targetword,
                                   unigrams,
                                   bigrams,
                                   trigrams,
                                   glove,
                                   threshold=.42)
            wdnvec, wdnAnswer = wordnetModel(targetword,
                                             sentence,
                                             question.answers,
                                             glove,
                                             threshold=.46)
            synAnswer = synonymModel(targetword,
                                     wdnvec,
                                     sentence,
                                     question.answers,
                                     bigrams,
                                     trigrams,
                                     glove,
                                     threshold=.34)
            ccAnswer = cooccurrenceModel(targetword, sentence,
                                         question.answers, cooccurrences,
                                         glove)
            anAnswer = analogyModel(targetword, sentence, question.answers,
                                    cooccurrences, glove)

            # Record (guess, gold) answer pairs for scoring
            rand.append((randAnswer, correctAnswer))
            nn.append((nnAnswer, correctAnswer))
            sent.append((sentAnswer, correctAnswer))
            tfidf.append((tfidfAnswer, correctAnswer))
            gram.append((gramAnswer, correctAnswer))
            wdn.append((wdnAnswer, correctAnswer))
            syn.append((synAnswer, correctAnswer))
            cc.append((ccAnswer, correctAnswer))
            an.append((anAnswer, correctAnswer))

    score_model(rand, verbose=True, modelname="Random Model")
    score_model(nn, verbose=True, modelname="Nearest Neighbor Model")
    score_model(sent, verbose=True, modelname="Sentence-Based Model")
    score_model(tfidf, verbose=True, modelname="TFIDF Model")
    score_model(gram, verbose=True, modelname="Gram Model")
    score_model(wdn, verbose=True, modelname="WordNet Model")
    score_model(syn, verbose=True, modelname="Synonym Model")
    score_model(cc, verbose=True, modelname="Cooccurrence Model")
    score_model(an, verbose=True, modelname="Analogy Model")
Example #11
def main():
    if v: print "Loading passages..."
    passages = loadPassages(f)

    # Initialize all the external data
    if v: print "Loading all external data..."
    tfidf_array, allWords = computeTFIDFArray(passages)
    unigrams, bigrams, trigrams = getGrams()
    glove = Glove(g, delimiter=" ", header=False, quoting=csv.QUOTE_NONE)
    cooccurrences = cooccurrence()

    if v: print "Running models..."
    # Initialize arrays to keep answers
    rand, nn, sent, tfidf, gram, syn, wdn, cc, an = [], [], [], [], [], [], [], [], []

    # Loop through all the questions
    for passage in passages:
        for question in passage.questions:

            # Find relevant word
            targetword = re.findall("[\xe2\x80\x9c\u2019\"\']([A-Za-z\s]+)[\xe2\x80\x9c\u2019\"\']", question.text)[0].lower()

            # Tokenize relevant sentence
            sentence = passage.text.split("\n")[int(re.findall("[0-9]+", question.text)[0]) - 1]
            sentence = re.split("[^A-Za-z0-9]", sentence)
            sentence = filter(lambda x: len(x) > 0, sentence)
            sentence = map(lambda x: x.strip().lower(), sentence)

            # Get correct answer
            correctAnswer = question.answers[question.correctAnswer]

            # Get answers
            randAnswer = randomModel(question.answers)
            nnAnswer = nearestNeighborModel(targetword, question.answers, glove)
            sentAnswer = sentenceModel(sentence, question.answers, glove)
            tfidfAnswer = tfidfModel(sentence, question.answers, tfidf_array, allWords, glove)
            gramAnswer = gramModel(sentence, question.answers, targetword, unigrams, bigrams, trigrams, glove)
            synAnswer = synonymModel(targetword, sentence, question.answers, bigrams, trigrams, glove)
            wdnAnswer = wordnetModel(targetword, sentence, question.answers, glove, threshold=0.3)
            ccAnswer = cooccurrenceModel(targetword, sentence, question.answers, cooccurrences, glove)
            anAnswer = analogyModel(targetword, sentence, question.answers, cooccurrences, glove)

            # Record (guess, gold) answer pairs for scoring
            rand.append((randAnswer, correctAnswer))
            nn.append((nnAnswer, correctAnswer))
            sent.append((sentAnswer, correctAnswer))
            tfidf.append((tfidfAnswer, correctAnswer))
            gram.append((gramAnswer, correctAnswer))
            syn.append((synAnswer, correctAnswer))
            wdn.append((wdnAnswer, correctAnswer))
            cc.append((ccAnswer, correctAnswer))
            an.append((anAnswer, correctAnswer))

    print "NN:", percentWrong(nn)
    print "Sent:", percentWrong(sent)
    print "gram:", percentWrong(gram)
    print "tfidf:", percentWrong(tfidf)
    print "syn:", percentWrong(syn)
    print "wdn:", percentWrong(wdn)
    print "cc:", percentWrong(cc)
    print "an:", percentWrong(an)

    # names = ["NN", "sent", "gram", "tfidf", "syn", "wdn", "cc", "an"]
    # for i, m1 in enumerate(zip(names, [nn, sent, gram, tfidf, syn, wdn, cc, an])):
    #     for j, m2 in enumerate(zip(names, [nn, sent, gram, tfidf, syn, wdn, cc, an])):
    #         if i > j:
    #             print m1[0], m2[0], percentWrong(combineModels(m1[1], m2[1])), len(combineModels(m1[1], m2[1]))

    score_model(rand, verbose=True, modelname="Random Model")
    score_model(nn, verbose=True, modelname="Nearest Neighbor Model")
    score_model(sent, verbose=True, modelname="Sentence-Based Model")
    score_model(tfidf, verbose=True, modelname="TFIDF Model")
    score_model(gram, verbose=True, modelname="Gram Model")
    score_model(syn, verbose=True, modelname="Synonym Model")
    score_model(wdn, verbose=True, modelname="WordNet Model")
    score_model(cc, verbose=True, modelname="Cooccurrence Model")
    score_model(an, verbose=True, modelname="Analogy Model")
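
Note: percentWrong is another helper that is not shown in this listing; a plausible stand-in that mirrors the (guess, answer) pair convention above (an assumption, not the project's code):

def percentWrong(pairs):
    # Fraction of answered questions the model got wrong
    wrong = sum(1 for guess, answer in pairs if guess is not None and guess != answer)
    return float(wrong) / max(len(pairs), 1)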