# Example #1 (score: 0)
def createFeatureExtractorForAll(examples, unigrams, bigrams, glove_file):
    print "Loading Glove None"
    glove = Glove(glove_file, delimiter=" ", header=False, quoting=csv.QUOTE_NONE, v=False);
    all_features = []
    for i in range(len(examples)*5):
        all_features.append([])
    all_ys = []
    low_ranks = [None, "pmi", "ppmi", "tfidf"];
    #low_ranks = [None]
    print "Calculating VSM Methods"
    # Get Glove Based Models
    for lr in low_ranks:
        if lr != None:
            print "Loading Glove %s" %(lr)
            glove = Glove(glove_file, delimiter=" ", header=False, quoting=csv.QUOTE_NONE, weighting=lr, v=False);
        glove.lsa(250)
        count = 0
        for example in examples:
            for a in example.answers:
                data = (example, a)
                features = createSingleExtractorVSM(data, glove, unigrams)
                all_features[count] += features
                count += 1
    print "Calculating N-Grams + Interactions"
    
    # Get answers + Unigram/Bigram + Add in interactions
    index = 0
    for example in examples:
        for i,word in enumerate(example.answers):
            if i == example.correctAnswer:
                all_ys.append(1)
            else:
                all_ys.append(0)

            unigram_d = unigramModel(unigrams, example, word)
            bigram_d = bigramModel(bigrams, example, word)
    
            all_features[index].append(unigram_d)
            all_features[index].append(bigram_d)
            
            # Bias Term
            all_features[index].append(1)
            
            #Interaction Terms
            num_feats = len(all_features[index])
            for i in range(num_feats-1):
                for j in range(i+1, num_feats-1):
                    all_features[index].append(all_features[index][i]*all_features[index][j])
            index += 1
    print "Done"
    return (all_features, all_ys)
# Example #2 (score: 0)
def getQuestionClassifications(questions, unigrams, bigrams, glove_file):
    model_classes = getModelClassifications(); # Mapping of types of models/parameters to integer
    prelim_mapping_array = [None]*len(questions) # Map of question to a list of corresponding to models that correctly predicted the answer
    # First Check if the prelim mapping is in a pickle

    if len(getRecursiveFiles("../data/ml_data/sentence_train_prelim", filter_fn=lambda a: ".pickle" in a)) > 0:
        print "found Saved Prelimninary Mappings"
        prelim_mapping_array = loadPickle("../data/ml_data/sentence_train_prelim/com_triandev_prelimmaparray.pickle")
    else:
        print "Finding Preliminary Mapping"
        # Do unigram + bigram first
        for i,question in enumerate(questions):
            u_answer = unigramModel(unigrams, question)
            b_answer = bigramModel(bigrams, question)
            if u_answer[0] == question.getCorrectWord():
                tups = ("Unigram", 2) # TODO: change
                if prelim_mapping_array[i] != None:
                    prelim_mapping_array[i].append(tups)
                else:
                    prelim_mapping_array[i] = [tups]
            if b_answer[0] == question.getCorrectWord():
                tups = ("Bigram", 2) # TODO: change
                if prelim_mapping_array[i] != None:
                    prelim_mapping_array[i].append(tups)
                else:
                    prelim_mapping_array[i] = [tups]

        # Do glove based now
        for lr in low_ranks:
            print "Loading Glove %s" %(lr)
            glove = Glove(glove_file, delimiter=" ", header=False, quoting=csv.QUOTE_NONE, weighting=lr, v=False);
            glove.lsa(250) # TODO: change to 250
            for model_name, model_form in param_models:
                for d_form, d_name in distances:
                    whole_name = lr + model_name + d_name
                    for i,q in enumerate(questions):
                        answer = None
                        if model_name == "Weighted VSM":
                            answer = model_form(glove, unigrams, q, threshold=.95)
                        else:
                            answer = model_form(glove, q, threshold=.95)
                        if answer[0] != None and answer[0] != -1 and answer[0] == q.getCorrectWord():
                            tups = (whole_name, answer[1]) # (Name, Distance)
                            if prelim_mapping_array[i] != None:
                                prelim_mapping_array[i].append(tups)
                            else:
                                prelim_mapping_array[i] = [tups]
        print "saving preliminary mapping"
        savePickle(prelim_mapping_array, "../data/ml_data/sentence_train_prelim/com_triandev_prelimmaparray.pickle")
    #print prelim_mapping_array 

    # Classify each question now + return
    # For now, randomly picks out of the right ones
    real_mapping = [None]*len(questions)
    for i,q in enumerate(questions):
        if prelim_mapping_array[i] != None:
            #best_model = random.choice(prelim_mapping_array[i])
            best_model = min(prelim_mapping_array[i], key=lambda x: x[1])[0] # Get the name of the model with the lowest distance
            real_mapping[i] = model_classes[best_model]
        else:
            real_mapping[i] = model_classes["None"]
    #print real_mapping
    return real_mapping