Example #1
def load_questions(args):
    """
	Load the question text to be analyzed
	"""

    train_reader, test_reader = read_csv_data.read_csv_data()
    train = list(train_reader)
    test = list(test_reader)
    random.shuffle(train)

    # split train for X-val
    if args.limit > 0:
        train = train[0:args.limit]

    #trainx, testx = utils.split_list(train, (args.split, 100.-args.split))
    #print ("len(xval_train) = {}, len(xval_test) = {}"\
    #        .format(len(trainx), len(testx)))

    pdb.set_trace()
    questions_pos = []

    for ind in range(0, len(test)):
        kk = test[ind]
        questions_pos.append(gen_pos(kk))
        sys.stdout.write("Parse Progress: %f%%   \r" %
                         (ind * 100 / float(len(test))))
    sys.stdout.flush()

    #filepath = 'pos/train_pos.pkl'
    filepath = 'pos/test_pos.pkl'
    pickle.dump(questions_pos, open(filepath, 'wb'))

    pdb.set_trace()
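
Both variants of load_questions below and above only read args.limit, so a minimal driver needs just that flag. A sketch of how it might be wired up from __main__, assuming load_questions and its dependencies (read_csv_data, gen_pos, etc.) live in this module; the --limit flag name is an assumption for illustration, not a confirmed CLI of the original project:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # --limit <= 0 keeps the full training set (see the `if args.limit > 0` guard)
    parser.add_argument('--limit', type=int, default=-1,
                        help='cap on the number of training rows')
    args = parser.parse_args()
    load_questions(args)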
Example #2
def load_questions(args):
	"""
	Load the question text to be analyzed
	"""

	train_reader, test_reader = read_csv_data.read_csv_data()
	train = list(train_reader); test = list(test_reader)
	random.shuffle(train)

	# split train for X-val
	if args.limit > 0:
	    train = train[0:args.limit]

	#trainx, testx = utils.split_list(train, (args.split, 100.-args.split))
	#print ("len(xval_train) = {}, len(xval_test) = {}"\
	#        .format(len(trainx), len(testx)))

	

	pdb.set_trace()
	questions_pos = []

	for ind in range(0,len(test)):
		kk = test[ind]
		questions_pos.append(gen_pos(kk))
		sys.stdout.write("Parse Progress: %f%%   \r" % (ind*100/float(len(test))) )
		sys.stdout.flush()


	#filepath = 'pos/train_pos.pkl'
	filepath = 'pos/test_pos.pkl'
	pickle.dump(questions_pos,open(filepath, 'wb'))

	pdb.set_trace()
Example #3
def read_answers():
    """
    Read in the test and train data and find all distinct answers.  It is
    assumed that each answer is the title of a Wikipedia page.

    Args:
        None

    Returns:
        list of distinct answer strings
    """
    train_reader, test_reader = read_csv_data.read_csv_data()
    
    train_answers = set()
    for row in train_reader:
        for ans in (row["answerA"], row["answerB"], row["answerC"],\
                row["answerD"]):
            train_answers.add(ans)

    test_answers = set()
    for row in test_reader:
        for ans in (row["answerA"], row["answerB"], row["answerC"],\
                row["answerD"]):
            test_answers.add(ans)
    
    return list(train_answers | test_answers)
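
Because the two per-split sets are unioned before the final list(), answers shared by train and test are counted once. A quick hedged check, assuming read_csv_data is importable as above:

answers = read_answers()
print("distinct answers: {}".format(len(answers)))
print("sample: {}".format(answers[:3]))  # order is arbitrary, since sets are unordered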
Example #4
def read_answers():
    """
    Read in the test and train data and find all distinct answers.  It is
    assumed that each answer is the title of a Wikipedia page.

    Args:
        None

    Returns:
        list of distinct answer strings
    """
    train_reader, test_reader = read_csv_data.read_csv_data()

    train_answers = set()
    for row in train_reader:
        for ans in (row["answerA"], row["answerB"], row["answerC"],\
                row["answerD"]):
            train_answers.add(ans)

    test_answers = set()
    for row in test_reader:
        for ans in (row["answerA"], row["answerB"], row["answerC"],\
                row["answerD"]):
            test_answers.add(ans)

    return list(train_answers | test_answers)
Example #5
def answer_questions(args):
    """
    Answer questions on the real-deal dataset by doing the following:
        1. Extract (or load) feature strings for the training and test set
        2. Parse the feature strings to compute feature vectors.
        3. ???
        4. Profit

    Args:
        args: ArgumentParser arguments defined in __main__

    Returns:
        None
    """
    pages_dict = pickle.load(open('../data/wiki_pages_dict.pkl', 'rb'))
    
    if not args.load:
        train_reader, test_reader = read_csv_data.read_csv_data()
        train = list(train_reader); test = list(test_reader)
        
        print ("len(train) = {}, len(test) = {}"\
                .format(len(train), len(test)))
 
        analyzer = similarity.Analyzer()
        feat = similarity.Featurizer(analyzer, pages_dict)
        
        print ("Computing feature strings:")
        fs, fv = feat.compute_feat_strings(train + test, print_info=True)
        
        pickle.dump((train, test, fs, fv, analyzer, feat),
                open('../data/realdeal_feat_strings.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)

    elif args.load: # load pre-computed feature strings
        print ("Loading precomputed feature strings for real-deal train and test:")
        train, test, fs, fv, analyzer, feat = \
                pickle.load(open('../data/realdeal_feat_strings.pkl', 'rb'))
    
    ## Here we do some cross-validation
    X = feat.compute_feats(fs)
    X = X.tocsr() # might already be CSR
    X.sort_indices() # needed for cosine-type measures

    # running into memory issues, so release this guy
    feat = None

    # try some LDA stuff
    print ("Training LDA topic model")
    topic_mod = lda.LDA(n_topics=20, n_iter=500)
    tm_feat = topic_model.Featurizer(analyzer, pages_dict) # use the same feature strings as similarity
    tm_fs = topic_model.add_wiki_categories(train+test, fs, fv, pages_dict) # adding these seems to hurt public test performance.  Does slightly better on Xval
    topic_X = tm_feat.compute_feats(tm_fs)
    topics = topic_mod.fit_transform(topic_X) # gives probabilities for each topic

    print ("Evaluating train data (overfitting!):")
    #acc_train = test_xval(train, X, fv, scorer=similarity.Scorer.cosine, print_info=True)
    acc_train = test_xval(train, X, fv,
            scorer=similarity.Scorer.cosine, topics=topics, print_info=True)
    print ("Train accuracy = {}\n".format(acc_train))

    print ("Making predictions for test data:")
    #our_answers = make_predictions(test, X, fv, scorer=similarity.Scorer.cosine, print_info=True)
    our_answers = make_predictions(test, X, fv,
            scorer=similarity.Scorer.cosine, topics=topics, print_info=True)
    answer_file = "../data/our_answers.csv"
    print ("Writing predictions to {}:".format(answer_file))
    o = csv.DictWriter(open(answer_file, 'w'), ["id", "correctAnswer"])
    o.writeheader()
    for q, a in zip(test, our_answers):  # izip is Python 2-only; zip works on both
        d = {"id": q['id'], "correctAnswer": a}
        o.writerow(d)
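
The X.sort_indices() call above matters because some sparse cosine kernels assume CSR column indices are sorted. A standalone sketch of the cosine scoring presumably performed by similarity.Scorer.cosine, using sklearn's cosine_similarity as a stand-in:

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

X = csr_matrix(np.array([[1., 0., 2.],
                         [0., 1., 1.]]))
X.sort_indices()  # no-op if indices are already sorted
print(cosine_similarity(X[0], X[1]))  # [[0.6325]] = 2 / (sqrt(5) * sqrt(2))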
Example #6
def answer_xval_lr(args):
    """
    Answer questions on a cross-validation dataset by doing the following:
        1. Extract (or load) feature strings for the training and test set
        2. Parse the feature strings to compute feature vectors.
        3. ???
        4. Profit

    Args:
        args: ArgumentParser arguments defined in __main__

    Returns:
        None
    """
    pages_dict = pickle.load(open('../data/wiki_pages_dict.pkl', 'rb'))
    
    if not args.load:
        train_reader, _ = read_csv_data.read_csv_data()
        train = list(train_reader)
        random.shuffle(train)

        # split train for X-val
        if args.limit > 0:
            train = train[0:args.limit]
        
        trainx, testx = utils.split_list(train, (args.split, 100.-args.split))
        print ("len(xval_train) = {}, len(xval_test) = {}"\
                .format(len(trainx), len(testx)))
 
        analyzer = similarity.Analyzer()
        feat = similarity.Featurizer(analyzer, pages_dict)
        
        print ("Computing feature strings:")
        fs, fv = feat.compute_feat_strings(trainx + testx, print_info=True)
        
        pickle.dump((trainx, testx, fs, fv, analyzer, feat),
                open('../data/xval_feat_strings.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)

    elif args.load: # load pre-computed feature strings
        print ("Loading precomputed feature strings for trainx and testx:")
        trainx, testx, fs, fv, analyzer, feat = \
                pickle.load(open('../data/xval_feat_strings.pkl', 'rb'))
    
    ## Here we do some cross-validation
    X = feat.compute_feats(fs)
    X = X.tocsr() # might already be CSR
    X.sort_indices() # needed for cosine-type measures

    #X_scaler = sklearn.preprocessing.StandardScaler(with_mean=False, with_std=True)
    #X = X_scaler.fit_transform(X)

    # try some LDA stuff
    print ("Training LDA topic model")
    topic_mod = lda.LDA(n_topics=20, n_iter=150)
    tm_analyzer = topic_model.Analyzer()
    tm_feat = topic_model.Featurizer(tm_analyzer, pages_dict) # use the same feature strings as similarity
    tm_fs = topic_model.add_wiki_categories(trainx+testx, fs, fv, pages_dict)
    topic_X = tm_feat.compute_feats(tm_fs)
    topics = topic_mod.fit_transform(topic_X) # gives probabilities for each topic

    #topics_scaler = sklearn.preprocessing.StandardScaler(with_mean=True, with_std=True)
    #topics = topics_scaler.fit_transform(topics)

    # compute similarity for each question and each answer (of 4)
    # use this as X (e.g. NLP similarity, LDA similarity)
    # binary classification with LR (i.e. is the answer right or not)
    
    print ("Evaluating train data:")
    X_lr_train, y_lr_train = compute_scores(trainx, X, fv,\
            scorer=similarity.Scorer.cosine, topics=topics, train=True,\
            print_info=True)
    print ("Training LR")
    # standardizing
    lr_scaler = sklearn.preprocessing.StandardScaler(with_mean=True, with_std=True)
    X_lr_train = lr_scaler.fit_transform(X_lr_train)

    # alpha sets the weight on regularization term
    lr = sklearn.linear_model.SGDClassifier(loss='log', penalty='l2',\
            n_iter=100, shuffle=True, fit_intercept=True, class_weight={0:.1, 1:.9})
    lr.fit(X_lr_train, y_lr_train)
    #lr.coef_[0,0] = 0.75
    #lr.coef_[0,1] = 0.25
    #lr.intercept_[0] = 0.0
    print (lr.coef_)
    print (lr.intercept_)
    our_answers = lr_make_predictions(X_lr_train, lr)
    acc_trainx = compute_accuracy(trainx, our_answers)
    print ("Train accuracy = {}\n".format(acc_trainx))

    print ("Evaluating test data:")
    X_lr_test = compute_scores(testx, X, fv,\
            scorer=similarity.Scorer.cosine, topics=topics, print_info=True)
    X_lr_test = lr_scaler.transform(X_lr_test)
    our_answers = lr_make_predictions(X_lr_test, lr)
    acc_testx = compute_accuracy(testx, our_answers)
    print ("Test accuracy = {}\n".format(acc_testx))
Example #7
def answer_xval(args):
    """
    Answer questions on a cross-validation dataset by doing the following:
        1. Extract (or load) feature strings for the training and test set
        2. Parse the feature strings to compute feature vectors.
        3. ???
        4. Profit

    Args:
        args: ArgumentParser arguments defined in __main__

    Returns:
        None
    """
    pages_dict = pickle.load(open('../data/wiki_pages_dict.pkl', 'rb'))
    
    if not args.load:
        train_reader, _ = read_csv_data.read_csv_data()
        train = list(train_reader)
        random.shuffle(train)

        # split train for X-val
        if args.limit > 0:
            train = train[0:args.limit]
        
        trainx, testx = utils.split_list(train, (args.split, 100.-args.split))
        print ("len(xval_train) = {}, len(xval_test) = {}"\
                .format(len(trainx), len(testx)))
 
        analyzer = similarity.Analyzer()
        feat = similarity.Featurizer(analyzer, pages_dict)
        
        print ("Computing feature strings:")
        fs, fv = feat.compute_feat_strings(trainx + testx, print_info=True)
        
        pickle.dump((trainx, testx, fs, fv, analyzer, feat),
                open('../data/xval_feat_strings.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)

    elif args.load: # load pre-computed feature strings
        print ("Loading precomputed feature strings for trainx and testx:")
        trainx, testx, fs, fv, analyzer, feat = \
                pickle.load(open('../data/xval_feat_strings.pkl', 'rb'))

    #XXX use this one instead   
    #feat = None
    #analyzer = similarity.Analyzer()
    #feat = similarity.Featurizer(analyzer, pages_dict)

    ## Here we do some cross-validation
    X = feat.compute_feats(fs)
    X = X.tocsr() # might already be CSR
    X.sort_indices() # needed for cosine-type measures

    # try some LDA stuff
    print ("Training LDA topic model")
    topic_mod = lda.LDA(n_topics=20, n_iter=150)
    tm_analyzer = topic_model.Analyzer()
    tm_feat = topic_model.Featurizer(tm_analyzer, pages_dict) # use the same feature strings as similarity
    tm_fs = topic_model.add_wiki_categories(trainx+testx, fs, fv, pages_dict)
    topic_X = tm_feat.compute_feats(tm_fs)
    topics = topic_mod.fit_transform(topic_X) # gives probabilities for each topic

    print ("Evaluating train data:")
    #acc_trainx = test_xval(trainx, X, fv, scorer=similarity.Scorer.cosine, print_info=True)
    acc_trainx = test_xval(trainx, X, fv,
            scorer=similarity.Scorer.cosine, topics=topics, print_info=True)
    print ("Train accuracy = {}\n".format(acc_trainx))

    print ("Evaluating test data:")
    #acc_testx = test_xval(testx, X, fv, scorer=similarity.Scorer.cosine, print_info=True)
    acc_testx = test_xval(testx, X, fv,
            scorer=similarity.Scorer.cosine, topics=topics, print_info=True)
    print ("Test accuracy = {}\n".format(acc_testx))
Example #8
def answer_xval(args):
    """
    Answer questions on a cross-validation dataset by doing the following:
        1. Extract (or load) feature strings for the training and test set
        2. Parse the feature strings to compute feature vectors.
        3. ???
        4. Profit

    Args:
        args: ArgumentParser arguments defined in __main__

    Returns:
        None
    """
    pages_dict = pickle.load(open('../data/wiki_pages_dict.pkl', 'rb'))
    train_pos = pickle.load(open('pos/train_pos.pkl', 'rb'))
    test_pos = pickle.load(open('pos/test_pos.pkl', 'rb'))
    all_pos = train_pos + test_pos

    example_d = pickle.load(open('pos/sim_prob_dict.pkl', 'rb'))

    row_num = 0
    old_ans = []
    with open('pos/our_answers.csv', 'r') as csvfile:  # 'rb' is Python 2-style; csv.reader needs text mode on Python 3
         ans_reader = csv.reader(csvfile, delimiter=',')
         for row in ans_reader:
            if row_num > 0:
                old_ans.append({'id':row[0],'correctAnswer':row[1]})
            row_num += 1

    if not args.load:
        train_reader, test_reader = read_csv_data.read_csv_data()
        train = list(train_reader)
        test = list(test_reader)
        all_data = train + test
        random.shuffle(train)

        # split train for X-val
        if args.limit > 0:
            train = train[0:args.limit]
        
        trainx, testx = utils.split_list(train, (args.split, 100.-args.split))
        print ("len(xval_train) = {}, len(xval_test) = {}"\
                .format(len(trainx), len(testx)))
 
        #analyzer = similarity.Analyzer()
        #feat = similarity.Featurizer(analyzer, pages_dict)
        
        #print ("Computing feature strings:")
        #fs, fv = feat.compute_feat_strings(trainx + testx, print_info=True)

#####################################
        #use_data = train
        #use_pos = train_pos
        use_data = all_data
        use_pos = all_pos

        ind = 0
        num_this = 0
        ans_types = {}
        num_q = 0
        old_relevant = []
        #for kk in trainx:
        for kk in use_data:
            #for kk_pos in train_pos:
            for kk_pos in use_pos:
                if kk_pos['id'] == kk['id']:
                    break

            #for kk_old in old_ans:
            #    if kk_old['id'] == kk['id']:
            #        break
            #ans_types.append(question_features2(kk))
            #ans_types.append(question_features2(kk_pos))
            [k,t] = question_features2(kk_pos)
            if k != 0:
                ans_types[k] = t
                num_q += 1
            #old_relevant.append(kk_old)
            ind += 1
            sys.stdout.write("Parse Progress: %f%%   \r" % (ind*100/float(len(use_data))) )
            sys.stdout.flush()

        num_empty = 0
        for ans in ans_types:
            if not(ans):
                num_empty += 1

        pred_list = {}
        ind = 0
        max_ind = len(use_data)
        for kk in range(0,len(use_data)):
            #if ind > max_ind:
            #    break
            if use_data[kk]['id'] in ans_types.keys():
                ind += 1
                pred_list[use_data[kk]['id']] = answer_question(use_data[kk], \
                    ans_types[use_data[kk]['id']], pages_dict)
            else:
                ind += 1
                pred_list[use_data[kk]['id']] = []
            sys.stdout.write("Parse Progress: %f%%   \r" % (ind*100/max_ind) )
            sys.stdout.flush()        


        '''
        for kk in range(0,len(ans_types)):
            if ind > max_ind:
                break
            if (ans_types[kk]):
                ind += 1
                #pred_list.append(google_ans(trainx[kk], ans_types[kk]))
                #pred_list.append(answer_question(trainx[kk], ans_types[kk], pages_dict))
                pred_list.append(answer_question(use_data[kk], ans_types[kk], pages_dict))

            else:
                ind += 1
                pred_list.append([])
            sys.stdout.write("Parse Progress: %f%%   \r" % (ind*100/max_ind) )
            sys.stdout.flush()        '''    

        corr = 0
        total = 0
        for p in range(0,len(train)):
            q_key = train[p]['id']
            if q_key in pred_list.keys():
                if pred_list[q_key]:
                    if pred_list[q_key] == train[p]['correctAnswer']:
                    #if pred_list[p] == old_relevant[p]['correctAnswer']:
                        corr += 1
                    total +=1

        print ('Performance: ' + str(corr/float(total)))
        print ('Fraction Answered: ' + str(float(total)/float(len(use_data))))

        final_answers = pickle_ans(pred_list, use_data)

        pdb.set_trace()

        filepath = 'pos/metric_dict_10_90.pkl'
        pickle.dump(final_answers,open(filepath, 'wb'))
Example #9
def answer_xval(args):
    """
    Answer questions on a cross-validation dataset by doing the following:
        1. Extract (or load) feature strings for the training and test set
        2. Parse the feature strings to compute feature vectors.
        3. ???
        4. Profit

    Args:
        args: ArgumentParser arguments defined in __main__

    Returns:
        None
    """
    pages_dict = pickle.load(open('../data/wiki_pages_dict.pkl', 'rb'))
    train_pos = pickle.load(open('pos/train_pos.pkl', 'rb'))
    test_pos = pickle.load(open('pos/test_pos.pkl', 'rb'))
    all_pos = train_pos + test_pos

    example_d = pickle.load(open('pos/sim_prob_dict.pkl', 'rb'))

    row_num = 0
    old_ans = []
    with open('pos/our_answers.csv', 'r') as csvfile:  # 'rb' is Python 2-style; csv.reader needs text mode on Python 3
        ans_reader = csv.reader(csvfile, delimiter=',')
        for row in ans_reader:
            if row_num > 0:
                old_ans.append({'id': row[0], 'correctAnswer': row[1]})
            row_num += 1

    if not args.load:
        train_reader, test_reader = read_csv_data.read_csv_data()
        train = list(train_reader)
        test = list(test_reader)
        all_data = train + test
        random.shuffle(train)

        # split train for X-val
        if args.limit > 0:
            train = train[0:args.limit]

        trainx, testx = utils.split_list(train,
                                         (args.split, 100. - args.split))
        print ("len(xval_train) = {}, len(xval_test) = {}"\
                .format(len(trainx), len(testx)))

        #analyzer = similarity.Analyzer()
        #feat = similarity.Featurizer(analyzer, pages_dict)

        #print ("Computing feature strings:")
        #fs, fv = feat.compute_feat_strings(trainx + testx, print_info=True)

        #####################################
        #use_data = train
        #use_pos = train_pos
        use_data = all_data
        use_pos = all_pos

        ind = 0
        num_this = 0
        ans_types = {}
        num_q = 0
        old_relevant = []
        #for kk in trainx:
        for kk in use_data:
            #for kk_pos in train_pos:
            for kk_pos in use_pos:
                if kk_pos['id'] == kk['id']:
                    break

            #for kk_old in old_ans:
            #    if kk_old['id'] == kk['id']:
            #        break
            #ans_types.append(question_features2(kk))
            #ans_types.append(question_features2(kk_pos))
            [k, t] = question_features2(kk_pos)
            if k != 0:
                ans_types[k] = t
                num_q += 1
            #old_relevant.append(kk_old)
            ind += 1
            sys.stdout.write("Parse Progress: %f%%   \r" %
                             (ind * 100 / float(len(use_data))))
            sys.stdout.flush()

        num_empty = 0
        for ans in ans_types:
            if not (ans):
                num_empty += 1

        pred_list = {}
        ind = 0
        max_ind = len(use_data)
        for kk in range(0, len(use_data)):
            #if ind > max_ind:
            #    break
            if use_data[kk]['id'] in ans_types.keys():
                ind += 1
                pred_list[use_data[kk]['id']] = answer_question(use_data[kk], \
                    ans_types[use_data[kk]['id']], pages_dict)
            else:
                ind += 1
                pred_list[use_data[kk]['id']] = []
            sys.stdout.write("Parse Progress: %f%%   \r" %
                             (ind * 100 / max_ind))
            sys.stdout.flush()
        '''
        for kk in range(0,len(ans_types)):
            if ind > max_ind:
                break
            if (ans_types[kk]):
                ind += 1
                #pred_list.append(google_ans(trainx[kk], ans_types[kk]))
                #pred_list.append(answer_question(trainx[kk], ans_types[kk], pages_dict))
                pred_list.append(answer_question(use_data[kk], ans_types[kk], pages_dict))

            else:
                ind += 1
                pred_list.append([])
            sys.stdout.write("Parse Progress: %f%%   \r" % (ind*100/max_ind) )
            sys.stdout.flush()        '''

        corr = 0
        total = 0
        for p in range(0, len(train)):
            q_key = train[p]['id']
            if q_key in pred_list.keys():
                if pred_list[q_key]:
                    if pred_list[q_key] == train[p]['correctAnswer']:
                        #if pred_list[p] == old_relevant[p]['correctAnswer']:
                        corr += 1
                    total += 1

        print('Performance: ' + str(corr / float(total)))
        print('Fraction Answered: ' + str(float(total) / float(len(use_data))))

        final_answers = pickle_ans(pred_list, use_data)

        pdb.set_trace()

        filepath = 'pos/metric_dict_10_90.pkl'
        pickle.dump(final_answers, open(filepath, 'wb'))
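
The in-place progress display used throughout these examples relies on a carriage return plus an explicit flush; without the flush, the partial line can sit in the output buffer until the loop ends. Isolated for reference (time.sleep stands in for the real per-row work):

import sys
import time

n = 50
for ind in range(n):
    sys.stdout.write("Parse Progress: %f%%   \r" % (ind * 100 / float(n)))
    sys.stdout.flush()   # force the partial line out; '\r' rewinds to column 0
    time.sleep(0.01)     # stand-in for real work
sys.stdout.write("\n")   # move off the progress line when done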