def answer_questions(args):
    """
    Answer questions on the real-deal dataset by doing the following:
        1. Extract (or load) feature strings for the training and test set
        2. Parse the feature strings to compute feature vectors.
        2. ???
        3. Profit

    Args:
        args: ArgumentParser arguments defined in __main__

    Returns:
        None
    """
    pages_dict = pickle.load(open('../data/wiki_pages_dict.pkl', 'rb'))
    
    if not args.load:
        train_reader, test_reader = read_csv_data.read_csv_data()
        train = list(train_reader)
        test = list(test_reader)
        
        print ("len(train) = {}, len(test) = {}"\
                .format(len(train), len(test)))
 
        analyzer = similarity.Analyzer()
        feat = similarity.Featurizer(analyzer, pages_dict)
        
        print ("Computing feature strings:")
        fs, fv = feat.compute_feat_strings(train + test, print_info=True)
        
        pickle.dump((train, test, fs, fv, analyzer, feat),
                open('../data/realdeal_feat_strings.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)

    elif args.load: # load pre-computed feature strings
        print ("Loading precomputed feature strings for real-deal train and test:")
        train, test, fs, fv, analyzer, feat = \
                pickle.load(open('../data/realdeal_feat_strings.pkl', 'rb'))
    
    ## Compute feature vectors from the feature strings
    X = feat.compute_feats(fs)
    X = X.tocsr() # might already be CSR
    X.sort_indices() # needed for cosine-type measures

    # release the featurizer here to avoid memory issues; it is no longer needed
    feat = None

    # try some LDA stuff
    print ("Training LDA topic model")
    topic_mod = lda.LDA(n_topics=20, n_iter=500)
    tm_feat = topic_model.Featurizer(analyzer, pages_dict) # use the same feature strings as similarity
    tm_fs = topic_model.add_wiki_categories(train+test, fs, fv, pages_dict) # adding Wikipedia categories seems to hurt public-test performance, though it does slightly better on cross-validation
    topic_X = tm_feat.compute_feats(tm_fs)
    topics = topic_mod.fit_transform(topic_X) # gives probabilities for each topic

    print ("Evaluating train data (overfitting!):")
    acc_train = test_xval(train, X, fv,
            scorer=similarity.Scorer.cosine, topics=topics, print_info=True)
    print ("Train accuracy = {}\n".format(acc_train))

    print ("Making predictions for test data:")
    our_answers = make_predictions(test, X, fv,
            scorer=similarity.Scorer.cosine, topics=topics, print_info=True)
    answer_file = "../data/our_answers.csv"
    print ("Writing predictions to {}:".format(answer_file))
    with open(answer_file, 'w') as answer_f:
        o = csv.DictWriter(answer_f, ["id", "correctAnswer"])
        o.writeheader()
        for q, a in zip(test, our_answers):
            o.writerow({"id": q['id'], "correctAnswer": a})
def answer_xval(args):
    """
    Answer questions on a cross-validation dataset by doing the following:
        1. Extract (or load) feature strings for the training and test set
        2. Parse the feature strings to compute feature vectors.
        2. ???
        3. Profit

    Args:
        args: ArgumentParser arguments defined in __main__

    Returns:
        None
    """
    pages_dict = pickle.load(open('../data/wiki_pages_dict.pkl', 'rb'))
    
    if not args.load:
        train_reader, _ = read_csv_data.read_csv_data()
        train = list(train_reader)
        random.shuffle(train)

        # split train for X-val
        if args.limit > 0:
            train = train[0:args.limit]
        
        trainx, testx = utils.split_list(train, (args.split, 100.-args.split))
        print ("len(xval_train) = {}, len(xval_test) = {}"\
                .format(len(trainx), len(testx)))
 
        analyzer = similarity.Analyzer()
        feat = similarity.Featurizer(analyzer, pages_dict)
        
        print ("Computing feature strings:")
        fs, fv = feat.compute_feat_strings(trainx + testx, print_info=True)
        
        pickle.dump((trainx, testx, fs, fv, analyzer, feat),
                open('../data/xval_feat_strings.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)

    elif args.load: # load pre-computed feature strings
        print ("Loading precomputed feature strings for trainx and testx:")
        trainx, testx, fs, fv, analyzer, feat = \
                pickle.load(open('../data/xval_feat_strings.pkl', 'rb'))

    # XXX: to rebuild the featurizer from scratch instead of reusing the pickled
    # one, uncomment the following:
    #feat = None
    #analyzer = similarity.Analyzer()
    #feat = similarity.Featurizer(analyzer, pages_dict)

    ## Here we do some cross-validation
    X = feat.compute_feats(fs)
    X = X.tocsr() # might already be CSR
    X.sort_indices() # needed for cosine-type measures

    # try some LDA stuff
    print ("Training LDA topic model")
    topic_mod = lda.LDA(n_topics=20, n_iter=150)
    tm_analyzer = topic_model.Analyzer()
    tm_feat = topic_model.Featurizer(tm_analyzer, pages_dict) # use the same feature strings as similarity
    tm_fs = topic_model.add_wiki_categories(trainx+testx, fs, fv, pages_dict)
    topic_X = tm_feat.compute_feats(tm_fs)
    topics = topic_mod.fit_transform(topic_X) # gives probabilities for each topic

    print ("Evaluating train data:")
    acc_trainx = test_xval(trainx, X, fv,
            scorer=similarity.Scorer.cosine, topics=topics, print_info=True)
    print ("Train accuracy = {}\n".format(acc_trainx))

    print ("Evaluating test data:")
    acc_testx = test_xval(testx, X, fv,
            scorer=similarity.Scorer.cosine, topics=topics, print_info=True)
    print ("Test accuracy = {}\n".format(acc_testx))
def answer_xval_lr(args):
    """
    Answer questions on a cross-validation dataset by doing the following:
        1. Extract (or load) feature strings for the training and test set
        2. Parse the feature strings to compute feature vectors.
        2. ???
        3. Profit

    Args:
        args: ArgumentParser arguments defined in __main__

    Returns:
        None
    """
    pages_dict = pickle.load(open('../data/wiki_pages_dict.pkl', 'rb'))
    
    if not args.load:
        train_reader, _ = read_csv_data.read_csv_data()
        train = list(train_reader)
        random.shuffle(train)

        # split train for X-val
        if args.limit > 0:
            train = train[0:args.limit]
        
        trainx, testx = utils.split_list(train, (args.split, 100.-args.split))
        print ("len(xval_train) = {}, len(xval_test) = {}"\
                .format(len(trainx), len(testx)))
 
        analyzer = similarity.Analyzer()
        feat = similarity.Featurizer(analyzer, pages_dict)
        
        print ("Computing feature strings:")
        fs, fv = feat.compute_feat_strings(trainx + testx, print_info=True)
        
        pickle.dump((trainx, testx, fs, fv, analyzer, feat),
                open('../data/xval_feat_strings.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)

    elif args.load: # load pre-computed feature strings
        print ("Loading precomputed feature strings for trainx and testx:")
        trainx, testx, fs, fv, analyzer, feat = \
                pickle.load(open('../data/xval_feat_strings.pkl', 'rb'))
    
    ## Here we do some cross-validation
    X = feat.compute_feats(fs)
    X = X.tocsr() # might already be CSR
    X.sort_indices() # needed for cosine-type measures

    #X_scaler = sklearn.preprocessing.StandardScaler(with_mean=False, with_std=True)
    #X = X_scaler.fit_transform(X)

    # try some LDA stuff
    print ("Training LDA topic model")
    topic_mod = lda.LDA(n_topics=20, n_iter=150)
    tm_analyzer = topic_model.Analyzer()
    tm_feat = topic_model.Featurizer(tm_analyzer, pages_dict) # use the same feature strings as similarity
    tm_fs = topic_model.add_wiki_categories(trainx+testx, fs, fv, pages_dict)
    topic_X = tm_feat.compute_feats(tm_fs)
    topics = topic_mod.fit_transform(topic_X) # gives probabilities for each topic

    #topics_scaler = sklearn.preprocessing.StandardScaler(with_mean=True, with_std=True)
    #topics = topics_scaler.fit_transform(topics)

    # Compute similarity scores for each question and each of its 4 candidate
    # answers, and use them as features (NLP similarity, LDA topic similarity)
    # for a binary logistic-regression classifier: is this answer correct or not?
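    # Each question presumably contributes 4 consecutive rows to the score
    # matrix (one per candidate answer), with column 0 holding the similarity
    # score and column 1 the topic score; see the slicing in
    # make_sim_and_topic_dicts below.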
    
    print ("Evaluating train data:")
    X_lr_train, y_lr_train = compute_scores(trainx, X, fv,\
            scorer=similarity.Scorer.cosine, topics=topics, train=True,\
            print_info=True)
    print ("Training LR")
    # standardizing
    lr_scaler = sklearn.preprocessing.StandardScaler(with_mean=True, with_std=True)
    X_lr_train = lr_scaler.fit_transform(X_lr_train)
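    # NOTE: keep this fitted scaler; the test-set scores below are transformed
    # with the same fit (lr_scaler.transform) so train and test share one scaling.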

    # alpha (the regularization weight) is left at its default; class_weight
    # upweights the positive class since only 1 of the 4 candidate answers is correct
    lr = sklearn.linear_model.SGDClassifier(loss='log', penalty='l2',\
            n_iter=100, shuffle=True, fit_intercept=True, class_weight={0:.1, 1:.9})
    lr.fit(X_lr_train, y_lr_train)
    #lr.coef_[0,0] = 0.75
    #lr.coef_[0,1] = 0.25
    #lr.intercept_[0] = 0.0
    print (lr.coef_)
    print (lr.intercept_)
    our_answers = lr_make_predictions(X_lr_train, lr)
    acc_trainx = compute_accuracy(trainx, our_answers)
    print ("Train accuracy = {}\n".format(acc_trainx))

    print ("Evaluating test data:")
    X_lr_test = compute_scores(testx, X, fv,\
            scorer=similarity.Scorer.cosine, topics=topics, print_info=True)
    X_lr_test = lr_scaler.transform(X_lr_test)
    our_answers = lr_make_predictions(X_lr_test, lr)
    acc_testx = compute_accuracy(testx, our_answers)
    print ("Test accuracy = {}\n".format(acc_testx))
def make_sim_and_topic_dicts():
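    """
    Compute, for every real-deal train and test question, a softmax probability
    distribution over its 4 candidate answers from the NLP similarity scores and
    another from the LDA topic scores, then pickle the resulting dicts (keyed by
    question id) to ../data/sim_prob_dict.pkl and ../data/topic_prob_dict.pkl.
    """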
    pages_dict = pickle.load(open('../data/wiki_pages_dict.pkl', 'rb'))

    #print ("Loading precomputed feature strings for trainx and testx:")
    #train, test, fs, fv, analyzer, feat = \
    #        pickle.load(open('../data/xval_feat_strings.pkl', 'rb'))
    
    print ("Loading precomputed feature strings for real-deal train and test:")
    train, test, fs, fv, analyzer, feat = \
            pickle.load(open('../data/realdeal_feat_strings.pkl', 'rb'))


    ## Compute feature vectors from the feature strings
    X = feat.compute_feats(fs)
    X = X.tocsr() # might already be CSR
    X.sort_indices() # needed for cosine-type measures

    feat = None

    # try some LDA stuff
    print ("Training LDA topic model")
    #topic_mod = lda.LDA(n_topics=20, n_iter=150)
    topic_mod = lda.LDA(n_topics=20, n_iter=500)
    tm_analyzer = topic_model.Analyzer()
    tm_feat = topic_model.Featurizer(tm_analyzer, pages_dict) # use the same feature strings as similarity
    tm_fs = topic_model.add_wiki_categories(train+test, fs, fv, pages_dict)
    topic_X = tm_feat.compute_feats(tm_fs)
    topics = topic_mod.fit_transform(topic_X) # gives probabilities for each topic

    # Compute similarity and LDA-topic scores for each question and each of its
    # 4 candidate answers; below they are converted into per-question probability
    # distributions via a softmax (no LR classifier is trained here).
    
    print ("Evaluating train data:")
    X_lr_train = compute_scores(train, X, fv,
            scorer=similarity.Scorer.cosine, topics=topics, train=False,
            print_info=True)
    
    sim_prob_dict = dict()
    topic_prob_dict = dict()
    for ind, q in enumerate(train):
        arr = np.exp(X_lr_train[4*ind:4*ind+4,0]) # softmax over the 4 similarity scores
        sim_prob_dict[q['id']] = arr / sum(arr)

        arr = np.exp(X_lr_train[4*ind:4*ind+4,1]) # softmax over the 4 topic scores
        topic_prob_dict[q['id']] = arr / sum(arr)
        #print (q['id'], sim_prob_dict[q['id']], topic_prob_dict[q['id']])

    print ("Evaluating test data:")
    X_lr_test = compute_scores(test, X, fv,\
            scorer=similarity.Scorer.cosine, topics=topics, train=False,\
            print_info=True)
    
    for ind, q in enumerate(test):
        arr = np.exp(X_lr_test[4*ind:4*ind+4,0]) # softmax over the 4 similarity scores
        sim_prob_dict[q['id']] = arr / sum(arr)

        arr = np.exp(X_lr_test[4*ind:4*ind+4,1]) # softmax over the 4 topic scores
        topic_prob_dict[q['id']] = arr / sum(arr)
        #print (q['id'], sim_prob_dict[q['id']], topic_prob_dict[q['id']])
 
    pickle.dump(sim_prob_dict, open('../data/sim_prob_dict.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
    pickle.dump(topic_prob_dict, open('../data/topic_prob_dict.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
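
# Sketch (not called by the pipeline above): one way a downstream consumer might
# blend the two pickled probability dicts to pick an answer for a question id.
# The similarity weight and the A-D letter mapping are illustrative assumptions.
def _blend_probs_example(qid, sim_prob_dict, topic_prob_dict, w_sim=0.75):
    combined = w_sim * sim_prob_dict[qid] + (1. - w_sim) * topic_prob_dict[qid]
    return "ABCD"[int(np.argmax(combined))]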