Example #1
def learning_curve():
    n = 50000
    nsteps = 10
    full = cu.get_sample_data_frame(n)
    # 60/20/20 split; the curve is measured on the cross-validation split, test stays held out
    data = full.iloc[: int(n * 0.6)].reset_index(drop=True)
    cval = full.iloc[int(n * 0.6) : int(n * 0.8)].reset_index(drop=True)
    test = full.iloc[int(n * 0.8) : n].reset_index(drop=True)
    step = len(data) // nsteps
    ndata = len(data)
    mvec = list(range(step, ndata + step, step))
    cval_features = features.extract_features(cval)
    data_error = []
    cval_error = []
    for m in mvec:
        print("running for size", m)
        train = data.iloc[:m].reset_index(drop=True)
        fea = features.extract_features(train)
        rf = RandomForestClassifier(n_estimators=50, verbose=0, n_jobs=5)
        rf.fit(fea, train["OpenStatus"])
        # priors for the optional probability re-calibration (disabled below)
        new_priors = cu.load_priors("train.csv")
        old_priors = cu.compute_priors(train.OpenStatus)
        # training error
        probs = rf.predict_proba(fea)
        # probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
        y_true = compute_y_true(train)
        score = multiclass_log_loss(y_true, probs)
        data_error.append(score)
        # cross-validation error
        probs = rf.predict_proba(cval_features)
        # probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
        y_true = compute_y_true(cval)
        score = multiclass_log_loss(y_true, probs)
        cval_error.append(score)
    return mvec, data_error, cval_error
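The vectors returned by learning_curve() are convenient to plot directly; a minimal sketch, assuming matplotlib is installed and the module above is importable:

import matplotlib.pyplot as plt

mvec, data_error, cval_error = learning_curve()
plt.plot(mvec, data_error, label="training log loss")
plt.plot(mvec, cval_error, label="cross-validation log loss")
plt.xlabel("training set size")
plt.ylabel("multiclass log loss")
plt.legend()
plt.show()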
Example #2
def measure_prior(datasize=1000, testsize=500):
    # baseline: predict the overall class priors for every test sample
    data = cu.get_sample_data_frame(datasize)  # loaded for parity; unused by this baseline
    test = cu.get_test_data_frame(testsize)
    priors = cu.load_priors("train.csv")
    num_samples = len(test)
    probs = np.tile(priors, (num_samples, 1))  # one row of priors per test sample
    y_true = compute_y_true(test)
    score = multiclass_log_loss(y_true, probs)
    return score
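multiclass_log_loss is a project helper that is not shown here; the sketch below is a plausible stand-in for reference only, under the assumption that y_true carries integer class indices and y_pred one row of class probabilities per sample, resembling the standard Kaggle multiclass log loss:

import numpy as np

def multiclass_log_loss_sketch(y_true, y_pred, eps=1e-15):
    # clip away exact zeros/ones, renormalise each row, then average the
    # negative log-probability assigned to each sample's true class
    y_pred = np.clip(y_pred, eps, 1 - eps)
    y_pred = y_pred / y_pred.sum(axis=1, keepdims=True)
    rows = np.arange(y_pred.shape[0])
    return -np.mean(np.log(y_pred[rows, np.asarray(y_true, dtype=int)]))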
Example #3
def measure_bayes(datasize=1000, testsize=500):
    data = cu.get_sample_data_frame(datasize)
    test = cu.get_test_data_frame(testsize)
    nbfd = features.naive_features(data)
    nbft = features.naive_features(test)
    nb = nltk.NaiveBayesClassifier.train(nbfd)
    probs = []
    for item in nbft:
        p = nb.prob_classify(item[0])
        # column order follows p.samples(); it must match the label order
        # assumed by compute_y_true and multiclass_log_loss
        probs.append([p.prob(s) for s in p.samples()])
    probs = np.array(probs)
    new_priors = cu.load_priors("train.csv")
    old_priors = cu.compute_priors(data.OpenStatus)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    y_true = compute_y_true(test)
    score = multiclass_log_loss(y_true, probs)
    return score, nb, y_true, probs
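Pinning the probability columns to an explicit label order removes the risk flagged in the comment above. A short sketch, assuming cu.labels exists (it is used the same way in the SVM example below) and that compute_y_true follows sorted label order:

label_order = sorted(cu.labels)
probs = np.array([[nb.prob_classify(item[0]).prob(lab) for lab in label_order]
                  for item in nbft])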
Example #4
def measure_model(datasize=1000, testsize=500):
    data = cu.get_sample_data_frame(datasize)
    test = cu.get_test_data_frame(testsize)
    # earlier variants of the train/test split:
    # data = full.iloc[len(full) // 4:].reset_index(drop=True)   # last 3/4 of the records
    # test = full.iloc[:len(full) // 4].reset_index(drop=True)   # first 1/4 of the records
    # data = cu.get_dataframe('train-sample.csv')
    # test = cu.get_dataframe('public_leaderboard.csv')
    fea = features.extract_features(data)
    test_features = features.extract_features(test)
    # feature importances are available as rf.feature_importances_ after fitting
    rf = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=5)
    rf.fit(fea, data["OpenStatus"])
    probs = rf.predict_proba(test_features)
    # re-weight the predicted probabilities from the sample priors to the full-data priors
    new_priors = cu.load_priors("train.csv")
    old_priors = cu.compute_priors(data.OpenStatus)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    y_true = compute_y_true(test)
    score = multiclass_log_loss(y_true, probs)
    return score, rf, fea
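cu.cap_and_update_priors is a project helper whose implementation is not shown. The sketch below is only a plausible stand-in, under the assumption that it clips each probability into [cap, 1 - cap], re-weights by the ratio of the full-data priors to the sample priors, and renormalises each row:

import numpy as np

def cap_and_update_priors_sketch(old_priors, probs, new_priors, cap):
    # clip, re-weight toward the target priors, and renormalise rows to sum to one
    probs = np.clip(probs, cap, 1.0 - cap)
    probs = probs * (np.asarray(new_priors) / np.asarray(old_priors))
    return probs / probs.sum(axis=1, keepdims=True)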
Example #5
def measure_svm(datasize=1000, testsize=500):
    data = cu.get_sample_data_frame(datasize)
    test = cu.get_test_data_frame(testsize)
    # first 1000 vocabulary terms
    with open("vocab4.txt") as f:
        vocab = [w.strip() for w in f][:1000]
    vidx = get_vocab_index_lookup(vocab)
    print("extracting data features")
    xdata = extract_svm_features(vidx, data)
    print("extracting test features")
    xtest = extract_svm_features(vidx, test)
    labels = sorted(cu.labels)
    ydata = data.OpenStatus.apply(labels.index).tolist()
    # svm.SVC accepts sparse feature matrices directly
    model = svm.SVC(probability=True)
    print("fitting model")
    model.fit(xdata, ydata)
    print("predicting")
    probs = model.predict_proba(xtest)
    new_priors = cu.load_priors("train.csv")
    old_priors = cu.compute_priors(data.OpenStatus)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    y_true = compute_y_true(test)
    score = multiclass_log_loss(y_true, probs)
    return score, model
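get_vocab_index_lookup and extract_svm_features are project helpers not shown here. A hedged sketch of what they presumably produce, i.e. a token-to-column lookup and a sparse bag-of-words matrix over the fixed vocabulary; the BodyMarkdown column name and the whitespace tokenisation are assumptions:

from scipy.sparse import lil_matrix

def get_vocab_index_lookup_sketch(vocab):
    # token -> column index
    return {w: i for i, w in enumerate(vocab)}

def extract_svm_features_sketch(vidx, df):
    # one row per question, one column per vocabulary term; counts of
    # lower-cased, whitespace-separated tokens found in the body text
    X = lil_matrix((len(df), len(vidx)))
    for row, text in enumerate(df["BodyMarkdown"]):
        for tok in str(text).lower().split():
            col = vidx.get(tok)
            if col is not None:
                X[row, col] += 1
    return X.tocsr()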
Example #6
def measure_lda(datasize=1000, testsize=500):
    # number of documents to analyze in each online LDA iteration
    batchsize = 100
    # total number of questions on Stack Overflow
    D = 3.3e6
    # number of topics
    K = 100
    # how many batches of documents that amounts to (not used below)
    documentstoanalyze = datasize // batchsize
    # our vocabulary
    with open("./vocab2.txt") as f:
        vocab = [w.strip() for w in f]
    W = len(vocab)
    # the data
    data = cu.get_sample_data_frame(datasize)
    test = cu.get_test_data_frame(testsize)
    # initialize online LDA with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    lda = OnlineLDA(vocab, K, D, 1.0 / K, 1.0 / K, 1024.0, 0.7)
    # add per-document topic columns to both frames
    make_topic_columns(lda, data, K, D, batchsize)
    make_topic_columns(lda, test, K, D, batchsize)

    # earlier variants of the train/test split:
    # data = full.iloc[len(full) // 4:].reset_index(drop=True)   # last 3/4 of the records
    # test = full.iloc[:len(full) // 4].reset_index(drop=True)   # first 1/4 of the records
    # data = cu.get_dataframe('train-sample.csv')
    # test = cu.get_dataframe('public_leaderboard.csv')

    fea = features.extract_features(data)
    test_features = features.extract_features(test)
    rf = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=5)
    rf.fit(fea, data["OpenStatus"])
    probs = rf.predict_proba(test_features)
    new_priors = cu.load_priors("train.csv")
    old_priors = cu.compute_priors(data.OpenStatus)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    y_true = compute_y_true(test)
    score = multiclass_log_loss(y_true, probs)
    return score, rf, fea, data
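A hypothetical driver tying the measures above together (function names are taken from this file; the dataset sizes are arbitrary):

if __name__ == "__main__":
    print("prior baseline log loss:", measure_prior(datasize=1000, testsize=500))
    print("naive Bayes log loss   :", measure_bayes(datasize=1000, testsize=500)[0])
    print("random forest log loss :", measure_model(datasize=1000, testsize=500)[0])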