def loadOrgData():
  global x_org,x_t_org,d_org
  if x_org == []:
    x_org = loadData(scale=1,train=True)
    x_t_org = loadData(scale=1,train=False)
    d_org = len(x_org[0,:])
    reset = False
  else:
    print "Data already loaded."
Example #2
def doStuff(name, scale=1, P=100):
    y = y_org

    # Step 0: load data (zoomed or not)
    print("Reading training data.")
    x = loadData(scale=scale, train=True)

    print("Reading test data.")
    x_t = loadData(scale=scale, train=False)
    nTest = len(x_t)
    d = len(x[0, :])
    d2 = d // P  # features per subset (integer division)

    # Step 1: divide data set into P subsets (TODO: boosting?)
    # we use P estimators
    sum_preds = np.zeros(nTest)
    sum_r = 0
    for i in range(0, P):
        if i % 5 == 0:
            print("Make prediction for subset/estimator %s..." % (i + 1))
        minI = d2 * i
        maxI = min((d2) * (i + 1), d) - 1
        x_fi = x[:, i::P]
        x_t_fi = x_t[:, i::P]
        # Step 2: apply each estimator on data matrix x_fi and age vector y
        y_pred_i, r = fi_prediction(x_fi, x_t_fi, y)

        # insert prediction for test data i here
        sum_preds = sum_preds + y_pred_i
        sum_r = sum_r + r
    """ Step 3: combine all estimates """
    y_t_pred = [i / float(P) for i in sum_preds]
    r = sum_r / float(P)
    """Step 4: post-process and save prediction"""
    # TODO: calculate and sum of save covariances
    prefix = "%s_CV_P%s_zoom%sFULL" % (name, P, 1 / float(scale))
    prep = lambda i: int(i)
    y_t_pp = [prep(i) for i in y_t_pred]
    savedFilename = saveCSV(y_t_pp, prefix)
    print("Saved predictions into %s" % savedFilename)
    """ Step 5: make histogram plot of age """
    # (because no visualization for flat data matrix...)
    plt.hist(y, color="black", rwidth=0.7)
    #plt.hist(y_pred,color="darkgreen",rwidth=0.5)
    plt.hist(y_t_pp, color="darkblue", rwidth=0.5)
    plt.legend(
        ["ages given for X", "ages predicted for X", "ages predicted for X_t"])
    savedPlotFname = prefix + ".png"
    plt.savefig(savedPlotFname)
    print("Saved age diagram in %s" % savedPlotFname)
    plt.clf()
    print("Average of coefficients: %s" % r)
    # return a collection of results
    return (x, y, x_t, y_t_pred, y_t_pp, r)
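Note: the split above relies on column striding — x[:, i::P] picks every P-th feature starting at offset i, so the P estimators see disjoint feature subsets and their test predictions are averaged. A hedged sketch of that scheme, with a Ridge regressor and random data standing in for fi_prediction and loadData:

import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.default_rng(0)
X, y = rng.normal(size=(200, 1000)), rng.normal(size=200)
X_test = rng.normal(size=(50, 1000))

P = 100
sum_preds = np.zeros(len(X_test))
for i in range(P):
    X_fi, X_t_fi = X[:, i::P], X_test[:, i::P]  # every P-th column, offset i
    model = Ridge().fit(X_fi, y)
    sum_preds += model.predict(X_t_fi)
y_test_pred = sum_preds / float(P)              # average over the P estimators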
Example #3
def doStuff(name, alpha=77, scale=1, P=100):
    y = y_org

    # Step 0: load data (zoomed or not)
    print("Reading training data.")
    #x = sharedmem.empty(n_max)
    x = loadData(scale=scale, train=True)

    print("Reading test data.")
    #x = sharedmem.empty(n_test_max)
    x_t = loadData(scale=scale, train=False)
    nTest = len(x_t)
    print(x)
    d = len(x[0, :])
    d2 = d // P  # features per subset (integer division)
    """ Step 1: divide data set into P subsets (TODO: boosting?) """
    # we use P estimators
    #  results = pool.map(each_elem,range(0,P))
    results = range(0, P)
    processes = [
        Process(target=each_elem, args=(i, x, x_t, results))
        for i in range(0, P)
    ]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
    """ Step 3: combine all estimates """
    y_t_pred = reduce(lambda a, x: a + x[0], results, 0) / float(P)
    r = reduce(lambda a, x: a + x[1], results, 0) / float(P)
    """Step 4: post-process and save prediction"""
    # TODO: calculate and sum of save covariances
    prefix = "%s_alpha%s_P%s_zoom%sFULL" % (name, alpha, P, 1 / float(scale))
    prep = lambda i: int(i)
    y_t_pp = [prep(i) for i in y_t_pred]
    savedFilename = saveCSV(y_t_pp, prefix)
    print("Saved predictions into %s" % savedFilename)
    """ Step 5: make histogram plot of age """
    # (because no visualization for flat data matrix...)
    plt.hist(y, color="black", rwidth=0.7)
    #plt.hist(y_pred,color="darkgreen",rwidth=0.5)
    plt.hist(y_t_pp, color="darkblue", rwidth=0.5)
    plt.legend(
        ["ages given for X", "ages predicted for X", "ages predicted for X_t"])
    savedPlotFname = prefix + ".png"
    plt.savefig(savedPlotFname)
    print("Saved age diagram in %s" % savedPlotFname)
    plt.clf()

    # return a collection of results
    return (x, y, x_t, y_t_pred, y_t_pp)
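Note: the multiprocessing variant above hands results = range(0, P) to each Process, but ordinary Python objects are not shared across processes, so the workers' writes never reach the parent (the commented-out pool.map line hints at the intended design). A hedged sketch of one way to collect the per-subset results with multiprocessing.Pool; each_elem here is a stand-in worker returning a (prediction, coefficient) pair, not the project's real function:

import numpy as np
from multiprocessing import Pool

def each_elem(i):
    # stand-in worker: the real one would slice x[:, i::P], fit and predict
    return np.full(10, float(i)), 1.0 / (i + 1)

if __name__ == "__main__":
    P = 4
    with Pool(processes=P) as pool:
        results = pool.map(each_elem, range(P))      # list of (y_pred_i, r_i)
    y_t_pred = sum(res[0] for res in results) / float(P)
    r = sum(res[1] for res in results) / float(P)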
Example #4
def train_model():
    with tf.Session() as sess:

        sess.run(tf.global_variables_initializer())

        train = loadData("./data/split_data/train.csv")

        for _ in range(epochs):
            for idx, row in train.iterrows():

                if not row["content"]:
                    continue

                feed_dict, labels = get_feed_dict(row)

                predicted, currentLoss = sess.run([logits, train_loss],
                                                  feed_dict=feed_dict)

                # Prints first 50 characters of the content with loss
                print(row["content"][:50], " - Loss:", currentLoss)

                sess.run(update_step, feed_dict=feed_dict)

        print("\nModel trained!")

        save_path = saver.save(sess, "./model_dir/model1/model.ckpt")
        print("Model saved in path: %s" % save_path)
Example #5
def tf_idf_advanced(comments):
    commentList = list2str(comments)
    print('Loading Vectorizer')
    if os.path.exists('models/vectorizer_imdb_tfidf_advanced.pkl'):
        with open('models/vectorizer_imdb_tfidf_advanced.pkl', 'rb') as fw:
            Vectorizer = pickle.load(fw)
        fw.close()
    else:
        print('reading data')
        filename = 'dataset/aclImdb/train/pos'
        filename1 = 'dataset/aclImdb/train/neg'
        trainComments, labels = pre.loadData(filename, filename1)
        trainCommentList = list2str(trainComments)
        Vectorizer = TfidfVectorizer(max_features=10000,
                                     input='content',
                                     analyzer=stemmed_words,
                                     stop_words='english',
                                     encoding='utf-8',
                                     decode_error='ignore',
                                     lowercase=True,
                                     ngram_range=(1, 3))
        Vectorizer.fit_transform(trainCommentList)
        with open('models/vectorizer_imdb_tfidf_advanced.pkl', 'wb') as fw:
            pickle.dump(Vectorizer, fw)
        fw.close()
    print('Vectorizing comments')
    return Vectorizer.transform(commentList)
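Note: the branch above implements a fit-once cache — unpickle the fitted TfidfVectorizer when the file exists, otherwise fit it on the training corpus and pickle it. A minimal sketch of that caching pattern with a placeholder corpus and cache path (not the IMDB data used above):

import os
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

CACHE = "vectorizer_cache.pkl"                    # placeholder path
corpus = ["a good movie", "a bad movie", "great acting, weak plot"]

if os.path.exists(CACHE):
    with open(CACHE, "rb") as fh:
        vectorizer = pickle.load(fh)
else:
    vectorizer = TfidfVectorizer(ngram_range=(1, 3), stop_words="english")
    vectorizer.fit(corpus)
    with open(CACHE, "wb") as fh:
        pickle.dump(vectorizer, fh)

features = vectorizer.transform(corpus)           # sparse TF-IDF matrix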
Example #6
def tsne_impl():
    from sklearn.manifold import TSNE
    X, y = preprocess.loadData()
    X_embedded = TSNE(n_components=2).fit_transform(X)
    print(X_embedded.shape)
    colors = np.random.rand(X_embedded.shape[0])
    scored_indices = y == 1
    not_scored_indices = y == 0
    fig, ax = plt.subplots()
    ax.scatter(X_embedded[scored_indices, 0],
               X_embedded[scored_indices, 1],
               c='red',
               label='Scored',
               marker='*')
    ax.scatter(X_embedded[not_scored_indices, 0],
               X_embedded[not_scored_indices, 1],
               c='blue',
               label='Not scored',
               marker='+')
    ax.legend()
    plt.show()
Example #7
def run():

    for gap in xrange(7):
        expansion = False
        data, comm = preprocess.loadData(gap, expansion)
        train_data, test_data = preprocess.split(data, comm)
        train = np.array(train_data[0])
        test = np.array(test_data[0])
        test_com = test_data[1]

        train_x = train[:, 1:]
        train_y = train[:, 0]

        print train_x.shape

        reg = gcv(train_x, train_y)

        if len(sys.argv) > 1 and sys.argv[1] == 'output':
            test_x = test[:, 1:]
            pred = reg.predict(test_x)
            output_test(pred, test_com, gap)
Example #8
def estimate_savings(model=None):
    """
    Calculate estimated savings
    """
    # Iterating and labeling
    pivot_frame = preprocess.loadData(file_name="pivot_data_new.csv")
    temp_group = pivot_frame.groupby(
        ['airline', 'flight_path', 'days_to_depart'])
    max_price = 0
    i = 0
    for indexes, res in tqdm(temp_group):
        if (i % 36 == 0):
            max_price = 0
        i += 1
        m = res["mean"].iloc[0]
        max_price = max(max_price, m)
        pivot_frame.loc[(pivot_frame['airline'] == indexes[0]) &
                        (pivot_frame['flight_path'] == indexes[1]) &
                        (pivot_frame['days_to_depart'] == indexes[2]),
                        'delta'] = max_price - m
    print("Total Savings =", pivot_frame['delta'].sum())
    print("Average Savings =", pivot_frame['delta'].mean())
Example #9
def PCA():
    from sklearn.decomposition import PCA
    X, y = preprocess.loadData()
    pca = PCA(n_components=2, svd_solver='full')
    X_embedded = pca.fit_transform(X, y)
    print(X.shape)
    print(X_embedded.shape)
    colors = np.random.rand(X_embedded.shape[0])
    scored_indices = y == 1
    not_scored_indices = y == 0
    fig, ax = plt.subplots()
    ax.scatter(X_embedded[scored_indices, 0],
               X_embedded[scored_indices, 1],
               c='red',
               label='Scored',
               marker='*')
    ax.scatter(X_embedded[not_scored_indices, 0],
               X_embedded[not_scored_indices, 1],
               c='blue',
               label='Not scored',
               marker='+')
    ax.legend()
    plt.show()
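Note: Examples #6 and #9 differ only in the embedding step (t-SNE vs. PCA); the scatter-plot code is duplicated verbatim. A hedged sketch of factoring the shared plotting into one helper, with random data standing in for preprocess.loadData():

import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def plot_embedding(X_embedded, y):
    # scatter the 2-D embedding, split by the binary label
    fig, ax = plt.subplots()
    ax.scatter(*X_embedded[y == 1].T, c='red', label='Scored', marker='*')
    ax.scatter(*X_embedded[y == 0].T, c='blue', label='Not scored', marker='+')
    ax.legend()
    plt.show()

rng = np.random.default_rng(0)
X, y = rng.normal(size=(100, 20)), rng.integers(0, 2, size=100)
plot_embedding(PCA(n_components=2, svd_solver='full').fit_transform(X), y)
plot_embedding(TSNE(n_components=2).fit_transform(X), y)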
Example #10
def bagging(file_path,
            output_path,
            model_type,
            begin_day,
            end_day,
            expansion=False,
            save_model=True,
            output=True,
            online=False,
            steps=1000,
            train_set="base",
            valid_set="base",
            test_set="base",
            cv=4,
            shuffle=False,
            weight=[50, 15, 15, 5],
            cla=0):
    final_lis = []
    for gap in range(begin_day, end_day):
        print "*" * 20, gap, "*" * 20
        print "Gap %d Start Time:" % gap, time.strftime(
            '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        data, comm = preprocess.loadData(gap, expansion=expansion)
        #train, test = preprocess.split(data, comm)
        train, valid, test_online = preprocess.split_lastweek(data, comm, cla)

        train_data = np.array(train[0])
        valid_data = np.array(valid[0])
        test_data = np.array(test_online[0])

        train_x = train_data[:, 1:]
        train_y = train_data[:, 0]
        valid_x = valid_data[:, 1:]
        valid_y = valid_data[:, 0]
        test_x = test_data[:, 1:]
        valid_x_nonan = valid_x.copy()
        valid_x_nonan[np.isnan(valid_x_nonan)] = -1
        test_x_nonan = test_x.copy()
        test_x_nonan[np.isnan(test_x_nonan)] = -1
        if shuffle:
            idx = np.random.permutation(train_y.size)
            train_x = train_x[idx]
            train_y = train_y[idx]

        #train_len = train_data.shape[0]
        #win_size = train_len / cv + 1
        #print >> sys.stderr, train_len
        #print >> sys.stderr, win_size

        clfs = []

        application = ["regression_l2"] * 2 + ["huber"] * 2 + ["fair"] * 6
        boosting = ["dart"] * 1 + ["gbrt"] * 11
        learning_rate = [0.015, 0.02, 0.03, 0.04]
        #metric = ["l1"] + ["l2"] + ["huber"] *4 + ["fair"] *3
        num_leaves = [32] * 1 + [64] * 2 + [128] * 2 + [256]
        feature_fraction = [0.5, 0.6, 0.7, 0.8, 0.9]
        bagging_fraction = [0.5, 0.6, 0.7, 0.8, 0.9]
        lambda_l1 = [0.5, 0.6, 0.7, 0.8, 0.9]
        lambda_l2 = [0.5, 0.6, 0.7, 0.8, 0.9]
        drop_rate = [0.3, 0.5, 0.7, 0.9]
        skip_drop = [0.3, 0.5, 0.7, 0.9]
        huber_delta = [0.6, 0.8, 0.9]
        fair_c = [0.6, 0.8, 0.9]
        max_bin = range(200, 400)
        feature_fraction_seed = range(1, 20)
        bagging_seed = range(1, 20)
        drop_seed = range(1, 20)

        for i in range(0, weight[1]):
            dic = {
                "application": random.choice(application),
                "boosting": random.choice(boosting),
                "learning_rate": random.choice(learning_rate),
                "num_leaves": random.choice(num_leaves),
                "feature_fraction": random.choice(feature_fraction),
                "bagging_fraction": random.choice(bagging_fraction),
                "lambda_l1": random.choice(lambda_l1),
                "lambda_l2": random.choice(lambda_l2),
                "drop_rate": random.choice(drop_rate),
                "skip_drop": random.choice(skip_drop),
                "max_bin": random.choice(max_bin),
                "huber_delta": random.choice(huber_delta),
                "fair_c": random.choice(fair_c),
                "feature_fraction_seed": random.choice(feature_fraction_seed),
                "bagging_seed": random.choice(bagging_seed),
                "drop_seed": random.choice(drop_seed)
            }
            clfs.append((fitLGBModel, dic))

        obj = [myObjective6] * 10 + ["reg:linear"]
        learning_rate = [0.015, 0.02, 0.03, 0.04]
        seed = range(1, 20)
        max_depth = range(5, 11)
        subsample = [0.5, 0.6, 0.7, 0.8, 0.9]
        colsample_bytree = [0.5, 0.6, 0.7, 0.8, 0.9]
        colsample_bylevel = [0.5, 0.6, 0.7, 0.8, 0.9]
        gamma = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
        for i in range(0, weight[0]):
            dic = {
                "objective": random.choice(obj),
                "learning_rate": random.choice(learning_rate),
                "seed": random.choice(seed),
                "max_depth": random.choice(max_depth),
                "subsample": random.choice(subsample),
                "colsample_bytree": random.choice(colsample_bytree),
                "colsample_bylevel": random.choice(colsample_bylevel),
                "gamma": random.choice(gamma)
            }
            clfs.append((fitXGBModel, dic))

        max_features = [0.5, 0.6, 0.65, 0.7, 0.8]
        max_depth = range(5, 11)
        min_samples_leaf = [2, 10, 30, 50]
        random_state = range(0, 8)
        for i in range(0, weight[2]):
            dic = {
                "max_features": random.choice(max_features),
                "max_depth": random.choice(max_depth),
                "min_samples_leaf": random.choice(min_samples_leaf),
                "random_state": random.choice(random_state)
            }
            clfs.append((fitRFModel, dic))

        loss = ["lad", "huber"]
        learning_rate = [0.01, 0.02, 0.03, 0.04]
        max_features = [0.5, 0.6, 0.65, 0.7, 0.8]
        max_depth = range(5, 11)
        subsample = [0.5, 0.6, 0.7, 0.8]
        random_state = range(0, 8)
        for i in range(0, weight[3]):
            dic = {
                "loss": random.choice(loss),
                "learning_rate": random.choice(learning_rate),
                "max_features": random.choice(max_features),
                "max_depth": random.choice(max_depth),
                "subsample": random.choice(subsample),
                "random_state": random.choice(random_state)
            }
            clfs.append((fitGBRTModel, dic))

        #stage2_train = np.zeros((train_x.shape[0], len(clfs)))
        stage2_valid = np.zeros((valid_x.shape[0], len(clfs)))
        stage2_test = np.zeros((test_x.shape[0], len(clfs)))

        bagging = np.random.randint(0,
                                    train_x.shape[0],
                                    size=(len(clfs), train_x.shape[0]))

        for idx, clf in enumerate(clfs):
            print "Gap", gap, "Model", idx
            print clf[1]
            print "Start Time:", time.strftime('%Y-%m-%d %H:%M:%S',
                                               time.localtime(time.time()))
            #skf = list(StratifiedKFold(train_y, cv, shuffle=True, random_state=idx))
            #stage2_valid_temp = np.zeros((valid_x.shape[0], len(skf)))
            #stage2_test_temp = np.zeros((test_x.shape[0], len(skf)))
            #train = train_x[bagging[idx]]
            #test = test_x[bagging[idx]]
            #for i, (train, test) in enumerate(skf):
            #print "Fold:", i, "Start Time:", time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
            X_train = train_x[bagging[idx]]
            y_train = train_y[bagging[idx]]
            print X_train.shape
            print y_train.shape
            #X_test = train_x[test]
            #y_test = train_y[test]
            if clf[0] in [
                    fitLRModel, fitRidgeModel, fitLinearSVRModel, fitGBRTModel,
                    fitRFModel
            ]:
                X_train[np.isnan(X_train)] = -1
                valid_x_tmp = valid_x_nonan
                test_x_tmp = test_x_nonan
            else:
                valid_x_tmp = valid_x
                test_x_tmp = test_x
            reg, eva = clf[0](X_train, y_train, valid_x_tmp, valid_y, clf[1])
            if save_model:
                joblib.dump(reg, "%s/%d_%d.m" % (file_path, gap, idx))
                #stage2_train[test, idx] = reg.predict(X_test)
            #stage2_valid_temp[:, idx] = reg.predict(valid_x)
            #stage2_test_temp[:, i] = reg.predict(test_x_tmp)
            stage2_valid[:, idx] = reg.predict(valid_x_tmp)
            stage2_test[:, idx] = reg.predict(test_x_tmp)
            print "Gap ", gap, "Model %d: %.6f" % (
                idx, evaluate(reg.predict(valid_x_tmp), valid_y))

        if gap == -1:
            for idx in range(len(clfs)):
                if clfs[idx][0] in [
                        fitLRModel, fitRidgeModel, fitLinearSVRModel,
                        fitGBRTModel, fitRFModel
                ]:
                    valid_x_tmp = valid_x_nonan
                    test_x_tmp = test_x_nonan
                else:
                    valid_x_tmp = valid_x
                    test_x_tmp = test_x
                reg = joblib.load("%s/%d_%d.m" % (file_path, gap, idx))
                stage2_valid[:, idx] = reg.predict(valid_x_tmp)
                stage2_test[:, idx] = reg.predict(test_x_tmp)

        #print "Final LR:", "Start Time:", time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
        #reg1, eva1 = fitLRModel(stage2_train, train_y, stage2_valid, valid_y, {})
        #print "Final LR Eval", eva1
        #print "Final Ridge:", "Start Time:", time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
        #reg2, eva2 = fitRidgeModel(stage2_train, train_y, stage2_valid, valid_y, {})
        #print "Final Ridge Eval", eva2
        #print "Final XGB:", "Start Time:", time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
        #reg3, eva3 = fitXGBModel(stage2_train, train_y, stage2_valid, valid_y,
        #                        {'max_depth':2, 'learning_rate':0.03, 'n_estimators':1000, 'seed':5,
        #                         'gamma':0.9, 'subsample':1.0, 'colsample_bytree':1.0, 'colsample_bylevel':1.0})
        #print "Final XGB Eval", eva3
        eva_avg = evaluate(stage2_valid.mean(1), valid_y)
        eva_med = evaluate(np.median(stage2_valid, axis=1), valid_y)
        print "Final AVG Eval", eva_avg
        print "Final MED Eval", eva_med

        final_lis.append(np.min([eva_avg, eva_med]))

        #print "Best Eval", np.min([eva1, eva2, eva3])
        #final_lis.append(np.min([eva1, eva2, eva3]))

        #if save_model:
        #  joblib.dump(reg1, "%s/stage2_avg_%d.m"%(file_path, gap))
        #  joblib.dump(reg3, "%s/stage2_med_%d.m"%(file_path, gap))

        if output:
            #output_test(stage2_test.mean(1), test_online[1], gap, "avg", eva_avg, output_path)
            output_test(np.median(stage2_test, axis=1), test_online[1], gap,
                        "median", eva_med, output_path)

        print "Gap %d Start Time:" % gap, time.strftime(
            '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    print "Week 1 Eval:", np.average(final_lis)
Example #11
def run():

    err = []
    #for gap in xrange(14):
    for i in xrange(7):
        gap = i
        expansion = False
        data, comm = preprocess.loadData(gap, expansion)
        eps = []
        for t in [1, 2]:
            train_data, valid_data, test_data = preprocess.split_lastweek(
                data, comm, t)
            train = np.array(train_data[0])
            valid = np.array(valid_data[0])
            test = np.array(test_data[0])
            test_com = test_data[1]

            train_x = train[:, 1:]
            train_y = train[:, 0]

            train_x[np.isnan(train_x)] = -1
            valid_x = valid[:, 1:]
            valid_x[np.isnan(valid_x)] = -1
            valid_y = valid[:, 0]

            print train_x.shape
            print valid_x.shape
            regs = []
            preds = []
            for seed in [1, 2, 3, 5, 7, 9, 11, 15, 20, 25, 30, 40, 80]:
                #for seed in [1]:
                reg, e, ep, pred = fitRFModel(train_x, train_y, valid_x,
                                              valid_y, seed, gap)
                #eps += list(ep)
                regs.append(reg)
                preds.append(pred)
                #pred = map(lambda x: int(max(0,x)),pred)
                #er = np.abs(pred) - valid_y
                #err_p = np.abs(er) / (np.abs(pred) + valid_y)
                #eps += list(err_p)
            pred = np.median(preds, axis=0)
            er = np.abs(pred) - valid_y
            err_p = np.abs(er) / (np.abs(pred) + valid_y)
            eps += list(err_p)
            ''' 
            with open('%d_seed%d_gap%d_valid_with_svd'%(t,seed,gap),'w') as fout:
                for com,e,t,p in zip(valid_data[1],ep,valid_y,pred):
                    fout.write("%f\t%f\t%d\t%s\t%s\n"%(e,p,t,com[0],com[1].strftime("%Y-%m-%d")))
            '''
            #print train_x.shape
            #reg = gcv(train_x,train_y)
            #pred = reg.predict(valid_x)
            #e = evaluate(pred,valid_y)
            #print e

            if len(sys.argv) > 1 and sys.argv[1] == 'output':
                test_x = test[:, 1:]
                test_x[np.isnan(test_x)] = -1
                preds = []
                for reg in regs:
                    pred = reg.predict(test_x)
                    preds.append(pred)
                pred = np.median(preds, axis=0)
                output_test(pred, test_com, gap, t)
        e = np.mean(eps)
        print e
        err.append(e)

    print np.mean(err)
Example #12

if __name__ == '__main__':

    data_params = {
        'reload': False,          # when True, re-parse the time-domain raw data (use when the data changes)
        'max_items_per_scan': 2,  # maximum number of items in a scan
        'train_test_split': 0.7,  # fraction of the data used for training
        'only_max': False,
        'saved_path': "../new_res/*.json",
        'use_backproj': True      # set to False to use the clean signal instead of backprojection
    }
    # reload_data()
    trainX_, testX_, trainY_, testY_ = loadData(**data_params)
    trainX, trainY = processData(
        [trainX_, trainY_], commands=["crop", "transpose", "flip_x", "flip_y"])
    testX, testY = processData([testX_, testY_], commands=["crop"])

    # trainX, trainY = processData([trainX_, trainY_],commands = ["crop"])
    # testX, testY = processData([testX_, testY_],commands = ["crop"])

    N = len(trainX)
    idx = np.arange(N)
    np.random.seed(5)
    np.random.shuffle(idx)
    trainX, trainY = trainX[idx], trainY[idx]

    # combinedX = np.concatenate((trainX,testX),axis = 0)
    # combinedY = np.concatenate((trainY,testY),axis = 0)# (34, 40, 20, 21, 5)
Example #13
                            color = color_map[j][0]
                            with tag('span', style=f'background: rgba({color[2]}, {color[1]}, {color[0]}, {heatmap[j]});', title=int(X_test[i, j])):
                                text(word + ' ')
                    with tag('p'):
                        text(
                            f'Pred: {Y[i]}, Label: {T[i]}')
                    doc.stag('hr')

    with open(out_dir / 'out.html', 'w') as f:
        f.write(doc.getvalue())


if __name__ == "__main__":
    init_logger()

    data_train, _ = loadData('train.csv', ('title', 'text'),
                             vocab_size=_vocab_size)
    X_train, T_train, X_test, T_test = get_train_test_set(data_train)

    logging.info(
        f'X_train {X_train.shape}, T_train {T_train.shape}, X_test {X_test.shape}, T_test {T_test.shape}')

    T_train = np.hstack((T_train.reshape(-1, 1), (1 - T_train).reshape(-1, 1)))
    T_test = np.hstack((T_test.reshape(-1, 1), (1 - T_test).reshape(-1, 1)))

    # check from the cache

    # test with title first
    if os.path.exists(cache_dir / CNN._cache_path):
        cnn = CNN.from_cache()
    else:
        cnn = CNN(X_train.shape[1], vocab_size=_vocab_size)
Example #14
def tf_idf_2doc(comments, labels, feat=10000):
    commentList = list2str(comments)
    rng = np.random.RandomState(seed=3)
    indices = np.arange(len(commentList))
    rng.shuffle(indices)
    commentarray = np.array(commentList)
    labelarray = np.array(labels)
    commentList = commentarray[indices]
    labels = labelarray[indices]
    commentList = commentList.tolist()
    labels = labels.tolist()
    print('Loading Vectorizer')
    if feat == 10000:
        if os.path.exists('models/vectorizer_imdb_tfidf_2doc.pkl'):
            with open('models/vectorizer_imdb_tfidf_2doc.pkl', 'rb') as fw:
                Vectorizer = pickle.load(fw)
            fw.close()
        else:
            print('reading data')
            filename = 'dataset/aclImdb/train/pos'
            filename1 = 'dataset/aclImdb/train/neg'
            trainComments, labels = pre.loadData(filename, filename1)
            trainComments = list2str(trainComments)
            trainCommentList = [
                list_to_str(trainComments[0:12499]),
                list_to_str(trainComments[12500:24999])
            ]
            Vectorizer = TfidfVectorizer(max_features=10000,
                                         input='content',
                                         analyzer=stemmed_words,
                                         stop_words='english',
                                         encoding='utf-8',
                                         decode_error='ignore',
                                         lowercase=True,
                                         ngram_range=(1, 3))
            Vectorizer.fit_transform(trainCommentList)
            with open('models/vectorizer_imdb_tfidf_2doc.pkl', 'wb') as fw:
                pickle.dump(Vectorizer, fw)
            fw.close()
    if feat == 3000:
        if os.path.exists('models/vectorizer_imdb_tfidf_2doc_feat3000.pkl'):
            with open('models/vectorizer_imdb_tfidf_2doc_feat3000.pkl',
                      'rb') as fw:
                Vectorizer = pickle.load(fw)
            fw.close()
        else:
            print('reading data')
            filename = 'dataset/aclImdb/train/pos'
            filename1 = 'dataset/aclImdb/train/neg'
            trainComments, labels = pre.loadData(filename, filename1)
            trainComments = list2str(trainComments)
            trainCommentList = [
                list_to_str(trainComments[0:12499]),
                list_to_str(trainComments[12500:24999])
            ]
            Vectorizer = TfidfVectorizer(max_features=3000,
                                         input='content',
                                         analyzer=stemmed_words,
                                         stop_words='english',
                                         encoding='utf-8',
                                         decode_error='ignore',
                                         lowercase=True,
                                         ngram_range=(1, 3))
            Vectorizer.fit_transform(trainCommentList)
            with open('models/vectorizer_imdb_tfidf_2doc_feat3000.pkl',
                      'wb') as fw:
                pickle.dump(Vectorizer, fw)
            fw.close()
    print('Vectorizing comments')
    return Vectorizer.transform(commentList), labels
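Note: the shuffle at the top of tf_idf_2doc keeps comments and labels aligned by permuting a shared index array with a seeded RandomState. A minimal sketch of just that pattern:

import numpy as np

comments = ["good", "bad", "great", "awful"]
labels = [1, 0, 1, 0]

rng = np.random.RandomState(seed=3)
indices = np.arange(len(comments))
rng.shuffle(indices)                 # in-place permutation of the index array

comments = np.array(comments)[indices].tolist()
labels = np.array(labels)[indices].tolist()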
Example #15
import preprocess as pre
import numpy as np
import matplotlib.pyplot as plt
import gc
import InitializeModel as im
import tf_idf as tfidf
import scipy.sparse as sp
import pickle
'''
print('Loading model')
model = keras.models.load_model('models/simple_model.h5')
'''

print('reading IMDB_data')
(train_data, train_labels) = pre.loadData('dataset/aclImdb/train/pos',
                                          'dataset/aclImdb/train/neg')
(test_data, test_labels) = pre.loadData('dataset/aclImdb/test/pos',
                                        'dataset/aclImdb/test/neg')
'''
optional:Analyzing Dataset
'''
print("Categories:", np.unique(train_labels))
print("Number of unique words:", len(np.unique(np.hstack(train_data))))

# Reverse word_index to map integer indices back to words
'''
# Simple Vectoring data
print('Vectoring data')
X_train = pre.vectorize_sequences(train_data)
X_test = pre.vectorize_sequences(test_data)
'''
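Note: the comment above mentions reversing word_index so that integer indices can be mapped back to words. A minimal sketch of that inversion; word_index here is a tiny stand-in dict rather than keras.datasets.imdb.get_word_index():

word_index = {"the": 1, "movie": 2, "great": 3}   # stand-in mapping
reverse_word_index = {idx: word for word, idx in word_index.items()}

encoded = [3, 2]
print(" ".join(reverse_word_index.get(i, "?") for i in encoded))   # "great movie"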
Example #16
from preprocess import cross_10folds
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import svm
from numpy import *
import time

# In[4]:

# Choose the dataset here
# "small" is the small dataset
# "farm-ads" is the full dataset
fileName = r"data\small"  # test with the small dataset
# fileName = r"data\farm-ads"  # test with the full dataset

totdata_x, totdata_y = loadData(fileName)  # load the data

startTime = time.time()

# Choose one of the algorithms below

handw_KNN(totdata_x, totdata_y)  # test the hand-written KNN

# handw_LDA1(totdata_x, totdata_y) # test the hand-written LDA1
# handw_LDA2(totdata_x, totdata_y) # test the hand-written LDA2
# sklearn_LDA(totdata_x, totdata_y) # test sklearn's LDA

# handw_SVM(totdata_x, totdata_y) # test the hand-written SVM
# sklearn_SVM(totdata_x, totdata_y) # test sklearn's SVM

endTime = time.time()
Example #17
"""
Created on Mon May 25 10:37:03 2020

@author: ASUS
"""

import keras
import imdb
import numpy as np
import preprocess as pre
import tf_idf as tfidf
import pickle
import matplotlib.pyplot as plt
print('Loading test data')
filename1 = 'dataset/aclimdb/test/neg'
filename = 'dataset/aclimdb/test/pos'
Comments, labels = pre.loadData(filename, filename1)


def analyzeModel(modelType='tfidf', feat=10000, Word=Comments, labels=labels):
    if modelType == 'BOW':
        print('Loading model')
        model = keras.models.load_model('models/BOW_default_40000.h5')
        model.summary()
        fhis = open('report/BOW_default_40000.txt', 'rb')
        training_detail(fhis)
        fhis.close()
        word_index = imdb.get_word_index()
        print('\nPreprocessing data')
        Words = pre.setOfWordsListToVecTor(word_index, Word)
        Words = pre.conf_data(x_train=Words, num_words=10000)
        Words = pre.vectorize_sequences(Words)
Example #18
        labels = []

        for idx, row in test.iterrows():

            if not row["content"]:
                continue

            feed_dict, label = get_feed_dict(row)

            predicted_logits = sess.run(logits, feed_dict=feed_dict)

            predicted = util.normalize_predictions(
                predicted_logits[0][0][1:-1])

            print(row["content"][:50], "\n", "Actual:", label, "\nPredicted:",
                  predicted, "\n")

            predictions.append(predicted)
            labels.append(label)

    util.print_summary(labels, predictions)


if is_train:
    train_model()
    cv_test = loadData("./data/split_data/validate.csv")
    test_model(cv_test)
else:
    test = loadData("./data/split_data/test.csv")
    test_model(test)
Example #19
def main():
    p = Path("./result")
    if not p.exists():
        os.makedirs(p)

    parser = argparse.ArgumentParser(
        description='Bioinf project. The arguments can be passed in any order.'
    )

    classes = parser.add_mutually_exclusive_group()
    classes.add_argument('-cl2',
                         help='in order to classify two cancer types.',
                         action='store_true')
    classes.add_argument(
        '-cl3',
        help='in order to classify two cancer types AND sane.',
        action='store_true')

    classifier = parser.add_mutually_exclusive_group()
    classifier.add_argument('-svm',
                            help='train a Support Vector Machine classifier',
                            action='store_true')
    classifier.add_argument('-knn',
                            help='train a K Nearest Neighbors classifier',
                            action='store_true')
    classifier.add_argument('-rforest',
                            help='train a Random Forest classifier',
                            action='store_true')
    classifier.add_argument('-kmeans',
                            help='train a Kmeans clustering',
                            action='store_true')
    classifier.add_argument(
        '-hierarc',
        help='train an Agglomerative Hierarchical clustering',
        action='store_true')

    inbalance = parser.add_mutually_exclusive_group()
    inbalance.add_argument('-over',
                           help='imbalance: Random Oversampling ',
                           action='store_true')
    inbalance.add_argument('-smote',
                           help='imbalance: SMOTE',
                           action='store_true')

    preprocess = parser.add_mutually_exclusive_group()
    preprocess.add_argument(
        '-ttest',
        help=
        'feature selection: t-test per chromosome and per CpG site - 2 classes',
        action='store_true')
    preprocess.add_argument(
        '-fisher',
        help='feature selection: fisher criterion - 3 classes',
        action='store_true')
    preprocess.add_argument('-anova',
                            help='feature selection: anova - 3 classes',
                            action='store_true')
    preprocess.add_argument(
        '-pca',
        help='dimensionality reduction: Principal Component Analisys',
        action='store_true')
    preprocess.add_argument(
        '-lda',
        help='dimensionality reduction: Linear Discriminant Analysis',
        action='store_true')
    preprocess.add_argument(
        '-sfs',
        help=
        'feature selection - wrapper: Step Forward Selection (nearly unfeasible)',
        action='store_true')
    preprocess.add_argument(
        '-ga',
        help='feature selection - wrapper: Genetic Algorithm',
        action='store_true')

    parser.add_argument(
        '-d',
        '--download',
        nargs=2,
        help='download Adenoma and Adenocarcinoma and Squamous Cell Neoplasm '
        + 'data from Genomic Data Common. It needs 2 parameters: ' +
        'first parameter is the destination folder; ' +
        'second parameter is the number of files to be downloaded for each class ',
        action='store')
    parser.add_argument(
        '-ds',
        '--downloadsane',
        nargs=2,
        help='download Sane data from Genomic Data Common. ' +
        'It needs 2 parameters: ' +
        'first parameter is the destination folder; ' +
        'second parameter is the number of files to be downloaded ',
        action='store')
    parser.add_argument(
        '-s',
        '--store',
        help=
        'concatenate files belonging to same cancer type and store them in a binary file',
        action='store')

    parser.add_argument(
        '--alpha',
        type=float,
        default=0.001,
        help='to set a different ALPHA: ttest parameter - default is 0.001',
        action='store')
    parser.add_argument(
        '--perc',
        type=float,
        default=0.95,
        help='to set PERC of variance explained by the features kept by PCA',
        action='store')
    parser.add_argument(
        '-rs',
        '--r_state',
        type=int,
        default=8,
        help='to set a user defined Random State - default is 8',
        action='store')
    parser.add_argument('--only_chrms_t',
                        default=False,
                        help='select only chrms for ttest',
                        action='store_true')
    parser.add_argument(
        '--crossval',
        help=
        'to do cross-validation OR, for the unsupervised methods, to plot the inertia curve',
        action='store_true')
    parser.add_argument('--plot_lc',
                        help='plot the learning curve',
                        action='store_true')
    parser.add_argument(
        '--remove_nan_cpgs',
        type=str2bool,
        default=True,
        help='IF True: removes features containing at least one NaN value. ' +
        'IF False: NaNs are replaced by the mean over the feature. ' +
        'The old file produced by feature reduction must be deleted when changing this option. '
        + 'Default is True.',
        action='store')

    args = parser.parse_args()

    if args.download:
        print("download ")
        dgdc.getDataEx(path=args.download[0], file_n=args.download[1])
    if args.downloadsane:
        print("download sane ")
        dgdc.getSaneDataEx(path=args.downloadsane[0],
                           file_n=args.downloadsane[1])
    if args.store:
        print("store")
        dgdc.storeDataIntoBinary(path=args.store)
        print("Data stored.")

    # validity checks
    if not args.cl2 and not args.cl3:
        print(
            "insert arg -cl2 for classifying 2 classes OR -cl3 for 3 classes")
        return

    # parameters and variables
    alpha = args.alpha  # alpha parameter for t-test
    perc = args.perc  # percentage of variance explained
    classes = 2 if args.cl2 else 3
    random_state = args.r_state
    no_nan = args.remove_nan_cpgs
    n_components = 100

    cl.setPlot_lc(args.plot_lc)

    cl.addToName("cl{}".format(classes))
    cl.addToName("rs{}".format(random_state))

    # load data
    print("Loading....")
    x, y, chrms_pos = pr.loadData(classes=classes)
    if no_nan:
        cl.addToName("no_nan")
        length = x.shape[1]
        x = pr.removeNanFeature(x)
        print("{} NaN features removed!".format(length - x.shape[1]))
    print("Loaded!")

    x_train, x_test, y_train, y_test = sk.model_selection.train_test_split(
        x, y, test_size=0.2, random_state=random_state)
    del x, y

    # preprocess
    if args.ttest:
        if classes != 2:
            print("wrong number of classes")
            return
        #print("Start ttest axis={}....".format(args.ttest))
        r, cpg_r = pr.compute_t_test(x_train,
                                     y_train,
                                     chrms_pos,
                                     alpha,
                                     random_state,
                                     axis=0,
                                     remove_nan=no_nan)
        print(r)
        cl.addToName("ttest{}".format(args.ttest))
        length = x_train.shape[1]
        x_train, x_test = pr.removeFeatures(x_train,
                                            x_test,
                                            cpg_r,
                                            chrms_pos,
                                            args.only_chrms_t,
                                            remove_nan=no_nan,
                                            y_train=y_train)
        print("Features removed: {}".format(length - x_train.shape[1]))
        print("End ttest!")

    if args.ga:
        print("genetic algorithm")
        cl.addToName("ga")
        # to work with fewer components
        # x_train = x_train[:, 1:100]
        result = g.GA_function(x_train, y_train, random_state, classes, 0.1)
        path = Path('./data/GA_{}_{}.npy'.format(random_state, classes))
        np.save(path, result)
        x_train = x_train[:, result]
        x_test = x_test[:, result]

    if args.pca:
        print("pca")
        cl.addToName("pca")
        x_train, x_test = pr.pca_function(x_train,
                                          x_test,
                                          y_train,
                                          y_test,
                                          classes,
                                          perc,
                                          random_state,
                                          name=cl.name,
                                          remove_nan=no_nan)

    if args.lda:
        #print("lda - {} components".format(args.lda))
        cl.addToName("lda")
        x_train, x_test = pr.lda_function(x_train, x_test, y_train, y_test,
                                          classes, args.lda, random_state,
                                          cl.name)

    if args.fisher:
        if classes != 2:
            print("wrong number of classes")
            return
        #cl.addToName("fisher{}".format(args.fisher))
        cl.addToName("fisher")
        print("fisher")
        x_train, x_test = pr.fisher_function(x_train,
                                             x_test,
                                             y_train,
                                             y_test,
                                             random_state,
                                             best=True,
                                             n=n_components,
                                             remove_nan=no_nan)
        # if best=True selects the n best features, if False the worst n features (for debugging)
    if args.sfs:
        if classes != 2:
            print("wrong number of classes")
            return
        print("Start sfs....")
        feat_col = pr.sfs(x_train, x_test, y_train, y_test, chrms_pos, alpha,
                          random_state)
        x_train = x_train[:, feat_col]
        x_test = x_test[:, feat_col]

    if args.anova:
        if classes != 3:
            print("wrong number of classes")
            return
        print("anova")
        cl.addToName("anova")
        x_train, x_test = pr.anova_function(x_train,
                                            x_test,
                                            y_train,
                                            y_test,
                                            alpha,
                                            random_state,
                                            remove_nan=no_nan)

    # imbalance
    if args.over:
        print("over ")
        x_train, y_train = pr.imbalance(x_train, y_train, "over", random_state)
        cl.addToName("over")

    if args.smote:
        print("smote ")
        x_train, y_train = pr.imbalance(x_train, y_train, "smote",
                                        random_state)
        cl.addToName("smote")

    cl.random_state(random_state)

    # classify
    if args.svm:
        print("svm ")
        cl.svm(x_train,
               x_test,
               y_train,
               y_test,
               classes=classes,
               crossval=args.crossval)

    if args.knn:
        print("knn ")
        cl.knn(x_train,
               x_test,
               y_train,
               y_test,
               classes=classes,
               crossval=args.crossval)

    if args.rforest:
        print("rforest")
        cl.random_forest(x_train,
                         x_test,
                         y_train,
                         y_test,
                         classes=classes,
                         crossval=args.crossval)

    if args.kmeans:
        print("kmeans")
        uc.kmeans(x_train,
                  x_test,
                  y_train,
                  y_test,
                  classes=classes,
                  random_state=random_state,
                  crossval=args.crossval)

    if args.hierarc:
        print("hierarchical clustering")
        uc.hierarchical(x_train,
                        x_test,
                        y_train,
                        y_test,
                        classes=classes,
                        random_state=random_state,
                        crossval=args.crossval)

    print("Log name: {}.log".format(cl.name))

    handlers = log.getLogger().handlers[:]
    for handler in handlers:
        handler.close()
        log.getLogger().removeHandler(handler)
    nf = p / cl.name
    if not nf.exists():
        os.makedirs(nf)
    npath = Path(nf / '{}.log'.format(cl.name))
    i = 1
    while npath.exists():
        npath = Path(nf / '{}_{}.log'.format(cl.name, i))
        i += 1
    os.rename('log.log', npath)
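Note: the --remove_nan_cpgs option above passes type=str2bool to argparse; that helper is not part of the standard library and is not shown in the example. A common hedged sketch of such a converter (an assumption about how the project might define it):

import argparse

def str2bool(value):
    # accept the usual command-line spellings of true/false
    if isinstance(value, bool):
        return value
    if value.lower() in ("yes", "true", "t", "y", "1"):
        return True
    if value.lower() in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected.")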