def nldas_correlate(self):
        with open("wind/nldas.p", "rb") as f:
            nldas_list = pickle.load(f)
        nldas = None
        for temp in nldas_list:
            if self.station_id == temp.station_id:
                nldas = temp
                break
        if nldas is None:
            raise ValueError("no NLDAS record found for station %s" % self.station_id)
        nldas_idx = np.where(np.logical_and(nldas.date >= np.min(self.date), nldas.date <= np.max(self.date)))
        nldas_wind_speed_anomaly = nldas.wind_speed_anomaly[nldas_idx]
        nldas_wind_dir_anomaly = nldas.wind_dir_anomaly[nldas_idx]

        fit_lr = lr()
        # mask1 = self.reject_outliers(self.wind_speed_anomaly)
        mask1 = ~np.isnan(self.wind_speed_anomaly)
        X1 = nldas_wind_speed_anomaly[mask1].reshape(-1, 1)
        fit_lr.fit(X1, self.wind_speed_anomaly[mask1])
        result1 = fit_lr.predict(X1)
        std = np.sqrt(np.sum((self.wind_speed_anomaly[mask1] - result1) ** 2) / (len(result1) - 2))
        print("Standard deviation of the wind speed estimate is", std)

        fit_lr = lr()
        # mask2 = self.reject_outliers(self.wind_dir_anomaly)
        mask2 = ~np.isnan(self.wind_dir_anomaly)
        X2 = nldas_wind_dir_anomaly[mask2].reshape(-1, 1)
        fit_lr.fit(X2, self.wind_dir_anomaly[mask2])
        result2 = fit_lr.predict(X2)
        std = np.sqrt(np.sum((self.wind_dir_anomaly[mask2] - result2) ** 2) / (len(result2) - 2))
        print("Standard deviation of the wind direction estimate is", std)

        fig = plt.figure()
        ax1 = fig.add_subplot(211)
        ax1.plot(nldas_wind_speed_anomaly[mask1], self.wind_speed_anomaly[mask1], '.b')
        ax1.plot(nldas_wind_speed_anomaly[mask1], result1, '-r')
        ax2 = fig.add_subplot(212)
        ax2.plot(nldas_wind_dir_anomaly[mask2], self.wind_dir_anomaly[mask2], '.g')
        ax2.plot(nldas_wind_dir_anomaly[mask2], result2, '-r')
        plt.show()
def main():
    plotdir = make_plotdir()
    train_X, test_X, train_y, test_y = load_data('cleveland', plotdir, print_out=False)
#   X_labels = list(train_X.columns)
    test_incoming(test_X, train_X)
    
    plot_hists(train_X, plotdir, label='Train')
    plot_hists(test_X, plotdir, label='Test')
    
    scale_cols = ['age','b_pressure','cholesterol','heart_rate','exer_depress','fluor_count']
    train_X, test_X = scale_data(train_X, test_X, scale_cols)
#   one_hot_cols = ['chest_pain','ecg_type','exer_slope','thal_defect']
    one_hot_cols = ['chest_pain']
    train_X, test_X = one_hot_encode(train_X, test_X, one_hot_cols)
#   print('one hot encode train_X head\n', train_X[:3])
    X_labels = list(train_X.columns)
    
    clf = lr()
    fit_predict(clf, train_X, train_y, test_X, test_y, label='logistic')
    cross_validate(clf, train_X, train_y['Y'], print_out=True)
    print_lr_coefs(clf, X_labels)

    clf = LinearSVC()   # data must first be scaled
    fit_predict(clf, train_X, train_y, test_X, test_y, label='svc')
    cross_validate(clf, train_X, train_y['Y'], print_out=True)
    
    explore_pca(train_X)
Example #3
def classify(train_data_filename, train_label_filename, dev_data_filename, dev_label_filename, 
             train_feature_dir, dev_feature_dir, feature_list, model_type='LR', 
             regularizer='l1', alpha=1.0, converg_tol=0.01, verbose=1, folds=2, n_jobs=-1, score_eval='f1'):
    
    if model_type == 'LR':
        model = lr(penalty=regularizer, C=alpha, tol=converg_tol)
    elif model_type == 'SVM':
        model = svm.LinearSVC(penalty=regularizer, C=alpha, tol=converg_tol)
    else:
        sys.exit('Model type ' + model_type + ' not supported')

    train_X, train_Y = load_features(train_data_filename, train_label_filename, train_feature_dir, 
                                     feature_list, verbose)
    # If we have separate dev data, we don't need cross-validation
    if folds < 1:
        # Try loading dev data using train vocabulary, and not saving dev feature extractions
        dev_X, dev_Y = load_features(dev_data_filename, dev_label_filename, dev_feature_dir,
                                     feature_list, verbose, vocab_source=train_feature_dir)

        dev_f1, dev_acc, train_f1, train_acc = compute_evaluation_metrics(train_X, train_Y, dev_X, dev_Y, model)
        print('train acc: ' + str(train_acc))
        print('dev acc: ' + str(dev_acc))
        neg_loss = dev_acc
    # Without separate dev data, fall back to cross-validation
    else:
        skf = StratifiedKFold(train_Y, folds, random_state=17)
        neg_loss = cross_val_score(model, train_X, train_Y, cv=skf, scoring=score_eval, n_jobs=n_jobs).mean()
        print('cross-validation ' + score_eval + ': ' + str(neg_loss))

    return {'loss': -neg_loss, 'status': STATUS_OK, 'model': model}
def trainModel():
    X = []
    with open('train.features') as fh:
        for x in fh:
            x = [int(x1) for x1 in x.strip().split(',')]
            X.append(x)
    Y = []
    with open('train.labels') as fh:
        for y in fh:
            Y.append(int(y.strip()))
    clf = lr()
    clf.fit(X, Y)
    # decision_function gives the raw linear score; its sigmoid is P(y=1)
    # (the original applied sigmoid to predict(), i.e. to a class label)
    print(sigmoid(clf.decision_function([X[45]])))
    print(clf.coef_)
    #np.save("lr_coeff",clf.coef_)
    print(clf.intercept_)
    #np.save("lr_intercept",clf.intercept_)
    score = np.dot(clf.coef_, X[45]) + clf.intercept_
    print(sigmoid(score))

    coeff = np.load("lr_coeff.npy")
    intercept = np.load("lr_intercept.npy")
    score = np.dot(coeff, X[45]) + intercept
    print(sigmoid(score))
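
# sigmoid() is used above but never defined in this snippet; a standard
# NumPy definition that makes the calls above work (an assumption):
import numpy as np

def sigmoid(z):
    """Logistic function: maps a raw score to (0, 1)."""
    return 1.0 / (1.0 + np.exp(-z))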
def predict_lr(X_train, X_test, y_train, y_test):
    clf = lr()
    print("lr started")
    clf.fit(X_train,y_train)
    y_pred=clf.predict(X_test)
    calc_accuracy("Logistic regression",y_test,y_pred)
    np.savetxt('submission_surf_lr.csv', np.c_[range(1,len(y_test)+1),y_pred,y_test], delimiter=',', header = 'ImageId,Label,TrueLabel', comments = '', fmt='%d')
    return clf
def main():
    if (not path.exists('training_images_pos0.npy')
            or not path.exists('training_images_neg0.npy')):
        convert_to_numpy()

    # (# samples, 256, 128, 3)
    train_pos = np.load('training_images_pos0.npy')
    train_neg = np.load('training_images_neg0.npy')
    print(train_pos.shape)
    print(train_neg.shape)

    # flatten images
    train_pos = preprocessing.minmax_scale(
        train_pos.reshape((train_pos.shape[0], 32768)))
    train_neg = preprocessing.minmax_scale(
        train_neg.reshape((train_neg.shape[0], 32768)))

    pos_label = np.ones(train_pos.shape[0])
    neg_label = np.zeros(train_neg.shape[0])

    trainX = np.concatenate((train_pos, train_neg))
    trainY = np.concatenate((pos_label, neg_label))

    idxs = np.random.permutation(trainX.shape[0])

    trainX = trainX[idxs]
    trainY = trainY[idxs]

    # train_pos = np.concatenate((train_pos, pos_label), axis=1)
    # train_neg = np.concatenate((train_neg, neg_label), axis=1)
    model = lr()
    clf = model.fit(trainX, trainY)
    pickle.dump(model, open('model.sav', 'wb'))

    test_pos = np.load('testing_images_pos0.npy')
    test_neg = np.load('testing_images_neg0.npy')
    print(test_pos.shape)
    print(test_neg.shape)

    test_pos = preprocessing.minmax_scale(
        test_pos.reshape((test_pos.shape[0], 32768)))
    test_neg = preprocessing.minmax_scale(
        test_neg.reshape((test_neg.shape[0], 32768)))

    pos_label_test = np.ones(test_pos.shape[0])
    neg_label_test = np.zeros(test_neg.shape[0])

    testX = np.concatenate((test_pos, test_neg))
    testY = np.concatenate((pos_label_test, neg_label_test))

    idxs = np.random.permutation(testX.shape[0])

    testX = testX[idxs]
    testY = testY[idxs]

    print(clf.score(testX, testY))
    print(roc_auc_score(testY, clf.predict_proba(testX)[:, 1]))
Example #7
def run_lr():
	clf = lr()
	print("lr started")
	clf.fit(x,y)
	#print clf.n_layers_
	pred=clf.predict(x_)
	#print(pred)
	np.savetxt('submission_lr.csv', np.c_[range(1,len(test)+1),pred,label_test], delimiter=',', header = 'ImageId,Label,TrueLabel', comments = '', fmt='%d')
	calc_accuracy("Logistic regression",label_test,pred)
Example #8
def main():
    dfcol, dups = readRawColumns()
    dftrain, dftrain_y, dftest, dftest_y = readRawData(dfcol)

    dftrain = renameColumns(dftrain)
    dftest = renameColumns(dftest)
    print("dftrain shape head", dftrain.shape, "\n", dftrain[:3])
    print("dftest shape head", dftest.shape, "\n", dftest[:3])
    print("dftrain stats\n", dftrain.describe())
    # groupby subject, activity(y) ?
    #    print("dftrain group by subject stats\n", dftrain.groupby('subject').describe())

    make_plotdir()
    explore_pca(dftrain, dftest, "all")  # 562 columns

    clf = LinearSVC()
    print("fitting LinearSVC")
    fit_predict(clf, dftrain, dftrain_y, dftest, dftest_y,
                'raw data, all cols')
    fit_predict(clf, dftrain.iloc[:, :30], dftrain_y, dftest.iloc[:, :30],
                dftest_y, 'raw data, 30 cols')
    # 30 columns not sorted by pca - only 70% accuracy

    X_train, X_test = quick_pca(dftrain, dftest, ncomps=100)

    print("fitting LinearSVC with PCA input")
    preds = []
    for j in [10, 20, 30, 50, 100]:
        p = fit_predict(clf, X_train[:, :j], dftrain_y, X_test[:, :j],
                        dftest_y, 'pca {:d} cols'.format(j))
        preds.append((j, p))
    plot_pca_fit(preds, "svc", "SVC")

    do_svc_gridsearch(X_train[:, :30], dftrain_y)

    print("Cross-validating LinearSVC with PCA input")
    get_cv_scores(clf, X_train[:, :30],
                  dftrain_y)  # randomized, not grouped by subject
    # 30 columns sorted by pca - 89% accuracy

    clf = lr()
    print("fitting Logistic Regression with PCA input")
    preds = []
    for j in [10, 20, 30, 50, 100]:
        p = fit_predict(clf, X_train[:, :j], dftrain_y, X_test[:, :j],
                        dftest_y, 'pca {:d} cols'.format(j))
        preds.append((j, p))
    plot_pca_fit(preds, "lr", "Logistic Regression")
    print("Cross-validating Logistic Regression with PCA input")
    get_cv_scores(clf, X_train[:, :30], dftrain_y)

    txt = '''\nConclusion: Using PCA as input to Logistic Regression or LinearSVC is effective, 
with 91% accuracy using only 30 components (5.4% of 562 total).  For six 
predicted classes, a classification report shows precision of 85% and greater 
(also confirmed by confusion matrix).  Cross-validation gives average fit 
scores of 89% +- 5%.'''
    print(txt)
Example #9
def regressMissingData(x, y, xnew, robust=True):
    '''
    linear or robust linear regression to fill in missing data.
    
    author: Nat
    
    input:
        x: independent variables with the corresponding dependent variable y known
        y: dependent variable (known)
        xnew: independent variables for which the dependent variable is MISSING
    output:
        ynew: regressed y values where y is missing
    '''
    import pandas as pd
    from sklearn.linear_model import LinearRegression as lr
    m = lr()
    m.fit(x, y)
    ynew_lr = pd.DataFrame(m.predict(xnew), columns=['WON_MONTH2'])

    from sklearn.linear_model import RANSACRegressor as ransac
    m_ransac = ransac(lr())
    m_ransac.fit(x, y)
    ynew_ransac = pd.DataFrame(m_ransac.predict(xnew), columns=['WON_MONTH2'])
    #    import numpy as np
    #    from matplotlib import pyplot as plt
    #    yhat_lr = pd.DataFrame(m.predict(x))
    #    yhat_ransac = pd.DataFrame(m_ransac.predict(x))
    #    inlier_mask = m_ransac.inlier_mask_
    #    outlier_mask = np.logical_not(inlier_mask)
    #    plt.scatter(x[inlier_mask], y[inlier_mask],
    #                color='green', marker='.',
    #                label='Inliers')
    #    plt.scatter(x[outlier_mask], y[outlier_mask],
    #                color='red', marker='.',
    #                label='Outliers')
    #    plt.plot(pd.concat([x,xnew]), pd.concat([yhat_ransac, ynew_ransac]), '-',
    #             label='RANSAC regressor')
    #    plt.plot(pd.concat([x,xnew]), pd.concat([yhat_lr, ynew_lr]), '-',
    #             label='linear regressor')
    #    plt.show()
    if robust:
        return ynew_ransac
    else:
        return ynew_lr
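
# A minimal usage sketch for regressMissingData; the data below is made
# up for illustration:
import pandas as pd

x = pd.DataFrame({'f1': [1.0, 2.0, 3.0, 4.0]})
y = pd.Series([2.1, 3.9, 6.2, 8.1])
xnew = pd.DataFrame({'f1': [5.0, 6.0]})
print(regressMissingData(x, y, xnew, robust=False))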
def train_edge_classification(X_train, Y_train):
    """
    Train the classifier on the train set.
    :param X_train: The edge features (edge norm, train set).
    :param Y_train: The edge labels: 0 for true, 1 for false (train set).
    :return: The fitted classifier
    """
    classif2 = TopKRanker(lr())
    classif2.fit(X_train, Y_train)
    return classif2
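
# TopKRanker is not defined in these snippets. In node/edge classification
# codebases (e.g. DeepWalk/GEM-style evaluation) it is commonly a
# OneVsRestClassifier that keeps the k most probable labels per sample;
# a sketch of that common definition (an assumption, not the authors' code):
import numpy as np
from sklearn.multiclass import OneVsRestClassifier

class TopKRanker(OneVsRestClassifier):
    def predict(self, X, top_k_list):
        probs = np.asarray(super(TopKRanker, self).predict_proba(X))
        all_labels = np.zeros(probs.shape)
        for i, k in enumerate(top_k_list):
            # indices of the k highest-probability labels for sample i
            top = probs[i].argsort()[-int(k):]
            all_labels[i, top] = 1
        return all_labels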
Example #11
    def get_model(self, args):
        if args['model']['model'] == 'LR':
            model = lr(penalty=args['model']['regularizer_lr'], C=args['model']['C_lr'], n_jobs=self.cjobs)
        elif args['model']['model'] == 'SVM':
            if args['model']['regularizer_svm'] == 'l1':
                # hinge loss is not available with an l1 penalty, and l1 requires
                # dual=False; note LinearSVC has no n_jobs parameter
                model = svm.LinearSVC(C=args['model']['C_svm'], penalty=args['model']['regularizer_svm'], dual=False)
            else:
                model = svm.LinearSVC(C=args['model']['C_svm'], penalty=args['model']['regularizer_svm'])
        return model
Example #12
def get_classifier():
    x = query_features[query_features.columns[2:23]]
    y = query_features[query_features.columns[-1]]
    x_train, x_test, y_train, y_test = sk_model.train_test_split(x,
                                                                 y,
                                                                 test_size=0.2)

    clf = lr(max_iter=1000).fit(x_train, y_train)

    return clf
Example #13
def fit_and_test():
    data, target = pd.read_train()

    train_x, val_x, train_y, val_y = t(data, target, test_size=0.1)

    m = lr()
    m.fit(train_x, train_y)

    print("Score on validation")
    print(m.score(val_x, val_y))
def linear_regression(x, y):
    lineerreg = lr()  # use sklearn's linear regression model, named 'lineerreg'
    lineerreg.fit(x, y)  # the model learns from the sample data via fit()
    lineerreg.predict(x)  # prediction function
    m = lineerreg.coef_  # slope
    b = lineerreg.intercept_  # intercept
    plt.scatter(x, y)  # show the points with matplotlib
    plt.plot(x, lineerreg.predict(x), c="red")  # draw the fitted line
    plt.show()  # display the plot
Example #15
def train_edge_classification(X_train, Y_train):
    """
    Predictions of nodes' labels.
    :param X: The features' graph- norm
    :param Y: The edges labels- 0 for true, 1 for false
    :param test_ratio: To determine how to split the data into train and test
    :return: Scores- F1-macro, F1-micro accuracy and auc
    """
    classif2 = TopKRanker(lr())
    classif2.fit(X_train, Y_train)
    return classif2
Example #16
    def construct_all_models(self, hyperTune):
        if hyperTune:
            # 3 models: SVM, LR and KNN
            self.models = {'SVM': [SVC(kernel='linear', probability=True), dict(C=np.arange(0.01, 2.01, 0.2))],
                           'LogisticRegression': [lr(), dict(C=np.arange(0.1, 3, 0.1))],
                           'KNN': [KNeighborsClassifier(), dict(n_neighbors=range(1, 100))]}
            for name, candidate_hyperParam in self.models.items():
                # update each classifier after training and tuning
                self.models[name] = self.train_with_hyperParamTuning(
                    candidate_hyperParam[0], name, candidate_hyperParam[1])
            print('\nTraining process finished\n\n\n')
def eval_node_classification(X_train, Y_train, X_test, Y_test):

    # y_train = (n_sample, n_classes)
    top_k_list = list(Y_test.sum(axis=1))
    classif2 = TopKRanker(lr(solver='liblinear'))
    classif2.fit(X_train, Y_train)
    prediction = classif2.predict(X_test, top_k_list)
    micro = f1_score(Y_test, prediction, average='micro')
    macro = f1_score(Y_test, prediction, average='macro')

    return micro, macro
Example #18
def getR2(y_actual, factor, isRet=False):
    n = len(y_actual)
    y = np.array(y_actual).reshape((n, 1))
    x = np.array(factor).reshape((n, 1))
    if isRet:
        n = n - 1
        y = np.log(y[1:] / y[:-1])
        x = x[:-1]
    reg = lr()
    reg.fit(x, y)
    return r2_score(y, reg.predict(x))
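
# Usage sketch for getR2 on made-up series; with isRet=True, y is treated
# as a price series and log-returns are regressed on the lagged factor
# (assumes lr and r2_score are imported as in the snippet above):
prices = [100, 102, 101, 105, 107]
factor = [0.5, 0.6, 0.4, 0.7, 0.8]
print(getR2(prices, factor, isRet=True))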
def explore_params(loans_X, loans_y, plotdir, app, appf):
    '''Explore fit parameters on training data,
       grid search of fit scores, boxplot gridsearch results.'''
    clf = lr()
    param_grid = [{'C': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 10.0]}]
    gs = GridSearchCV(estimator=clf, param_grid=param_grid, cv=10, \
      verbose=1, n_jobs=-1, scoring='accuracy')
    gs.fit(loans_X, loans_y)  # fit all grid parameters
    print("gs grid scores\n", gs.grid_scores_)
    print("gs best score %.5f %s\n%s" % \
      (gs.best_score_, gs.best_params_, gs.best_estimator_))
    gridscore_boxplot(gs.grid_scores_, plotdir, app, appf, "C", "solver='liblinear'")
Example #20
def cal_linear_reg_r(y, x=None):
    '''
    Slope (and intercept) of the points in y via univariate linear regression.
    y and x may be list, pd.Series, or np.array.
    '''
    if isnull(x):
        X = pd.DataFrame({'X': range(0, len(y))})
    else:
        X = pd.DataFrame({'X': x})
    y = pd.Series(y)
    mdl = lr().fit(X, y)
    return mdl.coef_[0], mdl.intercept_
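
# Usage sketch: slope and intercept of a short upward trend. Passing x
# explicitly sidesteps the project-specific isnull() helper:
coef, intercept = cal_linear_reg_r([1.0, 2.1, 2.9, 4.2], x=[0, 1, 2, 3])
print(coef, intercept)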
Example #21
def myLr(x, y, xnew):
    '''
    calls sklearn.linear_model.LinearRegression
    wrapper author: Nat
    '''
    from sklearn.linear_model import LinearRegression as lr
    import numpy as np
    model = lr()
    model.fit(x, y)
    ynew = model.predict(xnew)
    ynew = np.where(ynew < 0, 0, ynew)
    return ynew
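
# Usage sketch: a downward trend extrapolates below zero at x=4 and x=5,
# and the wrapper clips the negative predictions to 0:
import numpy as np
xs = np.array([[1.0], [2.0], [3.0]])
ys = np.array([0.5, 0.2, 0.1])
print(myLr(xs, ys, np.array([[4.0], [5.0]])))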
Example #22
def _plotDegreedist(degree_df, plot_model=False, path=None):
    """
    Args:
        degree_df (pandas.DataFrame): data_frame that include degree.
            degree info shold be stored in the column, "degree"

        plot_model (bool): Whether to plot linear approximation line.

        path (str): Folder path to save plots. If the folde does not exist in the path, the function create the folder.
            If None, plots will not be saved. Default is None.
    """

    from sklearn.linear_model import LinearRegression as lr
    df = degree_df.copy()

    dist = df.degree.value_counts() / df.degree.value_counts().sum()
    dist.index = dist.index.astype(int)  # np.int was removed in NumPy 1.24

    fig, ax = plt.subplots(1, 2)

    ax[0].scatter(dist.index.values, dist.values, c="black")
    ax[0].set_title("degree distribution")
    ax[0].set_xlabel("k")
    ax[0].set_ylabel("P(k)")

    #plt.yscale('log')
    #plt.xscale('log')

    x = np.log(dist.index.values).reshape([-1, 1])
    y = np.log(dist.values).reshape([-1, 1])
    if plot_model:
        model = lr()
        model.fit(x, y)
        x_ = np.array([-1, 5]).reshape([-1, 1])
        y_ = model.predict(x_)

        ax[1].set_title(
            f"degree distribution (log scale)\nslope: {model.coef_[0][0] :.4g}, r2: {model.score(x,y) :.4g}"
        )
        ax[1].plot(x_.flatten(), y_.flatten(), c="black", alpha=0.5)
    else:
        ax[1].set_title(f"degree distribution (log scale)")

    ax[1].scatter(x.flatten(), y.flatten(), c="black")
    ax[1].set_ylim([y.min() - 0.2, y.max() + 0.2])
    ax[1].set_xlim([-0.2, x.max() + 0.2])
    ax[1].set_xlabel("log k")
    ax[1].set_ylabel("log P(k)")

    if path is not None:
        fig.savefig(path, transparent=True)
    plt.show()
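
# Usage sketch, building a degree_df from a scale-free random graph
# (assumes networkx is installed; any graph's degree sequence works):
import networkx as nx
import pandas as pd

g = nx.barabasi_albert_graph(1000, 2, seed=0)
degree_df = pd.DataFrame({"degree": [d for _, d in g.degree()]})
_plotDegreedist(degree_df, plot_model=True)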
Example #23
    def regression(self, metric="root_mean_squared_error", folds=10, alphas=[], graph=False):
        size = 1.3 * self.report_width // 10

        models = {}
        models["Linear regressor"]                  = lr()
        models["Lasso regressor"]                   = lassor()
        models["Lasso CV regressor"]                = lassocvr()
        models["Ridge regressor"]                   = rr(alpha=0, normalize=True)
        models["Ridge CV regressor"]                = rcvr(alphas = alphas)
        models["K nearest neighbors regressor K2u"] = knnr(n_neighbors=2, weights='uniform')
        models["K nearest neighbors regressor K2d"] = knnr(n_neighbors=2, weights='distance')
        models["K nearest neighbors regressor K5"]  = knnr(n_neighbors=5)
        models["K nearest neighbors regressor K10"] = knnr(n_neighbors=10)
        models["SGD regressor"]                     = sgdr(max_iter=10000, warm_start=True)
        models["Decision tree regressor"]           = dtr()
        models["Decision tree regressor D3"]        = dtr(max_depth=3)
        models["Random forest regressor"]           = rfr()
        models["Ada boost regressor"]               = abr()
        models["Gradient boost regressor"]          = gbr()
        models["Support vector regressor"]          = svr()
        self.models = models

        print('\n')
        print(self.report_width * '*', '\n*')
        print('* REGRESSION RESULTS - BEFORE PARAMETERS BOOSTING \n*')
        #kf = StratifiedKFold(n_splits=folds, shuffle=True)
        kf = KFold(n_splits=folds)
        results = []
        names = []
        for model_name in models:
            cv_scores = -1 * cross_val_score(models[model_name], self.Xt_train, self.yt_train.values.ravel(), cv=kf, scoring=metric)  
            results.append(cv_scores)
            names.append(model_name)
        print(self.report_width * '*', '')
        report = pd.DataFrame({'Regressor': names, 'Score': results})
        report['Score (avg)'] = report.Score.apply(lambda x: x.mean())
        report['Score (std)'] = report.Score.apply(lambda x: x.std())
        report['Score (VC)'] = 100 * report['Score (std)'] / report['Score (avg)']
        report.sort_values(by='Score (avg)', inplace=True)
        report.drop('Score', axis=1, inplace=True)
        display(report)
        print('\n')
        if graph:
            fig, ax = plt.subplots(figsize=(size, 0.5 * size))
            plt.title('Regressor Comparison')
            #ax = fig.add_subplot(111)
            plt.boxplot(results)
            ax.set_xticklabels(names)
            plt.xticks(rotation=45)
            plt.subplots_adjust(hspace=0.0)
            plt.show()             
        return None
Example #24
    def test(self, model_name, graph=False):
        size = 1.3 * self.report_width // 10
        model = self.models[model_name]
        # fit using the train subset
        X, y = self.Xt_train, self.yt_train
        model.fit(X, y)

        # evaluate using the test subset
        X, y = self.Xt_test, self.yt_test
        
        if self.strategy == 'regression':
            y_hat = model.predict(X)
            # show residual analysis
            self.residual(y, y_hat, model_name, graph)
            if graph:
                # show the correlation between y and y_hat
                fig, ax = plt.subplots(figsize=(size, 0.5 * size))
                plt.title('Model Overall Performance')
                plt.scatter(y, y_hat, color='g')
                viewer = lr()
                plt.plot(y, viewer.fit(y, y_hat).predict(y), color='k')
                plt.xlabel('Observed')
                plt.ylabel('Predicted')
                plt.show()

        else:
            y_pred = model.predict(X)
            sample_size = len(y_pred)
            print('\n')
            print(self.report_width * '*', '\n*')
            print('* MODEL PERFORMANCE \n*')
            print('* MODEL NAME: ', model_name)
            print('* TEST SAMPLE SIZE: ', sample_size)
            print('* ACCURACY: ', round(accuracy_score(y, y_pred)*100, 1), '%')
            print('* ')
            print(self.report_width * '*', '\n')
            report = classification_report(y, y_pred, output_dict=True)
            if graph:
                fig, ax = plt.subplots(figsize=(size, 0.3 * size))
                plt.title('Confusion Matrix')
                sns.heatmap(confusion_matrix(y, y_pred), annot=True, cmap='YlGn', fmt='d',)
                plt.xlabel('Predicted')
                plt.ylabel('True Class')
                plt.show()
                fig, ax = plt.subplots(figsize=(size, 0.5 * size))
                plt.title('Classification Report')
                sns.heatmap(pd.DataFrame(report).iloc[0:3].T, annot=True, vmin=0, vmax=1, cmap='BrBG', fmt='.2g')
                plt.xlabel('Score')
                plt.show()
            else:
                display(pd.DataFrame(report).T)
        return None
Example #25
def log_reg(x, y, t, q):

    # Logistic Regression predictor initialization

    pred = lr(solver="saga", max_iter=200, multi_class="multinomial", tol=0.1)
    start = timer()  # Start timer
    pred.fit(x, y)  # Predictor training
    pred.result = pred.score(t, q)  # Predictor test
    pred.error = 1 - pred.result  # error probability
    pred.end = timer() - start  # End timer
    q = pred.predict(t)

    return q, pred
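
# Usage sketch on a built-in dataset (assumes lr and timer are imported
# as in the snippet above, e.g. timeit's default_timer as timer):
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

X_d, Y_d = load_digits(return_X_y=True)
x, t, y, q = train_test_split(X_d, Y_d, test_size=0.3, random_state=0)
labels, model = log_reg(x, y, t, q)
print(model.result, model.error, model.end)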
Example #26
def gs(x, y="prob"):
    #70/30 train test split
    x_train, x_test, y_train, y_test = tts(x, x.label, test_size=0.3)
    data = x_train.iloc[:, :32]
    test_data = x_test.iloc[:, :32]

    #train model
    classifier = lr(random_state=0).fit(data, y_train)
    if y == "prob":
        pred = classifier.predict_proba(test_data)
    else:
        pred = classifier.predict(test_data)
    return pred, y_test.values
def model_stop(df):
    #df = pd.get_dummies(df,columns=['day'])
    #features = ['day_'+str(i) for i in range(0,7)]
    #for f in features:
    #    if f not in df.columns:
    #        df[f] = 0
    df = df[df['traveltime'] < df['traveltime'].quantile(0.95)]
    features = ['rain','temp','vappr','hour','hour2','hour3','hour4','day','day2','day3','day4']
    for i in range(2,5):
        df['hour'+str(i)] = df['hour'] ** i
        df['day'+str(i)] = df['day'] ** i
    model = lr(fit_intercept=True).fit(df[features],df['traveltime'])
    return model,df,features 
Example #28
def evaluateNodeClassification(X, Y, test_ratio):
    X_train, X_test, Y_train, Y_test = sk_ms.train_test_split(
        X, Y, test_size=test_ratio)
    try:
        top_k_list = list(Y_test.toarray().sum(axis=1))
    except AttributeError:  # Y_test is already a dense array
        top_k_list = list(Y_test.sum(axis=1))
    classif2 = TopKRanker(lr())
    classif2.fit(X_train, Y_train)
    prediction = classif2.predict(X_test, top_k_list)
    micro = f1_score(Y_test, prediction, average='micro')
    macro = f1_score(Y_test, prediction, average='macro')
    return (micro, macro)
Example #29
    def create_model(self, model_type, parameters):

        if model_type == 'lr':
            model = lr()
        elif model_type == 'svm':
            model = svm()
        elif model_type == 'mlp':
            model = mlp()
        elif model_type == 'rf':
            model = rf()
        elif model_type == 'xgb':
            model = xgb()
        else:
            raise ValueError('Unknown model_type: ' + model_type)
        return model.set_params(**parameters)
def version1():  # Logistic Regression Model
    train_test_split(df["reviewText"], df["Positivity"], 100)

    # the vectorizer must be fitted once and reused; calling transform on a
    # fresh cv() would raise NotFittedError
    vectorizer = cv().fit(features_train)
    features_train_vectorized = vectorizer.transform(features_train)
    features_test_vectorized = vectorizer.transform(features_test)

    model = lr().fit(features_train_vectorized,
                     labels_train)  # Model creation for logistic regression
    predictions = model.predict(features_test_vectorized)

    ras(labels_test, predictions)  # Generating prediction score
    cm(labels_test, predictions)

    return model
Example #31
    def __init__(self, model_type=None, column_names=None, metric='f1', **kwargs):
        self.model_type = model_type
        self.column_names = column_names
        self.params = kwargs
        self.trained = None
        self.metric = metric
        if model_type == 'LR':
            if self.params.get('regularization', None) is None:
                self.params['regularization'] = 'l1'
            if self.params.get('alpha', None) is None:
                self.params['alpha'] = 1.0
            self.model = lr(penalty=self.params['regularization'], C=self.params['alpha'])
        elif model_type == 'SVM' or model_type == 'SVMNB':
            if self.params.get('kernel', None) is None:
                self.params['kernel'] = 'rbf'

            if model_type == 'SVM':
                if self.params.get('alpha', None) is None:
                    self.params['alpha'] = 0.1
            else:  # elif model_type == SVMNB:
                self.params['kernel'] = 'linear'
                if self.params.get('alpha', None) is None:
                    self.params['alpha'] = 1
                if self.params.get('beta', None) is None:
                    self.params['beta'] = 0.25

            if self.params['kernel'] == 'linear':
                # override regularization parameter to avoid a conflict
                self.params['regularization'] = 'l2'
                self.model = svm.LinearSVC(C=self.params['alpha'])
            else:  # elif self.params['kernel'] != 'linear':
                if self.params.get('degree', None) is None:
                    self.params['degree'] = 3
                if self.params.get('gamma', None) is None:
                    self.params['gamma'] = 0.0
                if self.params.get('coef0', None) is None:
                    self.params['coef0'] = 0.0
                self.model = svm.SVC(C=self.params['alpha'], kernel=self.params['kernel'], degree=self.params['degree'],
                                     gamma=self.params['gamma'], coef0=self.params['coef0'])
        elif model_type == 'MNB':
            if 'alpha' not in self.params:
                self.params['alpha'] = 1.0
            self.model = MultinomialNB(alpha=self.params['alpha'], fit_prior=True)
        elif model_type == 'myMNB':
            if 'alpha' not in self.params:
                self.params['alpha'] = 1.0
            self.model = None
        else:
            self.model_type = 'default'
            self.model = None
    def linear_model(self, nldas_wind, type='speed'):
        X = nldas_wind
        if type == 'speed':
            y = self.wind_speed_anomaly
        else:
            y = self.wind_dir_anomaly
        mask = ~np.isnan(y)
        X = X[mask].reshape(-1, 1)
        y = y[mask]
        lr_model = lr()
        lr_model.fit(X, y)
        est_y = lr_model.predict(X)
        std = np.sqrt(np.sum((est_y - y) ** 2) / (len(y) - 2))
        return lr_model, std
Example #33
def predict_lr(X, y, X_train, X_test, y_train, y_test):
    clf = lr(solver='lbfgs', multi_class='ovr')
    print("======Logistic Regression======")
    clf.fit(X_train, y_train)
    pickle.dump(clf, open('logreg_trained_new.sav', 'wb'))
    y_pred = clf.predict(X_test)
    calc_accuracy("Logistic regression", y_test, y_pred)
    np.savetxt('submission_surf_lr.csv',
               np.c_[range(1,
                           len(y_test) + 1), y_pred, y_test],
               delimiter=',',
               header='ImageId,Label,TrueLabel',
               comments='',
               fmt='%d')
Example #34
def LR_from_cfg(params):
    X_ = X[:]
    clf = lr(**params)
    if params['penalty'] == 'l2':
        if params['dual'] is True:
            if params['solver'] == 'liblinear':
                if params['multi_class'] == 'multinomial':
                    return 1 - 0.001
                else:
                    return 1 - cross_val_score(clf, X_, y, cv=5).mean()
            else:
                return 1 - 0.001
        else:
            if params['solver'] == 'liblinear' and params[
                    'multi_class'] == 'multinomial':
                return 1 - 0.001
            else:
                return 1 - cross_val_score(clf, X_, y, cv=5).mean()
    elif params['penalty'] == 'l1':
        if params['dual'] is True:
            return 1 - 0.001
        else:
            if params['solver'] == 'liblinear':
                if params['multi_class'] == 'multinomial':
                    return 1 - 0.001
                else:
                    return 1 - cross_val_score(clf, X_, y, cv=5).mean()
            elif params['solver'] == 'saga':
                return 1 - cross_val_score(clf, X_, y, cv=5).mean()
            else:
                return 1 - 0.001
    elif params['penalty'] == 'elasticnet':
        if params['dual'] is True:
            return 1 - 0.001
        else:
            if params['solver'] == 'saga':
                return 1 - cross_val_score(clf, X_, y, cv=5).mean()
            else:
                return 1 - 0.001
    elif params['penalty'] == 'none':
        if params['dual'] is True:
            return 1 - 0.001
        else:
            if params['solver'] == 'liblinear':
                return 1 - 0.001
            else:
                return 1 - cross_val_score(clf, X_, y, cv=5).mean()
    else:
        return 1 - cross_val_score(clf, X_, y, cv=5).mean()
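
# The nested branches above hard-code sklearn's supported penalty/solver
# combinations. A compact alternative (a sketch, assuming X and y are in
# scope as in LR_from_cfg) lets sklearn itself reject invalid combinations:
def lr_cv_error(params):
    clf = lr(**params)
    try:
        return 1 - cross_val_score(clf, X, y, cv=5, error_score='raise').mean()
    except ValueError:
        # unsupported penalty/solver/dual combination
        return 1 - 0.001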
def evaluateNodeClassification(X_train, X_test, Y_train, Y_test):
    try:
        top_k_list = list(Y_test.toarray().sum(axis=1))
    except AttributeError:  # Y_test is already dense
        top_k_list = list(Y_test.sum(axis=1))
    classif2 = TopKRanker(lr())
    try:
        classif2.fit(X_train, Y_train)
        prediction = classif2.predict(X_test, top_k_list)
    except Exception:
        print('Could not fit node classification model')
        prediction = np.zeros(Y_test.shape)
    micro = f1_score(Y_test, prediction, average='micro')
    macro = f1_score(Y_test, prediction, average='macro')
    return prediction
def log_reg(x, y, t, q):
    """ This function is an amalgamation of different minute tasks that 
    I just gatherd into a singal call function to ease work."""
    
    pred = lr(solver = "saga", tol = 0.001, max_iter = 600, n_jobs = -1, fit_intercept = True)
    pred.fit(x,y)                              # Predictor training
    g = pred.score(t,q)           # Predictor test
    pred = pred.predict(t)                     # Predicting correct labels
    
    # Printing some information for user
    print("------------------------------------------")
    print("accuracy rate is %{}" .format(round(g * 100 , 3)))
    print("Error rate is %{}" .format(round((1 - g) * 100 , 3)))
    
    return pred
Example #37
    def train_model(self):
        '''
        Trains simple logistic regression using the class labels.
        No regularization. The Metonymi features do all of the heavy lifting!
        '''
        print('TRAINING MODEL...')
        labels = self.frame[:, -1]
        frame = scale(self.frame[:, :-1])
        self.train, self.test, self.train_labels, self.test_labels = \
        tts(frame, labels, random_state=26, test_size=.15)
        self.model = lr(max_iter=200)
        self.model.fit(self.train, self.train_labels)
        print('DONE!\n')

        return True
def _train_SKLR_Classifier(extractedBases, lbls, params = {}):
    """ NLTK ME Training Wrapper"""

    Xtrn = makeSKFormat(extractedBases)
    ytrn = lbls

    C = params.get('C', 10)
    penalty = params.get('penalty', 'l1')
    class_weight = params.get('class_weight', 'balanced')  # 'auto' was renamed 'balanced' in sklearn
    tol = params.get('tol', 1e-6)

    classifier = lr(C=C, penalty=penalty,
                    class_weight=class_weight, tol=tol)

    classifier.fit(Xtrn,ytrn)

    return classifier, list(classifier.classes_)
Example #39
def classify(data_filename, label_filename, feature_dir, list_of_features, model_type='LR',
             regularizer='l1', alpha=1.0, verbose=1):

    labels = pd.read_csv(label_filename, header=0, index_col=0)

    if not os.path.exists(feature_dir):
        os.makedirs(feature_dir)

    # for each feature in feature_list:
    items = None
    feature_matrices = []
    column_names = []
    print "Loading features"
    for feature in list_of_features:
        feature_description = feature
        rows, columns, counts = feature_loader.load_feature(feature_description, feature_dir, data_filename, verbose=1)
        if items is None:
            items = rows
        else:
            assert items == rows
        if verbose > 0:
            print "Loaded", feature, "with shape", counts.shape
        feature_matrices.append(counts)
        column_names.append(columns)

    # concatenate all features together
    X = sparse.csr_matrix(sparse.hstack(feature_matrices))
    column_names = np.concatenate(column_names)
    if verbose > 0:
        print "Full feature martix size:", X.shape

    #return items, column_names, X
    if model_type == 'LR':
        model = lr(penalty=regularizer, C=alpha)
    elif model_type == 'SVM':
        model = svm.LinearSVC(C=alpha, penalty=regularizer)
    else:
        sys.exit('Model type ' + model_type + ' not supported')

    y = labels.values.ravel()  # .as_matrix() is deprecated in pandas
    model.fit(X, y)
    pred = model.predict(X)
    f1 = f1_score(y_true=y, y_pred=pred)
    print(f1)
    return {'loss': -f1, 'status': STATUS_OK}
def classify_one_model(feature_list, model_type='LR', regularizer='l1', alpha=1.0, converg_tol=0.01, verbose=1, folds=2, n_jobs=-1, score_eval='f1'):

    if model_type == 'LR':
        model = lr(penalty=regularizer, C=alpha, tol=converg_tol)
    elif model_type == 'SVM':
        model = svm.LinearSVC(penalty=regularizer, C=alpha, tol=converg_tol)
    else:
        sys.exit('Model type ' + model_type + ' not supported')

    train_X, train_Y = load_features(train_data_filename, train_label_filename, train_feature_dir, 
                                     feature_list, verbose)
    # Try loading dev data using train vocabulary, and not saving dev feature extractions
    dev_X, dev_Y = load_features(dev_data_filename, dev_label_filename, dev_feature_dir,
                                     feature_list, verbose, vocab_source=train_feature_dir)

    model.fit(train_X, train_Y)
    dev_pred_prob_Y = model.predict_proba(dev_X)
    
    return dev_pred_prob_Y, model, dev_Y
Example #41
data.drop('F19', axis=1, inplace=True)
selector = selector.fit(data, y)

# print which features have been selected
print("ATTRIBUTES WHICH HAVE BEEN SELECTED\n")
for i in range(len(data.columns)):
    if selector.support_[i]:
        print(data.columns[i])

df1=data[['FAC_NAME','F1','F2','F3','F4','F5','F6','F7','F8','F9','F10','F11','F12','F13','F14','F15','F16','F17','F18','F19','F20','F21','F22']]
clf = SVC()  # ???
scores = cv1(clf, df1, y, cv=10)
print("\nSVC Cross validated Scores:\n")
print(scores)

clf1 = lr()
scores1 = cv1(clf1, df1, y, cv=10)
print("\nLogistic Regression Cross validated Scores:\n")
print(scores1)

model = GaussianNB()
scores2 = cv1(model, df1, y, cv=10)
print("\nNaive Bayes Cross validated Scores:\n")
print(scores2)

model = DecisionTreeClassifier()
scores3 = cv1(model, df1, y, cv=10)
print("\nDecision Trees Cross validated Scores:\n")
print(scores3)

clf = LinearSVC()
def main():
    "main program"
    app = get_app_title()
    appf = get_app_file()
    plotdir = make_plotdir()
    
    loans_df, loans_y, test_df, test_y, numeric_vars = load_data()
    indep_vars = numeric_vars
    
    # skip scaling for now, score 0.71
    loans_X = loans_df
    test_X = test_df
    clf = lr()
    do_fit(clf, loans_X, loans_y, print_out=True)
    pred_y = do_predict(clf, test_X, test_y, print_out=True)  
    plot_predict(plotdir, app, appf, "rawvar", indep_vars, test_df, test_y, pred_y)

    # add scaling, score 0.90    
    loans_X, my_scaler = scale_train_data(loans_df, print_out=True)
    test_X = scale_test_data(my_scaler, test_df)
    
    clf = lr()
    do_fit(clf, loans_X, loans_y, print_out=True)
    pred_y = do_predict(clf, test_X, test_y, print_out=True)  
    plot_predict(plotdir, app, appf, "allvar", indep_vars, test_df, test_y, pred_y)
    print("columns:", indep_vars)
#   print_coefs(clf)
    X_labels = list(loans_df.columns)
#   print_lr_coefs(clf, X_labels)
    plist = print_lr_coefs(clf, indep_vars)

# find score using only top6
    top6 = [p[0] for p in plist[:6]]
    print("top6:", top6)
    loans_X = loans_df[top6]
    test_X = test_df[top6]
    loans_X, my_scaler = scale_train_data(loans_X, print_out=True)
    test_X = scale_test_data(my_scaler, test_X)
    clf = lr()
    do_fit(clf, loans_X, loans_y, print_out=True)
    pred_y = do_predict(clf, test_X, test_y, print_out=True)
    print_lr_coefs(clf, top6)
    plot_predict(plotdir, app, appf, "top6", top6, test_df, test_y, pred_y)

    do_roc(clf, test_X, test_y, "top6", top6, app, appf, plotdir)
    
#    arr = clf.decision_function(loans_df)
#    print("decision function:", arr.shape, arr)  # shape (1873,)
##    clf.decision_function(loans_df)
#    print_coefs(clf)
# traditional coefs in "frequentist" style?
#    proba = clf.predict_proba(loans_X)
#    print("proba", proba.shape, proba)
    
    explore_params(loans_X, loans_y, plotdir, app, appf)
    
    # run optimization routine
    clf = lr()
#    init_list = [indep_vars[0], indep_vars[1]]
#    random_opt(clf, indep_vars, init_list, loans_df, loans_y, print_out=True)
    opt_score, opt_list = run_opt(clf, numeric_vars, loans_df, loans_y, app, appf, plotdir, rescale=True)
    # accuracy 73% +- 3% with no scaling  (90% with scaling)
#    print_coefs(clf)

    # redo exploration with optimized columns
    loans_X = loans_df[opt_list]
    test_X = test_df[opt_list]
    loans_X, my_scaler = scale_train_data(loans_X, print_out=True)
    test_X = scale_test_data(my_scaler, test_X)
#    print("loans_X head\n", loans_X[:3])
    explore_params(loans_X, loans_y, plotdir, app, appf+"opt_")
    # accuracy 73% due to no scaling
    
    clf = lr()
    cross_validate(clf, loans_X, loans_y, print_out=True)
    
    clf = lr()
    do_fit(clf, loans_X, loans_y, print_out=True)
    pred_y = do_predict(clf, test_X, test_y, print_out=True)
    print("opt_list columns:", opt_list)
#   print_coefs(clf)
#   print_lr_coefs(clf, X_labels)
    print_lr_coefs(clf, opt_list)
    plot_predict(plotdir, app, appf, "optvar", opt_list, test_df, test_y, pred_y)
Example #43
    print('f1 macro:', res)
    print()
    # color = cm(1. * i / NUM_COLORS)  # color will now be an RGBA tuple
    # cm = plt.get_cmap('gist_rainbow')
    # fig = plt.figure(figsize=(8.0, 5.0))
    # ax = fig.add_subplot(111)
    # # ax.set_color_cycle([cm(1. * i / NUM_COLORS) for i in range(NUM_COLORS)])
    # ax.plot(range(len(scores)), scores, label=str(threshold))
    # ax.text(len(scores) - 1, scores[len(scores) - 1], threshold, fontsize='smaller')
    # plt.show()
    print(name)
    return res


vec_list = [tf(), cv()]
clf_list = [svc(), lr()]
threshold_list = np.arange(0.5, 3, 0.5)
print(len(threshold_list))
# results_size = (len(vec_list), len(clf_list),len(threshold_list))
# results = np.zeros(results_size, dtype = np.float)
# a, b, c = range(3), range(3), range(3)
# def my_func(x, y, z):
#     return (x + y + z) / 3.0, x * y * z, max(x, y, z)

grids = np.vectorize(run)(*np.ix_(threshold_list, vec_list, clf_list))
# mean_grid, product_grid, max_grid = grids
print(len(grids))
try:
    print(grids.shape)
except AttributeError:
    print(type(grids))
	x[:,16] = (x1**4)*x2
	x[:,17] = (x1**3)*(x2**2)
	x[:,18] = (x1**2)*(x2**3)
	x[:,19] = x1*(x2**4)
	x[:,20] = x2**5
	x[:,21] = x1**6
	x[:,22] = (x1**5)*x2
	x[:,23] = (x1**4)*(x2**2)
	x[:,24] = (x1**3)*(x2**3)
	x[:,25] = (x1**2)*(x2**4)
	x[:,26] = x1*(x2**5)
	x[:,27] = x2**6
	return x

data = np.loadtxt("data_microchip.txt",delimiter=",")
m = data[:,0].size
x1 = data[:,0]
x2 = data[:,1]
x = map_features(x1,x2,m)
y = data[:,2]

reg = lr(C=10)
reg.fit(x,y)

s = reg.coef_.size
theta_ans = np.zeros((s+1))
theta_ans[0] = reg.intercept_[0]
theta_ans[1:] = reg.coef_
theta_ans = theta_ans.reshape(s+1,1)
print "%.2f%% accuracy"%(reg.score(x,y)*100)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression as lr

data = np.loadtxt("ex1data1.txt",delimiter = ',')
m = data[:,0].size
x = data[:,0].reshape(m,1)
y = data[:,1]
a = lr(fit_intercept=True)
a.fit(x,y)
print(a.coef_)
print(a.intercept_)
print(a.score(x, y))
plt.scatter(x,y)
plt.plot(x,a.predict(x))
plt.show()

train_data = np.load('train_data.npy')

if load_saved:
    report = np.load("report.npy").item()    
    rbm = RBM(len(train_data), report["n_hidden"], report["batch_size"])
    rbm.W = report["W"]
    rbm.hbias = report["hbias"]
    rbm.vbias = report["vbias"]

Y = np.argmax(train_data[:,:20], axis=1)
train_data = train_data[:,20:]
X = sigmoid(np.dot(train_data, rbm.W) + rbm.hbias)
#X = train_data


# 0.01 was originally passed positionally (the penalty slot); C=0.01 is the likely intent
classifier = lr(C=0.01, solver='lbfgs', multi_class='multinomial')
classifier.fit(X, Y)

test_data = np.load('test_data.npy')
test_X = sigmoid(np.dot(test_data, rbm.W) + rbm.hbias)
#test_X = test_data

pred = classifier.predict(test_X)
train_ids, train_cuisines, train_ingredients = read_data('train.json')
test_ids, test_cuisines, test_ingredients = read_data('test.json')
del train_ids, train_ingredients, test_cuisines, test_ingredients
le = LabelEncoder()
le.fit(train_cuisines)
pred = le.inverse_transform(pred)
create_submission(test_ids, pred)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression as lr

def show_scatter():
	data_admitted = data[data[:,2]==1]
	data_notadmitted = data[data[:,2]==0]
	plt.scatter(data_admitted[:,0],data_admitted[:,1],c='r',s=50)
	plt.scatter(data_notadmitted[:,0],data_notadmitted[:,1],c='b',s=50)
	x_coordinates = [0,-theta_ans[0][0]/theta_ans[1][0]]
	y_coordinates = [-theta_ans[0][0]/theta_ans[2][0],0]
	plt.plot(x_coordinates,y_coordinates)
	plt.show()


data = np.loadtxt("data_logistic_regression.txt",delimiter=",")
m = data[:,0].size
x = data[:,0:2]
y = data[:,2]

reg = lr(C=3.2)
reg.fit(x,y)
s = reg.coef_.size
theta_ans = np.zeros((s+1))
theta_ans[0] = reg.intercept_[0]
theta_ans[1:] = reg.coef_
theta_ans = theta_ans.reshape(s+1,1)
print(theta_ans)
print(reg.score(x, y) * 100, "% accuracy")
show_scatter()
Example #48
    def train(self, train_X, train_Y):
        self.model = lr(penalty=self.hp['regularizer'], C=self.hp['alpha'], tol=self.hp['converg_tol'])
        self.model.fit(train_X, train_Y)
import pandas as pa
from sklearn.linear_model import LinearRegression as lr
import matplotlib.pyplot as plt
import random

random.seed(1)

tabtrain = pa.read_csv('sources/train.csv')
tabtest = pa.read_csv('sources/test.csv')

# Build the feature tables
x_train = tabtrain.drop(['datetime', 'count', 'casual', 'registered'], axis=1)
x_test = tabtest.drop(['datetime'], axis=1)


# Build the target tables
y_train = tabtrain['count']


model = lr()  # the original passed 5 positionally (the fit_intercept slot); the default is equivalent

model.fit(x_train, y_train)

y_test = model.predict(x_test)
y_test = pa.DataFrame(y_test)
y_test.index = tabtest['datetime']

print(y_test)

def logistic_regression_speed_test(dftrain, dftrain_y, plotdir):
    atitle = 'Logistic Regression'
    afile = 'logreg'
    clf = lr()
#   speed_test_medium(clf, dftrain, dftrain_y, atitle, afile, plotdir)
    speed_test_large(clf, dftrain, dftrain_y, atitle, afile, plotdir)
testDf = auxiliary.initialise_test(False)
ids = testDf['Id'].values
# Id,Dates,DayOfWeek,PdDistrict,Address,X,Y,Year,Week,Hour
testDf = testDf.drop(['Id', 'Dates', 'Address', 'X', 'Y'], axis=1)

# Random Forest Algorithm
print(list(trainDf.columns.values))
print(list(testDf.columns.values))
#print list(trainDf.X.values)

# back to numpy format
trainData = trainDf.values
testData = testDf.values

print('Training...')
logit = lr()
logit = logit.fit(trainData[0::,1::], trainData[0::,0])

print('Predicting...')
output = logit.predict_proba(testData).astype(float)
output = output.tolist()

predictions_file = open("../submissionLR.csv", "w", newline="")  # text mode for csv in Python 3
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["Id",'ARSON','ASSAULT','BAD CHECKS','BRIBERY','BURGLARY','DISORDERLY CONDUCT',
                           'DRIVING UNDER THE INFLUENCE','DRUG/NARCOTIC','DRUNKENNESS','EMBEZZLEMENT','EXTORTION',
                           'FAMILY OFFENSES','FORGERY/COUNTERFEITING','FRAUD','GAMBLING','KIDNAPPING','LARCENY/THEFT',
                           'LIQUOR LAWS','LOITERING','MISSING PERSON','NON-CRIMINAL','OTHER OFFENSES',
                           'PORNOGRAPHY/OBSCENE MAT','PROSTITUTION','RECOVERED VEHICLE','ROBBERY','RUNAWAY',
                           'SECONDARY CODES','SEX OFFENSES FORCIBLE','SEX OFFENSES NON FORCIBLE','STOLEN PROPERTY',
                           'SUICIDE','SUSPICIOUS OCC','TREA','TRESPASS','VANDALISM','VEHICLE THEFT','WARRANTS',
import pandas as pa
from sklearn.linear_model import LinearRegression as lr
import matplotlib.pyplot as plt
import random

random.seed(1)

tabtrain = pa.read_csv('sources/train.csv')
tabtest = pa.read_csv('sources/test.csv')

# Build the feature tables
x_train = tabtrain.drop(['datetime', 'count', 'casual', 'registered'], axis=1)
x_test = tabtest.drop(['datetime'], axis=1)


# Build the target tables
y_train = tabtrain['count']


model = lr()

model.fit(x_train, y_train)

y_test = model.predict(x_test)
y_test = pa.DataFrame(y_test)
y_test.index = tabtest['datetime']

print(y_test)

Example #53
    x["miss"] = data.Name.map(lambda x:1 if x.lower().find("miss")>=0 else 0)
    x["master"] = data.Name.map(lambda x:1 if x.lower().find("master")>=0 else 0)

    x["embark_C"] = data.Embarked.map(lambda x:1 if x=="C" else 0)
    x["embark_Q"] = data.Embarked.map(lambda x:1 if x=="Q" else 0)
    x["embark_S"] = data.Embarked.map(lambda x:1 if x=="S" else 0)

    #return x
    p = poly(2, interaction_only=False)
    return p.fit_transform(x)

if __name__ == "__main__":
    data = pd.read_csv("./data/train.csv")

    x = makeInput(data)
    y = data.Survived

    model = lr(C=0.2)
    model.fit(x,y)

    test_data = pd.read_csv("./data/test.csv")
    x_test = makeInput(test_data)
    predict = model.predict(x_test)
    predict = pd.Series(predict)

    y_test = pd.DataFrame({
        "PassengerId": test_data.PassengerId
        ,"Survived": predict
    })
    y_test.to_csv("./predict.csv", index=False)