Esempi in Python per tts, esempi in Python per sklearn.cross_validation.tts

Esempio n. 1

0

Mostra file

File: model_run.py Progetto: pmillan-gpsw/Churn_Rforests

def train_test_split(model_df, percent, num_bootstraps):
    """Build a hold-out test set plus down-sampled bootstrap training sets.

    Rows whose ``customer_life`` is below 60 are discarded first.  The rest
    is split with ``tts`` into a master training set and a test set, and the
    master training set is separated by ``churn_flag`` (1 = churned,
    0 = unchurned).  For every bootstrap round a fresh random sample of each
    group is drawn and the two samples are stacked into one training set.

    Args:
        model_df: DataFrame holding at least the ``customer_life`` and
            ``churn_flag`` columns.  It is not modified.
        percent: Fraction of the filtered rows reserved for the test set
            (passed to ``tts`` as ``test_size``).
        num_bootstraps: Number of down-sampled training sets to draw.

    Returns:
        dict with the keys ``'test_set'`` (held-out rows), ``'master_train'``
        (all training rows), ``'train_indep'`` and ``'train_dep'`` (dicts
        mapping the bootstrap index to the feature frame and to the
        single-column ``churn_flag`` frame respectively).
    """
    print('Entered train test split')
    print(model_df.head())
    print(model_df['customer_life'])

    # Filter with a single boolean mask instead of writing a temporary
    # 'flag' column through chained indexing: chained assignment is not
    # guaranteed to reach the frame and it also modified the caller's data.
    short_lived = model_df['customer_life'] < 60
    kept_columns = [name for name in model_df.columns if name != 'flag']
    model_df = model_df.loc[~short_lived, kept_columns].copy()

    # 'percent' is the test fraction, e.g. 0.2 leaves 80% as master train.
    master_train, test_data = tts(model_df, test_size=percent)
    train_churn = master_train[master_train['churn_flag'] == 1]
    train_uchurn = master_train[master_train['churn_flag'] == 0]
    print(len(train_churn))
    print(len(train_uchurn))

    # Each round keeps 80% of the churned rows; the unchurned sample is
    # sized at nine times that count, expressed as a fraction for tts.
    train_subsample_size = int(len(train_churn) * 0.8)
    sub_uchurn_percent = float(train_subsample_size * 9) / float(
        len(train_uchurn))
    test_size = sub_uchurn_percent

    train_indep_dsamples = {}
    train_dep_dsamples = {}

    print(test_size)
    for i in range(num_bootstraps):
        print(str(i))
        # The "test" half returned by tts is the sample that is kept.
        dummy, down_train_uchurn = tts(train_uchurn, test_size=test_size)
        dummy, down_train_churn = tts(train_churn, test_size=0.8)
        indep_columns = down_train_churn.columns.tolist()
        indep_columns.remove('churn_flag')
        dep_columns = ['churn_flag']
        indep_set = pd.concat([
            down_train_uchurn[indep_columns], down_train_churn[indep_columns]
        ])
        dep_set = pd.concat(
            [down_train_uchurn[dep_columns], down_train_churn[dep_columns]])
        print(len(indep_set))
        print(len(dep_set))
        train_indep_dsamples[i] = indep_set
        train_dep_dsamples[i] = dep_set

    return {
        'test_set': test_data,
        'train_indep': train_indep_dsamples,
        'train_dep': train_dep_dsamples,
        'master_train': master_train,
    }

Esempio n. 2

0

Mostra file

def build_and_evaluate(X, y, classifier=svm.SVC, verbose=True):
    """Fit a text-classification pipeline on 80% of the data and report on the rest.

    The targets are label-encoded, the data is split 80/20 with ``tts``, the
    pipeline is fitted on the training part and a classification report for
    the held-out part is printed.

    Args:
        X: Input documents.
        y: Target labels, one per document.
        classifier: Estimator class or instance used as the final pipeline
            step (defaults to ``svm.SVC``).
        verbose: When true, progress messages are printed.  The report itself
            is always printed.

    Returns:
        None.
    """
    def build(classifier, X, y=None):
        """Assemble the feature pipeline, append ``classifier`` and fit it."""
        if isinstance(classifier, type):
            classifier = classifier()

        # TF-IDF over uni- and bi-grams, reduced to 50 components.
        ngram_features = Pipeline([
            ('ngram',
             TfidfVectorizer(ngram_range=(1, 2),
                             tokenizer=identity,
                             preprocessor=None,
                             lowercase=False)),
            ('best', TruncatedSVD(n_components=50)),
        ])
        bag_of_words = Pipeline([
            ('preprocessor', NLTKPreprocessor()),
            ('topics_and_ngrams',
             FeatureUnion(transformer_list=[('grams', ngram_features)])),
        ])

        model = Pipeline([
            ('union',
             FeatureUnion(transformer_list=[
                 ('bag_words', bag_of_words),
                 # add other features here as an element in transformer list
                 ('capitalize',
                  Pipeline([('cap_words', CaptilizationExtractor())])),
                 ('punctuation', PuncuationExtractor()),
             ])),
            # Use the estimator the caller asked for; previously a fresh
            # svm.SVC() was hard-coded here and 'classifier' was ignored.
            ('svc', classifier),
        ])
        model.fit(X, y)
        return model

    labels = LabelEncoder()
    y = labels.fit_transform(y)

    if verbose:
        print("Building for evaluation")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
    model = build(classifier, X_train, y_train)

    if verbose:
        print("classification Report: \n")

    y_pred = model.predict(X_test)
    print(clsr(y_test, y_pred))

Esempio n. 3

0

Mostra file

File: simple_linear_regression.py Progetto: marcelloaborges/Studies-and-Researches

    def Run(self, csv):
        """Fit a simple linear regression on CSV text and predict the test rows.

        The text is parsed as semicolon-delimited CSV; the first column is
        the feature and the second column is the target.  20% of the rows are
        held out (``random_state=0``).

        Args:
            csv: CSV content as a string.

        Returns:
            List of dicts, one per held-out row, with the keys ``'Test'``
            (feature value), ``'Expected'`` (true target) and ``'Predicted'``.
        """
        dataset = pd.read_csv(StringIO(csv), delimiter=';')
        x = dataset.iloc[:, 0:1].values
        y = dataset.iloc[:, 1].values

        # split base into train and test
        # sklearn.cross_validation was removed from scikit-learn; prefer
        # model_selection and fall back only on old installations.
        try:
            from sklearn.model_selection import train_test_split as tts
        except ImportError:
            from sklearn.cross_validation import train_test_split as tts
        x_train, x_test, y_train, y_test = tts(x,
                                               y,
                                               test_size=0.2,
                                               random_state=0)

        # fit the regression
        from sklearn.linear_model import LinearRegression
        regressor = LinearRegression()
        regressor.fit(x_train, y_train)

        # regression
        y_pred = regressor.predict(x_test)

        result = [{
            'Test': features[0],
            'Expected': expected,
            'Predicted': predicted,
        } for features, expected, predicted in zip(x_test, y_test, y_pred)]

        print(result)

        return result

Esempio n. 4

0

Mostra file

File: polinomial_regression.py Progetto: marcelloaborges/Studies-and-Researches

    def Run(self, csv):
        """Fit a degree-4 polynomial regression on CSV text and predict the test rows.

        The first column is the feature and the second column is the target.
        20% of the rows are held out (``random_state=0``).

        Args:
            csv: CSV content as a string.

        Returns:
            List of dicts, one per held-out row, with the keys ``'Expected'``
            and ``'Preditect'`` (the prediction).
        """
        dataset = pd.read_csv(StringIO(csv))
        x = dataset.iloc[:, 0:1].values
        y = dataset.iloc[:, 1].values

        # sklearn.cross_validation was removed from scikit-learn; prefer
        # model_selection and fall back only on old installations.
        try:
            from sklearn.model_selection import train_test_split as tts
        except ImportError:
            from sklearn.cross_validation import train_test_split as tts
        x_train, x_test, y_train, y_test = tts(x,
                                               y,
                                               test_size=0.2,
                                               random_state=0)

        from sklearn.linear_model import LinearRegression
        from sklearn.preprocessing import PolynomialFeatures

        feature_poly = PolynomialFeatures(degree=4)
        x_poly = feature_poly.fit_transform(x_train)

        pr = LinearRegression()
        pr.fit(x_poly, y_train)

        # The transformer was fitted on the training rows; test rows are only
        # transformed, never used to re-fit it.
        y_pred = pr.predict(feature_poly.transform(x_test))

        # NOTE(review): 'Expected' carries the test *feature* value, not
        # y_test - kept as-is for callers; confirm whether y_test was meant.
        x_test_rows = x_test.tolist()
        result = [{
            'Expected': row[0],
            'Preditect': predicted,
        } for row, predicted in zip(x_test_rows, y_pred)]

        print(result)

        return result

Esempio n. 5

0

Mostra file

File: L-BFGS_Based_Adversarial_Attack.py Progetto: sunyi199374/L-BFGS-Based-Adversarial-Input-Against-SVM-

def fit_model(X, y):
    """Fit the module-level ``svc`` on 5/6 of the data and score it on the rest.

    Args:
        X: Feature matrix.
        y: Target labels.

    Returns:
        Fraction of held-out samples whose prediction equals the true label.
    """
    # 1.0 / 6 stays a float on Python 2 as well, where 1 / 6 evaluates to 0
    # and would request an empty test set.
    Xtr, Xts, ytr, yts = tts(X, y, test_size=1.0 / 6, random_state=0)
    svc.fit(Xtr, ytr)
    yhat_ts = svc.predict(Xts)
    acc = np.mean(yhat_ts == yts)
    print('Accuaracy = {0:f}'.format(acc))
    return acc

Esempio n. 6

0

Mostra file

File: viz.py Progetto: veksev/spring16

def plot_roc_curve(estimators, X, y):
    """Fit each estimator on an 80/20 split and plot all ROC curves together.

    The figure is written to ``aggregate_roccurve.png`` and then cleared.
    Any exception is printed instead of propagated (best-effort plotting).

    Args:
        estimators: A single estimator or a list/tuple of estimators.
        X: Feature matrix.
        y: Target labels.  Column 1 of ``predict_proba`` is used as the
            score, so a binary target is assumed.

    Returns:
        None.
    """
    try:
        if not isinstance(estimators, (list, tuple)):
            estimators = [estimators]

        X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=5557)
        for clf in estimators:
            name = clf.__class__.__name__
            clf.fit(X_train, y_train)
            # hasattr() (unlike "in dir()") is false for methods an estimator
            # lists but cannot serve, so such estimators fall through to
            # decision_function instead of aborting the whole plot.
            if hasattr(clf, 'predict_proba'):
                y_probas = clf.predict_proba(X_test)[:, 1]
            elif hasattr(clf, 'decision_function'):
                y_probas = clf.decision_function(X_test)
            else:
                print('Probability score not available in {}, skipping.'.format(name))
                continue
            fpr, tpr, _ = metrics.roc_curve(y_test, y_probas, drop_intermediate=True)
            plt.plot(fpr, tpr, label=name)

        # The title never had a placeholder; formatting it with 'name' only
        # risked a NameError when no estimator was supplied.
        plt.title('ROC Comparison')
        plt.xlim(-0.05, 1.05)
        plt.ylim(-0.05, 1.05)
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend(loc='lower right')
        plt.savefig('aggregate_roccurve.png')
        plt.clf()
    except Exception as e:
        print(e)

Esempio n. 7

0

Mostra file

 def train(self):
     """Train the selected classifier on 80% of ``self.LIST`` and show its score.

     The text widget ``self.configs['text1']`` is cleared first.  If
     ``self.LIST`` does not exist a message is shown and nothing is trained.
     Otherwise the entries are split 80/20, mapped to vectors through the
     module-level ``model`` and a model from ``get_model`` predicts the test
     part.  The share of predictions that fall into the most frequent label
     is stored in ``self.ACC[self.ini]`` and displayed; when more than one
     label is predicted, the test entries with the rarest label are stored in
     ``self.outliers[self.ini]``.
     """
     self.configs['text1'].delete('1.0', END)
     try:
         self.LIST
     except AttributeError:
         # Only a missing attribute means "not initialised"; a bare except
         # would also swallow KeyboardInterrupt and unrelated errors.
         self.configs['text1'].insert(
             "1.0", 'The entities have not been initialized!')
         return
     x_train_o, x_test_o = tts(self.LIST, test_size=0.2)
     x_train = np.array([model[i] for i in x_train_o])
     x_test = np.array([model[i] for i in x_test_o])
     train_model = get_model(x_train, self.ini)
     y_pred = train_model.predict(x_test)
     labels = sorted(set(y_pred))
     # number of test predictions per label, in label order
     most = [sum(y_pred == i) for i in labels]
     if len(most) > 1:
         arg_outlier = np.argmin(most)
         # boolean-mask indexing: assumes x_test_o is a numpy array
         outliers = x_test_o[y_pred == labels[arg_outlier]]
         self.outliers[self.ini] = outliers
     most = max(most)
     ACC = most * 1.0 / len(y_pred)
     self.ACC[self.ini] = ACC
     self.trained[self.ini] = True
     self.configs['text1'].insert(
         "1.0", 'Type of classifier: ' + names[self.ini] +
         '\n The ACC is:\n' + str(ACC))

Esempio n. 8

0

Mostra file

File: validate.py Progetto: pawelchw/learning

def train_split(data, outcome, predictors, ratio=0.3):
    """Split ``data`` into train/test features and labels.

    Args:
        data: Table indexable by column name(s).
        outcome: Name of the label column.
        predictors: Names of the feature columns.
        ratio: Fraction of rows placed in the test part.

    Returns:
        Tuple ``(x_train, x_test, l_train, l_test)``; the split is seeded
        with ``random_state=123``.
    """
    features = data[predictors]
    target = data[outcome]
    parts = tts(features, target, test_size=ratio, random_state=123)
    x_train, x_test, l_train, l_test = parts
    return x_train, x_test, l_train, l_test

Esempio n. 9

0

Mostra file

File: classifyReceipt.py Progetto: rafiparvez/ReceiptRecognizer

def buildnEvaluateModel(X, y):
    """Train a linear-SVC text classifier and print its hold-out performance.

    20% of the supplied data is held out as a cross-validation set.  The
    labels are converted to booleans, a count -> tf-idf -> SVC pipeline is
    fitted on the remaining 80%, and accuracy plus a classification report
    for the held-out part are printed.

    Returns:
        The fitted pipeline.
    """
    X_train, X_cv, y_train, y_cv = tts(X, y, test_size=0.2)

    # labels arrive as float-like values; turn them into plain booleans
    y_train = [bool(int(value)) for value in y_train]
    y_cv = [bool(int(value)) for value in y_cv]

    # the encoder is only used for the class names shown in the report
    labels = LabelEncoder()
    labels.fit(y_train)

    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SVC(kernel='linear', probability=True)),
    ]
    text_clf = Pipeline(steps)
    text_clf.fit(X_train, y_train)

    # evaluate on the held-out part
    predicted = text_clf.predict(X_cv)
    accuracy = np.mean(predicted == y_cv)
    print("Model Accuracy = " + str(accuracy))
    class_names = [str(name) for name in labels.classes_]
    print(clsr(y_cv, predicted, target_names=class_names))

    return text_clf

Esempio n. 10

0

Mostra file

def build_and_evaluate(X,
                       y,
                       classifier=SGDClassifier,
                       outpath=None,
                       verbose=True):
    """Evaluate a text classifier on a hold-out split, then refit it on all data.

    Args:
        X: Input documents.
        y: Target labels; they are label-encoded before fitting.
        classifier: Estimator class or instance used as the last pipeline step.
        outpath: Optional path; when given the final model is pickled there.
        verbose: When true, progress and timing messages are printed.  The
            classification report is always printed.

    Returns:
        The pipeline fitted on the complete data, with the fitted
        ``LabelEncoder`` attached as ``labels_``.
    """
    def build(classifier, X, y=None):
        """Create the preprocessing/vectorizing/classifying pipeline and fit it."""
        estimator = classifier() if isinstance(classifier, type) else classifier
        vectorizer = TfidfVectorizer(tokenizer=identity,
                                     preprocessor=None,
                                     lowercase=False)
        model = Pipeline([
            ('preprocessor', NLTKPreprocessor()),
            ('vectorizer', vectorizer),
            ('classifier', estimator),
        ])
        model.fit(X, y)
        return model

    # Label encode the targets
    labels = LabelEncoder()
    y = labels.fit_transform(y)

    started = time()

    # Hold-out evaluation on 20% of the data
    if verbose:
        print("Building for evaluation")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
    model = build(classifier, X_train, y_train)

    if verbose:
        elapsed = time() - started
        print("Evaluation model fit in {:0.3f} seconds".format(elapsed))
        print("Classification Report:\n")

    y_pred = model.predict(X_test)
    print(clsr(y_test, y_pred, target_names=labels.classes_))

    # Final model on every sample
    started = time()
    if verbose:
        print("Building complete model and saving ...")
    model = build(classifier, X, y)
    model.labels_ = labels

    if verbose:
        elapsed = time() - started
        print("Complete model fit in {:0.3f} seconds".format(elapsed))

    if outpath:
        with open(outpath, 'wb') as handle:
            pickle.dump(model, handle)

        print("Model written out to {}".format(outpath))

    return model

Esempio n. 11

0

Mostra file

File: topic_modelling_and_classification.py Progetto: jannor226/auto_topic_modelling-classification

def prepare_dataset(corpus, labels, test_data_proportion=0.3):
    """Create a train/test split of a classification dataset.

    Args:
        corpus: Documents (features).
        labels: Target labels, aligned with ``corpus``.
        test_data_proportion: Fraction of samples placed in the test part.

    Returns:
        Tuple ``(train_x, test_x, train_y, test_y)``; the split is seeded
        with ``random_state=42``.
    """
    # Honour the caller's proportion; it used to be ignored in favour of a
    # hard-coded 0.3.
    train_x, test_x, train_y, test_y = tts(corpus,
                                           labels,
                                           test_size=test_data_proportion,
                                           random_state=42)
    return train_x, test_x, train_y, test_y

Esempio n. 12

0

Mostra file

def train_subset(x_train, y_train, x_test, porc_corte, pipeline):
    """Fit ``pipeline`` on a random subset of the training data and predict.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_test: Features to predict.
        porc_corte: Fraction of the training data that is left out of the
            fit (passed to ``tts`` as ``test_size``).
        pipeline: Estimator exposing ``fit`` and ``predict``.

    Returns:
        Predictions of the fitted pipeline for ``x_test``.
    """
    # Only the retained part is needed; the cut-off part is discarded.
    x_train_1, _, y_train_1, _ = tts(x_train,
                                     y_train,
                                     random_state=0,
                                     test_size=porc_corte)
    pipeline.fit(x_train_1, y_train_1)

    # Function-call form prints the same text on Python 2 and Python 3.
    print("Predict!!")
    return pipeline.predict(x_test)

Esempio n. 13

0

Mostra file

 def test_resid_plots(self):
     """
     Smoke test: a ResidualsPlot built around a fitted SVR can be scored
     on the held-out half of the module-level X, y without raising
     """
     model = SVR()
     halves = tts(X, y, test_size=0.5)
     X_train, X_test, y_train, y_test = halves
     model.fit(X_train, y_train)
     ResidualsPlot(model).score(X_test, y_test)

Esempio n. 14

0

Mostra file

 def __init__(self, iris):
     """Store the dataset and split it 70/30 into train and test parts.

     ``iris`` must expose ``data`` and ``target``.  The split seed is drawn
     with ``random.randint(400, 600)``.
     """
     self.iris = iris
     features, targets = iris.data, iris.target
     seed = random.randint(400, 600)
     split = tts(features, targets, train_size=.7, random_state=seed)
     # order returned by tts: train data, test data, train target, test target
     self.d_train, self.d_test, self.t_train, self.t_test = split
     self.prediction = []
     self.percent = 0

Esempio n. 15

0

Mostra file

File: Imbal CC Frd SMOTE 2.py Progetto: srees16/ML-Deep-Learning-Python

def data_preparation(x):
    """Split a frame into train/test features and labels (70/30).

    The column named ``"Class"`` is the label; every other column is a
    feature.  The sizes of both parts are printed.

    Returns:
        Tuple ``(features_train, features_test, labels_train, labels_test)``.
    """
    is_label = x.columns == "Class"
    x_features = x.iloc[:, ~is_label]
    x_labels = x.iloc[:, is_label]
    parts = tts(x_features, x_labels, test_size=0.3)
    x_features_train, x_features_test, x_labels_train, x_labels_test = parts
    print("length of training data")
    print(len(x_features_train))
    print("length of test data")
    print(len(x_features_test))
    return x_features_train, x_features_test, x_labels_train, x_labels_test

Esempio n. 16

0

Mostra file

 def __init__(self, location, split=0.2):
     """Load a CSV file and split it into train and test arrays.

     Every column but the last is a feature; the last column is the target.

     Args:
         location: Path (or buffer) handed to ``pd.read_csv``.
         split: Fraction of rows placed in the test part.
     """
     self.location = location
     frame = pd.read_csv(location)
     self.x = frame.iloc[:, :-1].values
     self.y = frame.iloc[:, -1].values
     parts = tts(self.x, self.y, test_size=split)
     self.xtr, self.xte, self.ytr, self.yte = parts
     self.t = 0
     self.tt = self.t + 1

Esempio n. 17

0

Mostra file

def run_svmtest_int(num):
    """Repeat a 90/10 split-fit-score cycle of an SVC ``num`` times.

    Uses the module-level ``I`` (single feature, reshaped to one column) and
    ``y2`` (target).

    Returns:
        List with the accuracy score of every repetition.
    """
    scores = []
    for _ in range(num):
        I_train, I_test, y2_train, y2_test = tts(I, y2, test_size=.1)
        model = svm.SVC()
        # a single feature has to be presented as an (n, 1) column
        model.fit(I_train.values.reshape(-1, 1), y2_train)
        predicted = model.predict(I_test.values.reshape(-1, 1))
        scores.append(accuracy_score(y2_test, predicted))
    return scores

Esempio n. 18

0

Mostra file

File: regressor.py Progetto: mariusvniekerk/yellowbrick

    def fit(self, X, y):
        """
        Split the data 80/20, keep the four parts on the instance and fit
        every model in ``self.models`` on the training part.

        TODO: move to MultiModelMixin.
        """
        # TODO: make test size a parameter and do better data storage on viz.
        parts = tts(X, y, test_size=0.2)
        self.X_train, self.X_test, self.y_train, self.y_test = parts
        # each entry is replaced by whatever its fit() call returns
        self.models = [
            model.fit(self.X_train, self.y_train) for model in self.models
        ]

Esempio n. 19

0

Mostra file

def main():
    """Run 10-fold cross-validation of an RBF SVM and collect per-fold metrics.

    Reads the module-level ``trall`` (samples) and ``yall`` (labels).  For
    every fold a table is built from the positive and negative training
    rows, both partitions are converted with ``GetFeatures``, ``C`` and
    ``gamma`` are chosen by a parameter search, and a final SVC is fitted on
    the fold's training features and scored on its test features.

    Returns:
        Tuple ``(mcc, acc, aucs, sen, spe, figs)`` of per-fold lists; every
        ``figs`` entry is ``[fpr, tpr]`` of that fold's ROC curve.
    """
    kfold = KFold(len(yall), 10)
    sen = []
    spe = []
    acc = []
    mcc = []
    figs = []
    #set the params of SVM
    C = np.linspace(0.6, 0.8, 10)
    G = np.linspace(0.13, 0.22, 10)
    clist = []
    glist = []
    aucs = []
    param = {'C': C, 'gamma': G}
    for ind1, ind2 in kfold:
        print('*********')
        x_train = trall[ind1]
        y_train = yall[ind1]
        # table is derived from the training rows only, split by class
        X_p = x_train[y_train == 1]
        X_n = x_train[y_train == 0]
        Table = frequences_matrix_mainFunc(X_p, X_n)
        x_train, y_train = GetFeatures(x_train, y_train, Table)
        x_test = trall[ind2]
        y_test = yall[ind2]
        # the test fold is encoded with the table built from the train fold
        x_test, y_test = GetFeatures(x_test, y_test, Table)
        svm = SVC(kernel='rbf', probability=True)
        x1, x2, y1, y2 = tts(x_train, y_train, test_size=0.2)
        cv = CV(svm, param, n_jobs=2)
        # NOTE(review): the search is fitted on the 20% part (x2, y2) while
        # x1/y1 stay unused - confirm this sub-sampling is intentional.
        cv.fit(x2, y2)
        best = cv.best_params_
        c = best['C']
        g = best['gamma']
        clist.append(c)
        glist.append(g)
        print('c,g:', c, g)
        # refit on the whole training fold with the selected parameters
        svm = SVC(kernel='rbf', C=c, gamma=g, probability=True)
        svm.fit(x_train, y_train)
        acc_r = svm.score(x_test, y_test)
        mcc_r, sen_r, spe_r = getmcc2(svm, x_test, y_test)
        acc.append(acc_r)
        mcc.append(mcc_r)
        sen.append(sen_r)
        spe.append(spe_r)
        # column 1 of predict_proba is used as the ROC score
        scores = svm.predict_proba(x_test)[:, 1]
        fpr, tpr, thres = roc_curve(y_test, scores)
        figs.append([fpr, tpr])
        #print('sen:',sen_r,'\n','spe:',spe_r)
        auc_r = auc(fpr, tpr)
        aucs.append(auc_r)
        print(auc_r)
        print('acc:', acc_r, '\n', 'mcc:', mcc_r)
        print('*********')
    # clist/glist (chosen parameters per fold) are collected but not returned
    return mcc, acc, aucs, sen, spe, figs

Esempio n. 20

0

Mostra file

def run_treetest_f1(num):
    """Repeat a 90/10 split-fit-score cycle of a decision tree ``num`` times.

    Uses the module-level ``V`` (single feature, reshaped to one column) and
    ``y`` (target).

    Returns:
        List with the accuracy score of every repetition.
    """
    scores = []
    for _ in range(num):
        V_train, V_test, y_train, y_test = tts(V, y, test_size=.1)
        model = tree.DecisionTreeClassifier()
        # a single feature has to be presented as an (n, 1) column
        model.fit(V_train.values.reshape(-1, 1), y_train)
        predicted = model.predict(V_test.values.reshape(-1, 1))
        scores.append(accuracy_score(y_test, predicted))
    return scores

Esempio n. 21

0

Mostra file

File: classify.py Progetto: sophie4869/topic-model

def classifier():
    """Grid-search the ``C`` of a LinearSVC and print train/test accuracy.

    The count matrix comes from ``jiebaCounter()``; the labels ``y`` and the
    matrix ``x_`` used for the search are read from outside this function.
    Nothing is returned; results are printed.
    """
    vect,voc,txt=jiebaCounter()
    # normalisation
    # divides by the per-row maximum (plus epsilon to avoid division by zero)
    # NOTE(review): correct row-wise only if np.max keeps a column shape
    # (e.g. vect is a np.matrix) - TODO confirm
    x=np.array(vect/(np.max(vect,axis=1)+1e-10))
    x_train,x_test,y_train,y_test=tts(x,y,test_size=0.25,train_size=0.75)
    clf=svm.LinearSVC()
    clf.fit(x_train,y_train)
    # ten candidate C values, log-spaced between 1e-5 and 1
    Cs=np.logspace(-5,0,10)
    clf_ = GridSearchCV(estimator=clf, param_grid=dict(C=Cs))
    # NOTE(review): x_ is not defined in this function; if it is the full
    # data set, the accuracies below are measured on rows seen during the
    # search - verify against the module
    clf_.fit(x_,y)
    print(clf_.best_params_)
    print("train accuracy:")
    print(np.sum(clf_.predict(x_train)==y_train)/float(len(y_train)))
    print("test accuracy:")
    print(np.sum(clf_.predict(x_test)==y_test)/float(len(y_test)))

Esempio n. 22

0

Mostra file

File: codes.py Progetto: mehmetgoren/machine_learning

def train(x_dataset, y_dataset, test_size=.33):
    """Fit a linear regression on a random split and predict the test part.

    Args:
        x_dataset: Features.
        y_dataset: Targets; the test part must expose ``.values``.
        test_size: Fraction of rows placed in the test part.

    Returns:
        Tuple ``(result, y_true, y_pred)`` where ``result`` is a list of
        ``(float(actual), float(predicted))`` pairs, ``y_true`` is
        ``y_test.values`` and ``y_pred`` is the raw prediction array.
    """
    x_train, x_test, y_train, y_test = tts(x_dataset,
                                           y_dataset,
                                           test_size=test_size)

    regressor = LinearRegression()
    regressor.fit(x_train, y_train)
    predict = regressor.predict(x_test)

    # pair every true value with its prediction, both as plain floats
    result = [(float(actual), float(predicted))
              for actual, predicted in zip(y_test.values, predict)]

    return result, y_test.values, predict

Esempio n. 23

0

Mostra file

File: seeds.py Progetto: gunner272/StatAI

def main():
    """Evaluate k = 1, 2, 3 on ten random 50/50 splits, then k = 3 on 5 folds.

    Reads ``../data/alabone.data``, uses the ``label`` column as target and
    drops the ``c1`` column.  The values returned by the ``results`` helper
    (presumably accuracies - see the printed messages) are collected per k,
    and their mean and variance are printed.

    Indentation is normalised to spaces (the mixed tabs/spaces are rejected
    by Python 3) and every print uses the function-call form with a single
    pre-formatted string, which prints the same text on Python 2 and 3.
    """
    df = pd.read_csv('../data/alabone.data', header=0, error_bad_lines=False)

    tar = df['label']

    df = df.drop(['c1', 'label'], axis=1)
    # Q1 split 50-50%
    rk = {1: [], 2: [], 3: []}
    for i in range(0, 10):
        print('Test run %s' % i)
        xtrain, xtest, ytrain, ytest = tts(df, tar, test_size=0.5)
        rk[1].append(results(xtrain, xtest, ytrain, ytest, k=1))
        print('')
        rk[2].append(results(xtrain, xtest, ytrain, ytest, k=2))
        print('')
        rk[3].append(results(xtrain, xtest, ytrain, ytest, k=3))

    print("Mean accuracy and variance over 10 runs with k = 1 %s %s" %
          (np.mean(rk[1]), np.var(rk[1])))
    print('')
    print("Mean accuracy and variance over 10 runs with k = 2 %s %s" %
          (np.mean(rk[2]), np.var(rk[2])))
    print('')
    print("Mean accuracy and variance over 10 runs with k = 3 %s %s" %
          (np.mean(rk[3]), np.var(rk[3])))

    # Cross validation 5 fold
    sf = StratifiedKFold(tar, n_folds=5)
    i = 1
    rk[3] = []
    for train, test in sf:
        print('Fold %s' % i)
        i = i + 1
        xtrain, xtest = df.values[train], df.values[test]
        ytrain, ytest = tar.values[train], tar.values[test]
        print('')
        # NOTE(review): this calls `result`, while the runs above call
        # `results` - looks like a typo; confirm which helper exists.
        rk[3].append(result(xtrain, xtest, ytrain, ytest, k=3))

    print('')
    print("Mean accuracy and variance over 5-folds %s %s" %
          (np.mean(rk[3]), np.var(rk[3])))

Esempio n. 24

0

Mostra file

File: classify.py Progetto: sophie4869/topic-model

def classifier():
    """Grid-search the ``C`` of a LinearSVC and print train/test accuracy.

    The count matrix comes from ``jiebaCounter()``; the labels ``y`` and the
    matrix ``x_`` used for the search are read from outside this function.
    Nothing is returned; results are printed.
    """
    vect, voc, txt = jiebaCounter()
    # normalisation: divide by the per-row maximum (epsilon avoids 0-division)
    row_peak = np.max(vect, axis=1) + 1e-10
    x = np.array(vect / row_peak)
    x_train, x_test, y_train, y_test = tts(x,
                                           y,
                                           test_size=0.25,
                                           train_size=0.75)
    clf = svm.LinearSVC()
    clf.fit(x_train, y_train)
    # ten candidate C values, log-spaced between 1e-5 and 1
    Cs = np.logspace(-5, 0, 10)
    search = GridSearchCV(estimator=clf, param_grid={'C': Cs})
    # NOTE(review): x_ is not defined in this function - verify its origin
    search.fit(x_, y)
    print(search.best_params_)
    print("train accuracy:")
    train_hits = np.sum(search.predict(x_train) == y_train)
    print(train_hits / float(len(y_train)))
    print("test accuracy:")
    test_hits = np.sum(search.predict(x_test) == y_test)
    print(test_hits / float(len(y_test)))

Esempio n. 25

0

Mostra file

File: iris.py Progetto: gunner272/StatAI

def main():
    """Evaluate k = 1, 2, 3 on ten random 50/50 splits, then k = 3 on 5 folds.

    Reads ``../data/iris.data``, names its five columns and uses ``label``
    as target.  The values returned by the ``results`` helper (presumably
    accuracies - see the printed messages) are collected per k, and their
    mean and variance are printed.

    Indentation is normalised to spaces (the mixed tabs/spaces are rejected
    by Python 3) and every print uses the function-call form with a single
    pre-formatted string, which prints the same text on Python 2 and 3.
    """
    df = pd.read_csv('../data/iris.data',)
    df.columns = ['sepal_l', 'sepal_w', 'petal_l', 'petal_w', 'label']

    tar = df['label']

    df = df.drop(['label'], axis=1)
    # Q1 split 50-50%
    rk = {1: [], 2: [], 3: []}
    for i in range(0, 10):
        print('Test run %s' % i)
        xtrain, xtest, ytrain, ytest = tts(df, tar, test_size=0.5)
        rk[1].append(results(xtrain, xtest, ytrain, ytest, k=1))
        print('')
        rk[2].append(results(xtrain, xtest, ytrain, ytest, k=2))
        print('')
        rk[3].append(results(xtrain, xtest, ytrain, ytest, k=3))

    print("Mean accuracy and variance over 10 runs with k = 1 %s %s" %
          (np.mean(rk[1]), np.var(rk[1])))
    print('')
    print("Mean accuracy and variance over 10 runs with k = 2 %s %s" %
          (np.mean(rk[2]), np.var(rk[2])))
    print('')
    print("Mean accuracy and variance over 10 runs with k = 3 %s %s" %
          (np.mean(rk[3]), np.var(rk[3])))

    # Cross validation 5 fold
    sf = StratifiedKFold(tar, n_folds=5)
    i = 1
    rk[3] = []
    for train, test in sf:
        print('Fold %s' % i)
        i = i + 1
        xtrain, xtest = df.values[train], df.values[test]
        ytrain, ytest = tar.values[train], tar.values[test]
        print('')
        # NOTE(review): this calls `result`, while the runs above call
        # `results` - looks like a typo; confirm which helper exists.
        rk[3].append(result(xtrain, xtest, ytrain, ytest, k=3))

    print('')
    print("Mean accuracy and variance over 5-folds %s %s" %
          (np.mean(rk[3]), np.var(rk[3])))

Esempio n. 26

0

Mostra file

File: evaluator.py Progetto: josneville/CS410_FinalProject

def build_and_save_model(X, y, filepath):
    """
    Train an SGD text classifier, report its hold-out quality and pickle it.

    - The classes are label-encoded.
    - 10% of the data is held out; the model is fitted on the other 90%.
    - A classification report for the held-out part is printed.
    - The fitted model (with the encoder attached as ``labels_``) is written
      to ``filepath`` and returned.
    """
    def build(classifier, X, y=None):
        """
        Fit a preprocessor -> tf-idf vectorizer -> classifier pipeline
        """
        estimator = classifier() if isinstance(classifier, type) else classifier
        vectorizer = TfidfVectorizer(tokenizer=identity,
                                     preprocessor=None,
                                     lowercase=False)
        model = Pipeline([
            ('preprocessor', DataPreProcessor()),
            ('vectorizer', vectorizer),
            ('classifier', estimator),
        ])
        model.fit(X, y)
        return model

    # Label encode the classes we chose
    encoder = LabelEncoder()
    y = encoder.fit_transform(y)

    # Hold out 10% for the report
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.1)
    model = build(SGDClassifier, X_train, y_train)

    report = clsr(y_test, model.predict(X_test), target_names=encoder.classes_)
    print(report)

    model.labels_ = encoder

    with open(filepath, 'wb') as handle:
        pickle.dump(model, handle)

    return model

Esempio n. 27

0

Mostra file

def tfidf_iterator(batch_size=100,max_features=10000,path="/home/tingyubi/20w/data/",prefix="extraction-",begin=1,end=26):
    """Load a pickled tf-idf matrix and wrap a 90/10 split in MXNet iterators.

    The matrix is read from ``tfidf_<prefix><begin>_<end>.mat`` and the
    vocabulary from the matching ``.voc`` file (UTF-8, one term per line),
    both relative to the working directory.  Every row is scaled by its
    maximum before splitting.

    Args:
        batch_size: Batch size of both iterators.
        max_features: Unused here; kept for interface compatibility.
        path: Unused here; kept for interface compatibility.
        prefix: File name prefix.
        begin: First index encoded in the file name.
        end: Last index encoded in the file name.

    Returns:
        Tuple ``(train_iter, test_iter, voc)``.
    """
    stem = "tfidf_"+prefix+str(begin)+"_"+str(end)
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    with open(stem+".mat",'rb') as f:
        tf=cPickle.load(f)
    # Context manager closes the vocabulary file even if reading fails.
    with open(stem+".voc",'r') as f:
        voc=f.read().decode('utf-8').split("\n")
    tf = tf.toarray()
    # row-wise max scaling; the epsilon guards all-zero rows
    tf = tf / (np.max(tf,axis = 1)[:, None] + 1e-10)
    x_train,x_test=tts(tf,train_size=0.9,test_size=0.1)
    train_iter = mx.io.NDArrayIter(data=x_train,batch_size=batch_size,shuffle=True)
    test_iter = mx.io.NDArrayIter(data=x_test,batch_size=batch_size,shuffle=True)
    return train_iter,test_iter,voc

Esempio n. 28

0

Mostra file

File: model_builder.py Progetto: sweinger/learnedleague-defender

def build_model(X, y, classifier, verbose=True):
    """Evaluate ``classifier`` on a hold-out split, then refit it on all data.

    Args:
        X: Input documents.
        y: Target labels; they are label-encoded before fitting.
        classifier: Estimator placed at the end of the pipeline.
        verbose: When true, progress and timing messages are printed.  The
            classification report is always printed.

    Returns:
        The pipeline fitted on the complete data; its ``labels_`` attribute
        holds the original labels of ``model.classes_``.
    """
    @timeit
    def build(classifier, X, y=None):
        """
        Fit a preprocessor -> tf-idf vectorizer -> classifier pipeline.
        """
        vectorizer = TfidfVectorizer(tokenizer=identity,
                                     preprocessor=None,
                                     lowercase=False)
        pipeline = Pipeline([
            ('preprocessor', NLTKPreprocessor()),
            ('vectorizer', vectorizer),
            ('classifier', classifier),
        ])
        pipeline.fit(X, y)
        return pipeline

    # Label encode the targets
    encoder = LabelEncoder()
    y = encoder.fit_transform(y)

    # Hold-out evaluation on 20% of the data
    if verbose:
        print("Building for evaluation")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
    # the decorated build() is unpacked as (model, elapsed seconds)
    model, secs = build(classifier, X_train, y_train)

    if verbose:
        print("Evaluation model fit in {:0.3f} seconds".format(secs))
        print("Classification Report:\n")

    predicted = model.predict(X_test)
    print(clsr(y_test, predicted, target_names=encoder.classes_))

    if verbose:
        print("Building complete model and saving ...")
    model, secs = build(classifier, X, y)
    model.labels_ = encoder.inverse_transform(model.classes_)

    if verbose:
        print("Complete model fit in {:0.3f} seconds".format(secs))

    return model

Esempio n. 29

0

Mostra file

File: face-recognition_svm.py Progetto: xieydd/xieydd-s-respository

def pca_svm(pca_n=10,svm_C=1):
    """Train PCA + SVC on the data from ``get_data()`` and score the test split.

    Args:
        pca_n: Number of PCA components.
        svm_C: Regularisation parameter ``C`` of the SVC.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, seconds)`` where precision
        and recall are weighted averages, ``f1`` is their harmonic mean and
        ``seconds`` is the elapsed wall-clock time.
    """
    t1=time.time()
    data,target=get_data()
    x_train,x_test,y_train,y_test=tts(data,target,random_state=33)
    # PCA is fitted on the training rows only and then applied to the test rows
    pca_learner=decomposition.PCA(n_components=pca_n)
    x_train=pca_learner.fit_transform(x_train)
    svm_learner=svm.SVC(C=svm_C)
    svm_learner.fit(x_train,y_train)
    x_test_pre=pca_learner.transform(x_test)
    y_test_pre=svm_learner.predict(x_test_pre)
    ac=svm_learner.score(x_test_pre,y_test)
    p=precision_score(y_test,y_test_pre,average='weighted')
    r=recall_score(y_test,y_test_pre,average='weighted')
    # Harmonic mean; defined as 0 when precision or recall is 0 instead of
    # dividing by zero.
    f1=2.0/(1.0/p+1.0/r) if p and r else 0.0
    t=time.time()-t1
    return ac,p,r,f1,t

Esempio n. 30

0

Mostra file

File: logistic_regression.py Progetto: marcelloaborges/Studies-and-Researches

    def Run(self, csv):
        """Fit a logistic regression on CSV text and predict the test rows.

        Columns 0 and 1 are the features and column 2 is the target.  20% of
        the rows are held out (``random_state=0``); features are
        standardised with statistics learned from the training rows.

        Args:
            csv: CSV content as a string.

        Returns:
            List of dicts, one per held-out row, with the keys ``'Age'`` and
            ``'Salary'`` (unscaled feature values), ``'Expected'`` (true
            target) and ``'Preditect'`` (prediction).
        """
        dataset = pd.read_csv(StringIO(csv))

        x = dataset.iloc[:, [0, 1]].values
        y = dataset.iloc[:, 2].values

        # sklearn.cross_validation was removed from scikit-learn; prefer
        # model_selection and fall back only on old installations.
        try:
            from sklearn.model_selection import train_test_split as tts
        except ImportError:
            from sklearn.cross_validation import train_test_split as tts
        x_train, x_test, y_train, y_test = tts(x,
                                               y,
                                               test_size=0.2,
                                               random_state=0)

        from sklearn.preprocessing import StandardScaler
        sc_x = StandardScaler()

        x_train_sc = sc_x.fit_transform(x_train)
        # Re-use the training mean/variance; re-fitting on the test rows
        # would scale them differently from what the model was trained on.
        x_test_sc = sc_x.transform(x_test)

        from sklearn.linear_model import LogisticRegression
        #FOR LINEAR LOGISTIC REGRESSION => ONLY TWO OUTPUTS (SIGMOID)
        llr = LogisticRegression(random_state=0)

        llr.fit(x_train_sc, y_train)

        y_pred = llr.predict(x_test_sc)

        # Convert once instead of rebuilding the lists for every row.
        x_test_rows = x_test.tolist()
        expected = y_test.tolist()
        predicted = y_pred.tolist()
        result = [{
            'Age': row[0],
            'Salary': row[1],
            'Expected': exp,
            'Preditect': pred,
        } for row, exp, pred in zip(x_test_rows, expected, predicted)]

        print(result)

        return result

Esempio n. 31

0

Mostra file

File: face-recognition_svm.py Progetto: xieydd/xieydd-s-respository

def pca_svm_pipeline():  
    """Grid-search a PCA + SVC pipeline and plot every image by predicted label.

    The data from ``get_data()`` is split with ``random_state=33``; the
    number of PCA components (5, 15, ..., 195) and ``C`` (only 1) are
    searched on the training part.  The best parameters and a classification
    report for the test part are printed, then all samples are predicted and
    drawn in a grid; misclassified samples get a red cross.  Returns None.
    """
    #svm_C=numpy.linspace(0.5,10,10)  
    svm_C=[1]  
    pca_n_components=numpy.arange(5,200,10)  
    data,target=get_data()  
    x_train,x_test,y_train,y_test=tts(data,target,random_state=33)  
    #scale_learner=StandardScaler()  
    pca_learner=decomposition.PCA()  
    svm_learner=svm.SVC()  
    pipe=pipeline.Pipeline([('pca',pca_learner),('svm',svm_learner)])  
    gscv=GridSearchCV(pipe,  
                      {'pca__n_components':pca_n_components,'svm__C':svm_C},n_jobs=-1)  
    gscv.fit(x_train,y_train)  
    y_test_pre=gscv.predict(x_test)  
    report=classification_report(y_test,y_test_pre)  
    print(gscv.best_params_ )
    print(report)  
    # predictions for the whole data set (train and test rows) drive the plot
    target_pre=gscv.predict(data)  
    n1,n2=data.shape  
    figure=pyplot.figure()  
    # L[k] counts how many images have been drawn for predicted label k so
    # far; sized for 40 labels
    L=numpy.zeros((40,))  
    # unit-square coordinate grid for one 64x64 image, shifted by 13
    xx=numpy.linspace(0,1,64)+13  
    yy=numpy.linspace(1,0,64)+13  
    xx,yy=numpy.meshgrid(xx,yy)  
    for i in range(n1):  
        k=target_pre[i]  
        g=L[k]  
        L[k]+=1  
        # horizontal offset by predicted label, vertical offset by the
        # number of images already drawn for that label
        xx1=xx-k  
        yy1=yy-g  
        # each sample is reshaped to a 64x64 image
        pyplot.contourf(xx1,yy1,data[i].reshape((64,64)),cmap='gray')  
        if target[i]!=target_pre[i]:  
            # mark a misclassified image at its centre
            pyplot.scatter(numpy.mean(xx1),numpy.mean(yy1),marker='x',c='red',s=40)  
    pyplot.axis('off')  
    # NOTE(review): grid('off') relies on string handling of old matplotlib
    # releases - confirm it still disables the grid on the version in use
    pyplot.grid('off')  
    pyplot.title('PCA & SVM Recongnize Faces')  
    pyplot.show()

Esempio n. 32

0

Mostra file

def build_and_evaluate(text, leanings, classifier=SGDClassifier, verbose=True):
    """Evaluate a text classifier on a hold-out split, then refit it on all data.

    Args:
        text: the documents to classify.
        leanings: the target label of each document; encoded with a
            ``LabelEncoder`` before training.
        classifier: an estimator class (instantiated with no arguments) or
            an already constructed estimator instance.
        verbose: when true (the default) print the classification report
            of the hold-out evaluation.

    Returns:
        A tuple ``(leanings_test, leanings_pred_prob, model)``: the encoded
        hold-out labels, the probabilities predicted for the hold-out
        documents, and the pipeline refitted on the full data set. The
        fitted label encoder is attached to the model as ``model.labels_``.
    """

    def build(classifier, X, y=None):
        """Fit and return a preprocessor -> TF-IDF -> classifier pipeline."""
        if isinstance(classifier, type):
            classifier = classifier()
        model = Pipeline([
            ('preprocessor', NLTKPreprocessor()),
            ('vectorizer',
             TfidfVectorizer(tokenizer=identity,
                             preprocessor=None,
                             lowercase=False)),
            ('classifier', classifier),
        ])
        model.fit(X, y)
        return model

    # Label encode the targets
    labels = LabelEncoder()
    leanings = labels.fit_transform(leanings)

    # Build model on training data (80/20 split).
    text_train, text_test, leanings_train, leanings_test = tts(text,
                                                               leanings,
                                                               test_size=0.2)
    model = build(classifier, text_train, leanings_train)

    leanings_pred = model.predict(text_test)
    # NOTE(review): this requires the final estimator to implement
    # predict_proba; confirm the default classifier does with its default
    # settings.
    leanings_pred_prob = model.predict_proba(text_test)
    if verbose:
        print(clsr(leanings_test, leanings_pred, target_names=labels.classes_))

    # Build model on all data.
    model = build(classifier, text, leanings)
    model.labels_ = labels

    return leanings_test, leanings_pred_prob, model

Esempio n. 33

0

Mostra file

File: cov_shift.py Progetto: choudharydhruv/dec-meg-2014

    for vid,Xt,yt in zip(subjId_val, X_val, y_val):
	levelOneTest = []
	levelOneTrain = []
	X_levelOne = []
	y_levelOne = []	
	level0Classifier = []
        for tid,Xp,yp in zip(subjId_train,X_train,y_train):
	    print "Predicting subject ", vid, "from subject ", tid
            y0 = np.zeros(yp.shape)
	    y1 = np.ones(Xt.shape[0])
	    X = np.vstack([Xp,Xt])
            yd = np.concatenate([y0,y1])

            pls = PLSRegression(n_components)
	    Xp_t, Xp_v, yp_t, yp_v = tts(Xp.copy(),yp.copy(),train_size=0.9)
	    yp_t = yp_t.astype(bool)
	    yp_t_not =  np.vstack((yp_t,~yp_t)).T
	    #print "yp_t_not ", yp_t_not.shape
	    pls.fit(Xp_t,yp_t_not.astype(int))
	    yp_new = pls.predict(Xp_t, copy=True)
	    yp_pred = (yp_new[:,0] > yp_new[:,1]).astype(int)
	    yp_t = yp_t.astype(int)
	    #print y_new,y_pred, y_t
	    error = ((yp_t - yp_pred) ** 2).sum()
   	    print "PLS Training error " , float(error)/yp_t.shape[0]
 	    yp_new = pls.predict(Xp_v, copy=True)
	    yp_pred = (yp_new[:,0] > yp_new[:,1]).astype(int)
	    #print y_new, y_pred, y_v
	    #print ((y_v - y_pred) ** 2).sum(), y_v.shape[0]
	    error = ((yp_v - yp_pred) ** 2).sum()

Esempio n. 34

0

Mostra file

File: model.py Progetto: absulier/titanic

# Feature matrix: every column of df except the 'Survived' target.
x=df.drop('Survived',axis=1)

# Baseline accuracy: share of labels equal to 0. The value is computed but
# neither stored nor printed here.
float(len(y[y==0]))/float(len(y))

# Basic model, fitted and scored on the same data (no validation).
model=lr()
model.fit(x,y)
model.score(x,y)

# Print each feature's correlation coefficient with the target.
for column in x.columns:
    print column, np.corrcoef(x[column],y)[1][0]

# Build the train/test sets: 80% train, fixed seed.
x_train, x_test, y_train, y_test = tts(x,y, train_size=.8, random_state=1)

# Refit on the training split, then score and predict on the held-out split.
model.fit(x_train,y_train)
model.score(x_test,y_test)
proba=model.predict_proba(x_test)
pred=model.predict(x_test)
# NOTE(review): the 12-fold cross-validation is run on the test split only
# (x_test, y_test) -- confirm this is intended rather than the full data.
s= cross_val_score(model,x_test,y_test, cv=12)
s.mean()
s.std()

# The F1 scores show that our model does a fairly decent job of predicting
# those who died and an okay job of predicting those who survived.
print skcr(y_test,pred)

#(true negative) (false positive)

Esempio n. 35

0

Mostra file

File: script.py Progetto: shawk3/KNN

# Interactive run parameters: repetitions, hold-out fraction, learning rate.
count = int(input('How many times would you like to run the test: '))
ts = float(input('What test size percentage(eg. 0.25): '))
r = float(input('What learning rate? '))


if dsetindex == 2:
    iris = datasets.load_iris()
    # Normalize the four feature columns in place, one column at a time.
    for column in range(4):
        iris.data[:, column] = do.normalize(iris.data[:, column])

    for i in range(count):
        # First split off the test set, then carve a validation set out of
        # the remaining training data using the same fraction.
        xtrain, xtest, ytrain, ytest = tts(
            iris.data, iris.target, test_size=ts)
        xtrain, xvalidate, ytrain, yvalidate = tts(
            xtrain, ytrain, test_size=ts)
        nn = NN.NeuralNetwork(3, 4, r)
        nn.addNewLayer(3)
        scores = nn.train(xtrain, ytrain, xvalidate, yvalidate)
        print('Test: ', nn.test(xtest, ytest))

if dsetindex == 1:
    data = np.array(do.read_file("indianDiabetes.txt")).astype(np.float16)
    data[: , 0] = do.normalize(data[:,0])
    data[: , 1] = do.normalize(data[:,1])
    data[: , 2] = do.normalize(data[:,2])
    data[: , 3] = do.normalize(data[:,3])
    data[: , 4] = do.normalize(data[:,4])
    data[: , 5] = do.normalize(data[:,5])
    data[: , 6] = do.normalize(data[:,6])

Esempio n. 36

0

Mostra file

File: my_grid_Search.py Progetto: mehmetgoren/machine_learning

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler
# train_test_split is imported from sklearn.model_selection, the module this
# script already uses for cross_val_score; the old sklearn.cross_validation
# module is no longer shipped with scikit-learn.
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import confusion_matrix

dataset = pd.read_csv("Social_Network_Ads.csv")
# According to the correlation matrix, gender is not informative anyway.
corr = dataset.corr()

# Features are columns 2 and 3; the target is the last column.
X = dataset.iloc[:, 2:4]
y = dataset.iloc[:, -1]

X_train, X_test, y_train, y_test = tts(X, y, test_size=.2, random_state=0)

# Fit the scaler on the training data only and reuse it for the test data.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

from sklearn.svm import SVC
# Fixed random_state so the results do not change from run to run.
classifier = SVC(kernel="rbf",
                 random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)

#apliying k-fold cross validation
from sklearn.model_selection import cross_val_score

Esempio n. 37

0

Mostra file

File: mnistex.py Progetto: ML-KA/presentations

from sklearn.datasets import fetch_mldata
from sklearn import svm
from sklearn.cross_validation import train_test_split as tts

# Download (or load the cached copy of) the MNIST digits.
mnist = fetch_mldata('MNIST original')
print("Data fetched.")

# Hold out 10000 samples for testing.
Xtr, Xts, Ytr, Yts = tts(mnist.data,
                         mnist.target,
                         test_size=10000)
print("tts done.")
clf = svm.SVC(gamma=0.001, C=100.)
clf.fit(Xtr, Ytr)
print("fitted.")
# predict() expects a 2-D (n_samples, n_features) array, so the last test
# sample is taken as a one-row slice rather than as a 1-D vector.
predicted_label = clf.predict(Xts[-1:])

Esempio n. 38

0

Mostra file

File: Git_Cap_Language_Machine_Learning.py Progetto: Pythonsgo/Capstone-Project

## Module Constants
##########################################################################
##########################################################################
## Modules
##########################################################################
##########################################################################
## Program
##########################################################################

if __name__ == "__main__":

    # Load the labelled text corpus from the folder.
    corpus = load_files("Language_Folder")

    X_train, X_test, y_train, y_test = tts(corpus.data, corpus.target, test_size=0.20)

    # Character n-gram counts (word-boundary aware) fed into naive Bayes.
    text_clf = Pipeline([("vec", CountVectorizer(analyzer="char_wb")), ("clf", MultinomialNB())])

    text_clf = text_clf.fit(X_train, y_train)

    # Store the instance using pickle. Pickle output is binary data, so the
    # file has to be opened in binary mode.
    with open("experiment_file", "wb") as f:
        pickle.dump(text_clf, f)

    # Accuracy on the held-out 20%.
    predicted = text_clf.predict(X_test)
    accuracy = np.mean(predicted == y_test)
    print(accuracy)

    print("Here it is.")

Esempio n. 39

0

Mostra file

File: plot_pipeline_classification.py Progetto: ystone1025/imbalanced-learn

print(__doc__)

# Generate a synthetic two-class dataset with class weights 0.3 / 0.7
X, y = make_classification(n_classes=2, class_sep=1.25, weights=[0.3, 0.7],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=5, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)

# Create the samplers
enn = EditedNearestNeighbours()
renn = RepeatedEditedNearestNeighbours()

# Create the classifier
knn = KNN(1)


# Make the splits
X_train, X_test, y_train, y_test = tts(X, y, random_state=42)

# Chain one transformer (PCA), two samplers (ENN, RENN) and the classifier
# into a single pipeline object
pipeline = make_pipeline(pca, enn, renn, knn)

# Fit on the training split and predict the held-out split
pipeline.fit(X_train, y_train)
y_hat = pipeline.predict(X_test)

print(classification_report(y_test, y_hat))

Esempio n. 40

0

Mostra file

File: splitter.py Progetto: absulier/waterquality

#File is too big to run all at once, splits file into small files so I can run
#in chunks

import pandas as pd
from sklearn.cross_validation import train_test_split as tts

df = pd.read_csv('sdwis_clean.csv')
df = df.drop(['Unnamed: 0'], axis=1)

# Peel one chunk off the remaining data at a time. Each fraction is the share
# kept for the next round, so the piece split off is the complement.
chunks = []
for keep_fraction in (.9, .89, .88, .86, .83, .8, .75, .66):
    df, piece = tts(df, train_size=keep_fraction)
    chunks.append(piece)

# The final split halves what is left: the test half is chunk 9 and the
# train half is chunk 10.
last_train, last_test = tts(df, train_size=.5)
chunks.append(last_test)
chunks.append(last_train)


# Write the chunks out as df1.csv ... df10.csv.
for number, piece in enumerate(chunks, 1):
    piece.to_csv("df%d.csv" % number)

Esempio n. 41

0

Mostra file

File: evaluate_poi_identifier.py Progetto: abdullahcsedu/data-science-intensive

sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

# Load the project data set; the context manager closes the file handle.
# NOTE(review): the mode is kept as "r" to preserve the existing behaviour;
# running under Python 3 would need "rb" -- confirm the target interpreter.
with open("../final_project/final_project_dataset.pkl", "r") as dataset_file:
    data_dict = pickle.load(dataset_file)

### add more features to features_list!
features_list = ["poi", "salary"]

data = featureFormat(data_dict, features_list)
labels, features = targetFeatureSplit(data)

### your code goes here

from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.cross_validation import train_test_split as tts

# Hold out 30% of the data for testing, with a fixed seed.
features_train, features_test, labels_train, labels_test = tts(
    features, labels, test_size=0.3, random_state=42)

# Every message is built as one string and printed with a single argument,
# so the output is the same whether print is a statement or a function.
# Previously the parenthesised multi-argument prints displayed a tuple under
# the print statement used elsewhere in this script.
print('Baseline accuracy: %s'
      % (list(labels_test).count(0) / float(len(labels_test))))
clf = DTC()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

print('Predicted number of person\'s of interest %s' % list(pred).count(1))
print('Accuracy: %s' % accuracy_score(labels_test, pred))
print('Precision: %s' % precision_score(labels_test, pred))
print('Recall: %s' % recall_score(labels_test, pred))

Esempio n. 42

0

Mostra file

File: Lab3_3_4.py Progetto: absulier/Iowa_Liquor_Sales

#
# x=sales15['margin_sum_15q1']
# y=sales15['sale_total_15']
# plt.scatter(x, y)
# plt.xlabel("Total Margin 2015 Q1")
# plt.ylabel("Total Sales 2015")
# plt.show()

# These Q1 variables were all highly correlated with sales for the year, so
# they are used as predictors. They are also correlated with each other,
# which makes using all of them redundant; they are all kept anyway for the
# sake of practicing a multivariable linear regression.
x=['sale_total_15q1','vol_sol_l_sum_15q1','margin_sum_15q1']

# Split the data into train (85%) and test sets
train, test = tts(sales15, train_size=.85)
train_x=train[x]
train_y=train['sale_total_15']
test_x=test[x]
test_y=test['sale_total_15']

# Build the ordinary least-squares model on the train data and print its
# score on the test data.
lm = linear_model.LinearRegression()
model = lm.fit(train_x, train_y)
predictions = lm.predict(test_x)
print "Sample:", lm.score(test_x, test_y)

# Build the model again with ridge regularization; this rebinds lm, model
# and predictions.
lm = linear_model.RidgeCV()
model = lm.fit(train_x, train_y)
predictions = lm.predict(test_x)

Esempio n. 43

0

Mostra file

File: down.py Progetto: omicron-theta/dvoa

	
# Fit on the previous-year value and the rolling average together, and print
# the score on the same data.
linComb = LR()
linComb.fit(df_avg[['all_prev', 'all_avg']].values, df_avg['all_yr'].values)
print linComb.score(df_avg[['all_prev', 'all_avg']].values, df_avg['all_yr'].values)

# NOTE(review): X and y are used here before any assignment visible in this
# part of the file -- confirm which feature set they hold at this point.
linDet = LR()
linDet.fit(X,y)
print linDet.score(X,y)

# df_avg == 3 year rolling average + yr4 stats
# Alternative feature sets; each assignment overwrites the previous one, so
# only the last X, y assignment before the split takes effect.
X,y = df_avg[['all_avg']].values, df_avg['all_yr'].values
X,y = df_avg[['all_prev']].values, df_avg['all_yr'].values
X,y = df_avg[['all_avg', 'all_prev']].values, df_avg['all_yr'].values

X,y = df_avg[['1D_avg', '2D_avg', '3D_avg', 'all_avg','1D_prev', '2D_prev', '3D_prev', 'all_prev']].values, df_avg['all_yr'].values
X_train, X_test, y_train, y_test = tts(X, y)

# Model without an intercept, fitted and scored on the full data; the score
# is computed but not printed.
lin = LR(fit_intercept=False)
lin.fit(X,y)
lin.score(X,y)

# 5-neighbour model: print the train score, then the test score.
knn = KNR(n_neighbors=5)
knn.fit(X_train,y_train)
print knn.score(X_train,y_train)
print knn.score(X_test,y_test)


ns = range(1,30,2)
scores = []
for n in ns:
	knn = KNR(n_neighbors=n)

Esempio n. 44

0

Mostra file

File: Perceptron_Classifier.py Progetto: Cbkhare/Machine-Learning

        X_test,y_test = X[test_idx,:],y[test_idx]
        plt.scatter(X_test[:,0],X_test[:,1],c='',
                    alpha=1.0,linewidth=1,marker='o',
                    s=55,label='test set')

    

if __name__=='__main__':

    iris = datasets.load_iris()
    X = iris.data[:,[2,3]]
    y = iris.target
    
    #spliting the data for test(30%) and training(70%) using tts 
    X_train,X_test,y_train, y_test = \
            tts(X,y,test_size=0.3, random_state=0)    


    #Standardising the feature (feature scaling) using ss 
    sc =ss()
    #Using fit to estimate 'sample mean','standard deviation' to do feature scaling 
    #for each feature dimension using training data 
    sc.fit(X_train)
    #tranform is used to standardize the trainig data (TrDS) and test data(TsDS)
    #Note: we have used same parameter for feature scaling 
    X_train_std = sc.transform(X_train)
    X_test_std  = sc.transform(X_test)


    #n_iter:-  Number of Epochs(passes over the TrDS set)
    #eta0/eta:-learning rate

Esempio n. 45

0

Mostra file

File: Class2.py Progetto: RobbieShan/MindOnData

import numpy as np
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.decomposition import PCA as PCA
from sklearn.cross_validation import train_test_split as tts



# Location and names of the input files.
datapath = 'G:/Continuing Education/Research & Presentations/Self - Machine Learning/Kaggle/OttoProductClassification/Data/'
trainfile = 'train.csv'
testfile = 'test.csv'

# NOTE(review): pd is not among the imports visible above this block --
# confirm pandas is imported elsewhere in the file.
trd = pd.read_csv(datapath+trainfile)
# Keep only the underlying array of values.
trd = trd.values

# Split data into training (67%) and cross-validation (33%) datasets
nptrd, npcvd = tts(trd,test_size=0.33)


# Train the model

# Fit PCA on columns 1..93 of the training rows, project those columns to 40
# components and record the total explained variance ratio.
pca = PCA(n_components=40)
pca.fit(nptrd[:,range(1,94)])
X = pca.transform(nptrd[:,range(1,94)])
PCAExplained = sum(pca.explained_variance_ratio_)

# Most of the features are highly skewed, i.e. their 75% value in td.describe() is 0 while their max is much higher.
# This indicates that only a few values are non-zero for most features.
# This could mean that these features are actually categorical variables that are encoded in the test data (not sure).

# The forest is only configured here; the fit call below is commented out.
forest = rfc(n_estimators=500,criterion = 'entropy' , n_jobs=-1,min_samples_split=5,min_samples_leaf=5,max_depth=20)
#forest = forest.fit(nptrd[:,range(1,94)],nptrd[:,-1])

Esempio n. 46

0

Mostra file

File: Smote1.py Progetto: asyafiq/sentimen_ta

from sklearn import metrics
from imblearn.over_sampling import ADASYN
from imblearn.ensemble import BalanceCascade
from imblearn.over_sampling import RandomOverSampler
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 


# Oversample the training features with SMOTE.
sm = SMOTE()
X_res, y_res = sm.fit_sample(X_train_tf.toarray(), datatrain['sentiment'])
# NOTE(review): the already resampled data is resampled a second time --
# confirm this repetition is intentional.
X_res, y_res=sm.fit_sample(X_res,y_res)
# Show the class counts before and after resampling.
print ('Data sentmen asli {}'.format (Counter(datatrain['sentiment'])))
print('Resampled dataset shape {}'.format(Counter(y_res)))

clf=svm.LinearSVC()
#clf = svm.SVC(decision_function_shape='ovo')
# NOTE(review): the train/test split is taken from the resampled data, so the
# test part is drawn from X_res / y_res rather than from the original
# samples -- confirm this evaluation setup is intended.
X_train, X_test, y_train, y_test = tts(X_res, y_res,test_size=0.2)
clf.fit(X_train,y_train)
predicted=clf.predict(X_test)
print(metrics.classification_report(y_test, predicted))
# Macro-averaged precision, recall and F1, plus accuracy, on the test part.
presisi_svm_smote=metrics.precision_score(y_test, predicted,average='macro')
recall_svm_smote=metrics.recall_score(y_test, predicted,average='macro')
f1_svm_smote=metrics.f1_score(y_test, predicted,average='macro')
akurasi_svm_smote=metrics.accuracy_score(y_test, predicted)
print "Presisi:",presisi_svm_smote 
print "Recall:", recall_svm_smote
print "F1-Score:", f1_svm_smote
print "Akurasi:", akurasi_svm_smote

Esempio n. 47

0

Mostra file

File: cfb.py Progetto: omicron-theta/dvoa

			else:
				df_tmp = pd.merge(df_tmp,df[(df.Year == y2) & (df.Team.isin(tms_include))][['Team','f']],how='left', on=['Team'])
				df_tmp.rename(columns={'f':'f_yr-%d' % (n)}, inplace=True)
	if df_tmp is not None:
		df_lag = df_lag.append(df_tmp[df_tmp.columns])
	
	
# Calculate changes
# df_lag['change'] = df_lag.yr2 - df_lag.yr1
# df_lag['abs_change'] = abs(df_lag.yr2 - df_lag.yr1)
# for c in df_lag.columns:
	# df_lag[c] = df_lag[c].astype(float)

# Feature columns and target column taken from df_lag.
Xcol = ['yr1_f','off_f','def_f','st_f','s_p','fei']
ycol = ['yr2_f']
X_train, X_test, y_train, y_test = tts(df_lag[Xcol].values, df_lag[ycol].values)

# Fit on the training split; the score is computed on that same split and
# is not stored or printed.
linreg = LR()
linreg.fit(X_train, y_train)
linreg.score(X_train, y_train)



# Train on all existing seasons to project 2014
X,y = df_lag[Xcol].values, df_lag[ycol].values
linreg = LR()
linreg.fit(X, y)
linreg.score(X, y)

# Build 3-year averages: start from an empty frame with the target columns.
df_3avg = pd.DataFrame(columns=['avg_f']+['off_f','def_f','st_f','s_p','fei','yr4_f'])

Esempio n. 48

0

Mostra file

File: titanic.py Progetto: kongsinyi/kaggle-scripts

# NOTE(review): f_Age (median age of the rows with Sex == 0) is computed but
# never used below; missing ages are filled with the constant 27.5 instead.
# Confirm which value is intended.
f_Age = merged[merged.Sex==0]['Age'].median()
merged['age_fill'] = merged['Age']
merged.loc[merged.Age.isnull(),'age_fill'] = 27.5

# Fill remaining NaN values with the column mean, then scale each column.
cols_to_scale = ['Fare','Pclass','Sex','age_fill','embarked_num','Parch','SibSp']
merged[cols_to_scale] = merged[cols_to_scale].fillna(merged[cols_to_scale].mean())

for col in cols_to_scale:
    merged[[col]] = pp.scale(merged[[col]])

# Split the merged frame back into its train and test parts by row count.
train = merged[:len(train)]
test = merged[len(train):]

# Modeling with logistic regression, validated on a random hold-out split.
xtrain,xval, ytrain,yval= tts(np.array(train[cols_to_scale]), np.ravel(train['Survived']))
LR = lm.LogisticRegression()
model = LR.fit(xtrain, ytrain)
score = model.score(xval,yval)
print('validation score: ',score)

# Predict the test part and assemble the two-column submission frame.
xtest = np.array(test[cols_to_scale])
results = pd.DataFrame([test['PassengerId'], model.predict(xtest)], index = None).transpose()
results =results.rename(columns = {'Unnamed 0' : 'Survived'})

# to_csv opens and closes the target file itself, so no manual file handling
# (and no explicit close) is needed.
results.to_csv('./Submission.csv', index = False)

Esempio n. 49

0

Mostra file

File: extra.py Progetto: SujitKRay/Listener_Classification_Public

import os
import numpy as np
import load_data

from sklearn import cross_validation as cv
from sklearn.cross_validation import train_test_split as tts
from sklearn.ensemble import ExtraTreesClassifier

# Walk the data directory and rank the features of every CSV data set found.
for root, dirs, files in os.walk('data'):
    for name in files:
        if name.endswith('.csv'):
            # Single-argument print call: prints the same text whether
            # print is a statement or a function, like the other prints
            # in this script.
            print("Loading " + root + "/" + name)
            dataset = load_data.load_data(name, root)

            splits = tts(dataset.data, dataset.target, test_size=0.2)
            X_train, X_test, y_train, y_test = splits

            # Build a forest and compute the feature importances
            forest = ExtraTreesClassifier(n_estimators=250)
            forest.fit(X_train, y_train)
            importances = forest.feature_importances_
            # Spread of each feature's importance across the individual trees.
            std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                         axis=0)
            # Feature indices ordered from most to least important.
            indices = np.argsort(importances)[::-1]

            # Print the feature ranking
            print("Feature ranking:")

            for f in range(X_train.shape[1]):
                print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

Esempio n. 50

0

Mostra file

# Label-encode the first six feature columns in place. The same encoder
# object is refitted for every column.
le = LabelEncoder()
for i in range(0, 6):
    features[:, i] = le.fit_transform(features[:, i])
# Column 11 gets its own encoder.
la = LabelEncoder()
features[:, 11] = la.fit_transform(features[:, 11])
# One-hot encoding
'''before that we need to perform labelencoding in the columns of the dataframe'''

# One-hot encode column 11 and convert the result to a dense array.
ohe = OneHotEncoder(categorical_features=[11])
features = ohe.fit_transform(features).toarray()

# Label encoding of the label column
lc = LabelEncoder()
labels[:, 0] = lc.fit_transform(labels[:, 0])
from sklearn.cross_validation import train_test_split as tts

# 80/20 split with a fixed seed.
f_train, f_test, l_train, l_test = tts(features,
                                       labels,
                                       random_state=0,
                                       test_size=.20)
'''*****************************Now With Pandas***************************************'''
# Same preparation using pandas: every object-typed column is replaced by
# its category codes.
feature = df.drop("Target", axis=1)
for i in feature.select_dtypes(include=[object]):
    feature[i] = feature[i].astype('category').cat.codes

# Expand "Property_Area" into dummy (indicator) columns.
feature = pd.get_dummies(feature, columns=["Property_Area"])

# Encode the target as category codes, then expand it into dummy columns.
label = df["Target"]
label = label.astype('category').cat.codes
label = pd.get_dummies(label)

Esempio n. 51

0

Mostra file

File: GitCapBikeMachine.py Progetto: jhli973/bike-psychics

	
#Initialize the list to keep the scores from each iteration.
	OLS_score = []
	Ridge_score = []
	RidgeCV_score = []
	DecTree1_score = []
	DecTree2_score = []
	Lasso_score = []
	LassoCV_score = []
	RandomForest_score = []
	
		
# Obtain results for running the model a specified number of times
	for i in range(1,15):
#Train the data
		splits = tts(data, target, test_size=0.20)
		X_train, X_test, y_train, y_test = splits
	
#Run the OLS model.
		regr = linear_model.LinearRegression()
		regr.fit(X_train, y_train)
		OLS_score.append(regr.score(X_test, y_test))
		#print 'Coefficients OLS: \n', regr.coef_
		#print 'Intercept OLS: \n', regr.intercept_
		
#Run the Ridge model.
		clf = linear_model.Ridge(alpha=0.5)
		clf.fit(X_train, y_train)
		Ridge_score.append(clf.score(X_test, y_test))
		
#Run the RidgeCV model.

Esempio n. 52

0

Mostra file

# Convert the independent variables' values to positive by adding 1 to every
# entry of X.
# NOTE(review): this yields non-negative values only if X >= -1 -- confirm.
x1 = X
x2 = np.ones(X.shape, dtype=int)
X = np.add(x1, x2).astype(int)

# Feature selection on the basis of the chi-squared test
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# k is passed by keyword: current scikit-learn accepts it as keyword-only.
select = SelectKBest(chi2, k=42)
sel = select.fit(X, y)
feature_score = sel.scores_  # per-feature chi2 scores, kept for visualization
X = sel.transform(X)

# Hold-out split for validation. train_test_split is provided by
# sklearn.model_selection; the old sklearn.cross_validation module is no
# longer shipped with scikit-learn.
from sklearn.model_selection import train_test_split as tts
X_train, X_test, y_train, y_test = tts(X, y, test_size=2000, random_state=1)

# Feature scaling: fit on the training split only, apply to both splits.
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
scale.fit(X_train)
X_train = scale.transform(X_train)
X_test = scale.transform(X_test)

# For training on the whole dataset, a second scaler is fitted on all of X.
scale2 = StandardScaler()
scale2.fit(X)
X = scale2.transform(X)

#testing score of multi layered perceptron
from sklearn.neural_network import MLPClassifier as mlp

Esempio n. 53

0

Mostra file

File: Lesson 7.py Progetto: ercamasn/Machine-Learning

#   -testing accuracy is a better estimate than training accuracy of out-of-sample performance
#   -but, it provides a high variance estimate since changing which observations happen to be in the testing
#    set can significantly change testing accuracy

from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split as tts
from sklearn.neighbors import KNeighborsClassifier as knn_
from sklearn import metrics 

# Read in the iris data: X holds the features, y the class labels
iris = load_iris()
X = iris.data
y = iris.target

# Train/test split with a fixed seed
X_train, X_test, y_train, y_test = tts(X, y, random_state=4)

# Check the classification accuracy of KNN with K=5 on the held-out split
knn = knn_(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print metrics.accuracy_score(y_test, y_pred)

# What if we created a bunch of train/test splits, calculated the accuracy for each,
# then averaged the results together?
# That's the essence of cross-validation

# Steps for K-fold cross-validation
# 1)  Split the data into K equal partitions (or "folds")
# 2)  Use fold 1 as the testing set and the union of the other folds as the training set
# 3)  Calculate testing accuracy