def train_test_split(model_df, percent, num_bootstraps):
    '''
    Split the model dataset into a test set and several down-sampled train sets.

    Rows whose 'customer_life' is below 60 are discarded first. The rest is
    split with ``tts`` using ``percent`` as the test fraction. The resulting
    master train set is separated on 'churn_flag'; each bootstrap round then
    draws 80% of the churned rows and a sample of un-churned rows nine times
    the size of ``int(len(churned) * 0.8)``.

    Parameters:
        model_df: DataFrame with at least 'customer_life' and 'churn_flag'.
            It is not modified.
        percent: test fraction handed to ``tts`` (0.2 keeps 80% for training).
        num_bootstraps: number of down-sampled training sets to build.

    Returns:
        dict with keys 'test_set', 'train_indep', 'train_dep' and
        'master_train'. 'train_indep' and 'train_dep' are dicts keyed by
        bootstrap index.
    '''
    print('Entered train test split')
    print(model_df.head())
    print(model_df['customer_life'])

    # Filter with a boolean mask instead of writing a temporary 'flag' column
    # through chained indexing: that pattern may assign into a copy and it
    # also altered the caller's frame. The negated "< 60" keeps rows with a
    # missing customer_life, exactly as the flag-based filter did.
    model_df = model_df[~(model_df['customer_life'] < 60)].copy()
    if 'flag' in model_df.columns:
        # The previous implementation always ended without a 'flag' column.
        model_df = model_df.drop('flag', axis=1)

    master_train, test_data = tts(model_df, test_size=percent)

    train_churn = master_train[master_train['churn_flag'] == 1]
    train_uchurn = master_train[master_train['churn_flag'] == 0]
    print(len(train_churn))
    print(len(train_uchurn))

    # Fraction of the un-churned rows that equals nine times the churn sample.
    train_subsample_size = int(len(train_churn) * 0.8)
    test_size = float(train_subsample_size * 9) / float(len(train_uchurn))
    print(test_size)

    # The column lists do not change between rounds, so build them once.
    indep_columns = train_churn.columns.tolist()
    indep_columns.remove('churn_flag')
    dep_columns = ['churn_flag']

    train_indep_dsamples = {}
    train_dep_dsamples = {}
    for i in range(num_bootstraps):
        print(str(i))
        _, down_train_uchurn = tts(train_uchurn, test_size=test_size)
        _, down_train_churn = tts(train_churn, test_size=0.8)

        indep_set = pd.concat([
            down_train_uchurn[indep_columns],
            down_train_churn[indep_columns],
        ])
        dep_set = pd.concat([
            down_train_uchurn[dep_columns],
            down_train_churn[dep_columns],
        ])
        print(len(indep_set))
        print(len(dep_set))

        train_indep_dsamples[i] = indep_set
        train_dep_dsamples[i] = dep_set

    return {
        'test_set': test_data,
        'train_indep': train_indep_dsamples,
        'train_dep': train_dep_dsamples,
        'master_train': master_train,
    }
def build_and_evaluate(X, y, classifier=svm.SVC, verbose=True):
    """
    Fit a text-classification pipeline on 80% of the data and print a
    classification report for the remaining 20%.

    Parameters:
        X: input documents.
        y: target labels; they are label-encoded before splitting.
        classifier: estimator class or instance used as the final step.
        verbose: when true, progress messages are printed.

    Returns:
        None. The report is printed.
    """

    def build(classifier, X, y=None):
        """Assemble the feature union plus classifier and fit it."""
        if isinstance(classifier, type):
            classifier = classifier()

        grams = Pipeline([
            ('ngram', TfidfVectorizer(ngram_range=(1, 2), tokenizer=identity,
                                      preprocessor=None, lowercase=False)),
            ('best', TruncatedSVD(n_components=50)),
        ])
        bag_words = Pipeline([
            ('preprocessor', NLTKPreprocessor()),
            ('topics_and_ngrams',
             FeatureUnion(transformer_list=[('grams', grams)])),
        ])
        union = FeatureUnion(transformer_list=[
            ('bag_words', bag_words),
            # add other features here as an element in transformer list
            ('capitalize', Pipeline([('cap_words', CaptilizationExtractor())])),
            ('punctuation', PuncuationExtractor()),
        ])

        model = Pipeline([
            ('union', union),
            # Use the requested classifier; a hard-coded svm.SVC() here made
            # the ``classifier`` argument a no-op.
            ('svc', classifier),
        ])
        model.fit(X, y)
        return model

    labels = LabelEncoder()
    y = labels.fit_transform(y)

    if verbose:
        print("Building for evaluation")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
    model = build(classifier, X_train, y_train)

    if verbose:
        print("classification Report: \n")
    y_pred = model.predict(X_test)
    print(clsr(y_test, y_pred))
def Run(self, csv):
    """
    Fit a linear regression on semicolon-separated CSV text and report the
    predictions for a held-out 20%.

    Parameters:
        csv: CSV content as a string; column 0 is the feature, column 1 the
            target.

    Returns:
        list of dicts with keys 'Test', 'Expected' and 'Predicted', one per
        test row. The list is also printed.
    """
    # sklearn.cross_validation no longer exists in current scikit-learn;
    # fall back to it only on old installations.
    try:
        from sklearn.model_selection import train_test_split as tts
    except ImportError:
        from sklearn.cross_validation import train_test_split as tts
    from sklearn.linear_model import LinearRegression

    dataset = pd.read_csv(StringIO(csv), delimiter=';')
    x = dataset.iloc[:, 0:1].values
    y = dataset.iloc[:, 1].values

    # split base into train and test
    x_train, x_test, y_train, y_test = tts(x, y, test_size=0.2, random_state=0)

    # fit the regression
    regressor = LinearRegression()
    regressor.fit(x_train, y_train)
    y_pred = regressor.predict(x_test)

    result = [
        {'Test': features[0], 'Expected': expected, 'Predicted': predicted}
        for features, expected, predicted in zip(x_test, y_test, y_pred)
    ]
    print(result)
    return result
def Run(self, csv):
    """
    Fit a degree-4 polynomial regression on CSV text and report predictions
    for a held-out 20%.

    Parameters:
        csv: CSV content as a string; column 0 is the feature, column 1 the
            target.

    Returns:
        list of dicts with keys 'Expected' and 'Preditect', one per test row.
        The list is also printed.
    """
    # sklearn.cross_validation no longer exists in current scikit-learn;
    # fall back to it only on old installations.
    try:
        from sklearn.model_selection import train_test_split as tts
    except ImportError:
        from sklearn.cross_validation import train_test_split as tts
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures

    dataset = pd.read_csv(StringIO(csv))
    x = dataset.iloc[:, 0:1].values
    y = dataset.iloc[:, 1].values
    x_train, x_test, y_train, y_test = tts(x, y, test_size=0.2, random_state=0)

    feature_poly = PolynomialFeatures(degree=4)
    x_poly = feature_poly.fit_transform(x_train)
    pr = LinearRegression()
    pr.fit(x_poly, y_train)
    # The expansion was fitted on the training data; only transform here.
    y_pred = pr.predict(feature_poly.transform(x_test))

    # NOTE(review): 'Expected' carries the test input value, not y_test.
    # The key is kept because callers may rely on it - confirm the intent.
    test_inputs = x_test.tolist()  # converted once instead of per row
    result = [
        {'Expected': row[0], 'Preditect': predicted}
        for row, predicted in zip(test_inputs, y_pred)
    ]
    print(result)
    return result
def fit_model(X, y):
    """
    Fit ``svc`` on five sixths of the data and report accuracy on the rest.

    ``svc`` is not defined in this function; it is read from the enclosing
    scope and is re-fitted in place.

    Parameters:
        X: feature matrix.
        y: target labels.

    Returns:
        The accuracy on the held-out part. It is also printed.
    """
    # 1.0 / 6 rather than 1 / 6: under Python 2 the integer division yields 0,
    # which is not a usable test size.
    Xtr, Xts, ytr, yts = tts(X, y, test_size=1.0 / 6, random_state=0)
    svc.fit(Xtr, ytr)
    yhat_ts = svc.predict(Xts)
    acc = np.mean(yhat_ts == yts)
    print('Accuaracy = {0:f}'.format(acc))
    return acc
def plot_roc_curve(estimators, X, y):
    """
    Fit each estimator on an 80/20 split and draw their ROC curves together.

    The figure is saved as 'aggregate_roccurve.png' and then cleared.
    Estimators offering neither ``predict_proba`` nor ``decision_function``
    are skipped with a message.

    Parameters:
        estimators: a single estimator or a list of estimators.
        X: feature matrix.
        y: binary target.

    Returns:
        None. Any exception is printed rather than raised (best effort).
    """
    try:
        if not isinstance(estimators, list):
            estimators = [estimators]

        X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2,
                                               random_state=5557)

        for clf in estimators:
            name = clf.__class__.__name__
            clf.fit(X_train, y_train)

            # hasattr() rather than "in dir()": a method can be listed by
            # dir() and still be unavailable for the current configuration.
            if hasattr(clf, 'predict_proba'):
                y_probas = clf.predict_proba(X_test)[:, 1]
            elif hasattr(clf, 'decision_function'):
                y_probas = clf.decision_function(X_test)
            else:
                print('Probability score not available in {}, skipping.'.format(name))
                continue

            fpr, tpr, _ = metrics.roc_curve(y_test, y_probas,
                                            drop_intermediate=True)
            plt.plot(fpr, tpr, label=name)

        # The title has no placeholder, so the former .format(name) call only
        # risked a NameError when no estimator was given.
        plt.title('ROC Comparison')
        plt.xlim(-0.05, 1.05)
        plt.ylim(-0.05, 1.05)
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend(loc='lower right')
        plt.savefig('aggregate_roccurve.png')
        plt.clf()
    except Exception as e:
        print(e)
def train(self):
    """
    Train the currently selected classifier and show its score in 'text1'.

    ``self.LIST`` is split 80/20; the vectors looked up in ``model`` for the
    training part are passed to ``get_model`` and the test part is predicted.
    The share of the most frequent predicted label is stored as
    ``self.ACC[self.ini]``. When more than one label is predicted, the test
    entities carrying the least frequent label are stored in
    ``self.outliers[self.ini]``.

    Returns:
        None. If ``self.LIST`` does not exist yet, a hint is written to the
        text widget and nothing else happens.
    """
    self.configs['text1'].delete('1.0', END)
    try:
        self.LIST
    except AttributeError:
        # Only a missing attribute means "not initialised"; a bare except
        # would also hide unrelated failures.
        self.configs['text1'].insert(
            "1.0", 'The entities have not been initialized!')
        return

    x_train_o, x_test_o = tts(self.LIST, test_size=0.2)
    x_train = np.array([model[i] for i in x_train_o])
    x_test = np.array([model[i] for i in x_test_o])

    train_model = get_model(x_train, self.ini)
    y_pred = train_model.predict(x_test)

    labels = sorted(set(y_pred))
    most = [sum(y_pred == i) for i in labels]
    if len(most) > 1:
        arg_outlier = np.argmin(most)
        # NOTE(review): the boolean mask assumes x_test_o is an array, not a
        # plain list - verify the type of self.LIST.
        self.outliers[self.ini] = x_test_o[y_pred == labels[arg_outlier]]

    ACC = max(most) * 1.0 / len(y_pred)
    self.ACC[self.ini] = ACC
    self.trained[self.ini] = True
    self.configs['text1'].insert(
        "1.0", 'Type of classifier: ' + names[self.ini] +
        '\n The ACC is:\n' + str(ACC))
def train_split(data, outcome, predictors, ratio=0.3):
    """
    Split ``data`` into train and test parts with a fixed seed (123).

    Parameters:
        data: table indexable by column name(s).
        outcome: name of the target column.
        predictors: name(s) of the feature columns.
        ratio: test fraction passed to ``tts``.

    Returns:
        tuple (x_train, x_test, l_train, l_test).
    """
    features = data[predictors]
    target = data[outcome]
    x_train, x_test, l_train, l_test = tts(
        features, target, test_size=ratio, random_state=123)
    return (x_train, x_test, l_train, l_test)
def buildnEvaluateModel(X, y):
    '''
    Train a linear-SVC text classifier on 80% of the input and evaluate it
    on the remaining 20%.

    The targets are converted to booleans via ``bool(int(value))``. Accuracy
    and a classification report for the held-out part are printed.

    Returns:
        the fitted pipeline.
    '''
    # Keep 20% aside as a cross-validation set for the evaluation below.
    X_train, X_cv, y_train, y_cv = tts(X, y, test_size=0.2)

    # Convert the numeric targets to booleans.
    y_train = [bool(int(value)) for value in y_train]
    y_cv = [bool(int(value)) for value in y_cv]

    # The encoder is only used to obtain the class names for the report.
    labels = LabelEncoder()
    labels.fit(y_train)

    # Bag of words -> tf-idf -> linear SVC with probability estimates.
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SVC(kernel='linear', probability=True)),
    ]
    text_clf = Pipeline(steps).fit(X_train, y_train)

    # Evaluate on the held-out part.
    predicted = text_clf.predict(X_cv)
    accuracy = np.mean(predicted == y_cv)
    print("Model Accuracy = " + str(accuracy))

    class_names = [str(name) for name in labels.classes_]
    print(clsr(y_cv, predicted, target_names=class_names))
    return text_clf
def build_and_evaluate(X, y, classifier=SGDClassifier, outpath=None, verbose=True):
    """
    Evaluate a text classifier on an 80/20 split, then refit it on all data.

    Parameters:
        X: input documents.
        y: target labels (label-encoded internally).
        classifier: estimator class or instance for the final pipeline step.
        outpath: if given, the final model is pickled to this path.
        verbose: when true, progress and timing messages are printed.

    Returns:
        the pipeline fitted on the complete data; the fitted LabelEncoder is
        attached to it as ``labels_``.
    """

    def build(classifier, X, y=None):
        """Create the preprocess -> tf-idf -> classifier pipeline and fit it."""
        estimator = classifier() if isinstance(classifier, type) else classifier
        steps = [
            ('preprocessor', NLTKPreprocessor()),
            ('vectorizer', TfidfVectorizer(tokenizer=identity,
                                           preprocessor=None,
                                           lowercase=False)),
            ('classifier', estimator),
        ]
        pipeline = Pipeline(steps)
        pipeline.fit(X, y)
        return pipeline

    # Encode the targets as integers.
    labels = LabelEncoder()
    y = labels.fit_transform(y)

    # Evaluation pass on a held-out 20%.
    started = time()
    if verbose:
        print("Building for evaluation")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
    model = build(classifier, X_train, y_train)

    if verbose:
        print("Evaluation model fit in {:0.3f} seconds".format(time() - started))
        print("Classification Report:\n")

    y_pred = model.predict(X_test)
    print(clsr(y_test, y_pred, target_names=labels.classes_))

    # Final pass on the complete data.
    started = time()
    if verbose:
        print("Building complete model and saving ...")
    model = build(classifier, X, y)
    model.labels_ = labels

    if verbose:
        print("Complete model fit in {:0.3f} seconds".format(time() - started))

    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)
        print("Model written out to {}".format(outpath))

    return model
def prepare_dataset(corpus, labels, test_data_proportion=0.3):
    '''
    Create a train/test split of a classification dataset (seed 42).

    Parameters:
        corpus: the documents.
        labels: the matching labels.
        test_data_proportion: fraction of the data placed in the test part.

    Returns:
        tuple (train_x, test_x, train_y, test_y).
    '''
    # Honour the argument; the split size used to be fixed at 0.3 regardless
    # of what the caller passed.
    train_x, test_x, train_y, test_y = tts(
        corpus, labels, test_size=test_data_proportion, random_state=42)
    return train_x, test_x, train_y, test_y
def train_subset(x_train, y_train, x_test, porc_corte, pipeline):
    """
    Fit ``pipeline`` on a subset of the training data and predict ``x_test``.

    Parameters:
        x_train, y_train: full training data.
        x_test: samples to predict.
        porc_corte: fraction of the training data that is left out
            (passed as ``test_size``).
        pipeline: estimator with ``fit`` and ``predict``; fitted in place.

    Returns:
        the predictions for ``x_test``.
    """
    # Only the kept part is used; the left-out part is discarded.
    x_kept, _, y_kept, _ = tts(x_train, y_train, random_state=0,
                               test_size=porc_corte)
    pipeline.fit(x_kept, y_kept)
    # Call form works on Python 2 and 3 alike.
    print("Predict!!")
    return pipeline.predict(x_test)
def test_resid_plots(self):
    """
    Check that ResidualsPlot can score a fitted SVR without raising.
    """
    # X and y come from the enclosing module.
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.5)

    regressor = SVR()
    regressor.fit(X_train, y_train)

    ResidualsPlot(regressor).score(X_test, y_test)
def __init__(self, iris):
    """
    Store the dataset and split it 70/30 into train and test parts.

    The split seed is drawn at random from 400..600 on every construction.
    """
    self.iris = iris

    seed = random.randint(400, 600)
    parts = tts(iris.data, iris.target, train_size=.7, random_state=seed)
    self.d_train, self.d_test, self.t_train, self.t_test = parts

    self.prediction = []
    self.percent = 0
def data_preparation(x):
    """
    Separate the "Class" column from the features and split both 70/30.

    The sizes of the training and test parts are printed.

    Returns:
        tuple (features_train, features_test, labels_train, labels_test).
    """
    is_label = x.columns == "Class"
    features = x.iloc[:, ~is_label]
    labels = x.iloc[:, is_label]

    parts = tts(features, labels, test_size=0.3)
    features_train, features_test, labels_train, labels_test = parts

    print("length of training data")
    print(len(features_train))
    print("length of test data")
    print(len(features_test))
    return (features_train, features_test, labels_train, labels_test)
def __init__(self, location, split=0.2):
    """
    Load a CSV file and split it into train and test parts.

    Parameters:
        location: path of the CSV file; the last column is the target and
            all others are features.
        split: test fraction passed to ``tts``.
    """
    self.location = location

    table = pd.read_csv(location)
    self.x = table.iloc[:, :-1].values
    self.y = table.iloc[:, -1].values

    parts = tts(self.x, self.y, test_size=split)
    self.xtr, self.xte, self.ytr, self.yte = parts

    self.t = 0
    self.tt = self.t + 1
def run_svmtest_int(num):
    """
    Repeat a 90/10 split of ``I`` / ``y2`` ``num`` times, fitting a fresh
    SVC each time.

    ``I`` and ``y2`` are read from the enclosing module. ``I`` is reshaped
    into a single feature column.

    Returns:
        list with the test accuracy of every repetition.
    """
    scores = []
    for _ in range(num):
        I_train, I_test, y2_train, y2_test = tts(I, y2, test_size=.1)

        model = svm.SVC()
        model.fit(I_train.values.reshape(-1, 1), y2_train)

        predicted = model.predict(I_test.values.reshape(-1, 1))
        scores.append(accuracy_score(y2_test, predicted))
    return scores
def fit(self, X, y):
    """
    Fit every model in ``self.models`` on an 80% training split.

    The four parts of the split are stored on the instance as X_train,
    X_test, y_train and y_test.

    TODO: move to MultiModelMixin.
    """
    # TODO: make test size a parameter and do better data storage on viz.
    parts = tts(X, y, test_size=0.2)
    self.X_train, self.X_test, self.y_train, self.y_test = parts

    self.models = [
        model.fit(self.X_train, self.y_train) for model in self.models
    ]
def main():
    """
    Run a 10-fold evaluation of an RBF SVM on ``trall`` / ``yall``.

    For every fold a frequency table is built from the positive and negative
    training samples and used to derive features for both parts. C and gamma
    are grid-searched on a 20% slice of the training features, and a final
    SVM with the best values is fitted on the whole training part and scored
    on the test part.

    Returns:
        tuple (mcc, acc, aucs, sen, spe, figs) of per-fold lists; ``figs``
        holds [fpr, tpr] pairs.
    """
    kfold = KFold(len(yall), 10)
    sen, spe, acc, mcc, figs = [], [], [], [], []

    # Search grid for the SVM.
    C = np.linspace(0.6, 0.8, 10)
    G = np.linspace(0.13, 0.22, 10)
    param = {'C': C, 'gamma': G}
    clist, glist, aucs = [], [], []

    for train_idx, test_idx in kfold:
        print('*********')
        x_train, y_train = trall[train_idx], yall[train_idx]

        # The table is built from the training samples only.
        positives = x_train[y_train == 1]
        negatives = x_train[y_train == 0]
        Table = frequences_matrix_mainFunc(positives, negatives)

        x_train, y_train = GetFeatures(x_train, y_train, Table)
        x_test, y_test = GetFeatures(trall[test_idx], yall[test_idx], Table)

        # Grid search runs on the 20% slice only.
        base = SVC(kernel='rbf', probability=True)
        _, x_search, _, y_search = tts(x_train, y_train, test_size=0.2)
        cv = CV(base, param, n_jobs=2)
        cv.fit(x_search, y_search)

        c = cv.best_params_['C']
        g = cv.best_params_['gamma']
        clist.append(c)
        glist.append(g)
        print('c,g:', c, g)

        final = SVC(kernel='rbf', C=c, gamma=g, probability=True)
        final.fit(x_train, y_train)

        acc_r = final.score(x_test, y_test)
        mcc_r, sen_r, spe_r = getmcc2(final, x_test, y_test)
        acc.append(acc_r)
        mcc.append(mcc_r)
        sen.append(sen_r)
        spe.append(spe_r)

        scores = final.predict_proba(x_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, scores)
        figs.append([fpr, tpr])

        auc_r = auc(fpr, tpr)
        aucs.append(auc_r)
        print(auc_r)
        print('acc:', acc_r, '\n', 'mcc:', mcc_r)

    print('*********')
    return mcc, acc, aucs, sen, spe, figs
def run_treetest_f1(num):
    """
    Repeat a 90/10 split of ``V`` / ``y`` ``num`` times, fitting a fresh
    decision tree each time.

    ``V`` and ``y`` are read from the enclosing module. ``V`` is reshaped
    into a single feature column. The score collected is accuracy_score.

    Returns:
        list with the test score of every repetition.
    """
    scores = []
    for _ in range(num):
        V_train, V_test, y_train, y_test = tts(V, y, test_size=.1)

        model = tree.DecisionTreeClassifier()
        model.fit(V_train.values.reshape(-1, 1), y_train)

        predicted = model.predict(V_test.values.reshape(-1, 1))
        scores.append(accuracy_score(y_test, predicted))
    return scores
def classifier():
    """
    Grid-search the C parameter of a LinearSVC on the jieba count vectors
    and print the best parameters plus train and test accuracy.

    The labels ``y`` are not produced here; they are read from the enclosing
    module.
    """
    vect, voc, txt = jiebaCounter()

    # normalisation: divide by the per-row maximum (epsilon avoids /0)
    # NOTE(review): without keepdims this only broadcasts row-wise if vect is
    # a matrix type - confirm what jiebaCounter returns.
    x = np.array(vect / (np.max(vect, axis=1) + 1e-10))

    x_train, x_test, y_train, y_test = tts(x, y, test_size=0.25,
                                           train_size=0.75)

    Cs = np.logspace(-5, 0, 10)
    # GridSearchCV works on clones of the estimator, so fitting it
    # beforehand has no effect.
    clf_ = GridSearchCV(estimator=svm.LinearSVC(), param_grid=dict(C=Cs))
    # Search on the training split only. The earlier call used `x_`, which
    # this function never defines, together with all labels, so the "test"
    # accuracy below would have been measured on data seen during fitting.
    clf_.fit(x_train, y_train)

    print(clf_.best_params_)
    print("train accuracy:")
    print(np.sum(clf_.predict(x_train) == y_train) / float(len(y_train)))
    print("test accuracy:")
    print(np.sum(clf_.predict(x_test) == y_test) / float(len(y_test)))
def train(x_dataset, y_dataset, test_size=.33):
    """
    Fit a linear regression on a random split and pair up the test targets
    with their predictions.

    Parameters:
        x_dataset: features.
        y_dataset: targets; the test part must expose ``.values``.
        test_size: test fraction passed to ``tts``.

    Returns:
        tuple (pairs, actual, predicted) where ``pairs`` is a list of
        (float(actual), float(predicted)) tuples.
    """
    x_train, x_test, y_train, y_test = tts(x_dataset, y_dataset,
                                           test_size=test_size)
    lr = LinearRegression()
    lr.fit(x_train, y_train)
    predict = lr.predict(x_test)

    pairs = [
        (float(actual), float(predicted))
        for actual, predicted in zip(y_test.values, predict)
    ]
    return pairs, y_test.values, predict
def main():
    """
    Evaluate ``results`` for k = 1, 2, 3 over ten random 50/50 splits of the
    alabone data, then run a 5-fold stratified cross validation with k = 3.

    Mean and variance of the collected scores are printed.
    """
    df = pd.read_csv('../data/alabone.data', header=0, error_bad_lines=False)
    tar = df['label']
    df = df.drop(['c1', 'label'], axis=1)

    # Q1 split 50-50%
    # print() is called with one pre-formatted string throughout so the
    # output is the same on Python 2 and 3.
    ks = (1, 2, 3)
    rk = {k: [] for k in ks}
    for i in range(0, 10):
        print('Test run {}'.format(i))
        xtrain, xtest, ytrain, ytest = tts(df, tar, test_size=0.5)
        for k in ks:
            if k != ks[0]:
                print('')
            rk[k].append(results(xtrain, xtest, ytrain, ytest, k=k))

    for k in ks:
        if k != ks[0]:
            print('')
        print('Mean accuracy and variance over 10 runs with k = {} {} {}'.format(
            k, np.mean(rk[k]), np.var(rk[k])))

    # Cross validation 5 fold
    sf = StratifiedKFold(tar, n_folds=5)
    rk[3] = []
    for i, (train, test) in enumerate(sf, start=1):
        print('Fold {}'.format(i))
        xtrain, xtest = df.values[train], df.values[test]
        ytrain, ytest = tar.values[train], tar.values[test]
        print('')
        # NOTE(review): this calls `result`, while the runs above call
        # `results` - confirm that both helpers exist.
        rk[3].append(result(xtrain, xtest, ytrain, ytest, k=3))
        print('')
    print('Mean accuracy and variance over 5-folds {} {}'.format(
        np.mean(rk[3]), np.var(rk[3])))
def classifier():
    """
    Grid-search the C parameter of a LinearSVC on the jieba count vectors
    and print the best parameters plus train and test accuracy.

    The labels ``y`` are not produced here; they are read from the enclosing
    module.
    """
    vect, voc, txt = jiebaCounter()

    # normalisation: divide by the per-row maximum (epsilon avoids /0)
    # NOTE(review): without keepdims this only broadcasts row-wise if vect is
    # a matrix type - confirm what jiebaCounter returns.
    x = np.array(vect / (np.max(vect, axis=1) + 1e-10))

    x_train, x_test, y_train, y_test = tts(x, y, test_size=0.25,
                                           train_size=0.75)

    Cs = np.logspace(-5, 0, 10)
    # GridSearchCV works on clones of the estimator, so fitting it
    # beforehand has no effect.
    clf_ = GridSearchCV(estimator=svm.LinearSVC(), param_grid=dict(C=Cs))
    # Search on the training split only. The earlier call used `x_`, which
    # this function never defines, together with all labels, so the "test"
    # accuracy below would have been measured on data seen during fitting.
    clf_.fit(x_train, y_train)

    print(clf_.best_params_)
    print("train accuracy:")
    print(np.sum(clf_.predict(x_train) == y_train) / float(len(y_train)))
    print("test accuracy:")
    print(np.sum(clf_.predict(x_test) == y_test) / float(len(y_test)))
def main():
    """
    Evaluate ``results`` for k = 1, 2, 3 over ten random 50/50 splits of the
    iris data, then run a 5-fold stratified cross validation with k = 3.

    Mean and variance of the collected scores are printed.
    """
    df = pd.read_csv('../data/iris.data',)
    df.columns = ['sepal_l', 'sepal_w', 'petal_l', 'petal_w', 'label']
    tar = df['label']
    df = df.drop(['label'], axis=1)

    # Q1 split 50-50%
    # print() is called with one pre-formatted string throughout so the
    # output is the same on Python 2 and 3.
    ks = (1, 2, 3)
    rk = {k: [] for k in ks}
    for i in range(0, 10):
        print('Test run {}'.format(i))
        xtrain, xtest, ytrain, ytest = tts(df, tar, test_size=0.5)
        for k in ks:
            if k != ks[0]:
                print('')
            rk[k].append(results(xtrain, xtest, ytrain, ytest, k=k))

    for k in ks:
        if k != ks[0]:
            print('')
        print('Mean accuracy and variance over 10 runs with k = {} {} {}'.format(
            k, np.mean(rk[k]), np.var(rk[k])))

    # Cross validation 5 fold
    sf = StratifiedKFold(tar, n_folds=5)
    rk[3] = []
    for i, (train, test) in enumerate(sf, start=1):
        print('Fold {}'.format(i))
        xtrain, xtest = df.values[train], df.values[test]
        ytrain, ytest = tar.values[train], tar.values[test]
        print('')
        # NOTE(review): this calls `result`, while the runs above call
        # `results` - confirm that both helpers exist.
        rk[3].append(result(xtrain, xtest, ytrain, ytest, k=3))
        print('')
    print('Mean accuracy and variance over 5-folds {} {}'.format(
        np.mean(rk[3]), np.var(rk[3])))
def build_and_save_model(X, y, filepath):
    """
    Train an SGD text classifier, report its quality and pickle it.

    The model is fitted on 90% of the data; a classification report for the
    remaining 10% is printed. The model saved and returned is the one fitted
    on the 90% training part.

    Parameters:
        X: input documents.
        y: class labels (label-encoded internally).
        filepath: destination of the pickled model.

    Returns:
        the fitted pipeline, with the LabelEncoder attached as ``labels_``.
    """

    def build(classifier, X, y=None):
        """Create the preprocess -> tf-idf -> classifier pipeline and fit it."""
        estimator = classifier() if isinstance(classifier, type) else classifier
        pipeline = Pipeline([
            ('preprocessor', DataPreProcessor()),
            ('vectorizer', TfidfVectorizer(tokenizer=identity,
                                           preprocessor=None,
                                           lowercase=False)),
            ('classifier', estimator),
        ])
        pipeline.fit(X, y)
        return pipeline

    # Encode the class names as integers.
    labels = LabelEncoder()
    y = labels.fit_transform(y)

    # Hold out 10% for the report.
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.1)
    model = build(SGDClassifier, X_train, y_train)

    predictions = model.predict(X_test)
    print(clsr(y_test, predictions, target_names=labels.classes_))

    model.labels_ = labels
    with open(filepath, 'wb') as f:
        pickle.dump(model, f)
    return model
def tfidf_iterator(batch_size=100,max_features=10000,path="/home/tingyubi/20w/data/",prefix="extraction-",begin=1,end=26):
    """
    Load a pickled tf-idf matrix and its vocabulary and wrap a 90/10 split
    in two shuffled mxnet NDArrayIter objects.

    The files read are "tfidf_<prefix><begin>_<end>.mat" and ".voc" in the
    current directory. ``max_features`` and ``path`` are accepted but not
    used here.

    Returns:
        tuple (train_iter, test_iter, voc) where ``voc`` is the list of
        vocabulary lines.
    """
    stem = "tfidf_" + prefix + str(begin) + "_" + str(end)

    # NOTE: unpickling executes code from the file; only load trusted files.
    with open(stem + ".mat", 'rb') as f:
        tf = cPickle.load(f)

    # Context manager so the handle is released even if decoding fails.
    with open(stem + ".voc", 'r') as f:
        voc = f.read().decode('utf-8').split("\n")

    # Scale every row by its maximum (epsilon avoids division by zero).
    tf = tf.toarray()
    tf = tf / (np.max(tf, axis=1)[:, None] + 1e-10)

    x_train, x_test = tts(tf, train_size=0.9, test_size=0.1)
    train_iter = mx.io.NDArrayIter(data=x_train, batch_size=batch_size,
                                   shuffle=True)
    test_iter = mx.io.NDArrayIter(data=x_test, batch_size=batch_size,
                                  shuffle=True)
    return train_iter, test_iter, voc
def build_model(X, y, classifier, verbose=True):
    """
    Evaluate ``classifier`` on an 80/20 split, then refit it on all data.

    Parameters:
        X: input documents.
        y: target labels (label-encoded internally).
        classifier: estimator instance used as the final pipeline step.
        verbose: when true, progress and timing messages are printed.

    Returns:
        the pipeline fitted on the complete data; ``labels_`` holds the
        original labels matching the model's classes.
    """

    @timeit
    def build(classifier, X, y=None):
        """Create the preprocess -> tf-idf -> classifier pipeline and fit it."""
        steps = [
            ('preprocessor', NLTKPreprocessor()),
            ('vectorizer', TfidfVectorizer(tokenizer=identity,
                                           preprocessor=None,
                                           lowercase=False)),
            ('classifier', classifier),
        ]
        pipeline = Pipeline(steps)
        pipeline.fit(X, y)
        return pipeline

    # Encode the targets as integers.
    labels = LabelEncoder()
    y = labels.fit_transform(y)

    # Evaluation pass on a held-out 20%; the decorated build() is unpacked
    # as (model, seconds).
    if verbose:
        print("Building for evaluation")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
    model, elapsed = build(classifier, X_train, y_train)

    if verbose:
        print("Evaluation model fit in {:0.3f} seconds".format(elapsed))
        print("Classification Report:\n")

    y_pred = model.predict(X_test)
    print(clsr(y_test, y_pred, target_names=labels.classes_))

    # Final pass on the complete data.
    if verbose:
        print("Building complete model and saving ...")
    model, elapsed = build(classifier, X, y)
    model.labels_ = labels.inverse_transform(model.classes_)

    if verbose:
        print("Complete model fit in {:0.3f} seconds".format(elapsed))

    return model
def pca_svm(pca_n=10,svm_C=1):
    """
    Fit PCA followed by an SVC on a fixed split and measure the result.

    Parameters:
        pca_n: number of PCA components.
        svm_C: C parameter of the SVC.

    Returns:
        tuple (accuracy, precision, recall, f1, seconds); precision and
        recall are weighted averages, f1 is their harmonic mean and
        ``seconds`` is the wall time of the whole call.
    """
    t1 = time.time()
    data, target = get_data()
    x_train, x_test, y_train, y_test = tts(data, target, random_state=33)

    # PCA is fitted on the training part only and re-used for the test part.
    pca_learner = decomposition.PCA(n_components=pca_n)
    x_train = pca_learner.fit_transform(x_train)

    svm_learner = svm.SVC(C=svm_C)
    svm_learner.fit(x_train, y_train)

    x_test_pre = pca_learner.transform(x_test)
    y_test_pre = svm_learner.predict(x_test_pre)

    ac = svm_learner.score(x_test_pre, y_test)
    p = precision_score(y_test, y_test_pre, average='weighted')
    r = recall_score(y_test, y_test_pre, average='weighted')

    # Same harmonic mean as 2 / (1/p + 1/r), written so that a zero
    # precision or recall gives 0.0 instead of dividing by zero.
    f1 = 2.0 * p * r / (p + r) if (p + r) > 0 else 0.0

    t = time.time() - t1
    return ac, p, r, f1, t
def Run(self, csv):
    """
    Fit a logistic regression on CSV text and report the predictions for a
    held-out 20%.

    Parameters:
        csv: CSV content as a string; columns 0 and 1 are the features,
            column 2 is the target.

    Returns:
        list of dicts with keys 'Age', 'Salary', 'Expected' and 'Preditect',
        one per test row (unscaled feature values). The list is also printed.
    """
    # sklearn.cross_validation no longer exists in current scikit-learn;
    # fall back to it only on old installations.
    try:
        from sklearn.model_selection import train_test_split as tts
    except ImportError:
        from sklearn.cross_validation import train_test_split as tts
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import StandardScaler

    dataset = pd.read_csv(StringIO(csv))
    x = dataset.iloc[:, [0, 1]].values
    y = dataset.iloc[:, 2].values
    x_train, x_test, y_train, y_test = tts(x, y, test_size=0.2, random_state=0)

    sc_x = StandardScaler()
    x_train_sc = sc_x.fit_transform(x_train)
    # Scale the test rows with the statistics learned from the training
    # rows; re-fitting here would put both parts on different scales.
    x_test_sc = sc_x.transform(x_test)

    # FOR LINEAR LOGISTIC REGRESSION => ONLY TWO OUTPUTS (SIGMOID)
    llr = LogisticRegression(random_state=0)
    llr.fit(x_train_sc, y_train)
    y_pred = llr.predict(x_test_sc)

    # Each array is converted once rather than once per row.
    result = [
        {
            'Age': row[0],
            'Salary': row[1],
            'Expected': expected,
            'Preditect': predicted,
        }
        for row, expected, predicted in zip(
            x_test.tolist(), y_test.tolist(), y_pred.tolist())
    ]
    print(result)
    return result
def pca_svm_pipeline():
    """
    Grid-search a PCA + SVC pipeline, print the report and draw every image
    in a grid ordered by predicted class.

    The number of PCA components is searched over 5, 15, ..., 195 with C
    fixed at 1. Each sample is reshaped to 64x64 and drawn in the column of
    its predicted class; misclassified samples get a red cross.
    """
    svm_C = [1]
    pca_n_components = numpy.arange(5, 200, 10)

    data, target = get_data()
    x_train, x_test, y_train, y_test = tts(data, target, random_state=33)

    pipe = pipeline.Pipeline([
        ('pca', decomposition.PCA()),
        ('svm', svm.SVC()),
    ])
    gscv = GridSearchCV(
        pipe,
        {'pca__n_components': pca_n_components, 'svm__C': svm_C},
        n_jobs=-1)
    gscv.fit(x_train, y_train)

    y_test_pre = gscv.predict(x_test)
    report = classification_report(y_test, y_test_pre)
    print(gscv.best_params_)
    print(report)

    target_pre = gscv.predict(data)
    n1, _ = data.shape

    pyplot.figure()
    # L[k] counts how many images are already stacked in column k
    # (room for 40 predicted classes).
    L = numpy.zeros((40,))
    xx = numpy.linspace(0, 1, 64) + 13
    yy = numpy.linspace(1, 0, 64) + 13
    xx, yy = numpy.meshgrid(xx, yy)

    for i in range(n1):
        k = target_pre[i]
        g = L[k]
        L[k] += 1
        xx1 = xx - k
        yy1 = yy - g
        pyplot.contourf(xx1, yy1, data[i].reshape((64, 64)), cmap='gray')
        if target[i] != target_pre[i]:
            pyplot.scatter(numpy.mean(xx1), numpy.mean(yy1),
                           marker='x', c='red', s=40)

    pyplot.axis('off')
    # A boolean is required: the string 'off' is truthy and is not an
    # accepted way to disable the grid in current matplotlib.
    pyplot.grid(False)
    pyplot.title('PCA & SVM Recongnize Faces')
    pyplot.show()
def build_and_evaluate(text, leanings, classifier=SGDClassifier, verbose=True):
    """
    Evaluate a text classifier on an 80/20 split, then refit it on all data.

    Parameters:
        text: input documents.
        leanings: target labels (label-encoded internally).
        classifier: estimator class or instance for the final step. The
            fitted pipeline must offer ``predict_proba``.
            NOTE(review): check that the default classifier configuration
            provides it.
        verbose: accepted but not used.

    Returns:
        tuple (leanings_test, leanings_pred_prob, model): the encoded test
        labels, the class probabilities predicted for the test documents and
        the pipeline refitted on the complete data (LabelEncoder attached as
        ``labels_``).
    """

    def build(classifier, X, y=None):
        """Create the preprocess -> tf-idf -> classifier pipeline and fit it."""
        estimator = classifier() if isinstance(classifier, type) else classifier
        pipeline = Pipeline([
            ('preprocessor', NLTKPreprocessor()),
            ('vectorizer', TfidfVectorizer(tokenizer=identity,
                                           preprocessor=None,
                                           lowercase=False)),
            ('classifier', estimator),
        ])
        pipeline.fit(X, y)
        return pipeline

    # Encode the targets as integers.
    labels = LabelEncoder()
    leanings = labels.fit_transform(leanings)

    # Evaluation pass on a held-out 20%.
    parts = tts(text, leanings, test_size=0.2)
    text_train, text_test, leanings_train, leanings_test = parts
    model = build(classifier, text_train, leanings_train)

    leanings_pred = model.predict(text_test)
    leanings_pred_prob = model.predict_proba(text_test)
    print(clsr(leanings_test, leanings_pred, target_names=labels.classes_))

    # Final pass on the complete data.
    model = build(classifier, text, leanings)
    model.labels_ = labels

    return leanings_test, leanings_pred_prob, model
for vid,Xt,yt in zip(subjId_val, X_val, y_val): levelOneTest = [] levelOneTrain = [] X_levelOne = [] y_levelOne = [] level0Classifier = [] for tid,Xp,yp in zip(subjId_train,X_train,y_train): print "Predicting subject ", vid, "from subject ", tid y0 = np.zeros(yp.shape) y1 = np.ones(Xt.shape[0]) X = np.vstack([Xp,Xt]) yd = np.concatenate([y0,y1]) pls = PLSRegression(n_components) Xp_t, Xp_v, yp_t, yp_v = tts(Xp.copy(),yp.copy(),train_size=0.9) yp_t = yp_t.astype(bool) yp_t_not = np.vstack((yp_t,~yp_t)).T #print "yp_t_not ", yp_t_not.shape pls.fit(Xp_t,yp_t_not.astype(int)) yp_new = pls.predict(Xp_t, copy=True) yp_pred = (yp_new[:,0] > yp_new[:,1]).astype(int) yp_t = yp_t.astype(int) #print y_new,y_pred, y_t error = ((yp_t - yp_pred) ** 2).sum() print "PLS Training error " , float(error)/yp_t.shape[0] yp_new = pls.predict(Xp_v, copy=True) yp_pred = (yp_new[:,0] > yp_new[:,1]).astype(int) #print y_new, y_pred, y_v #print ((y_v - y_pred) ** 2).sum(), y_v.shape[0] error = ((yp_v - yp_pred) ** 2).sum()
x = df.drop('Survived', axis=1)

# Baseline for accuracy: the share of rows with y == 0.
float(len(y[y == 0])) / float(len(y))

# Basic model, no validation: fitted and scored on the same data.
model = lr()
model.fit(x, y)
model.score(x, y)

# Correlation of every feature with the target.
# print() gets one pre-formatted string so the output is the same on
# Python 2 and 3.
for column in x.columns:
    print('{} {}'.format(column, np.corrcoef(x[column], y)[1][0]))

# Build the train and test sets.
x_train, x_test, y_train, y_test = tts(x, y, train_size=.8, random_state=1)

# Fit on the training part, validate on the test part.
model.fit(x_train, y_train)
model.score(x_test, y_test)
proba = model.predict_proba(x_test)
pred = model.predict(x_test)

# 12-fold cross validation, run on the test part only.
s = cross_val_score(model, x_test, y_test, cv=12)
s.mean()
s.std()

# The f-1 scores show that our model does a fairly decent job of predicting
# those who died and an okay job predicting those who survived.
print(skcr(y_test, pred))
#(true negative) (false positive)
count = int(input('How many times would you like to run the test: ')) ts = float (input('What test size percentage(eg. 0.25): ')) r = float(input('What learning rate? ')) if dsetindex == 2: iris = datasets.load_iris() iris.data[: , 0] = do.normalize(iris.data[:,0]) iris.data[: , 1] = do.normalize(iris.data[:,1]) iris.data[: , 2] = do.normalize(iris.data[:,2]) iris.data[: , 3] = do.normalize(iris.data[:,3]) for i in range(count): xtrain, xtest, ytrain, ytest = tts(iris.data, iris.target, test_size= ts) xtrain, xvalidate, ytrain, yvalidate = tts(xtrain, ytrain, test_size= ts) nn = NN.NeuralNetwork(3,4,r) nn.addNewLayer(3) scores = nn.train(xtrain, ytrain, xvalidate, yvalidate) print('Test: ', nn.test(xtest, ytest)) if dsetindex == 1: data = np.array(do.read_file("indianDiabetes.txt")).astype(np.float16) data[: , 0] = do.normalize(data[:,0]) data[: , 1] = do.normalize(data[:,1]) data[: , 2] = do.normalize(data[:,2]) data[: , 3] = do.normalize(data[:,3]) data[: , 4] = do.normalize(data[:,4]) data[: , 5] = do.normalize(data[:,5]) data[: , 6] = do.normalize(data[:,6])
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
# train_test_split lives in sklearn.model_selection, which this script
# already relies on for cross_val_score; sklearn.cross_validation is gone
# from current scikit-learn.
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC

dataset = pd.read_csv("Social_Network_Ads.csv")
corr = dataset.corr()
# According to the correlation matrix, gender is not informative anyway.
X = dataset.iloc[:, 2:4]
y = dataset.iloc[:, -1]

X_train, X_test, y_train, y_test = tts(X, y, test_size=.2, random_state=0)

# Scale with statistics learned from the training part only.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Fixed random_state so the results do not change between runs.
classifier = SVC(kernel="rbf", random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# applying k-fold cross validation
from sklearn.model_selection import cross_val_score
from sklearn.datasets import fetch_mldata
from sklearn import svm
# sklearn.cross_validation no longer exists in current scikit-learn; fall
# back to it only on old installations.
try:
    from sklearn.model_selection import train_test_split as tts
except ImportError:
    from sklearn.cross_validation import train_test_split as tts

mnist = fetch_mldata('MNIST original')
print("Data fetched.")

# An integer test_size is an absolute number of samples.
Xtr, Xts, Ytr, Yts = tts(mnist.data, mnist.target, test_size=10000)
print("tts done.")

clf = svm.SVC(gamma=0.001, C=100.)
clf.fit(Xtr, Ytr)
print("fitted.")

# predict() expects a 2-D array; the slice keeps the last test sample as a
# one-row matrix instead of a bare 1-D vector.
predicted_label = clf.predict(Xts[-1:])
##########################################################################
## Module Constants
##########################################################################

##########################################################################
## Modules
##########################################################################

##########################################################################
## Program
##########################################################################

if __name__ == "__main__":
    corpus = load_files("Language_Folder")
    # print len(corpus.data)
    X_train, X_test, y_train, y_test = tts(corpus.data, corpus.target,
                                           test_size=0.20)

    # Character n-gram counts (within word boundaries) into naive Bayes.
    text_clf = Pipeline([
        ("vec", CountVectorizer(analyzer="char_wb")),
        ("clf", MultinomialNB()),
    ])
    text_clf = text_clf.fit(X_train, y_train)

    # Store the instance using pickle. Pickle data is binary, so the file
    # must be opened in "wb" mode.
    with open("experiment_file", "wb") as f:
        pickle.dump(text_clf, f)

    predicted = text_clf.predict(X_test)
    accuracy = np.mean(predicted == y_test)
    # Call form works on Python 2 and 3 alike.
    print(accuracy)
    print("Here it is.")
print(__doc__) # Generate the dataset X, y = make_classification(n_classes=2, class_sep=1.25, weights=[0.3, 0.7], n_informative=3, n_redundant=1, flip_y=0, n_features=5, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Create the samplers enn = EditedNearestNeighbours() renn = RepeatedEditedNearestNeighbours() # Create teh classifier knn = KNN(1) # Make the splits X_train, X_test, y_train, y_test = tts(X, y, random_state=42) # Add one transformers and two samplers in the pipeline object pipeline = make_pipeline(pca, enn, renn, knn) pipeline.fit(X_train, y_train) y_hat = pipeline.predict(X_test) print(classification_report(y_test, y_hat))
# The file is too big to run all at once, so it is split into ten smaller
# files that can be processed in chunks.
import pandas as pd
# sklearn.cross_validation no longer exists in current scikit-learn; fall
# back to it only on old installations.
try:
    from sklearn.model_selection import train_test_split as tts
except ImportError:
    from sklearn.cross_validation import train_test_split as tts

df = pd.read_csv('sdwis_clean.csv')
df = df.drop(['Unnamed: 0'], axis=1)

# Peel one piece at a time off the remaining frame. train_size is the share
# of the remainder that stays in df for the next round.
df, df1 = tts(df, train_size=.9)
df, df2 = tts(df, train_size=.89)
df, df3 = tts(df, train_size=.88)
df, df4 = tts(df, train_size=.86)
df, df5 = tts(df, train_size=.83)
df, df6 = tts(df, train_size=.8)
df, df7 = tts(df, train_size=.75)
df, df8 = tts(df, train_size=.66)
df10, df9 = tts(df, train_size=.5)

# Write df1.csv ... df10.csv.
pieces = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10]
for number, piece in enumerate(pieces, start=1):
    piece.to_csv("df{}.csv".format(number))
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

# Pickle data is binary: open the file in "rb" mode and close it afterwards.
with open("../final_project/final_project_dataset.pkl", "rb") as dataset_file:
    data_dict = pickle.load(dataset_file)

### add more features to features_list!
features_list = ["poi", "salary"]

data = featureFormat(data_dict, features_list)
labels, features = targetFeatureSplit(data)

### your code goes here
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.metrics import accuracy_score, precision_score, recall_score
# sklearn.cross_validation no longer exists in current scikit-learn; fall
# back to it only on old installations.
try:
    from sklearn.model_selection import train_test_split as tts
except ImportError:
    from sklearn.cross_validation import train_test_split as tts

features_train, features_test, labels_train, labels_test = tts(
    features, labels, test_size=0.3, random_state=42)

# Every print() gets one pre-formatted string: mixing the print statement
# with print(a, b) shows tuples on Python 2 and fails to parse on Python 3.
# Baseline: the share of test labels equal to 0.
print('Baseline accuracy: {}'.format(
    list(labels_test).count(0) / float(len(labels_test))))

clf = DTC()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

print('Predicted number of person\'s of interest {}'.format(
    list(pred).count(1)))
print('Accuracy: {}'.format(accuracy_score(labels_test, pred)))
print('Precision: {}'.format(precision_score(labels_test, pred)))
print('Recall: {}'.format(recall_score(labels_test, pred)))
#
# x=sales15['margin_sum_15q1']
# y=sales15['sale_total_15']
# plt.scatter(x, y)
# plt.xlabel("Total Margin 2015 Q1")
# plt.ylabel("Total Sales 2015")
# plt.show()

# These Q1 variables were all highly correlated with sales for the year, so
# they are used as predictors. They are also correlated with each other,
# which makes using all of them redundant; they are all kept for the sake of
# practising a multivariable linear regression.
x = ['sale_total_15q1', 'vol_sol_l_sum_15q1', 'margin_sum_15q1']

# Split data into test and train
train, test = tts(sales15, train_size=.85)
train_x = train[x]
train_y = train['sale_total_15']
test_x = test[x]
test_y = test['sale_total_15']

# Build the model using the train data.
lm = linear_model.LinearRegression()
model = lm.fit(train_x, train_y)
predictions = lm.predict(test_x)
# One pre-formatted string keeps the output identical on Python 2 and 3.
print("Sample: {}".format(lm.score(test_x, test_y)))

# Build the model with a Ridge regularization
lm = linear_model.RidgeCV()
model = lm.fit(train_x, train_y)
predictions = lm.predict(test_x)
linComb = LR() linComb.fit(df_avg[['all_prev', 'all_avg']].values, df_avg['all_yr'].values) print linComb.score(df_avg[['all_prev', 'all_avg']].values, df_avg['all_yr'].values) linDet = LR() linDet.fit(X,y) print linDet.score(X,y) # df_avg == 3 year rolling average + yr4 stats X,y = df_avg[['all_avg']].values, df_avg['all_yr'].values X,y = df_avg[['all_prev']].values, df_avg['all_yr'].values X,y = df_avg[['all_avg', 'all_prev']].values, df_avg['all_yr'].values X,y = df_avg[['1D_avg', '2D_avg', '3D_avg', 'all_avg','1D_prev', '2D_prev', '3D_prev', 'all_prev']].values, df_avg['all_yr'].values X_train, X_test, y_train, y_test = tts(X, y) lin = LR(fit_intercept=False) lin.fit(X,y) lin.score(X,y) knn = KNR(n_neighbors=5) knn.fit(X_train,y_train) print knn.score(X_train,y_train) print knn.score(X_test,y_test) ns = range(1,30,2) scores = [] for n in ns: knn = KNR(n_neighbors=n)
X_test,y_test = X[test_idx,:],y[test_idx] plt.scatter(X_test[:,0],X_test[:,1],c='', alpha=1.0,linewidth=1,marker='o', s=55,label='test set') if __name__=='__main__': iris = datasets.load_iris() X = iris.data[:,[2,3]] y = iris.target #spliting the data for test(30%) and training(70%) using tts X_train,X_test,y_train, y_test = \ tts(X,y,test_size=0.3, random_state=0) #Standardising the feature (feature scaling) using ss sc =ss() #Using fit to estimate 'sample mean','standard deviation' to do feature scaling #for each feature dimension using training data sc.fit(X_train) #tranform is used to standardize the trainig data (TrDS) and test data(TsDS) #Note: we have used same parameter for feature scaling X_train_std = sc.transform(X_train) X_test_std = sc.transform(X_test) #n_iter:- Number of Epochs(passes over the TrDS set) #eta0/eta:-learning rate
import numpy as np
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.decomposition import PCA as PCA
try:
    from sklearn.cross_validation import train_test_split as tts
except ImportError:
    # sklearn.cross_validation was removed in scikit-learn 0.20; the same
    # function lives in sklearn.model_selection.
    from sklearn.model_selection import train_test_split as tts

datapath = 'G:/Continuing Education/Research & Presentations/Self - Machine Learning/Kaggle/OttoProductClassification/Data/'
trainfile = 'train.csv'
testfile = 'test.csv'

# Load the training file and work on the raw value array from here on.
trd = pd.read_csv(datapath+trainfile)
trd = trd.values

# Split data into training and cross-validation dataset (33% held out).
nptrd, npcvd = tts(trd, test_size=0.33)

# Train the model: project columns 1..93 onto 40 principal components.
# A plain slice selects the same columns as indexing with range(1, 94)
# without building an index list.
pca = PCA(n_components=40)
pca.fit(nptrd[:, 1:94])
X = pca.transform(nptrd[:, 1:94])
# Fraction of total variance retained by the kept components.
PCAExplained = sum(pca.explained_variance_ratio_)

# Most of the features are highly skewed, i.e. their 75% value in
# td.describe() is 0 while their max is much higher. This indicates that only
# a few values are non-zero for most features. This could mean that these
# features are actually categorical variables that are encoded in the test
# data.. could .. not sure
forest = rfc(n_estimators=500, criterion='entropy', n_jobs=-1,
             min_samples_split=5, min_samples_leaf=5, max_depth=20)
#forest = forest.fit(nptrd[:,range(1,94)],nptrd[:,-1])
from sklearn import metrics
from imblearn.over_sampling import ADASYN
from imblearn.ensemble import BalanceCascade
from imblearn.over_sampling import RandomOverSampler
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

sm = SMOTE()

# Hold out the test set BEFORE oversampling. Resampling the full data first
# lets synthetic points interpolated from test rows end up in the training
# set, which leaks information and inflates every metric reported below.
X_train, X_test, y_train, y_test = tts(X_train_tf.toarray(),
                                       datatrain['sentiment'],
                                       test_size=0.2)

# Oversample the training portion only. The resampler is applied twice, as
# in the original script.
# NOTE(review): the second pass presumably balances a further class -- confirm.
X_res, y_res = sm.fit_sample(X_train, y_train)
X_res, y_res = sm.fit_sample(X_res, y_res)
print ('Data sentmen asli {}'.format (Counter(datatrain['sentiment'])))
print('Resampled dataset shape {}'.format(Counter(y_res)))

# From here on the training names refer to the balanced training data, so
# anything fitted on X_train / y_train later sees the resampled set while
# X_test / y_test stay untouched.
X_train, y_train = X_res, y_res

clf = svm.LinearSVC()
#clf = svm.SVC(decision_function_shape='ovo')
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
print(metrics.classification_report(y_test, predicted))

# Macro-averaged metrics on the untouched test split.
presisi_svm_smote = metrics.precision_score(y_test, predicted, average='macro')
recall_svm_smote = metrics.recall_score(y_test, predicted, average='macro')
f1_svm_smote = metrics.f1_score(y_test, predicted, average='macro')
akurasi_svm_smote = metrics.accuracy_score(y_test, predicted)
# Single-argument print(...) gives the same text on Python 2 and is valid
# Python 3.
print("Presisi: %s" % presisi_svm_smote)
print("Recall: %s" % recall_svm_smote)
print("F1-Score: %s" % f1_svm_smote)
print("Akurasi: %s" % akurasi_svm_smote)
else: df_tmp = pd.merge(df_tmp,df[(df.Year == y2) & (df.Team.isin(tms_include))][['Team','f']],how='left', on=['Team']) df_tmp.rename(columns={'f':'f_yr-%d' % (n)}, inplace=True) if df_tmp is not None: df_lag = df_lag.append(df_tmp[df_tmp.columns]) # Calculate changes # df_lag['change'] = df_lag.yr2 - df_lag.yr1 # df_lag['abs_change'] = abs(df_lag.yr2 - df_lag.yr1) # for c in df_lag.columns: # df_lag[c] = df_lag[c].astype(float) Xcol = ['yr1_f','off_f','def_f','st_f','s_p','fei'] ycol = ['yr2_f'] X_train, X_test, y_train, y_test = tts(df_lag[Xcol].values, df_lag[ycol].values) linreg = LR() linreg.fit(X_train, y_train) linreg.score(X_train, y_train) # Train on all existing seasons to project 2014 X,y = df_lag[Xcol].values, df_lag[ycol].values linreg = LR() linreg.fit(X, y) linreg.score(X, y) # build 3yr avgs df_3avg = pd.DataFrame(columns=['avg_f']+['off_f','def_f','st_f','s_p','fei','yr4_f'])
f_Age = merged[merged.Sex==0]['Age'].median()
merged['age_fill'] = merged['Age']
# NOTE(review): missing ages are filled with the constant 27.5 rather than
# the f_Age median computed just above -- confirm which one is intended.
merged.loc[merged.Age.isnull(),'age_fill'] = 27.5

#scale and fill NaN with mean
cols_to_scale = ['Fare','Pclass','Sex','age_fill','embarked_num','Parch','SibSp']
merged[cols_to_scale] = merged[cols_to_scale].fillna(merged[cols_to_scale].mean())
# Standardise one column at a time; iterate the names directly instead of
# indexing the list by position.
for col in cols_to_scale:
    merged[[col]] = pp.scale(merged[[col]])

# Split the combined frame back into its train and test parts. The row count
# is captured first because `train` is rebound on the next line.
n_train = len(train)
train = merged[:n_train]
test = merged[n_train:]

#modeling with logit
xtrain, xval, ytrain, yval = tts(np.array(train[cols_to_scale]), np.ravel(train['Survived']))
LR = lm.LogisticRegression()
model = LR.fit(xtrain, ytrain)
score = model.score(xval, yval)
print('validation score: ',score)

xtest = np.array(test[cols_to_scale])
# Build the submission with explicit column names. The earlier version
# transposed a list of [Series, ndarray] and renamed a pandas-generated
# 'Unnamed 0' label, which depends on pandas internals and on the index of
# `test` lining up with the positional prediction array.
results = pd.DataFrame({'PassengerId': test['PassengerId'].values,
                        'Survived': model.predict(xtest)},
                       columns=['PassengerId', 'Survived'])
# to_csv opens and closes the file itself; no manual handle or close needed.
results.to_csv('./Submission.csv', index=False)
import os
import numpy as np
import load_data
try:
    from sklearn import cross_validation as cv
    from sklearn.cross_validation import train_test_split as tts
except ImportError:
    # sklearn.cross_validation was removed in scikit-learn 0.20; fall back to
    # its replacement module.
    from sklearn import model_selection as cv
    from sklearn.model_selection import train_test_split as tts
from sklearn.ensemble import ExtraTreesClassifier

# Walk the 'data' directory and rank features for every CSV file found.
# NOTE(review): the nesting below is reconstructed from a flattened source;
# the per-file analysis is assumed to sit inside the `.csv` branch -- confirm.
for root, dirs, files in os.walk('data'):
    for name in files:
        if name.endswith('.csv'):
            # Parenthesised single-argument form: same output on Python 2,
            # valid on Python 3, and consistent with the prints below.
            print("Loading " + root + "/" + name)
            dataset = load_data.load_data(name, root)
            splits = tts(dataset.data, dataset.target, test_size=0.2)
            X_train, X_test, y_train, y_test = splits

            # Build a forest and compute the feature importances
            forest = ExtraTreesClassifier(n_estimators=250)
            forest.fit(X_train, y_train)
            importances = forest.feature_importances_
            # Spread of each feature's importance across the individual trees.
            std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                         axis=0)
            # Feature indices ordered from most to least important.
            indices = np.argsort(importances)[::-1]

            # Print the feature ranking
            print("Feature ranking:")
            for f in range(X_train.shape[1]):
                print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
# Integer-encode the first six feature columns in place. The same encoder
# object is refitted for every column, so afterwards `le` only holds the
# mapping of the last column processed.
le = LabelEncoder()
for i in range(6):
    features[:, i] = le.fit_transform(features[:, i])
la = LabelEncoder()
features[:, 11] = la.fit_transform(features[:, 11])

# One-hot encoding. The column must already be label-encoded (done above)
# before it is passed to the encoder.
ohe = OneHotEncoder(categorical_features=[11])
features = ohe.fit_transform(features).toarray()

# Label-encode the target column.
lc = LabelEncoder()
labels[:, 0] = lc.fit_transform(labels[:, 0])

try:
    from sklearn.cross_validation import train_test_split as tts
except ImportError:
    # sklearn.cross_validation was removed in scikit-learn 0.20.
    from sklearn.model_selection import train_test_split as tts
f_train, f_test, l_train, l_test = tts(features,
                                       labels,
                                       random_state=0,
                                       test_size=.20)

# ***************************** Now With Pandas *****************************
feature = df.drop("Target", axis=1)
# Replace every object-typed column by its integer category codes.
for i in feature.select_dtypes(include=[object]):
    feature[i] = feature[i].astype('category').cat.codes
feature = pd.get_dummies(feature, columns=["Property_Area"])
label = df["Target"]
label = label.astype('category').cat.codes
label = pd.get_dummies(label)
#Initialize the list to keep the scores from each iteration. OLS_score = [] Ridge_score = [] RidgeCV_score = [] DecTree1_score = [] DecTree2_score = [] Lasso_score = [] LassoCV_score = [] RandomForest_score = [] # Obtain results for running the model a specified number of times for i in range(1,15): #Train the data splits = tts(data, target, test_size=0.20) X_train, X_test, y_train, y_test = splits #Run the OLS model. regr = linear_model.LinearRegression() regr.fit(X_train, y_train) OLS_score.append(regr.score(X_test, y_test)) #print 'Coefficients OLS: \n', regr.coef_ #print 'Intercept OLS: \n', regr.intercept_ #Run the Ridge model. clf = linear_model.Ridge(alpha=0.5) clf.fit(X_train, y_train) Ridge_score.append(clf.score(X_test, y_test)) #Run the RidgeCV model.
# Shift every independent variable up by one and cast to int.
# NOTE(review): this only makes the values non-negative if the smallest
# entry of X is -1 or greater -- confirm against the dataset.
x1 = X
# A ones-matrix shaped like X; taking the shape directly avoids indexing
# row 1 / column 1, which fails when X has a single row or column.
x2 = np.ones(X.shape, dtype=int)
X = np.add(x1, x2).astype(int)

#feature selection on the basis of chi squared test
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# k is passed by keyword: newer scikit-learn releases no longer accept it
# positionally, while the keyword form works on old and new versions alike.
select = SelectKBest(chi2, k=42)
sel = select.fit(X, y)
# Per-feature chi2 scores, kept for inspection / visualisation.
feature_score = sel.scores_
# Keep only the 42 best-scoring features.
X = sel.transform(X)

#for cross validation
try:
    from sklearn.cross_validation import train_test_split as tts
except ImportError:
    # sklearn.cross_validation was removed in scikit-learn 0.20.
    from sklearn.model_selection import train_test_split as tts
# An integer test_size holds out an absolute number of samples (2000 rows).
X_train, X_test, y_train, y_test = tts(X, y, test_size=2000, random_state=1)

#feature scaling
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply it to both splits.
scale = StandardScaler()
scale.fit(X_train)
X_train = scale.transform(X_train)
X_test = scale.transform(X_test)

# Separate scaler fitted on the whole dataset, for models that are trained
# on all of X rather than on the split above.
scale2 = StandardScaler()
scale2.fit(X)
X = scale2.transform(X)

#testing score of multi layered perceptron
from sklearn.neural_network import MLPClassifier as mlp
# - testing accuracy is a better estimate than training accuracy of
#   out-of-sample performance
# - but it provides a high-variance estimate, since changing which
#   observations happen to be in the testing set can significantly change
#   testing accuracy
from sklearn.datasets import load_iris
try:
    from sklearn.cross_validation import train_test_split as tts
except ImportError:
    # sklearn.cross_validation was removed in scikit-learn 0.20; the same
    # function is available from sklearn.model_selection.
    from sklearn.model_selection import train_test_split as tts
from sklearn.neighbors import KNeighborsClassifier as knn_
from sklearn import metrics

# read in the iris data
iris = load_iris()
X = iris.data
y = iris.target

# train/test split (random_state fixed so the split is reproducible)
X_train, X_test, y_train, y_test = tts(X, y, random_state=4)

# check classification accuracy of KNN with K=5
knn = knn_(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
# Parenthesised single-argument print: same output on Python 2, valid on
# Python 3.
print(metrics.accuracy_score(y_test, y_pred))

# What if we created a bunch of train/test splits, calculated the accuracy
# for each, then averaged the results together?
# That's the essence of cross-validation.

# Steps for K-fold cross-validation
# 1) Split the data into K equal partitions (or "folds")
# 2) Use fold 1 as the testing set and the union of the other folds as the
#    training set
# 3) Calculate testing accuracy