def SVM_LinearSVC(self):
    """Train a one-vs-rest LinearSVC over bag-of-words/tf-idf features,
    print accuracy statistics for the held-out tail of the data, and
    return the predicted labels.

    Fits on self.X_train/self.y, predicts self.X_test, and compares the
    decoded predictions against self.Y_train[self.train_ex:self.size].
    (Python 2 syntax.)
    """
    SVM_Classifier = Pipeline([ ('vectorizer', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', OneVsRestClassifier(LinearSVC())) ])
    SVM_Classifier.fit(self.X_train,self.y)
    predicted = SVM_Classifier.predict(self.X_test)
    # map binarized predictions back to the original label values
    y_pred = self.lb.inverse_transform(predicted)
    # test examples start at index self.train_ex in Y_train; walk the
    # predictions in lockstep with that slice and count exact matches
    i=self.train_ex
    correct=0
    for label in y_pred:
        if label==self.Y_train[i]:
            correct=correct+1
        i = i + 1
    print 'Number of Examples used for Training',self.train_ex
    print 'Number of Correctly classified',correct
    print 'Total number of samples classified in Test data',self.size-self.train_ex
    print 'The resulting accuracy using Linear SVC is ',(float(correct)*100/float(self.size-self.train_ex)),'%\n'
    cm=confusion_matrix(self.Y_train[self.train_ex:self.size],y_pred)
    print 'The confusion matrix is',cm
    return y_pred
def test_pipeline_methods_preprocessing_svm():
    """Smoke-test (scaler|PCA) + SVC pipelines on iris: every prediction
    API must return arrays of the expected shape."""
    iris = load_iris()
    features, target = iris.data, iris.target
    sample_count = features.shape[0]
    class_count = len(np.unique(target))
    scaler = StandardScaler()
    pca = RandomizedPCA(n_components=2, whiten=True)
    svc = SVC(probability=True, random_state=0)
    for preprocessor in [scaler, pca]:
        model = Pipeline([('preprocess', preprocessor), ('svc', svc)])
        model.fit(features, target)
        # check shapes of the various prediction functions
        assert_equal(model.predict(features).shape, (sample_count,))
        assert_equal(model.predict_proba(features).shape,
                     (sample_count, class_count))
        assert_equal(model.predict_log_proba(features).shape,
                     (sample_count, class_count))
        assert_equal(model.decision_function(features).shape,
                     (sample_count, class_count))
        model.score(features, target)
def train_clf(self):
    """Fit and return a unigram/bigram tf-idf + LinearSVC pipeline on the
    instance's dataset."""
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True)
    model = Pipeline([
        ("tfidf", vectorizer),
        ("svc", LinearSVC(C=100)),
    ])
    model.fit(self.dataset.data, self.dataset.target)
    return model
def calcCSPLDA(epochs_train, labels_train, nb):
    """Create and fit a CSP+LDA pipeline on MNE training epochs.

    Thin wrapper around MNE's CSP and sklearn's LDA/Pipeline.

    Parameters
    ----------
    epochs_train : mne.Epochs
        Training epochs.
    labels_train : array-like
        Label of each training epoch.
    nb : int
        Number of CSP components, must be even (6 implies the 3 top-most
        and 3 bottom eigenvectors).
        NOTE(review): `nb` is currently ignored — CSP is hard-coded to
        n_components=4 below; confirm whether CSP(n_components=nb) was
        intended.

    Returns
    -------
    clf : fitted Pipeline (CSP -> LDA)
    csp.filters_ : CSP weight vectors, shape (n_channels, n_channels)
    svc.coef_ : LDA weight vector, shape (1, n_components)
    """
    # despite the names, `svc` is the LDA classifier and the Pipeline
    # step labelled 'SVC' holds it
    svc = LDA()
    csp = CSP(n_components=4, reg=None, log=True, cov_est='epoch')
    clf = Pipeline([('CSP', csp), ('SVC', svc)])
    epochs_data = epochs_train.get_data()
    clf.fit(epochs_data, labels_train)
    return clf, csp.filters_, svc.coef_
class MachineLearning(object):
    """Incrementally-built text classifier: tf-idf features feeding a
    multinomial naive Bayes model."""

    def __init__(self):
        # Initialize classifier and vectorizer: unigram/bigram tf-idf
        # into lightly-smoothed multinomial NB.
        self.clf = Pipeline([
            ('tfidf', TfidfVectorizer(min_df=1, ngram_range=(1, 2))),
            ('clf', MultinomialNB(alpha=.01)),
        ])

    def init_training(self):
        """Reset the accumulated training examples."""
        self.x_train = []
        self.y_train = []

    def add_training_data(self, data, label):
        """Queue one labelled example for the next train() call."""
        self.x_train.append(data)
        self.y_train.append(label)

    # Train classifier.
    # Accuracy could also be tuned with a grid search, e.g.:
    #   parameters = {'tfidf__ngram_range': [(1, 1), (1, 2)],
    #                 'clf__alpha': (.01, .001)}
    #   gs_clf = GridSearchCV(clf, parameters, n_jobs=-1)
    def train(self):
        self.clf.fit(self.x_train, self.y_train)

    # Predict result.
    # Accuracy can be roughly estimated with cross validation, e.g.:
    #   result = clf.predict(test_dc + test_marvel)
    #   baseline = [0]*len(test_dc) + [1]*len(test_marvel)
    #   print np.sum(result == baseline) / float(len(result))
    def predict(self, data):
        return self.clf.predict([data])[0]
def test_one_rf():
    """Train a CountVectorizer/tf-idf/RandomForest pipeline on the pickled
    sentiment training set and print train/test classification reports.
    (Python 2 syntax.)"""
    Xtrain_raw, ytrain_raw = load_raw_data("sentidata_train_raw.pkl")
    print "training data loaded"
    print_label_frequency(ytrain_raw)
    ############# create the pipeline
    # analyzer=lambda x: x  -> inputs are assumed pre-tokenized lists
    #                          (TODO confirm against load_raw_data)
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=lambda x:x,max_features=3000)),
        ('tfidf', TfidfTransformer()),
        ('rf', RandomForestClassifier(n_estimators=500, max_depth=200, min_samples_split=10, oob_score=True, n_jobs=-1,verbose=1,class_weight='balanced')),
    ])
    ############# train
    pipeline.fit(Xtrain_raw,ytrain_raw)
    ############# check result
    rf = pipeline.steps[-1][1]
    # NOTE(review): bare attribute access has no effect outside an
    # interactive session — presumably meant to be printed.
    rf.oob_score_
    ############# training error
    ytrain_predict = pipeline.predict(Xtrain_raw)
    print classification_report(y_true=ytrain_raw,y_pred=ytrain_predict)
    print confusion_matrix(y_true=ytrain_raw,y_pred=ytrain_predict)
    ############# testing error
    Xtest_raw, ytest_raw = load_raw_data("sentidata_test_raw.pkl")
    ytest_predict = pipeline.predict(Xtest_raw)
    # NOTE(review): result is discarded — likely meant to be printed too.
    accuracy_score(y_true=ytest_raw,y_pred=ytest_predict)
    print classification_report(y_true=ytest_raw,y_pred=ytest_predict)
def KFOLDTEST(self, text, sent):
    """Estimate classifier accuracy with 6-fold cross validation.

    Builds a count-vector/tf-idf/one-vs-one LinearSVC pipeline, scores
    it on each fold, prints and returns the mean accuracy.
    """
    pipeline = Pipeline(
        [
            ("vectorizer", CountVectorizer(ngram_range=(1, 2), tokenizer=self.tokenize_data)),
            ("tfidf", TfidfTransformer(norm="l2", smooth_idf=False, use_idf=False)),
            ("classifier", OneVsOneClassifier(LinearSVC())),
        ]
    )
    scores = []
    for fit_idx, eval_idx in KFold(n=len(text), n_folds=6):
        pipeline.fit(text[fit_idx], sent[fit_idx])
        fold_score = pipeline.score(text[eval_idx], sent[eval_idx])
        scores.append(fold_score)
    score = sum(scores) / len(scores)
    print ("scores ", scores, " Score ", score)
    return score
def runCrossValidationTest(classifier_name, classifier_args=None, ngram=2, folds=5):
    """Hold-out evaluation of one registered classifier.

    Builds a tf-idf + classifier pipeline, fits it on 75% of the data,
    prints micro-averaged F1/precision/recall, writes the confusion
    matrix to CSV and returns (f1, precision, recall).
    """
    if classifier_args is None:
        classifier_args = {}
    model = valid_classifiers[classifier_name](**classifier_args)
    X, y = load_non_preprocessed_data()
    ml_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(sublinear_tf=True, ngram_range=(1, ngram))),
        ('Classifier', model),
    ])
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=0.25, random_state=0)
    ml_pipeline.fit(X_train, y_train)
    predictions = ml_pipeline.predict(X_test)
    confusion = confusion_matrix(y_test, predictions)
    f1 = f1_score(y_test, predictions, pos_label=None, average='micro')
    precision = precision_score(y_test, predictions, pos_label=None, average='micro')
    recall = recall_score(y_test, predictions, pos_label=None, average='micro')
    print(" >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
    print("F1 score: " + str(f1))
    print("precision score: " + str(precision))
    print("recall score: " + str(recall))
    print(confusion)
    numpy.savetxt("data/test_results_confusion_matrix_" + classifier_name + ".csv",
                  confusion, delimiter=",")
    return ((f1, precision, recall))
class Regressor(BaseEstimator):
    """Random-forest regressor preceded by scaling and spatial feature
    agglomeration, for inputs assumed shaped
    (n_samples, n_lags, n_lats, n_lons) — TODO confirm against caller."""

    def __init__(self):
        self.clf = Pipeline([
            ("RF", RandomForestRegressor(n_estimators=200, max_depth=15, n_jobs=N_JOBS))])
        self.scaler = StandardScaler()
        # groups the lat/lon grid cells into 500 spatially-connected clusters
        self.agglo = FeatureAgglomeration(n_clusters=500)

    def fit(self, X, y):
        """Fit scaler, agglomeration and forest on 4-D input X."""
        y = y.ravel()
        n_samples, n_lags, n_lats, n_lons = X.shape
        # NOTE(review): the scaler is fitted on the LAST lag only but is
        # later applied to every lag — confirm this is intentional.
        self.scaler.fit(X[:, -1].reshape(n_samples, -1))
        # flatten to one row per (lag, sample) pair
        X = X.reshape(n_lags * n_samples, -1)
        connectivity = grid_to_graph(n_lats, n_lons)
        self.agglo.connectivity = connectivity
        X = self.scaler.transform(X)
        X = self.agglo.fit_transform(X)
        # re-concatenate all lags of one sample into a single feature row
        X = X.reshape(n_samples, -1)
        self.clf.fit(X, y)

    def predict(self, X):
        """Apply the fitted scaling/agglomeration and predict."""
        n_samples, n_lags, n_lats, n_lons = X.shape
        X = X.reshape(n_lags * n_samples, -1)
        X = self.scaler.transform(X)
        X = self.agglo.transform(X)
        X = X.reshape(n_samples, -1)
        return self.clf.predict(X)
def use_kfold_cross_validation(X_train, X_test, y_train, y_test):
    """Compare hold-out accuracy, manual stratified 10-fold CV, and
    sklearn's cross_val_score on a scale/PCA/logistic-regression pipeline."""
    pipe_lr = Pipeline([
        ('scl', StandardScaler()),
        ('pca', PCA(n_components=2)),
        ('clf', LogisticRegression(random_state=1)),
    ])
    pipe_lr.fit(X_train, y_train)
    print("Test accuracy: %.3f\n" % pipe_lr.score(X_test, y_test))
    kfold = StratifiedKFold(y=y_train, n_folds=10, random_state=1)
    fold_scores = []
    for k, (train, test) in enumerate(kfold):
        pipe_lr.fit(X_train[train], y_train[train])
        fold_score = pipe_lr.score(X_train[test], y_train[test])
        fold_scores.append(fold_score)
        print(
            "Fold: %s, Class dist.: %s, Acc: %.3f"
            % (k + 1, np.bincount(y_train[train]), fold_score)
        )
    print(
        "\nCustom CV accuracy: %.3f +/- %.3f\n"
        % (np.mean(fold_scores), np.std(fold_scores)),
    )
    cv_scores = cross_val_score(estimator=pipe_lr, X=X_train, y=y_train, cv=10)
    print("cross_val_score CV accuracy scores: %s" % cv_scores)
    print(
        "cross_val_score CV accuracy: %.3f +/- %.3f"
        % (np.mean(cv_scores), np.std(cv_scores))
    )
def test():
    """Interactively classify queries into intent labels. (Python 2.)

    Loads tab-separated training examples from ./training_source.csv,
    keeps rows whose label is in target_label, segments the text,
    reports 8-fold CV accuracy of a count/tf-idf/NB pipeline, fits it
    on everything, then answers stdin queries until "exit" is typed.
    """
    target_label = [u'weather', u'audio',u'pic',u'calculate',u'music', u'poem']
    training_text_raw = []
    training_label = []
    with open ('./training_source.csv','r') as f:
        for line in f.readlines():
            # each row: "text<TAB>label"; skip rows with unknown labels
            line = line.strip().split('\t')
            if len(line) > 1 and line[1] in target_label:
                training_text_raw.append(unicode(line[0],"utf-8"))
                training_label.append(line[1])
    print training_label
    training_text = []
    # seg() presumably performs word segmentation — verify its contract.
    for text in training_text_raw:
        seg_text = seg(text)
        training_text.append(seg_text)
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer(use_idf=False)),
                         ('clf', MultinomialNB()),
    ])
    scores = cross_validation.cross_val_score(text_clf, training_text, training_label, cv=8)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    text_clf.fit(training_text, training_label)
    # interactive loop: classify user input until "exit"
    while True:
        k_text = raw_input("\nPlease input:")
        if k_text == "exit":
            break
        print text_clf.predict([seg(unicode(k_text,'utf-8'))])
def main(datafile, threshold):
    """Hierarchically cluster the rows of `datafile` and print the tree.

    The fitted hierarchy is cached as XML under out/<basename>.hrc; when
    the cache exists it is loaded instead of refitting. If `threshold`
    is not None the hierarchy is cut at that level and the flat clusters
    are printed and passed to dump_graph().
    """
    filename = "out{}{}.hrc".format(os.sep, os.path.basename(datafile.name))
    if not os.path.isfile(filename):
        # the CSV header row acts as a column mask: keep columns whose
        # header field is not the string "0"
        header = datafile.readline()
        collist = [i for i, toggle in enumerate(header.split(",")) if toggle != "0"]
        datafile.seek(0)
        data = pd.read_csv(datafile, usecols=collist).as_matrix()
        pipeline = Pipeline([("clf", Hierarchical())])
        pipeline.set_params(**{})
        pipeline.fit(data)
        clf = pipeline.get_params()["clf"]
        hierarchy = clf.hierarchy_
        # cache the fitted hierarchy for subsequent runs
        with open(filename, "wb") as fh:
            fh.write(ET.tostring(hierarchy.to_xml()))
    else:
        with open(filename, "rb") as fh:
            hierarchy = Cluster.from_xml(ET.parse(fh).getroot())
    print(ET.tostring(hierarchy.to_xml()).decode("utf-8"))
    if threshold != None:
        clusters = hierarchy.cut(threshold)
        print("\n".join(c.to_str(i) for i, c in enumerate(clusters)))
        dump_graph(clusters)
def svcDictVector(): recipeData = getRecipeData() labels = [recipe['cuisine'] for recipe in recipeData] ingredientsFixtures = [sorted(set(e['ingredients'])) for e in recipeData] for i, w in enumerate(ingredientsFixtures): ingredientsFixtures[i] = dict(zip(w, [1] * len(w))) pipeline = Pipeline([ ('dict', DictVectorizer()), ('variance', VarianceThreshold()), ('tfidf', TfidfTransformer()), ('bayes', svm.LinearSVC()), ]) pipeline.fit(ingredientsFixtures, labels) print pipeline testRecipes = getTestData() testIngredientsFixtures = [sorted(set(e['ingredients'])) for e in testRecipes] for i, w in enumerate(testIngredientsFixtures): testIngredientsFixtures[i] = dict(zip(w, [1] * len(w))) predictions = pipeline.predict(testIngredientsFixtures) outputPercentCorrect(predictions) copyAndOutput(predictions, testRecipes)
class Classifier:
    """Wraps an estimator, optionally prefixed by scaling and/or
    model-based feature selection inside a sklearn Pipeline."""

    def __init__(self, clf, scaler=None, selector=False):
        # Assemble preprocessing steps in order: scaler, then selector.
        steps = []
        if scaler:
            steps.append(('scaler', scaler))
        if selector:
            steps.append(('selector',
                          SelectFromModel(SELECTOR_POOL['extra_trees_classifier'], .001)))
        if steps:
            steps.append(('classifier', clf))
            self.clf = Pipeline(steps)
        else:
            # no preprocessing requested: use the bare estimator
            self.clf = clf

    def __str__(self):
        if isinstance(self.clf, Pipeline):
            step_names = [type(step).__name__ for _, step in self.clf.steps]
            return ', '.join(step_names)
        return type(self.clf).__name__

    def fit(self, X, y):
        self.clf.fit(X, y)

    def predict(self, X):
        return self.clf.predict(X)
def clasificador(self,X_train, y_train, X_test, target_names, y_test,all_labels):
    """Multi-label classification with one-vs-rest GaussianNB over
    count/tf-idf features; evaluates macro and micro scores on the
    decoded predictions."""
    lb = preprocessing.MultiLabelBinarizer()
    Y = lb.fit_transform(y_train)
    classifier = Pipeline([
        ('vectorizer', CountVectorizer(strip_accents='unicode')),
        ('tfidf', TfidfTransformer()),
        # GaussianNB requires dense input, hence the densifier step
        ('to_dense', DenseTransformer()),
        ('clf', OneVsRestClassifier(GaussianNB()))])
    classifier.fit(X_train, Y)
    predicted = classifier.predict(X_test)
    # decode the binarized predictions and normalise tuples to lists
    etiquetas = [list(labels) for labels in lb.inverse_transform(predicted)]
    valoresMacro = self.macro(etiquetas, y_test)
    valoresMicro = self.micro(etiquetas, y_test)
def predict():
    """Fit a small scaled neural-net pipeline on the full feature corpus
    and score the Akunin feature set.

    Returns (simple_predicts, scale_predicts): hard class labels and
    class-1 probabilities scaled to 0-10.
    """
    pipeline = Pipeline([
        ('min/max scaler', MinMaxScaler(feature_range=(0.0, 1.0))),
        ('neural network', Classifier(layers=[Layer("ExpLin", units=5), Layer("Softmax")], n_iter=25))])
    X = np.load('All_features.npz')['arr_0']
    D = np.load('Akunin_features.npz')['arr_0']
    # 141 positive followed by 123 negative samples — TODO confirm counts
    all_samples = [1]*141 + [0]*123
    y = np.array(all_samples)
    # NOTE(review): test_size=0.0 — effectively trains on all the data.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.0, random_state=0)
    pipeline.fit(X_train, y_train)
    pickle.dump(pipeline, open('NeuralNet_model.pkl', 'wb'))
    prediction = pipeline.predict(D)
    probs = pipeline.predict_proba(D)
    # probability thresholds -> 1..5 grades; key 1.01 guarantees a match
    gradation = {1.01: 5, 0.9: 4, 0.8: 3, 0.7: 2, 0.6: 1}
    ress1 = []
    simple_predicts = []
    scale_predicts = []
    for i in prediction:
        simple_predicts.append(i[0])
    for i in probs:
        scale_predicts.append(i[1]*10)
        # smallest grade whose threshold exceeds the class-1 probability
        compare = []
        for u in gradation:
            if i[1] < u:
                compare.append(gradation[u])
        ress1.append(min(compare))
    # NOTE(review): ress1 (the graded scores) is computed but never
    # returned — confirm whether it should be part of the result.
    return simple_predicts, scale_predicts
class Model10(Model):
    """Model variant 10: naive-Bayes classifier over features produced by
    the project's Prepare_0/Transformer preprocessing. Uses a mask-based
    API: fit/predict receive row-index masks into the full design matrix.
    (Python 2 syntax.)"""

    def __init__(self):
        pass

    def fit(self, Xmask, y):
        """Load the full transformed design matrix, then fit the
        Transformer+NB pipeline on the rows selected by `Xmask`."""
        pr = prepare.Prepare_0(model=10, preproc=1, min_df=1, use_svd=False, tfidf=2, stemmer=0)
        (X_all_df,_,BP,params) = pr.load_transform(update=False)
        names = list(X_all_df.columns)
        X_all = np.asarray(X_all_df)
        self.X_all, self.names = X_all, names
        # three candidate NB variants; MultinomialNB is the one in use
        clf0 = GaussianNB()
        clf1 = MultinomialNB(alpha=0.8)
        clf2 = BernoulliNB(alpha=1, binarize=0.01)
        clf = clf1
        self.rd = Pipeline([
            ("trans", Transformer(names=self.names, X_all=X_all, BP=BP)),
            #("scaler",StandardScaler(with_mean=False)),
            ("est", clf)
        ])
        self.rd.fit(Xmask,np.asarray(y))
        return self

    def predict_proba(self, Xmask):
        return self.rd.predict_proba(Xmask)

    def predict(self, Xmask):
        return self.rd.predict(Xmask)

    def starter(self):
        """Smoke-test fit on dummy index masks."""
        print "Model10 starter"
        self.fit(np.arange(100),np.arange(100))
class ModelPipeline(object):
    """Thin wrapper around a single-step sklearn Pipeline that records the
    training column names and exposes feature importances plus an
    RF-oriented grid search."""

    def __init__(self, clf):
        self.columns = []          # training column names, set by fit()
        self.pipeline = Pipeline([
            ('clf', clf)
        ])

    def fit(self, X_train, y_train):
        """Fit the pipeline and remember the training columns."""
        self.pipeline.fit(X_train, y_train)
        self.columns = list(X_train.columns)

    def predict(self, X_test):
        return self.pipeline.predict(X_test)

    def feat_importances(self, n=10, string=True):
        """Top-n feature importances of the fitted classifier.

        Returns a formatted string when `string` is True, otherwise a
        (column_names, importances) tuple, both ordered from most to
        least important.
        """
        imp = self.pipeline.steps[0][1].feature_importances_
        # indices of the n largest importances, descending
        order = np.argsort(imp)[-1:-(n + 1):-1]
        if string:
            return ''.join('%s: %s%%\n' % (self.columns[feat],
                                           round(imp[feat] * 100, 3))
                           for feat in order)
        # FIX: self.columns is a plain list, so indexing it with a numpy
        # index array raised TypeError; select the names per index.
        return [self.columns[feat] for feat in order], \
            sorted(imp)[-1:-(n + 1):-1]

    def grid_search(self, X, y):
        """Exhaustive recall-scored hyper-parameter search; prints
        progress and returns the best parameter dict."""
        parameters = {
            'clf__n_estimators': [100, 200, 300],
            'clf__max_features': ['sqrt', 50, 80],
            'clf__max_depth': [None, 50, 100],
            'clf__oob_score': [False, True],
            'clf__random_state': [29],
            'clf__class_weight': ['balanced', None, 'balanced_subsample'],
            'clf__min_samples_split': [2, 10, 20]
        }
        grid_search = GridSearchCV(self.pipeline, parameters, n_jobs=-1,
                                   verbose=1, scoring="recall")
        print("Performing grid search...")
        print("pipeline:", [name for name, _ in self.pipeline.steps])
        print("parameters:")
        pprint(parameters)
        t0 = time()
        grid_search.fit(X, y)
        print("done in %0.3fs" % (time() - t0))
        print()
        print("Best score: %0.3f" % grid_search.best_score_)
        print("Best parameters set:")
        best_parameters = grid_search.best_estimator_.get_params()
        for param_name in sorted(parameters.keys()):
            print("\t%s: %r" % (param_name, best_parameters[param_name]))
        return best_parameters
def train_regressor(data, X_columns, y_show=y_init+y_curr):
    """Fit an ExtraTrees pipeline predicting every y_show column that is
    not already a feature; prints the OOB score and timing.

    Returns [fitted pipeline, target column names, feature column names].
    """
    X = data.loc[:, X_columns]
    target_cols = [col for col in y_show if col not in X_columns]
    ys = data.loc[:, target_cols]
    print()
    for n_trees in [256]:
        forest = Pipeline(steps=[
            ('forest', ExtraTreesRegressor(
                n_estimators=n_trees,
                n_jobs=min(n_trees, 62),
                oob_score=True,
                bootstrap=True))])
        start = time()
        forest.fit(X, ys)
        end = time()
        print(n_trees, forest.steps[0][1].oob_score_, end - start)
        print()
    print("%.5g seconds to train regressor" % (end - start))
    print()
    return [forest, ys.columns, X.columns]
def train(docs):
    """
    Trains and serializes (pickles) a vectorizing pipeline
    based on training data.

    `min_df` is set to filter out extremely rare words,
    since we don't want those to dominate the distance metric.

    `max_df` is set to filter out extremely common words,
    since they don't convey much information.
    """
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer(input='content', stop_words='english',
                                       lowercase=True, tokenizer=Tokenizer(),
                                       min_df=0.015, max_df=0.9)),
        ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
        ('feature_reducer', TruncatedSVD(n_components=100)),
        ('normalizer', Normalizer(copy=False))
    ])
    print('Training on {0} docs...'.format(len(docs)))
    pipeline.fit(docs)
    # NOTE(review): this binds a LOCAL name, not a module-level PIPELINE —
    # confirm whether `global PIPELINE` was intended here.
    PIPELINE = pipeline
    print('Serializing pipeline to {0}'.format(PIPELINE_PATH))
    # FIX: use a context manager so the pickle file is flushed and closed
    # even if pickling raises (the handle previously leaked).
    with open(PIPELINE_PATH, 'wb') as pipeline_file:
        pickle.dump(pipeline, pipeline_file)
    print('Training complete.')
def classify(text, label):
    """10-fold CV benchmark of tf-idf + TruncatedSVD + logistic regression
    across several SVD dimensionalities; prints mean accuracy for each.
    (Python 2 syntax.)"""
    #~ Testing purpose: 10-fold cross validation
    cv = KFold(n = len(label), n_folds = 10)
    # candidate numbers of SVD components to sweep
    n_c = [100, 200, 500, 1000, 2000, 5000, 10000]
    for i in n_c:
        clf = Pipeline([
            ('vect', TfidfVectorizer(
                analyzer='word',
                ngram_range=(1, 1),
                stop_words = 'english',
                lowercase=True,
                token_pattern=r'\b\w+\b',
                tokenizer=tokenize_doc,
                min_df = 1)),
            ('dim_reduction', TruncatedSVD(n_components=i)),
            #~ ('feature_selection',
            #~ SelectKBest(
            #~ chi2,
            #~ k=35)),
            ('classification', LogisticRegression())
            #~ SVC(kernel = 'linear'))
        ])
        print "len(label) ", len(label), " | text ", len(text)
        print ""
        # NOTE(review): this fit on the full data is redundant —
        # cross_val_score clones and refits the estimator per fold.
        clf.fit(np.asarray(text), np.asarray(label))
        cv_score = cross_val_score(clf, text, label, cv = cv, verbose = 1)
        print "Log Reg | n_c = ", i
        print "Accuracy List ", cv_score, " | Avg Accuracy ", np.mean(cv_score)
def main():
    """Fit TruncatedSVD + logistic regression on tf-idf boilerplate
    features. Currently only runs cross validation — the submission
    branch below is disabled by the hard-coded `if True`. (Python 2.)"""
    X_all, y, lentrain = load_Boilerplate()
    X_all, tfv = transform_Tfidf(X_all, lentrain)
    # the first `lentrain` rows form the labelled training portion
    X = X_all[:lentrain]
    clf = lm.LogisticRegression(penalty='l2', dual=True, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=123)
    rd = Pipeline([
        #("selector", SelectPercentile(chi2, percentile=90)),
        #("pca", PCA(n_components='mle')),
        #("pca", PCA(n_components=500)),
        ("svd", TruncatedSVD(n_components=500, random_state=1)),
        ("est", clf)
    ])
    # toggle: True -> cross validation only; False -> build a submission
    if True:
        cv_run(rd, X, y)
        return
    else:
        # dead branch while the toggle above stays True
        print "Prepare submission.."
        print "training on full data"
        rd.fit(X,y)
        X_test = X_all[lentrain:]
        # probability of the positive class
        pred = rd.predict_proba(X_test)[:,1]
        testfile = pd.read_csv('../data/test.tsv', sep="\t", na_values=['?'], index_col=1)
        pred_df = pd.DataFrame(pred, index=testfile.index, columns=['label'])
        submname = 'submission_%s' % (datetime.datetime.today().strftime("%Y%m%d_%H%M%S"),)
        #print submname
        pred_df.to_csv('../data/%s.csv' % submname)
        print "%s file created.." % submname
def useTFIDF(): print "TFIDF" trainData = pd.read_csv("data/multinomialTrain.csv", header=0) # dat = trainData[["rating", 'numDet', 'innerPunctuation','avgWordLength', # 'numPresVerb', "numFirstPerson",'numPropNoun', "numOtherNoun", "numWords", "numAdj", # "numPastVerb", "numConj", "exclamationPoints"]] dat = trainData knn = KNeighborsClassifier(n_neighbors=21, weights='distance') scaler = preprocessing.StandardScaler() tfidf = TfidfTransformer() tfidf_scaled_knn = Pipeline([('tfidf', tfidf), ('knn', knn)]) kf = KFold(len(trainData), n_folds=3, shuffle=True) for train, test in kf: trainX, trainy = transform_sklearn_dictionary(transform_csv(dat.iloc[train], target_col="rating", ignore_cols=["01v234", "2v34", "words","words_nostopwords", "review", 'numDet', 'innerPunctuation','avgWordLength','numPresVerb', "numFirstPerson",'numPropNoun', "numOtherNoun", "numWords", "numAdj", "numPastVerb", "numConj", "exclamationPoints"])) testX, testy = transform_sklearn_dictionary(transform_csv(dat.iloc[test], target_col="rating", ignore_cols=["01v234", "2v34", "words","words_nostopwords", "review", 'numDet', 'innerPunctuation','avgWordLength','numPresVerb', "numFirstPerson",'numPropNoun', "numOtherNoun", "numWords", "numAdj", "numPastVerb", "numConj", "exclamationPoints"])) tfidf_scaled_knn.fit(trainX, trainy) print tfidf_scaled_knn.score(testX, testy)
class Classifier(BaseEstimator):
    """AdaBoost over random-forest base estimators, wrapped in a Pipeline.
    Hyper-parameters are stored as attributes for sklearn-style cloning."""

    def __init__(self, rf_max_depth=10, rf_n_estimators=50, n_estimators=50,
                 n_jobs=1):
        self.rf_max_depth = rf_max_depth
        self.rf_n_estimators = rf_n_estimators
        self.n_estimators = n_estimators
        self.n_jobs = n_jobs

    def fit(self, X, y):
        """Assemble the boosted-forest pipeline and fit it; returns self."""
        base = RandomForestClassifier(
            max_depth=self.rf_max_depth,
            n_estimators=self.rf_n_estimators,
            n_jobs=self.n_jobs)
        booster = AdaBoostClassifier(
            base_estimator=base,
            n_estimators=self.n_estimators)
        self.clf = Pipeline([('rf', booster)])
        self.clf.fit(X, y)
        return self

    def predict(self, X):
        return self.clf.predict(X)

    def predict_proba(self, X):
        return self.clf.predict_proba(X)
def allFeatureClassify(cosine=False): print "AllFeatureClassifier" if cosine: print "Cosine" trainData = pd.read_csv("data/multinomialTrain.csv", header=0) # dat = trainData[["rating", 'numDet', 'innerPunctuation','avgWordLength', # 'numPresVerb', "numFirstPerson",'numPropNoun', "numOtherNoun", "numWords", "numAdj", # "numPastVerb", "numConj", "exclamationPoints"]] dat = trainData if cosine: knn = KNeighborsClassifier(n_neighbors=21, metric=pairwise.cosine_similarity) else: knn = KNeighborsClassifier(n_neighbors=21) scaler = preprocessing.StandardScaler() scaled_knn = Pipeline([('scaler', scaler), ('knn', knn)]) kf = KFold(len(trainData), n_folds=3, shuffle=True) for train, test in kf: trainX, trainy = transform_sklearn_dictionary(transform_csv(dat.iloc[train], target_col="rating", ignore_cols=["01v234", "2v34", "words", "words_nostopwords", "review"])) testX, testy = transform_sklearn_dictionary(transform_csv(dat.iloc[test], target_col="rating", ignore_cols=["01v234", "2v34", "words", "words_nostopwords", "review"])) scaled_knn.fit(trainX, trainy) print scaled_knn.score(testX, testy)
def run(training, validation, k, config=None):
    """Train a one-vs-rest SGD (log-loss) text classifier and compute
    ranking-based multi-label metrics over the validation set.

    FIX: removed dead code — a `refs` indicator array that was built and
    immediately rebound to a set, and an unused `n_iter` computation.

    Returns (isError, OneError, nDocs, margins, AP).
    """
    isError, OneError, nDocs = 0, 0, 0
    margins, AP = [], []
    class_index = Index()
    traindocs, train_X, train_y = zip(*load_data(training, class_index))
    testdocs, test_X, test_y = zip(*load_data(validation, class_index))
    clf = SGDClassifier(alpha=.000001, loss='log', n_iter=50,
                        penalty='elasticnet')
    #clf = MultinomialNB(alpha=0.000001)
    classifier = Pipeline([
        ('vectorizer', CountVectorizer(min_df=1, max_df=1.0,
                                       analyzer=lambda t: t)),
        ('tfidf', TfidfTransformer(norm='l2')),
        ('clf', OneVsRestClassifier(clf, n_jobs=-1))])
    classifier.fit(train_X, train_y)
    predictions = classifier.predict_proba(test_X)
    for j, prediction in enumerate(predictions):
        nDocs += 1
        # class indices ranked by predicted probability, best first
        preds = sorted(range(len(prediction)),
                       key=lambda i: prediction[i], reverse=True)
        refs = set(test_y[j])
        ap = average_precision(preds, refs)
        AP.append(ap)
        isError += is_error(ap)
        OneError += one_error(preds, refs)
        margins.append(margin(preds, refs))
    return isError, OneError, nDocs, margins, AP
def cross_validation(self, X, Y, n_folds=10):
    """
    n-fold cross validation to get the best classifier.

    Trains a fresh count/tf-idf/LinearSVC pipeline per fold and keeps
    the one with the highest held-out accuracy. The fold split assumes
    KFold yields contiguous, sorted index blocks (it slices with
    cv[0]..cv[-1]).

    Returns (best_classifier, training_accuracy, best_accuracy) where
    training_accuracy belongs to the winning fold's classifier.
    """
    kf = KFold(len(X), n_folds=n_folds)
    best_accuracy = -1  # sentinel: any real accuracy beats it
    training_accuracy = 0
    for train, cv in kf:
        classifier = Pipeline([('vect', CountVectorizer()),
                               ('tfidf', TfidfTransformer()),
                               ('svm', LinearSVC(C=1))])
        # forms the training set: everything outside the held-out block
        X_train = []
        X_train.extend(X[0:cv[0]])
        # FIX: was X[cv[-1]:], which leaked the last held-out example
        # (index cv[-1] belongs to the cv fold) into the training set.
        X_train.extend(X[cv[-1]+1:])
        Y_train = []
        Y_train.extend(Y[0:cv[0]])
        Y_train.extend(Y[cv[-1]+1:])
        X_cv = X[cv[0]:cv[-1]+1]
        Y_cv = Y[cv[0]:cv[-1]+1]
        classifier.fit(X_train, Y_train)
        accuracy = self.__accuracy(classifier, X_cv, Y_cv)
        if accuracy > best_accuracy:
            best_classifier = classifier
            best_accuracy = accuracy
            training_accuracy = self.__accuracy(
                classifier, X_train, Y_train)
    return best_classifier, training_accuracy, best_accuracy
def main():
    """Build a multi-label (sentence -> categories) training set from the
    input files, then fit and persist a tf-idf one-vs-rest LinearSVC."""
    data = import_files(filenames)
    # invert category -> sentences into sentence -> [categories]
    sentences = defaultdict(lambda: [])
    for cat in data:
        if cat == 'yn':  # yes/no questions are excluded
            continue
        for sentence in data[cat]:
            sentences[sentence].append(cat)
    # materialize parallel feature/label sequences (same dict order)
    X_data = np.array(list(sentences))
    y_data = [sentences[s] for s in sentences]
    X_train = X_data
    y_train = y_data
    classifier = Pipeline([
        ('vectorizer', TfidfVectorizer()),
        ('clf', OneVsRestClassifier(LinearSVC()))])
    classifier.fit(X_train, y_train)
    save_classifier(classifier, outfile)
def pipeline_test(params, data_path, dataset):
    """Fit the configured preprocessing/classification pipeline on the
    dataset's train.arff and return the test-set error percentage."""
    train_path = os.path.expanduser(os.path.join(data_path, dataset, 'train.arff'))
    X_train, y_train = load_arff_data(train_path)
    test_path = os.path.expanduser(os.path.join(data_path, dataset, 'test.arff'))
    X_test, y_test = load_arff_data(test_path)
    dpr = get_data_preprocessor_rescaling(params)
    params = get_data_preprocessor_balancing(params, y_train)
    fp = get_feature_preprocessor(params)
    clf = get_classifier(params)
    # assemble only the stages that were actually configured
    steps = []
    if dpr is not None:
        steps.append(('data_preprocessor_rescaling', dpr))
    if fp is not None:
        steps.append(('feature_preprocessor', fp))
    steps.append(('classifier', clf))
    ppl = Pipeline(steps)
    ppl.fit(X_train, y_train)
    score = accuracy_score(y_test, ppl.predict(X_test))
    # convert accuracy into an error percentage
    return 100.0 - 100.0 * score
def test_sklearn_pipeline(self):
    """Round-trip a VWClassifier through a sklearn Pipeline: fit,
    predict_proba, joblib-dump, reload, predict again. (Python 2.)"""
    t = vw.VWClassifier(target="target")
    # two alternating synthetic feature rows, duplicated ~50x each
    f1 = {"target":0,"b":1.0,"c":0}
    f2 = {"target":1,"b":0,"c":2.0}
    fs = []
    for i in range (1,50):
        fs.append(f1)
        fs.append(f2)
    print "features=>",fs
    df = pd.DataFrame.from_dict(fs)
    estimators = [("vw",t)]
    p = Pipeline(estimators)
    print "fitting"
    # target column is inside df (declared via target="target" above)
    p.fit(df)
    print "get preds 1 "
    preds = p.predict_proba(df)
    print preds
    print "-------------------"
    # close the live VW handle before pickling the pipeline
    t.close()
    joblib.dump(p,"/tmp/pipeline/p")
    p2 = joblib.load("/tmp/pipeline/p")
    print "get preds 2"
    df3 = p2.predict_proba(df)
    print df3
    # release the reloaded estimator's VW handle as well
    vw2 = p2._final_estimator
    vw2.close()
random_state=1, max_iter=5, tol=None), 'clf_params': { 'clf__alpha': (0.001, 1.0), } } } for model in models.keys(): print('\nRunning the model - {}'.format(model)) text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', models[model]['clf'])]) text_clf.fit(X_train, y_train) predicted = text_clf.predict(X_test) accuracy = np.mean(predicted == y_test) print('\nFirst run - Accuracy of {} - {}'.format(model, accuracy * 100)) print('\nTuning training parameters') # 4. Auto-tuning the training parameters using Grid Search for both feature extraction and classifier parameters = { 'vect__ngram_range': [(1, 1), (1, 2), (1, 3)], 'vect__stop_words': ['english', None], 'vect__max_df': (0.5, 1.0), 'vect__min_df': (1, 2), 'tfidf__use_idf': (True, False), 'tfidf__smooth_idf': (True, False), 'tfidf__sublinear_tf': (True, False),
def create_and_evaluate_model(args):
    """Hyperopt objective: cross-validate one hyperparameter setting.

    Trains the bucketed prefix-classification pipeline on every CV fold,
    accumulates ROC-AUC across folds, logs per-parameter results to
    ``fout_all`` and returns ``{'loss': -mean_auc, ...}`` for minimisation.

    NOTE(review): relies on many module-level globals (n_splits, dt_prefixes,
    bucket_method, cls_method, method_name, fout_all, dataset_manager, ...) —
    confirm they are initialised before hyperopt calls this.
    """
    global trial_nr
    trial_nr += 1
    start = time.time()
    score = 0
    for cv_iter in range(n_splits):
        dt_test_prefixes = dt_prefixes[cv_iter]
        # Training data = all folds except the current test fold.
        dt_train_prefixes = pd.DataFrame()
        for cv_train_iter in range(n_splits):
            if cv_train_iter != cv_iter:
                dt_train_prefixes = pd.concat(
                    [dt_train_prefixes, dt_prefixes[cv_train_iter]], axis=0)

        # Bucketing prefixes based on control flow
        bucketer_args = {
            'encoding_method': bucket_encoding,
            'case_id_col': dataset_manager.case_id_col,
            'cat_cols': [dataset_manager.activity_col],
            'num_cols': [],
            'random_state': random_state
        }
        if bucket_method == "cluster":
            # Number of clusters is itself a tuned hyperparameter.
            bucketer_args["n_clusters"] = args["n_clusters"]
        bucketer = BucketFactory.get_bucketer(bucket_method, **bucketer_args)
        print(bucketer)
        bucket_assignments_train = bucketer.fit_predict(dt_train_prefixes)
        bucket_assignments_test = bucketer.predict(dt_test_prefixes)

        preds_all = []
        test_y_all = []
        if "prefix" in method_name:
            # Per-bucket AUC accumulator across folds.
            scores = defaultdict(int)
        # Train and evaluate one classifier per test bucket.
        for bucket in set(bucket_assignments_test):
            relevant_train_cases_bucket = dataset_manager.get_indexes(
                dt_train_prefixes)[bucket_assignments_train == bucket]
            relevant_test_cases_bucket = dataset_manager.get_indexes(
                dt_test_prefixes)[bucket_assignments_test == bucket]
            dt_test_bucket = dataset_manager.get_relevant_data_by_indexes(
                dt_test_prefixes, relevant_test_cases_bucket)
            test_y = dataset_manager.get_label_numeric(dt_test_bucket)
            if len(relevant_train_cases_bucket) == 0:
                # No training data for this bucket: fall back to the fold's
                # class ratio as a constant prediction.
                preds = [class_ratios[cv_iter]
                         ] * len(relevant_test_cases_bucket)
            else:
                dt_train_bucket = dataset_manager.get_relevant_data_by_indexes(
                    dt_train_prefixes,
                    relevant_train_cases_bucket)  # one row per event
                train_y = dataset_manager.get_label_numeric(dt_train_bucket)
                if len(set(train_y)) < 2:
                    # Single-class bucket: constant prediction.
                    preds = [train_y[0]] * len(relevant_test_cases_bucket)
                else:
                    feature_combiner = FeatureUnion([
                        (method,
                         EncoderFactory.get_encoder(method,
                                                    **cls_encoder_args))
                        for method in methods
                    ])
                    if cls_method == "rf":
                        cls = RandomForestClassifier(
                            n_estimators=500,
                            max_features=args['max_features'],
                            random_state=random_state)
                    elif cls_method == "xgboost":
                        cls = xgb.XGBClassifier(
                            objective='binary:logistic',
                            n_estimators=500,
                            learning_rate=args['learning_rate'],
                            subsample=args['subsample'],
                            max_depth=int(args['max_depth']),
                            colsample_bytree=args['colsample_bytree'],
                            min_child_weight=int(args['min_child_weight']),
                            seed=random_state)
                    elif cls_method == "logit":
                        # C is searched on a log2 scale.
                        cls = LogisticRegression(C=2**args['C'],
                                                 random_state=random_state)
                    elif cls_method == "svm":
                        cls = SVC(C=2**args['C'],
                                  gamma=2**args['gamma'],
                                  random_state=random_state)

                    # Distance/margin-based classifiers need scaled features.
                    if cls_method == "svm" or cls_method == "logit":
                        pipeline = Pipeline([('encoder', feature_combiner),
                                             ('scaler', StandardScaler()),
                                             ('cls', cls)])
                    else:
                        pipeline = Pipeline([('encoder', feature_combiner),
                                             ('cls', cls)])
                    pipeline.fit(dt_train_bucket, train_y)
                    #pipeline.fit(dt_train_bucket, train_y, cls__early_stopping_rounds=15, cls__eval_set=[[pipeline.named_steps["encoder"].transform(dt_test_bucket), test_y]])
                    if cls_method == "svm":
                        # SVC without probability=True: rank by margin.
                        preds = pipeline.decision_function(dt_test_bucket)
                    else:
                        # Probability of the positive (label == 1) class.
                        preds_pos_label_idx = np.where(cls.classes_ == 1)[0][0]
                        preds = pipeline.predict_proba(
                            dt_test_bucket)[:, preds_pos_label_idx]

            if "prefix" in method_name:
                # Per-bucket AUC; 0.5 (chance) when only one class present.
                auc = 0.5
                if len(set(test_y)) == 2:
                    auc = roc_auc_score(test_y, preds)
                scores[bucket] += auc
            preds_all.extend(preds)
            test_y_all.extend(test_y)

        # Fold-level AUC over all buckets combined.
        score += roc_auc_score(test_y_all, preds_all)
    if "prefix" in method_name:
        # One CSV row per (hyperparameter, bucket) pair, plus a timing row.
        for k, v in args.items():
            for bucket, bucket_score in scores.items():
                fout_all.write(
                    "%s;%s;%s;%s;%s;%s;%s;%s\n" %
                    (trial_nr, dataset_name, cls_method, method_name, bucket,
                     k, v, bucket_score / n_splits))
        fout_all.write("%s;%s;%s;%s;%s;%s;%s;%s\n" %
                       (trial_nr, dataset_name, cls_method, method_name, 0,
                        "processing_time", time.time() - start, 0))
    else:
        for k, v in args.items():
            fout_all.write("%s;%s;%s;%s;%s;%s;%s\n" %
                           (trial_nr, dataset_name, cls_method, method_name,
                            k, v, score / n_splits))
        fout_all.write("%s;%s;%s;%s;%s;%s;%s\n" %
                       (trial_nr, dataset_name, cls_method, method_name,
                        "processing_time", time.time() - start, 0))
    fout_all.flush()
    # hyperopt minimises `loss`, so negate the mean AUC.
    return {'loss': -score / n_splits, 'status': STATUS_OK, 'model': cls}
# Human-readable names for each candidate classifier, aligned with `clfs`.
clfnames = [
    "Multinomial Naive Bayes",
    "Linear SVM",
    "Logistic Regression",
    "Stochastic Gradient Descent",
    "Random Forest",
    "Bagging Random Forest",
    "Gradient Boosting",
    "Ada Boost",
]

# Try every vectorizer/classifier combination in a two-step pipeline and
# report train vs test accuracy for each pairing.
for vectname, vect, in zip(vectnames, vects):
    for clfname, clf in zip(clfnames, clfs):
        pipe = Pipeline([('vect', vect), ('clf', clf)])
        pipe.fit(x_train, y_train)
        pred = pipe.predict(x_test)
        train_acc = metrics.accuracy_score(y_train, pipe.predict(x_train))
        test_acc = metrics.accuracy_score(y_test, pred)
        print("{} + {} - train acc: {} test acc: {} ".format(
            vectname, clfname, train_acc, test_acc))

"""Best result = Tfidf Vect + Linear SVM - train acc: 0.9880763116057234 test acc: 0.7857142857142857"""

# Refit the winning combination step by step so its fitted pieces are
# available individually for later inspection.
tfidf = TfidfVectorizer()
linear_svm = LinearSVC()
tfidf.fit(x_train)
x_train_dtm = tfidf.transform(x_train)
x_test_dtm = tfidf.transform(x_test)
linear_svm.fit(x_train_dtm, y_train)
# NOTE(review): fragment of a larger training loop — dt_train_bucket,
# dt_test_bucket, current_args, train_y, test_y_all, bucketer, methods, etc.
# are defined upstream of this chunk.
feature_combiner = FeatureUnion(
    [(method, EncoderFactory.get_encoder(method, **cls_encoder_args))
     for method in methods])
cls = xgb.XGBClassifier(objective='binary:logistic',
                        n_estimators=500,
                        learning_rate=current_args['learning_rate'],
                        subsample=current_args['subsample'],
                        max_depth=int(current_args['max_depth']),
                        colsample_bytree=current_args['colsample_bytree'],
                        min_child_weight=int(current_args['min_child_weight']),
                        seed=random_state)
pipeline = Pipeline([('encoder', feature_combiner), ('cls', cls)])
pipeline.fit(dt_train_bucket, train_y)

# predict separately for each prefix case
preds = []
test_all_grouped = dt_test_bucket.groupby(dataset_manager.case_id_col)
for _, group in test_all_grouped:
    test_y_all.extend(dataset_manager.get_label_numeric(group))
    # Result discarded — presumably kept for per-case timing side effects;
    # TODO(review): confirm, otherwise this call can be dropped.
    _ = bucketer.predict(group)
    # Probability column of the positive (label == 1) class.
    preds_pos_label_idx = np.where(cls.classes_ == 1)[0][0]
    pred = pipeline.predict_proba(group)[:, preds_pos_label_idx]
# NOTE(review): fragment — the opening of this Pipeline step list (and the
# mutual_info / recurse_importance selector definitions) precede this chunk.
        ),
        (
            "feature_selection",
            # Nested pipeline: mutual-information filter, then recursive
            # importance-based selection.
            Pipeline(
                [
                    ("mutual_info_selector", mutual_info_selector),
                    ("recurse_importance_selector", recurse_importance_selector),
                ]
            ),
        ),
        ("classifier", classifier),
    ]
)

X_train = train.text
y_train = train.label
pipeline.fit(X_train, y_train)

# Training accuracy (optimistic — scored on the data used to fit).
y_pred = pipeline.predict(X_train)
print("Train acc:", accuracy_score(y_train, y_pred))

# Held-out accuracy.
X_test = test.text
y_test = test.label
pred_test = pipeline.predict(X_test)
print("Test acc:", accuracy_score(y_test, pred_test))

# Persist the fitted pipeline for later inference.
dump(pipeline, "data/classification_pipeline.joblib")
X = df_amazon['verified_reviews'] # the features we want to analyze ylabels = df_amazon['feedback'] # the labels, or answers, we want to test against X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3) # Logistic Regression Classifier from sklearn.linear_model import LogisticRegression classifier = LogisticRegression() # Create pipeline using Bag of Words pipe = Pipeline([("cleaner", predictors()), ('vectorizer', bow_vector), ('classifier', classifier)]) # model generation pipe.fit(X_train,y_train) from sklearn import metrics # Predicting with a test dataset predicted = pipe.predict(X_test) # Model Accuracy print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted)) print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted)) print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted)) # Creating our tokenizer function def spacy_tokenizer(sentence):
f1_macro:0.6802467902467904
f1_micro:0.6637426900584795
f1_macro:0.45717351289792896
f1_micro:0.6502923976608187
f1_macro:0.510114468864469
f1_micro:0.6982456140350877
f1_macro:0.5879108391608392
'''
# NOTE(review): the lines above close a triple-quoted results log whose opening
# quote precedes this chunk; do not insert code before the ''' terminator.

########## Evaluate the Best Classifier (Ensemble) ##########
#first just get test prediciton scores
ens_pipe.fit(X_train, y_train)
ens_preds = ens_pipe.predict(X_test)
# Return values discarded; the recorded results are noted inline.
f1_score(ens_preds, y_test, average='micro')  #.7917
f1_score(ens_preds, y_test, average='macro')  #.7363
accuracy_score(ens_preds, y_test)  #.7917

#next get train size vs test prediciton f1 scores
len_list = []
ens_accs = []
# for each subset of train set (loop body continues past this chunk)
for i in range(1, 11):
    # initialize the classifier (fresh instances each iteration)
    svc_sub = svm.SVC(gamma='scale', C=1, kernel="rbf")
    gnb_sub = GaussianNB()
    rf_sub = RandomForestClassifier(n_estimators=10)
## feature scaling + dimension reduction: pca #print "After feature scaling and PCA:" #pipe(estimatorsNb_scaled_pca) print "-----------------------------------------------------" # ============================================================================= # scaling + kbest for params tunning # ============================================================================= # tune parameters # add scaling+ pca into pipepine pipe_Nb = Pipeline(estimatorsNb_scaled_kbest) # fit the pipeline pipe_Nb.fit(features_train, labels_train) # tuning params param_grid_Nb = dict(feature_selection__k=[3, 4, 5, 6, 7, 8, 9, 10]) # use StratifiedKFold to make the classifier more robust!! ### this is a small dataset, with the ratio of poi and non-poi highly unbalanced grid_search_Nb = GridSearchCV(pipe_Nb, param_grid=param_grid_Nb, cv=StratifiedKFold(10)) grid_search_Nb.fit(features_train, labels_train) # get the best Nb clf best_Nb = grid_search_Nb.best_estimator_ # selected featuers:
# NOTE(review): fragment — the pd.read_csv(...) call this line closes starts
# in the previous chunk.
                 header=None)
# Features from column 2 onward; labels in column 1.
X = df.iloc[:, 2:].values
y = df.iloc[:, 1].values
# Encode the string class labels as integers.
le = LabelEncoder()
le.fit(np.unique(y))
y = le.transform(y)
# Split into training and test data (10% held out).
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.10,
                                                    random_state=1)
# Build the pipeline: standardise, then SVC.
pipe_svm = Pipeline([('scl', StandardScaler()),
                     ('clf', SVC(random_state=1))])
# 10-fold cross-validation on the training split only.
scores = cross_val_score(estimator=pipe_svm, X=X_train, y=y_train,
                         cv=10, n_jobs=-1)
print('CV accuracy scores: {}'.format(scores))
print('CV accuracy: {:0.3f} +/- {:0.3f}'.format(np.mean(scores),
                                                np.std(scores)))
# Refit on the full training split and score the held-out test set.
pipe_svm.fit(X_train, y_train)
print('Test accuracy: {:0.3f}'.format(pipe_svm.score(X_test, y_test)))
def main():
    # Train a classifier on the features produced by Prepare_1 and either
    # cross-validate it or write a Kaggle-style submission CSV.
    # NOTE(review): Python 2 print statements and legacy estimator kwargs
    # (SGDClassifier n_iter/rho) — written for an old sklearn.
    (X_all, y, lentrain) = Prepare_1().fit(update=True)
    # The first `lentrain` rows are the labelled training portion.
    X = X_all[:lentrain]
    clf1 = lm.LogisticRegression(penalty='l2', dual=True, tol=0.0001, C=1,
                                 fit_intercept=True, intercept_scaling=1.0,
                                 class_weight=None, random_state=123)
    clf2 = RandomForestClassifier(n_estimators=200, max_depth=24, n_jobs=-1,
                                  random_state=1, verbose=0)
    clf3 = GradientBoostingClassifier(n_estimators=42, max_depth=24,
                                      random_state=1, verbose=2,
                                      subsample=0.9)
    clf4 = svm.SVC(probability=True)
    clf5 = KNeighborsClassifier(n_neighbors=5)
    clf6 = SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1,
                         eta0=0.0, fit_intercept=True, l1_ratio=0.15,
                         learning_rate='optimal', loss='hinge', n_iter=50,
                         n_jobs=1, penalty='elasticnet', power_t=0.5,
                         random_state=None, rho=None, shuffle=False,
                         verbose=0, warm_start=False)
    # Logistic regression is the active estimator; the rest are kept around
    # for experimentation.
    clf = clf1
    """
    selector = RFECVp(clf2,clf2, step=50, cv=4, scoring="roc_auc", verbose=2)
    selector = selector.fit(X, y)
    clf = selector
    """
    # Single-step pipeline; the commented entries are alternative
    # feature-selection / decomposition stages that were tried.
    rd = Pipeline([
        #("selector", SelectPercentile(chi2, percentile=90)),
        #("selector", SelectPercentile(f_classif, percentile=50)),
        #("selector", lm.RandomizedLogisticRegression(C=1, random_state=1, verbose=1)),
        #("pca", PCA(n_components='mle')),
        #("pca", PCA(n_components=500)),
        #("svd", TruncatedSVD(n_components=200, random_state=1 )),
        #("lasso",svm.LinearSVC(C=0.5, penalty="l1", dual=False)),
        ("est", clf)
    ])
    # Flip to False to produce a submission instead of cross-validating.
    if True:
        cv_run(rd, X, y)
        return
    else:
        print "Prepare submission.."
        print "training on full data"
        rd.fit(X, y)
        # Rows beyond `lentrain` are the unlabelled test portion.
        X_test = X_all[lentrain:]
        pred = rd.predict_proba(X_test)[:, 1]
        testfile = pd.read_csv('../data/test.tsv', sep="\t", na_values=['?'],
                               index_col=1)
        pred_df = pd.DataFrame(pred, index=testfile.index, columns=['label'])
        # Timestamped submission name so runs never overwrite each other.
        submname = 'submission_%s' % (
            datetime.datetime.today().strftime("%Y%m%d_%H%M%S"),
        )
        #print submname
        pred_df.to_csv('../data/%s.csv' % submname)
        print "%s file created.." % submname
def text_data_pipeline(text_data):
    """Fit a DictVectorizer -> RandomForestClassifier pipeline.

    Args:
        text_data: (X, y) pair — feature dicts and their labels.

    Returns:
        The fitted Pipeline.
    """
    X, y = text_data
    steps = [
        ('vectorizer', DictVectorizer()),
        ('clf', RandomForestClassifier()),
    ]
    return Pipeline(steps).fit(X, y)
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

iris = datasets.load_iris()
X = iris["data"][:, (2, 3)]  # petal length, petal width
y = (iris["target"] == 2).astype(np.float64)  # 1.0 iff Iris-Virginica

#%% SVM Classification
# Scale first: LinearSVC is sensitive to feature scale.
svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("linear_svc", LinearSVC(C=1, loss="hinge")),
])
svm_clf.fit(X, y)
print(svm_clf.predict([[5.5, 1.7]]))

#%% Nonlinear SVM Classification
from sklearn.preprocessing import PolynomialFeatures

# Degree-3 polynomial features let the linear SVM fit a nonlinear boundary.
polynomial_svm_clf = Pipeline([
    ("poly_features", PolynomialFeatures(degree=3)),
    ("scaler", StandardScaler()),
    ("svm_clf", LinearSVC(C=10, loss="hinge")),
])
polynomial_svm_clf.fit(X, y)
print(polynomial_svm_clf.predict([[5.5, 1.7]]))
def model_selection(self):
    """
    hyperparameter tuning is performed using GridSearchCV technique
    uses cross-validation when applying the default values of a 5-fold
    cross validation as a means of splitting the training data into a
    training and validation sets.
    model score is represented with the R-squared metrics
    """
    models = []
    models_1 = ["Ridge", "Lasso", "LinearRegression", "PoissonRegressor"]
    models_2 = ["RandomForestRegressor", "GradientBoostingRegressor"]
    model_3 = ["SVR"]
    models += models_1 + models_2 + model_3
    # Map each model name to a fresh estimator instance.
    models_dictionary = {
        "Ridge": Ridge(),
        "Lasso": Lasso(),
        "LinearRegression": LinearRegression(fit_intercept=True),
        "RandomForestRegressor": RandomForestRegressor(random_state=0),
        "GradientBoostingRegressor": GradientBoostingRegressor(random_state=0),
        "SVR": SVR(epsilon=0.5),
        "PoissonRegressor": PoissonRegressor(max_iter=200)
    }
    models_score = {}
    # Tuning of parameters for regression by cross-validation
    # Number of cross validations is 5 (GridSearchCV default)
    for model in models:
        if model in models_1:
            # Linear family: scale -> PCA -> regressor; grid over PCA
            # dimensionality, scaler choice and (except LinearRegression)
            # the regularisation strength alpha.
            pipe = Pipeline([
                ('scaler', StandardScaler()),
                ('reduce_dim', PCA()),
                ('regressor', models_dictionary[model])
            ])
            pipe = pipe.fit(self.X_train, self.y_train)
            n_features_to_test = np.arange(1, 13)
            alpha_to_test = 2.0**np.arange(-6, +6)
            if model == "LinearRegression":
                params = {'reduce_dim__n_components': n_features_to_test,
                          'scaler': [StandardScaler(), RobustScaler()]}
            else:
                params = {'reduce_dim__n_components': n_features_to_test,
                          'regressor__alpha': alpha_to_test,
                          'scaler': [StandardScaler(), RobustScaler()]}
            gridsearch = GridSearchCV(pipe, params,
                                      verbose=1).fit(self.X_train,
                                                     self.y_train)
        elif model in models_2:
            # Tree ensembles: grid directly over the bare estimator.
            if model == "RandomForestRegressor":
                model_estimator = models_dictionary[model]
                params = {'n_estimators': [20, 30, 40, 60, 80, 100],
                          'max_depth': [5, 10, 15, 20],
                          'max_features': [2, 5, 8]}
            else:
                model_estimator = models_dictionary[model]
                params = {'learning_rate': [0.01, 0.02, 0.03, 0.04],
                          'subsample': [0.9, 0.5, 0.2, 0.1],
                          'n_estimators': [20, 30, 40, 60, 80, 100],
                          'max_depth': [4, 6, 8, 10]}
            gridsearch = GridSearchCV(estimator=model_estimator,
                                      param_grid=params,
                                      n_jobs=-1).fit(self.X_train,
                                                     self.y_train)
        else:
            # SVR: grid over kernel width and regularisation strength.
            parameters = {'gamma': [1e-4, 1e-3, 0.01, 0.1, 0.2, 0.5, 0.6, 0.9],
                          'C': [1, 2.5, 5, 7.5, 10, 15]}
            gridsearch = GridSearchCV(models_dictionary[model],
                                      parameters).fit(self.X_train,
                                                      self.y_train)
        print(" Results from Grid Search:", model)
        print("\n The best estimator across ALL searched params:\n",
              gridsearch.best_estimator_)
        print("\n The best score across ALL searched params:\n",
              gridsearch.best_score_)
        print("\n The best parameters across ALL searched params:\n",
              gridsearch.best_params_)
        print('\n Final score is: ', gridsearch.score(self.X_test,
                                                      self.y_test))
        print("")
        # R^2 on the held-out test split, recorded per model.
        models_score[model] = gridsearch.score(self.X_test, self.y_test)
    self.models_score = models_score
# Two candidate text vectorizers built on the shared tokenizer.
count_vector = CountVectorizer(tokenizer=tokenizer, ngram_range=(1, 1))
tfidf_vector = TfidfVectorizer(tokenizer=tokenizer)

# In[6]: raw text in column 0, labels in column 1
X = df.iloc[:, 0]
y = df.iloc[:, 1]
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.15)

# In[7]: clean -> count-vectorize -> multinomial naive Bayes
mnb = MultinomialNB()
pipmodel = Pipeline([
    ('cleaner', customCleaner()),
    ('counter', count_vector),
    ('model', mnb),
])
pipmodel.fit(xtrain, ytrain)

# In[8]: evaluate on the held-out split ...
ypred = pipmodel.predict(xtest)
cm = confusion_matrix(ytest, ypred)
print(cm)
cr = classification_report(ytest, ypred)
print(cr)
accuracy = accuracy_score(ytest, ypred)
print(accuracy)

# ... and with repeated 5-fold cross-validation over the full data.
cv = RepeatedKFold(n_splits=5, n_repeats=2)
cv_score = cross_val_score(pipmodel, X, y, cv=cv)
print(cv_score.mean())

# In[ ]:
def _show_label_grid(images, labels):
    """Plot up to 100 digit images in a 5x20 grid titled with their labels."""
    plt.figure(figsize=(20, 20))
    for index, (image, label) in enumerate(zip(images, labels)):
        plt.subplot(5, 20, index + 1)
        plt.axis("off")
        plt.imshow(np.reshape(image, (28, 28)), cmap=plt.cm.gray)
        plt.title(label, fontsize=20)
    plt.show()


# Visualise the custom clustering result on the first 100 training digits.
_show_label_grid(train_images[0:100], clusterAssignement[0:100])

## comparison to the sklearn algorithm
pca = PCA(n_components=10)
kmeans = KMeans(n_clusters=10, n_init=1)
predictor = Pipeline([('pca', pca), ('kmeans', kmeans)])
predict = predictor.fit(test_images).predict(test_images)

# Fraction of cluster ids that coincide with the true labels.
# NOTE: cluster ids are arbitrary, so this is only a rough indication.
acc = 0
for i in range(len(predict)):
    acc += predict[i] == test_labels[i]
print("accuracy = ", acc / len(predict))

# Same grid, now titled with the sklearn PCA+KMeans assignments.
_show_label_grid(train_images[0:100], predict[0:100])
# Random forest classifier
classifier = RandomForestClassifier(n_estimators=50, max_depth=4)

# Machine learning pipeline: univariate feature selection, then the forest.
pipeline_classifier = Pipeline([
    ('selector', selector_k_best),
    ('rf', classifier),
])

# Step parameters can be overridden after construction via the
# '<step>__<param>' naming scheme: keep the best 6 features and shrink the
# forest to 25 trees.
pipeline_classifier.set_params(selector__k=6, rf__n_estimators=25)

# Train, then predict and score on the same data.
pipeline_classifier.fit(X, y)
prediction = pipeline_classifier.predict(X)
print("Predictions:", prediction)
print("Score:", pipeline_classifier.score(X, y))

# Recover which feature indices the selector kept.
features_status = pipeline_classifier.named_steps['selector'].get_support()
selected_features = [count for count, item in enumerate(features_status) if item]
def PCA_with_SVM(X_train, Y_train):
    """Fit a PCA -> SVC pipeline on the training data and return it."""
    pipe = Pipeline([('reduce_dim', PCA()), ('clf', SVC())])
    pipe.fit(X_train, Y_train)
    return pipe
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import csv

if __name__ == "__main__":
    X, y, s = load_data('ARTIFICIAL_V3', stats=True)
    print(s)

    # Stratify on combined (class, sequence-length) codes so every fold sees
    # each combination.
    qubits_class = [y[i] * 100 + len(X[i]) for i in range(len(X))]
    splitter = StratifiedKFold(n_splits=5, shuffle=True,
                               random_state=RANDOM_SEED)
    indices = list(splitter.split(X, qubits_class))

    # Histogram features over the arrival-time range, feeding a 2-layer MLP.
    pipeline = Pipeline([
        ("Histogramizer",
         Histogramizer(bins=11,
                       range=(s['first_arrival'], s['last_arrival']))),
        ("Neural network",
         MLPClassifier(hidden_layer_sizes=(33, 33), activation='relu',
                       solver='adam', max_iter=50, tol=0.001, verbose=True)),
    ])

    # Only the first fold is evaluated here.
    for train_idx, test_idx in indices[:1]:
        pipeline.fit(X[train_idx], y[train_idx])
        y_pred = pipeline.predict(X[test_idx])
        print(classification_report(y[test_idx], y_pred, digits=8))
        print(confusion_matrix(y[test_idx], y_pred))
        dp = filter_datapoints(X[test_idx], y[test_idx], y_pred,
                               indices=test_idx)
class lr_model:
    """Wrap a modeling Pipeline with train/validate/test bookkeeping.

    Defaults to linear-regression-style pipelines: ``coef_score`` expects the
    pipeline's final step to expose a ``coef_`` attribute.
    """

    def __init__(self, df, target, pipe_steps):
        """Split *df* into features/target and build the pipeline.

        Args:
            df: source DataFrame.
            target: name of the target column in *df*.
            pipe_steps: list of (name, transformer/estimator) Pipeline steps.
        """
        self.df = df
        self.target = self.df[target]
        self.features = self.df.drop(target, axis=1)
        self.num_features = self.features.select_dtypes(include='number')
        self.nom_features = self.features.select_dtypes(exclude='number')
        self.pipe = Pipeline(pipe_steps)
        # Accumulated per-run scores; one row appended per test_models run.
        self.summary = pd.DataFrame({
            'random_state': [],
            'val_score': [],
            'train_score': [],
            'test_score': []
        })

    def test_models(self, run_time=3):
        """Fit/score the pipeline *run_time* times with random train splits.

        Returns:
            DataFrame with one row (random_state, val/train/test scores)
            per run, accumulated across calls in ``self.summary``.
        """
        for i in np.random.choice(100, run_time, replace=False):
            seed = i
            X = self.features
            y = self.target
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, random_state=seed)
            self.pipe.fit(X_train, y_train)
            val_score = round(
                cross_val_score(self.pipe, X_train, y_train, cv=5).mean(), 2)
            test_score = round(self.pipe.score(X_test, y_test), 2)
            train_score = round(self.pipe.score(X_train, y_train), 2)
            # BUGFIX: DataFrame.append was deprecated and removed in pandas
            # 2.x — build the row explicitly and concatenate instead.
            row = pd.DataFrame([{
                'random_state': i,
                'val_score': val_score,
                'train_score': train_score,
                'test_score': test_score
            }])
            self.summary = pd.concat([self.summary, row], ignore_index=True)
        return self.summary

    def final_model(self):
        pass

    def predictions(self):
        pass

    def coef_score(self):
        """Fit the pipeline once and summarise coefficient impact per feature.

        Returns:
            DataFrame indexed like the feature columns with rounded
            coefficients, feature means/medians, the mean*coefficient
            contribution, and non-zero counts, sorted by contribution.
        """
        X = self.features
        y = self.target
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            random_state=8)
        # BUGFIX: the class never defines self.model (AttributeError) — fit
        # and score through self.pipe instead, and read the coefficients
        # from the pipeline's final estimator.
        self.pipe.fit(X_train, y_train)
        val_score = cross_val_score(self.pipe, X_train, y_train, cv=5).mean()
        test_score = self.pipe.score(X_test, y_test)
        train_score = self.pipe.score(X_train, y_train)
        final_estimator = self.pipe.steps[-1][1]
        summary = pd.DataFrame({
            "coefficients":
            np.transpose([(round(coef, 2))
                          for coef in final_estimator.coef_]),
            'avg_feature_value': self.features.mean(),
            'avg_feature__median': self.features.median()
        })
        summary['avg_change'] = (summary['avg_feature_value'] *
                                 summary['coefficients'])
        summary['count'] = self.features[self.features > 0].count()
        return summary.sort_values('avg_change', ascending=False)
# Take the winning estimator from the grid search run upstream.
best_model_mlp = grid_mlp.best_estimator_
best_model_mlp  # bare expression — notebook cell output

"""Modelo selecionado tem os parametros definidos como **learning_rate='adaptive'** - Mantém a taxa de aprendizagem constante referente ao valor de learning rate inicial enquanto a perda de treinamento continua diminuindo. Cada vez que duas epochs* consecutivas falham em diminuir a perda de treinamento ou falham em aumentar a pontuação de validação, a taxa de aprendizado atual é dividida por 5. **activation='tanh'** - a função tan hiperbólica, retorna f (x) = tanh (x). **solver='sgd'** - 'Sgd' refere-se à descida gradiente estocástica *epochs: é um hiperparâmetro que define o número de vezes que o algoritmo de aprendizado funcionará em todo o conjunto de dados de treinamento. ## Validacao Modelo """

from sklearn.metrics import accuracy_score, classification_report, recall_score

# Refit the best MLP behind the full preprocessing step and evaluate on the
# held-out test split.
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                       ('classifier', best_model_mlp)])
pipe.fit(X_train, y_train.ravel())
pred_test = pipe.predict(X_test)
accuracy_result = accuracy_score(y_test, pred_test)
print(classification_report(y_test, pred_test))

"""Algoritmo obteve uma performance boa, com acuracia de 76 % em teste. Porem é preciso avaliar que neste caso a recomendação de pessoas boas de credito como ruins deve acontecer em uma escala menor. A metrica recall indicou um valor de 89 % de precisão na predição de pessoas boas de credito que realmente era boas de creditc, o que indica uma boa performance graças ao custo alto associado aos falso negativos. """
poly = PolynomialFeatures(2, interaction_only=True, include_bias=False)
pcapoly = PCA(n_components=100)

# Feature pipeline: rescale to [0, 1], then keep 100 principal components.
feaPipeline = Pipeline([
    ("MinMaxScaler", min_max_scaler),
    ("pcapoly", pcapoly),
])
feaPipeline.fit(X_train, Y_train)

# Augment (not replace) the raw features with their PCA projection.
X_train = np.concatenate((X_train, feaPipeline.transform(X_train)), axis=1)
X_valid = np.concatenate((X_valid, feaPipeline.transform(X_valid)), axis=1)

# ==============================================================================
# training and prediction
# ==============================================================================
C0 = 10000
verbose0 = 2
# Five independent SVR instances — presumably one per output column;
# confirm against the downstream fitting loop.
sv = [svm.SVR(C=C0, verbose=verbose0) for _ in range(5)]
scaler = StandardScaler()
# Soft-margin (C=1) vs hard-margin (C=100) linear SVMs behind one scaler.
svm_clf1 = LinearSVC(C=1, loss="hinge", random_state=42)
svm_clf2 = LinearSVC(C=100, loss="hinge", random_state=42)

scaled_svm_clf1 = Pipeline([("scaler", scaler), ("linear_svc", svm_clf1)])
scaled_svm_clf2 = Pipeline([("scaler", scaler), ("linear_svc", svm_clf2)])
scaled_svm_clf1.fit(x, y)
scaled_svm_clf2.fit(x, y)

# In[14]:
# Convert each decision function back to the unscaled feature space so the
# boundaries can be drawn against the raw data.
b1 = svm_clf1.decision_function([-scaler.mean_ / scaler.scale_])
w1 = svm_clf1.coef_[0] / scaler.scale_
svm_clf1.intercept_ = np.array([b1])
svm_clf1.coef_ = np.array([w1])

b2 = svm_clf2.decision_function([-scaler.mean_ / scaler.scale_])
w2 = svm_clf2.coef_[0] / scaler.scale_
svm_clf2.intercept_ = np.array([b2])
svm_clf2.coef_ = np.array([w2])
mlb = MultiLabelBinarizer()
X = corpus
Y = mlb.fit_transform(tag_corp)  # binary indicator matrix, one column per tag

random_state = np.random.RandomState(0)
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=.1, random_state=random_state)

# Two embedding strategies feeding one-vs-rest linear SVMs.
pipe1 = Pipeline([("wordVectz", AverageEmbeddingVectorizer(w2v)),
                  ("multilabel", OneVsRestClassifier(LinearSVC()))])
pipe2 = Pipeline([("wordVectz", TfidfEmbeddingVectorizer(w2v)),
                  ("multilabel", OneVsRestClassifier(LinearSVC()))])

# Only the averaged-embedding pipeline is evaluated below.
pipe1.fit(X_train, y_train)
predicted = pipe1.predict(X_test)
all_labels = mlb.inverse_transform(predicted)
print(all_labels)

accuracy1 = accuracy_score(y_test, predicted)
print("accuracy=", accuracy1)
# print("Evaluation- BOW, SVC")
precision1 = precision_score(y_test, predicted, average='macro')
print(precision1)
ham_loss = hamming_loss(y_test, predicted)
print("hamming loss=", ham_loss)
recall1 = recall_score(y_test, predicted, average='macro')
print("recall=", recall1)
# NOTE(review): fragment — the `conditions` list this `]` closes starts in the
# previous chunk.
]
# create a list of the values we want to assign for each condition
values = ['Positive', 'Neutral', 'Negative']
# create a new column and use np.select to assign values to it using our lists as arguments
df['tier'] = np.select(conditions, values)

# Text in column 2, sentiment tier in the last column.
x = df.iloc[:,2].values
y = df.iloc[:,-1].values

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

# TF-IDF + SVC sentiment model trained on the full data (no held-out split).
text_model = Pipeline([('tfidf',TfidfVectorizer()),('model',SVC())])
text_model.fit(x,y)

select = st.text_input('Enter your message')
# NOTE(review): gating on st.markdown's return value looks unintended —
# confirm a button/submit widget was meant here.
if(st.markdown(
        '<span class="badge badge-pill badge-success"> Badge </span>',
        unsafe_allow_html=True
)):
    op = text_model.predict([select])
    ans=op[0]
    if ans == 'Positive':
        st.success("Positive 🙂")
    if ans == 'Negative':
        st.error("Negative 😠")
    if ans== 'Neutral':
        st.warning("Neutral 😐")
# NOTE(review): fragment — the model/pipeline definition this closes (with the
# encoder taking handle_unknown='ignore') begins in the previous chunk.
                    handle_unknown='ignore')),
                  ('model', XGBClassifier(n_estimators=800,
                                          learning_rate=0.07,
                                          reg_alpha=8,
                                          reg_lambda=0.75,
                                          gamma=3,
                                          max_depth=4))])

logger.info("Predicting score (w/Cross-Val) on X...")
# Out-of-fold probabilities for the positive class.
results = cross_val_predict(model, X, y, cv=cfg["folds"],
                            method='predict_proba')[:, 1]
score = gini_normalized(y, results)
logger.info("normalized gini score on training set is {}".format(score))

# Final fit uses the upsampled training data.
logger.info("Fitting model on upscaled X...")
model.fit(X_up, y_up)

logger.info("Loading and predicting on Test set...")
test = load_file("test")
test.drop(drop_cols, axis=1, inplace=True)
#test = make_missing_zero(test, get_cat_features_idx(test))
test['target'] = model.predict_proba(test)[:, 1]
write_submission_file(test, columns=['target'], name='xgb-imp-ohe-ups2')
logger.info("Finished with time {:.3f} minutes".format(
    (time.time() - start) / 60.0))
def createRandomForest(X, y):
    """Fit a bag-of-words -> TF-IDF -> random-forest pipeline on (X, y)."""
    rf_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('RFC', RandomForestClassifier()),
    ])
    # Pipeline.fit returns the fitted pipeline itself.
    return rf_clf.fit(X, y)
# Apply the same permutation to the labels, then hold out the first 600 rows
# for training and the rest for testing.
y = np.array(y)[perm]
nslice = 600
X_train, X_test = X[0:nslice], X[nslice:]
y_train, y_test = y[0:nslice], y[nslice:]

# Unsupervised RBM feature extractor, pre-trained on test + unlabeled rows.
rbm1 = BernoulliRBM(random_state=seed, verbose=True, n_iter=200,
                    n_components=128)
rbm1.fit(X_test.tolist() + unknowns)

final = MLPClassifier(solver='sgd', hidden_layer_sizes=(64, 2),
                      random_state=seed, max_iter=200,
                      learning_rate="adaptive")
clf = Pipeline(steps=[('rbm1', rbm1), ('final', final)])
model = clf.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

# Score the unlabeled rows and pair them with their passenger ids.
test_ids = flatten(
    pd.read_csv('data/test_ids.csv', sep=',', header=None).values.tolist())
unknown_pred = model.predict(unknowns).tolist()
predictions = pd.DataFrame([test_ids, unknown_pred]).values.T.tolist()
np.savetxt('data/predictions.csv', predictions, fmt='%d', delimiter=',',
           header='PassengerId,Survived')
from sklearn.model_selection import train_test_split

X = df['text']
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Linear SVC over unigram+bigram TF-IDF features.
text_clf_lsvc = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('clf', LinearSVC()),
])
text_clf_lsvc.fit(X_train, y_train)

# Form a prediction set
predictions = text_clf_lsvc.predict(X_test)

from sklearn import metrics

# Confusion matrix, per-class report, then overall accuracy.
print(metrics.confusion_matrix(y_test, predictions))
print(metrics.classification_report(y_test, predictions))
print(metrics.accuracy_score(y_test, predictions))
def run(dataset, features, word2vec, metrics, fname=None):
    """Train a LinearSVC on one hate-speech dataset, evaluate on the other.

    Args:
        dataset: 'fdcl18' trains on FDCL18 and tests on DWMW17; any other
            value reverses the direction.
        features: dict naming which feature extractors to enable (keys like
            'tfidf_vectorizer', 'framenet_pipeline', 'mean_embedding',
            'hatebase_vectorizer', 'transfer_vectorizer'); values carry
            their hyperparameters where applicable.
        word2vec: word-vector model shared by the embedding features.
        metrics: iterable of scorer names understood by ``get_score_func``.
        fname: unused in this body — NOTE(review): confirm callers need it.
    """
    # Select train (df1) vs cross-domain test (df2) corpora; the DWMW17
    # annotation-count columns are dropped whichever side it lands on.
    if dataset == 'fdcl18':
        df1 = load_fdcl18(num_classes=2)
        df2 = load_dwmw17(num_classes=2)
        df2 = df2.drop(
            ['count', 'hate_speech', 'offensive_language', 'neither'],
            axis=1)
    else:
        df1 = load_dwmw17(num_classes=2)
        df2 = load_fdcl18(num_classes=2)
        df1 = df1.drop(
            ['count', 'hate_speech', 'offensive_language', 'neither'],
            axis=1)
    # # Preprocessing: normalise links/mentions, then tokenize, on both sides.
    preprocess = TweetPreprocessor(normalize=['link', 'mention']).preprocess
    tokenize = TweetTokenizer().tokenize
    # # # DF 1 - Preprocessing
    tqdm.pandas(desc='Preprocessing Progress: ')
    df1['clean_tweet'] = df1.tweet.progress_apply(preprocess)
    tqdm.pandas(desc='Tokenizing Progress: ')
    df1['tokens'] = df1.clean_tweet.progress_apply(tokenize)
    # # # DF 2 - Preprocessing
    tqdm.pandas(desc='Preprocessing Progress: ')
    df2['clean_tweet'] = df2.tweet.progress_apply(preprocess)
    tqdm.pandas(desc='Tokenizing Progress: ')
    df2['tokens'] = df2.clean_tweet.progress_apply(tokenize)
    # # # # Feature Extraction: collect (name, transformer, column) triples
    # for a ColumnTransformer, one per enabled feature flag.
    ff = []
    # # # tfidf_pipeline
    if 'tfidf_vectorizer' in features:
        tfidf_kwargs = dict(tokenizer=TweetTokenizer().tokenize,
                            stop_words=stopwords,
                            min_df=.0025,
                            max_df=0.25,
                            ngram_range=(1, 3))
        ff += [('tfidf_vectorizer', TfidfVectorizer(**tfidf_kwargs),
                'clean_tweet')]
    # # # framenet_pipeline: counts over the 'framenet' column reduced to 10
    # SVD components.
    if 'framenet_pipeline' in features:
        count_vectorizer = ('count_vectorizer', CountVectorizer())
        truncated_svd = ('truncated_svd',
                         TruncatedSVD(algorithm='randomized',
                                      n_components=10))
        ff += [('framenet_pipeline',
                Pipeline([count_vectorizer, truncated_svd]), 'framenet')]
    # # # mean_embedding
    if 'mean_embedding' in features:
        ff += [('mean_embedding', mean_embedding(word2vec), 'tokens')]
    # # # hatebase_vectorizer
    if 'hatebase_vectorizer' in features:
        ff += [('hatebase_vectorizer',
                HatebaseVectorizer(features=features['hatebase_vectorizer']),
                'clean_tweet')]
    # # # transfer_vectorizer: features transferred from a TextCNN trained on
    # the training corpus.
    if 'transfer_vectorizer' in features:
        hyper_params = features['transfer_vectorizer']
        hyper_params['module'] = TextCNN
        hyper_params['corpus'] = df1.tokens
        hyper_params['word_vectors'] = word2vec
        # """ # Cross-validate and save predictions
        args = [
            NeuralNetClassifier, hyper_params,
            ['conv_%i' % i for i in range(3)], False
        ]
        ff += [('transfer_vectorizer', TransferVectorizer(*args), 'tokens')]
    # # # estimator
    pipeline = Pipeline([('column_transformer', ColumnTransformer(ff)),
                         ('clf', LinearSVC())])
    # # Grid Search``
    # param_grid = [
    #     {'clf__C': [0.1, 1, 10, 50], 'classifier': linear_svc},
    #     # {'classifier': sgd_classifier},
    # ]
    # gs = GridSearchCV(pipeline, param_grid, cv=5)
    # result = gs.fit(df, df.label).predict(df)
    # # Evaluation: fit on df1, score cross-domain on df2.
    pipeline.fit(df1, df1.label)
    y_true, y_pred = df2.label, pipeline.predict(df2)
    # df2['predictions'] = y_pred
    # """ Print Scores
    pprint({'dataset': dataset, 'features': features})
    scores = {}
    for scorer in metrics:
        scores[scorer] = [get_score_func(scorer)(y_true, y_pred)]
    pprint(scores, type='table')
class MKSHomogenizationModel(MKSStructureAnalysis):

    """
    The `MKSHomogenizationModel` takes in microstructures and their
    associated macroscopic property, and creates a low dimensional structure
    property linkage. The `MKSHomogenizationModel` model is designed to
    integrate with dimensionality reduction techniques and predictive models.

    Attributes:
        degree: Degree of the polynomial used by `property_linker`.
        n_components: Number of components used by `dimension_reducer`.
        dimension_reducer: Instance of a dimensionality reduction class.
        property_linker: Instance of class that maps materials property to the
            microstructures.
        correlations: spatial correlations to be computed
        basis: instance of a basis class
        reduced_fit_data: Low dimensionality representation of spatial
            correlations used to fit the model.
        reduced_predict_data: Low dimensionality representation of spatial
            correlations predicted by the model.
        periodic_axes: axes that are periodic. (0, 2) would indicate that
            axes x and z are periodic in a 3D microstructure.
        coef_: Array of values that are the coefficients.
        intercept_: Value that is the intercept

    Below is an example of using MKSHomogenizationModel to predict (or
    classify) the type of microstructure using PCA and Logistic Regression.

    >>> import numpy as np
    >>> n_states = 3
    >>> domain = [-1, 1]
    >>> from pymks.bases import LegendreBasis
    >>> leg_basis = LegendreBasis(n_states=n_states, domain=domain)
    >>> from sklearn.decomposition import PCA
    >>> from sklearn.linear_model import LogisticRegression
    >>> reducer = PCA(n_components=3)
    >>> linker = LogisticRegression()
    >>> model = MKSHomogenizationModel(
    ...     basis=leg_basis, dimension_reducer=reducer, property_linker=linker)
    >>> from pymks.datasets import make_cahn_hilliard
    >>> X0, X1 = make_cahn_hilliard(n_samples=50)
    >>> y0 = np.zeros(X0.shape[0])
    >>> y1 = np.ones(X1.shape[0])
    >>> X = np.concatenate((X0, X1))
    >>> y = np.concatenate((y0, y1))
    >>> model.fit(X, y)
    >>> X0_test, X1_test = make_cahn_hilliard(n_samples=3)
    >>> y0_test = model.predict(X0_test)
    >>> y1_test = model.predict(X1_test)
    >>> assert np.allclose(y0_test, [0, 0, 0])
    >>> assert np.allclose(y1_test, [1, 1, 1])
    """

    @deprecate
    def __init__(self, basis=None, dimension_reducer=None, n_components=None,
                 property_linker=None, degree=1, periodic_axes=None,
                 correlations=None, compute_correlations=True, n_jobs=1,
                 store_correlations=False, mean_center=True):
        """
        Create an instance of a `MKSHomogenizationModel`.

        Args:
            basis (class, optional): an instance of a bases class.
            dimension_reducer (class, optional): an instance of a
                dimensionality reduction class with a fit_transform method.
                The default class is PCA.
            property_linker (class, optional): an instance for a machine
                learning class with fit and predict methods.
            n_components (int, optional): number of components kept by the
                dimension_reducer
            degree (int, optional): degree of the polynomial used by
                property_linker.
            periodic_axes (list, optional): axes that are periodic. (0, 2)
                would indicate that axes x and z are periodic in a 3D
                microstrucure.
            correlations (list, optional): list of spatial correlations to
                compute, default is the autocorrelation with the first local
                state and all of its cross correlations. For example if basis
                has basis.n_states=3, correlation would be [(0, 0), (0, 1),
                (0, 2)]. If n_states=[0, 2, 4], the default correlations are
                [(0, 0), (0, 2), (0, 4)] corresponding to the autocorrelations
                for the 0th local state, and the cross correlations with the
                0 and 2 as well as 0 and 4.
            compute_correlations (boolean, optional): If false spatial
                correlations will not be calculated as part of the fit and
                predict methods. The spatial correlations can be passed as
                `X` to both methods, default is True.
            n_jobs (int, optional): number of parallel jobs to run, only used
                if pyfftw is installed.
            store_correlations (boolean, optional): indicate if spatial
                correlations should be stored
            mean_center (boolean, optional): If true the data will be mean
                centered before dimensionality reduction is computed.
        """
        if property_linker is None:
            property_linker = LinearRegression()
        # Polynomial feature expansion feeds the user-supplied regressor; the
        # degree/property_linker property setters below mutate this pipeline,
        # so it must be created first.
        self._linker = Pipeline([('poly', PolynomialFeatures(degree=degree)),
                                 ('connector', property_linker)])
        self.degree = degree
        self.property_linker = property_linker
        # Duck-type validation: any object with fit()/predict() is accepted.
        if not callable(getattr(self.property_linker, "fit", None)):
            raise RuntimeError("property_linker does not have fit() method.")
        if not callable(getattr(self.property_linker, "predict", None)):
            raise RuntimeError(
                "property_linker does not have predict() method.")
        self.compute_correlations = compute_correlations
        self.reduced_fit_data = None
        self.reduced_predict_data = None
        if self.compute_correlations:
            # A basis is only mandatory when this model computes the 2-point
            # statistics itself.
            if basis is None:
                raise RuntimeError(('a basis is need to compute spatial ') +
                                   ('correlations'))
        super(MKSHomogenizationModel,
              self).__init__(store_correlations=store_correlations,
                             dimension_reducer=dimension_reducer,
                             correlations=correlations, n_jobs=n_jobs,
                             n_components=n_components, basis=basis,
                             mean_center=mean_center,
                             periodic_axes=periodic_axes)

    @property
    def n_components(self):
        return self._n_components

    @n_components.setter
    def n_components(self, value):
        """Setter for the number of components using by the dimension_reducer
        """
        # Kept in sync with the reducer so changing one changes both.
        self._n_components = value
        self.dimension_reducer.n_components = value

    @property
    def degree(self):
        return self._degree

    @degree.setter
    def degree(self, value):
        """Setter for the polynomial degree for property_linker.
        """
        # Propagate the new degree into the PolynomialFeatures step.
        self._degree = value
        self._linker.set_params(poly__degree=value)

    @property
    def coef_(self):
        # Exposed from the wrapped regressor ('connector' pipeline step).
        return self._linker.named_steps['connector'].coef_

    @coef_.setter
    def coef_(self, coef):
        """Setter for the coefficients for property_linker.
        """
        self._linker.named_steps['connector'].coef_ = coef

    @property
    def intercept_(self):
        # Exposed from the wrapped regressor ('connector' pipeline step).
        return self._linker.named_steps['connector'].intercept_

    @intercept_.setter
    def intercept_(self, intercept):
        """Setter for the intercept for property_linker.
        """
        self._linker.named_steps['connector'].intercept_ = intercept

    @property
    def property_linker(self):
        return self._property_linker

    @property_linker.setter
    def property_linker(self, prop_linker):
        """Setter for the property_linker class.
        """
        # Swap the regressor inside the internal pipeline as well.
        self._property_linker = prop_linker
        self._linker.set_params(connector=prop_linker)

    def fit(self, X, y, reduce_labels=None,
            confidence_index=None, size=None):
        """
        Fits data by calculating 2-point statistics from X, performing
        dimension reduction using dimension_reducer, and fitting the reduced
        data with the property_linker.

        Args:
            X (ND array): The microstructures or spatial correlations, a
                `(n_samples, n_x, ...)` shaped array where `n_samples` is the
                number of samples and `n_x` is the spatial discretization.
            y (1D array): The material property associated with `X`.
            reducer_labels (1D array, optional): label for X used during the
                fit_transform method for the `dimension_reducer`.
            confidence_index (ND array, optional): array with same shape as X
                used to assign a confidence value for each data point.

        Example

        Let's first start with using the microstructure and effective
        properties.

        >>> import numpy as np
        >>> from sklearn.decomposition import PCA
        >>> from sklearn.linear_model import LinearRegression
        >>> from pymks.bases import PrimitiveBasis
        >>> from pymks.stats import correlate
        >>> reducer = PCA(n_components=2)
        >>> linker = LinearRegression()
        >>> prim_basis = PrimitiveBasis(n_states=2, domain=[0, 1])
        >>> correlations = [(0, 0), (1, 1), (0, 1)]
        >>> model = MKSHomogenizationModel(prim_basis,
        ...                                dimension_reducer=reducer,
        ...                                property_linker=linker,
        ...                                correlations=correlations)
        >>> np.random.seed(99)
        >>> X = np.random.randint(2, size=(3, 15))
        >>> y = np.array([1, 2, 3])
        >>> model.fit(X, y)
        >>> X_stats = correlate(X, prim_basis)
        >>> X_reshaped = X_stats.reshape((X_stats.shape[0], -1))
        >>> X_pca = reducer.fit_transform(X_reshaped - np.mean(X_reshaped,
        ...                               axis=1)[:, None])
        >>> assert np.allclose(model.reduced_fit_data, X_pca)

        Now let's use the same method with spatial correlations instead of
        microstructures.

        >>> from sklearn.decomposition import PCA
        >>> from sklearn.linear_model import LinearRegression
        >>> from pymks.bases import PrimitiveBasis
        >>> from pymks.stats import correlate
        >>> reducer = PCA(n_components=2)
        >>> linker = LinearRegression()
        >>> prim_basis = PrimitiveBasis(n_states=2, domain=[0, 1])
        >>> correlations = [(0, 0), (1, 1), (0, 1)]
        >>> model = MKSHomogenizationModel(dimension_reducer=reducer,
        ...                                property_linker=linker,
        ...                                compute_correlations=False)
        >>> np.random.seed(99)
        >>> X = np.random.randint(2, size=(3, 15))
        >>> y = np.array([1, 2, 3])
        >>> X_stats = correlate(X, prim_basis, correlations=correlations)
        >>> model.fit(X_stats, y)
        >>> X_reshaped = X_stats.reshape((X_stats.shape[0], X_stats[0].size))
        >>> X_pca = reducer.fit_transform(X_reshaped - np.mean(X_reshaped,
        ...                               axis=1)[:, None])
        >>> assert np.allclose(model.reduced_fit_data, X_pca)
        """
        if self.compute_correlations:
            # `size` lets the caller override the spatial shape before the
            # 2-point statistics are computed.
            if size is not None:
                X = self.basis._reshape_feature(X, size)
            X = self._compute_stats(X, confidence_index)
        X_reshape = self._reduce_shape(X)
        X_reduced = self._fit_transform(X_reshape, reduce_labels)
        self._linker.fit(X_reduced, y)

    def predict(self, X, confidence_index=None):
        """Predicts macroscopic property for the microstructures `X`.

        Args:
            X (ND array): The microstructure, an `(n_samples, n_x, ...)`
                shaped array where `n_samples` is the number of samples and
                `n_x` is the spatial discretization.
            confidence_index (ND array, optional): array with same shape as X
                used to assign a confidence value for each data point.

        Returns:
            The predicted macroscopic property for `X`.

        Example

        >>> import numpy as np
        >>> from sklearn.manifold import LocallyLinearEmbedding
        >>> from sklearn.linear_model import BayesianRidge
        >>> from pymks.bases import PrimitiveBasis
        >>> np.random.seed(1)
        >>> X = np.random.randint(2, size=(50, 100))
        >>> y = np.random.random(50)
        >>> reducer = LocallyLinearEmbedding()
        >>> linker = BayesianRidge()
        >>> prim_basis = PrimitiveBasis(2, domain=[0, 1])
        >>> model = MKSHomogenizationModel(prim_basis, n_components=2,
        ...                                dimension_reducer=reducer,
        ...                                property_linker=linker)
        >>> model.fit(X, y)
        >>> X_test = np.random.randint(2, size=(1, 100))

        Predict with microstructures

        >>> y_pred = model.predict(X_test)

        Predict with spatial correlations

        >>> from pymks.stats import correlate
        >>> model.compute_correlations = False
        >>> X_corr = correlate(X, prim_basis, correlations=[(0, 0)])
        >>> model.fit(X_corr, y)
        >>> X_corr_test = correlate(X_test, prim_basis,
        ...                         correlations=[(0, 0)])
        >>> y_pred_stats = model.predict(X_corr_test)
        >>> assert np.allclose(y_pred_stats, y_pred, atol=1e-3)
        """
        # Guard: the fitted regressor exposes coef_ only after fit().
        if not hasattr(self._linker.get_params()['connector'], "coef_"):
            raise RuntimeError('fit() method must be run before predict().')
        _size = self._size_axes(self.basis)
        X = self.basis._reshape_feature(X, tuple(_size))
        if self.compute_correlations is True:
            X = self._compute_stats(X, confidence_index)
        X_reduced = self._transform(X)
        # Cache the low-dimensional representation for later inspection.
        self.reduced_predict_data = X_reduced
        return self._linker.predict(X_reduced)

    def score(self, X, y, confidence_index=None):
        """
        The score function for the MKSHomogenizationModel. It formats the
        data and uses the score method from the property_linker.

        Args:
            X (ND array): The microstructure, an `(n_samples, n_x, ...)`
                shaped array where `n_samples` is the number of samples and
                `n_x` is the spatial discretization.
            y (1D array): The material property associated with `X`.
            confidence_index (ND array, optional): array with same shape as X
                used to assign a confidence value for each data point.

        Returns:
             Score for MKSHomogenizationModel from the selected
             property_linker.
        """
        # Not every regressor implements score(); fail loudly if missing.
        if not callable(getattr(self._linker, "score", None)):
            raise RuntimeError("property_linker does not have score() method.")
        _size = self._size_axes(self.basis)
        X = self.basis._reshape_feature(X, _size)
        if self.compute_correlations:
            X = self._compute_stats(X, confidence_index)
        X_reduced = self._transform(X)
        return self._linker.score(X_reduced, y)

    def _size_axes(self, basis):
        """Helper function used to get the correct size of the axes when
        using for both periodic and non-periodic axes.
        """
        # NOTE(review): starts from the stored axes shape and halves the
        # non-periodic axes — presumably because non-periodic statistics are
        # computed on padded (doubled) axes in MKSStructureAnalysis; confirm
        # against the parent class.
        _size = self.basis._axes_shape
        if self.periodic_axes is None or len(self.periodic_axes) != len(_size):
            _axes = list(range(len(_size)))
            if self.periodic_axes is not None:
                [_axes.remove(a) for a in self.periodic_axes]
            _size = np.ones(len(_size), dtype=int) * _size
            _size[_axes] = _size[_axes] // 2
        return tuple(_size)