def test_multi_output_predict_proba():
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5, tol=1e-3)
    param = {'loss': ('hinge', 'log', 'modified_huber')}

    # inner function for custom scoring
    def custom_scorer(estimator, X, y):
        if hasattr(estimator, "predict_proba"):
            return 1.0
        else:
            return 0.0

    grid_clf = GridSearchCV(sgd_linear_clf, param_grid=param,
                            scoring=custom_scorer, cv=3, error_score=np.nan)
    multi_target_linear = MultiOutputClassifier(grid_clf)
    multi_target_linear.fit(X, y)
    multi_target_linear.predict_proba(X)

    # SGDClassifier defaults to loss='hinge' which is not a probabilistic
    # loss function; therefore it does not expose a predict_proba method
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5, tol=1e-3)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
    multi_target_linear.fit(X, y)
    err_msg = "The base estimator should implement predict_proba method"
    with pytest.raises(ValueError, match=err_msg):
        multi_target_linear.predict_proba(X)
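# Aside: a minimal standalone sketch of the behaviour the comment above
# describes. SGDClassifier only exposes predict_proba for probabilistic
# losses (the 'log' spelling used here matches older scikit-learn releases;
# newer ones call it 'log_loss').
from sklearn.linear_model import SGDClassifier

print(hasattr(SGDClassifier(loss='hinge'), 'predict_proba'))  # False
print(hasattr(SGDClassifier(loss='log'), 'predict_proba'))    # True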
def test_multi_output_classification():
    # test if multi_target initializes correctly with base estimator and fit
    # assert predictions work as expected for predict, predict_proba and score
    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    multi_target_forest = MultiOutputClassifier(forest)

    # train the multi_target_forest and also get the predictions.
    multi_target_forest.fit(X, y)
    predictions = multi_target_forest.predict(X)
    assert_equal((n_samples, n_outputs), predictions.shape)

    predict_proba = multi_target_forest.predict_proba(X)

    assert len(predict_proba) == n_outputs
    for class_probabilities in predict_proba:
        assert_equal((n_samples, n_classes), class_probabilities.shape)

    assert_array_equal(np.argmax(np.dstack(predict_proba), axis=1),
                       predictions)

    # train the forest with each column and assert that predictions are equal
    for i in range(3):
        forest_ = clone(forest)  # create a clone with the same state
        forest_.fit(X, y[:, i])
        assert_equal(list(forest_.predict(X)), list(predictions[:, i]))
        assert_array_equal(list(forest_.predict_proba(X)),
                           list(predict_proba[i]))
def test_multiclass_multioutput_estimator_predict_proba():
    seed = 542

    # make test deterministic
    rng = np.random.RandomState(seed)

    # random features
    X = rng.normal(size=(5, 5))

    # random labels
    y1 = np.array(['b', 'a', 'a', 'b', 'a']).reshape(5, 1)  # 2 classes
    y2 = np.array(['d', 'e', 'f', 'e', 'd']).reshape(5, 1)  # 3 classes

    Y = np.concatenate([y1, y2], axis=1)

    clf = MultiOutputClassifier(LogisticRegression(random_state=seed))
    clf.fit(X, Y)

    y_result = clf.predict_proba(X)
    y_actual = [np.array([[0.23481764, 0.76518236],
                          [0.67196072, 0.32803928],
                          [0.54681448, 0.45318552],
                          [0.34883923, 0.65116077],
                          [0.73687069, 0.26312931]]),
                np.array([[0.5171785, 0.23878628, 0.24403522],
                          [0.22141451, 0.64102704, 0.13755846],
                          [0.16751315, 0.18256843, 0.64991843],
                          [0.27357372, 0.55201592, 0.17441036],
                          [0.65745193, 0.26062899, 0.08191907]])]

    for i in range(len(y_actual)):
        assert_almost_equal(y_result[i], y_actual[i])
def test_multi_output_exceptions():
    # NotFittedError when fit is not done but score, predict and
    # predict_proba are called
    moc = MultiOutputClassifier(LinearSVC(random_state=0))
    assert_raises(NotFittedError, moc.predict, y)
    assert_raises(NotFittedError, moc.predict_proba, y)
    assert_raises(NotFittedError, moc.score, X, y)

    # ValueError when number of outputs is different
    # for fit and score
    y_new = np.column_stack((y1, y2))
    moc.fit(X, y)
    assert_raises(ValueError, moc.score, X, y_new)
def test_multiclass_multioutput_estimator():
    # test to check meta of meta estimators
    svc = LinearSVC(random_state=0)
    multi_class_svc = OneVsRestClassifier(svc)
    multi_target_svc = MultiOutputClassifier(multi_class_svc)

    multi_target_svc.fit(X, y)
    predictions = multi_target_svc.predict(X)
    assert_equal((n_samples, n_outputs), predictions.shape)

    # train the classifier with each column and assert that predictions are equal
    for i in range(3):
        multi_class_svc_ = clone(multi_class_svc)  # create a clone
        multi_class_svc_.fit(X, y[:, i])
        assert_equal(list(multi_class_svc_.predict(X)),
                     list(predictions[:, i]))
def test_multi_output_classification_partial_fit_sample_weights():
    # weighted classifier
    Xw = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    yw = [[3, 2], [2, 3], [3, 2]]
    w = np.asarray([2., 1., 1.])
    sgd_linear_clf = SGDClassifier(random_state=1)
    clf_w = MultiOutputClassifier(sgd_linear_clf)
    clf_w.fit(Xw, yw, w)

    # unweighted, but with repeated samples
    X = [[1, 2, 3], [1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    y = [[3, 2], [3, 2], [2, 3], [3, 2]]
    sgd_linear_clf = SGDClassifier(random_state=1)
    clf = MultiOutputClassifier(sgd_linear_clf)
    clf.fit(X, y)
    X_test = [[1.5, 2.5, 3.5]]
    assert_array_almost_equal(clf.predict(X_test), clf_w.predict(X_test))
def test_multi_output_classification_sample_weights():
    # weighted classifier
    Xw = [[1, 2, 3], [4, 5, 6]]
    yw = [[3, 2], [2, 3]]
    w = np.asarray([2., 1.])
    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    clf_w = MultiOutputClassifier(forest)
    clf_w.fit(Xw, yw, w)

    # unweighted, but with repeated samples
    X = [[1, 2, 3], [1, 2, 3], [4, 5, 6]]
    y = [[3, 2], [3, 2], [2, 3]]
    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    clf = MultiOutputClassifier(forest)
    clf.fit(X, y)
    X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
    assert_almost_equal(clf.predict(X_test), clf_w.predict(X_test))
    bestcg.append([c, g, score.mean()])

bestCG = pd.DataFrame(bestcg, columns=['c', 'g', 'score'])
print("Best score is", max(bestCG["score"]))
bestCG.loc[bestCG["score"] == max(bestCG["score"])]

# transformation
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
StrainX = sc.fit_transform(trainX)
StestX = sc.transform(testX)

# training and fitting model
bii_model = MultiOutputClassifier(
    OneVsRestClassifier(SVC(C=100.0, gamma=1.0), n_jobs=-1), n_jobs=-1)
bii_Smodel = MultiOutputClassifier(
    OneVsRestClassifier(SVC(C=100.0, gamma=1.0), n_jobs=-1), n_jobs=-1)
bii_model.fit(trainX, trainy)
bii_Smodel.fit(StrainX, trainy)

# predicting
pred = bii_model.predict(testX)
Spred = bii_Smodel.predict(StestX)

print("Hamming Loss", hamloss(testy, pred))
hamlossL(testy, pred)
print("Exact Match Score", exactmatch(testy, pred))
exactmatchL(testy, pred)
print("Standardized data Hamming Loss", hamloss(testy, Spred))
hamlossL(testy, Spred)
# Establish a GridSearchCV variable with the classifier and parameter grid
Grid = GridSearchCV(estimator=forest, param_grid=param_grid, cv=5)
Grid2 = GridSearchCV(estimator=forest, param_grid=param_grid, cv=5)

# Fit the GridSearchCV with the training data
Gridfit = Grid.fit(xtrain, ytrain['Green Type'])
Gridfit2 = Grid2.fit(xtrain, ytrain['Fairway Type'])
print("Best Parameters for Predicting Green Turf", Grid.best_params_)
print("Best Parameters for Predicting Fairway Turf", Grid2.best_params_)

# Get the best model, use it in the Multi-Output Classifier, and make predictions
bestforest = Grid.best_estimator_
multi_target = MultiOutputClassifier(bestforest, n_jobs=1)
preds = multi_target.fit(xtrain, ytrain).predict(xtest)
preds = pd.DataFrame(preds, columns=['Green Type', 'Fairway Type'])
print(preds)

# Encode target data and predictions in numerics so they can be plugged into scoring metrics
le = preprocessing.LabelEncoder()
labels = pd.concat([y['Green Type'], y['Fairway Type']], axis=0)
lefit = le.fit(labels)
ymatrix = ytest.copy()
ymatrixp = preds.copy()
ytest['Green Type'] = lefit.transform(ytest['Green Type'])
ytest['Fairway Type'] = lefit.transform(ytest['Fairway Type'])
preds['Green Type'] = lefit.transform(preds['Green Type'])
preds['Fairway Type'] = lefit.transform(preds['Fairway Type'])
import numpy as np
import pandas as pd
from simulations.irs_v2x_simulation import IRSV2XSimulation
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from joblib import dump, load

data = pd.read_csv('data_position_simulation.csv')

irs_antnum = 256
cols_x = [IRSV2XSimulation.COL_POS_X, IRSV2XSimulation.COL_POS_Y]
# IRSV2XSimulation.COL_POS_Z,
# IRSV2XSimulation.COL_SPEED]
cols_y = []
for n in range(irs_antnum):
    cols_y.append(IRSV2XSimulation.COL_PHASE + str(n))

X = data[cols_x].to_numpy()
Y = data[cols_y].to_numpy()

# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

forest = RandomForestClassifier(n_estimators=100, random_state=1)
classifier = MultiOutputClassifier(forest, n_jobs=-1)
classifier.fit(X[0:100, :], Y[0:100, :])

dump(classifier, 'classifier.joblib')

print(classifier.score(X, Y))
    for j in range(len(i) - 1):
        temp.append(i[j])
    evalX.append(temp)

# The data from your screenshot
# Q1 Q5c Q5d Q5e Q5f StateMap p_age_group_sdc p_education_sdc, Q4
train_data = np.array(trainSet)

# These I just made up
test_data_x = np.array(testSet)
eval_data_x = np.array(evalX)

x = train_data[:, :8]
y = train_data[:, 8:]

forest = RandomForestClassifier(n_estimators=100, random_state=1)
classifier = MultiOutputClassifier(forest, n_jobs=-1)
classifier.fit(x, y)
pr = classifier.predict(evalX)
print(pr)

for i in pr:
    predict.append(i[0])

error_count = 0
for i in range(len(actual)):
    if actual[i] != predict[i]:
        error_count += 1

# note: this is the fraction of correct predictions over the 593 evaluation
# samples, i.e. accuracy rather than precision in the strict sense
print("Precision: ", (593 - error_count) / 593)
    datasetY.append(indices)  # np.array(indices).astype('int'))

mlb = MultiLabelBinarizer()  # classes=len(radionuclides))
datasetY = mlb.fit_transform(datasetY)
# datasetX = StandardScaler().fit_transform(datasetX)

X_train, X_test, y_train, y_test = \
    train_test_split(datasetX, datasetY, test_size=.4, random_state=42)

# print(y_train)
# print(type(y_train))
# y_train = y_train.astype('int')
# y_test = y_test.astype('int')
# print(X_train)
# print(y_train)

classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print(score)

# predict
inv = ag.UnstablesInventory(data=[
    (db.getzai(radionuclides[2]), ACTIVITY),
    (db.getzai(radionuclides[0]), ACTIVITY),
    (db.getzai(radionuclides[5]), ACTIVITY),
    (db.getzai(radionuclides[3]), ACTIVITY)
])
hist, _ = lc(inv, spectype=SPECTYPE)
print(classifier.predict([[1 if bin > 0 else 0 for bin in hist]]))
earlystopping = EarlyStopping(monitor='val_f1_score', patience=10,
                              verbose=1, mode='max')
checkpoint = ModelCheckpoint(filepath='best.hdf5', verbose=1,
                             save_best_only=True, save_weights_only=True,
                             monitor='val_f1_score', mode='max')

keras_model = KerasClassifier(build_fn=create_model, epochs=20,
                              batch_size=batch_size)
multi_target_forest = MultiOutputClassifier(keras_model, n_jobs=-1)
print("fitting ...")
multi_target_forest.fit(X_train, Y_train)
# model.load_weights('best.hdf5')

Y_pred = multi_target_forest.predict(test_sequences)
Y_pred_thresh = (Y_pred > thresh).astype('int')

with open(output_path, 'w') as output:
    print('"id","tags"', file=output)
    for index, labels in enumerate(Y_pred_thresh):
        labels = [tag_list[i] for i, value in enumerate(labels) if value == 1]
        if len(labels) == 0:
            labels.append(tag_list[np.argmax(Y_pred[index])])
        labels_original = ' '.join(labels)
        print('"%d","%s"' % (index, labels_original), file=output)
# X, y = iris.data, iris.target
# res = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=0).fit(X, y).predict(X)
# print(res)

# Multi-output regression
# from sklearn.datasets import make_regression
# from sklearn.multioutput import MultiOutputRegressor
# from sklearn.ensemble import GradientBoostingRegressor
# X, y = make_regression(n_samples=10, n_targets=3, random_state=1)
# res = MultiOutputRegressor(GradientBoostingRegressor(random_state=0)).fit(X, y).predict(X)
# print(res)

# Multi-output classification
from sklearn.datasets import make_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
import numpy as np

X, y1 = make_classification(n_samples=10, n_features=100,
                            n_informative=30, n_classes=3, random_state=1)
y2 = shuffle(y1, random_state=1)
y3 = shuffle(y1, random_state=2)
Y = np.vstack((y1, y2, y3)).T

n_samples, n_features = X.shape
n_outputs = Y.shape[1]
n_classes = 3

forest = RandomForestClassifier(n_estimators=100, random_state=1)
multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)
res = multi_target_forest.fit(X, Y).predict(X)
print(res)
y_train_regr = pd.DataFrame()
y_test_regr = pd.DataFrame()
for col in y_train.columns.values:
    if 'z' in col:
        y_train_regr[col] = y_train[col]
        y_test_regr[col] = y_test[col]
    else:
        y_train_clf[col] = y_train[col]
        y_test_clf[col] = y_test[col]

mo_clf = MultiOutputClassifier(rs_clf)

# Fit the data to the models
print('Fitting data')
mo_clf.fit(x_train, y_train_clf)
rs_regr.fit(x_train, y_train_regr)

# Print the results of the fit on the test data
print('Test classification score: %.3f' % mo_clf.score(x_test, y_test_clf))
print('Test regression R2 score: %.3f' % rs_regr.score(x_test, y_test_regr))

# Plot the decision surfaces of the classifier and regressor
x = pd.DataFrame(np.linspace(0, 5, 25))
y = pd.DataFrame(np.linspace(0, 5, 25))

# Create a grid to plot our predicted values over
surf_x = pd.DataFrame(np.array(np.meshgrid(
    x, y,
    refit=True, cv=3, random_state=1, return_train_score=True)

fit_model(random_forest_Bayes_optimized_classifier, X_train, y_train, X_test)
print(random_forest_Bayes_optimized_classifier.best_estimator_)

# Show Confusion Matrix
random_forest_optim = MultiOutputClassifier(
    RandomForestClassifier(n_estimators=2000, max_depth=20,
                           min_samples_split=20, min_samples_leaf=4,
                           max_features='auto'))
classifier = random_forest_optim.fit(X_train, y_train)
cm = multilabel_confusion_matrix(y_test, random_forest_optim.predict(X_test))
print(cm)

## Retrain best model on full dataset and fit to test_set_features
random_forest_optim.fit(scaled_training_features, training_set_labels)
preds = random_forest_optim.predict_proba(scaled_test_features)

## Format for submittal on DrivenData
# Code copied from DrivenData to ensure correct format for submittal
# Save predictions to submission data frame
submission_format["h1n1_vaccine"] = preds[0][:, 1]
submission_format["seasonal_vaccine"] = preds[1][:, 1]
print(submission_format.head())
saen = StackedAutoEncoder(28 * 28, (400, ), tdata)
ret = saen.fit_transform(tdata)

# import matplotlib.pyplot as plt
# graphs, axes = plt.subplots(nrows=5, ncols=5)
# axes = axes.flatten()
# for t in range(25):
#     axes[t].imshow(ret[t, :].reshape((28, 28)), cmap='gray')
#     graphs.tight_layout()
# plt.show()

print("Training Model1 ...")
model1 = MultiOutputClassifier(sklearn.ensemble.RandomForestClassifier(10))
model1.fit(saen.transform(tdata), ldata)

print("Training Model2 ...")
model2 = MultiOutputClassifier(sklearn.ensemble.RandomForestClassifier(10))
model2.fit(tdata, ldata)

ttest = DataReader.ImageReader("../dataset/t10k-images-idx3-ubyte.gz").to_tensor()
ltest = DataReader.LabelReader("../dataset/t10k-labels-idx1-ubyte.gz").to_tensor()

pred1 = model1.predict(saen.transform(ttest))
pred2 = model2.predict(ttest)

accu1 = accuracy_score(pred1, ltest)
accu2 = accuracy_score(pred2, ltest)
print("Model 1 Accuracy (with SAEN): %.3f, Model 2 Accuracy: %.3f" % (accu1, accu2))
        return -log(yHat)
    else:
        return -log(1 - yHat)


# Binary cross entropy
def binary_cross_entropy(actual, predicted):
    sum_score = 0.0
    for i in range(len(actual)):
        # both terms of the cross entropy are needed here; the original
        # summed only actual[i] * log(predicted[i])
        sum_score += (actual[i] * log(1e-15 + predicted[i])
                      + (1 - actual[i]) * log(1e-15 + 1 - predicted[i]))
    mean_sum_score = 1.0 / len(actual) * sum_score
    return -mean_sum_score


print(binary_cross_entropy([1, 0, 1, 0], [1, 1, 1, 0]))

# Multi-output classification
from sklearn.datasets import make_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle

X, y1 = make_classification(n_samples=10, n_features=100,
                            n_informative=30, n_classes=3, random_state=1)
y2 = shuffle(y1, random_state=1)
y3 = shuffle(y1, random_state=2)
Y = np.vstack((y1, y2, y3)).T

n_samples, n_features = X.shape
n_outputs = Y.shape[1]
n_classes = 3

forest = RandomForestClassifier(random_state=1)
multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)
print(multi_target_forest.fit(X, Y).predict(X))
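# Aside: a quick sanity check of the corrected binary cross entropy against
# scikit-learn's log_loss, on probabilities away from 0/1 (both clip
# internally, so exact agreement at the boundaries is not guaranteed). The
# input values here are made up for illustration.
from sklearn.metrics import log_loss

actual = [1, 0, 1, 0]
predicted = [0.9, 0.2, 0.8, 0.1]
print(binary_cross_entropy(actual, predicted))  # manual implementation
print(log_loss(actual, predicted))              # library reference, ~same value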
df["reduced_text"] = df["text"].apply(lambda x: re.sub( r"""[\d\n!@#$%^&*()_\-=+/,<>?;:"[\]{}`~]""", " ", x.lower())) df["reduced_text"] = df["reduced_text"].apply( lambda x: re.sub("[\.']", "", x.lower())) # =========================================== # Baseline model, linear multi-label classifier, Bag of Words # Let's try to beat the most basic linear model using keras # =========================================== lin_cv = CountVectorizer(min_df=10, max_df=0.9) X_bow = lin_cv.fit_transform(df["reduced_text"]) Xl_train, Xl_test, y_train, y_test = get_data_splits(X_bow, y) bm = MultiOutputClassifier(SGDClassifier()) bm.fit(Xl_train, np.array(y_train.todense()) * 1) Xl_pred = bm.predict(Xl_test) score_lin = f1_score(y_test, Xl_pred, average="micro") print("Linear score", score_lin) # 82.5% print_confusion_matrix_sample(y_test, Xl_pred, 0) # =========================================== # NN model, Bag of Words # =========================================== X_train, X_test, y_train, y_test = get_data_splits(X_bow, y) model = nn_dense((1000, 0.5, 800, 0.5), X_train.shape[1], y_train.shape[1]) model.fit(X_train, y_train, batch_size=500, epochs=64, verbose=True) y_basicnn_pred = model.predict(X_test) bestscore_basic = f1_score(y_test, y_basicnn_pred > 0.5, average="micro") print("Linear score", score_lin) # 82.5%
class Classifier:
    def __init__(self):
        self.REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
        self.BAD_SYMBOLS_RE = re.compile(r'[^\w\s]')
        self.STOPWORDS = set(stopwords.words('spanish'))
        self.tokenizer = None
        self.multilabel_binarizer = MultiLabelBinarizer()
        self.model = None
        self.maxlen = 100

    def clean_text(self, text):
        text = text.lower()  # lowercase text
        # replace symbols matched by REPLACE_BY_SPACE_RE with a space
        text = self.REPLACE_BY_SPACE_RE.sub(' ', text)
        # remove symbols matched by BAD_SYMBOLS_RE
        text = self.BAD_SYMBOLS_RE.sub('', text)
        # text = re.sub(r'\W+', '', text)
        # remove stopwords from text
        text = ' '.join(word for word in text.split()
                        if word not in self.STOPWORDS)
        return text

    def clean_text_in_tags(self, tags):
        clean_tags = []
        for tag in tags:
            clean_tags = clean_tags + [self.clean_text(tag)]
        return clean_tags

    def clean_news(self, df):
        print("cleaning the text data")
        df = df.reset_index(drop=True)
        df.dropna(subset=['tags'], inplace=True)
        df['tags'] = df['tags'].apply(self.clean_text_in_tags)
        df['content'] = df['content'].apply(self.clean_text)
        df['content'] = df['content'].str.replace('\d+', '')
        return df

    def create_tags_and_multilabel_binarizer(self, df):
        print("creating tags and tag index for classes")
        y = self.multilabel_binarizer.fit_transform(df.tags)
        # Serialize the binarizer to disk.
        with open('../data/neural_network_config/multilabel_binarizer.pickle', 'wb') as f:
            pickle.dump(self.multilabel_binarizer, f,
                        protocol=pickle.HIGHEST_PROTOCOL)
        return y

    def load_tokenizer(self, sentences):
        print("loading tokenizer")
        self.tokenizer = Tokenizer(num_words=5000)
        self.tokenizer.fit_on_texts(sentences)
        # saving tokenizer
        with open('../data/neural_network_config/tokenizer.pickle', 'wb') as handle:
            pickle.dump(self.tokenizer, handle,
                        protocol=pickle.HIGHEST_PROTOCOL)

    def create_train_and_test_data(self, sentences, y):
        print("separating data into test data and train data")
        sentences_train, sentences_test, y_train, y_test = train_test_split(
            sentences, y, test_size=0.25, random_state=1000)
        X_train = self.tokenizer.texts_to_sequences(sentences_train)
        X_test = self.tokenizer.texts_to_sequences(sentences_test)
        X_train = pad_sequences(X_train, padding='post', maxlen=self.maxlen)
        X_test = pad_sequences(X_test, padding='post', maxlen=self.maxlen)
        return X_train, X_test, y_train, y_test

    def create_model(self):
        print("creating model")
        # self.model = Sequential()
        # self.model.add(Embedding(vocab_size, 20, input_length=self.maxlen))
        # self.model.add(Dropout(0.1))
        # self.model.add(Conv1D(filter_length, 3, padding='valid', activation='relu', strides=1))
        # self.model.add(GlobalMaxPool1D())
        # self.model.add(Dense(output_size))
        # self.model.add(Activation('sigmoid'))
        # self.model.compile(optimizer=Adam(0.015), loss='binary_crossentropy', metrics=['categorical_accuracy'])
        self.model = MultiOutputClassifier(KNeighborsClassifier())

    def save_model(self):
        print("saving model")
        # saving model
        dump(self.model,
             '../data/neural_network_config/model-sklearn-kneighbors.joblib')

    def create_and_train_model(self):
        filename = "../data/json_news_tagged_bundle/clean_data-unified-tags.json"
        df = pd.read_json(filename)
        df = self.clean_news(df)
        y = self.create_tags_and_multilabel_binarizer(df)
        sentences = df['content'].values
        self.load_tokenizer(sentences)
        X_train, X_test, y_train, y_test = self.create_train_and_test_data(sentences, y)
        self.create_model()
        history = self.model.fit(X_train, y_train)
        print(history)
        self.save_model()
# Split Train/Test
###############################################################################
(inputs, outputs) = (DATA[FEATS], DATA[['CPT', 'WOP']])
(TRN_X, VAL_X, TRN_Y, VAL_Y) = train_test_split(
    inputs, outputs,
    test_size=float(VT_SPLIT),
    stratify=outputs
)
(TRN_L, VAL_L) = [i.shape[0] for i in (TRN_X, VAL_X)]
###############################################################################
# Define Model
###############################################################################
rf = RandomForestClassifier(
    n_estimators=TREES, max_depth=DEPTH,
    criterion='entropy',
    min_samples_split=5, min_samples_leaf=50,
    max_features=None, max_leaf_nodes=None,
    n_jobs=JOB
)
clf = MultiOutputClassifier(rf)
# K-fold training -------------------------------------------------------------
kScores = cross_val_score(clf, TRN_X, TRN_Y)
kScores
###############################################################################
# Train Model
###############################################################################
clf.fit(TRN_X, TRN_Y)
# Predict ---------------------------------------------------------------------
PRD_Y = clf.predict(VAL_X)
clf.score(VAL_X, VAL_Y)
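# Aside: clf.score (and the default scoring in cross_val_score) is subset
# accuracy for MultiOutputClassifier: a sample counts as correct only when
# both 'CPT' and 'WOP' are right. A sketch of per-output accuracy for
# comparison, using the variables defined above:
from sklearn.metrics import accuracy_score
for ix, col in enumerate(('CPT', 'WOP')):
    print(col, accuracy_score(VAL_Y[col], PRD_Y[:, ix]))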
def pedicting_tag(request):
    print('inside predicting tag')

    class lemmatokenizer(object):
        def __init__(self):
            self.stemmer = SnowballStemmer('english')
            self.token_pattern = r"(?u)\b\w\w+\b"
            # self.wnl = WordNetLemmatizer()

        def __call__(self, doc):
            # here, doc is one string sentence
            token_pattern = re.compile(self.token_pattern)
            return [self.stemmer.stem(t) for t in token_pattern.findall(doc)]
            # return lambda doc: token_pattern.findall(doc)
            # return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

    vect_title = CountVectorizer(max_df=0.5, min_df=5, stop_words='english',
                                 tokenizer=lemmatokenizer(), ngram_range=(1, 3))
    tfidf_vect_title = TfidfVectorizer(smooth_idf=False, max_df=0.5, min_df=5,
                                       stop_words='english',
                                       tokenizer=lemmatokenizer(),
                                       ngram_range=(1, 3))

    le = preprocessing.LabelEncoder()
    le.fit(y_labels)
    d_set['label_num'] = pd.Series(
        [le.transform(ast.literal_eval(i)) for i in d_set['tag']])
    d_set.head()

    new_y_labels = d_set['label_num'].values.tolist()
    mlb = MultiLabelBinarizer()
    mlb.fit(new_y_labels)
    y_tag_dtm = mlb.transform(new_y_labels)
    y_tag_dtm.shape

    X_labels = d_set['title'].values.tolist()
    # print(X_labels)

    vect_title.fit(X_labels)
    X_title_dtm = vect_title.transform(X_labels)
    X_title_dtm

    from sklearn.decomposition import PCA
    pca = PCA(n_components=100).fit(X_title_dtm.toarray())
    pca_samples = pca.transform(X_title_dtm.toarray())
    pca_df = pd.DataFrame(np.round(pca_samples, 4))
    print(pca_df.head())

    new_df = pd.DataFrame(X_title_dtm.toarray(),
                          columns=vect_title.get_feature_names())
    new_df.shape
    d = collections.Counter(vect_title.get_feature_names())
    new_df['target_list'] = [i for i in y_tag_dtm]

    tfidf_vect_title.fit(X_labels)
    X_title_dtm_tfidf = tfidf_vect_title.transform(X_labels)
    X_title_dtm_tfidf

    new_df_of_tfidf = pd.DataFrame(X_title_dtm_tfidf.toarray(),
                                   columns=tfidf_vect_title.get_feature_names())
    new_df_of_tfidf['target_list'] = [i for i in y_tag_dtm]

    y = new_df_of_tfidf['target_list']
    X = new_df_of_tfidf.drop('target_list', axis=1)
    X = np.array(X.values.tolist())  # converts the list to a numpy ndarray
    y = np.array(y.values.tolist())

    # print(X[0])

    pca_X = PCA(n_components=200).fit_transform(X)
    pca_X = np.round(pca_X, 4)
    pca_y = PCA(n_components=50).fit_transform(y)
    pca_y = np.round(pca_y, 4)
    print(pca_y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=1)
    # X_train, X_test, y_train, y_test = train_test_split(pca_X, pca_y, test_size=0.2, random_state=1)

    # clf = Pipeline([('classifier', OneVsRestClassifier(SVC(probability=True, random_state=0)))])  # just a Pipeline example
    knn_clf = KNeighborsClassifier(n_neighbors=5)
    # mnb_clf = MultinomialNB()  # not working for multilabel input
    # svc_clf = OneVsRestClassifier(SVC(probability=True, random_state=0))
    # time_pass_y = np.random.randint(2, size=(2838, 1))  # produce ndarray of size 2838 x 1
    knn_clf.fit(X_train, y_train)
    # mnb_clf.fit(X_train, y_train)
    knn_pred = knn_clf.predict(X_test)
    # mnb_pred = mnb_clf.predict(X_test)
    # svc_pred = svc_clf.predict(X_test)

    knn_clf.score(X_test, y_test)

    from sklearn import metrics
    knn_report = metrics.classification_report(y_test[:100], knn_pred[:100])
    knn_f1_score = metrics.f1_score(y_test[:], knn_pred[:], average='samples')
    # on full data-set
    knn_precision_recall_fscore = metrics.precision_recall_fscore_support(
        y_test, knn_pred, average='samples')
    knn_avg_precision_score = metrics.average_precision_score(
        y_test, knn_pred, average='samples')
    knn_roc_auc_score = metrics.roc_auc_score(y_test, knn_pred,
                                              average='samples')
    # mnb_report = metrics.classification_report(y_test[:100], mnb_pred[:100])
    # throws an error: mnb_clf can't work on multilabel output

    # I think it's the same as calculating the hamming score
    metrics.accuracy_score(y_true=y_test[:100], y_pred=knn_pred[:100])

    # print(knn_report)  # its type is str
    print("For knn_clf (KNearestNeighbours) : ")
    print("precision, recall, fbeta_score, support : ", knn_precision_recall_fscore)
    print("f1_score : ", knn_f1_score)
    print("avg. precision_score : ", knn_avg_precision_score)
    print("roc_auc_score : ", knn_roc_auc_score)

    # def does_test_tag_match(d, list_of_tags):  # no need for this function

    test = ["how to use policy iteration in ml ?"]
    # test = ["what is lstm ?"]
    # test_dtm = vect_title.transform(test)  # without tfidf
    test_dtm = tfidf_vect_title.transform(test)  # with tfidf
    # print(test_dtm.toarray()[0])
    status = False
    for i in test_dtm.toarray()[0]:
        if i != 0:
            status = True
            break
    ans = knn_clf.predict(test_dtm.toarray())
    ans = mlb.inverse_transform(ans)
    if len(ans[0]) == 0 or status == False:
        print("sorry, we can't predict your category!!!")
    else:
        ans = le.inverse_transform(ans)
        print(ans)

    forest = RandomForestClassifier(n_estimators=100, random_state=0)
    rf_clf = MultiOutputClassifier(forest, n_jobs=-1)
    rf_clf.fit(X_train, y_train)
    rf_pred = rf_clf.predict(X_test)

    rf_clf

    # I think it's the same as calculating the hamming score
    metrics.accuracy_score(y_true=y_test[:100], y_pred=rf_pred[:100])

    rf_clf.score(X_test, y_test)
    rf_report = metrics.classification_report(y_test[:100], rf_pred[:100])
    rf_f1_score = metrics.f1_score(y_test, rf_pred, average='samples')
    # on full data-set
    rf_precision_recall_fscore = metrics.precision_recall_fscore_support(
        y_test, rf_pred, average='samples')
    rf_avg_precision_score = metrics.average_precision_score(
        y_test, rf_pred, average='samples')
    rf_roc_auc_score = metrics.roc_auc_score(y_test, rf_pred, average='samples')

    # print(rf_report)
    print("For rf_clf (RandomForest) : ")
    print("precision, recall, fbeta_score, support : ", rf_precision_recall_fscore)
    print("f1_score : ", rf_f1_score)
    print("avg. precision_score : ", rf_avg_precision_score)
    print("roc_auc_score : ", rf_roc_auc_score)

    # test = ["what is reinforcement learning ?"]
    test = ["what is ai,lstm and data visualization ?"]
    # test_dtm = vect_title.transform(test)  # without tfidf
    test_dtm = tfidf_vect_title.transform(test)  # with tfidf
    status = False
    for i in test_dtm.toarray()[0]:
        if i != 0:
            status = True
            break
    ans = rf_clf.predict(test_dtm.toarray())
    ans = mlb.inverse_transform(ans)
    if len(ans[0]) == 0 or status == False:
        print("sorry, we can't predict your category!!!")
    else:
        ans = le.inverse_transform(ans)
        print(ans)
def setup(train_files, test_files, specific):
    scalar = StandardScaler()
    mlb = MultiLabelBinarizer()
    train_labels = []
    train_data = []
    train_keys = []
    for f in train_files.keys():
        path = (constants.path + 'acousticbrainz-mediaeval-train/'
                + f[:2] + '/' + f + '.json')
        song = readjson(path)
        feat = getFeature(song)
        if len(feat) != 391:
            continue
        train_keys.append(f)
        train_data.append(feat)
        train_labels.append(train_files[f])
    print('finished train')

    train_labels = mlb.fit_transform(train_labels)
    train_data = scalar.fit_transform(train_data)
    print('finished transforming')

    path = constants.path + specific + '_mlb.pkl'
    dump(mlb, path)
    path = constants.path + specific + '_scalar.pkl'
    dump(scalar, path)
    path = constants.path + specific + '_train.pkl'
    data = dict()
    data['features'] = train_data
    data['labels'] = train_labels
    data['keys'] = train_keys
    dump(data, path)
    print('finished dumping')

    # classifier = MultiOutputClassifier(LinearSVC(C=10, class_weight='balanced', dual=True), n_jobs=4)
    classifier = MultiOutputClassifier(RandomForestClassifier(
        n_estimators=20, class_weight='balanced'), n_jobs=4)
    classifier.fit(train_data, train_labels)
    print('finished fitting')

    data = 0
    train_data = 0
    train_labels = 0
    train_keys = 0
    gc.collect()

    # test_labels = []
    test_data = []
    test_keys = list(test_files.keys())
    mean = scalar.mean_
    for f in test_keys:
        path = (constants.path + 'acousticbrainz-mediaeval-train/'
                + f[:2] + '/' + f + '.json')
        song = readjson(path)
        feat = getFeature(song)
        if len(feat) < 391:
            length = len(feat)
            for m in mean[length:]:
                feat += [m]
        test_data.append(feat)

    test_data = scalar.transform(test_data)
    predictions = classifier.predict(test_data)
    print('finished predictions')

    genre_predictions = mlb.inverse_transform(predictions)
    write(genre_predictions, test_keys, specific)
    print('finished writing predictions')
for key, value in sample_files.items():
    value['data'] = get_training_data("data/" + key, num_training_lists)

# Combine all the training data arrays into one big feature set
X = np.vstack(list(map(lambda x: x['data'], sample_files.values())))
# X = normalize(X)

# Build a label list that corresponds to the feature set
y = []
for value in sample_files.values():
    y += [value['labels']] * len(value['data'])
y = np.array(mlb.transform(y))

# Use a multi-label classifier implementing Multinomial Naive Bayes
clf = MultiOutputClassifier(MultinomialNB())
clf.fit(X, y)
print(f'Mean accuracy: {clf.score(X, y)}')

num_folds = 10
cv_score = cross_val_score(clf, X, y, cv=num_folds)
print(f'{num_folds}-fold cross-validation: {cv_score}')

# Perform real-time tests for each input file
for key, value in sample_files.items():
    print("\nPerforming real-time classification of "
          f"{', '.join(value['labels'])}")
    start_time = timeit.default_timer()
    features = Serializer("data/" + key).classify_realtime(clf)
    total_time = timeit.default_timer() - start_time
    print(f'Classified in {total_time} seconds')
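# Aside: clf.score on a multilabel target is subset accuracy (every label of
# a sample must match). Hamming loss is a gentler complement that averages
# errors over individual label positions; a minimal sketch using the
# variables above:
from sklearn.metrics import hamming_loss
print(f'Per-label error rate (Hamming loss): {hamming_loss(y, clf.predict(X))}')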
def mycode(train_files, test_files, specific, indicies):
    indicies = np.array(indicies)
    # indicies = indicies[:len(indicies)//4]
    scalar = StandardScaler()
    mlb = MultiLabelBinarizer()
    train_labels = []
    train_data = []
    train_keys = []
    keys = list(train_files.keys())
    random.shuffle(keys)
    subset = 150000  # len(keys)
    count = 0
    for f in keys[:subset]:
        count += 1
        path = (constants.path + 'acousticbrainz-mediaeval-train/'
                + f[:2] + '/' + f + '.json')
        song = readjson(path)
        feat = getAllFeatures(song)
        if len(feat) != 2647:
            continue
        feat = np.array(feat)
        feat = feat[indicies]
        train_keys.append(f)
        train_data.append(feat)
        train_labels.append(train_files[f])
        if count % 10000 == 0:
            print("on ", count, "length of keys: ", len(train_keys))
    print('finished train')

    train_labels = mlb.fit_transform(train_labels)
    train_data = scalar.fit_transform(train_data)
    print('finished transforming')

    path = constants.path + specific + '_all2_mlb.pkl'
    dump(mlb, path)
    path = constants.path + specific + '_all2_scalar.pkl'
    dump(scalar, path)
    print(np.shape(train_data))
    path = constants.path + specific + '_all2_train.pkl'
    data = dict()
    data['features'] = train_data
    data['labels'] = train_labels
    data['keys'] = train_keys
    dump(data, path)
    print('finished dumping')

    # classifier = MultiOutputClassifier(LinearSVC(C=10, class_weight='balanced', dual=True), n_jobs=4)
    classifier = MultiOutputClassifier(RandomForestClassifier(
        n_estimators=32, class_weight='balanced'), n_jobs=4)

    data = 0
    train_files = 0
    train_keys = 0
    keys = 0
    gc.collect()

    classifier.fit(train_data, train_labels)
    print('finished fitting')
    path = constants.path + specific + '_all2_classifier.pkl'
    dump(classifier, path)

    """
    with open(constants.path + specific + '_all2_scalar.pkl', 'rb') as data_file:
        scalar = pickle.load(data_file)
    with open(constants.path + specific + '_all2_mlb.pkl', 'rb') as data_file:
        mlb = pickle.load(data_file)
    with open(constants.path + specific + '_all2_classifier.pkl', 'rb') as data_file:
        classifier = pickle.load(data_file)
    """

    data = 0
    train_data = 0
    train_labels = 0
    train_keys = 0
    gc.collect()

    # test_labels = []
    test_data = []
    test_keys = list(test_files.keys())
    mean = scalar.mean_
    for f in test_keys:
        path = (constants.path + 'acousticbrainz-mediaeval-train/'
                + f[:2] + '/' + f + '.json')
        song = readjson(path)
        feat = getAllFeatures(song)
        """
        if len(feat) < 2647:
            length = len(feat)
            for m in mean[length:]:
                feat += [m]
        """
        # feat = np.array(feat)
        if len(feat) < 2647:
            length = len(feat)
            print('Before: ', length)
            for m in range(2647 - length):
                feat += [np.random.rand()]
            # m = mean[indicies.index(2647)]
            # feat += [m]
            print('After: ', len(feat))
        feat = np.array(feat)
        feat = feat[indicies]
        test_data.append(feat)

    test_data = scalar.transform(test_data)
    predictions = classifier.predict(test_data)
    print('finished predictions')

    genre_predictions = mlb.inverse_transform(predictions)
    write(genre_predictions, test_keys, specific)
    print('finished writing predictions')
def main():
    # Script argument parsing
    parser = argparse.ArgumentParser(
        description='Homework 03 - Machine learning a.a. 2018/19 - Predict missing values',
        epilog=' coded by: Emanuele Palombo')
    parser.add_argument('dataset_name', metavar='DATASET', type=str,
                        nargs='?', default=__default_ts_name,
                        help='{} (default {}) - dataset name'.format(
                            list(__ts_opts.keys()), __default_ts_name))
    parser.add_argument('--test-size', '-t', dest='test_size',
                        action='store', metavar='TEST_SIZE', type=float,
                        default=__default_test_size,
                        help='[0-1] (default {}) - splitting size of TestSet'.format(
                            __default_test_size))
    parser.add_argument('--question-marks-ts', '-q', dest='qm_repeted_ts',
                        action='store', type=int,
                        default=__default_question_mark_count_repeated,
                        help='{{0,1,2...}} (default {}) - (this value * {} * samples) added to TrainingSet'.format(
                            __default_question_mark_count_repeated,
                            __default_question_mark_count))
    parser.add_argument('--no-split', '-s', dest='no_split',
                        action='store_true', default=__default_no_split,
                        help='(default {}) - keep whole DataSet for training'.format(
                            __default_no_split))
    parser.add_argument('--img-tag', '-i', dest='img_tag', action='store',
                        type=str, default='',
                        help='string - add arbitrary string to saved images')
    parser.add_argument('--verbose', '-v', dest='verbosity', action='count',
                        default=__default_training_verbosity,
                        help='add more verbosity to output (repeat it to increase)')

    args = parser.parse_args()

    if args.dataset_name not in __ts_opts:
        print('ERROR: Choose correct DataSet!\n')
        parser.print_help()
        exit(1)

    trainingset_selected_name = args.dataset_name
    test_size = args.test_size
    qm_repeted_ts = args.qm_repeted_ts
    dataset_no_split = args.no_split
    training_verbosity = args.verbosity
    img_tag = args.img_tag
    running_id = id_generator()
    ts_selected_opts = __ts_opts[trainingset_selected_name]
    # End script argument parsing

    print('\nDataSet selected: ' + ts_selected_opts['url'])

    # read dataset to pandas dataframe
    dataset = pd.read_csv(ts_selected_opts['url'],
                          names=ts_selected_opts['columns'])

    if training_verbosity >= 1:
        print('\nFirst five rows of DataSet:\n')
        print(dataset.head())
        print('\nDataSet Length: {}'.format(len(dataset)))

    # DataSet Manipulation
    # remove rows with question marks (this avoids having '?' in the output)
    dataset = dataset[~(dataset.astype(str) == '?').any(1)]
    # strip out (remove) the "real output" (y)
    dataset = dataset.iloc[ts_selected_opts['x_slice'][0],
                           ts_selected_opts['x_slice'][1]]

    # Different approach to value conversion
    # convert all columns to int (str => int)
    # dataset = dataset.apply(lambda x: pd.factorize(x)[0] + 1)
    # convert all columns to int
    dataset = dataset.astype(int)

    # dataSet Information
    features_count = len(dataset.columns)
    features_values = ds_features_values(dataset)

    # copy input features to output (columns * 2)
    for column in dataset.columns:
        dataset['y_' + column] = dataset[column]

    # Split DataSet
    training_set, test_set = train_test_split(
        dataset, test_size=test_size,
        random_state=__default_train_test_split_random_state)

    # check feature values between TrainingSet and TestSet
    # it's important to avoid extra values appearing only in the TestSet
    # (i.e. an error on log_loss for a mismatch in predict_proba size)
    if not check_labels_split(features_count, training_set, test_set):
        exit(1)

    # Concat (add rows of) TrainingSet and TestSet
    # in this case the model can see all samples (including queries without '?')
    if dataset_no_split:
        training_set = pd.concat([training_set, test_set], axis=0)
        print('\nTraining over the whole DataSet')
    else:
        print('\nSplit DataSet in TrainingSet and TestSet (test size: {})'.format(test_size))

    # add (append) question marks
    # append qm_count rows, with 1 to qm_count '?'
    qm_count = int(ts_selected_opts['question_mark_count'])
    for i in range(qm_repeted_ts):
        for value_count in range(1, qm_count + 1):
            training_set = ds_mod_with_value(training_set, value_count,
                                             features_count, True)
            if training_verbosity >= 1:
                print('{} Added {} question mark (?) to TrainingSet for each sample'.format(i, value_count))

    # Shuffle TrainingSet
    training_set = training_set.sample(frac=1)

    if training_verbosity >= 1:
        print('\nManipulated TrainingSet:\n')
        print(training_set.head())
        print('\nTrainingSet Length: {}'.format(len(training_set)))

    # TrainingSet: input X (features) and output y ("mirrored" features)
    x_train = training_set.iloc[:, 0:features_count]
    y_train = training_set.iloc[:, features_count:]
    # TestSet: input X (features) and output y ("mirrored" features)
    x_test = test_set.iloc[:, 0:features_count]
    y_test = test_set.iloc[:, features_count:]

    if training_verbosity >= 2:
        print('\nInput train:\n {}'.format(x_train.head()))
        print('\nOutput train:\n {}'.format(y_train.head()))
        print('\nInput test:\n {}'.format(x_test.head()))
        print('\nOutput test:\n {}'.format(y_test.head()))

    x_train = x_train.values
    y_train = y_train.values
    y_test = y_test.values

    # oneHot encoding (characteristic vector)
    # passing features_values without None forces OneHotEncoder to transform None to a null vector
    one_hot_encoder = OneHotEncoder(categories=features_values,
                                    handle_unknown='ignore')
    one_hot_encoder.fit(x_train)
    x_train_encoded = one_hot_encoder.transform(x_train).toarray()

    if training_verbosity >= 2:
        print('\nOneHotEncoding...\nexample: {} => {}'.format(
            x_train[0], x_train_encoded[0]))

    # store all results/metrics for each model/classifier
    results = {}

    for classifier_name in __deafult_model_classifier:
        filename = 'model_{}_{}.sav'.format(trainingset_selected_name,
                                            classifier_name)

        if os.path.isfile(filename):
            # load a model that is already trained
            multi_output_classifier = joblib.load(filename)
            print('\n### Model {} loaded by file: {}\n'
                  'Important: remove the file to re-train the model!'.format(
                      classifier_name, filename))
        else:
            n_jobs = None
            model_verbosity = True if training_verbosity >= 3 else False

            if classifier_name == 'MLP':
                classifier = MLPClassifier(
                    hidden_layer_sizes=ts_selected_opts['mlp_hidden_layers_sizes'],
                    max_iter=1000, verbose=model_verbosity)
            elif classifier_name == 'KNN':
                n_jobs = None
                classifier = KNeighborsClassifier(
                    n_neighbors=ts_selected_opts['knn_k'])
            elif classifier_name == 'SVM':
                classifier = SVC(gamma='scale', decision_function_shape='ovo',
                                 probability=True, verbose=model_verbosity)
            elif classifier_name == 'RandomForest':
                classifier = RandomForestClassifier(
                    n_estimators=ts_selected_opts['random_forest_estimator'],
                    verbose=model_verbosity)

            print('\n### Init and training the model: {}'.format(classifier_name))

            # init MultiOutput for classifier
            multi_output_classifier = MultiOutputClassifier(classifier,
                                                            n_jobs=n_jobs)
            multi_output_classifier.fit(x_train_encoded, y_train)

            # save the model to disk
            joblib.dump(multi_output_classifier, filename)

        results[classifier_name] = collections.defaultdict(list)
        metrics_result = results[classifier_name]

        # create input test (query) with different numbers of '?'
        for query_count_question_mark in range(
                ts_selected_opts['question_mark_count'] + 1):
            print('\n## Add {} question marks to input test (query)'.format(
                query_count_question_mark))

            # modify (in place) input test with question marks
            x_test_with_qm = ds_mod_with_value(
                x_test.copy(), value_count=query_count_question_mark,
                append=False)

            if training_verbosity >= 2:
                print('\nInput test (query):\n {}'.format(
                    pd.DataFrame(data=x_test_with_qm).head()))

            # encode the input test
            x_test_encoded = one_hot_encoder.transform(x_test_with_qm).toarray()

            # compute output prediction and probability
            y_pred = multi_output_classifier.predict(x_test_encoded)
            y_pred_proba = multi_output_classifier.predict_proba(x_test_encoded)

            # precision on whole output
            score = multi_output_classifier.score(x_test_encoded, y_test)
            # the Hamming loss corresponds to the Hamming distance between y_test and y_pred
            hamming_loss = np.sum(np.not_equal(y_test, y_pred)) / float(y_test.size)

            # compute y_test and y_pred as if the output were only the query question marks
            y_test_reduced, y_pred_reduced = reduce_y_to_qm(
                x_test_with_qm, y_test, y_pred)

            # write y_pred_proba to file (csv)
            write_pred_proba(
                y_pred_proba,
                '{}{}-{}-q{}-{}{}.csv'.format(
                    __default_csv_path, trainingset_selected_name,
                    classifier_name, query_count_question_mark,
                    running_id, img_tag))

            print('\nMetrics:')
            print(' {:<30} | {:^10} | {:>10}'.format('features', 'accuracy', 'log loss'))
            print('-' * (30 + 10 + 10 + 7))

            log_loss_avg = 0
            # for each output column => compute accuracy and log_loss
            for feature_index in range(y_test.shape[1]):
                y_test_column = y_test[:, feature_index]
                y_pred_column = y_pred[:, feature_index]

                accuracy = accuracy_score(y_test_column, y_pred_column)
                # note: check_labels_split() was implemented to avoid an error here
                log_loss_value = log_loss(
                    y_test_column, y_pred_proba[feature_index],
                    labels=features_values[feature_index])

                print(' {:<30} | {:^10.4f} | {:>10.4f}'.format(
                    test_set.columns[feature_index], accuracy, log_loss_value))

                log_loss_avg += log_loss_value
                metrics_result['accuracy_' + str(feature_index)].append(accuracy)
                metrics_result['log_loss_' + str(feature_index)].append(log_loss_value)

            print('\nVirtual reduced output:')
            # for each reduced output (only question marks) => compute accuracy
            for index in range(query_count_question_mark):
                accuracy = accuracy_score(y_test_reduced[:, index],
                                          y_pred_reduced[:, index])
                print(' accuracy {}: {:>10.4f}'.format(index, accuracy))
                metrics_result['accuracy_reduced_' + str(index)].append(accuracy)

            print('\nAll output:')
            print(' accuracy: {:>10.4f}'.format(score))
            print(' log_loss avg: {:>10.4f}'.format(log_loss_avg / y_test.shape[1]))
            print(' hamming loss: {:>10.4f}'.format(hamming_loss))

            metrics_result['accuracy'].append(score)
            metrics_result['log_loss_avg'].append(log_loss_avg / y_test.shape[1])
            metrics_result['hamming_loss'].append(hamming_loss)

        # GRAPH PLOT per model/classifier
        plot_line_graph(range(ts_selected_opts['question_mark_count'] + 1),
                        [results[classifier_name]['accuracy'],
                         results[classifier_name]['log_loss_avg'],
                         results[classifier_name]['hamming_loss']],
                        labels=['accuracy', 'log loss avg', 'hamming loss'],
                        fmt=['bo-', 'ro-', 'yo-'],
                        title=classifier_name,
                        xlabel='Number of Question Marks in the query',
                        ymax=1)
        if __default_save_img:
            plt.savefig('{}{}-{}-{}{}.png'.format(
                __default_imgs_path, trainingset_selected_name,
                classifier_name, running_id, img_tag), dpi=200)

        # create list of lists of accuracy x feature
        accuracy_lst = ['accuracy_' + str(index)
                        for index in range(features_count)]
        accuracy_lst = [results[classifier_name][accuracy_key]
                        for accuracy_key in accuracy_lst]

        plot_line_graph(range(ts_selected_opts['question_mark_count'] + 1),
                        [results[classifier_name]['accuracy']] + accuracy_lst,
                        fmt=['bo-'] + ['g.--'] * len(accuracy_lst),
                        title=classifier_name + ': whole accuracy and those by features',
                        xlabel='Number of Question Marks in the query',
                        ylabel='accuracy',
                        ymax=1)
        if __default_save_img:
            plt.savefig('{}{}-{}-accuracy-{}{}.png'.format(
                __default_imgs_path, trainingset_selected_name,
                classifier_name, running_id, img_tag), dpi=200)

        # create list of lists of accuracy_reduced x feature (adding None in front when needed)
        accuracy_reduced_lst = [
            'accuracy_reduced_' + str(index)
            for index in range(ts_selected_opts['question_mark_count'])]
        accuracy_reduced_lst = [results[classifier_name][accuracy_reduced]
                                for accuracy_reduced in accuracy_reduced_lst]
        accuracy_reduced_lst = [
            [None] * (ts_selected_opts['question_mark_count']
                      - len(accuracy_reduced) + 1) + accuracy_reduced
            for accuracy_reduced in accuracy_reduced_lst]

        plot_line_graph(range(ts_selected_opts['question_mark_count'] + 1),
                        [results[classifier_name]['accuracy']] + accuracy_reduced_lst,
                        fmt=['bo-'] + ['m.--'] * len(accuracy_reduced_lst),
                        title=classifier_name + ': whole accuracy and the virtual accuracies by features',
                        xlabel='Number of Question Marks in the query',
                        ylabel='accuracy',
                        ymax=1)
        if __default_save_img:
            plt.savefig('{}{}-{}-accuracy-reduced-{}{}.png'.format(
                __default_imgs_path, trainingset_selected_name,
                classifier_name, running_id, img_tag), dpi=200)

        # create list of lists of log_loss x feature
        log_loss_lst = ['log_loss_' + str(index)
                        for index in range(features_count)]
        log_loss_lst = [results[classifier_name][log_loss_key]
                        for log_loss_key in log_loss_lst]

        plot_line_graph(range(ts_selected_opts['question_mark_count'] + 1),
                        [results[classifier_name]['log_loss_avg']] + log_loss_lst,
                        fmt=['ro-'] + ['c.--'] * len(log_loss_lst),
                        title=classifier_name + ': average log loss and those by features',
                        xlabel='Number of Question Marks in the query',
                        ylabel='log loss')
        if __default_save_img:
            plt.savefig('{}{}-{}-log-loss-{}{}.png'.format(
                __default_imgs_path, trainingset_selected_name,
                classifier_name, running_id, img_tag), dpi=200)

    metrics_by_classifier = [results[classifier][metric]
                             for classifier in __deafult_model_classifier
                             for metric in ['accuracy', 'log_loss_avg', 'hamming_loss']]
    label_by_classifier = [classifier + ' ' + metric
                           for classifier in __deafult_model_classifier
                           for metric in ['accuracy', 'log_loss_avg', 'hamming_loss']]
    fmt_lst = [style.replace('0', character)
               for character in ['o', '^', 'v', '<', '>', '.', ',', '+', 'x']
               for style in ['b0-', 'r0-', 'y0-']]

    # GRAPH PLOT comparing model/classifier
    plot_line_graph(range(ts_selected_opts['question_mark_count'] + 1),
                    metrics_by_classifier,
                    labels=label_by_classifier,
                    fmt=fmt_lst,
                    title='Compare all models',
                    xlabel='Number of Question Marks in the query',
                    ylabel='',
                    ymax=1)
    if __default_save_img:
        plt.savefig('{}{}-comparing-{}{}.png'.format(
            __default_imgs_path, trainingset_selected_name,
            running_id, img_tag), dpi=200)

    if not __default_save_img:
        plt.show()
]]
y_train = train_data[[
    col for col in train_data.columns if col.startswith('target')
]].drop(['target_0'], axis=1)
X_test = test_data[[
    col for col in test_data.columns if col.startswith('feat')
]]
y_test = test_data[[
    col for col in test_data.columns if col.startswith('target')
]].drop(['target_0'], axis=1)

############################################################
# train classifier
model = MultiOutputClassifier(DummyClassifier(strategy='stratified'))
model.fit(X_train, y_train)

# evaluate test data
y_pred = model.predict(X_test)
run.log('precision_macro', precision_score(y_test, y_pred, average='macro'))
run.log('precision_samples', precision_score(y_test, y_pred, average='samples'))
run.log('recall_macro', recall_score(y_test, y_pred, average='macro'))
run.log('recall_samples', recall_score(y_test, y_pred, average='samples'))
run.log('hamming_loss', hamming_loss(y_test, y_pred))
run.log('zero_one_loss', zero_one_loss(y_test, y_pred))

# evaluate train data
y_pred = model.predict(X_train)
run.log('precision_macro_train', precision_score(y_train, y_pred, average='macro'))
        max_iter=500, class_weight="balanced", random_state=42, n_jobs=1))
else:
    model = MultiOutputRegressor(
        LassoCV(eps=1e-9, n_alphas=16, cv=3, tol=1e-4, max_iter=500,
                random_state=42, n_jobs=1))

# train the model
model.fit(X.iloc[train_idx, :], Y.iloc[train_idx, :])

# In[2]: Collect the predictions

# predict training and testing data
train_predict = pd.DataFrame(model.predict(X.iloc[train_idx, :]),
                             columns=Y.columns)
test_predict = pd.DataFrame(model.predict(X.iloc[test_idx, :]),
                            columns=Y.columns)

# reshape all of the predictions into a single table
predictions = pd.DataFrame()
for j in range(outputs):
    # collect training data
    predict_j = np.array(train_predict.iloc[:, j])
    actual_j = np.array(Y.iloc[train_idx, j])
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,
                                                    random_state=11)

gbt = MLPClassifier(alpha=0.0, random_state=0, activation='relu',
                    hidden_layer_sizes=(50, ), verbose=0)

from sklearn.multioutput import MultiOutputClassifier
# gbt = RandomForestClassifier(n_estimators=300, random_state=264, min_samples_leaf=300, min_samples_split=150)
# MultiOutputClassifier(gbt, n_jobs=-1).fit(X_train, Y_train)
mor = MultiOutputClassifier(gbt, n_jobs=-1)
clf4 = mor.fit(X_train, Y_train)

err_train = np.mean(Y_train != mor.predict(X_train))
err_test = np.mean(Y_test != mor.predict(X_test))
err_sum = np.mean(Y != mor.predict(X))

joblib.dump(mor, "training_models/cardio_hi.pkl", compress=1)
print("gender 0", err_train, err_test, 'err_sum', err_sum)
print("gbt score %s" % clf4.score(X_train, Y_train))
print()

print("start ap_lo")
X = data.drop([
    'cardio', 'ap_lo', 'id', 'weight_o', 'weight_nfg_o', 'weight_nfg_o_с',
    'weight_o_c', 'alco', 'bmi_r_4', 'bmi_n_7', 'bmi_r_1', 'bmi_n_2',
    'bmi_n_1'
],
    'worksi1f', 'cleanair', 'sporty1f', 'alcoholf', 'cancerf', 'menarche',
    'avdaysp', 'infpro1f', 'ocs'
]
output_cols = ['WEIGHT', 'MODEDELIV', 'BABYCONDIT', 'Status']

data = pd.read_excel('/home/dell/Desktop/PALS_data.XLSX')
print(data.head())

X = data.loc[:, feature_cols]
y = data.loc[:, output_cols]
print(X.shape)

le = preprocessing.LabelEncoder()
y.WEIGHT = le.fit_transform(y.WEIGHT)
le = preprocessing.LabelEncoder()
y.MODEDELIV = le.fit_transform(y.MODEDELIV)
le = preprocessing.LabelEncoder()
y.BABYCONDIT = le.fit_transform(y.BABYCONDIT)
le = preprocessing.LabelEncoder()
X.ocs = le.fit_transform(X.ocs)

clf = RandomForestClassifier(n_estimators=100, random_state=1)
# clf = svm.SVC(gamma=0.001, C=1000., kernel='linear')
multi_target_forest = MultiOutputClassifier(clf, n_jobs=-1)
multi_target_forest.fit(X, y)
print("success")
target = ['Match_result']

worldcup = shuffle(worldcup)
x = worldcup[features].values
y = worldcup[target].values

# Split the dataset into training dataset and testing dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                    random_state=1)

# =================== Perceptron =========================
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import Perceptron

# y = w.x + b; note: `n_iter` was renamed to `max_iter` in newer scikit-learn
ppn = Perceptron(n_iter=40, eta0=0.1, random_state=0)
multi_target_ppn = MultiOutputClassifier(ppn)
y_pred = multi_target_ppn.fit(x_train, y_train).predict(x_test)

print('Perceptron')
print(classification_report(y_test, y_pred))
print('Accuracy classification score: %.2f' % accuracy_score(y_test, y_pred))
print('Average Hamming loss: %.2f' % hamming_loss(y_test, y_pred))
print('Jaccard similarity coefficient score: %.2f' % jaccard_similarity_score(y_test, y_pred))
print('Matthews correlation coefficient (MCC): %.2f' % matthews_corrcoef(y_test, y_pred))
print('Zero-one classification loss: %.2f' % zero_one_loss(y_test, y_pred))

# =================== SVM =========================
from sklearn.multioutput import MultiOutputClassifier
from sklearn import svm

# instantiate SVC()
clf = svm.SVC()
audios = np.unique(mfcc_audio["Audio"])
train_audio, test_audio = train_test_split(audios, train_size=0.7,
                                           test_size=0.3, random_state=0)

X_train = mfcc_audio[mfcc_audio["Audio"].isin(train_audio)]
X_test = mfcc_audio[mfcc_audio["Audio"].isin(test_audio)]
y_train = X_train[columns]
y_test = X_test[columns]
X_train.drop(columns + ["Audio"], inplace=True, axis=1)
X_test.drop(columns + ["Audio"], inplace=True, axis=1)

mor = MultiOutputClassifier(
    RandomForestClassifier(random_state=0, n_estimators=1000), n_jobs=-1)
mor.fit(X_train, y_train)
mor_pred = mor.predict(X_test)

dummy = DummyClassifier()
dummy.fit(X_train, y_train)
dummy_pred = dummy.predict(X_test)

estimators = mor.estimators_
for i, col in enumerate(columns):
    true = y_test[col]
    pred = mor_pred[:, i]
    d_p = dummy_pred[:, i]
    print(col)
def train_model_one_vs_rest(train_vectors, train_labels):
    model = RandomForestClassifier()
    clf = MultiOutputClassifier(model)
    clf.fit(train_vectors, train_labels)
    return clf
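# Aside: a hypothetical invocation of the helper above, on synthetic
# multi-label data (shapes and values made up purely for illustration;
# assumes the RandomForestClassifier/MultiOutputClassifier imports used by
# the function are in scope).
import numpy as np

rng = np.random.RandomState(0)
train_vectors = rng.rand(100, 20)                # 100 samples, 20 features
train_labels = rng.randint(0, 2, size=(100, 3))  # 3 binary labels

clf = train_model_one_vs_rest(train_vectors, train_labels)
print(clf.predict(train_vectors[:5]).shape)      # (5, 3): one column per label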
estimators = MultiOutputClassifier(
    estimator=XGBClassifier(penalty="l2", objective="binary:logistic",
                            random_state=42)
)

X_train, X_eval, y_train, y_eval = train_test_split(
    features_df, labels_df, test_size=0.33, shuffle=True,
    stratify=labels_df, random_state=RANDOM_SEED
)

# Train model on the training split (fitting on the full data here would
# leak the evaluation rows into training)
estimators.fit(X_train, y_train)

# Predict on evaluation set
# This competition wants probabilities, not labels
preds = estimators.predict_proba(X_eval)
preds

k = preds[0]

y_preds = pd.DataFrame(
    {
        "h1n1_vaccine": preds[0][:, 1],
        "seasonal_vaccine": preds[1][:, 1],
    },
    index=y_eval.index,
)
print("y_preds.shape:", y_preds.shape)
y_preds.head()
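# Aside: predict_proba on a MultiOutputClassifier returns a *list* with one
# (n_samples, n_classes) array per target column, which is why the positive
# class probability of output k is preds[k][:, 1]. A minimal shape check
# with the variables above:
for name, p in zip(labels_df.columns, preds):
    print(name, p.shape)  # (n_samples, 2) for each binary target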
class Igel(object):
    """
    Igel is the base model to use the fit, evaluate and predict functions
    of the sklearn library
    """
    available_commands = ('fit', 'evaluate', 'predict', 'experiment')
    supported_types = ('regression', 'classification', 'clustering')
    results_path = configs.get('results_path')  # path to the results folder
    default_model_path = configs.get('default_model_path')  # path to the pre-fitted model
    description_file = configs.get('description_file')  # path to the description.json file
    evaluation_file = configs.get('evaluation_file')  # path to the evaluation.json file
    prediction_file = configs.get('prediction_file')  # path to the predictions.csv
    default_dataset_props = configs.get('dataset_props')  # dataset props that can be changed from the yaml file
    default_model_props = configs.get('model_props')  # model props that can be changed from the yaml file
    model = None

    def __init__(self, **cli_args):
        logger.info(f"Entered CLI args: {cli_args}")
        logger.info(f"Executing command: {cli_args.get('cmd')} ...")
        self.data_path: str = cli_args.get('data_path')  # path to the dataset
        logger.info(f"reading data from {self.data_path}")
        self.command = cli_args.get('cmd', None)
        if not self.command or self.command not in self.available_commands:
            raise Exception(f"You must enter a valid command.\n"
                            f"available commands: {self.available_commands}")

        if self.command == "fit":
            self.yml_path = cli_args.get('yaml_path')
            self.yaml_configs = read_yaml(self.yml_path)
            logger.debug(f"your chosen configuration: {self.yaml_configs}")

            # dataset options given by the user
            self.dataset_props: dict = self.yaml_configs.get(
                'dataset', self.default_dataset_props)
            # model options given by the user
            self.model_props: dict = self.yaml_configs.get(
                'model', self.default_model_props)
            # list of target(s) to predict
            self.target: list = self.yaml_configs.get('target')

            self.model_type: str = self.model_props.get('type')
            logger.info(f"dataset_props: {self.dataset_props} \n"
                        f"model_props: {self.model_props} \n "
                        f"target: {self.target} \n")

        # if the entered command is evaluate or predict, then the pre-fitted
        # model needs to be loaded and used
        else:
            self.model_path = cli_args.get('model_path',
                                           self.default_model_path)
            logger.info(f"path of the pre-fitted model => {self.model_path}")
            # load description file to read stored training parameters
            with open(self.description_file, 'r') as f:
                dic = json.load(f)
                self.target: list = dic.get("target")  # target to predict as a list
                self.model_type: str = dic.get("type")  # type of the model -> regression or classification
                self.dataset_props: dict = dic.get('dataset_props')  # dataset props entered while fitting

        getattr(self, self.command)()

    def _create_model(self, **kwargs):
        """
        fetch a model depending on the type and algorithm provided by the user
        and return it
        @return: class of the chosen model
        """
        model_type: str = self.model_props.get('type')
        model_algorithm: str = self.model_props.get('algorithm')
        model_args = None

        if not model_type or not model_algorithm:
            raise Exception(f"model_type and algorithm cannot be None")

        algorithms: dict = models_dict.get(model_type)  # extract all algorithms as a dictionary
        model = algorithms.get(model_algorithm)  # extract model class depending on the algorithm
        logger.info(f"Solving a {model_type} problem using ===> {model_algorithm}")
        if not model:
            raise Exception("Model not found in the algorithms list")
        else:
            model_props_args = self.model_props.get('arguments', None)
            if model_props_args and type(model_props_args) == dict:
                model_args = model_props_args
            elif not model_props_args or model_props_args.lower() == "default":
                model_args = None
not model_props_args or model_props_args.lower() == "default": model_args = None model_class = model.get('class') logger.info(f"model arguments: \n" f"{self.model_props.get('arguments')}") model = model_class(**kwargs) if not model_args else model_class( **model_args) return model, model_args def _save_model(self, model): """ save the model to a binary file @param model: model to save @return: bool """ try: if not os.path.exists(self.results_path): logger.info( f"creating model_results folder to save results...\n" f"path of the results folder: {self.results_path}") os.mkdir(self.results_path) else: logger.info(f"Folder {self.results_path} already exists") logger.warning( f"data in the {self.results_path} folder will be overridden. If you don't " f"want this, then move the current {self.results_path} to another path" ) except OSError: logger.exception( f"Creating the directory {self.results_path} failed ") else: logger.info( f"Successfully created the directory in {self.results_path} ") pickle.dump(model, open(self.default_model_path, 'wb')) return True def _load_model(self, f: str = ''): """ load a saved model from file @param f: path to model @return: loaded model """ try: if not f: logger.info(f"result path: {self.results_path} ") logger.info(f"loading model form {self.default_model_path} ") model = pickle.load(open(self.default_model_path, 'rb')) else: logger.info(f"loading from {f}") model = pickle.load(open(f, 'rb')) return model except FileNotFoundError: logger.error(f"File not found in {self.default_model_path} ") def _prepare_fit_data(self): return self._process_data(target='fit') def _prepare_eval_data(self): return self._process_data(target='evaluate') def _process_data(self, target='fit'): """ read and return data as x and y @return: list of separate x and y """ assert isinstance(self.target, list), "provide target(s) as a list in the yaml file" if self.model_type != "clustering": assert len( self.target) > 0, "please provide at least a target to predict" try: dataset = pd.read_csv(self.data_path) logger.info(f"dataset shape: {dataset.shape}") attributes = list(dataset.columns) logger.info(f"dataset attributes: {attributes}") # handle missing values in the dataset preprocess_props = self.dataset_props.get('preprocess', None) if preprocess_props: # handle encoding encoding = preprocess_props.get('encoding') if encoding: encoding_type = encoding.get('type', None) column = encoding.get('column', None) if column in attributes: dataset, classes_map = encode( df=dataset, encoding_type=encoding_type.lower(), column=column) if classes_map: self.dataset_props[ 'label_encoding_classes'] = classes_map logger.info( f"adding classes_map to dataset props: \n{classes_map}" ) logger.info( f"shape of the dataset after encoding => {dataset.shape}" ) # preprocessing strategy: mean, median, mode etc.. 
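                # a hedged sketch of the yaml block this branch consumes; the key
                # names mirror the lookups below, while the concrete values are
                # assumptions that depend on what handle_missing_values and
                # normalize actually support:
                #
                # preprocess:
                #   missing_values: mean
                #   scale:
                #     method: standard
                #     target: inputs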
                strategy = preprocess_props.get('missing_values')
                if strategy:
                    dataset = handle_missing_values(dataset, strategy=strategy)
                    logger.info(f"shape of the dataset after handling missing values => {dataset.shape}")

            if target == 'predict' or target == 'fit_cluster':
                x = _reshape(dataset.to_numpy())
                if not preprocess_props:
                    return x
                scaling_props = preprocess_props.get('scale', None)
                if not scaling_props:
                    return x
                else:
                    scaling_method = scaling_props.get('method', None)
                    return normalize(x, method=scaling_method)

            if any(col not in attributes for col in self.target):
                raise Exception("chosen target(s) to predict must exist in the dataset")

            y = pd.concat([dataset.pop(x) for x in self.target], axis=1)
            x = _reshape(dataset.to_numpy())
            y = _reshape(y.to_numpy())
            logger.info(f"y shape: {y.shape} and x shape: {x.shape}")

            # handle data scaling
            if preprocess_props:
                scaling_props = preprocess_props.get('scale', None)
                if scaling_props:
                    scaling_method = scaling_props.get('method', None)
                    scaling_target = scaling_props.get('target', None)
                    if scaling_target == 'all':
                        x = normalize(x, method=scaling_method)
                        y = normalize(y, method=scaling_method)
                    elif scaling_target == 'inputs':
                        x = normalize(x, method=scaling_method)
                    elif scaling_target == 'outputs':
                        y = normalize(y, method=scaling_method)

            if target == 'evaluate':
                return x, y

            split_options = self.dataset_props.get('split', None)
            if not split_options:
                return x, y, None, None
            test_size = split_options.get('test_size')
            shuffle = split_options.get('shuffle')
            stratify = split_options.get('stratify')
            x_train, x_test, y_train, y_test = train_test_split(
                x, y,
                test_size=test_size,
                shuffle=shuffle,
                stratify=None if not stratify or stratify.lower() == "default" else stratify)
            return x_train, y_train, x_test, y_test

        except Exception as e:
            logger.exception(f"error occurred while preparing the data: {e.args}")

    def _prepare_clustering_data(self):
        """ preprocess the data for the clustering algorithm """
        return self._process_data(target='fit_cluster')

    def _prepare_predict_data(self):
        """ preprocess the predict data so that it matches the data used when training the model """
        return self._process_data(target='predict')

    def get_evaluation(self, model, x_test, y_true, y_pred, **kwargs):
        try:
            res = evaluate_model(model_type=self.model_type,
                                 model=model,
                                 x_test=x_test,
                                 y_pred=y_pred,
                                 y_true=y_true,
                                 get_score_only=False,
                                 **kwargs)
        except Exception:
            # fall back to returning only the score if the full evaluation fails
            res = evaluate_model(model_type=self.model_type,
                                 model=model,
                                 x_test=x_test,
                                 y_pred=y_pred,
                                 y_true=y_true,
                                 get_score_only=True,
                                 **kwargs)
        return res

    def fit(self, **kwargs):
        """
        fit a machine learning model and save it to a file along with a description.json file
        @return: None
        """
        x_train = None
        x_test = None
        y_train = None
        y_test = None

        if self.model_type == 'clustering':
            x_train = self._prepare_clustering_data()
        else:
            x_train, y_train, x_test, y_test = self._prepare_fit_data()
        self.model, model_args = self._create_model(**kwargs)
        logger.info(f"executing a {self.model.__class__.__name__} algorithm...")

        # convert to a multioutput model if there is more than one target to predict:
        if self.model_type != 'clustering' and len(self.target) > 1:
            logger.info("predicting multiple targets detected. Hence, the model will be automatically "
                        "converted to a multioutput model")
            self.model = MultiOutputClassifier(self.model) \
                if self.model_type == 'classification' else MultiOutputRegressor(self.model)

        if self.model_type != 'clustering':
            self.model.fit(x_train, y_train)
        else:
            self.model.fit(x_train)

        saved = self._save_model(self.model)
        if saved:
            logger.info(f"model saved successfully and can be found in the {self.results_path} folder")

        eval_results = None
        if self.model_type == 'clustering':
            eval_results = self.model.score(x_train)
        else:
            if x_test is None:
                logger.info("no split options were provided. The training score will be calculated")
                eval_results = self.model.score(x_train, y_train)
            else:
                logger.info("split option detected. The performance will be automatically evaluated "
                            "using the test data portion")
                y_pred = self.model.predict(x_test)
                eval_results = self.get_evaluation(model=self.model,
                                                   x_test=x_test,
                                                   y_true=y_test,
                                                   y_pred=y_pred,
                                                   **kwargs)

        fit_description = {
            "model": self.model.__class__.__name__,
            "arguments": model_args if model_args else "default",
            "type": self.model_props['type'],
            "algorithm": self.model_props['algorithm'],
            "dataset_props": self.dataset_props,
            "model_props": self.model_props,
            "data_path": self.data_path,
            "train_data_shape": x_train.shape,
            "test_data_shape": None if x_test is None else x_test.shape,
            "train_data_size": x_train.shape[0],
            "test_data_size": None if x_test is None else x_test.shape[0],
            "results_path": str(self.results_path),
            "model_path": str(self.default_model_path),
            "target": None if self.model_type == 'clustering' else self.target,
            "results_on_test_data": eval_results,
            "cluster_centers": None if self.model_type != 'clustering' else self.model.cluster_centers_,
            "cluster_labels": None if self.model_type != 'clustering' else self.model.labels_,
        }
        try:
            logger.info(f"saving fit description to {self.description_file}")
            with open(self.description_file, 'w', encoding='utf-8') as f:
                json.dump(fit_description, f, ensure_ascii=False, indent=4)
        except Exception as e:
            logger.exception(f"Error while storing the fit description file: {e}")

    def evaluate(self, **kwargs):
        """
        evaluate a pre-fitted model and save the results to an evaluation.json file
        @return: None
        """
        x_val = None
        y_true = None
        eval_results = None
        try:
            model = self._load_model()
            if self.model_type != 'clustering':
                x_val, y_true = self._prepare_eval_data()
                y_pred = model.predict(x_val)
                eval_results = self.get_evaluation(model=model,
                                                   x_test=x_val,
                                                   y_true=y_true,
                                                   y_pred=y_pred,
                                                   **kwargs)
            else:
                x_val = self._prepare_clustering_data()
                y_pred = model.predict(x_val)
                eval_results = model.score(x_val, y_pred)

            logger.info(f"saving the evaluation results to {self.evaluation_file}")
            with open(self.evaluation_file, 'w', encoding='utf-8') as f:
                json.dump(eval_results, f, ensure_ascii=False, indent=4)

        except Exception as e:
            logger.exception(f"error occurred during evaluation: {e}")

    def predict(self):
        """
        use a pre-fitted model to make predictions and save them as a csv file
        @return: None
        """
        try:
            model = self._load_model(f=self.model_path)
            x_val = self._prepare_predict_data()  # the same is used for clustering
            y_pred = model.predict(x_val)
            y_pred = _reshape(y_pred)
            logger.info(f"predictions shape: {y_pred.shape} | shape len: {len(y_pred.shape)}")
            logger.info(f"predict on targets: {self.target}")
            df_pred = pd.DataFrame.from_dict({
                self.target[i]: y_pred[:, i] if len(y_pred.shape) > 1 else y_pred
                for i in range(len(self.target))
            })
            logger.info(f"saving the predictions to {self.prediction_file}")
            df_pred.to_csv(self.prediction_file)
        except Exception as e:
            logger.exception(f"Error while preparing predictions: {e}")

    @staticmethod
    def create_init_mock_file(model_type=None, model_name=None, target=None, *args, **kwargs):
        path = configs.get('init_file_path', None)
        if not path:
            raise Exception("You need to provide a path for the init file")

        dataset_props = Igel.default_dataset_props
        model_props = Igel.default_model_props
        if model_type:
            logger.info(f"user selected model type = {model_type}")
            model_props['type'] = model_type
        if model_name:
            logger.info(f"user selected algorithm = {model_name}")
            model_props['algorithm'] = model_name

        logger.info(f"initializing a default igel.yaml in {path}")
        default_data = {
            "dataset": dataset_props,
            "model": model_props,
            "target": ['provide your target(s) here'] if not target else [tg for tg in target.split()]
        }
        created = create_yaml(default_data, path)
        if created:
            logger.info(f"a default igel.yaml is created for you in {path}. "
                        f"you just need to overwrite the values to meet your expectations")
        else:
            logger.warning("something went wrong while initializing a default file")
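# A minimal, hedged usage sketch for the class above. The file names
# ('train.csv', 'igel.yaml', 'eval.csv', 'new_data.csv') and the function name
# are hypothetical; the yaml must define the dataset/model/target keys read in
# __init__. Each constructor call dispatches to the method named by 'cmd' via
# getattr at the end of __init__, so constructing the object runs the command.

def example_igel_usage():
    # fit: read igel.yaml, train, and write the model plus description.json
    Igel(cmd='fit', data_path='train.csv', yaml_path='igel.yaml')
    # evaluate/predict: reload the stored description.json and the saved model
    Igel(cmd='evaluate', data_path='eval.csv')
    Igel(cmd='predict', data_path='new_data.csv')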
class ModelTrainer:
    """
    Train a machine learning model based on the input yaml config
    """
    RAND_SEED = 42
    input_cmds = ('fit', 'evaluate', 'predict', 'experiment')
    supported_types = ('regression', 'classification', 'clustering')
    results_path = configs.get('results_path')  # path to the results folder
    default_model_path = configs.get('default_model_path')  # path to the pre-fitted model
    description_file = configs.get('description_file')  # path to the description.json file
    evaluation_file = configs.get('evaluation_file')  # path to the evaluation.json file
    prediction_file = configs.get('prediction_file')  # path to the predictions.csv
    default_dataset_props = configs.get('dataset_props')  # dataset props that can be changed from the yaml file
    default_model_props = configs.get('model_props')  # model props that can be changed from the yaml file
    model = None

    def __init__(self, *args, **kwargs) -> None:
        self.data_path: str = kwargs.get('data_path', None)
        self.logfile = kwargs.get('logfile', None)
        self.command = kwargs.get('cmd', None)
        self.results_path = kwargs.get('results_path', None)  # path to the results folder
        self._x_columns = None

        # use the results_path specified in the input, otherwise the default one
        if self.results_path is None:
            self.results_path = ModelTrainer.results_path  # path to the results folder
        else:
            self.default_model_path = os.path.join(self.results_path, configs.get('model_file'))
            self.description_file = os.path.join(self.results_path, 'description.json')  # path to the description.json file
            self.evaluation_file = os.path.join(self.results_path, 'evaluation.json')  # path to the evaluation.json file
            self.prediction_file = os.path.join(self.results_path, 'prediction.csv')  # path to the predictions csv

        logger.info(f"Entered kwargs: {kwargs}")
        if not self.command or self.command not in self.input_cmds:
            raise Exception(f"You must enter a valid command.\n"
                            f"available commands: {self.input_cmds}")

        if self.command == "fit":
            self.yml_path = kwargs.get('yaml_path', None)
            file_ext = self.yml_path.split('.')[-1]
            logger.info(f"You passed the configurations as a {file_ext} file.")
            self.yaml_configs = read_yaml(self.yml_path) if file_ext in ('yaml', 'yml') \
                else read_json(self.yml_path)
            logger.info(f"your chosen configuration: {self.yaml_configs}")

            # dataset options given by the user
            self.dataset_props: dict = self.yaml_configs.get('dataset', self.default_dataset_props)
            # model options given by the user
            self.model_props: dict = self.yaml_configs.get('model', self.default_model_props)
            # list of target(s) to predict
            self.target: list = self.yaml_configs.get('target', None)
            # list of obs_id(s) to identify observations
            self.observation_id: list = self.yaml_configs.get('observation_id', None)
            self.model_type: str = self.model_props.get('type', None)
            logger.info(f"dataset_props: {self.dataset_props} \n"
                        f"model_props: {self.model_props} \n "
                        f"target: {self.target} \n")

            # handle random number generation
            random_num_options = self.dataset_props.get('random_numbers', None)
            if random_num_options:
                generate_reproducible = random_num_options.get('generate_reproducible', None)
                if generate_reproducible:
                    logger.info("You provided the generate reproducible results option.")
                    seed = random_num_options.get('seed', self.RAND_SEED)
                    np.random.seed(seed)
                    logger.info(f"Setting a seed = {seed} to generate the same random numbers on each experiment..")

        # if the entered command is evaluate or predict, then the pre-fitted model needs to be loaded and used
        else:
            self.model_path = kwargs.get('model_path', self.default_model_path)
            logger.info(f"path of the pre-fitted model => {self.model_path}")
            # load the description file to read the stored training parameters
            with open(self.description_file, 'r') as f:
                dic = json.load(f)
                self.target: list = dic.get("target")  # target(s) to predict as a list
                self.model_type: str = dic.get("type")  # type of the model -> regression or classification
                self.dataset_props: dict = dic.get('dataset_props')  # dataset props entered while fitting

        getattr(self, self.command)()

    def _create_model(self, **kwargs):
        """
        fetch a model depending on the type and algorithm provided by the user and return it
        @return: instance of the chosen model and the model arguments
        """
        model_type: str = self.model_props.get('type')
        model_algorithm: str = self.model_props.get('algorithm')
        use_cv = self.model_props.get('use_cv_estimator', None)

        model_args = None
        if not model_type or not model_algorithm:
            raise Exception("model_type and algorithm cannot be None")

        algorithms: dict = models_dict.get(model_type)  # extract all algorithms as a dictionary
        model = algorithms.get(model_algorithm)  # extract the model entry depending on the algorithm
        logger.info(f"Solving a {model_type} problem using ===> {model_algorithm}")
        if not model:
            raise Exception("Model not found in the algorithms list")
        else:
            model_props_args = self.model_props.get('arguments', None)
            if model_props_args and isinstance(model_props_args, dict):
                model_args = model_props_args
            elif not model_props_args or model_props_args.lower() == "default":
                model_args = None

            if use_cv:
                model_class = model.get('cv_class', None)
                if model_class:
                    logger.info(f"cross validation estimator detected. "
                                f"Switching to the CV version of the {model_algorithm} algorithm")
                else:
                    logger.info(f"No CV class found for the {model_algorithm} algorithm")
                    model_class = model.get('class')  # fall back to the standard class
            else:
                model_class = model.get('class')
            logger.info(f"model arguments: \n"
                        f"{self.model_props.get('arguments')}")
            model = model_class(**kwargs) if not model_args else model_class(**model_args)
            return model, model_args

    def _save_model(self, model):
        """
        save the model to a binary file
        @param model: model to save
        @return: bool
        """
        try:
            if not os.path.exists(self.results_path):
                logger.info(f"creating model_results folder to save results...\n"
                            f"path of the results folder: {self.results_path}")
                os.mkdir(self.results_path)
            else:
                logger.info(f"Folder {self.results_path} already exists")
                logger.warning(f"data in the {self.results_path} folder will be overridden. If you don't "
                               f"want this, then move the current {self.results_path} to another path")
        except OSError:
            logger.exception(f"Creating the directory {self.results_path} failed")
        else:
            logger.info(f"Successfully created the directory in {self.results_path}")
            with open(self.default_model_path, 'wb') as f:
                pickle.dump(model, f)
            return True

    def _load_model(self, f: str = ''):
        """
        load a saved model from file
        @param f: path to the model
        @return: loaded model
        """
        try:
            if not f:
                logger.info(f"result path: {self.results_path}")
                logger.info(f"loading model from {self.default_model_path}")
                with open(self.default_model_path, 'rb') as fp:
                    model = pickle.load(fp)
            else:
                logger.info(f"loading from {f}")
                with open(f, 'rb') as fp:
                    model = pickle.load(fp)
            return model
        except FileNotFoundError:
            logger.error(f"File not found in {self.default_model_path}")

    def _prepare_clustering_data(self):
        """ preprocess the data for the clustering algorithm """
        return self._process_data(target='fit_cluster')

    def _prepare_predict_data(self):
        """ preprocess the predict data so that it matches the data used when training the model """
        return self._process_data(target='predict')

    def _prepare_fit_data(self):
        return self._process_data(target='fit')

    def _prepare_eval_data(self):
        return self._process_data(target='evaluate')

    def _process_data(self, target='fit'):
        """
        read and return the data as x and y
        @return: list of separate x and y
        """
        assert isinstance(self.target, list), "provide target(s) as a list in the yaml file"
        if self.model_type != "clustering":
            assert len(self.target) > 0, "please provide at least one target to predict"

        try:
            read_data_options = self.dataset_props.get('read_data_options', None)
            dataset = pd.read_csv(self.data_path) if not read_data_options \
                else pd.read_csv(self.data_path, **read_data_options)
            logger.info(f"dataset shape: {dataset.shape}")
            attributes = list(dataset.columns)
            logger.info(f"dataset attributes: {attributes}")

            # handle preprocessing options (encoding, missing values, scaling) given in the yaml file
            preprocess_props = self.dataset_props.get('preprocess', None)
            if preprocess_props:
                # handle encoding
                encoding = preprocess_props.get('encoding')
                if encoding:
                    encoding_type = encoding.get('type', None)
                    column = encoding.get('column', None)
                    if column in attributes:
                        dataset, classes_map = encode(df=dataset,
                                                      encoding_type=encoding_type.lower(),
                                                      column=column)
                        if classes_map:
                            self.dataset_props['label_encoding_classes'] = classes_map
                            logger.info(f"adding classes_map to dataset props: \n{classes_map}")
                        logger.info(f"shape of the dataset after encoding => {dataset.shape}")

                # preprocessing strategy for missing values: mean, median, mode etc.
                strategy = preprocess_props.get('missing_values')
                if strategy:
                    dataset = handle_missing_values(dataset, strategy=strategy)
                    logger.info(f"shape of the dataset after handling missing values => {dataset.shape}")

            if target == 'predict' or target == 'fit_cluster':
                x = _reshape(dataset.to_numpy())
                if not preprocess_props:
                    return x
                scaling_props = preprocess_props.get('scale', None)
                if not scaling_props:
                    return x
                else:
                    scaling_method = scaling_props.get('method', None)
                    return normalize(x, method=scaling_method)

            if any(col not in attributes for col in self.target):
                raise Exception("chosen target(s) to predict must exist in the dataset")

            # remove the target variable(s) from the dataset & concat them into y
            y = pd.concat([dataset.pop(x) for x in self.target], axis=1)
            x = _reshape(dataset.to_numpy())
            y = _reshape(y.to_numpy())
            logger.info(f"y shape: {y.shape} and x shape: {x.shape}")
            self._x_columns = dataset.columns.to_list()
            logger.info(f"X columns: {self._x_columns}")

            # handle data scaling
            if preprocess_props:
                scaling_props = preprocess_props.get('scale', None)
                if scaling_props:
                    scaling_method = scaling_props.get('method', None)
                    scaling_target = scaling_props.get('target', None)
                    if scaling_target == 'all':
                        x = normalize(x, method=scaling_method)
                        y = normalize(y, method=scaling_method)
                    elif scaling_target == 'inputs':
                        x = normalize(x, method=scaling_method)
                    elif scaling_target == 'outputs':
                        y = normalize(y, method=scaling_method)

            if target == 'evaluate':
                return x, y

            split_options = self.dataset_props.get('split', None)
            if not split_options:
                return x, y, None, None
            test_size = split_options.get('test_size')
            shuffle = split_options.get('shuffle')
            stratify = split_options.get('stratify')
            x_train, x_test, y_train, y_test = train_test_split(
                x, y,
                test_size=test_size,
                shuffle=shuffle,
                stratify=None if not stratify or stratify.lower() == "default" else stratify)
            return x_train, y_train, x_test, y_test

        except Exception as e:
            logger.exception(f"error occurred while preparing the data: {e.args}")

    def get_evaluation(self, model, x_test, y_true, y_pred, y_score, **kwargs):
        try:
            res = evaluate_model(model_type=self.model_type,
                                 model=model,
                                 x_test=x_test,
                                 y_pred=y_pred,
                                 y_true=y_true,
                                 y_score=y_score,
                                 get_score_only=False,
                                 **kwargs)
        except Exception as e:
            logger.debug(e)
            # fall back to returning only the score if the full evaluation fails
            res = evaluate_model(model_type=self.model_type,
                                 model=model,
                                 x_test=x_test,
                                 y_pred=y_pred,
                                 y_true=y_true,
                                 y_score=y_score,
                                 get_score_only=True,
                                 **kwargs)
        return res

    def fit(self, **kwargs):
        """
        fit a machine learning model and save it to a file along with a description.json file
        """
        x_train = None
        y_train = None
        x_test = None
        y_test = None
        cv_results = None
        eval_results = None
        cv_params = None
        hp_search_results = {}

        if self.model_type == 'clustering':
            x_train = self._prepare_clustering_data()
        else:
            x_train, y_train, x_test, y_test = self._prepare_fit_data()
        self.model, model_args = self._create_model(**kwargs)
        logger.info(f"executing a {self.model.__class__.__name__} algorithm...")

        # convert to a multioutput model if there is more than one target to predict:
        if self.model_type != 'clustering' and len(self.target) > 1:
            logger.info("predicting multiple targets detected. Hence, the model will be automatically "
                        "converted to a multioutput model")
            self.model = MultiOutputClassifier(self.model) \
                if self.model_type == 'classification' else MultiOutputRegressor(self.model)

        if self.model_type != 'clustering':
            cv_params = self.model_props.get('cross_validate', None)
            if not cv_params:
                logger.info("cross validation is not provided")
            else:
                # perform cross validation
                logger.info("performing cross validation ...")
                cv_results = cross_validate(estimator=self.model,
                                            X=x_train,
                                            y=y_train,
                                            **cv_params)

            hyperparams_props = self.model_props.get('hyperparameter_search', None)
            if hyperparams_props:
                # perform hyperparameter search
                method = hyperparams_props.get('method', None)
                grid_params = hyperparams_props.get('parameter_grid', None)
                hp_args = hyperparams_props.get('arguments', None) or {}
                logger.info(f"Performing hyperparameter search using -> {method}")
                logger.info(f"Grid parameters entered by the user: {grid_params}")
                logger.info(f"Additional hyperparameter arguments: {hp_args}")
                best_estimator, best_score, best_params = hyperparameter_search(model=self.model,
                                                                                method=method,
                                                                                params=grid_params,
                                                                                x_train=x_train,
                                                                                y_train=y_train,
                                                                                **hp_args)
                hp_search_results['best_params'] = best_params
                hp_search_results['best_score'] = best_score
                self.model = best_estimator

            self.model.fit(x_train, y_train)
        else:
            # if the model type is clustering
            self.model.fit(x_train)

        saved = self._save_model(self.model)
        if saved:
            logger.info(f"model saved successfully and can be found in the {self.results_path} folder")

        if self.model_type == 'clustering':
            eval_results = self.model.score(x_train)
        else:
            if x_test is None:
                logger.info("no split options were provided. The training score will be calculated")
                eval_results = self.model.score(x_train, y_train)
            else:
                logger.info("split option detected. The performance will be automatically evaluated "
                            "using the test data portion")
                y_pred = self.model.predict(x_test)
                y_score = self.model.predict_proba(x_test) if self.model_type == 'classification' else None
                eval_results = self.get_evaluation(model=self.model,
                                                   x_test=x_test,
                                                   y_true=y_test,
                                                   y_pred=y_pred,
                                                   y_score=y_score,
                                                   **kwargs)

        fit_description = {
            "model": self.model.__class__.__name__,
            "arguments": model_args if model_args else "default",
            "type": self.model_props['type'],
            "algorithm": self.model_props['algorithm'],
            "dataset_props": self.dataset_props,
            "model_props": self.model_props,
            "data_path": self.data_path,
            "train_data_shape": x_train.shape,
            "test_data_shape": None if x_test is None else x_test.shape,
            "train_data_size": x_train.shape[0],
            "test_data_size": None if x_test is None else x_test.shape[0],
            "results_path": str(self.results_path),
            "model_path": str(self.default_model_path),
            "target": None if self.model_type == 'clustering' else self.target,
            "results_on_test_data": eval_results,
            "hyperparameter_search_results": hp_search_results
        }
        if self.model_type == 'clustering':
            clustering_res = {
                "cluster_centers": self.model.cluster_centers_,
                "cluster_labels": self.model.labels_
            }
            fit_description['clustering_results'] = clustering_res
        if cv_params:
            cv_res = {
                "fit_time": cv_results['fit_time'].tolist(),
                "score_time": cv_results['score_time'].tolist(),
                "test_score": cv_results['test_score'].tolist()
            }
            fit_description['cross_validation_params'] = cv_params
            fit_description['cross_validation_results'] = cv_res
        try:
            logger.info(f"saving fit description to {self.description_file}")
            with open(self.description_file, 'w', encoding='utf-8') as f:
                json.dump(fit_description, f, ensure_ascii=False, indent=4)
        except Exception as e:
            logger.exception(f"Error while storing the fit description file: {e}")

    def evaluate(self, **kwargs):
        """
        evaluate a pre-fitted model and save the results to an evaluation.json file
        @return: None
        """
        x_val = None
        y_true = None
        eval_results = None
        try:
            model = self._load_model()
            if self.model_type != 'clustering':
                x_val, y_true = self._prepare_eval_data()
                y_pred = model.predict(x_val)
                y_score = model.predict_proba(x_val) if self.model_type == 'classification' else None
                eval_results = self.get_evaluation(model=model,
                                                   x_test=x_val,
                                                   y_true=y_true,
                                                   y_pred=y_pred,
                                                   y_score=y_score,
                                                   **kwargs)
            else:
                x_val = self._prepare_clustering_data()
                y_pred = model.predict(x_val)
                eval_results = model.score(x_val, y_pred)

            logger.info(f"saving the evaluation results to {self.evaluation_file}")
            with open(self.evaluation_file, 'w', encoding='utf-8') as f:
                json.dump(eval_results, f, ensure_ascii=False, indent=4)

        except Exception as e:
            logger.exception(f"error occurred during evaluation: {e}")

    def predict(self):
        """
        use a pre-fitted model to make predictions and save them as a csv file
        @return: None
        """
        try:
            model = self._load_model(f=self.model_path)
            x_val = self._prepare_predict_data()  # the same is used for clustering
            y_pred = model.predict(x_val)
            # for binary classification, save the probability of the positive class instead of the label
            y_pred = _reshape(model.predict_proba(x_val)[:, 1]) if (
                type_of_target(y_pred) == 'binary'
                and self.model_type == 'classification') else _reshape(y_pred)
            logger.info(f"predictions shape: {y_pred.shape} | shape len: {len(y_pred.shape)}")
            logger.info(f"predict on targets: {self.target}")
            df_pred = pd.DataFrame.from_dict({
                self.target[i]: y_pred[:, i] if len(y_pred.shape) > 1 else y_pred
                for i in range(len(self.target))
            })
            logger.info(f"saving the predictions to {self.prediction_file}")
            df_pred.to_csv(self.prediction_file)
        except Exception as e:
            logger.exception(f"Error while preparing predictions: {e}")

    @staticmethod
    def create_init_config_file(model_type=None, model_name=None, target=None, *args, **kwargs):
        path = configs.get('init_file_path', None)
        if not path:
            raise Exception("You need to provide a path for the init file")

        dataset_props = ModelTrainer.default_dataset_props
        model_props = ModelTrainer.default_model_props
        if model_type:
            logger.info(f"user selected model type = {model_type}")
            model_props['type'] = model_type
        if model_name:
            logger.info(f"user selected algorithm = {model_name}")
            model_props['algorithm'] = model_name

        logger.info(f"initializing a default config yaml in {path}")
        default_data = {
            "dataset": dataset_props,
            "model": model_props,
            "target": ['provide your target(s) here'] if not target else [tg for tg in target.split()]
        }
        created = create_yaml(default_data, path)
        if created:
            logger.info(f"a default config yaml is created for you in {path}. "
                        f"you just need to overwrite the values to meet your expectations")
        else:
            logger.warning("something went wrong while initializing a default file")
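# A hedged sketch of a config dict exercising the extra ModelTrainer hooks
# (read_data_options, random_numbers, use_cv_estimator, cross_validate and
# hyperparameter_search). The key names are taken from the lookups in
# __init__, _create_model, _process_data and fit above; the algorithm name
# and the hyperparameter search method are assumptions and must match what
# models_dict and hyperparameter_search actually support.

EXAMPLE_TRAINER_CONFIG = {
    "dataset": {
        "read_data_options": {"sep": ","},  # forwarded to pd.read_csv
        "split": {"test_size": 0.2, "shuffle": True, "stratify": "default"},
        "random_numbers": {"generate_reproducible": True, "seed": 42},
    },
    "model": {
        "type": "classification",
        "algorithm": "RandomForest",        # assumed key in models_dict
        "arguments": "default",
        "use_cv_estimator": False,
        "cross_validate": {"cv": 5},        # forwarded to sklearn's cross_validate
        "hyperparameter_search": {
            "method": "grid_search",        # assumed method name
            "parameter_grid": {"n_estimators": [50, 100]},
            "arguments": {"cv": 3},
        },
    },
    "target": ["label"],
}

# Saved as config.yaml (or config.json), this would be consumed by, e.g.:
# ModelTrainer(cmd='fit', data_path='train.csv', yaml_path='config.yaml')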
from flask import Flask, request, jsonify
import pandas as pd
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
import csv
import json

app = Flask(__name__)

# train a multi-output model on the first dataset
df = pd.read_csv("train_data1.csv")
df_X = df.iloc[:, 1:12].copy()   # train input
df_Y = df.iloc[:, 12:16].copy()  # train output

vendorArray = []
forest = RandomForestClassifier(n_estimators=100, random_state=1)
multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)
multi_target_forest_updated = MultiOutputClassifier(forest, n_jobs=-1)
multi_target_forest.fit(df_X, df_Y)

# train a second multi-output model on the second dataset
df = pd.read_csv("train_data2.csv")
df_X = df.iloc[:, 1:12].copy()   # train input
df_Y = df.iloc[:, 12:16].copy()  # train output

# coursesArray = []
forest1 = RandomForestClassifier(n_estimators=100, random_state=1)
multi_target_forest1 = MultiOutputClassifier(forest1, n_jobs=-1)
multi_target_forest_updated1 = MultiOutputClassifier(forest1, n_jobs=-1)
multi_target_forest1.fit(df_X, df_Y)


def findVendor(index):
    vendors = {
        0: "ACE American Insurance Company",
        1: "American Agri-Business Insurance Company",