def main():
    """Train and evaluate an ML-kNN classifier on the IMDB movie data.

    Reads ``IMDB-Movie-Data.csv``, builds the label matrix from the
    ``Genre`` column and one feature vector per ``Description`` entry,
    holds out 33% of the rows (``random_state=42``), fits ``MLkNN(k=4)`` on
    the remainder and prints the Hamming loss on the held-out rows.
    """
    data = readData("IMDB-Movie-Data.csv")
    genres = data["Genre"]
    descriptions = data["Description"]
    labels = getLabels(genres)

    # Called for its side effect only; presumably prepares the n-gram state
    # that extract_features relies on -- TODO confirm.
    calculateNgrams(descriptions)
    features = list(map(extract_features, descriptions))
    # Function-call form: valid on Python 2 and 3 and consistent with the
    # print(...) call at the end of this function.
    print(len(features[1]))

    # X = features, Y = labels
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.33, random_state=42)

    classifier = MLkNN(k=4)
    classifier.fit(X_train, y_train)

    predictions = classifier.predict(np.array(X_test))
    # hamming_loss takes (y_true, y_pred).
    print('Hamming loss: {0}'.format(
        sklearn.metrics.hamming_loss(y_test, predictions)))
    # NOTE(review): the original snippet ended with an unterminated ''''
    # string opener right here; its closing quotes are not visible, so it
    # was left out.
def adapted(data):
    """Fit ML-kNN (k=20) and score it on the held-out split.

    ``X_train``, ``y_train``, ``X_test`` and ``y_test`` are not parameters;
    they are resolved from the surrounding scope. ``data`` is accepted for
    interface compatibility but is not used.

    Returns:
        The ``accuracy_score`` of the predictions on ``X_test``. The
        original computed this value and then returned ``None``; callers
        that ignore the return value are unaffected.
    """
    classifier = MLkNN(k=20)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return accuracy_score(y_test, predictions)
def get_cado_predictions():
    """Fit ML-kNN on the CADO training set and predict the test set.

    Text is read from column 6 and the 12 label columns start at column 7.
    Train and test texts are tokenized together, converted to integer
    sequences and padded/truncated to length 700 before the classifier is
    fitted on the training rows.

    Returns:
        ``(y_pred, y_score)``: dense arrays holding the predicted labels and
        the ``predict_proba`` scores for the test rows.
    """
    data_path = '../../datasets/cado/train.csv'
    test_path = '../../datasets/cado/test.csv'
    data = du.load_data(data_path)
    test = du.load_data(test_path)

    text_index = 6
    label_start_index = 7
    label_end_index = label_start_index + 12

    train_texts = [d[text_index] for d in data]
    test_texts = [d[text_index] for d in test]
    # Only the training labels are needed. The original also parsed the test
    # labels, stacked them under the training labels and sliced them off
    # again without ever using them.
    Y_train = np.array(
        [d[label_start_index:label_end_index] for d in data], dtype='int')

    # Tokenize train and test together so both share one vocabulary.
    test_index = len(train_texts)
    all_texts = train_texts + test_texts
    tokenizer = tokenize_data(all_texts)
    sequences = tokenizer.texts_to_sequences(all_texts)
    X = pad_sequences(sequences, maxlen=700, padding="post",
                      truncating="post", value=0)

    X_train = X[:test_index, :]
    x_test = X[test_index:, :]

    classifier = MLkNN()
    classifier.fit(X_train, Y_train)
    predictions = classifier.predict(x_test)
    scores = classifier.predict_proba(x_test)

    return predictions.toarray(), scores.toarray()
def mlknn(train_data_inx,y_train,test_data_inx):
    """Fit ML-kNN on selected TF-IDF rows and predict labels for others.

    Rows are looked up in the module-level ``corpus_tfidf`` by the given
    indices; ``k`` comes from the module-level ``mlknn_k``.

    Returns:
        Dense array of predicted labels, one row per test index.
    """
    model = MLkNN(k=mlknn_k)

    # Positional lookups are kept so index containers behave as before.
    train_rows = [corpus_tfidf[train_data_inx[pos]]
                  for pos in range(len(train_data_inx))]
    test_rows = [corpus_tfidf[test_data_inx[pos]]
                 for pos in range(len(test_data_inx))]

    model.fit(csr_matrix(train_rows), csr_matrix(y_train))
    predicted = model.predict(csr_matrix(test_rows))
    return predicted.toarray()
def MLKNN_method(X_train, y_train, ml_k, ml_s):
    """
    Adapted algorithm --> ML-kNN method.

    :param X_train: input data
    :param y_train: corresponding label data
    :param ml_k: passed to MLkNN as ``k`` after conversion with ``int``
    :param ml_s: passed to MLkNN as ``s`` after conversion with ``float``
    :return: the fitted classifier, or ``None`` when construction or fitting
        raised (a warning is printed in that case)
    """
    fitted = None
    try:
        model = MLkNN(k=int(ml_k), s=float(ml_s))
        model.fit(X_train, y_train)
        fitted = model
    except Exception as e:
        # Best-effort by design: report the failure and hand back None.
        print("warning----改编算法KNN|MLKNN----" + str(e))
    return fitted
def mlknn(self, number):
    """Fit ML-kNN with ``k=number`` and print test-set metrics.

    Trains on ``self.X_train`` / ``self.y_train``, predicts ``self.X_test``
    and prints the Hamming loss, micro-averaged F1 and micro-averaged
    precision against ``self.y_test``, in that order.
    """
    model = MLkNN(k=number)
    model.fit(self.X_train, self.y_train)
    predicted = model.predict(self.X_test)

    print("hanming_loss,", hamming_loss(self.y_test, predicted))
    print("micro -f1: ", f1_score(self.y_test, predicted, average='micro'))
    print(precision_score(self.y_test, predicted, average='micro'))
def RecommendByMLKNN(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
    """ML-kNN algorithm.

    Fits ML-kNN with ``k`` equal to the number of label columns, turns the
    predicted probabilities for ``test_data`` into a recommendation list via
    ``DataProcessUtils.getListFromProbable`` and prints the intermediate
    results.

    Returns:
        ``[recommendList, answerList]`` where ``answerList`` is
        ``test_data_y`` unchanged.
    """
    label_count = train_data_y.shape[1]

    model = MLkNN(k=label_count)
    model.fit(train_data, train_data_y)

    # Convert the prediction result to a plain data array.
    predictions = numpy.asarray(model.predict_proba(test_data).todense())

    recommendList = DataProcessUtils.getListFromProbable(
        predictions, range(1, label_count + 1), recommendNum)
    answerList = test_data_y

    for shown in (predictions, test_data_y, recommendList, answerList):
        print(shown)
    return [recommendList, answerList]
def train(self):
    """Fit ML-kNN (k=10) on the instance data and score the test split.

    ``self.x_data``, ``self.y_data`` and ``self.x_test`` are densified via
    ``lil_matrix(...).toarray()`` before fitting and predicting.

    Returns:
        dict with ``'accuracy'`` and micro-averaged ``'f1_score'`` computed
        against ``self.y_test``.
    """
    def _dense(matrix_like):
        # Round-trip through lil_matrix to obtain a dense ndarray.
        return lil_matrix(matrix_like).toarray()

    model = MLkNN(k=10)
    train_features = _dense(self.x_data)
    train_labels = _dense(self.y_data)
    test_features = _dense(self.x_test)

    model.fit(train_features, train_labels)
    predicted = model.predict(test_features)

    scores = {}
    scores['accuracy'] = accuracy_score(self.y_test, predicted)
    scores['f1_score'] = f1_score(self.y_test, predicted, average='micro')
    return scores
def MLkNN(self):
    """Run the ML-kNN experiment described by the command-line arguments.

    Adds a ``--library`` flag to ``self.sub_parser``, parses
    ``sys.argv[2:]``, builds the vectorizer named by ``args.f``, loads the
    data, fits either scikit-multilearn's ``MLkNN`` (``--library``) or the
    local ``multi.MLkNN`` and prints the Hamming loss and a classification
    report.

    Raises:
        ValueError: if ``args.f`` is not a known vectorizer name. The
            original left ``vectorizer`` unbound in that case and failed
            later with an ``UnboundLocalError``.
    """
    self.sub_parser.add_argument('--library', action='store_true',
                                 default=False)
    args = self.sub_parser.parse_args(sys.argv[2:])
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print('Running ML-kNN, arguments=%s' % args)
    print('Loading %s data...' % args.N)

    vectorizer_factories = {
        'My_dict': my_dict_vectorizer,
        'LIB_count': lib_count_vectorizer,
        'LIB_hash': lib_hash_vectorizer,
        'LIB_tfidf': lib_tfidf_vectorizer,
    }
    if args.f not in vectorizer_factories:
        raise ValueError('Unknown feature extractor %r; expected one of: %s'
                         % (args.f, ', '.join(sorted(vectorizer_factories))))
    vectorizer = vectorizer_factories[args.f](stop=not args.nostop,
                                              bigram=args.bigram)

    data = load_data(args.N, args.D, args.Nt, vectorizer)
    print('Done loading data, actual feature size: %s' % (data[1].shape,))
    X, Y, Xt, Yt, cats = data

    # The local imports deliberately rebind the name ``MLkNN`` inside this
    # method to the selected implementation.
    if args.library:
        from skmultilearn.adapt import MLkNN
        model = MLkNN()
    else:
        from sklearn.neighbors import NearestNeighbors
        from multi import MLkNN
        model = MLkNN(NearestNeighbors)

    model.fit(X, Y)
    Yp = model.predict(Xt)

    # NOTE(review): the extent of this ``with`` block is reconstructed from
    # flattened source; all metric computation is assumed to be inside it
    # -- confirm against the original layout.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        hl = computeMetrics(Yp, Yt, cats)
        print('the hamming loss:')
        print('>>  %s' % (hl,))
        from sklearn.metrics import (hamming_loss, classification_report)
        print('hamming loss(library): %s' % (hamming_loss(Yt, Yp),))
        print(classification_report(Yt, Yp, target_names=cats))
    print('DONE..')
def create_model(file_path=FINAL_MLKNN_MODEL_FILE_PATH):
    """
    Create and train an MLkNN classifier with the stored optimized
    hyperparameters, then pickle the trained model to disk.

    The ``k`` and ``s`` values are read from the ``'hyperparameters'`` entry
    of the JSON file at ``OPTIMIZED_MODEL_PARAMETERS_FILE_PATH``.

    :param string file_path: specifies where the model should be saved
    :return: the trained MLkNN classifier
    """
    with open(OPTIMIZED_MODEL_PARAMETERS_FILE_PATH) as file:
        hyperparameters = json.load(file)['hyperparameters']

    question_data, music_data = preprocessing.load_data()
    question_data, music_data = preprocessing.preprocess_data(
        question_data, music_data)

    clf = MLkNN(k=hyperparameters['k'], s=hyperparameters['s'])
    clf.fit(question_data.values, music_data.values)

    # Context manager closes the handle even if pickling fails; the
    # original left the file object open.
    with open(file_path, 'wb') as model_file:
        pickle.dump(clf, model_file)
    return clf
def mlknn(x_tr, y_tr, x_te, x_va=None):
    """
    Fit ML-kNN (k=10) and predict labels for the test (and validation) set.

    :param x_tr: training features
    :param y_tr: training labels; converted with ``np.int32`` before fitting
    :param x_te: test features
    :param x_va: optional validation features
    :return: dense test predictions, or the tuple ``(test, validation)`` of
        dense predictions when ``x_va`` is given
    """
    # ``s`` is the smoothing parameter. The original passed ``True``, which
    # only worked because ``True == 1``; the explicit float avoids integer
    # arithmetic on the smoothing term.
    pred = MLkNN(k=10, s=1.0)
    y_tr = np.int32(y_tr)
    pred.fit(x_tr, y_tr)

    # Use the instance method: the original called the unbound
    # ``sparse.dok_matrix.toarray`` on whatever matrix type ``predict``
    # happened to return.
    y_te_ = pred.predict(x_te).toarray()
    if x_va is None:
        return y_te_
    y_va_ = pred.predict(x_va).toarray()
    return y_te_, y_va_
def adapt(X_train, y_train, X_test, y_test):
    """Fit ML-kNN (k=4) and print its accuracy on the test split.

    ``y_train`` and ``y_test`` are expected to expose ``.values`` (pandas
    objects); they are converted to SciPy COO matrices before use.
    """
    # ``DataFrame.to_sparse()`` was removed in pandas 1.0; building the COO
    # matrix from the underlying array works on old and new versions.
    from scipy.sparse import coo_matrix
    y_train = coo_matrix(y_train.values)
    y_test = coo_matrix(y_test.values)

    from skmultilearn.adapt import MLkNN
    classifier = MLkNN(k=4)
    print("Train Adapted algorithm")
    classifier.fit(X_train, y_train)

    print("Predict")
    predictions = classifier.predict(X_test)

    from sklearn.metrics import accuracy_score
    print("Accuracy")
    print(y_test.shape, predictions.shape)
    print(accuracy_score(y_test.toarray(), predictions))
def mlknn(traindata, trainlabel, ttype):
    """Grid-search ML-kNN, refit with the best parameters and save the model.

    Searches ``k`` in ``range(2, 5)`` and ``s`` in
    ``np.arange(0.1, 0.5, 0.2)`` with the ``'accuracy'`` score, records the
    winning combination via ``save_score`` and dumps the refitted model
    with ``joblib`` under ``./model/``; ``ttype`` is part of the file name.
    """
    # Candidate hyper-parameters to explore.
    param_grid = {'k': range(2, 5), 's': np.arange(0.1, 0.5, 0.2)}
    metric = 'accuracy'

    # Run the parameter search.
    found = search_bestparmaters(MLkNN(), param_grid, metric,
                                 traindata, trainlabel)
    best_k = found.best_params_['k']
    best_s = found.best_params_['s']
    save_score('score/record',
               ('mlknn', ttype, best_k, best_s, found.best_score_))

    model = MLkNN(best_k, best_s)
    model.fit(traindata, trainlabel)
    joblib.dump(model, './model/mlknn' + "_model" + ttype + ".m")
class MLkNN():
    """Windowed wrapper that refits an ML-kNN model on buffered samples.

    Samples passed to ``partial_fit`` are added to an ``InstanceWindow``;
    every ``window_size`` added samples the wrapped model is refitted on the
    window's current contents.
    """

    def __init__(self, window_size=100):
        # This class shadows the name ``MLkNN``, so the original
        # ``MLkNN(k=20)`` called this constructor again and failed on the
        # unexpected ``k`` keyword. Import the batch learner under an alias.
        # NOTE(review): assumes the wrapped model is scikit-multilearn's
        # MLkNN -- confirm against the file's imports.
        from skmultilearn.adapt import MLkNN as _BatchMLkNN

        self.h = _BatchMLkNN(k=20)
        self.window_size = window_size
        self.window = InstanceWindow(window_size)
        # Samples added since the last refit.
        self.number_element = 0
        # True once the wrapped model has been fitted at least once.
        self.flag = False
        # Number of label columns seen in the latest partial_fit call.
        self.L = None

    def partial_fit(self, X, y):
        """Buffer the rows of ``X`` / ``y`` and refit when the count is hit.

        ``y`` must be two-dimensional; its second dimension is stored in
        ``self.L``.
        """
        N, L = y.shape
        self.L = L
        for i in range(N):
            # Was ``if self.window=None:`` -- a syntax error.
            if self.window is None:
                self.window = InstanceWindow(self.window_size)
            self.window.add_element(np.asarray([X[i]]), np.asarray([[y[i]]]))
            self.number_element += 1
            if self.number_element == self.window_size:
                X_batch = self.window.get_attributes_matrix()
                y_batch = self.window.get_targets_matrix()
                self.h.fit(X_batch, y_batch)
                self.number_element = 0
                self.flag = True
# NOTE(review): the first statement below is the tail of a function whose
# header lies outside this view; it is reproduced unchanged.
    return vector


# Build the label matrices: each example is a whitespace-separated string
# that to_category_vector turns into one row.
document_X = []
document_Y = []
test_y = []
for example in tqdm(y_train):
    arr = example.strip().split()
    document_Y.append(to_category_vector(arr, max_len))
document_Y = np.array(document_Y)
for example in tqdm(Y_test):
    arr = example.strip().split()
    test_y.append(to_category_vector(arr, max_len))
test_y = np.array(test_y)

classifier_new = MLkNN(k=5)
classifier_new.fit(x_train, document_Y)
# predict
predictions_new = classifier_new.predict(x_test)
# print(predictions_new)
pred = predictions_new.toarray()

# Write one line per test row: the words (looked up in id_to_word) of every
# label predicted as 1, separated by spaces.
with open("data/predict.txt", 'w') as f:
    for i in tqdm(range(pred.shape[0])):
        g = np.where(pred[i] == 1)
        hashtags = []
        for item in g[0]:
            word = id_to_word[item]
            hashtags.append(word)
        f.write(' '.join(hashtags))
        f.write('\n')
# NOTE(review): fragment of a larger training routine; the enclosing scope
# (and anything after the last statement) is outside this view, and the
# nesting below is reconstructed from flattened source.
session.run(init)
for epoch in range(training_epochs):
    # train stage: run one optimisation step per batch, then record the
    # network output (y_last) for every batch as features for ML-kNN.
    batches=batch_yield(train_x, train_y, batch_size, word2id, label_dict, sequence_length, shuffle=False)
    train_x_fit=np.zeros([batch_size*total_batches, n_classes])
    train_y_fit=np.zeros([batch_size*total_batches, n_classes])
    for step, (batch_x, batch_y) in enumerate(batches):
        batch_x_emb=session.run(word_embeddings, feed_dict={input_ids:batch_x})
        _, c=session.run([optimizer, loss_function], feed_dict={x:batch_x_emb, y:batch_y, y_steps:np.tile(batch_y,((sequence_length),1)), sequence_lengths:[sequence_length]*batch_size})
        batch_pred_y=session.run(y_last, feed_dict={x:batch_x_emb, sequence_lengths:[sequence_length]*batch_size})
        train_x_fit[step*batch_size : step*batch_size+batch_size]=batch_pred_y
        train_y_fit[step*batch_size : step*batch_size+batch_size]=batch_y
    # ML-kNN is refitted every epoch on the network outputs.
    clf=MLkNN(k=4)
    clf.fit(X=train_x_fit, y=train_y_fit)
    # dev stage: same feature extraction on the dev set, no optimisation.
    batches_dev=batch_yield(dev_x, dev_y, batch_size, word2id, label_dict, sequence_length, shuffle=False)
    total_batches_dev=len(dev_x)//batch_size
    dev_x_fit=np.zeros([batch_size*total_batches_dev, n_classes])
    dev_y_fit=np.zeros([batch_size*total_batches_dev, n_classes])
    for step, (batch_dev_x, batch_dev_y) in enumerate(batches_dev):
        batch_dev_x_emb=session.run(word_embeddings, feed_dict={input_ids:batch_dev_x})
        batch_dev_pred_y=session.run(y_last, feed_dict={x:batch_dev_x_emb, sequence_lengths:[sequence_length]*batch_size})
        dev_x_fit[step*batch_size : step*batch_size+batch_size]=batch_dev_pred_y
        dev_y_fit[step*batch_size : step*batch_size+batch_size]=batch_dev_y
    dev_preds=clf.predict(dev_x_fit)
    dev_preds=dev_preds.toarray()
    # Assuming both matrices are 0/1: a sum of 2 marks a label that is both
    # predicted and true, a sum of 1 marks a label present in only one.
    base_y=dev_preds+dev_y_fit
    # NOTE(review): both divisions raise ZeroDivisionError when the
    # denominator count is zero (e.g. nothing predicted).
    acc=float(np.sum(base_y==2))/float(np.sum(base_y==1)+np.sum(base_y==2))
    precision=float(np.sum(base_y==2))/float(np.sum(dev_preds==1))
import pandas as pd
#import numpy as np
from skmultilearn.adapt import MLkNN
import random

# Training data: labels are every column but the first of trainData.csv,
# features are the whole of pseudotrain.csv.
dframe = pd.read_csv("trainData.csv", header=None)
dset = dframe.values
dframe1 = pd.read_csv("pseudotrain.csv", header=None)
X_train = dframe1.values
input1 = len(dset[0])
Y_train = dset[:, 1:input1]
del dframe, dset
# NOTE(review): ``input`` shadows the builtin and neither ``input`` nor
# ``output`` is read in this script; both are kept in case code outside
# this view uses them.
input = len(X_train[0])
output = len(Y_train[0])

# Test data, loaded the same way. Missing feature values are all replaced
# by one small random constant drawn per run.
dframe = pd.read_csv("testData.csv", header=None)
dset = dframe.values
dframe1 = pd.read_csv("pseudotest.csv", header=None)
dframe1 = dframe1.fillna(random.uniform(0.0001, 0.001))
X_test = dframe1.values
input1 = len(dset[0])
Y_test = dset[:, 1:input1]
input = len(X_test[0])
output = len(Y_test[0])

classifier = MLkNN(20)
classifier.fit(X_train, Y_train)
predictions = classifier.predict(X_test)

# Subset accuracy: the fraction of rows whose whole label vector matches.
# The original assigned the tuple ``(Y_test, predictions)`` and printed
# that instead of a score.
acc = (predictions.toarray() == Y_test).all(axis=1).mean()
print(acc)
def run():
    """Run the full pipeline: embeddings, regressor, ML-kNN and evaluation.

    Steps, in order: load the dataset, fit the text preprocessing pipeline,
    optionally train word2vec, optionally train AttentionWalk node
    embeddings, train a neural regressor that maps documents to label
    embeddings, fit ML-kNN to map label embeddings back to labels, and
    report evaluation metrics both for true label embeddings and for the
    regressor's predictions.
    """
    parser = get_arg_parser()
    cmd_args = parser.parse_args()

    # NOTE(review): the extent of this ``if`` body is reconstructed from
    # flattened source; the log line is assumed to belong to it.
    if cmd_args.gpu is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(cmd_args.gpu)
        gpunum = os.getenv('CUDA_VISIBLE_DEVICES')
        logging.info("GPU has been set to {}".format(gpunum))

    logging.info("Model used for the regression network: {}"
                 .format(cmd_args.model_name))

    # 1. Dataset retrieval
    # --------------------
    tab_printer(constants.Dataset)
    dataset = Dataset(nrows=constants.Dataset.nrows,
                      augment_labels=constants.Dataset.augment_labels,
                      top_n=constants.Dataset.top_n)

    # The two literals are concatenated; the trailing space keeps
    # "pipeline" and "using" apart (the original logged "pipelineusing").
    logging.info("Going to create vocabulary and fit a preprocessing pipeline "
                 "using {} samples. Settings will be listed below"
                 .format(len(dataset.X_train)))

    # 2. Preprocessing
    # -----------------
    tab_printer(constants.NLP)
    preprocessor = Preprocessing(dataset.X_train)

    # Preprocess documents
    X_train = preprocessor.transform_documents(dataset.X_train)
    X_test = preprocessor.transform_documents(dataset.X_test)

    # 3. Word embeddings with word2vec
    # --------------------------------
    # Train word2vec embeddings if train_word2vec option is selected
    if cmd_args.train_word2vec:
        utils.embeddings.main()
    weights = get_embedding_tensor(preprocessor)

    # 4. Node embeddings with AttentionWalk
    # -------------------------------------
    args = _generate_deepwalk_parameters(dataset.y_train_graph)
    if cmd_args.train_attentionwalk:
        train_attention_walk(args)

    graph_embeddings = pd.read_csv(args.embedding_path).iloc[:, 1:].values

    # Get document representations using node embeddings
    y_embedded = _get_label_embeddings(dataset.y_train, graph_embeddings)
    y_test_embedded = _get_label_embeddings(dataset.y_test, graph_embeddings)

    # 5. Regressor Training
    # ---------------------
    # CUDA_VISIBLE_DEVICES renumbers the visible GPUs from 0, so its value
    # is not a valid device ordinal (and is "None" when unset). Plain
    # 'cuda' selects the current visible device.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    regressor_nn = NeuralNet(
        get_network_class(cmd_args.model_name),
        max_epochs=constants.NeuralNetworkTraining.epochs,
        lr=constants.NeuralNetworkTraining.learning_rate,
        batch_size=constants.NeuralNetworkTraining.batch_size,
        optimizer=torch.optim.Adam,
        criterion=torch.nn.MSELoss,
        module__output_dim=args.dimensions,
        module__embedding=weights,
        module__embedding_dim=constants.NLP.embedding_size,
        device=device,
        train_split=None,
    )

    # Train the regressor neural network
    regressor_nn.fit(X_train, y_embedded.astype(np.float32))

    # 6. Train Multi-label KNN algorithm
    # ----------------------------------
    tab_printer(constants.MLKNN)

    # Train multi-label KNN to turn label embeddings into label predictions
    classifier = MLkNN(k=constants.MLKNN.k, s=constants.MLKNN.s)
    classifier.fit(y_embedded, dataset.y_train)

    # 7. Evaluation
    # -------------
    # Label prediction with documents
    y_test_pred = regressor_nn.predict(X_test)
    preds = classifier.predict(y_test_pred)
    preds_raw = classifier.predict_proba(y_test_pred)

    # Label prediction with label embeddings
    preds_w_labels = classifier.predict(y_test_embedded)
    preds_w_labels_raw = classifier.predict_proba(y_test_embedded)

    # Log evaluation result with label embeddings
    eval_metrics_w_labels = evaluation \
        .all_metrics(preds_w_labels.toarray(),
                     dataset.y_test,
                     yhat_raw=preds_w_labels_raw.toarray())
    logging.info(str(eval_metrics_w_labels))

    # Log evaluation result with documents
    report_evaluation(preds.toarray(),
                      dataset.y_test,
                      yhat_raw=preds_raw.toarray())
def TrainProcess(train_data,train_target):
    """Fit an ML-kNN model (k=4, s=0.01) on the given data and return it."""
    model = MLkNN(k=4, s=0.01)
    model.fit(train_data, train_target)
    return model
LogReg_pipeline.fit(x_train, train[category]) # calculating test accuracy prediction = LogReg_pipeline.predict(x_test) print('Test accuracy is {}'.format(accuracy_score(test[category], prediction))) print("\n") from skmultilearn.adapt import MLkNN from scipy.sparse import csr_matrix, lil_matrix classifier_new = MLkNN(k=10) # Note that this classifier can throw up errors when handling sparse matrices. x_train = lil_matrix(x_train).toarray() y_train = lil_matrix(y_train).toarray() x_test = lil_matrix(x_test).toarray() # train classifier_new.fit(x_train, y_train) # predict predictions_new = classifier_new.predict(x_test) # accuracy print("Accuracy = ",accuracy_score(y_test,predictions_new)) print("\n") # using Label Powerset from skmultilearn.problem_transform import LabelPowerset # initialize label powerset multi-label classifier classifier = LabelPowerset(LogisticRegression()) # train classifier.fit(x_train, y_train) # predict predictions = classifier.predict(x_test) # accuracy
# print(images_resized) # convert images to numpy array x_multidim = np.array([np.array(image) for image in images_resized]) #print(x_multidim.shape) # flatten the numpy array return x_multidim.reshape(n_samples, -1) # print(x.shape) # print(x) Xtrain = imageprep(dir + 'tmp_images/*.jpg') Xval = imageprep(dir + 'val_images/*.jpg') i, ytrain = multi_label(dir + "train_subset.json") i, yval = multi_label(dir + "validation.json") ytrain, yval = ytrain[:1000], yval[:1000] classifier = MLkNN(k=10) classifier.fit(Xtrain, ytrain) predictions = classifier.predict(Xval) print(predictions) # acc = accuracy_score(yval, predictions) # print("Accuracy on test set: {}".format(acc))
################################################################################### Multilabel Classifier ###################################################################################### from skmultilearn.problem_transform import ClassifierChain classifier = ClassifierChain(svm.SVC(decision_function_shape='ovo')) classifier.fit(train_features,tmp) p=classifier.predict(test_features) print(p) from skmultilearn.adapt import MLkNN clsfr= MLkNN(k=1) clsfr.fit(train_features,tmp) p=clsfr.predict(test_features) print(p) ########################################################################### Search for videos with similar tags ################################################################################## import urllib from bs4 import BeautifulSoup d={} d[0]="cheering" d[1]="music" d[2]="speech" p=p.todense() print(p)
def mlknn_train_pred(k_list, df_train_x, df_train_y, df_test_x, df_test_y, target_cols, NFOLDS=5):
    """
    Z-score normalize the train and test data, split the train data into
    K folds and run Multilabel KNN on each fold to choose the best "K".
    The best model of every fold predicts its validation rows (out-of-fold
    predictions) and the test set; test predictions are averaged over all
    folds.

    Args:
        k_list: A list of "K" nearest neighbours to perform gridsearch on
        df_train_x: train data with only phenotypic/morphological features - pandas dataframe.
        df_train_y: train data with only the MOA (Mechanism of actions) target labels - pandas dataframe.
        df_test_x: test data with only phenotypic/morphological features - pandas dataframe.
        df_test_y: test data with only the MOA (Mechanism of actions) target labels - pandas dataframe.
        target_cols: A list of MOA (Mechanism of actions) target labels
        NFOLDS: A value that represent number of K-subset/cross-validation we want to perform

    Returns:
        oof_preds: Train out-of-fold predictions - pandas dataframe.
        test_preds: Test predictions - pandas dataframe.

    Raises:
        ValueError: if ``k_list`` is empty, or if no candidate in a fold
            produced a finite log loss.
    """
    if len(k_list) == 0:
        raise ValueError("k_list must contain at least one candidate K")

    sc = StandardScaler()
    # Fit the scaler on train only; the test set reuses the train statistics.
    df_train_x_scaled = pd.DataFrame(sc.fit_transform(df_train_x),
                                     columns=df_train_x.columns)
    df_test_x_scaled = pd.DataFrame(sc.transform(df_test_x),
                                    columns=df_test_x.columns)

    oof_preds = pd.DataFrame(np.zeros(shape=(df_train_y.shape)),
                             columns=target_cols)
    test_preds = pd.DataFrame(np.zeros(shape=(df_test_y.shape)),
                              columns=target_cols)

    skf = MultilabelStratifiedKFold(n_splits=NFOLDS, shuffle=True,
                                    random_state=133)

    print('Execution time | Fold number | logloss | Best K |')
    for fn, (trn_idx, val_idx) in enumerate(
            skf.split(df_train_x_scaled, df_train_y)):
        start_time = time()
        # The scaled frame was rebuilt above with a default index, so
        # label-based .loc with positional fold indices is safe here.
        X_train = df_train_x_scaled.loc[trn_idx, :]
        X_val = df_train_x_scaled.loc[val_idx, :]
        y_train = df_train_y.iloc[trn_idx, :]
        y_val = df_train_y.iloc[val_idx, :]

        best_k = None
        best_loss = np.inf
        best_classifier = None
        for k_item in k_list:
            classifier = MLkNN(k=k_item)
            classifier.fit(X_train.values, y_train.values)
            val_preds = classifier.predict_proba(X_val.values)
            loss = log_loss(np.ravel(y_val), np.ravel(val_preds.toarray()))
            if loss < best_loss:
                best_loss = loss
                best_k = k_item
                # Keep the winning model so it need not be refitted on the
                # same data after the search.
                best_classifier = classifier
                oof_preds.iloc[val_idx, :] = val_preds.toarray()

        if best_classifier is None:
            raise ValueError(
                "no K in k_list produced a finite log loss for fold "
                "{}".format(fn))

        preds = best_classifier.predict_proba(df_test_x_scaled.values)
        test_preds += preds.toarray() / NFOLDS

        # Report the best loss of the fold; the original printed the loss
        # of the last K tried next to the best K.
        print('{}\t\t{}\t\t{:.5f}\t\t{}'.format(
            str(datetime.timedelta(seconds=time() - start_time))[:7],
            fn, best_loss, best_k))

    return oof_preds, test_preds
# NOTE(review): fragment of a larger script. ``max_acc`` is initialised
# outside this view and the body of the final ``if`` may continue past the
# last visible statement.
acc_tuple = None
hamm_tuple = None
f1_tuple = None
time_tuple = None

# Message (Serbian): "Starting hyperparameter tuning on the validation set:"
print('Pocinjem podesavanje hiperparametara na validacionom skupu:')
start_time = time.time()
# Grid over k in 1..5 and the listed values of s.
for k in np.arange(1, 6):
    for s in [0.3, 0.5, 0.7, 1.0]:
        print('k = ', k, ', s = ', s)
        # Time fitting and prediction for this combination.
        inner_start_time = time.time()
        classifier = MLkNN(k=k, s=s)
        prediction = classifier.fit(x_train, y_train).predict(x_val)
        inner_end_time = time.time()
        inner_time_passed = inner_end_time - inner_start_time
        hamm = metrics.hamming_loss(y_val, prediction)
        acc = metrics.accuracy_score(y_val, prediction)
        f1 = metrics.f1_score(y_val, prediction, average='micro')
        print('HM:', hamm)
        print('AS:', acc)
        print('F1:', f1)
        if acc > max_acc:
            # Message (Serbian): "New best accuracy (acc > max_acc) was
            # given by the combination k = ..., s = ..."
            print('Nova najbolja tacnost (', acc, '>', max_acc, ') dala je kombinacija k = ', k, ', s =', s)
            max_acc = acc
#y_train_large = (y_train >= 7) #y_train_odd = (y_train % 2 == 1) #y_multilabel = np.c_[y_train_large, y_train_odd] #knn_clf = KNeighborsClassifier() #knn_clf.fit(f_norm, tlabels) from skmultilearn.dataset import load_dataset from skmultilearn.problem_transform import BinaryRelevance from sklearn.svm import SVC import sklearn.metrics as metrics import sklearn.model_selection X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( f_norm, tlabels, test_size=0.2, random_state=1) tt = y_train.astype('int') ttest = y_test.astype('int') clf = BinaryRelevance(classifier=SVC(), require_dense=[False, True]) clf.fit(X_train, tt) prediction = clf.predict(X_test) metrics.hamming_loss(ttest, prediction) metrics.accuracy_score(ttest, prediction) #Hamming loss measures how well the classifier predicts each of the labels, averaged over samples, then over labels #accuracy score measures how well the classifier predicts label combinations, averaged over samples #jaccard similarity measures the proportion of predicted labels for a sample to its correct assignment, averaged over samples #precision measures how many samples with , #recall measures how many samples , #F1 score measures a weighted average of precision and recall, where both have the same impact on the score from skmultilearn.adapt import MLkNN classifier = MLkNN(k=3) prediction = classifier.fit(X_train, y_train).predict(X_test) metrics.hamming_loss(y_test, prediction)
for j in range(y_num): temp = 0 for t in range(neigs.shape[0]): temp = temp + neigs[t][j + 1] if ph[j] * peh1[j, temp] > ph_[j] * peh0[j, temp]: predict.append(1) else: predict.append(0) predicts.append(predict) predicts = np.array(predicts) return predicts data = pickle.load(open('datasets.pickle', 'rb')) #得到训练数据X,和标签类别Y X = data[0] Y = data[1] predict = mlknn(X, X, 8, 5, Y) print(predict) print(accuary(predict, Y)) ml = MLkNN(k=8) ml.fit(X, Y) p = ml.predict(X) print(accuary(p, Y)) kn = KNeighborsClassifier(n_neighbors=8) kn.fit(X, Y) pp = kn.predict(X) print(accuary(p, Y))
from datetime import timedelta
import time
# Start of a timing measurement; the matching end is not in this view.
start = time.time()
from scipy.sparse import csr_matrix, lil_matrix
from skmultilearn.adapt import MLkNN

# Convert the inputs to dense arrays.
x_train = lil_matrix(x_train).toarray()
y_train = lil_matrix(y_train).toarray()
x_test = lil_matrix(x_test).toarray()

# NOTE(review): this ML-kNN instance is never fitted; ``classifier`` is
# rebound to BRkNNbClassifier two statements below.
classifier = MLkNN(k=4)
# train
from skmultilearn.adapt import BRkNNbClassifier
classifier = BRkNNbClassifier(k=6)
classifier.fit(x_train, y_train)
# predict
predictions = classifier.predict(x_test)
# accuracy and the other metrics, each followed by a blank line
print("Accuracy = ", accuracy_score(y_test, predictions))
print("\n")
print("F1 = ", f1_score(y_test, predictions, average='micro'))
print("\n")
print("Jaccard = ", jaccard_similarity_score(y_test, predictions))
print("\n")
print("Precision = ", precision_score(y_test, predictions, average='micro'))
print("\n")
class Model(object):
    """ML-kNN baseline model for the AutoDL train/test protocol.

    The first ``train`` call loads the whole training set into memory,
    mean-imputes missing values and fits ``MLkNN(k=20)``. ``test`` applies
    the imputer fitted during training and returns dense predictions.
    """

    def __init__(self, metadata):
        """
        Args:
          metadata: an AutoDLMetadata object. Its definition can be found in
              AutoDL_ingestion_program/dataset.py
        """
        self.done_training = False
        self.metadata = metadata
        self.output_dim = self.metadata.get_output_size()
        self.imputer = Imputer(missing_values='NaN', strategy='mean', axis=0,
                               verbose=0, copy=True)
        self.model = MLkNN(k=20)
        # Number of train() calls that got past the done_training guard.
        self.step = 0
        self.lgb_round = 80

    @staticmethod
    def _flatten_batch(batch):
        """Return ``batch`` as a 2-D array with one row per example.

        ``np.squeeze`` (used originally) also removes the batch axis when a
        batch holds a single example, which scattered that example's
        feature values into the example list.
        """
        batch = np.asarray(batch)
        return batch.reshape(batch.shape[0], -1)

    def train(self, dataset, remaining_time_budget=None):
        """Train this algorithm on the tensorflow |dataset|.

        This method will be called REPEATEDLY during the whole
        training/predicting process, so it must tolerate repeated calls.

        IMPORTANT: the loop of calling `train` and `test` will only run if
        self.done_training = False (the corresponding code can be found in
        ingestion.py, search 'M.done_training'). Otherwise, the loop will go
        on until the time budget is used up. Set self.done_training = True
        when the model is converged or when there is not enough time for
        another round of training. This implementation sets it after the
        first fit.

        Args:
          dataset: a `tf.data.Dataset` object. Each of its examples is of the
              form (example, labels) where `example` is a dense 4-D Tensor of
              shape (sequence_size, row_count, col_count, num_channels) and
              `labels` is a 1-D Tensor of shape (output_dim,). Here
              `output_dim` represents number of classes of this multilabel
              classification task.

              IMPORTANT: some of the dimensions of `example` might be `None`,
              which means the shape on this dimension might be variable. In
              this case, some preprocessing technique should be applied in
              order to have a fixed size input.
          remaining_time_budget: time remaining to execute train(). If it is
              None, no time budget is imposed. Not read by this
              implementation.
        """
        if self.done_training:
            return
        self.step += 1
        t1 = time.time()

        # Count examples on training set (and cache them on the instance).
        if not hasattr(self, 'num_examples_train'):
            logger.info("Counting number of examples on train set.")
            dataset = dataset.batch(128)
            iterator = dataset.make_one_shot_iterator()
            next_element = iterator.get_next()
            X = []
            Y = []
            with tf.Session(config=tf.ConfigProto(
                    log_device_placement=False)) as sess:
                while True:
                    try:
                        example, labels = sess.run(next_element)
                        X.extend(self._flatten_batch(example))
                        Y.extend(labels)
                    except tf.errors.OutOfRangeError:
                        break
            self.X_train = np.array(X)
            self.y_train = np.array(Y)
            print('self.X_train.shape: {}'.format(self.X_train.shape))
            print('self.y_train.shape: {}.'.format(self.y_train.shape))
            self.num_examples_train = len(self.y_train)
            logger.info("Finished counting. There are {} examples for training set." \
                        .format(self.num_examples_train))
        print('spand time: {}'.format(time.time() - t1))

        if self.lgb_round >= 300 or self.step > 10:
            self.done_training = True
            return
        if hasattr(self, 'test_duration'):
            # Renamed from ``round`` so the builtin is not shadowed.
            extra_rounds = int(50 * self.test_duration + 5)
            self.lgb_round += extra_rounds

        train_start = time.time()
        self.X_train = self.imputer.fit_transform(self.X_train)
        self.model.fit(self.X_train, self.y_train)
        train_end = time.time()

        # Update for time budget managing
        train_duration = train_end - train_start
        logger.info("{} step. {:.2f} sec used. ".format(
            self.step, train_duration))
        self.done_training = True

    def test(self, dataset, remaining_time_budget=None):
        """Test this algorithm on the tensorflow |dataset|.

        Args:
          Same as that of `train` method, except that the `labels` will be
          empty.
        Returns:
          predictions: A `numpy.ndarray` matrix of shape (sample_count,
              output_dim). here `sample_count` is the number of examples in
              this dataset as test set and `output_dim` is the number of
              labels to be predicted. The values should be binary or in the
              interval [0,1].
        """
        # Count examples on test set (and cache them on the instance).
        if not hasattr(self, 'num_examples_test'):
            logger.info("Counting number of examples on test set.")
            dataset = dataset.batch(128)
            iterator = dataset.make_one_shot_iterator()
            example, labels = iterator.get_next()
            X = []
            with tf.Session(config=tf.ConfigProto(
                    log_device_placement=False)) as sess:
                while True:
                    try:
                        ex = sess.run(example)
                        X.extend(self._flatten_batch(ex))
                    except tf.errors.OutOfRangeError:
                        break
            self.X_test = np.array(X)
            self.num_examples_test = self.X_test.shape[0]
            logger.info("Finished counting. There are {} examples for test set." \
                        .format(self.num_examples_test))

        test_begin = time.time()
        logger.info("Begin testing...")
        # Reuse the statistics learned in train(); the original refitted the
        # imputer on the test data with fit_transform.
        self.X_test = self.imputer.transform(self.X_test)
        # ``.A`` converts the sparse prediction matrix to a dense ndarray.
        predictions = self.model.predict(self.X_test).A
        test_end = time.time()

        # Update some variables for time management
        self.test_duration = test_end - test_begin
        logger.info("[+] Successfully made one prediction. {:.2f} sec used. " \
                    .format(self.test_duration) + \
                    "Duration used for test: {:2f}".format(self.test_duration))
        return predictions

    def y2bin(self, y):
        """Encode each 0/1 row of ``y`` as one integer (first column is the MSB).

        ``y`` is left untouched; the original accumulated into a view of its
        first column and so overwrote the caller's data.
        """
        res = y[:, 0].copy()
        for i in range(1, y.shape[1]):
            res *= 2
            res += y[:, i]
        return res

    def bin2y(self, bin):
        """Decode integers into rows of ``self.output_dim`` bits (MSB first)."""
        y = np.array([bin % 2]).T
        i = 1
        while i < self.output_dim:
            i += 1
            bin = bin // 2
            # Prepend the next more significant bit as a new first column.
            y = np.c_[np.array([bin % 2]).T, y]
        return y
# 30 is currently the best tested k amount. l = [30, 40, 50, 100, 200, 280] # l = [200] # l = [likely_k] # l = [70, 80, 90, 100, 500, 1000, 2000, 3000, 4000, 5600] best_clf = None lowest_hl = float('inf') best_k = float('inf') for k in l: print(25*'=') print('k = ' + str(k)) clf = MLkNN(k) # train clf.fit(x_train, y_train) # predict predictions = clf.predict(x_dev) predictions = predictions.todense() print('all match:', np.sum(np.all(predictions == y_dev, axis=1)) / len(y_dev)) print('at least one match:', (np.sum(np.all(predictions - y_dev <= 0, axis=1))-np.sum(np.all(predictions== 0, axis=1))) / len(y_dev)) print('binary :', np.mean(predictions == y_dev)) hl = hamming_loss(y_dev, predictions) print('Hamming Loss:', hamming_loss(y_dev, predictions)) if hl < lowest_hl: lowest_hl = hl best_clf = clf best_k = k
classifier = MLkNN(k = 20) #feature_x = pkl.load(open('features_for_classification.pkl')) #classifier = BinaryRelevance(GaussianNB()) Keys_Train = random.sample(ent2type.keys(),10000) Keys_Test = list(ent2type.keys()) [Keys_Test.remove(val) for val in Keys_Train] X_Train = [feature_x[key] for key in Keys_Train] X_Test = [feature_x[key] for key in Keys_Test] Y_Train = generate_labels(Keys_Train) Y_Test = generate_labels(Keys_Test) print('HEERE 1') classifier.fit(np.array(X_Train), np.array(Y_Train)) print('HEERE 2') predictions = classifier.predict(np.array(X_Test)) print(accuracy_score(np.array(Y_Test),predictions)) preds = predictions.toarray() def accuracy(input): data = input[0] true = input[1] size = len(data) FP = TP = FN = TN = 0 for i in xrange(size): if true[i] == True: if data[i] == True: