def MlKnn_with_Grid_Parameters(X_train, X_test, y_train, y_test):
    X_train = lil_matrix(X_train).toarray()
    y_train = lil_matrix(y_train).toarray()
    X_test = lil_matrix(X_test).toarray()
    y_test = lil_matrix(y_test).toarray()

    print("MlKnn")
    model = MLkNN(k=5, s=0.2).fit(X_train, y_train)

    # Predict once and reuse the result instead of re-running
    # model.predict(X_test) for every metric.
    predictions = model.predict(X_test)
    hamming = hamming_loss(y_test, predictions)
    Subset_Accuracy = accuracy_score(y_test, predictions)
    Precision = precision_score(y_test, predictions, average='micro')
    Recall = recall_score(y_test, predictions, average='micro')
    f1 = f1_score(y_test, predictions, average='micro')

    print("Hamming: " + str(hamming))
    print("Subset Accuracy: " + str(Subset_Accuracy))
    print("Precision: " + str(Precision))
    print("Recall: " + str(Recall))
    print("F1 score: " + str(f1))
    print("\n")

    return hamming, Subset_Accuracy, Precision, Recall, f1
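# Despite its name, the function above uses fixed parameters (k=5, s=0.2)
# rather than an actual grid search. Below is a minimal sketch of tuning k and
# s with scikit-learn's GridSearchCV, which skmultilearn's MLkNN supports; the
# parameter grid, scoring metric, and the helper name mlknn_grid_search are
# illustrative assumptions, not taken from the original code.
from sklearn.model_selection import GridSearchCV
from skmultilearn.adapt import MLkNN

def mlknn_grid_search(X_train, y_train):
    # Candidate neighbourhood sizes and smoothing values (assumed ranges).
    parameters = {'k': range(3, 11), 's': [0.2, 0.5, 1.0]}
    search = GridSearchCV(MLkNN(), parameters, scoring='f1_micro')
    search.fit(X_train, y_train)
    print('Best parameters:', search.best_params_)
    return search.best_estimator_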
def main():
    data = readData("IMDB-Movie-Data.csv")
    genres = data["Genre"]
    descriptions = data["Description"]
    labels = getLabels(genres)

    calculateNgrams(descriptions)
    features = list(map(extract_features, descriptions))
    print(len(features[1]))

    # X = features, Y = labels
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.33, random_state=42)
    # binRel(X_train, X_test, y_test, y_train)

    classifier = MLkNN(k=4)
    # train
    classifier.fit(X_train, y_train)
    # predict
    predictions = classifier.predict(np.array(X_test))
    print('Hamming loss: {0}'.format(
        sklearn.metrics.hamming_loss(y_test, predictions)))  # (y_true, y_pred)
def adapted(X_train, y_train, X_test, y_test):
    # The original signature took a single `data` argument but used the four
    # split arrays, and returned None after computing the score.
    classifier = MLkNN(k=20)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    accuracyScore = accuracy_score(y_test, predictions)
    return accuracyScore
def get_cado_predictions():
    data_path = '../../datasets/cado/train.csv'
    test_path = '../../datasets/cado/test.csv'
    data = du.load_data(data_path)
    test = du.load_data(test_path)

    text_index = 6
    label_start_index = 7
    X = [d[text_index] for d in data]
    labels = [d[label_start_index:label_start_index + 12] for d in data]
    X_test = [d[text_index] for d in test]
    labels_test = [d[label_start_index:label_start_index + 12] for d in test]

    Y = np.array(labels, dtype='int')
    y_test = np.array(labels_test, dtype='int')
    # Y = np.array(binary_labels, dtype='int')

    # Concatenate train and test so they share one tokenizer and padding,
    # then split them back apart at test_index below.
    test_index = len(X)
    X = X + X_test
    Y = np.vstack([Y, y_test])

    tokenizer = tokenize_data(X)
    word_index = tokenizer.word_index
    sequences = tokenizer.texts_to_sequences(X)
    X = pad_sequences(sequences, maxlen=700, padding="post",
                      truncating="post", value=0)

    # Placeholder embedding matrix; it is built here but never used by MLkNN.
    num_words = min(MAX_NB_WORDS, len(word_index) + 1)
    embedding_matrix = np.zeros((num_words, 1))
    for word, i in word_index.items():
        if i >= MAX_NB_WORDS:
            continue
        embedding_matrix[i] = 1

    X_train = X[0:test_index, :]
    Y_train = Y[0:test_index, :]
    x_test = X[test_index:len(X), :]
    y_test = Y[test_index:len(Y), :]

    classifier = MLkNN()
    classifier.fit(X_train, Y_train)
    predictions = classifier.predict(x_test)
    scores = classifier.predict_proba(x_test)

    y_pred = predictions.toarray()
    y_score = scores.toarray()
    return y_pred, y_score
def mlknn(x_tr, y_tr, x_te, x_va=None):
    """ML-kNN wrapper.

    :param x_tr: training features
    :param y_tr: training label matrix
    :param x_te: test features
    :param x_va: optional validation features
    :return: dense test predictions (and validation predictions if x_va is given)
    """
    # s is the Laplace smoothing parameter (a float, 1.0 by default); the
    # original code passed s=True, which only works because True == 1.
    pred = MLkNN(k=10, s=1.0)
    y_tr = np.int32(y_tr)
    pred.fit(x_tr, y_tr)
    y_te_ = pred.predict(x_te).toarray()
    if x_va is None:
        return y_te_
    y_va_ = pred.predict(x_va).toarray()
    return y_te_, y_va_
def mlknn(train_data_inx, y_train, test_data_inx):
    # corpus_tfidf and mlknn_k are module-level globals.
    classifier = MLkNN(k=mlknn_k)
    x_train = [corpus_tfidf[i] for i in train_data_inx]
    x_test = [corpus_tfidf[j] for j in test_data_inx]
    classifier.fit(csr_matrix(x_train), csr_matrix(y_train))
    mlknn_pre = classifier.predict(csr_matrix(x_test))
    return mlknn_pre.toarray()
def mlknn(self, number):
    classifier = MLkNN(k=number)
    classifier.fit(self.X_train, self.y_train)
    # predict
    predictions = classifier.predict(self.X_test)
    result = hamming_loss(self.y_test, predictions)
    print("hamming_loss:", result)
    result = f1_score(self.y_test, predictions, average='micro')
    print("micro-f1:", result)
    result = precision_score(self.y_test, predictions, average='micro')
    print("micro-precision:", result)
def train(self):
    classifier_new = MLkNN(k=10)
    x_train = lil_matrix(self.x_data).toarray()
    y_train = lil_matrix(self.y_data).toarray()
    x_test = lil_matrix(self.x_test).toarray()
    classifier_new.fit(x_train, y_train)
    # predict
    predictions = classifier_new.predict(x_test)
    return {
        'accuracy': accuracy_score(self.y_test, predictions),
        'f1_score': f1_score(self.y_test, predictions, average='micro')
    }
def MLkNN(self):
    self.sub_parser.add_argument('--library', action='store_true', default=False)
    args = self.sub_parser.parse_args(sys.argv[2:])
    print('Running ML-kNN, arguments=%s' % args)

    print('Loading %s data...' % args.N)
    if args.f == 'My_dict':
        vectorizer = my_dict_vectorizer(stop=not args.nostop, bigram=args.bigram)
    elif args.f == 'LIB_count':
        vectorizer = lib_count_vectorizer(stop=not args.nostop, bigram=args.bigram)
    elif args.f == 'LIB_hash':
        vectorizer = lib_hash_vectorizer(stop=not args.nostop, bigram=args.bigram)
    elif args.f == 'LIB_tfidf':
        vectorizer = lib_tfidf_vectorizer(stop=not args.nostop, bigram=args.bigram)
    data = load_data(args.N, args.D, args.Nt, vectorizer)
    print('Done loading data, actual feature size:', data[1].shape)
    X, Y, Xt, Yt, cats = data

    # Choose between the library implementation and the local one.
    if args.library:
        from skmultilearn.adapt import MLkNN
        model = MLkNN()
    else:
        from sklearn.neighbors import NearestNeighbors
        from multi import MLkNN
        model = MLkNN(NearestNeighbors)
    model.fit(X, Y)
    Yp = model.predict(Xt)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        hl = computeMetrics(Yp, Yt, cats)
    print('the hamming loss:')
    print('>> ', hl)

    from sklearn.metrics import hamming_loss, classification_report
    print('hamming loss (library):', hamming_loss(Yt, Yp))
    print(classification_report(Yt, Yp, target_names=cats))
    print('DONE..')
def adapt(X_train, y_train, X_test, y_test):
    # Convert the pandas label frames to scipy sparse matrices.
    # Note: DataFrame.to_sparse() was removed in pandas 1.0; see the
    # alternative conversion sketched below this function.
    y_train = y_train.to_sparse().to_coo()
    y_test = y_test.to_sparse().to_coo()

    from skmultilearn.adapt import MLkNN
    classifier = MLkNN(k=4)

    print("Train Adapted algorithm")
    classifier.fit(X_train, y_train)

    print("Predict")
    predictions = classifier.predict(X_test)

    from sklearn.metrics import accuracy_score
    print("Accuracy")
    print(y_test.shape, predictions.shape)
    print(accuracy_score(y_test.toarray(), predictions))
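# On pandas >= 1.0 the to_sparse()/to_coo() chain used above no longer exists.
# A minimal sketch of an equivalent conversion, assuming the labels are a
# plain 0/1 DataFrame; labels_to_sparse is a hypothetical helper name, not
# part of the original code.
from scipy.sparse import csr_matrix

def labels_to_sparse(label_df):
    # csr_matrix accepts the dense 0/1 numpy array directly.
    return csr_matrix(label_df.values)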
for i in range(7):
    for j in range(7):
        # Per-block mean and variance over each 32x32 LUV patch.
        block = x_luv[32 * i:32 * (i + 1), 32 * j:32 * (j + 1)]
        mean = np.mean(block, axis=tuple(range(block.ndim - 1)))
        var = np.var(block, axis=tuple(range(block.ndim - 1)))
        l = np.concatenate((l, mean))
        l = np.concatenate((l, var))
x_test.append(l)

x_train = np.asarray(x_train).astype(np.float32)
x_test = np.asarray(x_test).astype(np.float32)
y_test = np.asarray(y_test)
y_train = np.asarray(y_train)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

classifier = MLkNN(k=9)
classifier.fit(x_train, y_train)
with open('mlknn-k-9-luv.pkl', 'wb') as f:
    pickle.dump(classifier, f)
'''
with open('mlknn-k-9-luv.pkl', 'rb') as f:
    classifier = pickle.load(f)
'''

predictions = classifier.predict(x_test).todense()
print('all match:',
      np.sum(np.all(predictions == y_test, axis=1)) / len(y_test))
print('at least one match:',
      (np.sum(np.all(predictions - y_test <= 0, axis=1))
       - np.sum(np.all(predictions == 0, axis=1))) / len(y_test))
print('binary :', np.sum(predictions == y_test) / (5 * len(y_test)))
def run():
    parser = get_arg_parser()
    cmd_args = parser.parse_args()

    if cmd_args.gpu is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(cmd_args.gpu)
        gpunum = os.getenv('CUDA_VISIBLE_DEVICES')
        logging.info("GPU has been set to {}".format(gpunum))
    logging.info("Model used for the regression network: {}"
                 .format(cmd_args.model_name))

    # 1. Dataset retrieval
    # --------------------
    tab_printer(constants.Dataset)
    dataset = Dataset(nrows=constants.Dataset.nrows,
                      augment_labels=constants.Dataset.augment_labels,
                      top_n=constants.Dataset.top_n)
    logging.info("Going to create vocabulary and fit a preprocessing pipeline "
                 "using {} samples. Settings will be listed below."
                 .format(len(dataset.X_train)))

    # 2. Preprocessing
    # ----------------
    tab_printer(constants.NLP)
    preprocessor = Preprocessing(dataset.X_train)
    # Preprocess documents
    X_train = preprocessor.transform_documents(dataset.X_train)
    X_test = preprocessor.transform_documents(dataset.X_test)

    # 3. Word embeddings with word2vec
    # --------------------------------
    # Train word2vec embeddings if the train_word2vec option is selected
    if cmd_args.train_word2vec:
        utils.embeddings.main()
    weights = get_embedding_tensor(preprocessor)

    # 4. Node embeddings with AttentionWalk
    # -------------------------------------
    args = _generate_deepwalk_parameters(dataset.y_train_graph)
    if cmd_args.train_attentionwalk:
        train_attention_walk(args)
    graph_embeddings = pd.read_csv(args.embedding_path).iloc[:, 1:].values
    # Get document representations using node embeddings
    y_embedded = _get_label_embeddings(dataset.y_train, graph_embeddings)
    y_test_embedded = _get_label_embeddings(dataset.y_test, graph_embeddings)

    # 5. Regressor Training
    # ---------------------
    device = 'cuda:' + str(os.getenv("CUDA_VISIBLE_DEVICES")) \
        if torch.cuda.is_available() else 'cpu'
    regressor_nn = NeuralNet(
        get_network_class(cmd_args.model_name),
        max_epochs=constants.NeuralNetworkTraining.epochs,
        lr=constants.NeuralNetworkTraining.learning_rate,
        batch_size=constants.NeuralNetworkTraining.batch_size,
        optimizer=torch.optim.Adam,
        criterion=torch.nn.MSELoss,
        module__output_dim=args.dimensions,
        module__embedding=weights,
        module__embedding_dim=constants.NLP.embedding_size,
        device=device,
        train_split=None,
    )
    # Train the regressor neural network
    regressor_nn.fit(X_train, y_embedded.astype(np.float32))

    # 6. Train the multi-label KNN algorithm
    # --------------------------------------
    tab_printer(constants.MLKNN)
    # Train multi-label KNN to turn label embeddings into label predictions
    classifier = MLkNN(k=constants.MLKNN.k, s=constants.MLKNN.s)
    classifier.fit(y_embedded, dataset.y_train)

    # 7. Evaluation
    # -------------
    # Label prediction with documents
    y_test_pred = regressor_nn.predict(X_test)
    preds = classifier.predict(y_test_pred)
    preds_raw = classifier.predict_proba(y_test_pred)

    # Label prediction with label embeddings
    preds_w_labels = classifier.predict(y_test_embedded)
    preds_w_labels_raw = classifier.predict_proba(y_test_embedded)

    # Log evaluation result with label embeddings
    eval_metrics_w_labels = evaluation.all_metrics(
        preds_w_labels.toarray(), dataset.y_test,
        yhat_raw=preds_w_labels_raw.toarray())
    logging.info(str(eval_metrics_w_labels))

    # Log evaluation result with documents
    report_evaluation(preds.toarray(), dataset.y_test,
                      yhat_raw=preds_raw.toarray())
class Model(object):
    """Fully connected neural network with no hidden layer."""

    def __init__(self, metadata):
        """
        Args:
          metadata: an AutoDLMetadata object. Its definition can be found in
              AutoDL_ingestion_program/dataset.py
        """
        self.done_training = False
        self.metadata = metadata
        self.output_dim = self.metadata.get_output_size()
        # Note: sklearn.preprocessing.Imputer was removed in scikit-learn
        # 0.22; SimpleImputer is its replacement on newer versions.
        self.imputer = Imputer(missing_values='NaN', strategy='mean', axis=0,
                               verbose=0, copy=True)
        self.model = MLkNN(k=20)
        self.step = 0
        self.lgb_round = 80

    def train(self, dataset, remaining_time_budget=None):
        """Train this algorithm on the tensorflow |dataset|.

        This method will be called REPEATEDLY during the whole
        training/predicting process, so it should be able to handle repeated
        calls and hopefully improve the model performance after each call.

        IMPORTANT: the loop of calling `train` and `test` will only run if
        self.done_training = False (the corresponding code can be found in
        ingestion.py, search 'M.done_training'). Otherwise, the loop will go
        on until the time budget is used up. Please pay attention to set
        self.done_training = True when you think the model has converged or
        when there is not enough time for the next round of training.

        Args:
          dataset: a `tf.data.Dataset` object. Each of its examples is of the
              form (example, labels) where `example` is a dense 4-D Tensor of
              shape (sequence_size, row_count, col_count, num_channels) and
              `labels` is a 1-D Tensor of shape (output_dim,). Here
              `output_dim` represents the number of classes of this multilabel
              classification task.

              IMPORTANT: some of the dimensions of `example` might be `None`,
              which means the shape on this dimension might be variable. In
              this case, some preprocessing technique should be applied to
              feed the training of a neural network. For example, if an image
              dataset has `example` of shape (1, None, None, 3) then the
              images in this dataset may have different sizes. One could apply
              resizing, cropping or padding to obtain a fixed-size input
              tensor.
          remaining_time_budget: time remaining to execute train(). The method
              should keep track of its execution time to avoid exceeding its
              time budget. If remaining_time_budget is None, no time budget is
              imposed.
        """
        if self.done_training:
            return
        self.step += 1
        t1 = time.time()

        # Materialize the training set once and cache it on the instance.
        if not hasattr(self, 'num_examples_train'):
            logger.info("Counting number of examples on train set.")
            dataset = dataset.batch(128)
            iterator = dataset.make_one_shot_iterator()
            next_element = iterator.get_next()
            X = []
            Y = []
            with tf.Session(config=tf.ConfigProto(
                    log_device_placement=False)) as sess:
                while True:
                    try:
                        example, labels = sess.run(next_element)
                        example = np.squeeze(example)
                        X.extend(example)
                        Y.extend(labels)
                    except tf.errors.OutOfRangeError:
                        break
            self.X_train = np.array(X)
            self.y_train = np.array(Y)
            print('self.X_train.shape: {}'.format(self.X_train.shape))
            print('self.y_train.shape: {}.'.format(self.y_train.shape))
            self.num_examples_train = len(self.y_train)
            logger.info("Finished counting. There are {} examples for training set."
                        .format(self.num_examples_train))
        print('elapsed time: {}'.format(time.time() - t1))

        if self.lgb_round >= 300 or self.step > 10:
            self.done_training = True
            return
        if hasattr(self, 'test_duration'):
            round = int(50 * self.test_duration + 5)
            self.lgb_round += round

        train_start = time.time()
        self.X_train = self.imputer.fit_transform(self.X_train)
        self.model.fit(self.X_train, self.y_train)
        train_end = time.time()

        # Update for time budget managing
        train_duration = train_end - train_start
        logger.info("{} step. {:.2f} sec used. ".format(self.step, train_duration))
        self.done_training = True

    def test(self, dataset, remaining_time_budget=None):
        """Test this algorithm on the tensorflow |dataset|.

        Args:
          Same as that of `train` method, except that the `labels` will be
          empty.
        Returns:
          predictions: A `numpy.ndarray` matrix of shape
              (sample_count, output_dim). Here `sample_count` is the number of
              examples in this dataset as test set and `output_dim` is the
              number of labels to be predicted. The values should be binary or
              in the interval [0, 1].
        """
        # Materialize the test set once and cache it on the instance.
        if not hasattr(self, 'num_examples_test'):
            logger.info("Counting number of examples on test set.")
            dataset = dataset.batch(128)
            iterator = dataset.make_one_shot_iterator()
            example, labels = iterator.get_next()
            X = []
            with tf.Session(config=tf.ConfigProto(
                    log_device_placement=False)) as sess:
                while True:
                    try:
                        ex = sess.run(example)
                        ex = np.squeeze(ex)
                        X.extend(ex)
                    except tf.errors.OutOfRangeError:
                        break
            self.X_test = np.array(X)
            self.num_examples_test = self.X_test.shape[0]
            logger.info("Finished counting. There are {} examples for test set."
                        .format(self.num_examples_test))

        test_begin = time.time()
        logger.info("Begin testing...")
        self.X_test = self.imputer.fit_transform(self.X_test)
        # .A converts the sparse prediction matrix to a dense ndarray.
        predictions = self.model.predict(self.X_test).A
        test_end = time.time()

        # Update some variables for time management
        self.test_duration = test_end - test_begin
        logger.info("[+] Successfully made one prediction. {:.2f} sec used. "
                    .format(self.test_duration) +
                    "Duration used for test: {:.2f}".format(self.test_duration))
        return predictions

    def y2bin(self, y):
        # Pack each row of the binary label matrix into a single integer.
        res = y[:, 0]
        for i in range(1, y.shape[1]):
            res *= 2
            res += y[:, i]
        return res

    def bin2y(self, bin):
        # Unpack integer label codes back into a binary label matrix.
        y = np.array([bin % 2]).T
        i = 1
        while i < self.output_dim:
            i += 1
            bin = bin // 2
            y = np.c_[np.array([bin % 2]).T, y]
        return y
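# A tiny round-trip check of the y2bin/bin2y encoding used above (the helpers
# are unused in the code path shown). Standalone re-implementations are used
# here because the originals are methods on Model; the sample matrix and the
# function names are made up for the example.
import numpy as np

def y2bin(y):
    # [1, 0, 1] -> 0b101 == 5
    res = y[:, 0].copy()
    for i in range(1, y.shape[1]):
        res = res * 2 + y[:, i]
    return res

def bin2y(codes, output_dim):
    # 5 with output_dim == 3 -> [1, 0, 1]
    cols = []
    for _ in range(output_dim):
        cols.append(codes % 2)
        codes = codes // 2
    return np.stack(cols[::-1], axis=1)

y = np.array([[1, 0, 1], [0, 1, 0]])
assert (bin2y(y2bin(y), y.shape[1]) == y).all()  # codes [5, 2] round-trip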
# calculating test accuracy
prediction = LogReg_pipeline.predict(x_test)
print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
print("\n")

from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix

classifier_new = MLkNN(k=10)
# Note that this classifier can throw up errors when handling sparse matrices,
# so the inputs are densified first.
x_train = lil_matrix(x_train).toarray()
y_train = lil_matrix(y_train).toarray()
x_test = lil_matrix(x_test).toarray()
# train
classifier_new.fit(x_train, y_train)
# predict
predictions_new = classifier_new.predict(x_test)
# accuracy
print("Accuracy = ", accuracy_score(y_test, predictions_new))
print("\n")

# using Label Powerset
from skmultilearn.problem_transform import LabelPowerset

# initialize Label Powerset multi-label classifier
classifier = LabelPowerset(LogisticRegression())
# train
classifier.fit(x_train, y_train)
# predict
predictions = classifier.predict(x_test)
# accuracy
print("Accuracy = ", accuracy_score(y_test, predictions))
print("\n")
def train_cnn_rnn(input_file, training_config):
    # read data and params
    x_, y_, vocabulary, vocabulary_inv, df, label_dict = \
        data_helper_multi.load_data(input_file)
    params = json.loads(open(training_config).read())

    # create a directory; everything related to the training will be saved there
    timestamp = str(int(time.time()))
    output_dir = os.path.join('data_path_save', 'cnn_rnn_' + timestamp)
    trained_dir = os.path.join(output_dir, 'trained_results')
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    # assign a 300-dimension vector to each word
    word_embeddings = data_helper_multi.load_embeddings(vocabulary)
    embedding_mat = [word_embeddings[word]
                     for index, word in enumerate(vocabulary_inv)]
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    # split the original dataset into train set and dev set
    x_train, x_dev, y_train, y_dev = train_test_split(x_, y_, test_size=0.1)
    logging.info('x_train: {}, x_dev: {}'.format(len(x_train), len(x_dev)))

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(
                embedding_mat=embedding_mat,
                sequence_length=x_train.shape[1],
                num_classes=y_train.shape[1],
                non_static=params['non_static'],
                hidden_unit=params['hidden_unit'],
                max_pool_size=params['max_pool_size'],
                # list() so the sizes survive repeated iteration on Python 3
                filter_sizes=list(map(int, params['filter_sizes'].split(","))),
                num_filters=params['num_filters'],
                embedding_size=params['embedding_dim'],
                l2_reg_lambda=params['l2_reg_lambda'])

            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            checkpoint_dir = os.path.join(output_dir, 'checkpoints')
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                return [np.ceil(np.argmin(batch + [0]) * 1.0
                                / params['max_pool_size'])
                        for batch in batches]

            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'],
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1,
                                           params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                _, step, loss, scores = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.scores],
                    feed_dict=feed_dict)
                return scores

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1,
                                           params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                step, loss, scores = sess.run(
                    [global_step, cnn_rnn.loss, cnn_rnn.scores],
                    feed_dict=feed_dict)
                return step, loss, scores

            saver = tf.train.Saver()
            sess.run(tf.global_variables_initializer())

            # training starts here
            train_batches = data_helper_multi.batch_iter(
                list(zip(x_train, y_train)), params['batch_size'],
                params['num_epochs'])
            best_accuracy, best_at_step = 0, 0
            x_train_fit = np.zeros([params['batch_size'] * params['evaluate_every'],
                                    len(label_dict.items())])
            y_train_fit = np.zeros([params['batch_size'] * params['evaluate_every'],
                                    len(label_dict.items())])

            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                scores = train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)
                offset = (current_step % params['evaluate_every']) * params['batch_size']
                x_train_fit[offset:offset + params['batch_size']] = scores
                y_train_fit[offset:offset + params['batch_size']] = y_train_batch

                if current_step % params['evaluate_every'] == 0:
                    # fit ML-kNN on the network's scores, then evaluate on dev
                    clf = MLkNN(k=4)
                    clf.fit(x_train_fit, y_train_fit)

                    dev_batches = data_helper_multi.batch_iter(
                        list(zip(x_dev, y_dev)), params['batch_size'], 1)
                    total_batches_dev = len(x_dev) // params['batch_size']
                    x_dev_fit = np.zeros([params['batch_size'] * total_batches_dev,
                                          len(label_dict.items())])
                    y_dev_fit = np.zeros([params['batch_size'] * total_batches_dev,
                                          len(label_dict.items())])
                    for step_dev, dev_batch in enumerate(dev_batches):
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        step, loss, scores = dev_step(x_dev_batch, y_dev_batch)
                        x_dev_fit[step_dev * params['batch_size']:
                                  step_dev * params['batch_size'] + params['batch_size']] = scores
                        y_dev_fit[step_dev * params['batch_size']:
                                  step_dev * params['batch_size'] + params['batch_size']] = y_dev_batch

                    y_dev_preds = clf.predict(x_dev_fit)
                    y_dev_preds = y_dev_preds.toarray()
                    # in y_union, entries equal to 2 are true positives and
                    # entries equal to 1 are mismatches
                    y_union = y_dev_preds + y_dev_fit
                    accuracy = float(np.sum(y_union == 2)) / float(
                        np.sum(y_union == 1) + np.sum(y_union == 2))
                    precision = float(np.sum(y_union == 2)) / float(
                        np.sum(y_dev_preds == 1))
                    recall = float(np.sum(y_union == 2)) / float(
                        np.sum(y_dev_fit == 1))
                    f1 = 2 * precision * recall / (precision + recall)
                    logging.info('Accuracy on dev set: {}'.format(accuracy))
                    logging.info('Precision on dev set: {}'.format(precision))
                    logging.info('Recall on dev set: {}'.format(recall))
                    logging.info('F1-measure on dev set: {}'.format(f1))

                    if accuracy >= best_accuracy:
                        best_accuracy, best_at_step = accuracy, current_step
                        path = saver.save(sess, checkpoint_prefix,
                                          global_step=current_step)
                        logging.critical('Saved model {} at step {}'.format(
                            path, best_at_step))
                        logging.critical('Best accuracy {} at step {}'.format(
                            best_accuracy, best_at_step))

            logging.critical(
                'Training is complete, testing the best model on x_test and y_test')

            # save trained params and files
            with open(trained_dir + '/words_index.json', 'w') as outfile:
                json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
            with open(trained_dir + '/embeddings.pickle', 'wb') as outfile:
                pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
            with open(trained_dir + '/labels.json', 'w') as outfile:
                json.dump(label_dict, outfile, indent=4, ensure_ascii=False)
            params['sequence_length'] = x_train.shape[1]
            with open(trained_dir + '/trained_parameters.json', 'w') as outfile:
                json.dump(params, outfile, indent=4, sort_keys=True,
                          ensure_ascii=False)
            with open(trained_dir + '/classifier.pickle', 'wb') as outfile:
                pickle.dump(clf, outfile, pickle.HIGHEST_PROTOCOL)
###############################################################################
# Multilabel Classifier
###############################################################################
from skmultilearn.problem_transform import ClassifierChain
classifier = ClassifierChain(svm.SVC(decision_function_shape='ovo'))
classifier.fit(train_features, tmp)
p = classifier.predict(test_features)
print(p)

from skmultilearn.adapt import MLkNN
clsfr = MLkNN(k=1)
clsfr.fit(train_features, tmp)
p = clsfr.predict(test_features)
print(p)

###############################################################################
# Search for videos with similar tags
###############################################################################
import urllib
from bs4 import BeautifulSoup

d = {}
d[0] = "cheering"
d[1] = "music"
d[2] = "speech"
p = p.todense()
print(p)
for tup in p:
    tupp = np.matrix(tup).tolist()[0]
from sklearn.neighbors import KNeighborsClassifier
classifier_knn = KNeighborsClassifier(n_neighbors=5, p=2)
classifier_knn.fit(features_train, labels_train)
pred_knn = classifier_knn.predict(features_test)
Score_knn = classifier_knn.score(features_test, labels_test)

# ------------------------------------------------------------------------------
# using the Multilabel KNN Algorithm
from skmultilearn.adapt import MLkNN
mlknn_model = MLkNN(k=20)
# train
mlknn_model.fit(features_train, labels_train)
# predict
predictions = mlknn_model.predict(features_test)
score_mlknn = accuracy_score(labels_test, predictions)

''' PERFORMING FEATURE SELECTION '''
from sklearn.decomposition import PCA
pca = PCA(n_components=388)
# Fit PCA on the training set only, then apply the same projection to both
# sets; the original code re-fit PCA on the test set, which leaks test
# statistics and projects the two sets onto inconsistent components.
pca.fit(features_train)
features_train_new = pca.transform(features_train)
features_test_new = pca.transform(features_test)
end_time = time.time()
print('Classifier trained and saved in: ', end_time - start_time, 's')

if cf is None:
    cf = joblib.load(cf_name)
if vec is None:
    vec = joblib.load(vec_name)
if genres is None:
    genres = joblib.load(genres_name)

yes_no = 'yes'
while yes_no != 'no':
    film = input('Enter a movie description: ')
    film_rep = vec.transform([film])
    predicted = cf.predict(film_rep)
    res = ''
    print(predicted[0, :].toarray()[0])
    for genre, prediction in zip(genres, predicted[0, :].toarray()[0]):
        if prediction == 1:
            res += genre + ', '
    print(res[:-2])
    yes_no = input('Would you like more movies [yes/no]: ')
    for j in range(y_num):
        # count how many of the k nearest neighbours carry label j
        temp = 0
        for t in range(neigs.shape[0]):
            temp = temp + neigs[t][j + 1]
        # MAP decision: predict 1 if P(H1) * P(E|H1) > P(H0) * P(E|H0)
        if ph[j] * peh1[j, temp] > ph_[j] * peh0[j, temp]:
            predict.append(1)
        else:
            predict.append(0)
    predicts.append(predict)
predicts = np.array(predicts)
return predicts


data = pickle.load(open('datasets.pickle', 'rb'))
# get the training data X and the label matrix Y
X = data[0]
Y = data[1]

predict = mlknn(X, X, 8, 5, Y)
print(predict)
print(accuary(predict, Y))

ml = MLkNN(k=8)
ml.fit(X, Y)
p = ml.predict(X)
print(accuary(p, Y))

kn = KNeighborsClassifier(n_neighbors=8)
kn.fit(X, Y)
pp = kn.predict(X)
# the original printed accuary(p, Y) here, re-scoring the MLkNN
# predictions instead of the KNeighborsClassifier ones
print(accuary(pp, Y))
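# Note on the handcrafted mlknn above: the per-label decision implements the
# standard ML-kNN MAP rule of Zhang & Zhou (2007), predicting label j as 1 iff
#     P(H_1^j) * P(C_j = c | H_1^j) > P(H_0^j) * P(C_j = c | H_0^j),
# where c is the number of the k nearest neighbours that carry label j. In the
# code, ph/ph_ hold the priors P(H_1^j)/P(H_0^j) and peh1/peh0 the smoothed
# likelihoods; this is why its output should agree closely with skmultilearn's
# MLkNN for the same k, as the comparison above checks.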
            batch_pred_y = session.run(
                y_last, feed_dict={x: batch_x_emb,
                                   sequence_lengths: [sequence_length] * batch_size})
            train_x_fit[step * batch_size:step * batch_size + batch_size] = batch_pred_y
            train_y_fit[step * batch_size:step * batch_size + batch_size] = batch_y

        # fit ML-kNN on the network outputs
        clf = MLkNN(k=4)
        clf.fit(X=train_x_fit, y=train_y_fit)

        # dev stage
        batches_dev = batch_yield(dev_x, dev_y, batch_size, word2id, label_dict,
                                  sequence_length, shuffle=False)
        total_batches_dev = len(dev_x) // batch_size
        dev_x_fit = np.zeros([batch_size * total_batches_dev, n_classes])
        dev_y_fit = np.zeros([batch_size * total_batches_dev, n_classes])
        for step, (batch_dev_x, batch_dev_y) in enumerate(batches_dev):
            batch_dev_x_emb = session.run(word_embeddings,
                                          feed_dict={input_ids: batch_dev_x})
            batch_dev_pred_y = session.run(
                y_last, feed_dict={x: batch_dev_x_emb,
                                   sequence_lengths: [sequence_length] * batch_size})
            dev_x_fit[step * batch_size:step * batch_size + batch_size] = batch_dev_pred_y
            dev_y_fit[step * batch_size:step * batch_size + batch_size] = batch_dev_y

        dev_preds = clf.predict(dev_x_fit)
        dev_preds = dev_preds.toarray()
        # in base_y, entries equal to 2 are true positives, 1 are mismatches
        base_y = dev_preds + dev_y_fit
        acc = float(np.sum(base_y == 2)) / float(np.sum(base_y == 1) + np.sum(base_y == 2))
        precision = float(np.sum(base_y == 2)) / float(np.sum(dev_preds == 1))
        recall = float(np.sum(base_y == 2)) / float(np.sum(dev_y_fit == 1))
        f1 = 2 * precision * recall / (precision + recall)
        print('----------- Epoch {} -------------'.format(epoch + 1))
        print('Accuracy\tPrecision\tRecall\tF1 measure')
        print(str(acc) + '\t' + str(precision) + '\t' + str(recall) + '\t' + str(f1))

        save_path = saver.save(session, model_path)

'''
## Make predictions
test_data = read_data('data_path/labeled_text_test2.csv')
start = time.time()
from scipy.sparse import csr_matrix, lil_matrix
from skmultilearn.adapt import MLkNN

x_train = lil_matrix(x_train).toarray()
y_train = lil_matrix(y_train).toarray()
x_test = lil_matrix(x_test).toarray()

# classifier = MLkNN(k=4)  # in the original, this MLkNN instance was
# immediately overwritten by the BRkNNb classifier below, so it is left
# commented out here
from skmultilearn.adapt import BRkNNbClassifier
classifier = BRkNNbClassifier(k=6)

# train
classifier.fit(x_train, y_train)
# predict
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ", accuracy_score(y_test, predictions))
print("\n")
print("F1 = ", f1_score(y_test, predictions, average='micro'))
print("\n")
# jaccard_similarity_score was renamed jaccard_score in scikit-learn 0.21+
print("Jaccard = ", jaccard_similarity_score(y_test, predictions))
print("\n")
print("Precision = ", precision_score(y_test, predictions, average='micro'))
print("\n")
print("Recall = ", recall_score(y_test, predictions, average='micro'))
print("\n")
# feature_x = pkl.load(open('features_for_classification.pkl'))
# uncommented so the fit/predict calls below have a classifier to use
classifier = BinaryRelevance(GaussianNB())

# random.sample needs a sequence, so materialize the keys first
Keys_Train = random.sample(list(ent2type.keys()), 10000)
Keys_Test = list(ent2type.keys())
for val in Keys_Train:
    Keys_Test.remove(val)

X_Train = [feature_x[key] for key in Keys_Train]
X_Test = [feature_x[key] for key in Keys_Test]
Y_Train = generate_labels(Keys_Train)
Y_Test = generate_labels(Keys_Test)

print('HERE 1')
classifier.fit(np.array(X_Train), np.array(Y_Train))
print('HERE 2')
predictions = classifier.predict(np.array(X_Test))
print(accuracy_score(np.array(Y_Test), predictions))
preds = predictions.toarray()


def accuracy(pair):
    data = pair[0]
    true = pair[1]
    size = len(data)
    FP = TP = FN = TN = 0
    # tally the confusion-matrix counts
    for i in range(size):
        if true[i] == True:
            if data[i] == True:
                TP += 1
            else:
                FN += 1
        else:
            if data[i] == True:
                FP += 1
            else:
                TN += 1
    return float(TP + TN) / size
    # print(images_resized)
    # convert images to numpy array
    x_multidim = np.array([np.array(image) for image in images_resized])
    # print(x_multidim.shape)
    # flatten the numpy array
    return x_multidim.reshape(n_samples, -1)
    # print(x.shape)
    # print(x)


Xtrain = imageprep(dir + 'tmp_images/*.jpg')
Xval = imageprep(dir + 'val_images/*.jpg')
i, ytrain = multi_label(dir + "train_subset.json")
i, yval = multi_label(dir + "validation.json")
ytrain, yval = ytrain[:1000], yval[:1000]

classifier = MLkNN(k=10)
classifier.fit(Xtrain, ytrain)
predictions = classifier.predict(Xval)
print(predictions)
# acc = accuracy_score(yval, predictions)
# print("Accuracy on test set: {}".format(acc))
        accuracy_arr.append(sub_accuracy)
        recall_arr.append(recall)
        precision_arr.append(precision)
        f1_arr.append(f1)
    elif mlknn_with_grid != True and grid != True:
        X_train = lil_matrix(X_train).toarray()
        y_train = lil_matrix(y_train).toarray()
        X_test = lil_matrix(X_test).toarray()
        y_test = lil_matrix(y_test).toarray()
        k = [2, 3, 4, 5, 6, 7, 8, 9, 10]
        for i in k:
            model = MLkNN(k=i, s=0.2).fit(X_train, y_train)
            # predict once per k and reuse the result for every metric
            predictions = model.predict(X_test)
            hamming[i].append(hamming_loss(y_test, predictions))
            Subset_Accuracy[i].append(accuracy_score(y_test, predictions))
            Precision[i].append(
                precision_score(y_test, predictions, average='micro'))
            Recall[i].append(
                recall_score(y_test, predictions, average='micro'))
            f1[i].append(f1_score(y_test, predictions, average='micro'))

if mlknn_with_grid != True and grid != True:
    all = [hamming, Subset_Accuracy, Recall, Precision, f1]
def MultiLabel_class(temp_interval):
    # split the dataset into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        data, label, test_size=temp_interval, random_state=17)
    save_folder = open(save_path, "a+")
    save_folder.write("Test set fraction: " + str(temp_interval) + "\n")

    # # Method 1: Binary Relevance - treat each label as a separate
    # # single-label problem, with a Gaussian naive Bayes base classifier.
    # classifier = BinaryRelevance(GaussianNB())
    # # train
    # classifier.fit(X_train, y_train)
    # # predict
    # predictions = classifier.predict(X_test)
    # print("Method 1:", accuracy_score(y_test, predictions))
    # print("Method 1:", np.mean(predictions == y_test))

    # Method 2: OneVsRest - the class of interest is the positive class and
    # all remaining classes together form the negative class.
    # One-vs-rest classifier with a linear-kernel SVM.
    clf1 = OneVsRestClassifier(SVC(kernel='linear', gamma='auto'), n_jobs=-1)
    # clf1 = OneVsRestClassifier(SVC(kernel='poly', gamma='auto'), n_jobs=-1)
    # train
    clf1.fit(X_train, y_train)
    # output the predicted labels
    predict_class = clf1.predict(X_test)
    # accuracy: predicted results vs. the actual results
    save_folder.write("OneVsRest(accuracy_score):" +
                      str(clf1.score(X_test, y_test)) + "\n")
    save_folder.write("OneVsRest(mean):" +
                      str(np.mean(predict_class == y_test)) + "\n")

    # # Method 3: Label Powerset - treat each of the 2^k combinations of the
    # # k labels as one class of a single-label problem.
    # classifier = LabelPowerset(GaussianNB())
    # # train
    # classifier.fit(X_train, y_train)
    # # predict
    # predictions = classifier.predict(X_test)
    # print("Method 3 (accuracy_score):", accuracy_score(y_test, predictions))
    # print("Method 3 (mean):", np.mean(predictions == y_test))

    # Method 4: Adapted Algorithm - the multi-label KNN algorithm ML-kNN
    classifier = MLkNN(k=20)
    # train
    classifier.fit(X_train, y_train)
    # predict
    predictions = classifier.predict(X_test)
    save_folder.write("MLKNN(accuracy_score):" +
                      str(accuracy_score(y_test, predictions)) + "\n")
    save_folder.write("MLKNN(mean):" +
                      str(np.mean(predictions == y_test)) + "\n")

    # # Method 5: Classifier Chains
    # classifier = ClassifierChain(GaussianNB())
    # # train
    # classifier.fit(X_train, y_train)
    # # predict
    # predictions = classifier.predict(X_test)
    # print("Method 5:", accuracy_score(y_test, predictions))
    # print("Method 5:", np.mean(predictions == y_test))

    # np.save(save_path, predict_class)
    # accuracy: predicted results vs. the actual results
    # print(np.mean(predict_class == y_test))
    save_folder.close()
# l = [200]
# l = [likely_k]
# l = [70, 80, 90, 100, 500, 1000, 2000, 3000, 4000, 5600]
best_clf = None
lowest_hl = float('inf')
best_k = float('inf')
for k in l:
    print(25 * '=')
    print('k = ' + str(k))
    clf = MLkNN(k)
    # train
    clf.fit(x_train, y_train)
    # predict
    predictions = clf.predict(x_dev)
    predictions = predictions.todense()
    print('all match:',
          np.sum(np.all(predictions == y_dev, axis=1)) / len(y_dev))
    print('at least one match:',
          (np.sum(np.all(predictions - y_dev <= 0, axis=1))
           - np.sum(np.all(predictions == 0, axis=1))) / len(y_dev))
    print('binary :', np.mean(predictions == y_dev))
    hl = hamming_loss(y_dev, predictions)
    print('Hamming Loss:', hl)
    # keep the classifier with the lowest Hamming loss on the dev set
    if hl < lowest_hl:
        lowest_hl = hl
        best_clf = clf
        best_k = k
# import sys
# np.set_printoptions(threshold=sys.maxsize)
# splitting the data into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.30, random_state=2)

# transforming the data
X_train_tfidf = vetorizar.transform(X_train)
X_val_tfidf = vetorizar.transform(X_val)
X_test1_tfidf = vetorizar.transform(X_test1)

# using the Multi-label kNN classifier
mlknn_classifier = MLkNN()
mlknn_classifier.fit(X_train_tfidf, y_train)

# prediction
predicted = mlknn_classifier.predict(X_val_tfidf)
print(f1_score(y_val, predicted, average='micro'))

# -------- test ---------------------------------------------------------------
predicts = mlknn_classifier.predict(X_test1_tfidf)
k = pd.DataFrame(predicts.todense())
ss[TARGET_COLS] = k
ss.to_csv(r"C:\Users\Sheeja Ayoob\Desktop\hacklive_NLP_sub7.csv", index=False)
# ------------------------------------------------------------------------------