# imports needed by this snippet (helpers such as readData are defined elsewhere)
import numpy as np
import sklearn.metrics
from sklearn.model_selection import train_test_split
from skmultilearn.adapt import MLkNN


def main():
    data = readData("IMDB-Movie-Data.csv")
    genres = data["Genre"]
    descriptions = data["Description"]
    labels = getLabels(genres)
    calculateNgrams(descriptions)

    features = list(map(extract_features, descriptions))
    print(len(features[1]))
    # X = features
    # Y = Labels
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        test_size=0.33,
                                                        random_state=42)
    #binRel(X_train, X_test, y_test, y_train)
    classifier = MLkNN(k=4)
    # Train
    classifier.fit(X_train, y_train)
    #predict
    #print X_test
    predictions = classifier.predict(np.array(X_test))
    print('Hamming loss: {0}'.format(
        sklearn.metrics.hamming_loss(y_test, predictions)))  #(y_true, y_pred)
def adapted(X_train, y_train, X_test, y_test):

    classifier = MLkNN(k=20)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    accuracyScore = accuracy_score(y_test, predictions)
    return accuracyScore
Example #3
def get_cado_predictions():
    data_path = '../../datasets/cado/train.csv'
    test_path = '../../datasets/cado/test.csv'

    data = du.load_data(data_path)
    test = du.load_data(test_path)

    text_index = 6
    label_start_index = 7
    X = [d[text_index] for d in data]
    labels = [d[label_start_index:label_start_index + 12] for d in data]

    X_test = [d[text_index] for d in test]
    labels_test = [d[label_start_index:label_start_index + 12] for d in test]

    Y = np.array(labels, dtype='int')
    y_test = np.array(labels_test, dtype='int')
    #Y = np.array(binary_labels, dtype='int')

    test_index = len(X)
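    # X and Y are concatenated with the test data below; test_index marks
    # where the train rows end so the combined matrix can be split back later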

    X = X + X_test
    Y = np.vstack([Y, y_test])

    tokenizer = tokenize_data(X)
    word_index = tokenizer.word_index

    sequences = tokenizer.texts_to_sequences(X)

    X = pad_sequences(sequences,
                      maxlen=700,
                      padding="post",
                      truncating="post",
                      value=0)

    num_words = min(MAX_NB_WORDS, len(word_index) + 1)
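    # placeholder "embedding" matrix: rows for in-vocabulary words are set to 1;
    # it is not used by the MLkNN classifier in this snippet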
    embedding_matrix = np.zeros((num_words, 1))

    for word, i in word_index.items():
        if i >= MAX_NB_WORDS:
            continue
        embedding_matrix[i] = 1

    X_train = X[0:test_index, :]
    Y_train = Y[0:test_index, :]
    x_test = X[test_index:len(X), :]
    y_test = Y[test_index:len(Y), :]

    classifier = MLkNN()
    classifier.fit(X_train, Y_train)
    predictions = classifier.predict(x_test)
    scores = classifier.predict_proba(x_test)
    y_pred = predictions.toarray()
    y_score = scores.toarray()

    return y_pred, y_score
def mlknn(train_data_inx,y_train,test_data_inx):
	classifier = MLkNN(k=mlknn_k)
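	# corpus_tfidf and mlknn_k are assumed to be defined at module level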
	x_train = []
	x_test = []
	for i in range(len(train_data_inx)):
		x_train.append(corpus_tfidf[train_data_inx[i]])
	for j in range(len(test_data_inx)):
		x_test.append(corpus_tfidf[test_data_inx[j]])
	classifier.fit(csr_matrix(x_train), csr_matrix(y_train))
	mlknn_pre = classifier.predict(csr_matrix(x_test))
	mlknn_pre = mlknn_pre.toarray()
	return mlknn_pre
Example #5
def MLKNN_method(X_train, y_train, ml_k, ml_s):
    """
	改编算法-->MLKNN方法
	:param X_train: 输入数据
	:param y_train: 对应标签数据
	:return:
	"""
    try:
        classifier = MLkNN(k=int(ml_k), s=float(ml_s))
        classifier.fit(X_train, y_train)

        return classifier
    except Exception as e:
        print("warning----改编算法KNN|MLKNN----" + str(e))

    return None
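# hypothetical usage (X_train / y_train are not defined in this snippet):
# clf = MLKNN_method(X_train, y_train, ml_k=10, ml_s=1.0)
# predictions = clf.predict(X_test) if clf is not None else None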
Example #6
    def mlknn(self, number):
        classifier = MLkNN(k=number)

        classifier.fit(self.X_train, self.y_train)

        # predict
        predictions = classifier.predict(self.X_test)
        result = hamming_loss(self.y_test, predictions)

        print("hanming_loss,",result)

        result = f1_score(self.y_test, predictions, average='micro')
        print("micro -f1: ", result)

        result = precision_score(self.y_test, predictions, average='micro')
        print("micro-precision: ", result)
Example #7
    def RecommendByMLKNN(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
        """ML KNN算法"""
        classifier = MLkNN(k=train_data_y.shape[1])
        classifier.fit(train_data, train_data_y)

        predictions = classifier.predict_proba(test_data).todense()
        """预测结果转化为data array"""
        predictions = numpy.asarray(predictions)

        recommendList = DataProcessUtils.getListFromProbable(predictions, range(1, train_data_y.shape[1] + 1),
                                                             recommendNum)
        answerList = test_data_y
        print(predictions)
        print(test_data_y)
        print(recommendList)
        print(answerList)
        return [recommendList, answerList]
Example #8
    def train(self):

        classifier_new = MLkNN(k=10)

        x_train = lil_matrix(self.x_data).toarray()
        y_train = lil_matrix(self.y_data).toarray()
        x_test = lil_matrix(self.x_test).toarray()

        classifier_new.fit(x_train, y_train)

        # predict
        predictions = classifier_new.predict(x_test)

        return {
            'accuracy': accuracy_score(self.y_test, predictions),
            'f1_score': f1_score(self.y_test, predictions, average='micro')
        }
Example #9
    def MLkNN(self):
        self.sub_parser.add_argument('--library',
                                     action='store_true',
                                     default=False)

        args = self.sub_parser.parse_args(sys.argv[2:])
        print('Running ML-kNN, arguments=%s' % args)
        print('Loading %s data...' % args.N)

        if args.f == 'My_dict':
            vectorizer = my_dict_vectorizer(stop=not args.nostop,
                                            bigram=args.bigram)
        elif args.f == 'LIB_count':
            vectorizer = lib_count_vectorizer(stop=not args.nostop,
                                              bigram=args.bigram)
        elif args.f == 'LIB_hash':
            vectorizer = lib_hash_vectorizer(stop=not args.nostop,
                                             bigram=args.bigram)
        elif args.f == 'LIB_tfidf':
            vectorizer = lib_tfidf_vectorizer(stop=not args.nostop,
                                              bigram=args.bigram)

        data = load_data(args.N, args.D, args.Nt, vectorizer)
        print('Done loading data, actual feature size:', data[1].shape)

        X, Y, Xt, Yt, cats = data
        if args.library:
            from skmultilearn.adapt import MLkNN
            model = MLkNN()
        else:
            from sklearn.neighbors import NearestNeighbors
            from multi import MLkNN
            model = MLkNN(NearestNeighbors)
        model.fit(X, Y)
        Yp = model.predict(Xt)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            hl = computeMetrics(Yp, Yt, cats)

        print('the hamming loss:')
        print('>>  ', hl)
        from sklearn.metrics import (hamming_loss, classification_report)
        print('hamming loss(library):', hamming_loss(Yt, Yp))
        print(classification_report(Yt, Yp, target_names=cats))
        print('DONE..')
Example #10
def create_model(file_path=FINAL_MLKNN_MODEL_FILE_PATH):
    """
    Creates and trains a MLkNN classifier using the optimized parameters found
    Saves this trained model to disk

    :param string file_path: specifies where the model should be saved
    :return: a trained sklearn MLkNN classifier
    """

    with open(OPTIMIZED_MODEL_PARAMETERS_FILE_PATH) as file:
        hyperparameters = json.load(file)['hyperparameters']

    question_data, music_data = preprocessing.load_data()
    question_data, music_data = preprocessing.preprocess_data(
        question_data, music_data)
    clf = MLkNN(k=hyperparameters['k'], s=hyperparameters['s'])
    clf.fit(question_data.values, music_data.values)
    with open(file_path, 'wb') as model_file:
        pickle.dump(clf, model_file)
    return clf
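# hypothetical usage: train and save once, then reload the pickled model later
# clf = create_model()
# with open(FINAL_MLKNN_MODEL_FILE_PATH, 'rb') as f:
#     clf = pickle.load(f)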
Example #11
def mlknn(x_tr, y_tr, x_te, x_va=None):
    """
    mlknn
    :param x_tr:
    :param y_tr:
    :param x_te:
    :param x_va:
    :return:
    """
    pred = MLkNN(k=10, s=True)
    y_tr = np.int32(y_tr)
    pred.fit(x_tr, y_tr)

    if x_va is None:
        y_te_ = sparse.dok_matrix.toarray(pred.predict(x_te))
        return y_te_
    else:
        y_te_ = sparse.dok_matrix.toarray(pred.predict(x_te))
        y_va_ = sparse.dok_matrix.toarray(pred.predict(x_va))
        return y_te_, y_va_
Example #12
    def adapt(X_train, y_train, X_test, y_test):

        y_train = y_train.to_sparse().to_coo()
        y_test = y_test.to_sparse().to_coo()

        from skmultilearn.adapt import MLkNN
        classifier = MLkNN(k=4)

        print("Train Adapted algorithm")

        classifier.fit(X_train, y_train)

        print("Predict")
        predictions = classifier.predict(X_test)

        from sklearn.metrics import accuracy_score

        print("Accuracy")
        print(y_test.shape, predictions.shape)
        print(accuracy_score(y_test.toarray(), predictions))
Example #13
def mlknn(traindata, trainlabel, ttype):  #,valdata,val_label):

    #knnscore=[]
    #print("[mlknn start to class>>>>]")
    ''' find the best parameters'''
    parameters = {'k': range(2, 5), 's': np.arange(0.1, 0.5, 0.2)}
    score = 'accuracy'
    '''search parameters'''
    search_result = search_bestparmaters(MLkNN(), parameters, score, traindata,
                                         trainlabel)

    #print (search_result.best_params_, search_result.best_score_)

    k = search_result.best_params_['k']
    s = search_result.best_params_['s']
    save_score('score/record',
               ('mlknn', ttype, k, s, search_result.best_score_))

    clf = MLkNN(k, s)
    clf.fit(traindata, trainlabel)
    joblib.dump(clf, './model/mlknn' + "_model" + ttype + ".m")
Example #14
class MLkNN():
	def __init__(self,window_size=100):
		# this class shadows skmultilearn's MLkNN, so alias the library import
		from skmultilearn.adapt import MLkNN as _MLkNN
		self.h=_MLkNN(k=20)
		self.window_size=window_size
		self.window=InstanceWindow(window_size)
		self.number_element=0
		self.flag=False
		self.L=None

	def partial_fit(self,X,y):
		N,L=y.shape
		self.L=L
		for i in range(N):
			if self.window is None:
				self.window=InstanceWindow(self.window_size)
			self.window.add_element(np.asarray([X[i]]), np.asarray([y[i]]))
			self.number_element+=1
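			# once window_size examples have accumulated, refit the wrapped
			# MLkNN model on the current window and reset the counter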
			if self.number_element==self.window_size:
				X_batch=self.window.get_attributes_matrix()
				y_batch=self.window.get_targets_matrix()
				self.h.fit(X_batch,y_batch)
				self.number_element=0
				self.flag=True
Example #15
    return vector


document_X = []
document_Y = []
test_y = []
for example in tqdm(y_train):
    arr = example.strip().split()
    document_Y.append(to_category_vector(arr, max_len))
document_Y = np.array(document_Y)
for example in tqdm(Y_test):
    arr = example.strip().split()
    test_y.append(to_category_vector(arr, max_len))
test_y = np.array(test_y)

classifier_new = MLkNN(k=5)
classifier_new.fit(x_train, document_Y)
# predict
predictions_new = classifier_new.predict(x_test)
# print(predictions_new)

pred = predictions_new.toarray()
with open("data/predict.txt", 'w') as f:
    for i in tqdm(range(pred.shape[0])):
        g = np.where(pred[i] == 1)
        hashtags = []
        for item in g[0]:
            word = id_to_word[item]
            hashtags.append(word)
        f.write(' '.join(hashtags))
        f.write('\n')
 session.run(init)
 for epoch in range(training_epochs):
     # train stage
     batches=batch_yield(train_x, train_y, batch_size, word2id, label_dict, sequence_length, shuffle=False)
     train_x_fit=np.zeros([batch_size*total_batches, n_classes])
     train_y_fit=np.zeros([batch_size*total_batches, n_classes])
     for step, (batch_x, batch_y) in enumerate(batches):
         batch_x_emb=session.run(word_embeddings, feed_dict={input_ids:batch_x})
         _, c=session.run([optimizer, loss_function], feed_dict={x:batch_x_emb, y:batch_y, 
                                                                 y_steps:np.tile(batch_y,((sequence_length),1)), 
                                                                 sequence_lengths:[sequence_length]*batch_size})
         batch_pred_y=session.run(y_last, feed_dict={x:batch_x_emb, sequence_lengths:[sequence_length]*batch_size})
         train_x_fit[step*batch_size : step*batch_size+batch_size]=batch_pred_y
         train_y_fit[step*batch_size : step*batch_size+batch_size]=batch_y
     clf=MLkNN(k=4)
     clf.fit(X=train_x_fit, y=train_y_fit)
     # dev stage
     batches_dev=batch_yield(dev_x, dev_y, batch_size, word2id, label_dict, sequence_length, shuffle=False)
     total_batches_dev=len(dev_x)//batch_size
     dev_x_fit=np.zeros([batch_size*total_batches_dev, n_classes])
     dev_y_fit=np.zeros([batch_size*total_batches_dev, n_classes])
     for step, (batch_dev_x, batch_dev_y) in enumerate(batches_dev):
         batch_dev_x_emb=session.run(word_embeddings, feed_dict={input_ids:batch_dev_x})
         batch_dev_pred_y=session.run(y_last, feed_dict={x:batch_dev_x_emb, sequence_lengths:[sequence_length]*batch_size})
         dev_x_fit[step*batch_size : step*batch_size+batch_size]=batch_dev_pred_y
         dev_y_fit[step*batch_size : step*batch_size+batch_size]=batch_dev_y
     dev_preds=clf.predict(dev_x_fit)
     dev_preds=dev_preds.toarray()
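     # dev_preds + dev_y_fit is 2 where both are 1 (true positive),
     # 1 where exactly one is 1 (false positive or false negative), else 0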
     base_y=dev_preds+dev_y_fit
     acc=float(np.sum(base_y==2))/float(np.sum(base_y==1)+np.sum(base_y==2))
     precision=float(np.sum(base_y==2))/float(np.sum(dev_preds==1))
import pandas as pd
#import numpy as np
from skmultilearn.adapt import MLkNN
from sklearn.metrics import accuracy_score
import random

dframe = pd.read_csv("trainData.csv", header=None)
dset = dframe.values
dframe1 = pd.read_csv("pseudotrain.csv", header=None)
X_train = dframe1.values
input1 = len(dset[0])
Y_train = dset[:, 1:input1]
del dframe, dset
input = len(X_train[0])
output = len(Y_train[0])

#for test data
dframe = pd.read_csv("testData.csv", header=None)
dset = dframe.values
dframe1 = pd.read_csv("pseudotest.csv", header=None)
dframe1 = dframe1.fillna(random.uniform(0.0001, 0.001))
X_test = dframe1.values
input1 = len(dset[0])
Y_test = dset[:, 1:input1]
input = len(X_test[0])
output = len(Y_test[0])

classifier = MLkNN(20)
classifier.fit(X_train, Y_train)
predictions = classifier.predict(X_test)
acc = accuracy_score(Y_test, predictions)
print(acc)
Example #18
def run():
    parser = get_arg_parser()
    cmd_args = parser.parse_args()

    if cmd_args.gpu is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(cmd_args.gpu)
        gpunum = os.getenv('CUDA_VISIBLE_DEVICES')
        logging.info("GPU has been set to {}".format(gpunum))

    logging.info("Model used for the regression network: {}"
                 .format(cmd_args.model_name))

    # 1. Dataset retrieval
    # --------------------

    tab_printer(constants.Dataset)
    dataset = Dataset(nrows=constants.Dataset.nrows,
                      augment_labels=constants.Dataset.augment_labels,
                      top_n=constants.Dataset.top_n)

    logging.info("Going to create vocabulary and fit a preprocessing pipeline"
                 "using {} samples. Settings will be listed below"
                 .format(len(dataset.X_train)))

    # 2. Preprocessing
    # -----------------

    tab_printer(constants.NLP)
    preprocessor = Preprocessing(dataset.X_train)

    # Preprocess documents
    X_train = preprocessor.transform_documents(dataset.X_train)
    X_test = preprocessor.transform_documents(dataset.X_test)

    # 3. Word embeddings with word2vec
    # --------------------------------

    # Train word2vec embeddings if train_word2vec option is selected
    if cmd_args.train_word2vec: utils.embeddings.main()
    weights = get_embedding_tensor(preprocessor)

    # 4. Node embeddings with AttentionWalk
    # -------------------------------------
    args = _generate_deepwalk_parameters(dataset.y_train_graph)
    if cmd_args.train_attentionwalk: train_attention_walk(args)

    graph_embeddings = pd.read_csv(args.embedding_path).iloc[:, 1:].values

    # Get document representations using node embeddings
    y_embedded = _get_label_embeddings(dataset.y_train, graph_embeddings)
    y_test_embedded = _get_label_embeddings(dataset.y_test, graph_embeddings)

    # 5. Regressor Training
    # ---------------------

    device = 'cuda:' + str(os.getenv("CUDA_VISIBLE_DEVICES")) \
        if torch.cuda.is_available() else 'cpu'

    regressor_nn = NeuralNet(
        get_network_class(cmd_args.model_name),
        max_epochs=constants.NeuralNetworkTraining.epochs,
        lr=constants.NeuralNetworkTraining.learning_rate,
        batch_size=constants.NeuralNetworkTraining.batch_size,
        optimizer=torch.optim.Adam,
        criterion=torch.nn.MSELoss,

        module__output_dim=args.dimensions,
        module__embedding=weights,
        module__embedding_dim=constants.NLP.embedding_size,

        device=device,
        train_split=None,
    )

    # Train the regressor neural network
    regressor_nn.fit(X_train, y_embedded.astype(np.float32))

    # 6. Train Multi-label KNN algorithm
    # ----------------------------------

    tab_printer(constants.MLKNN)

    # Train multi-label KNN to turn label embeddings into label predictions
    classifier = MLkNN(k=constants.MLKNN.k, s=constants.MLKNN.s)
    classifier.fit(y_embedded, dataset.y_train)

    # 7. Evaluation
    # -------------

    # Label prediction with documents
    y_test_pred = regressor_nn.predict(X_test)
    preds = classifier.predict(y_test_pred)
    preds_raw = classifier.predict_proba(y_test_pred)

    # Label prediction with label embeddings
    preds_w_labels = classifier.predict(y_test_embedded)
    preds_w_labels_raw = classifier.predict_proba(y_test_embedded)

    # Log evaluation result with label embeddings
    eval_metrics_w_labels = evaluation \
        .all_metrics(preds_w_labels.toarray(),
                     dataset.y_test,
                     yhat_raw=preds_w_labels_raw.toarray())

    logging.info(str(eval_metrics_w_labels))

    # Log evaluation result with documents
    report_evaluation(preds.toarray(),
                      dataset.y_test,
                      yhat_raw=preds_raw.toarray())
Example #19
 def TrainProcess(train_data,train_target):
     cla = MLkNN(k=4, s=0.01)
     cla.fit(train_data, train_target)
     return cla
    LogReg_pipeline.fit(x_train, train[category])
    
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(x_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
    print("\n")

from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix
classifier_new = MLkNN(k=10)
# Note that this classifier can throw errors when handling sparse matrices.
x_train = lil_matrix(x_train).toarray()
y_train = lil_matrix(y_train).toarray()
x_test = lil_matrix(x_test).toarray()
# train
classifier_new.fit(x_train, y_train)
# predict
predictions_new = classifier_new.predict(x_test)
# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions_new))
print("\n")

# using Label Powerset
from skmultilearn.problem_transform import LabelPowerset
# initialize label powerset multi-label classifier
classifier = LabelPowerset(LogisticRegression())
# train
classifier.fit(x_train, y_train)
# predict
predictions = classifier.predict(x_test)
# accuracy
Example #21
    # print(images_resized)

    # convert images to numpy array
    x_multidim = np.array([np.array(image) for image in images_resized])
    #print(x_multidim.shape)

    # flatten the numpy array
    return x_multidim.reshape(n_samples, -1)
    # print(x.shape)
    # print(x)


Xtrain = imageprep(dir + 'tmp_images/*.jpg')

Xval = imageprep(dir + 'val_images/*.jpg')

i, ytrain = multi_label(dir + "train_subset.json")
i, yval = multi_label(dir + "validation.json")

ytrain, yval = ytrain[:1000], yval[:1000]

classifier = MLkNN(k=10)

classifier.fit(Xtrain, ytrain)

predictions = classifier.predict(Xval)
print(predictions)

# acc = accuracy_score(yval, predictions)
# print("Accuracy on test set: {}".format(acc))
Example #22

###################################################################################      Multilabel Classifier     ######################################################################################

from skmultilearn.problem_transform import ClassifierChain
classifier = ClassifierChain(svm.SVC(decision_function_shape='ovo'))
classifier.fit(train_features,tmp)

p=classifier.predict(test_features)
print(p)



from skmultilearn.adapt import MLkNN
clsfr = MLkNN(k=1)
clsfr.fit(train_features,tmp)

p=clsfr.predict(test_features)
print(p)


###########################################################################      Search for videos with similar tags   ##################################################################################

import urllib
from bs4 import BeautifulSoup
d={}
d[0]="cheering"
d[1]="music"
d[2]="speech"
p=p.todense()
print(p)
Example #23
def mlknn_train_pred(k_list,
                     df_train_x,
                     df_train_y,
                     df_test_x,
                     df_test_y,
                     target_cols,
                     NFOLDS=5):
    """
    This function z-score normalizes the train and test data, split the train data in K-folds and run the 
    Multilabel KNN on the folds to choose the best "K", thereafter predicting on the K-fold train data and
    test set using the Best K, averaging out the predictions across all folds for the test set.
    
    Args:
            k_list: A list of "K" nearest neighbours to perform gridsearch on
            df_train_x: train data with only phenotypic/morphological features - pandas dataframe.
            df_train_y: train data with only the MOA (Mechanism of actions) target labels - pandas dataframe.
            df_test_x: test data with only phenotypic/morphological features - pandas dataframe.
            df_test_y: test data with only the MOA (Mechanism of actions) target labels- pandas dataframe.
            target_cols: A list of MOA (Mechanism of actions) target labels
            NFOLDS: A value that represent number of K-subset/cross-validation we want to perform
    
    Returns:
            oof_preds: Train out-of-fold predictions - pandas dataframe.
            test_preds: Test predictions - pandas dataframe.

    """

    sc = StandardScaler()
    df_train_x_scaled = pd.DataFrame(sc.fit_transform(df_train_x),
                                     columns=df_train_x.columns)
    df_test_x_scaled = pd.DataFrame(sc.transform(df_test_x),
                                    columns=df_test_x.columns)

    acc_losses = []
    oof_preds = pd.DataFrame(np.zeros(shape=(df_train_y.shape)),
                             columns=target_cols)
    test_preds = pd.DataFrame(np.zeros(shape=(df_test_y.shape)),
                              columns=target_cols)
    skf = MultilabelStratifiedKFold(n_splits=NFOLDS,
                                    shuffle=True,
                                    random_state=133)

    print('Execution time | Fold number | logloss | Best K |')
    for fn, (trn_idx,
             val_idx) in enumerate(skf.split(df_train_x_scaled, df_train_y)):
        start_time = time()
        X_train, X_val = df_train_x_scaled.loc[
            trn_idx, :], df_train_x_scaled.loc[val_idx, :]
        y_train, y_val = df_train_y.iloc[trn_idx, :], df_train_y.iloc[
            val_idx, :]

        best_k = 0
        best_loss = np.inf
        for k_item in k_list:
            classifier = MLkNN(k=k_item)
            classifier.fit(X_train.values, y_train.values)
            val_preds = classifier.predict_proba(X_val.values)
            loss = log_loss(np.ravel(y_val), np.ravel(val_preds.toarray()))
            if loss < best_loss:
                best_loss = loss
                best_k = k_item
                oof_preds.iloc[val_idx, :] = val_preds.toarray()

        classifier = MLkNN(k=best_k)
        classifier.fit(X_train.values, y_train.values)
        acc_losses.append(best_loss)
        preds = classifier.predict_proba(df_test_x_scaled.values)
        test_preds += preds.toarray() / NFOLDS
        print('{}\t\t{}\t\t{:.5f}\t\t{}'.format(
            str(datetime.timedelta(seconds=time() - start_time))[:7], fn,
            best_loss, best_k))

    return oof_preds, test_preds
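# hypothetical usage (dataframes and target_cols as described in the docstring):
# oof_preds, test_preds = mlknn_train_pred([5, 10, 15], df_train_x, df_train_y,
#                                          df_test_x, df_test_y,
#                                          target_cols=list(df_train_y.columns))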
Example #24
    acc_tuple = None
    hamm_tuple = None
    f1_tuple = None
    time_tuple = None

    print('Starting hyperparameter tuning on the validation set:')
    start_time = time.time()

    for k in np.arange(1, 6):
        for s in [0.3, 0.5, 0.7, 1.0]:
            print('k = ', k, ', s = ', s)

            inner_start_time = time.time()

            classifier = MLkNN(k=k, s=s)
            prediction = classifier.fit(x_train, y_train).predict(x_val)

            inner_end_time = time.time()
            inner_time_passed = inner_end_time - inner_start_time

            hamm = metrics.hamming_loss(y_val, prediction)
            acc = metrics.accuracy_score(y_val, prediction)
            f1 = metrics.f1_score(y_val, prediction, average='micro')
            print('HM:', hamm)
            print('AS:', acc)
            print('F1:', f1)

            if acc > max_acc:
                print('New best accuracy (', acc, '>',
                      max_acc, ') from the combination k =', k, ', s =', s)
                max_acc = acc
#y_train_large = (y_train >= 7)
#y_train_odd = (y_train % 2 == 1)
#y_multilabel = np.c_[y_train_large, y_train_odd]
#knn_clf = KNeighborsClassifier()
#knn_clf.fit(f_norm, tlabels)

from skmultilearn.dataset import load_dataset
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.svm import SVC
import sklearn.metrics as metrics
import sklearn.model_selection
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    f_norm, tlabels, test_size=0.2, random_state=1)
tt = y_train.astype('int')
ttest = y_test.astype('int')
clf = BinaryRelevance(classifier=SVC(), require_dense=[False, True])
clf.fit(X_train, tt)
prediction = clf.predict(X_test)
metrics.hamming_loss(ttest, prediction)
metrics.accuracy_score(ttest, prediction)
#Hamming loss measures how well the classifier predicts each of the labels, averaged over samples, then over labels
#accuracy score measures how well the classifier predicts label combinations, averaged over samples
#jaccard similarity measures the proportion of predicted labels for a sample that match its correct assignment, averaged over samples
#precision measures how many of the labels predicted for a sample are correct, averaged over samples
#recall measures how many of a sample's true labels are predicted, averaged over samples
#F1 score is a weighted average of precision and recall, where both have the same impact on the score
from skmultilearn.adapt import MLkNN
classifier = MLkNN(k=3)
prediction = classifier.fit(X_train, tt).predict(X_test)
metrics.hamming_loss(ttest, prediction)
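# for intuition, a hypothetical 2-sample, 3-label case:
#   y_true = [[1,0,1],[0,1,0]], y_pred = [[1,0,0],[0,1,0]]
#   hamming_loss = 1/6 (one wrong label out of six)
#   accuracy_score = 1/2 (only the second sample matches exactly)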
Example #26
        for j in range(y_num):
            temp = 0
            for t in range(neigs.shape[0]):
                temp = temp + neigs[t][j + 1]
            if ph[j] * peh1[j, temp] > ph_[j] * peh0[j, temp]:
                predict.append(1)
            else:
                predict.append(0)
        predicts.append(predict)
    predicts = np.array(predicts)
    return predicts


data = pickle.load(open('datasets.pickle', 'rb'))
#get the training data X and the label categories Y
X = data[0]
Y = data[1]

predict = mlknn(X, X, 8, 5, Y)
print(predict)
print(accuary(predict, Y))

ml = MLkNN(k=8)
ml.fit(X, Y)
p = ml.predict(X)
print(accuary(p, Y))

kn = KNeighborsClassifier(n_neighbors=8)
kn.fit(X, Y)
pp = kn.predict(X)
print(accuary(pp, Y))
Example #27
from datetime import timedelta
import time
start = time.time()

from scipy.sparse import csr_matrix, lil_matrix
from skmultilearn.adapt import MLkNN
x_train = lil_matrix(x_train).toarray()
y_train = lil_matrix(y_train).toarray()
x_test = lil_matrix(x_test).toarray()
classifier = MLkNN(k=4)

# train
from skmultilearn.adapt import BRkNNbClassifier

classifier = BRkNNbClassifier(k=6)
classifier.fit(x_train, y_train)
# predict
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ", accuracy_score(y_test, predictions))
print("\n")
print("F1 = ", f1_score(y_test, predictions, average='micro'))
print("\n")

print("Jaccard = ", jaccard_similarity_score(y_test, predictions))
print("\n")

print("Precision = ", precision_score(y_test, predictions, average='micro'))
print("\n")
Example #28
class Model(object):
    """Fully connected neural network with no hidden layer."""
    def __init__(self, metadata):
        """
    Args:
      metadata: an AutoDLMetadata object. Its definition can be found in
          AutoDL_ingestion_program/dataset.py
    """
        self.done_training = False
        self.metadata = metadata
        self.output_dim = self.metadata.get_output_size()
        self.imputer = Imputer(missing_values='NaN',
                               strategy='mean',
                               axis=0,
                               verbose=0,
                               copy=True)
        self.model = MLkNN(k=20)
        self.step = 0
        self.lgb_round = 80

    def train(self, dataset, remaining_time_budget=None):
        """Train this algorithm on the tensorflow |dataset|.
    This method will be called REPEATEDLY during the whole training/predicting
    process. So your `train` method should be able to handle repeated calls and
    hopefully improve your model performance after each call.

    ****************************************************************************
    ****************************************************************************
    IMPORTANT: the loop of calling `train` and `test` will only run if
        self.done_training = False
      (the corresponding code can be found in ingestion.py, search
      'M.done_training')
      Otherwise, the loop will go on until the time budget is used up. Please
      pay attention to set self.done_training = True when you think the model is
      converged or when there is not enough time for next round of training.
    ****************************************************************************
    ****************************************************************************

    Args:
      dataset: a `tf.data.Dataset` object. Each of its examples is of the form
            (example, labels)
          where `example` is a dense 4-D Tensor of shape
            (sequence_size, row_count, col_count, num_channels)
          and `labels` is a 1-D Tensor of shape
            (output_dim,).
          Here `output_dim` represents number of classes of this
          multilabel classification task.

          IMPORTANT: some of the dimensions of `example` might be `None`,
          which means the shape on this dimension might be variable. In this
          case, some preprocessing technique should be applied in order to
          feed the training of a neural network. For example, if an image
          dataset has `example` of shape
            (1, None, None, 3)
          then the images in this dataset may have different sizes. One could
          apply resizing, cropping or padding in order to have a fixed size
          input tensor.

      remaining_time_budget: time remaining to execute train(). The method
          should keep track of its execution time to avoid exceeding its time
          budget. If remaining_time_budget is None, no time budget is imposed.
    """
        if self.done_training:
            return
        self.step += 1
        # print(f'dataset: {dataset}')
        t1 = time.time()
        # Count examples on training set
        if not hasattr(self, 'num_examples_train'):
            logger.info("Counting number of examples on train set.")
            dataset = dataset.batch(128)
            iterator = dataset.make_one_shot_iterator()
            next_element = iterator.get_next()
            X = []
            Y = []
            with tf.Session(config=tf.ConfigProto(
                    log_device_placement=False)) as sess:
                while True:
                    try:
                        example, labels = sess.run(next_element)
                        example = np.squeeze(example)
                        X.extend(example)
                        Y.extend(labels)
                    except tf.errors.OutOfRangeError:
                        break
            self.X_train = np.array(X)
            self.y_train = np.array(Y)
            print('self.X_train.shape: {}'.format(self.X_train.shape))
            print('self.y_train.shape: {}.'.format(self.y_train.shape))
            self.num_examples_train = len(self.y_train)
            logger.info("Finished counting. There are {} examples for training set." \
                        .format(self.num_examples_train))
        print('elapsed time: {}'.format(time.time() - t1))
        if self.lgb_round >= 300 or self.step > 10:
            self.done_training = True
            return
        if hasattr(self, 'test_duration'):
            round = int(50 * self.test_duration + 5)
            self.lgb_round += round
        train_start = time.time()
        self.X_train = self.imputer.fit_transform(self.X_train)
        self.model.fit(self.X_train, self.y_train)
        train_end = time.time()

        # Update for time budget managing
        train_duration = train_end - train_start
        logger.info("{} step. {:.2f} sec used. ".format(
            self.step, train_duration))

        self.done_training = True

    def test(self, dataset, remaining_time_budget=None):
        """Test this algorithm on the tensorflow |dataset|.

    Args:
      Same as that of `train` method, except that the `labels` will be empty.
    Returns:
      predictions: A `numpy.ndarray` matrix of shape (sample_count, output_dim).
          here `sample_count` is the number of examples in this dataset as test
          set and `output_dim` is the number of labels to be predicted. The
          values should be binary or in the interval [0,1].
    """
        # Count examples on test set
        if not hasattr(self, 'num_examples_test'):
            logger.info("Counting number of examples on test set.")
            dataset = dataset.batch(128)
            iterator = dataset.make_one_shot_iterator()
            example, labels = iterator.get_next()
            X = []
            with tf.Session(config=tf.ConfigProto(
                    log_device_placement=False)) as sess:
                while True:
                    try:
                        ex = sess.run(example)
                        ex = np.squeeze(ex)
                        X.extend(ex)
                    except tf.errors.OutOfRangeError:
                        break
            self.X_test = np.array(X)
            self.num_examples_test = self.X_test.shape[0]
            logger.info("Finished counting. There are {} examples for test set." \
                        .format(self.num_examples_test))

        test_begin = time.time()
        logger.info("Begin testing...")
        self.X_test = self.imputer.transform(self.X_test)
        predictions = self.model.predict(self.X_test).A
        # print(type(predictions))
        # print(predictions.A)
        # preds = self.model.predict_proba(self.X_test)
        # print(preds)
        # test_results = pd.Series(test_results).map(self.remps).values
        # predictions = self.bin2y(test_results)
        # print(predictions)
        test_end = time.time()
        # Update some variables for time management
        self.test_duration = test_end - test_begin
        logger.info("[+] Successfully made one prediction. {:.2f} sec used. " \
                    .format(self.test_duration) + \
                    "Duration used for test: {:2f}".format(self.test_duration))
        return predictions

    def y2bin(self, y):
        res = y[:, 0]
        for i in range(1, y.shape[1]):
            res *= 2
            res += y[:, i]
        return res
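        # e.g. y2bin on the single label row [1, 0, 1] packs binary 101 -> 5;
        # bin2y(5) restores [[1, 0, 1]] when self.output_dim == 3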

    def bin2y(self, bin):
        y = np.array([bin % 2]).T
        i = 1
        while i < self.output_dim:
            i += 1
            bin = bin // 2
            y = np.c_[np.array([bin % 2]).T, y]
            # y = np.insert(y, 0, values=bin%2, axis=1)
        return y
# 30 is currently the best tested value of k.
l = [30, 40, 50, 100, 200, 280]
# l = [200]
# l = [likely_k]
# l = [70, 80, 90, 100, 500, 1000, 2000, 3000, 4000, 5600]
best_clf = None
lowest_hl = float('inf')
best_k = float('inf')
for k in l:
    print(25*'=')
    print('k = ' + str(k))
    clf = MLkNN(k)

    # train
    clf.fit(x_train, y_train)

    # predict
    predictions = clf.predict(x_dev)

    predictions = predictions.todense()
    print('all match:', np.sum(np.all(predictions == y_dev, axis=1)) / len(y_dev))
    print('at least one match:', (np.sum(np.all(predictions - y_dev <= 0, axis=1))-np.sum(np.all(predictions== 0, axis=1))) / len(y_dev))
    print('binary :', np.mean(predictions == y_dev))
    hl = hamming_loss(y_dev, predictions)
    print('Hamming Loss:', hamming_loss(y_dev, predictions))
    if hl < lowest_hl:
        lowest_hl = hl
        best_clf = clf
        best_k = k
    
Example #30
    
classifier = MLkNN(k=20)

#feature_x = pkl.load(open('features_for_classification.pkl'))
#classifier = BinaryRelevance(GaussianNB())


Keys_Train = random.sample(list(ent2type.keys()), 10000)
Keys_Test = list(ent2type.keys())
Keys_Train_set = set(Keys_Train)
Keys_Test = [key for key in Keys_Test if key not in Keys_Train_set]
X_Train = [feature_x[key] for key in Keys_Train]
X_Test = [feature_x[key] for key in Keys_Test]
Y_Train = generate_labels(Keys_Train)
Y_Test = generate_labels(Keys_Test)
print('HERE 1')
classifier.fit(np.array(X_Train), np.array(Y_Train))
print('HERE 2')
predictions = classifier.predict(np.array(X_Test))

print(accuracy_score(np.array(Y_Test),predictions))

preds = predictions.toarray()

def accuracy(input):
    data = input[0]
    true = input[1]
    size = len(data)
    FP = TP = FN = TN = 0
    for i in range(size):
        if true[i] == True:
            if data[i] == True: