def main():
    data = readData("IMDB-Movie-Data.csv")
    genres = data["Genre"]
    descriptions = data["Description"]
    labels = getLabels(genres)
    calculateNgrams(descriptions)

    features = list(map(extract_features, descriptions))
    print(len(features[1]))
    # X = features
    # Y = Labels
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        test_size=0.33,
                                                        random_state=42)
    #binRel(X_train, X_test, y_test, y_train)
    classifier = MLkNN(k=4)
    # Train
    classifier.fit(X_train, y_train)
    #predict
    #print X_test
    predictions = classifier.predict(np.array(X_test))
    print('Hamming loss: {0}'.format(
        sklearn.metrics.hamming_loss(y_test, predictions)))  #(y_true, y_pred)
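    # A small extension (not in the original snippet): micro-averaged F1 is a
    # common companion to Hamming loss for multi-label evaluation.
    print('Micro-F1: {0}'.format(
        sklearn.metrics.f1_score(y_test, predictions, average='micro')))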
Example #2
    def classifiers(self):
        graph_builder = LabelCooccurrenceGraphBuilder(weighted=True,
                                                      include_self_edges=False)

        param_dicts = {
            'GraphFactorization': dict(epoch=1),
            'GraRep': dict(Kstep=2),
            'HOPE': dict(),
            'LaplacianEigenmaps': dict(),
            'LINE': dict(epoch=1, order=1),
            'LLE': dict(),
        }

        if not (sys.version_info[0] == 2
                or platform.architecture()[0] == '32bit'):
            for embedding in OpenNetworkEmbedder._EMBEDDINGS:
                if embedding == 'LLE':
                    dimension = 3
                else:
                    dimension = 4

                yield EmbeddingClassifier(
                    OpenNetworkEmbedder(copy(graph_builder), embedding,
                                        dimension, 'add', True,
                                        param_dicts[embedding]),
                    LinearRegression(), MLkNN(k=2))

        yield EmbeddingClassifier(
            SKLearnEmbedder(SpectralEmbedding(n_components=2)),
            LinearRegression(), MLkNN(k=2))

        yield EmbeddingClassifier(CLEMS(metrics.accuracy_score, True),
                                  LinearRegression(), MLkNN(k=2), True)
def adapted(X_train, y_train, X_test, y_test):
    # Adapted-algorithm approach: fit ML-kNN and report subset accuracy
    classifier = MLkNN(k=20)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    accuracyScore = accuracy_score(y_test, predictions)
    return accuracyScore
def mlknn(train_data_inx, y_train, test_data_inx):
	# mlknn_k and corpus_tfidf are assumed to be defined at module scope
	classifier = MLkNN(k=mlknn_k)
	x_train = [corpus_tfidf[i] for i in train_data_inx]
	x_test = [corpus_tfidf[j] for j in test_data_inx]
	classifier.fit(csr_matrix(x_train), csr_matrix(y_train))
	mlknn_pre = classifier.predict(csr_matrix(x_test))
	return mlknn_pre.toarray()
Example #6
def MLKNN_method(X_train, y_train, ml_k, ml_s):
    """
    Adapted algorithm --> ML-kNN method
    :param X_train: input data
    :param y_train: corresponding label data
    :return: the fitted classifier, or None on failure
    """
    try:
        classifier = MLkNN(k=int(ml_k), s=float(ml_s))
        classifier.fit(X_train, y_train)

        return classifier
    except Exception as e:
        print("warning----adapted algorithm KNN|MLKNN----" + str(e))

    return None
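A brief usage sketch for the helper above (hypothetical variables, not part of the original example):

    clf = MLKNN_method(X_train, y_train, ml_k=10, ml_s=1.0)
    if clf is not None:
        predictions = clf.predict(X_test)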
Example #7
    def fit(self, X, y):
        """Fit classifier to multi-label data

        Parameters
        ----------
        X : numpy.ndarray or scipy.sparse
            input features, can be a dense or sparse matrix of size
            :code:`(n_samples, n_features)`
        y : numpy.ndarray or scipy.sparse {0,1}
            binary indicator matrix with label assignments, shape
            :code:`(n_samples, n_labels)`

        Returns
        -------
        fitted instance of self
        """
        self._label_count = y.shape[1]
        self.model_count_ = int(np.ceil(self._label_count /
                                        self.labelset_size))
        self.classifier_ = LabelSpacePartitioningClassifier(
            classifier=MLkNN(),
            clusterer=RandomLabelSpaceClusterer(
                cluster_size=self.labelset_size,
                cluster_count=self.model_count_,
                allow_overlap=False),
            require_dense=[False, False])
        return self.classifier_.fit(X, y)
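For context, here is a minimal standalone sketch of the same partitioning idea on toy data (the random shapes are illustrative assumptions; the classes are scikit-multilearn's public API):

import numpy as np
from skmultilearn.adapt import MLkNN
from skmultilearn.cluster import RandomLabelSpaceClusterer
from skmultilearn.ensemble import LabelSpacePartitioningClassifier

X = np.random.rand(100, 20)                 # 100 samples, 20 features
y = np.random.randint(0, 2, size=(100, 6))  # 6 binary labels
clf = LabelSpacePartitioningClassifier(
    classifier=MLkNN(k=3),
    clusterer=RandomLabelSpaceClusterer(cluster_size=2,
                                        cluster_count=3,
                                        allow_overlap=False),
    require_dense=[False, False])
clf.fit(X, y)
predictions = clf.predict(X)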
Example #8
def run_test1(normas):
    models = [[('cv', CountVectorizer(min_df=20, max_df=0.5))],
              [('tfidf', TfidfVectorizer(min_df=20, max_df=0.5))],
              [('tokenize', Tokenizador()),
               ('d2v', D2VTransformer(dm=0, min_count=100, size=200,
                                      workers=6))]]

    clfs = [{
        'clf': ('dt', DecisionTreeClassifier()),
        'params': {
            'dt__min_samples_split': [0.005, 0.010, 2],
            'dt__max_depth': [16, 32, None]
        }
    }, {
        'clf': ('rf', RandomForestClassifier()),
        'params': {
            'rf__n_estimators': [100, 110, 120],
            'rf__min_samples_split': [0.005, 0.010, 2],
            'rf__min_samples_leaf': [5, 3, 1]
        }
    }, {
        'clf': ('mlknn', MLkNN()),
        'params': {
            'mlknn__k': [6, 8, 10, 12],
            'mlknn__s': [0.5, 1.0, 1.5, 2.0]
        }
    }, {
        'clf': ('mlp', MLPClassifier()),
        'params': {
            'mlp__hidden_layer_sizes': [(150,), (100, 100), (50, 50, 50)],
            'mlp__activation': ['tanh', 'relu'],
            'mlp__solver': ['sgd', 'adam']
        }
    }]
    run(normas, models, clfs)
Example #9
    def mlknn(self, number):
        classifier = MLkNN(k=number)

        classifier.fit(self.X_train, self.y_train)

        # predict
        predictions = classifier.predict(self.X_test)
        result = hamming_loss(self.y_test, predictions)
        print("hamming_loss:", result)

        result = f1_score(self.y_test, predictions, average='micro')
        print("micro-f1:", result)

        result = precision_score(self.y_test, predictions, average='micro')
        print("micro-precision:", result)
Example #10
def MLKnn_GridSearch(X_train, X_test, y_train, y_test):
    parameters = {'k': range(1, 12), 's': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 1.0]}
    score = 'f1_macro'

    clf = GridSearchCV(MLkNN(), parameters, scoring=score)
    clf.fit(X_train, y_train)

    print(clf.best_params_, clf.best_score_)
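    # A small extension (not in the original example): refit the best
    # parameters and evaluate on the held-out split.
    from sklearn.metrics import f1_score
    best_clf = MLkNN(**clf.best_params_)
    best_clf.fit(X_train, y_train)
    y_pred = best_clf.predict(X_test)
    print('test f1_macro:', f1_score(y_test, y_pred, average='macro'))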
Example #11
    def __init__(self, metadata):
        """
        Args:
          metadata: an AutoDLMetadata object. Its definition can be found in
              AutoDL_ingestion_program/dataset.py
        """
        self.done_training = False
        self.metadata = metadata
        self.output_dim = self.metadata.get_output_size()
        # Imputer was removed in scikit-learn 0.22; SimpleImputer is its successor
        self.imputer = Imputer(missing_values='NaN',
                               strategy='mean',
                               axis=0,
                               verbose=0,
                               copy=True)
        self.model = MLkNN(k=20)
        self.step = 0
        self.lgb_round = 80
Example #12
    def RecommendByMLKNN(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
        """ML KNN算法"""
        classifier = MLkNN(k=train_data_y.shape[1])
        classifier.fit(train_data, train_data_y)

        predictions = classifier.predict_proba(test_data).todense()
        """预测结果转化为data array"""
        predictions = numpy.asarray(predictions)

        recommendList = DataProcessUtils.getListFromProbable(predictions, range(1, train_data_y.shape[1] + 1),
                                                             recommendNum)
        answerList = test_data_y
        print(predictions)
        print(test_data_y)
        print(recommendList)
        print(answerList)
        return [recommendList, answerList]
Example #13
    def train(self):

        classifier_new = MLkNN(k=10)

        x_train = lil_matrix(self.x_data).toarray()
        y_train = lil_matrix(self.y_data).toarray()
        x_test = lil_matrix(self.x_test).toarray()

        classifier_new.fit(x_train, y_train)

        # predict
        predictions = classifier_new.predict(x_test)

        return {
            'accuracy': accuracy_score(self.y_test, predictions),
            'f1_score': f1_score(self.y_test, predictions, average='micro')
        }
Example #14
def create_model(file_path=FINAL_MLKNN_MODEL_FILE_PATH):
    """
    Creates and trains a MLkNN classifier using the optimized parameters found
    Saves this trained model to disk

    :param string file_path: specifies where the model should be saved
    :return: a trained sklearn MLkNN classifier
    """

    with open(OPTIMIZED_MODEL_PARAMETERS_FILE_PATH) as file:
        hyperparameters = json.load(file)['hyperparameters']

    question_data, music_data = preprocessing.load_data()
    question_data, music_data = preprocessing.preprocess_data(
        question_data, music_data)
    clf = MLkNN(k=hyperparameters['k'], s=hyperparameters['s'])
    clf.fit(question_data.values, music_data.values)
    pickle.dump(clf, open(file_path, 'wb'))
    return clf
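A matching load step (a sketch assuming the same FINAL_MLKNN_MODEL_FILE_PATH convention and pickle import used above):

def load_model(file_path=FINAL_MLKNN_MODEL_FILE_PATH):
    """Loads a previously pickled MLkNN classifier from disk (hypothetical helper)."""
    with open(file_path, 'rb') as f:
        return pickle.load(f)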
    def __init__(self,
                 random_state=84,
                 n_estimators=20,
                 params={
                     'k': range(5, 27, 2),
                     's': [0.5, 0.7, 1.0]
                 },
                 niterations=10):
        self.model = MLkNN()
        self.params = params
        self.niterations = niterations
Example #16
    def __init__(self,
                 k=5,
                 classifier=MLkNN(),
                 lambd=0.3,
                 delta=0.5,
                 threshold=0.70):
        self.k = k
        self.classifier = classifier
        self.lambd = lambd
        self.delta = delta
        self.threshold = threshold
Example #17
    def adapt(X_train, y_train, X_test, y_test):

        y_train = y_train.to_sparse().to_coo()
        y_test = y_test.to_sparse().to_coo()

        from skmultilearn.adapt import MLkNN
        classifier = MLkNN(k=4)

        print("Train Adapted algorithm")

        classifier.fit(X_train, y_train)

        print("Predict")
        predictions = classifier.predict(X_test)

        from sklearn.metrics import accuracy_score

        print("Accuracy")
        print(y_test.shape, predictions.shape)
        print(accuracy_score(y_test.toarray(), predictions))
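Note, as an aside to the example above: pandas removed DataFrame.to_sparse() in version 1.0, so on current pandas the equivalent conversion is roughly:

        from scipy import sparse
        y_train = sparse.csr_matrix(y_train.values)
        y_test = sparse.csr_matrix(y_test.values)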
Example #18
def mlknn(traindata, trainlabel, ttype):  #,valdata,val_label):

    #knnscore=[]
    #print("[mlknn start to class>>>>]")
    ''' find the best parameters'''
    parameters = {'k': range(2, 5), 's': np.arange(0.1, 0.5, 0.2)}
    score = 'accuracy'
    '''search parameters'''
    search_result = search_bestparmaters(MLkNN(), parameters, score, traindata,
                                         trainlabel)

    #print (search_result.best_params_, search_result.best_score_)

    k = search_result.best_params_['k']
    s = search_result.best_params_['s']
    save_score('score/record',
               ('mlknn', ttype, k, s, search_result.best_score_))

    clf = MLkNN(k=k, s=s)
    clf.fit(traindata, trainlabel)
    joblib.dump(clf, './model/mlknn' + "_model" + ttype + ".m")
Example #19
# Renamed from "MLkNN" so the class does not shadow skmultilearn's MLkNN,
# which is instantiated inside __init__.
class WindowedMLkNN():
	def __init__(self, window_size=100):
		self.h = MLkNN(k=20)
		self.window_size = window_size
		self.window = InstanceWindow(window_size)
		self.number_element = 0
		self.flag = False
		self.L = None

	def partial_fit(self, X, y):
		N, L = y.shape
		self.L = L
		for i in range(N):
			if self.window is None:
				self.window = InstanceWindow(self.window_size)
			self.window.add_element(np.asarray([X[i]]), np.asarray([[y[i]]]))
			self.number_element += 1
			# once the window is full, refit the batch learner on its contents
			if self.number_element == self.window_size:
				X_batch = self.window.get_attributes_matrix()
				y_batch = self.window.get_targets_matrix()
				self.h.fit(X_batch, y_batch)
				self.number_element = 0
				self.flag = True
Example #20
def get_cado_predictions():
    data_path = '../../datasets/cado/train.csv'
    test_path = '../../datasets/cado/test.csv'

    data = du.load_data(data_path)
    test = du.load_data(test_path)

    text_index = 6
    label_start_index = 7
    X = [d[text_index] for d in data]
    labels = [d[label_start_index:label_start_index + 12] for d in data]

    X_test = [d[text_index] for d in test]
    labels_test = [d[label_start_index:label_start_index + 12] for d in test]

    Y = np.array(labels, dtype='int')
    y_test = np.array(labels_test, dtype='int')
    #Y = np.array(binary_labels, dtype='int')

    test_index = len(X)

    X = X + X_test
    Y = np.vstack([Y, y_test])

    tokenizer = tokenize_data(X)
    word_index = tokenizer.word_index

    sequences = tokenizer.texts_to_sequences(X)

    X = pad_sequences(sequences,
                      maxlen=700,
                      padding="post",
                      truncating="post",
                      value=0)

    num_words = min(MAX_NB_WORDS, len(word_index) + 1)
    embedding_matrix = np.zeros((num_words, 1))

    for word, i in word_index.items():
        if i >= MAX_NB_WORDS:
            continue
        embedding_matrix[i] = 1

    X_train = X[0:test_index, :]
    Y_train = Y[0:test_index, :]
    x_test = X[test_index:len(X), :]
    y_test = Y[test_index:len(Y), :]

    classifier = MLkNN()
    classifier.fit(X_train, Y_train)
    predictions = classifier.predict(x_test)
    scores = classifier.predict_proba(x_test)
    y_pred = predictions.toarray()
    y_score = scores.toarray()

    return y_pred, y_score
def get_classifiers():
    binary_relevance = BinaryRelevance(GaussianNB())
    classifier_chain = ClassifierChain(GaussianNB())
    label_powerset = LabelPowerset(GaussianNB())
    decision_tree = DecisionTreeClassifier(random_state=0)
    knn = MLkNN(k=20)
    random_forest = RandomForestClassifier(max_depth=2, random_state=0)
    clfs = [
        binary_relevance, classifier_chain, label_powerset, decision_tree, knn,
        random_forest
    ]
    names = [
        'binary_relevance', 'classifier_chain', 'label_powerset',
        'decision_tree', 'knn', 'random_forest'
    ]
    return clfs, names
Example #22
def test_mlknn(df, truth, eval_type):
    parameters = {'k': range(1, 4), 's': [0.5, 0.7, 1.0]}
    kfold = KFold(n_splits=10, shuffle=True, random_state=26)
    # print("Start gridsearch")
    # clf = GridSearchCV(MLkNN(), parameters, scoring=eval_type, cv=kfold)
    # clf.fit(df, truth)
    # print(f"Gridsearch completed. Best params: {clf.best_params_}")

    best_classifier = MLkNN(k=3, s=0.5)
    print("Start Crossval")
    scores = cross_val_score(best_classifier,
                             df.values,
                             truth,
                             cv=kfold,
                             scoring=eval_type)
    return ["MLkNN"], [scores]
Example #23
    def __init__(self, model_name="MLKNNbaseline"):

        if model_name == "MLKNNbaseline":
            self.model = MLkNN()
        elif model_name == "BRkNNbaseline":
            self.model = BRkNNaClassifier()
        elif model_name == "BRSVCbaseline":
            self.model = BinaryRelevance(classifier=SVC(),
                                         require_dense=[False, True])
        else:
            raise ValueError(
                "Specify MLKNNbaseline, BRkNNbaseline, or BRSVCbaseline model name"
            )
        self.model_name = model_name
Example #24
    def MLkNN(self):
        print("")
        print("Starting MLkNN Classifier of skmultilearn.adapt...")
        print("")
        start = datetime.now()

        parameters = {'k': range(1, 3), 's': [0.5, 0.7, 1.0]}

        grid_search_cv = GridSearchCV(MLkNN(),
                                      parameters,
                                      scoring='f1_macro',
                                      verbose=2,
                                      n_jobs=-1)
        grid_search_cv.fit(self.x_train, self.y_train)
        clf = grid_search_cv.best_estimator_

        y_pred = clf.predict(self.x_test)
        return self.multilabel_evaluation(y_pred, self.y_test)
Example #25
    def getClassifier(self):
        if self.classifierType.lower() == 'rakelo':
            classifier = RakelO(
                base_classifier=LabelPowerset(GaussianNB()),
                #base_classifier_require_dense=[True, True],
                model_count=10,
                labelset_size=2  #len(labelTypes) // 4
            )
        elif self.classifierType.lower() == 'mlknn':
            classifier = MLkNN(k=3)
        # elif self.classifierType.lower() == 'mltsvm':
        #     classifier = MLTSVM(c_k = 2**-1)
        elif self.classifierType.lower() == 'mlaram':
            classifier = MLARAM()
        elif self.classifierType.lower() == 'labelpowerset':
            classifier = LabelPowerset(
                classifier=RandomForestClassifier(n_estimators=100),
                require_dense=[False, True])
        return classifier
Example #26
    def MLkNN(self):
        self.sub_parser.add_argument('--library',
                                     action='store_true',
                                     default=False)

        args = self.sub_parser.parse_args(sys.argv[2:])
        print('Running ML-kNN, arguments=%s' % args)
        print('Loading %s data...' % args.N)

        if args.f == 'My_dict':
            vectorizer = my_dict_vectorizer(stop=not args.nostop,
                                            bigram=args.bigram)
        elif args.f == 'LIB_count':
            vectorizer = lib_count_vectorizer(stop=not args.nostop,
                                              bigram=args.bigram)
        elif args.f == 'LIB_hash':
            vectorizer = lib_hash_vectorizer(stop=not args.nostop,
                                             bigram=args.bigram)
        elif args.f == 'LIB_tfidf':
            vectorizer = lib_tfidf_vectorizer(stop=not args.nostop,
                                              bigram=args.bigram)

        data = load_data(args.N, args.D, args.Nt, vectorizer)
        print('Done loading data, actual feature size:', data[1].shape)

        X, Y, Xt, Yt, cats = data
        if args.library:
            from skmultilearn.adapt import MLkNN
            model = MLkNN()
        else:
            from sklearn.neighbors import NearestNeighbors
            from multi import MLkNN
            model = MLkNN(NearestNeighbors)
        model.fit(X, Y)
        Yp = model.predict(Xt)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            hl = computeMetrics(Yp, Yt, cats)

        print('the hamming loss:')
        print('>>  ', hl)
        from sklearn.metrics import (hamming_loss, classification_report)
        print('hamming loss (library):', hamming_loss(Yt, Yp))
        print(classification_report(Yt, Yp, target_names=cats))
        print('DONE..')
Example #27
def ML_model_predict(train_x, train_y, test_x, model_name):
    print(f"--------train {model_name} model----------")
    classifier = None
    if model_name == "MLARAM":
        classifier = MLARAM(threshold=0.2)
    elif model_name == "MLkNN":
        classifier = MLkNN()
    elif model_name == "BRkNNa":
        classifier = BRkNNaClassifier()
    elif model_name == "BRkNNb":
        classifier = BRkNNbClassifier()
    elif model_name == "RF":
        classifier = RandomForestClassifier(n_estimators=1000,
                                            random_state=0,
                                            n_jobs=-1)
    elif model_name == "MLTSVM":
        classifier = MLTSVM(c_k=2**-1)
    classifier.fit(train_x, train_y)
    prediction = classifier.predict(test_x)
    return prediction
Example #28
def multiLabelKnn():
    classifier_new = MLkNN(k=10)
    # Note that this classifier can throw up errors when handling sparse matrices.
    x_train = lil_matrix(train_x).toarray()
    y_train = lil_matrix(train_y).toarray()
    x_test = lil_matrix(test_x).toarray()

    filename = 'model.sav'
    start = time.time()

    # train
    # classifier_new.fit(x_train, y_train)

    # save
    # pickle.dump(classifier_new, open(filename, 'wb'))

    # load the model from disk
    loaded_model = pickle.load(open(filename, 'rb'))
    # result = loaded_model.score(X_test, Y_test)
    print('training time taken: ', round(time.time() - start, 0), 'seconds')
    # predict
    predictions_new = loaded_model.predict(x_test)
Example #29
def mlknn(x_tr, y_tr, x_te, x_va=None):
    """
    ML-kNN
    :param x_tr: training features
    :param y_tr: training label matrix
    :param x_te: test features
    :param x_va: optional validation features
    :return: dense test predictions (plus validation predictions if x_va is given)
    """
    # s is the Bayesian smoothing parameter and must be a float, not a bool
    pred = MLkNN(k=10, s=1.0)
    y_tr = np.int32(y_tr)
    pred.fit(x_tr, y_tr)

    if x_va is None:
        return pred.predict(x_te).toarray()
    else:
        y_te_ = pred.predict(x_te).toarray()
        y_va_ = pred.predict(x_va).toarray()
        return y_te_, y_va_
# note that this unpickling is only for the most previously pickled (k=5 right now)
# pickle_file = open('MLkNN_milestone.pkl', 'rb')
# clf = pickle.load(pickle_file)

# 30 is currently the best tested k amount.
l = [30, 40, 50, 100, 200, 280]
# l = [200]
# l = [likely_k]
# l = [70, 80, 90, 100, 500, 1000, 2000, 3000, 4000, 5600]
best_clf = None
lowest_hl = float('inf')
best_k = float('inf')
for k in l:
    print(25*'=')
    print('k = ' + str(k))
    clf = MLkNN(k)

    # train
    clf.fit(x_train, y_train)

    # predict
    predictions = clf.predict(x_dev)

    predictions = predictions.todense()
    print('all match:',
          np.sum(np.all(predictions == y_dev, axis=1)) / len(y_dev))
    print('at least one match:',
          (np.sum(np.all(predictions - y_dev <= 0, axis=1))
           - np.sum(np.all(predictions == 0, axis=1))) / len(y_dev))
    print('binary :', np.mean(predictions == y_dev))
    hl = hamming_loss(y_dev, predictions)
    print('Hamming Loss:', hl)
    if hl < lowest_hl:
        lowest_hl = hl
        best_k = k
        best_clf = clf