def main():
    """Load the IMDB movie dataset, build features from the descriptions,
    train an ML-kNN (k=4) genre classifier, and print the Hamming loss.

    FIX: converted the Python 2 ``print len(...)`` statement to a Python 3
    call, and removed a stray unterminated ``''''`` at the end of the
    function, which was a SyntaxError.
    """
    data = readData("IMDB-Movie-Data.csv")
    genres = data["Genre"]
    descriptions = data["Description"]
    labels = getLabels(genres)
    calculateNgrams(descriptions)
    features = list(map(extract_features, descriptions))
    print(len(features[1]))
    # X = features
    # Y = Labels
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.33, random_state=42)
    # binRel(X_train, X_test, y_test, y_train)
    classifier = MLkNN(k=4)
    # Train
    classifier.fit(X_train, y_train)
    # Predict
    # print X_test
    predictions = classifier.predict(np.array(X_test))
    print('Hamming loss: {0}'.format(
        sklearn.metrics.hamming_loss(y_test, predictions)))  # (y_true, y_pred)
def __init__(self, window_size=100):
    """Initialise the sliding-window ML-kNN wrapper.

    :param window_size: number of instances to buffer before (re)fitting
        the underlying batch model.
    """
    self.window_size = window_size
    self.window = InstanceWindow(window_size)
    self.h = MLkNN(k=20)  # underlying batch learner
    self.number_element = 0  # instances buffered since the last fit
    self.flag = False  # set True once the model has been fitted
    self.L = None  # label count, filled in by partial_fit
def classifiers(self):
    """Yield embedding-based multi-label classifiers to exercise.

    Skips the OpenNE-based embedders on Python 2 and 32-bit platforms,
    where those libraries are unavailable.

    FIX: the final CLEMS-based ``EmbeddingClassifier`` was constructed but
    its result discarded (missing ``yield``), so it was never produced by
    this generator; it is now yielded like the others.
    """
    graph_builder = LabelCooccurrenceGraphBuilder(weighted=True,
                                                  include_self_edges=False)
    # Per-embedding constructor arguments.
    param_dicts = {
        'GraphFactorization': dict(epoch=1),
        'GraRep': dict(Kstep=2),
        'HOPE': dict(),
        'LaplacianEigenmaps': dict(),
        'LINE': dict(epoch=1, order=1),
        'LLE': dict(),
    }
    if not (sys.version_info[0] == 2
            or platform.architecture()[0] == '32bit'):
        for embedding in OpenNetworkEmbedder._EMBEDDINGS:
            # LLE needs a smaller target dimension than the others.
            dimension = 3 if embedding == 'LLE' else 4
            yield EmbeddingClassifier(
                OpenNetworkEmbedder(copy(graph_builder), embedding,
                                    dimension, 'add', True,
                                    param_dicts[embedding]),
                LinearRegression(), MLkNN(k=2))
    yield EmbeddingClassifier(
        SKLearnEmbedder(SpectralEmbedding(n_components=2)),
        LinearRegression(), MLkNN(k=2))
    # BUG FIX: previously this expression's value was silently dropped.
    yield EmbeddingClassifier(CLEMS(metrics.accuracy_score, True),
                              LinearRegression(), MLkNN(k=2), True)
def adapted(data):
    """Train an ML-kNN (k=20) classifier and return its accuracy.

    NOTE(review): relies on module-level X_train/y_train/X_test/y_test;
    the ``data`` parameter is unused but kept for interface compatibility.

    FIX: the original computed ``accuracyScore`` and then returned None,
    discarding the result; the score is now returned so callers can use
    it (backward compatible for callers that ignored the old None).
    """
    classifier = MLkNN(k=20)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    accuracyScore = accuracy_score(y_test, predictions)
    return accuracyScore
def mlknn(train_data_inx, y_train, test_data_inx):
    """Fit ML-kNN on the TF-IDF rows selected by ``train_data_inx`` and
    return a dense prediction array for the rows in ``test_data_inx``.

    Uses the module-level ``corpus_tfidf`` matrix and ``mlknn_k``.
    """
    classifier = MLkNN(k=mlknn_k)
    # Gather the pre-computed TF-IDF vectors for each split.
    x_train = [corpus_tfidf[i] for i in train_data_inx]
    x_test = [corpus_tfidf[j] for j in test_data_inx]
    classifier.fit(csr_matrix(x_train), csr_matrix(y_train))
    mlknn_pre = classifier.predict(csr_matrix(x_test))
    return mlknn_pre.toarray()
def MLKNN_method(X_train, y_train, ml_k, ml_s):
    """Adapted algorithm --> ML-kNN method.

    :param X_train: input feature data
    :param y_train: corresponding label data
    :param ml_k: neighbour count (coerced to int)
    :param ml_s: smoothing parameter (coerced to float)
    :return: the fitted classifier, or None when anything fails
    """
    try:
        model = MLkNN(k=int(ml_k), s=float(ml_s))
        model.fit(X_train, y_train)
    except Exception as e:
        # Best-effort: report the failure and signal it with None.
        print("warning----改编算法KNN|MLKNN----" + str(e))
        return None
    return model
def fit(self, X, y):
    """Fit classifier to multi-label data

    Parameters
    ----------
    X : numpy.ndarray or scipy.sparse
        input features, can be a dense or sparse matrix of size
        :code:`(n_samples, n_features)`
    y : numpy.ndarray or scipy.sparse
        {0,1} binary indicator matrix with label assignments, shape
        :code:`(n_samples, n_labels)`

    Returns
    -------
    fitted instance of self
    """
    n_labels = y.shape[1]
    self._label_count = n_labels
    # One partition model per labelset of size self.labelset_size.
    self.model_count_ = int(np.ceil(n_labels / self.labelset_size))
    clusterer = RandomLabelSpaceClusterer(
        cluster_size=self.labelset_size,
        cluster_count=self.model_count_,
        allow_overlap=False,
    )
    self.classifier_ = LabelSpacePartitioningClassifier(
        classifier=MLkNN(),
        clusterer=clusterer,
        require_dense=[False, False],
    )
    return self.classifier_.fit(X, y)
def run_test1(normas):
    """Benchmark three vectorisation pipelines against four classifiers
    (decision tree, random forest, ML-kNN, MLP) over ``normas``.

    FIX: ``mlp__hidden_layer_sizes`` listed ``(150)`` — parenthesised int,
    not a tuple; sklearn tolerates a bare int, but the intended 1-tuple is
    ``(150,)``, now spelled correctly and consistently with the other sizes.
    """
    models = [
        [('cv', CountVectorizer(min_df=20, max_df=0.5))],
        [('tfidf', TfidfVectorizer(min_df=20, max_df=0.5))],
        [('tokenize', Tokenizador()),
         ('d2v', D2VTransformer(dm=0, min_count=100, size=200, workers=6))],
    ]
    clfs = [
        {
            'clf': ('dt', DecisionTreeClassifier()),
            'params': {
                'dt__min_samples_split': [0.005, 0.010, 2],
                'dt__max_depth': [16, 32, None]
            }
        },
        {
            'clf': ('rf', RandomForestClassifier()),
            'params': {
                'rf__n_estimators': [100, 110, 120],
                'rf__min_samples_split': [0.005, 0.010, 2],
                'rf__min_samples_leaf': [5, 3, 1]
            }
        },
        {
            'clf': ('mlknn', MLkNN()),
            'params': {
                'mlknn__k': [6, 8, 10, 12],
                'mlknn__s': [0.5, 1.0, 1.5, 2.0]
            }
        },
        {
            'clf': ('mlp', MLPClassifier()),
            'params': {
                'mlp__hidden_layer_sizes': [(150,), (100, 100), (50, 50, 50)],
                'mlp__activation': ['tanh', 'relu'],
                'mlp__solver': ['sgd', 'adam']
            }
        },
    ]
    run(normas, models, clfs)
def mlknn(self, number):
    """Train ML-kNN with k=``number`` on the stored training split and
    print hamming loss, micro-F1, and micro-precision on the test split.

    FIX: the hamming-loss label was misspelled ("hanming_loss").
    """
    classifier = MLkNN(k=number)
    classifier.fit(self.X_train, self.y_train)
    # predict
    predictions = classifier.predict(self.X_test)
    result = hamming_loss(self.y_test, predictions)
    print("hamming_loss,", result)
    result = f1_score(self.y_test, predictions, average='micro')
    print("micro -f1: ", result)
    result = precision_score(self.y_test, predictions, average='micro')
    print(result)
def MLKnn_GridSearch(X_train, X_test, y_train, y_test):
    """Grid-search ML-kNN's ``k`` and ``s`` on the training split using
    macro-F1, and print the best parameters and score.

    BUG FIX: the original called ``clf.fit(X, y)`` with names ``X`` and
    ``y`` that are undefined in this scope (NameError at runtime); it now
    fits on the training split that is actually passed in.
    """
    parameters = {'k': range(1, 12), 's': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 1.0]}
    score = 'f1_macro'
    clf = GridSearchCV(MLkNN(), parameters, scoring=score)
    clf.fit(X_train, y_train)
    print(clf.best_params_, clf.best_score_)
def __init__(self, metadata):
    """
    Args:
        metadata: an AutoDLMetadata object. Its definition can be found in
            AutoDL_ingestion_program/dataset.py
    """
    self.done_training = False
    self.step = 0
    self.lgb_round = 80
    self.metadata = metadata
    self.output_dim = self.metadata.get_output_size()
    # Mean-imputation for missing feature values.
    self.imputer = Imputer(missing_values='NaN', strategy='mean',
                           axis=0, verbose=0, copy=True)
    self.model = MLkNN(k=20)
def RecommendByMLKNN(train_data, train_data_y, test_data, test_data_y,
                     recommendNum=5):
    """ML-kNN recommendation: fit with k equal to the label count and
    return the top-``recommendNum`` recommendations plus the ground truth.
    """
    n_labels = train_data_y.shape[1]
    classifier = MLkNN(k=n_labels)
    classifier.fit(train_data, train_data_y)
    # Convert the sparse probability predictions to a dense data array.
    predictions = numpy.asarray(
        classifier.predict_proba(test_data).todense())
    recommendList = DataProcessUtils.getListFromProbable(
        predictions, range(1, n_labels + 1), recommendNum)
    answerList = test_data_y
    print(predictions)
    print(test_data_y)
    print(recommendList)
    print(answerList)
    return [recommendList, answerList]
def train(self):
    """Fit a fresh ML-kNN (k=10) on dense copies of the stored data and
    return accuracy and micro-F1 on the held-out test set."""
    model = MLkNN(k=10)
    # Densify via lil_matrix — presumably to avoid sparse-format issues
    # in the classifier; confirm whether toarray() alone would suffice.
    dense_x = lil_matrix(self.x_data).toarray()
    dense_y = lil_matrix(self.y_data).toarray()
    dense_x_test = lil_matrix(self.x_test).toarray()
    model.fit(dense_x, dense_y)
    predicted = model.predict(dense_x_test)
    return {
        'accuracy': accuracy_score(self.y_test, predicted),
        'f1_score': f1_score(self.y_test, predicted, average='micro'),
    }
def create_model(file_path=FINAL_MLKNN_MODEL_FILE_PATH):
    """
    Creates and trains a MLkNN classifier using the optimized parameters found
    Saves this trained model to disk
    :param string file_path: specifies where the model should be saved
    :return: a trained sklearn MLkNN classifier

    FIX: the original passed an unclosed ``open(file_path, 'wb')`` handle
    straight to ``pickle.dump`` — a file-handle leak; the output file is
    now closed deterministically with a ``with`` block.
    """
    with open(OPTIMIZED_MODEL_PARAMETERS_FILE_PATH) as file:
        hyperparameters = json.load(file)['hyperparameters']
    question_data, music_data = preprocessing.load_data()
    question_data, music_data = preprocessing.preprocess_data(
        question_data, music_data)
    clf = MLkNN(k=hyperparameters['k'], s=hyperparameters['s'])
    clf.fit(question_data.values, music_data.values)
    with open(file_path, 'wb') as model_file:
        pickle.dump(clf, model_file)
    return clf
def __init__(self, random_state=84, n_estimators=20, params=None,
             niterations=10):
    """Initialise the ML-kNN search wrapper.

    :param random_state: kept for interface compatibility (not used here)
    :param n_estimators: kept for interface compatibility (not used here)
    :param params: hyper-parameter grid; defaults to the standard
        k/s grid when None
    :param niterations: number of search iterations

    FIX: the original used a mutable dict literal as the default for
    ``params``, which is shared across every call/instance; replaced with
    the None-sentinel idiom (backward compatible for all callers).
    """
    if params is None:
        params = {
            'k': range(5, 27, 2),
            's': [0.5, 0.7, 1.0]
        }
    self.model = MLkNN()
    self.params = params
    self.niterations = niterations
def __init__(self, k=5, classifier=None, lambd=0.3, delta=.5,
             threshold=0.70):
    """Initialise the wrapper's hyper-parameters.

    :param k: neighbour count
    :param classifier: base multi-label classifier; a fresh ``MLkNN()``
        is created when None (the previous default)
    :param lambd: lambda weighting parameter
    :param delta: delta parameter
    :param threshold: decision threshold

    FIX: the original default ``classifier=MLkNN()`` was evaluated once
    at function-definition time, so every instance constructed with the
    default shared one classifier object (mutable-default pitfall); each
    instance now gets its own fresh MLkNN.
    """
    self.k = k
    self.classifier = MLkNN() if classifier is None else classifier
    self.lambd = lambd
    self.delta = delta
    self.threshold = threshold
def adapt(X_train, y_train, X_test, y_test):
    """Fit ML-kNN (k=4) on sparse labels and print test accuracy."""
    from skmultilearn.adapt import MLkNN
    from sklearn.metrics import accuracy_score

    # Convert the pandas label frames to sparse COO matrices.
    # NOTE(review): DataFrame.to_sparse() was removed in pandas 1.0 —
    # this only runs on older pandas; confirm the pinned version.
    y_train = y_train.to_sparse().to_coo()
    y_test = y_test.to_sparse().to_coo()

    model = MLkNN(k=4)
    print("Train Adapted algorithm")
    model.fit(X_train, y_train)
    print("Predict")
    predictions = model.predict(X_test)
    print("Accuracy")
    print(y_test.shape, predictions.shape)
    print(accuracy_score(y_test.toarray(), predictions))
def mlknn(traindata, trainlabel, ttype):
    """Grid-search ML-kNN hyper-parameters, log the best result, refit on
    the full training data, and persist the model under ./model/."""
    # Hyper-parameter grid for the search.
    parameters = {'k': range(2, 5), 's': np.arange(0.1, 0.5, 0.2)}
    score = 'accuracy'
    search_result = search_bestparmaters(MLkNN(), parameters, score,
                                         traindata, trainlabel)
    best_k = search_result.best_params_['k']
    best_s = search_result.best_params_['s']
    save_score('score/record',
               ('mlknn', ttype, best_k, best_s, search_result.best_score_))
    # Refit with the best parameters and persist to disk.
    clf = MLkNN(best_k, best_s)
    clf.fit(traindata, trainlabel)
    joblib.dump(clf, './model/mlknn' + "_model" + ttype + ".m")
class MLkNN():
    """Sliding-window wrapper that refits a batch ML-kNN model every
    ``window_size`` instances supplied through :meth:`partial_fit`.

    NOTE(review): this class shadows the imported skmultilearn ``MLkNN``,
    so ``MLkNN(k=20)`` inside ``__init__`` resolves back to this class at
    runtime and will fail — consider renaming the wrapper or aliasing the
    library import.

    BUG FIX: ``partial_fit`` contained ``if self.window=None:`` —
    assignment inside the condition, a SyntaxError — now ``is None``.
    """

    def __init__(self, window_size=100):
        self.h = MLkNN(k=20)  # underlying batch learner (see NOTE above)
        self.window_size = window_size
        self.window = InstanceWindow(window_size)
        self.number_element = 0  # instances buffered since last fit
        self.flag = False  # True once h has been fitted at least once
        self.L = None  # number of labels, set on first partial_fit

    def partial_fit(self, X, y):
        """Buffer the instances in ``X``/``y``; refit ``self.h`` each time
        the window fills, then start counting again."""
        N, L = y.shape
        self.L = L
        for i in range(N):
            # FIX: was ``if self.window=None:`` (SyntaxError).
            if self.window is None:
                self.window = InstanceWindow(self.window_size)
            self.window.add_element(np.asarray([X[i]]),
                                    np.asarray([[y[i]]]))
            self.number_element += 1
            if self.number_element == self.window_size:
                X_batch = self.window.get_attributes_matrix()
                y_batch = self.window.get_targets_matrix()
                self.h.fit(X_batch, y_batch)
                self.number_element = 0
                self.flag = True
def get_cado_predictions():
    """Train ML-kNN on the CADO train split and return dense binary
    predictions and probability scores for the test split.

    Returns
    -------
    tuple of numpy arrays
        ``(y_pred, y_score)`` — binary predictions and probabilities,
        one row per test example, 12 label columns.
    """
    data_path = '../../datasets/cado/train.csv'
    test_path = '../../datasets/cado/test.csv'
    data = du.load_data(data_path)
    test = du.load_data(test_path)
    # Row layout: text at column 6, the 12 label columns start at 7.
    text_index = 6
    label_start_index = 7
    X = [d[text_index] for d in data]
    labels = [d[label_start_index:label_start_index + 12] for d in data]
    X_test = [d[text_index] for d in test]
    labels_test = [d[label_start_index:label_start_index + 12] for d in test]
    Y = np.array(labels, dtype='int')
    y_test = np.array(labels_test, dtype='int')
    #Y = np.array(binary_labels, dtype='int')
    # Concatenate train + test so tokenisation and padding share one
    # vocabulary; remember where the test rows begin.
    test_index = len(X)
    X = X + X_test
    Y = np.vstack([Y, y_test])
    tokenizer = tokenize_data(X)
    word_index = tokenizer.word_index
    sequences = tokenizer.texts_to_sequences(X)
    # Fixed-length sequences of 700 tokens, zero-padded/truncated at the end.
    X = pad_sequences(sequences, maxlen=700, padding="post",
                      truncating="post", value=0)
    num_words = min(MAX_NB_WORDS, len(word_index) + 1)
    # NOTE(review): this matrix is set to 1 for in-vocabulary words and is
    # never used afterwards in this function — looks like dead code; confirm.
    embedding_matrix = np.zeros((num_words, 1))
    for word, i in word_index.items():
        if i >= MAX_NB_WORDS:
            continue
        embedding_matrix[i] = 1
    # Split back into the original train/test partitions.
    X_train = X[0:test_index, :]
    Y_train = Y[0:test_index, :]
    x_test = X[test_index:len(X), :]
    y_test = Y[test_index:len(Y), :]
    classifier = MLkNN()
    classifier.fit(X_train, Y_train)
    predictions = classifier.predict(x_test)
    scores = classifier.predict_proba(x_test)
    # Densify the sparse outputs before returning.
    y_pred = predictions.toarray()
    y_score = scores.toarray()
    return y_pred, y_score
def get_classifiers():
    """Return the benchmark classifiers and their matching display names."""
    clfs = [
        BinaryRelevance(GaussianNB()),
        ClassifierChain(GaussianNB()),
        LabelPowerset(GaussianNB()),
        DecisionTreeClassifier(random_state=0),
        MLkNN(k=20),
        RandomForestClassifier(max_depth=2, random_state=0),
    ]
    names = [
        'binary_relevance',
        'classifier_chain',
        'label_powerset',
        'decision_tree',
        'knn',
        'random_forest',
    ]
    return clfs, names
def test_mlknn(df, truth, eval_type):
    """10-fold cross-validate an ML-kNN classifier (k=3, s=0.5) with the
    given scoring metric and return its name and scores.

    FIX: ``KFold(n_splits=10, random_state=26)`` without ``shuffle=True``
    raises ValueError on scikit-learn >= 0.24 (random_state is only
    meaningful when shuffling); shuffle is now enabled so the seed applies.
    """
    # Grid kept for the (currently disabled) grid-search below.
    parameters = {'k': range(1, 4), 's': [0.5, 0.7, 1.0]}
    kfold = KFold(n_splits=10, shuffle=True, random_state=26)
    # print("Start gridsearch")
    # clf = GridSearchCV(MLkNN(), parameters, scoring=eval_type, cv=kfold)
    # clf.fit(df, truth)
    # print(f"Gridsearch completed. Best params: {clf.best_params_}")
    best_classifier = MLkNN(k=3, s=0.5)
    print("Start Crossval")
    scores = cross_val_score(best_classifier, df.values, truth,
                             cv=kfold, scoring=eval_type)
    return ["MLkNN"], [scores]
def __init__(self, model_name="MLKNNbaseline"):
    """Construct the requested baseline multi-label model.

    :param model_name: one of "MLKNNbaseline", "BRkNNbaseline",
        "BRSVCbaseline"
    :raises ValueError: for any other model name

    FIX: the original ``else`` branch re-tested membership in a set of
    the same three names — a condition that is always true once the
    elif chain has failed — so the guard was redundant; the error is now
    raised unconditionally (identical observable behavior).
    """
    if model_name == "MLKNNbaseline":
        self.model = MLkNN()
    elif model_name == "BRkNNbaseline":
        self.model = BRkNNaClassifier()
    elif model_name == "BRSVCbaseline":
        self.model = BinaryRelevance(classifier=SVC(),
                                     require_dense=[False, True])
    else:
        raise ValueError(
            "Specify MLKNNbaseline, BRkNNbaseline, or BRSVCbaseline model name"
        )
    self.model_name = model_name
def MLkNN(self):
    """Grid-search skmultilearn's MLkNN over k and s on the training
    split, then evaluate the best estimator on the test split."""
    print("")
    print("Starting MLkNN Classifier of skmultilearn.adapt...")
    print("")
    start = datetime.now()
    param_grid = {'k': range(1, 3), 's': [0.5, 0.7, 1.0]}
    search = GridSearchCV(MLkNN(), param_grid, scoring='f1_macro',
                          verbose=2, n_jobs=-1)
    search.fit(self.x_train, self.y_train)
    best_model = search.best_estimator_
    predicted = best_model.predict(self.x_test)
    return self.multilabel_evaluation(predicted, self.y_test)
def getClassifier(self):
    """Build and return the classifier selected by ``self.classifierType``.

    :raises ValueError: when the classifier type is not recognised

    FIX: for an unrecognised type the original fell through to
    ``return classifier`` with the name unbound, raising a confusing
    UnboundLocalError; it now fails fast with a clear ValueError.
    """
    kind = self.classifierType.lower()
    if kind == 'rakelo':
        classifier = RakelO(
            base_classifier=LabelPowerset(GaussianNB()),
            #base_classifier_require_dense=[True, True],
            model_count=10,
            labelset_size=2  #len(labelTypes) // 4
        )
    elif kind == 'mlknn':
        classifier = MLkNN(k=3)
    # elif kind == 'mltsvm':
    #     classifier = MLTSVM(c_k = 2**-1)
    elif kind == 'mlaram':
        classifier = MLARAM()
    elif kind == 'labelpowerset':
        classifier = LabelPowerset(
            classifier=RandomForestClassifier(n_estimators=100),
            require_dense=[False, True])
    else:
        raise ValueError(
            "Unknown classifierType: {!r}".format(self.classifierType))
    return classifier
def MLkNN(self):
    """Run the ML-kNN experiment from CLI arguments: vectorise, load the
    data, fit either the skmultilearn implementation or the local one,
    and report hamming loss plus a per-class classification report.

    FIX: converted Python 2 ``print`` statements to Python 3 ``print()``
    calls so the method is valid in the (otherwise Python 3) code base;
    output is unchanged.
    """
    self.sub_parser.add_argument('--library', action='store_true',
                                 default=False)
    args = self.sub_parser.parse_args(sys.argv[2:])
    print('Running ML-kNN, arguments=%s' % args)
    print('Loading %s data...' % args.N)
    # Pick the feature vectoriser requested on the command line.
    if args.f == 'My_dict':
        vectorizer = my_dict_vectorizer(stop=not args.nostop,
                                        bigram=args.bigram)
    elif args.f == 'LIB_count':
        vectorizer = lib_count_vectorizer(stop=not args.nostop,
                                          bigram=args.bigram)
    elif args.f == 'LIB_hash':
        vectorizer = lib_hash_vectorizer(stop=not args.nostop,
                                         bigram=args.bigram)
    elif args.f == 'LIB_tfidf':
        vectorizer = lib_tfidf_vectorizer(stop=not args.nostop,
                                          bigram=args.bigram)
    data = load_data(args.N, args.D, args.Nt, vectorizer)
    print('Done loading data, actual feature size:', data[1].shape)
    X, Y, Xt, Yt, cats = data
    if args.library:
        from skmultilearn.adapt import MLkNN
        model = MLkNN()
    else:
        from sklearn.neighbors import NearestNeighbors
        from multi import MLkNN
        model = MLkNN(NearestNeighbors)
    model.fit(X, Y)
    Yp = model.predict(Xt)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        hl = computeMetrics(Yp, Yt, cats)
    print('the hamming loss:')
    print('>> ', hl)
    from sklearn.metrics import (hamming_loss, classification_report)
    print('hamming loss(library):', hamming_loss(Yt, Yp))
    print(classification_report(Yt, Yp, target_names=cats))
    print('DONE..')
def ML_model_predict(train_x, train_y, test_x, model_name):
    """Train the multi-label model selected by ``model_name`` and return
    its predictions for ``test_x``."""
    print(f"--------train {model_name} model----------")
    # Lazy factories so only the selected model is ever instantiated;
    # an unknown name leaves classifier as None (same as the original).
    factories = {
        "MLARAM": lambda: MLARAM(threshold=0.2),
        "MLkNN": lambda: MLkNN(),
        "BRkNNa": lambda: BRkNNaClassifier(),
        "BRkNNb": lambda: BRkNNbClassifier(),
        "RF": lambda: RandomForestClassifier(n_estimators=1000,
                                             random_state=0, n_jobs=-1),
        "MLTSVM": lambda: MLTSVM(c_k=2**-1),
    }
    factory = factories.get(model_name)
    classifier = factory() if factory is not None else None
    classifier.fit(train_x, train_y)
    prediction = classifier.predict(test_x)
    return prediction
def multiLabelKnn(): classifier_new = MLkNN(k=10) # Note that this classifier can throw up errors when handling sparse matrices. x_train = lil_matrix(train_x).toarray() y_train = lil_matrix(train_y).toarray() x_test = lil_matrix(test_x).toarray() filename = 'model.sav' start = time.time() # train # classifier_new.fit(x_train, y_train) # save # pickle.dump(classifier_new, open(filename, 'wb')) # load the model from disk loaded_model = pickle.load(open(filename, 'rb')) # result = loaded_model.score(X_test, Y_test) print('training time taken: ', round(time.time() - start, 0), 'seconds') # predict predictions_new = loaded_model.predict(x_test)
def mlknn(x_tr, y_tr, x_te, x_va=None):
    """Fit ML-kNN (k=10) and return dense prediction arrays.

    :param x_tr: training features
    :param y_tr: training labels (coerced to int32)
    :param x_te: test features
    :param x_va: optional validation features
    :return: test predictions, or (test, validation) predictions when
        ``x_va`` is given

    FIX: the original passed ``s=True`` for the smoothing parameter;
    ``s`` is a float smoother and True only worked by accident because
    ``True == 1`` — now an explicit ``1.0``. Also hoisted the duplicated
    ``predict(x_te)`` call out of both branches.
    """
    model = MLkNN(k=10, s=1.0)
    model.fit(x_tr, np.int32(y_tr))
    y_te_ = sparse.dok_matrix.toarray(model.predict(x_te))
    if x_va is None:
        return y_te_
    y_va_ = sparse.dok_matrix.toarray(model.predict(x_va))
    return y_te_, y_va_
# note that this unpickling is only for the most previously pickled (k=5 right now) # pickle_file = open('MLkNN_milestone.pkl', 'rb') # clf = pickle.load(pickle_file) # 30 is currently the best tested k amount. l = [30, 40, 50, 100, 200, 280] # l = [200] # l = [likely_k] # l = [70, 80, 90, 100, 500, 1000, 2000, 3000, 4000, 5600] best_clf = None lowest_hl = float('inf') best_k = float('inf') for k in l: print(25*'=') print('k = ' + str(k)) clf = MLkNN(k) # train clf.fit(x_train, y_train) # predict predictions = clf.predict(x_dev) predictions = predictions.todense() print('all match:', np.sum(np.all(predictions == y_dev, axis=1)) / len(y_dev)) print('at least one match:', (np.sum(np.all(predictions - y_dev <= 0, axis=1))-np.sum(np.all(predictions== 0, axis=1))) / len(y_dev)) print('binary :', np.mean(predictions == y_dev)) hl = hamming_loss(y_dev, predictions) print('Hamming Loss:', hamming_loss(y_dev, predictions)) if hl < lowest_hl: lowest_hl = hl