class LP():
    '''
    Label Powerset Method

    Thin wrapper that delegates fitting and prediction to a
    ``LabelPowerset`` transformer built around a base classifier.
    '''

    # Wrapped LabelPowerset instance; set per instance in __init__.
    h = None

    def __init__(self, h=None):
        '''
        Build the wrapper.

        h : base classifier handed to ``LabelPowerset``. When omitted, a
            fresh ``LogisticRegression()`` is created for this instance.
        '''
        # A default written as ``h=LogisticRegression()`` is evaluated once,
        # when the class is defined, so every LP() built without arguments
        # would wrap (and refit) the very same estimator object.
        if h is None:
            h = LogisticRegression()
        self.h = LabelPowerset(h)

    def fit(self, X, Y):
        '''
        Train the model on training data X,Y and return whatever the
        wrapped ``LabelPowerset.fit`` returns.
        '''
        return self.h.fit(X, Y)

    def predict(self, X):
        '''
        Return predictions Y, given X
        '''
        return self.h.predict(X)

    def predict_proba(self, X):
        '''
        Return matrix P, where P[i,j] = P(Y[i,j] = 1 | X[i])
        (where i-th row/example, and j-th label)
        '''
        return self.h.predict_proba(X)
class MyLabelPowerSetFeatureSelect():
    """Chi-squared feature selection driven by label-powerset classes.

    ``fit`` trains a ``LabelPowerset(GaussianNB())`` model on the full
    feature matrix, uses its ``transform`` to turn ``y`` into the target
    for ``SelectKBest(chi2, k=2)``, and remembers which columns were kept.
    """

    def fit(self, X, y):
        """Fit the powerset model and the selector; return ``self``."""
        powerset = LabelPowerset(GaussianNB())
        self.LabelPowerSetObject = powerset
        powerset.fit(X, y)

        selector = SelectKBest(chi2, k=2)
        self.X_new = selector
        # The selector is scored against the transformed labels, not raw y.
        self.X_transformed = selector.fit_transform(X, powerset.transform(y))
        # Column indices of the attributes the selector retained.
        self.selected_attributes_indices = selector.get_support(indices=True)
        return self

    def transform(self, X):
        """Return only the columns of ``X`` selected during ``fit``."""
        kept = self.selected_attributes_indices
        return X[:, kept]

    def predict(self, X):
        """Predict with the powerset model fitted in ``fit``."""
        model = self.LabelPowerSetObject
        return model.predict(X)

    def predict_proba(self, X):
        """Return the fitted powerset model's ``predict_proba`` output."""
        model = self.LabelPowerSetObject
        return model.predict_proba(X)
def labelSet(self):
    """Fit a label-powerset GaussianNB on the training split and print
    its accuracy on the test split."""
    model = LabelPowerset(GaussianNB())
    model.fit(self.X_train, self.y_train)
    print(accuracy_score(self.y_test, model.predict(self.X_test)))
def powerset(self):
    """Train a label-powerset logistic regression and score it.

    Fits on ``self.x_data`` / ``self.y_data``, predicts ``self.x_test`` and
    returns a dict with keys ``'accuracy'`` and ``'f1_score'`` (micro
    averaged), both computed against ``self.y_test``.
    """
    model = LabelPowerset(LogisticRegression())
    model.fit(self.x_data, self.y_data)
    predicted = model.predict(self.x_test)

    scores = {}
    scores['accuracy'] = accuracy_score(self.y_test, predicted)
    scores['f1_score'] = f1_score(self.y_test, predicted, average='micro')
    return scores
def classifiers(X_train, Y_train, X_test):
    """Fit three GaussianNB-based multi-label models and predict ``X_test``.

    Returns a 3-tuple of predictions in the order BinaryRelevance,
    ClassifierChain, LabelPowerset.
    """
    strategies = (BinaryRelevance, ClassifierChain, LabelPowerset)
    models = [strategy(GaussianNB()) for strategy in strategies]

    # Train every model first, then predict, as separate passes.
    for model in models:
        model.fit(X_train, Y_train)

    return tuple(model.predict(X_test) for model in models)
def evaluate_verse(embedding, labels, number_shuffles=10, train_perc=0.1):
    """Score an embedding with repeated stratified shuffle splits.

    For each of ``number_shuffles`` splits (test share ``1 - train_perc``)
    a label-powerset logistic regression is trained on the training rows
    and evaluated on the held-out rows.

    Returns a tuple ``(micro, macro)`` of per-split F1 score lists.
    """
    from skmultilearn.problem_transform import LabelPowerset

    splitter = StratifiedShuffleSplit(n_splits=number_shuffles,
                                      test_size=1 - train_perc)
    micro, macro = [], []
    for train_rows, test_rows in splitter.split(embedding, labels):
        model = LabelPowerset(LogisticRegression())
        model.fit(embedding[train_rows], labels[train_rows])

        truth = labels[test_rows]
        predicted = model.predict(embedding[test_rows])
        for scores, averaging in ((micro, 'micro'), (macro, 'macro')):
            scores.append(f1_score(truth, predicted, average=averaging))
    return (micro, macro)
def runSet(model, x, y):
    """Cross-validate ``LabelPowerset(model)`` with K-fold splitting.

    The number of folds comes from the module-level ``splitNo``.

    Returns ``(accuracy, mse)``: the mean accuracy and the mean of the
    mean squared errors over all folds.
    """
    fold_accuracy = []
    fold_mse = []
    for train_rows, test_rows in KFold(n_splits=splitNo).split(x):
        wrapped = LabelPowerset(model)
        wrapped.fit(x[train_rows], y[train_rows])
        predicted = wrapped.predict(x[test_rows])

        fold_accuracy.append(accuracy_score(y[test_rows], predicted))
        # mean_squared_error is given the dense form of the prediction.
        fold_mse.append(mean_squared_error(y[test_rows], predicted.toarray()))

    return np.mean(np.array(fold_accuracy)), np.mean(np.array(fold_mse))
def get_train_test_lda(topic):
    """Build train/test features augmented with LDA-derived topic labels.

    Images returned by ``load()`` are scaled by 1/255 and passed through a
    headless, average-pooled VGG16. For every topic count in ``topic`` an
    LDA model is fitted on ``y_train``; its discretised document-topic
    matrix is appended to the training features, and a label-powerset
    random forest (trained on the features as they were *before* this
    iteration's append) predicts the matching columns for the test set.

    Returns ``(X_train, y_train, X_test, y_test)`` as arrays, with both
    feature matrices cut down to their last 28 columns.
    NOTE(review): 28 presumably equals the total width of the appended
    topic columns -- confirm against the caller.
    """
    extractor = VGG16(include_top=False, pooling='avg')
    raw_train, y_train, raw_test, y_test = load()

    raw_train = raw_train.astype('float32') / 255
    raw_test = raw_test.astype('float32') / 255
    y_train = y_train.astype('int64')
    y_test = y_test.astype('float32')

    X_train = extractor.predict(raw_train)
    print(X_train.shape)
    X_test = extractor.predict(raw_test)

    for n_topics in topic:
        # Keep the pre-append features: the classifier is trained on these.
        features_before = X_train

        topic_model = lda.LDA(n_topics=n_topics, n_iter=1000)
        topic_model.fit(y_train)
        topic_labels = discretization_doc_topic(topic_model.doc_topic_)
        X_train = np.hstack((X_train, topic_labels))

        # Multi-label learning supplies the topic columns for the test set.
        classifier = LabelPowerset(RandomForestClassifier())
        classifier.fit(features_before, topic_labels)
        predicted = sp.csr_matrix(classifier.predict(X_test))
        X_test = np.hstack((X_test, np.array(predicted.toarray())))

    return (np.array(X_train)[:, -28:], np.array(y_train),
            np.array(X_test)[:, -28:], np.array(y_test))
metrics=['accuracy']) return model def create_model_multiclass(input_dim, output_dim): # create model model = Sequential() model.add(Dense(8, input_dim=input_dim, activation='relu')) model.add(Dense(output_dim, activation='softmax')) # Compile model model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) return model KERAS_PARAMS = dict(epochs=10, batch_size=100, verbose=0) # clf = BinaryRelevance(classifier=Keras(create_model_single_class, False, KERAS_PARAMS), require_dense=[True,True]) # clf.fit(X_train, y_train) # result = clf.predict(X_test) # print(result) clf = LabelPowerset(classifier=Keras(create_model_multiclass, True, KERAS_PARAMS), require_dense=[True, True]) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) accuracy = accuracy_score(y_test, y_pred) print('Accuracy = %0.3f\n' % accuracy) print(type(accuracy))
accuracy_score(y_test, predicted) accuracy_score(y_test, dtree_predictions) from skmultilearn.problem_transform import LabelPowerset from sklearn.naive_bayes import GaussianNB # initialize binary relevance multi-label classifier # with a gaussian naive bayes base classifier classifier = LabelPowerset(LinearSVC()) # train classifier.fit(train_X, train_label) # predict predictions = classifier.predict(valid_X) accuracy_score(valid_label, predictions) from sklearn.metrics import accuracy_score accuracy_score(y_test, predictions) ## from sklearn.multiclass import OneVsRestClassifier from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier, GradientBoostingClassifier) from sklearn.multiclass import OutputCodeClassifier classifier = OutputCodeClassifier(GradientBoostingClassifier(max_depth=5, n_estimators=14), code_size=2,
# train classifier_new.fit(x_train, y_train) # predict predictions_new = classifier_new.predict(x_test) # accuracy print("Accuracy = ",accuracy_score(y_test,predictions_new)) print("\n") # using Label Powerset from skmultilearn.problem_transform import LabelPowerset # initialize label powerset multi-label classifier classifier = LabelPowerset(LogisticRegression()) # train classifier.fit(x_train, y_train) # predict predictions = classifier.predict(x_test) # accuracy print("Accuracy = ",accuracy_score(y_test,predictions)) print("\n") # using classifier chains from skmultilearn.problem_transform import ClassifierChain from sklearn.linear_model import LogisticRegression # initialize classifier chains multi-label classifier classifier = ClassifierChain(LogisticRegression()) # Training logistic regression model on train data classifier.fit(x_train, y_train) # predict predictions = classifier.predict(x_test) # accuracy print("Accuracy = ",accuracy_score(y_test,predictions))
X_copy = X_orig[(p):(p + 1)] #Slice the ith element from the numpy array y_copy = y_orig[(p):(p + 1)] X_model = X_orig y_model = y_orig #Set X and y equal to samples and labels X_model = np.delete( X_model, p, axis=0 ) #Create a new array to train the model with slicing out the ith item for LOOCV y_model = np.delete(y_model, p, axis=0) train_set = np.concatenate((X_model, y_model), axis=1) #combine numpy matrices classifier.fit(X_model, y_model) prediction = classifier.predict(X_copy) #print(prediction.toarray(), y_copy) results = np.append(results, np.array(prediction.toarray()), axis=0) if np.array_equal(y_copy, prediction.toarray()): j = j + 1 #print(y_copy, prediction.toarray()) else: #print(y_copy, prediction.toarray()) pass print(j / 49) att = results[:, 0] esc = results[:, 1:2] ns = results[:, 2:3] tang = results[:, 3:4]
# Multi-class baseline: MultinomialNB trained on the label-powerset encoded
# targets (Y_LP_train / Y_LP_test are prepared earlier in the file).
multiclassifier = MultinomialNB()
multiclassifier.fit(x_train_vect, Y_LP_train)
multiclass_predict = multiclassifier.predict(x_test_vect)
print(Y_LP_test)
print(multiclass_predict)
print('Test accuracy {}'.format(accuracy_score(Y_LP_test, multiclass_predict)))

############################## Label Powerset classifier #######################
from skmultilearn.problem_transform import LabelPowerset
from sklearn.naive_bayes import GaussianNB

LP_classifier = LabelPowerset(GaussianNB())
LP_classifier.fit(x_train_vect, Y_train)
LP_predictions = LP_classifier.predict(x_test_vect)
# Score the predictions computed on the line above; the name was previously
# misspelled as `LP_prediction`, which is not defined in this fragment.
print('Test accuracy {}'.format(accuracy_score(Y_test, LP_predictions)))

################################################################################
############################ one-vs-rest pipeline ##############################
# NOTE: despite its name, NB_pipeline wraps a LinearSVC in OneVsRestClassifier,
# not a naive Bayes model. The variable name is kept for existing callers.
NB_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
])
# In[4]:

# Hold out the rows with Year > 2016 and Month > 8 as the test set; feature
# columns are positions 5..11 of `weather`. Everything else is training data.
X_test = weather[weather['Year'] > 2016]
X_test = X_test[X_test['Month'] > 8].iloc[:, 5:12]
X_train = weather[~weather.index.isin(X_test.index)].iloc[:, 5:12]
y_test = y[y.index.isin(X_test.index)]
y_train = y[~y.index.isin(X_test.index)]

# In[5]:

# model = LabelPowerset(GaussianNB())
model = LabelPowerset(KNeighborsClassifier(n_neighbors=20))
# model = MLkNN(k=18)
model.fit(X_train.values, y_train.values)
# Predict on the same kind of input the model was fitted on (plain arrays);
# mixing ndarray fit with DataFrame predict is inconsistent and can trigger
# feature-name warnings in newer scikit-learn releases.
predictions = model.predict(X_test.values)
result = predictions.toarray()
predicted = pd.DataFrame(result, columns=description)
print(accuracy_score(y_test, predictions))

# In[6]:


def columnName(row):
    """Return the labels of all entries of ``row`` equal to 1.

    Each label is followed by a single space (so a non-empty result ends
    with a trailing space); an empty string is returned when nothing is set.
    """
    return ''.join(label + ' ' for label in row[row == 1].index)
def _iter_sentences(sent_index, *sequences):
    """Yield, sentence by sentence, the leading slice of every sequence.

    ``sent_index`` holds the index of the last token of each sentence; each
    step takes ``index + 1`` items off the front of all sequences.
    """
    remaining = sequences
    for last_token_index in sent_index:
        size = int(last_token_index) + 1
        yield tuple(seq[:size] for seq in remaining)
        remaining = tuple(seq[size:] for seq in remaining)


def training_phase(filename):
    """Train the scope detector from a tab-separated corpus and pickle it.

    Reads ``filename`` (one token per line, tab-separated columns, single
    column lines separating sentences), derives for every cue token:

    * features: the cue's one-hot POS vector repeated once per token of the
      sentence, followed by a 100-wide one-hot of the cue position;
    * target: one 0/1 flag per token of the sentence, set where the scope
      column belonging to the cue is not ``'_'``.

    Both are padded to length 100 with ``pad_sequences``. A
    ``LabelPowerset(RandomForestClassifier(n_estimators=50))`` is fitted on
    60% of the samples, its weighted F1 on the remaining 40% is printed, and
    the fitted classifier is written to ``scope_detector.sav``.

    Returns ``None``.
    """
    # Close the corpus file deterministically instead of leaking the handle.
    with open(filename, 'r') as fp:
        corpus = [line.replace('\n', '').split('\t')
                  for line in fp.readlines()]

    # Cue label (0/1) and lower-cased POS tag per token row. Rows with more
    # than 8 columns carry cue/scope triples starting at column 7.
    cue_ground_truths = []
    postag = []
    for row in corpus:
        if len(row) > 8:
            count = int((len(row) - 7) / 3)
            cue_words = [row[7 + (j * 3)] for j in range(count)]
            cue_ground_truths.append(1 if row[3] in cue_words else 0)
            postag.append(row[5].lower())
        elif len(row) == 8:
            cue_ground_truths.append(0)
            postag.append(row[5].lower())

    # Position of each distinct POS tag, in order of first appearance.
    tag_position = {}
    for tag in postag:
        tag_position.setdefault(tag, len(tag_position))

    # One-hot encode the POS tags; vectors are zero-padded up to 100 entries.
    width = max(100, len(tag_position))
    one_hot_postag = []
    for tag in postag:
        seq = [0.0] * width
        seq[tag_position[tag]] = 1.0
        one_hot_postag.append(seq)

    # A single-column row ends a sentence; column 2 of the row before it is
    # the index of the sentence's last token.
    sent_index = [corpus[i - 1][2]
                  for i, row in enumerate(corpus) if len(row) == 1]
    corpus = [row for row in corpus if len(row) != 1]

    # One feature sequence per cue occurrence.
    cue_postag_features = []
    for _, target_cues, target_pos in _iter_sentences(
            sent_index, corpus, cue_ground_truths, one_hot_postag):
        for j, is_cue in enumerate(target_cues):
            if is_cue == 1:
                features = [target_pos[j]] * len(target_pos)
                features.append([1.0 if l == j else 0.0 for l in range(100)])
                cue_postag_features.append(features)
    cue_postag_features = keras.preprocessing.sequence.pad_sequences(
        cue_postag_features, maxlen=100)
    cue_postag_features = np.array(cue_postag_features)
    print(cue_postag_features.shape)

    # One scope target per cue occurrence, in the same order as the features.
    ground_scope = []
    for target_sentence, target_cues, _ in _iter_sentences(
            sent_index, corpus, cue_ground_truths, one_hot_postag):
        for j, is_cue in enumerate(target_cues):
            if is_cue == 1:
                cue_row = target_sentence[j]
                cue_count = int((len(cue_row) - 7) / 3)
                # The last cue slot that is filled on the cue's own row
                # selects which scope column to read.
                indi = 0
                for k in range(cue_count):
                    if cue_row[7 + (k * 3)] != '_':
                        indi = k
                scope_column = 7 + (indi * 3) + 1
                ground_scope.append(
                    [1.0 if token[scope_column] != '_' else 0.0
                     for token in target_sentence])
    ground_scope = keras.preprocessing.sequence.pad_sequences(ground_scope,
                                                              maxlen=100)
    ground_scope = np.array(ground_scope)
    print(ground_scope.shape)

    # Flatten each (sequence, vector) sample into a single feature row.
    X_train = cue_postag_features
    nsamples, col, vec = X_train.shape
    X_train_2d_bef = X_train.reshape((nsamples, col * vec))
    y_train_2d_bef = np.array(ground_scope, dtype=float)
    X_train_2d, X_validate_2d, y_train, y_validate = train_test_split(
        X_train_2d_bef, y_train_2d_bef, test_size=0.4, random_state=101)

    # The author's earlier inline notes recorded about .586 for this
    # configuration, the best of the LabelPowerset / BinaryRelevance /
    # ClassifierChain variants that were tried.
    classifier = LabelPowerset(RandomForestClassifier(n_estimators=50))
    classifier.fit(X_train_2d, y_train)
    y_predict = classifier.predict(X_validate_2d)
    f1_measure = f1_score(
        y_validate,
        y_predict,
        average="weighted",
    )
    print(f1_measure)

    # Write the model through a context manager so the file is flushed and
    # closed even if pickling fails part-way.
    with open("scope_detector.sav", 'wb') as model_file:
        pickle.dump(classifier, model_file)
# In[35]: classifier = LabelPowerset(GaussianNB()) # In[36]: # train classifier.fit(X_train, y_train) #y_train.head() #y_train.isnull().sum() #cols.isnull().sum() # In[38]: # predict predictions1 = classifier.predict(X_test) #predictions = classifier.predict(X_test) # In[49]: from sklearn.metrics import accuracy_score, classification_report accuracy_score(y_test, predictions1) # In[50]: print(classification_report(y_test, predictions1)) # # Accuracy is too less so, drop few columns at first place where there are more nas i.e., 20% nas # In[7]:
ClassifierChainMultinomialNB_classifier.fit(X_train, Y_train)

# Predictions
predictions = ClassifierChainMultinomialNB_classifier.predict(X_test)

# Accuracy -- score the predictions just computed (the original passed the
# unrelated name `predict`, which is not assigned anywhere in this section).
print("Accuracy : {}".format(accuracy_score(Y_test, predictions)*100))

# Create and save with pickle; the context manager closes the file even if
# pickling raises.
with open("pickled_algos/MultilabelClassifierchainWithMultinomialNB.pickle","wb") as save_mydocuments:
    pickle.dump(ClassifierChainMultinomialNB_classifier, save_mydocuments)

print("Classifier chain with MultinomialNB classifier is done, time--- %s seconds ---" % (time.time() - start_time))

# 6. Label Powerset with MultinomialNB classifier (from scikit-multilearn)

# create and fit classifier
from skmultilearn.problem_transform import LabelPowerset

LabelPowersetMultinomialNB_classifier = LabelPowerset(MultinomialNB())
LabelPowersetMultinomialNB_classifier.fit(X_train, Y_train)

# Predictions
predictions = LabelPowersetMultinomialNB_classifier.predict(X_test)

# Accuracy -- again computed on this classifier's own predictions.
print("Accuracy : {}".format(accuracy_score(Y_test, predictions)*100))

# Create and save with pickle
with open("pickled_algos/MultilabelPowersetWithMultinomialNB.pickle","wb") as save_mydocuments:
    pickle.dump(LabelPowersetMultinomialNB_classifier, save_mydocuments)

print("LabelPowersetMultinomialNB_classifier is done, time--- %s seconds ---" % (time.time() - start_time))

print("Done")
Score_tree = dtree_model.score(features_test, labels_test) #------------------------------------------------------------------------------ # Using naive Bayes from skmultilearn.problem_transform import LabelPowerset from sklearn.naive_bayes import GaussianNB # initialize Label Powerset multi-label classifier # with a gaussian naive bayes base classifier classifier_nb = LabelPowerset(GaussianNB()) # train classifier_nb.fit(features_train, labels_train) # predict predictions_nb = classifier_nb.predict(features_test) score_nb= accuracy_score(labels_test,predictions_nb) #------------------------------------------------------------------------------ #using random forest classifier from sklearn.ensemble import RandomForestClassifier classifier1 = RandomForestClassifier(n_estimators =10, criterion = 'entropy', random_state = 0) classifier1.fit(features_train,labels_train) forest_pred = classifier1.predict(features_test) Score_forest = classifier1.score(features_test, labels_test) #------------------------------------------------------------------------------ # Using base Classifier with single-class SVM from skmultilearn.problem_transform import BinaryRelevance from sklearn.svm import SVC
class LMWrapper(Model):
    """Bag-of-words baseline: TF-IDF features into a label-powerset
    multinomial naive Bayes model.

    Attributes
    ----------
    lm : LabelPowerset wrapping ``MultinomialNB``.
    vect1 : TfidfVectorizer (unigrams, no normalisation).
    selector : ``SelectKBest(k='all')`` feature selector.
    output_dim : number of label columns seen in ``fit`` (0 before fitting).
    """

    def __init__(self, C=1.0, use_idf=False, filename=None, **kwargs):
        """Create the pipeline parts; restore them from ``filename`` if given.

        ``C`` and ``**kwargs`` are accepted but not used by this class.
        """
        self.lm = LabelPowerset(MultinomialNB())
        self.vect1 = TfidfVectorizer(norm=None,
                                     use_idf=use_idf,
                                     min_df=0.0,
                                     ngram_range=(1, 1))
        self.selector = sklearn.feature_selection.SelectKBest(k='all')
        self.output_dim = 0
        if filename is not None:
            self.load(filename)

    def build_representation(self, x, y=None, fit=False):
        """Turn token-id instances into a dense TF-IDF feature matrix.

        Each instance is a sequence of fields, each field a sequence of
        token ids; ids equal to 0 are dropped, the rest are rendered as
        ``w_<id>`` words, and fields are joined with `` \\n ``. When ``fit``
        is true the vectorizer and the selector are fitted first (the
        selector against the per-row argmax of ``y``).
        """
        auxX = [
            ' \n '.join([
                ' '.join(['w_' + str(token) for token in field if token != 0])
                for field in instance
            ]) for instance in x
        ]
        if fit:
            self.vect1.fit(auxX)
        auxX = self.vect1.transform(auxX)
        if fit:
            self.selector.fit(auxX, np.array([np.argmax(i) for i in y]))
        auxX = self.selector.transform(auxX)
        return auxX.todense()

    def fit(self, x, y, validation_data=None):
        """Fit the representation and the model; optionally print the
        accuracy on ``validation_data = (x_val, y_val)``. Returns ``None``."""
        auxY = y
        print('Build representation...')
        auxX = self.build_representation(x, auxY, fit=True)
        print('auxX shape:', auxX.shape)
        print('Fit model...')
        self.lm.fit(auxX, auxY)
        self.output_dim = auxY.shape[1]
        if validation_data is None:
            return None
        res = self.evaluate(validation_data[0], validation_data[1])
        print("Accuracy in validation data =", res)
        return None

    def predict(self, x):
        """Return ``[predictions, [], []]`` for the instances in ``x``.

        NOTE(review): the padding branch applies ``np.pad`` to whatever
        ``self.lm.predict`` returns; it presumably never triggers because
        the prediction already has ``output_dim`` columns -- confirm.
        """
        auxX = self.build_representation(x, fit=False)
        print('Predicting baseline...')
        auxY = self.lm.predict(auxX)
        if auxY.shape[1] < self.output_dim:
            npad = ((0, 0), (0, self.output_dim - auxY.shape[1]))
            auxY = np.pad(auxY,
                          pad_width=npad,
                          mode='constant',
                          constant_values=0)
        return [auxY, [], []]

    def predict_prob(self, x):
        """Return ``[probabilities, [], []]`` for the instances in ``x``,
        zero-padded on the right up to ``output_dim`` columns."""
        auxX = self.build_representation(x, fit=False)
        print('Predicting baseline...')
        auxY = self.lm.predict_proba(auxX)
        if auxY.shape[1] < self.output_dim:
            npad = ((0, 0), (0, self.output_dim - auxY.shape[1]))
            auxY = np.pad(auxY,
                          pad_width=npad,
                          mode='constant',
                          constant_values=0)
        return [auxY, [], []]

    def evaluate(self, x, y):
        """Return the accuracy of the per-row argmax class on ``(x, y)``."""
        auxX = self.build_representation(x, fit=False)
        true_classes = np.array([np.argmax(i) for i in y])
        # The model predicts a label-indicator matrix (possibly sparse).
        # Reduce it to one class index per row, exactly as done for `y`, so
        # accuracy_score receives two comparable 1-D targets instead of a
        # multiclass / multilabel-indicator mix, which it rejects.
        predicted = self.lm.predict(auxX)
        if hasattr(predicted, 'toarray'):
            predicted = predicted.toarray()
        predicted_classes = np.asarray(predicted).argmax(axis=1)
        return sklearn.metrics.accuracy_score(y_true=true_classes,
                                              y_pred=predicted_classes)

    def save(self, filename):
        """Pickle model, vectorizer, selector and output_dim to ``filename``
        (four consecutive pickles, protocol 4)."""
        with open(filename, "wb") as f:
            pickle.dump(self.lm, f, protocol=4)
            pickle.dump(self.vect1, f, protocol=4)
            pickle.dump(self.selector, f, protocol=4)
            pickle.dump(self.output_dim, f, protocol=4)

    def load(self, filename):
        """Restore the four objects written by ``save``, in the same order.

        SECURITY: ``pickle.load`` executes arbitrary code embedded in the
        file -- only load files from a trusted source.
        """
        with open(filename, "rb") as f:
            self.lm = pickle.load(f)
            self.vect1 = pickle.load(f)
            self.selector = pickle.load(f)
            self.output_dim = pickle.load(f)
def labelpowerset(x_train, y_train, x_test, y_test):
    """Fit a label-powerset random forest and print its test accuracy.

    The forest size is taken from the module-level ``estimators``.
    """
    forest = RandomForestClassifier(n_estimators=estimators)
    model = LabelPowerset(forest)
    model.fit(x_train, y_train)
    score = accuracy_score(y_test, model.predict(x_test))
    print("Accuracy = {}".format(score))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) #classifier = BinaryRelevance(GaussianNB()) #classifier = BinaryRelevance(tree.DecisionTreeClassifier()) #classifier = ClassifierChain(tree.DecisionTreeClassifier()) classifier = LabelPowerset(tree.DecisionTreeClassifier()) #classifier = MLkNN(k=5) #classifier = BRkNNaClassifier(k=5) #ptclassifier = LabelPowerset(tree.DecisionTreeClassifier()) #clusterer = IGraphLabelCooccurenceClusterer('fastgreedy', weighted=True, include_self_edges=True) #classifier = LabelSpacePartitioningClassifier(ptclassifier, clusterer) classifier.fit(X_train.as_matrix(), y_train.as_matrix()) predictions = classifier.predict(X_test.as_matrix()) loss = hamming_loss(y_test.as_matrix(), predictions) print 'Hamming loss: ', loss #acc = accuracy_score(y_test.as_matrix(), predictions) #print 'accuracy: ', acc #lrloss = label_ranking_loss(y_test, predictions.toarray()) #lrap = label_ranking_average_precision_score(y_test, predictions.toarray()) #print "LRLOSS: best value 0: ", lrloss #print "LRAP: best value 1: ", lrap #macro_score = f1_score(y_test, predictions.toarray(), average='macro') micro_score = f1_score(y_test, predictions.toarray(), average='micro') weighted_score = f1_score(y_test, predictions.toarray(), average='weighted')
X_train = [X_train[index] for index in range(0,len(X_train)) if X_train[index] != ['"']] X_train = [X_train[index] for index in range(0,len(X_train)) if X_train[index] != ['']] test_data = [test_data[index] for index in range(0,len(test_data)) if test_data[index] != ['"']] test_data = [test_data[index] for index in range(0,len(test_data)) if test_data[index] != ['']] with open(train, 'r') as file: for line in file.readlines(): index = line.index(',')''' # initialize Label Powerset multi-label classifier # with a gaussian naive bayes base classifier classifier = LabelPowerset(GaussianNB()) # train classifier.fit(X, y) # predict predictions = classifier.predict(data) accuracy_score(meta, predictions) '''classifier = MLkNN(k=20) # train classifier.fit(data, meta) # predict predictions = classifier.predict(X) accuracy_score(Y,predictions)'''
# Vectorise each metadata column and append the resulting indicator / TF-IDF
# columns to the `plot` feature frame.
hdrs = ['Release Year', 'Origin/Ethnicity', 'Director', 'Cast', 'Title']
for hd in hdrs:
    if hd == 'Title':
        cv = TfidfVectorizer(analyzer='word', stop_words=stopwords)  # TF-IDF for titles
    else:
        cv = CountVectorizer(analyzer='word', min_df=2)  # determine whether or not
        # a particular director (actor, year, country) belongs to a particular film
    c = cv.fit_transform(x[hd].map(lambda pl: str(pl)))
    p = pandas.DataFrame(c.todense(), index=x.index, columns=cv.get_feature_names())
    plot = pandas.concat([plot, p], axis=1)

# 67/33 split with a fixed seed, then a label-powerset logistic regression.
x_train, x_test, y_train, y_test = train_test_split(plot, y, test_size=0.33, random_state=42)
classif = LabelPowerset(LogisticRegression())
classif.fit(x_train, y_train)
pr = classif.predict(x_test)

print("Accuracy = ", accuracy_score(
    y_test, pr))  # share of films whose genres were predicted exactly
print("Jaccard similarity score =", jaccard_similarity_score(
    y_test, pr))  # similarity measure between the true and predicted genre sets

# precision - share of correct assignments of a given genre
# recall - ability to find the films of a given genre
# f1 - harmonic mean of precision and recall
# support - number of films of each genre in y_test
print(classification_report(y_test, pr, target_names=list(y.head())))
for line in f.readlines(): tag_list.append(line[:-1].decode('utf-8')) #classifier = MLkNN(k=100) #classifier = MLARAM() #classifier = LabelPowerset(classifier = SVC(), require_dense = [False, True]) #classifier = ClassifierChain(GaussianNB()) #classifier = ClassifierChain(SGDClassifier()) classifier = LabelPowerset(tree.DecisionTreeClassifier(),require_dense = [False, False]) #classifier = ClassifierChain(tree.DecisionTreeClassifier()) #classifier = BinaryRelevance(classifier = SVC(), require_dense = [False, True]) print "Start Training" classifier.fit(X_train, Y_train) y_train_pred = classifier.predict(X_train) y_test_pred = classifier.predict(X_test) movie_tag_dict = dict() for i in range(len(train_id)): movie_tag_dict[train_id[i]] = [] for j in range(2015): if y_train_pred[i,j] == 1: movie_tag_dict[train_id[i]].append(tag_list[j]) for i in range(len(test_id)): movie_tag_dict[test_id[i]] = [] for j in range(2015): if y_test_pred[i,j] == 1: movie_tag_dict[test_id[i]].append(tag_list[j])
roc_auc_score(y_test, classifier.predict_proba(x_test).toarray())) # # Label Powerset # * Label Powerset creates a unique class for every possible label combination that is present in the training set, this way it makes use of label correlation # * Only problem with this method is as the no of classes increases its computational complexity also increases. # In[67]: log_classifier = LabelPowerset(LogisticRegression()) # In[68]: log_classifier.fit(x_train, y_train) print('Accuracy_score using LabelPowerset is ', round(accuracy_score(y_test, log_classifier.predict(x_test)) * 100, 1), '%') print('-------------------------------------------------') print('roc_auc_score using LabelPowerset is ', roc_auc_score(y_test, log_classifier.predict_proba(x_test).toarray())) # # ClassifierChain # * This method uses a chain of binary classifiers # * Each new Classifier uses the predictions of all previous classifiers # * This was the correlation b/w labels is taken into account # In[69]: chain = ClassifierChain(LogisticRegression())
#Hamming Loss for Classifier Chaines hamm_loss_cc = hamming_loss(y_test, predictions_cc) print("Hamming Loss:", hamm_loss_cc) print("\n\n\nTraining data with Label Powerset using Gaussian Naive Bayes") #initialize Label Powerset multi-label classifier #with a gaussian naive bayes base classifier classifier_lp = LabelPowerset(GaussianNB()) # train for Label Powerset classifier_lp.fit(X_train, y_train) # predict for Label Powerset predictions_lp = classifier_lp.predict(X_test) #Hamming Loss for Label PowerSet hamm_loss_lp = hamming_loss(y_test, predictions_lp) print("Hamming Loss:", hamm_loss_lp) print("\n\n\nAll hamming loss:") print("Binary Relevance:\n", hamm_loss_binary) print("Classifier Chains:\n", hamm_loss_cc) print("Label Powerset:\n", hamm_loss_lp) objects = ('BinaryRelevance', 'ClassifierChain', 'LabelPowerset') y_pos = np.arange(len(objects)) performance = [hamm_loss_binary, hamm_loss_cc, hamm_loss_lp]
i = 5 while i > 0: i = i - 1 t = int(np.mod(a, 2)) a = int(np.floor(a / 2)) res[i] = t return res test = datasets.make_multilabel_classification() data = pickle.load(open('datasets.pickle', 'rb')) X = data[0] Y = data[1] logs = [] yt = [] for i in range(Y.shape[0]): yt.append(transfer(Y[i, :])) log = LogisticRegression() log.fit(X, yt) p = log.predict(X) res = [] for i in range(len(p)): rt = transfer1(p[i]) res.append(rt) print(accuracy_score(np.matrix(res), Y)) lb = LabelPowerset(LogisticRegression()) lb.fit(X, Y) pred = lb.predict(X) print(accuracy_score(pred, Y))
# The matrices are initially in lil_matrix format # Converting them to compressed row matrix format X_train = X_train.tocsr() y_train = y_train.todense() X_test = X_test.tocsr() y_test = y_test.todense() label_set = set([0, 3, 4, 7, 8, 9, 12, 15, 17, 19, 20, 21]) label_list = [0, 3, 4, 7, 8, 9, 12, 15, 17, 19, 20, 21] y_train = y_train[:, label_list] y_test = y_test[:, label_list] start_time = time.process_time() # classifier = LabelPowerset(RandomForestClassifier(random_state=0, n_estimators=10, n_jobs=-1)) # classifier = RandomForestClassifier(random_state=0, n_estimators=10) # classifier = BinaryRelevance(classifier = LinearSVC(), require_dense = [False, True]) classifier = LabelPowerset(SGDClassifier(penalty='l2', alpha=0.01)) classifier.fit(X_train, y_train) y_predicted = classifier.predict(X_test) total_time = time.process_time() - start_time print("Total time taken is : " + str(total_time)) print("Jaccard Similarity Score is : " + str(jaccard_similarity_score(y_test, y_predicted))) print("Hamming Loss is : " + str(hamming_loss(y_test, y_predicted))) # print("F1_Similarity score is : "+str(f1_score(y_test,y_predicted,average='macro')))
print( '\n Multi-Class OCSVM: Precision = %2.2f, Recall = %2.2f, FalseAlarm = %2.2f' % (precision, recall, falsealarm)) # step 2. perform multi-class learning approaches # using binary relevance # initialize binary relevance multi-label classifier # with a gaussian naive bayes base classifier classifier = LabelPowerset(GaussianNB()) # train classifier.fit(sample_train, label_train[:, :5]) # predict predictions = classifier.predict(sample_test) # approach 1: one-vs-all (review the approach) classif = OneVsRestClassifier(linear_model.LogisticRegression()) classif.fit(sample_train, label_train[:, :4]) # explain output label_score label_score = classif.decision_function(sample_test) # explain decision strategy here idxsort = np.argsort(label_score) label_pred = idxsort[:, -1] + 1 print('\n one-vs-all: %.2f' % accuracy_score(label_test[:, :4], label_pred)) ## approach 2: one-vs-one (review the approach) #classif = OneVsOneClassifier(linear_model.LogisticRegression()) #classif.fit(sample_train, label_train) ## explain similar output and decision strategy