def classifer_chain(self):
    """Train a random-forest classifier chain and print its test metrics.

    Fits on ``self.X_train`` / ``self.y_train``, predicts ``self.X_test`` and
    prints the Hamming loss, accuracy score and micro-averaged F1 score, all
    measured against ``self.y_test``. Returns ``None``.
    """
    # initialize classifier chains multi-label classifier
    # with a random forest base classifier
    print("build classifier...")
    classifier = ClassifierChain(RandomForestClassifier())
    # classifier = LabelPowerset(RandomForestClassifier())
    print("end...")

    print("start training...")
    classifier.fit(self.X_train, self.y_train)
    print("end...")

    # predict
    print("start test...")
    predictions = classifier.predict(self.X_test)
    print("end...")

    print("result as following:")
    result = hamming_loss(self.y_test, predictions)
    print("hanming_loss: ", result)
    # Use the instance's labels like the other two metrics; the bare name
    # ``y_test`` is not defined inside this method.
    print("accuracy score: ", accuracy_score(self.y_test, predictions))
    result = f1_score(self.y_test, predictions, average='micro')
    print("micro-f1_score: ", result)
def train(self):
    """Fit a logistic-regression classifier chain and score it.

    Trains on ``self.x_data`` / ``self.y_data``, predicts ``self.x_test`` and
    returns a dict with the ``'accuracy'`` and micro-averaged ``'f1_score'``
    measured against ``self.y_test``.
    """
    chain = ClassifierChain(LogisticRegression())
    chain.fit(self.x_data, self.y_data)
    predicted = chain.predict(self.x_test)

    accuracy = accuracy_score(self.y_test, predicted)
    micro_f1 = f1_score(self.y_test, predicted, average='micro')
    return {'accuracy': accuracy, 'f1_score': micro_f1}
class ClassifierChains:
    """Thin wrapper around a ``ClassifierChain`` built on ``LGBMClassifier``."""

    def __init__(self):
        base_estimator = LGBMClassifier()
        self.model = ClassifierChain(base_estimator)

    def set_grow_step(self, new_step):
        """Store *new_step* on the instance as ``grow_boost_round``."""
        self.grow_boost_round = new_step

    def fit(self, X_train, y_train):
        """Fit the wrapped chain on the given features and labels."""
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        """Return the chain's predictions for *X_test* via the result's ``.A``."""
        raw_predictions = self.model.predict(X_test)
        return raw_predictions.A
def classifiers(X_train, Y_train, X_test):
    """Fit three Gaussian-NB multi-label models and predict *X_test*.

    Returns a 3-tuple of predictions in the order BinaryRelevance,
    ClassifierChain, LabelPowerset.
    """
    wrappers = (BinaryRelevance, ClassifierChain, LabelPowerset)

    # Build all models first, then fit all, then predict all.
    models = [wrapper(GaussianNB()) for wrapper in wrappers]
    for model in models:
        model.fit(X_train, Y_train)

    return tuple([model.predict(X_test) for model in models])
def randomForestClassifierChain():
    """Evaluate a previously pickled random-forest classifier chain.

    Training and saving are currently disabled (see the commented lines), so
    the model is read from the file ``randomForestClassifierChain`` in the
    working directory. Predictions for the module-level ``test_x`` are passed,
    together with ``test_y``, to ``accuracy``.
    """
    print("Random forest classifier chain")
    start = time.time()
    classifier = ClassifierChain(classifier=RandomForestClassifier(), require_dense=[False, True])
    filename = "randomForestClassifierChain"
    # classifier.fit(train_x, train_y)
    # save
    # with open(filename, 'wb') as model_file:
    #     pickle.dump(classifier, model_file)

    # load the model from disk
    # NOTE: unpickling can execute arbitrary code; only load files this
    # project wrote itself.
    with open(filename, 'rb') as model_file:
        classifier = pickle.load(model_file)
    # With training disabled this measures model loading time only.
    print('training time taken: ', round(time.time() - start, 0), 'seconds')
    predictions_new = classifier.predict(test_x)
    accuracy(test_y, predictions_new)
def supportVectorMachineChain():
    """Train, pickle, reload and evaluate an SVC-based classifier chain.

    Fits on the module-level ``train_x`` / ``train_y``, writes the model to the
    file ``SupportVectorMachine``, reads it back, and passes predictions for
    ``test_x`` together with ``test_y`` to ``accuracy``.
    """
    print("Support vector machine")
    start = time.time()
    classifier = ClassifierChain(classifier=svm.SVC(), require_dense=[False, True])
    filename = "SupportVectorMachine"
    classifier.fit(train_x, train_y)

    # save -- the context manager guarantees the file is flushed and closed
    # before it is read back below.
    with open(filename, 'wb') as model_file:
        pickle.dump(classifier, model_file)

    # load the model from disk
    # NOTE: unpickling can execute arbitrary code; only load files this
    # project wrote itself.
    with open(filename, 'rb') as model_file:
        classifier = pickle.load(model_file)
    # The elapsed time covers training plus the save/load round trip.
    print('training time taken: ', round(time.time() - start, 0), 'seconds')
    predictions_new = classifier.predict(test_x)
    accuracy(test_y, predictions_new)
def knnClassifierChain():
    """Train, pickle, reload and evaluate a k-NN-based classifier chain.

    Fits on the module-level ``train_x`` / ``train_y``, writes the model to the
    file ``knnChain``, reads it back, and passes predictions for ``test_x``
    together with ``test_y`` to ``accuracy``.
    """
    print("knn classifier chain")
    start = time.time()
    classifier = ClassifierChain(KNeighborsClassifier())
    filename = "knnChain"
    classifier.fit(train_x, train_y)

    # save -- the context manager guarantees the file is flushed and closed
    # before it is read back below.
    with open(filename, 'wb') as model_file:
        pickle.dump(classifier, model_file)

    # load the model from disk
    # NOTE: unpickling can execute arbitrary code; only load files this
    # project wrote itself.
    with open(filename, 'rb') as model_file:
        classifier = pickle.load(model_file)
    # The elapsed time covers training plus the save/load round trip.
    print('training time taken: ', round(time.time() - start, 0), 'seconds')
    predictions_new = classifier.predict(test_x)
    accuracy(test_y, predictions_new)
def gaussianNaiveBayes():
    """Train, pickle, reload and evaluate a Gaussian-NB classifier chain.

    Fits on the module-level ``train_x`` / ``train_y``, writes the model to the
    file ``gaussianNaiveBayes``, reads it back, and passes predictions for
    ``test_x`` together with ``test_y`` to ``accuracy``.
    """
    print("Gaussian naive bayes")
    start = time.time()
    classifier = ClassifierChain(GaussianNB())
    filename = "gaussianNaiveBayes"
    classifier.fit(train_x, train_y)

    # save -- the context manager guarantees the file is flushed and closed
    # before it is read back below.
    with open(filename, 'wb') as model_file:
        pickle.dump(classifier, model_file)

    # load the model from disk
    # NOTE: unpickling can execute arbitrary code; only load files this
    # project wrote itself.
    with open(filename, 'rb') as model_file:
        classifier = pickle.load(model_file)
    # The elapsed time covers training plus the save/load round trip.
    print('training time taken: ', round(time.time() - start, 0), 'seconds')
    predictions_new = classifier.predict(test_x)
    accuracy(test_y, predictions_new)
def ClassifierChain():
    """Train a TF-IDF + logistic-regression classifier chain and print accuracy.

    Splits the module-level ``df`` 67/33, vectorises the ``Book_Text`` column
    with word 1-3-gram TF-IDF, treats every other column as a label, fits a
    scikit-multilearn classifier chain and prints the accuracy on the held-out
    split. Relies on ``accuracy_score`` being available at module level.
    """
    # Train-Test Split =======================================================
    print("setting up a neural network...")
    from sklearn.model_selection import train_test_split
    train, test = train_test_split(df, test_size=0.33, shuffle=True)
    train_text = train['Book_Text']
    test_text = test['Book_Text']

    # TF-IDF ==================================================================
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word',
                                 ngram_range=(1, 3), norm='l2')
    # Fit on the training text only. A second fit on the test text would
    # replace (not extend) the learned vocabulary and IDF weights, so the model
    # would be trained on features derived from the evaluation data.
    vectorizer.fit(train_text)

    x_train = vectorizer.transform(train_text)
    y_train = train.drop(labels=['Book_Text'], axis=1)
    x_test = vectorizer.transform(test_text)
    y_test = test.drop(labels=['Book_Text'], axis=1)

    # using classifier chains
    # NOTE: inside this function the import below rebinds the name
    # ``ClassifierChain`` to the scikit-multilearn class, even though the
    # enclosing function shares that name.
    from skmultilearn.problem_transform import ClassifierChain
    from sklearn.linear_model import LogisticRegression

    # initialize classifier chains multi-label classifier
    classifier = ClassifierChain(LogisticRegression())
    # Training logistic regression model on train data
    classifier.fit(x_train, y_train)
    # predict
    predictions = classifier.predict(x_test)
    # accuracy
    print("Accuracy = ", accuracy_score(y_test, predictions))
    print("\n")
class Multi_labeling:
    """Holds a train/test split and trains multi-label models on it."""

    def __init__(self, label_dict, train_labels, train_data, test_labels, test_data):
        """Store the label dictionary and the train/test data and labels."""
        self.label_dict = label_dict
        self.train_labels, self.train_data = train_labels, train_data
        self.test_labels, self.test_data = test_labels, test_data

    def classify(self):
        """Fit three models on the training split.

        Only the LinearSVC classifier chain is kept (as ``self.cc``); the
        k-nearest-neighbours and one-vs-rest SGD models are fitted but stay
        local to this call. Nothing is returned.
        """
        import sklearn.metrics as metrics
        from skmultilearn.problem_transform import ClassifierChain
        from sklearn.svm import SVC, LinearSVC
        from sklearn.multiclass import OneVsRestClassifier
        from sklearn.linear_model import LogisticRegression
        from sklearn.neighbors import KNeighborsClassifier
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.linear_model import SGDClassifier

        features = self.train_data
        targets = self.train_labels

        # Classifier chain over a linear SVM -- the only model kept.
        self.cc = ClassifierChain(LinearSVC())
        self.cc.fit(features, targets)

        # k-nearest neighbours.
        neighbours = KNeighborsClassifier()
        neighbours.fit(features, targets)

        # One-vs-rest linear model trained with SGD (hinge loss).
        one_vs_rest = OneVsRestClassifier(
            SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                          random_state=0, max_iter=6, tol=None))
        one_vs_rest.fit(features, targets)

    def pred_all_other(self, input_data):
        """Return the classifier chain's predictions for *input_data*.

        Requires ``classify`` to have been called first, since that is where
        ``self.cc`` is created.
        """
        return self.cc.predict(input_data)
# NOTE(review): the next two statements are the tail of a function whose
# header lies above this excerpt; they are reproduced unchanged.
res = res / y_pred.shape[0]
return np.round(res, 2)

# Hand-rolled classifier chain over 5 labels: the i-th logistic regression is
# trained on the features plus the preceding label columns Y[:, 0:i].
# `logs` is expected to exist already (defined outside this excerpt).
for i in range(5):
    log = LogisticRegression()
    log.fit(np.hstack((X, Y[:, 0:i])), Y[:, i])  # each round appends the preceding label columns to the features
    logs.append(log)

# Predict every label on the training data.
# NOTE(review): the true label columns Y[:, 0:i] are fed in here as well,
# rather than the earlier classifiers' predictions -- confirm this is intended.
results = []
for i in range(5):
    res = logs[i].predict(np.hstack((X, Y[:, 0:i])))
    results.append(res)

# Transpose the per-label prediction vectors into one row per sample.
fres = []
for i in range(len(results[0])):
    a = [
        results[0][i], results[1][i], results[2][i], results[3][i],
        results[4][i]
    ]
    fres.append(a)
fres = np.matrix(fres)
print(accuracy_score(fres, Y))

test = datasets.make_multilabel_classification()
# Validate the result against the ready-made classifier chain implementation.
# NOTE(review): `data` comes from outside this excerpt, while `test` is a
# freshly generated dataset -- confirm the two are meant to be comparable.
cl = ClassifierChain(LogisticRegression())
cl.fit(data[0], data[1])
pred = cl.predict(test[0])
print(accuracy_score(pred, test[1]))
y_test = y_test.values

# n-gram
# Bag-of-words counts are used; the TF-IDF variants below are disabled.
#tfidf = TfidfVectorizer(ngram_range = (1,1), stop_words = 'english')
#tfidf = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,1), norm='l2')
tfidf = CountVectorizer()
tfidf.fit(x_train)
x_train, x_test = tfidf.transform(x_train), tfidf.transform(x_test)

# train -- `classifier` is created outside this excerpt
#classifier = BinaryRelevance(GaussianNB())
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# Densify each predicted row into a plain list of label values.
y_pred = [list(row.A[0]) for row in predictions]
y_test = y_test.tolist()

y_pred_dataframe = pd.DataFrame(y_pred, columns=categories)
y_test_dataframe = pd.DataFrame(y_test, columns=categories)

# Accuracy for the single label column named by `category`.
this_test_list = y_test_dataframe[category].tolist()
this_pred_list = y_pred_dataframe[category].tolist()
this_accuracy = accuracy_score(this_test_list, this_pred_list)
solver='liblinear')) i = 0 j = 0 for i in range(0, 47): X_copy = X_orig[(i):( i + 1)] #Slice the ith element from the numpy array y_copy = y_orig[(i):(i + 1)] X_model = X_orig y_model = y_orig X_model = np.delete( X_model, i, axis=0 ) #Create a new array to train the model with slicing out the ith item for LOOCV y_model = np.delete(y_model, i, axis=0) classifier.fit(X_model, y_model) prediction = classifier.predict(X_copy) equal = prediction.toarray() print(equal, y_copy) if np.array_equal(equal, y_copy): j = j + 1 #print(y_copy, equal) if np.not_equal: #print(y_copy, equal) pass print(j / 48) #prediction = classifier.predict(X_test) #print(prediction.toarray()) #classifier.fit(X_train, y_train) #predictions = classifier.predict(X_test) #ans_formatted = predictions.toarray()
# NOTE(review): the next two statements are the tail of a function whose
# header lies above this excerpt (presumably `build_model`, called below).
result = {"accuracy:": acc, "hamming_score": ham}
return result

# Evaluate a classifier chain over multinomial naive Bayes via build_model.
clf_chain_model = build_model(MultinomialNB(), ClassifierChain, X_train, y_train, X_test, y_test)
clf_chain_model  # bare expression: notebook-style display of the result

# Fit a chain directly and tag a single example question.
clf = ClassifierChain(MultinomialNB())
clf.fit(X_train, y_train)
# x = [ 'how to write ml code in python and java i have data but do not know how to do it','java data but do not know how to do it']
x = ['how to write code python']
xt = tfidf.transform(x)
# Map the predicted indicator row back to label names (result is not stored).
multilabel.inverse_transform(clf.predict(xt))
"""#### LabelPowerset ![](https://github.com/Jcharis/Python-Machine-Learning/blob/master/Multi_Label_Text_Classification_with_Skmultilearn/labelPowerset_multilabel_ml_jcharistech.png?raw=1) """
# Same evaluation with the LabelPowerset transformation.
clf_labelP_model = build_model(MultinomialNB(), LabelPowerset, X_train, y_train, X_test, y_test)
clf_labelP_model  # bare expression: notebook-style display of the result
### Apply On A Simple Title/Question
ex1 = df['title'].iloc[0]
ex1  # bare expression: notebook-style display of the first title
# Vectorized
# NOTE(review): this `else` belongs to an `if` inside a loop that starts above
# this excerpt; it records a 0 in row i2 of the label matrix `y`.
else:
    y[i2].append(0)
from skmultilearn.problem_transform import ClassifierChain
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
# initialize classifier chains multi-label classifier
# with a gaussian naive bayes base classifier
classifier = ClassifierChain(GaussianNB())
X = np.array(X)
y = np.array(y)
# train
classifier.fit(X, y)
# predict -- sanity check on the second training sample (index 1)
predictions = classifier.predict(X[1])
pred = predictions.toarray()
# Column indices of the labels predicted as 1.
result = list(np.where(pred == 1)[1])
print('\n\nPrediction:')
for r in result:
    print('\t*',genre_unique[r])
# Persist the vectorizer (defined outside this excerpt), the model and the
# ordered genre names.
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(classifier, 'model.pkl')
with open("genres_files.txt", "w") as file:
    file.write(str(genre_unique))
#print(metrics.accuracy_score(y,predictions))
# NOTE(review): Python 2 fragment. It starts inside a loop whose header lies
# above this excerpt; the nesting below is reconstructed from the statements
# themselves and should be confirmed against the full file.
L1.append(d1[k])
X = np.array(L)
Y = np.array(L1)
X_s, Y_s = shuffle(X, Y)
size = [0.2]  # test-set fractions to evaluate
# Per-split scores: micro-F1, macro-F1, weighted-F1 and accuracy.
Mic = []
Mac = []
Wt = []
Acc = []
# `mic` and `thres` are defined outside this excerpt; the `break` leaves the
# enclosing loop once the accumulated micro-F1 reaches the threshold.
if mic >= thres:
    break
for j in range(0, len(size)):
    X_train, X_test, Y_train, Y_test = train_test_split(X_s, Y_s, test_size=size[j])
    #k_fold = KFold(len(Y), n_folds=10, shuffle=True, random_state=0)
    clf = ClassifierChain(LogisticRegression())
    #clf = tree.DecisionTreeClassifier()
    #clf=RandomForestClassifier()
    clf.fit(X_train, Y_train)
    Y_predicted = clf.predict(X_test)
    Mic.append(f1_score(Y_test, Y_predicted, average='micro'))
    Mac.append(f1_score(Y_test, Y_predicted, average='macro'))
    Wt.append(f1_score(Y_test, Y_predicted, average='weighted'))
    Acc.append((accuracy_score(Y_test, Y_predicted)))
    # Accumulate into the running totals (initialised outside this excerpt).
    mic += Mic[j]
    mac += Mac[j]
# Running totals divided by the number of evaluated split sizes.
print "Micro-F1=", float(mic) / float(len(size)) * 1.0
print "Macro-F1=", float(mac) / float(len(size)) * 1.0
d = d.as_matrix() #The results might vary due to the usage of random state with train and test split X_train, X_test, y_train, y_test = train_test_split(d, y, test_size=0.2, random_state=42) # The classifier instance with the classifier as # RandomForestClassifier clf_cc = ClassifierChain( RandomForestClassifier(n_estimators=100, max_depth=200)) #fitting the model for the classification into the labels clf_cc.fit(X_train, y_train.astype(float)) #predictions predictions_cc = clf_cc.predict(X_test) pred_prob = clf_cc.predict_proba(X_test) #Finding the evaluation metrics # micro recall, macro recall, micro precision, macro precision # micro f1, macro f1, hamming loss r1 = recall_score(y_true=y_test, y_pred=predictions_cc, average='micro') r2 = recall_score(y_true=y_test, y_pred=predictions_cc, average='macro') p1 = precision_score(y_true=y_test, y_pred=predictions_cc, average='micro') p2 = precision_score(y_true=y_test, y_pred=predictions_cc, average='macro') f1 = f1_score(y_true=y_test, y_pred=predictions_cc, average='micro') f2 = f1_score(y_true=y_test, y_pred=predictions_cc, average='macro') Score_cc_ham = hamming_loss(y_test, predictions_cc) # Printing the evaluation metrics print "Hamming Loss for classifier chains", Score_cc_ham
#Hamming Loss for Binary Relevance hamm_loss_binary = hamming_loss(y_test, predictions_binary) print("Hamming Loss:", hamm_loss_binary) print("\n\n\nTraining data with Classifier Chains using Gaussian Naive Bayes") #initialize Classifier Chains multi-label classifier #with a gaussian naive bayes base classifier classifier_cc = ClassifierChain(GaussianNB()) # train for Classifier Chaines classifier_cc.fit(X_train, y_train) # predict for Classifier Chains predictions_cc = classifier_cc.predict(X_test) #Hamming Loss for Classifier Chaines hamm_loss_cc = hamming_loss(y_test, predictions_cc) print("Hamming Loss:", hamm_loss_cc) print("\n\n\nTraining data with Label Powerset using Gaussian Naive Bayes") #initialize Label Powerset multi-label classifier #with a gaussian naive bayes base classifier classifier_lp = LabelPowerset(GaussianNB()) # train for Label Powerset classifier_lp.fit(X_train, y_train)
#replace <D> with nothing from data train_data = train_data.iloc[:, 0].str.replace('<\d+>', '') test_data = test_data.iloc[:, 0].str.replace('<\d+>', '') #count the frequency of every word in vocabulary in each document vectorizer = CountVectorizer() train_data_vector = vectorizer.fit_transform(train_data) test_data_vector = vectorizer.transform(test_data) #train the classifier model = ClassifierChain(RandomForestClassifier(n_jobs=-1, verbose=1)) model.fit(train_data_vector, train_labels) #test the classifier predicted_labels = model.predict(test_data_vector) predicted_labels_train = model.predict(train_data_vector) predicted_probabilities = model.predict_proba(test_data_vector) #test accuracy #~7% with random forest and binary relevance #~7% with random forest and classifier chain #~5% with random forest and label powerset #~4% with multilabel knn test_acc = accuracy_score(test_labels, predicted_labels) train_acc = accuracy_score(train_labels, predicted_labels_train) test_hamm_loss = hamming_loss(test_labels, predicted_labels) test_cov_err = coverage_error(test_labels, predicted_probabilities.toarray()) test_rank_loss = label_ranking_loss(test_labels, predicted_probabilities.toarray()) test_avr_prec = label_ranking_average_precision_score(
# In[68]: log_classifier.fit(x_train, y_train) print('Accuracy_score using LabelPowerset is ', round(accuracy_score(y_test, log_classifier.predict(x_test)) * 100, 1), '%') print('-------------------------------------------------') print('roc_auc_score using LabelPowerset is ', roc_auc_score(y_test, log_classifier.predict_proba(x_test).toarray())) # # ClassifierChain # * This method uses a chain of binary classifiers # * Each new Classifier uses the predictions of all previous classifiers # * This was the correlation b/w labels is taken into account # In[69]: chain = ClassifierChain(LogisticRegression()) # In[70]: chain.fit(x_train, y_train) print('Accuracy_score using ClassifierChain is ', round(accuracy_score(y_test, chain.predict(x_test)) * 100, 1), '%') print('-------------------------------------------------') print('roc_auc_score using ClassifierChain is ', roc_auc_score(y_test, chain.predict_proba(x_test).toarray()))
# (Tail of the disabled section 4: Binary Relevance with Gaussian NB.)
# # Accuracy
# print("Accuracy : {}".format(accuracy_score(Y_test,predict)*100))
# # Create and save with pickle
# save_mydocuments = open("pickled_algos/MultilabelBinaryRelevanceWithGausseanNB.pickle","wb")
# pickle.dump(clf, save_mydocuments)
# save_mydocuments.close()
# print("BR Method with GausseanNB classifier is done, time--- %s seconds ---" % (time.time() - start_time))

# 5. Classifier chain with MultinomialNB classifier (from scikit-multilearn)
# create and fit classifier
from skmultilearn.problem_transform import ClassifierChain
ClassifierChainMultinomialNB_classifier = ClassifierChain(MultinomialNB())
ClassifierChainMultinomialNB_classifier.fit(X_train, Y_train)
# Predictions
predictions = ClassifierChainMultinomialNB_classifier.predict(X_test)
# Accuracy -- scored on this model's own `predictions`; the name `predict`
# is not assigned in this section, so using it would at best report an
# earlier model's accuracy.
print("Accuracy : {}".format(accuracy_score(Y_test, predictions)*100))
# Create and save with pickle; the context manager closes the file even if
# pickling fails.
with open("pickled_algos/MultilabelClassifierchainWithMultinomialNB.pickle", "wb") as save_mydocuments:
    pickle.dump(ClassifierChainMultinomialNB_classifier, save_mydocuments)
print("Classifier chain with MultinomialNB classifier is done, time--- %s seconds ---" % (time.time() - start_time))

# 6. Label Powerset with MultinomialNB classifier (from scikit-multilearn)
# create and fit classifier
from skmultilearn.problem_transform import LabelPowerset
LabelPowersetMultinomialNB_classifier = LabelPowerset(MultinomialNB())
LabelPowersetMultinomialNB_classifier.fit(X_train, Y_train)
# Split the combined arrays column-wise: the first label_data.shape[1] columns
# are used as labels, the remaining columns as features.
Y_train = train[:, 0:label_data.shape[1]]
X_train = train[:, label_data.shape[1]:]
Y_test = test[:, 0:label_data.shape[1]]
X_test = test[:, label_data.shape[1]:]

# ### Gaussian Naive Bayesian + Classifier Chain

# In[26]:
classifier = ClassifierChain(GaussianNB())
classifier.fit(X_train, Y_train)
predictions = classifier.predict(X_test)
accuracy_score(Y_test, predictions)  # bare expression: notebook-style display

# ### Neural Network + Classifier Chain

# In[44]:
mlp = MLPClassifier(solver='lbfgs',
                    activation='relu',
                    alpha=1e-4,
                    hidden_layer_sizes=(50, 50),
                    random_state=1,
                    max_iter=1000,
                    verbose=10,
                    learning_rate_init=.1)
class MultiLabelClassifier(object):
    """Tag-prediction experiments over ``data/cleaned_data.csv``.

    Rows without ``Tags`` are dropped; the last 67% of the remaining rows form
    the training set and the first 23% the test set. ``setup_data_frame`` must
    be called before any ``multi_label_*`` method: it builds the binary
    word/tag indicator tables those methods read.
    """

    def __init__(self):
        self.total_data_df = pd.read_csv(os.path.join("data",
                                                      "cleaned_data.csv"),
                                         encoding="ISO-8859-1")
        self.data_df = self.total_data_df[~self.total_data_df.Tags.isnull()]
        self.total_records = len(self.data_df.index)
        # Copies, not views: the prediction methods add a column to test_df and
        # must not write through to (or warn about) data_df.
        self.train_df = self.data_df.tail(int(self.total_records * .67)).copy()
        self.test_df = self.data_df.head(int(self.total_records * .23)).copy()
        self.total_tag_list = self.get_tag_list()
        self.total_word_list = self.get_word_list()
        self.modified_train_df = pd.DataFrame()
        self.modified_test_df = pd.DataFrame()
        self.classifier = BernoulliNB()
        self.classifier_multilabel = ClassifierChain(BernoulliNB())
        self.classifier_dt = DecisionTreeRegressor(max_depth=2000)
        self.classifier_random_forest = RandomForestRegressor(max_depth=100)
        self.classifier_svm = svm.SVC(kernel='linear')
        self.test_tags = pd.DataFrame()

    @staticmethod
    def _tokens(value, separator):
        """Split *value* on *separator*; a missing (non-string) cell gives []."""
        return value.split(separator) if isinstance(value, str) else []

    def _indicator_columns(self, cells, vocabulary, separator):
        """Map each vocabulary term to a 0/1 list marking the cells containing it."""
        # Tokenise every cell once instead of once per vocabulary term.
        token_sets = [set(self._tokens(cell, separator)) for cell in cells]
        return {
            term: [1 if term in tokens else 0 for tokens in token_sets]
            for term in vocabulary
        }

    def _test_features(self):
        """Return the word-indicator matrix of the test set.

        Selecting the word columns explicitly keeps the feature width equal to
        the training matrix even after a prediction method has added tag or
        ``predicted_labels`` columns to ``modified_test_df``.
        """
        return self.modified_test_df[self.total_word_list].values

    def _joined_labels(self, predictions_by_tag, row_count):
        """Build one label string per row from per-tag predictions.

        Every tag whose prediction equals 1 is appended as ``',' + tag``, so a
        non-empty result starts with a comma (the established output format).
        """
        labels = [''] * row_count
        for tag in self.total_tag_list:
            labels = [
                each + ',' + tag if value == 1 else each
                for each, value in zip(labels, predictions_by_tag[tag])
            ]
        return labels

    def _save_predicted_labels(self, predictions_by_tag, filename):
        """Attach ``predicted_labels`` to ``test_df`` and write it under ``data/``."""
        self.test_df['predicted_labels'] = pd.Series(
            self._joined_labels(predictions_by_tag, len(self.test_df.index)),
            index=self.test_df.index)
        self.test_df[['stemmed_words', 'Tags', 'predicted_labels'
                      ]].to_csv(os.path.join("data", filename), index=False)

    def get_tag_list(self):
        """Return the sorted unique tags found in the training rows."""
        tag_set = set()
        for tags in self.train_df.Tags:
            tag_set.update(self._tokens(tags, ','))
        return sorted(tag_set)

    def get_word_list(self):
        """Return the sorted unique stemmed words found in the training rows."""
        word_set = set()
        for words in self.train_df.stemmed_words:
            word_set.update(self._tokens(words, ' '))
        return sorted(word_set)

    def setup_data_frame(self):
        """Build the binary indicator tables and return the training table.

        ``modified_train_df`` gets one column per word followed by one per tag;
        ``modified_test_df`` gets the word columns and ``test_tags`` the tag
        columns of the test rows.
        """
        train_columns = self._indicator_columns(self.train_df.stemmed_words,
                                                self.total_word_list, ' ')
        train_columns.update(
            self._indicator_columns(self.train_df.Tags, self.total_tag_list,
                                    ','))
        self.modified_train_df = pd.DataFrame(train_columns,
                                              index=self.train_df.index)
        self.modified_test_df = pd.DataFrame(
            self._indicator_columns(self.test_df.stemmed_words,
                                    self.total_word_list, ' '),
            index=self.test_df.index)
        self.test_tags = pd.DataFrame(
            self._indicator_columns(self.test_df.Tags, self.total_tag_list,
                                    ','),
            index=self.test_df.index)
        return self.modified_train_df

    def multi_label_naive_bayes_classifier(self):
        """Fit one Bernoulli NB per tag and store the predictions.

        Adds one 0/1 column per tag plus a ``predicted_labels`` column to
        ``modified_test_df``.
        """
        test_rows = self._test_features()
        train_rows = self.modified_train_df[self.total_word_list].values
        for tag in self.total_tag_list:
            self.classifier.fit(train_rows,
                                self.modified_train_df[tag].tolist())
            self.modified_test_df[tag] = pd.Series(
                self.classifier.predict(test_rows),
                index=self.modified_test_df.index)
        # Read each tag's column by name; attribute access (``.tag``) would
        # look for a column literally called "tag".
        self.modified_test_df['predicted_labels'] = pd.Series(
            self._joined_labels(self.modified_test_df,
                                len(self.modified_test_df.index)),
            index=self.modified_test_df.index)

    def multi_label_naive_bayes_classifier_sklearn(self):
        """Fit the Bernoulli NB classifier chain and print shapes and accuracy."""
        test_rows = self._test_features()
        self.classifier_multilabel.fit(
            self.modified_train_df[self.total_word_list].values,
            self.modified_train_df[self.total_tag_list])
        c = self.classifier_multilabel.predict(test_rows)
        print(c.shape)
        print(sps.csc_matrix(self.test_tags.values).shape)
        print(accuracy_score(sps.csc_matrix(self.test_tags.values), c))

    def multi_label_decision_tree_regressor(self):
        """Predict tags with the decision-tree regressor and write a CSV.

        NOTE(review): a tag is emitted only when the regressor's output equals
        exactly 1 -- confirm that no threshold is intended.
        """
        self.classifier_dt.fit(
            self.modified_train_df[self.total_word_list].values,
            self.modified_train_df[self.total_tag_list])
        predictions = self.classifier_dt.predict(self._test_features())
        temp_df = pd.DataFrame(predictions, columns=self.total_tag_list)
        self._save_predicted_labels(temp_df, "decision_tree_result.csv")

    def multi_label_random_forest(self):
        """Predict tags with the random-forest regressor and write a CSV.

        NOTE(review): a tag is emitted only when the regressor's output equals
        exactly 1 -- confirm that no threshold is intended.
        """
        self.classifier_random_forest.fit(
            self.modified_train_df[self.total_word_list].values,
            self.modified_train_df[self.total_tag_list])
        predictions = self.classifier_random_forest.predict(
            self._test_features())
        temp_df = pd.DataFrame(predictions, columns=self.total_tag_list)
        self._save_predicted_labels(temp_df, "random_forest_result.csv")

    def multi_label_svm(self):
        """Fit one linear SVM per tag, predict the test rows and write a CSV."""
        test_rows = self._test_features()
        train_rows = self.modified_train_df[self.total_word_list].values
        tags = self.modified_train_df[self.total_tag_list].values
        predictions_by_tag = {}
        for col in range(tags.shape[1]):
            self.classifier_svm.fit(train_rows, tags[:, col])
            predictions_by_tag[self.total_tag_list[col]] = (
                self.classifier_svm.predict(test_rows))
        self._save_predicted_labels(predictions_by_tag, "linear_svm.csv")
def movie5_2():
    """Train a genre classifier from the IMDB CSV and tag ``endgame_1.txt``.

    Steps, all relative to the current working directory:

    1. Empty the ``values`` directory and read ``IMDB-Movie-Data (1).csv``.
    2. For every genre, collect the stemmed nouns and verbs of its movies'
       descriptions in ``values/<genre>_noun.txt`` / ``<genre>_verb.txt``.
    3. Turn those files into term-frequency tables and build one feature row
       per movie from the nouns of its description.
    4. Fit a Gaussian-NB classifier chain, save it to ``model.pkl`` and write
       the genre and word lists to ``genres_files.txt`` / ``words_file.txt``.
    5. Predict the genres of ``endgame_1.txt`` and write their names, separated
       by spaces, to ``output_1.txt``.

    Relies on ``re``, ``np``, ``joblib`` and the NLTK helpers
    (``word_tokenize``, ``pos_tag``, ``stopwords``, ``PorterStemmer``) being
    available at module level. Returns ``None``.
    """
    import ast
    import os

    import pandas as pd

    def clear_folders(my_path):
        """Delete every entry directly inside *my_path* (non-recursive)."""
        for entry in os.listdir(my_path):
            os.remove(os.path.join(my_path, entry))

    my_path = os.getcwd()
    values_dir = os.path.join(my_path, 'values')
    clear_folders(values_dir)

    inputCSVfile = "IMDB-Movie-Data (1).csv"
    try:
        my_data = pd.read_csv(inputCSVfile)
    except FileNotFoundError:
        # Kept as-is: a missing dataset ends the run without a message.
        exit()
    num_values = len(my_data['Rank'])

    # ---- Genre vocabulary and label matrix --------------------------------
    genre_list = list(my_data['Genre'])
    genre_lists = [[x.strip() for x in my_genre.split(',')]
                   for my_genre in genre_list]
    genre_unique = sorted({g for names in genre_lists for g in names})
    # Exact membership, not substring search: with substring matching a
    # "Musical" movie would also be labelled "Music".
    y = [[1 if g2 in names else 0 for g2 in genre_unique]
         for names in genre_lists]

    # ---- Per-genre noun/verb dictionaries ---------------------------------
    # Create (or truncate) one noun and one verb file per genre.
    for g in genre_unique:
        for suffix in ('_verb.txt', '_noun.txt'):
            with open(os.path.join(values_dir, g + suffix), "w+"):
                pass

    # Built once and shared by every text_preprocessor call.
    shortword = re.compile(r'\W*\b\w{1,3}\b')
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()
    pos_prefix = {'verb': 'V', 'noun': 'N'}

    def text_preprocessor(text_data, word_type):
        """Return the stemmed words of *text_data* tagged as *word_type*.

        *word_type* is ``'verb'`` or ``'noun'``. Non-word characters, digits
        and words of up to three letters are removed before tokenising; stop
        words are dropped after POS filtering.
        """
        contents = re.sub(r'[\W]', ' ', text_data)
        contents = re.sub(r"\d+", "", contents)
        contents = shortword.sub('', contents)
        txt_tokenized = word_tokenize(contents)
        prefix = pos_prefix[word_type]
        txt_pos = [
            token for token, pos in pos_tag(txt_tokenized)
            if pos.startswith(prefix)
        ]
        filtered_sentence = [w for w in txt_pos if w not in stop_words]
        return [ps.stem(w) for w in filtered_sentence]

    plot_data = my_data['Description']
    # Noun stems per movie, kept so the feature matrix below does not have to
    # tokenise and POS-tag every description a second time.
    noun_stems = []
    for i in range(num_values):
        my_plot = plot_data[i]
        featN = text_preprocessor(my_plot, 'noun')
        featV = text_preprocessor(my_plot, 'verb')
        noun_stems.append(set(featN))
        fN = '\n'.join(featN)
        fV = '\n'.join(featV)
        # Append this movie's words to the files of each of its genres. The
        # context managers flush and close the files before they are read back.
        for gg in genre_lists[i]:
            with open(os.path.join(values_dir, gg + '_noun.txt'), 'a+') as fileN:
                fileN.write("\n")
                fileN.write(fN)
            with open(os.path.join(values_dir, gg + '_verb.txt'), 'a+') as fileV:
                fileV.write("\n")
                fileV.write(fV)

    # ---- Term frequencies --------------------------------------------------
    def computeTF(wordDict, bow):
        """Return each word's count divided by the size of the bag of words."""
        bowCount = len(bow)
        if bowCount == 0:
            # An empty dictionary file has no term frequencies to report.
            return {word: 0.0 for word in wordDict}
        return {word: count / float(bowCount) for word, count in wordDict.items()}

    data_dir = values_dir
    onlyfiles = os.listdir(data_dir)
    wordSet = set()
    wordDicts = []
    bow_vals = []
    for f3 in onlyfiles:
        with open(os.path.join(data_dir, f3), "r") as f:
            doc = f.read().strip()
        # Drop every empty line, not just the first one.
        bow = [word for word in doc.split("\n") if word]
        bow_vals.append(bow)
        wordSet.update(bow)
        # The key set is cumulative: each file's table also lists (with count
        # 0) the words seen in earlier files.
        counts = dict.fromkeys(wordSet, 0)
        for word in bow:
            counts[word] += 1
        wordDicts.append(counts)

    tf_vals = [computeTF(counts, bow) for counts, bow in zip(wordDicts, bow_vals)]

    # For every word keep the value from the first table that lists it.
    # NOTE(review): this matches the original lookup order (directory listing
    # order); confirm that a per-genre or combined weight is not intended.
    first_tf = {}
    for my_dict in tf_vals:
        for word, value in my_dict.items():
            first_tf.setdefault(word, value)
    # Sorted so the feature column order is reproducible between runs.
    words_uniq = sorted(first_tf)

    # ---- Feature matrix and training ---------------------------------------
    feat_all = [[first_tf[w] if w in nouns else 0 for w in words_uniq]
                for nouns in noun_stems]

    from skmultilearn.problem_transform import ClassifierChain
    from sklearn.naive_bayes import GaussianNB

    # initialize classifier chains multi-label classifier
    # with a gaussian naive bayes base classifier
    classifier = ClassifierChain(GaussianNB())
    X = np.array(feat_all)
    y = np.array(y)
    classifier.fit(X, y)

    joblib.dump(classifier, 'model.pkl')
    with open("genres_files.txt", "w") as file:
        file.write(str(genre_unique))
    with open("words_file.txt", "w") as file:
        file.write(str(words_uniq))

    # ---- Single file test ---------------------------------------------------
    # Both files hold the repr of a list of strings written just above;
    # literal_eval parses that without executing arbitrary code.
    with open("genres_files.txt", "r") as file:
        my_genres = ast.literal_eval(file.readline())
    with open("words_file.txt", "r") as file:
        words_uniq = ast.literal_eval(file.readline())
    with open("endgame_1.txt", "r") as f:
        contents = f.read().strip()

    featN = set(text_preprocessor(contents, 'noun'))
    # One sample, shaped explicitly as a single row.
    X = np.array([first_tf[w] if w in featN else 0
                  for w in words_uniq]).reshape(1, -1)
    y_pred = classifier.predict(X)
    pred = y_pred.toarray()
    result = list(np.where(pred == 1)[1])

    with open("output_1.txt", "w+") as fo:
        for r in result:
            fo.write(my_genres[r])
            fo.write(" ")
# Label matrices: every column from the fifth onward (position 4) is a label.
Y_train = train.iloc[:,4:].values
Y_test = test.iloc[:,4:].values
print (Y_test)
""" Naive Bayes Classifier """
#naiveBayes = GaussianNB()
# Classifier chain over Gaussian NB, trained on features prepared above this
# excerpt (X_train_idf / X_test_idf).
classifier = ClassifierChain(GaussianNB())
classifier.fit(X_train_idf,Y_train)
predictions = classifier.predict(X_test_idf)
print (accuracy_score(Y_test,predictions))
""" Get training and test dataset """
# The string below is a disabled single-label experiment kept by the author.
""" naiveBayes.fit(X_train_idf,Y_train[:,97:98].flatten()) y_pred = naiveBayes.predict(X_test_idf) """
#print (naiveBayes.score(X_test_idf,Y_test[:,1:2].flatten()))
# One-hot encode the integer class labels, one row per training sample.
# The row count follows train_labels instead of a hard-coded 19, so the label
# matrix always lines up with the training data.
# NOTE(review): the 3 columns assume class ids 0..2 -- confirm.
tmp = np.zeros((len(train_labels), 3))
print(tmp.shape)
tmp = tmp.astype(int)
for i, label in enumerate(train_labels):
    tmp[i][label] = 1
################################################################################### Multilabel Classifier ######################################################################################
from skmultilearn.problem_transform import ClassifierChain
# Classifier chain over an SVC.
classifier = ClassifierChain(svm.SVC(decision_function_shape='ovo'))
classifier.fit(train_features, tmp)
p = classifier.predict(test_features)
print(p)
# Multi-label k-nearest neighbours with k=1, for comparison.
from skmultilearn.adapt import MLkNN
clsfr = MLkNN(k=1)
clsfr.fit(train_features, tmp)
p = clsfr.predict(test_features)
print(p)
########################################################################### Search for videos with similar tags ##################################################################################
import urllib
def _first_column_values(frame):
    """Return the first cell of every row of ``frame`` as a flat Python list."""
    return [row[0] for row in frame.values.tolist()]


def _first_positive_label(label_rows):
    """Collapse each row of label flags to a single class id.

    The id is the 1-based position of the first ``"1"`` in the row, or 0 when
    the row contains no ``"1"`` (so there are 21 possible ids for 20 columns).
    """
    return np.array([
        row.index("1") + 1 if "1" in row else 0
        for row in label_rows
    ])


def binary_relevance(train_data, test_data):
    """Fit a GaussianNB classifier chain on text features and print its accuracy.

    Column 0 of each frame is passed through the module-level ``tfidf``
    object's ``transform`` (presumably an already-fitted TF-IDF vectorizer --
    confirm), and columns 1..20 are read as per-label flags. Only the first
    positive flag of each row is kept, which simplifies the task to a
    multi-class, single-label problem even though the underlying data is
    multi-label.

    The distribution of true class ids, the distribution of predicted values
    and the accuracy score are printed; nothing is returned.

    :param train_data: DataFrame with the text in column 0 and the label
        flags (compared against the string ``"1"``) in columns 1..20.
    :param test_data: DataFrame with the same layout, used for evaluation.
    :return: None
    """
    from skmultilearn.problem_transform import ClassifierChain
    from sklearn.naive_bayes import GaussianNB

    # Classifier chain over a Gaussian naive Bayes base classifier (an earlier
    # variant of this function used BinaryRelevance(GaussianNB()) instead).
    classifier = ClassifierChain(GaussianNB())

    label_columns = list(range(1, 21))
    X_train, y_train = train_data.iloc[:, [0]], train_data.iloc[:, label_columns]
    X_test, y_test = test_data.iloc[:, [0]], test_data.iloc[:, label_columns]

    # Training.
    train_features = tfidf.transform(_first_column_values(X_train))
    Y = _first_positive_label(y_train.values.tolist())
    # NOTE(review): Y is a 1-D vector of class ids, whereas a classifier chain
    # is normally given a 2-D binary indicator matrix -- confirm this is the
    # intended target format. The original author recorded this failure here:
    #   TypeError: no supported conversion for types: (dtype('O'),)
    classifier.fit(train_features, Y)

    # Prediction.
    x_test = tfidf.transform(_first_column_values(X_test))
    Y_test = _first_positive_label(y_test.values.tolist())

    unique_test, counts_test = np.unique(Y_test, return_counts=True)
    print("truth=", dict(zip(unique_test, counts_test)))

    # predict() yields a sparse matrix; densify it before counting values.
    predictions = classifier.predict(x_test).toarray()
    unique, counts = np.unique(predictions, return_counts=True)
    print("preditions=", dict(zip(unique, counts)))

    from sklearn.metrics import accuracy_score
    score = accuracy_score(Y_test, predictions)
    print(score)
print('Performing classification.', end='\t')
from skmultilearn.problem_transform import ClassifierChain
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Classifier-chain multi-label model with a Gaussian naive Bayes base
# classifier.
classifier = ClassifierChain(GaussianNB())

X = np.array(X)
y = np.array(y)

# Train on the whole feature/label set.
classifier.fit(X, y)

# Predict for the training sample at index 3. The estimator API expects a
# 2-D (n_samples, n_features) input, so the single row is reshaped to
# (1, n_features) explicitly instead of relying on implicit promotion of a
# 1-D vector.
predictions = classifier.predict(X[3].reshape(1, -1))
pred = predictions.toarray()
# Column indices of the labels predicted as 1 for that sample.
result = list(np.where(pred == 1)[1])

print('\n\nPrediction:')
for r in result:
    print('\t*', genre_unique[r])

# Persist the trained model together with the genre list and vocabulary.
# Both lists are written as their Python repr via str().
# joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(classifier, 'model.pkl')
with open("genres_files.txt", "w") as file:
    file.write(str(genre_unique))
with open("words_file.txt", "w") as file:
    file.write(str(words_uniq))