Example #1
    def classifier_chain(self):


        # initialize a classifier chains multi-label classifier
        # with a random forest base classifier
        print("build classifier...")
        classifier = ClassifierChain(RandomForestClassifier())
        #classifier = LabelPowerset(RandomForestClassifier())
        print("end...")

        print("start training...")
        classifier.fit(self.X_train, self.y_train)
        print("end...")

        # predict
        print("start test...")
        predictions = classifier.predict(self.X_test)
        print("end...")

        print("result as following:")

        result = hamming_loss(self.y_test, predictions)
        print("hanming_loss: ", result)

        print("accuracy score: ", accuracy_score(y_test, predictions))

        result = f1_score(self.y_test, predictions, average='micro')
        print("micro-f1_score: ", result)
Example #2
    def train(self):
        classifier = ClassifierChain(LogisticRegression())
        classifier.fit(self.x_data, self.y_data)

        predictions = classifier.predict(self.x_test)

        return {
            'accuracy': accuracy_score(self.y_test, predictions),
            'f1_score': f1_score(self.y_test, predictions, average='micro')
        }
Example #3
class ClassifierChains:
    def __init__(self):
        self.model = ClassifierChain(LGBMClassifier())

    def set_grow_step(self, new_step):
        self.grow_boost_round = new_step

    def fit(self, X_train, y_train):
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        return self.model.predict(X_test).A
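predict in scikit-multilearn returns a SciPy sparse matrix, and the trailing .A is shorthand for .toarray(). A more explicit equivalent of the method above:

    def predict(self, X_test):
        # densify the sparse label-indicator matrix returned by skmultilearn
        return self.model.predict(X_test).toarray()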
Example #4
def classifiers(X_train, Y_train, X_test):

    classifier1 = BinaryRelevance(GaussianNB())
    classifier2 = ClassifierChain(GaussianNB())
    classifier3 = LabelPowerset(GaussianNB())

    classifier1.fit(X_train, Y_train)
    classifier2.fit(X_train, Y_train)
    classifier3.fit(X_train, Y_train)

    predictions1 = classifier1.predict(X_test)
    predictions2 = classifier2.predict(X_test)
    predictions3 = classifier3.predict(X_test)

    return predictions1, predictions2, predictions3
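A possible usage sketch for this helper on synthetic data (the dataset and printout are illustrative, not from the original):

from sklearn.datasets import make_multilabel_classification
from sklearn.metrics import hamming_loss
from sklearn.model_selection import train_test_split

X, Y = make_multilabel_classification(n_samples=300, n_classes=4, random_state=0)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)

pred_br, pred_cc, pred_lp = classifiers(X_train, Y_train, X_test)
for name, pred in [("BinaryRelevance", pred_br),
                   ("ClassifierChain", pred_cc),
                   ("LabelPowerset", pred_lp)]:
    print(name, "hamming loss:", hamming_loss(Y_test, pred))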
Example #5
def randomForestClassifierChain():
    print("Random forest classifier chain")

    start = time.time()
    classifier = ClassifierChain(classifier=RandomForestClassifier(),
                                 require_dense=[False, True])
    filename = "randomForestClassifierChain"

    # classifier.fit(train_x, train_y)

    # save (skipped here because training above is commented out)
    # pickle.dump(classifier, open(filename, 'wb'))

    # load the previously trained model from disk
    with open(filename, 'rb') as f:
        classifier = pickle.load(f)

    print('model load time: ', round(time.time() - start, 0), 'seconds')

    predictions_new = classifier.predict(test_x)

    accuracy(test_y, predictions_new)
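A note on require_dense=[False, True]: in scikit-multilearn the two flags state whether the base classifier needs dense X and dense y respectively, so this setting keeps the feature matrix sparse while densifying the per-link label columns. A minimal sketch of the construction on its own (assuming train_x and train_y exist as in the snippet above):

from sklearn.ensemble import RandomForestClassifier
from skmultilearn.problem_transform import ClassifierChain

classifier = ClassifierChain(
    classifier=RandomForestClassifier(),
    require_dense=[False, True],  # sparse X is fine for the forest; labels are densified
)
classifier.fit(train_x, train_y)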
Example #6
def supportVectorMachineChain():
    print("Support vector machine")

    start = time.time()
    classifier = ClassifierChain(classifier=svm.SVC(),
                                 require_dense=[False, True])
    filename = "SupportVectorMachine"

    classifier.fit(train_x, train_y)

    # save
    with open(filename, 'wb') as f:
        pickle.dump(classifier, f)

    # load the model back from disk
    with open(filename, 'rb') as f:
        classifier = pickle.load(f)

    print('training time taken: ', round(time.time() - start, 0), 'seconds')

    predictions_new = classifier.predict(test_x)

    accuracy(test_y, predictions_new)
Example #7
def knnClassifierChain():
    print("knn classifier chain")

    start = time.time()
    classifier = ClassifierChain(KNeighborsClassifier())

    filename = "knnChain"

    classifier.fit(train_x, train_y)

    # save
    with open(filename, 'wb') as f:
        pickle.dump(classifier, f)

    # load the model back from disk
    with open(filename, 'rb') as f:
        classifier = pickle.load(f)

    print('training time taken: ', round(time.time() - start, 0), 'seconds')

    predictions_new = classifier.predict(test_x)

    accuracy(test_y, predictions_new)
Example #8
def gaussianNaiveBayes():
    print("Gaussian naive bayes")

    start = time.time()
    classifier = ClassifierChain(GaussianNB())

    filename = "gaussianNaiveBayes"

    classifier.fit(train_x, train_y)

    # save
    with open(filename, 'wb') as f:
        pickle.dump(classifier, f)

    # load the model back from disk
    with open(filename, 'rb') as f:
        classifier = pickle.load(f)

    print('training time taken: ', round(time.time() - start, 0), 'seconds')

    predictions_new = classifier.predict(test_x)

    accuracy(test_y, predictions_new)
Example #9
def classifier_chain():  # renamed so it does not shadow skmultilearn's ClassifierChain class
    # Train-Test Split =======================================================
    print("setting up a neural network...")
    from sklearn.model_selection import train_test_split
    train, test = train_test_split(df, test_size=0.33, shuffle=True)
    
    train_text = train['Book_Text']
    test_text = test['Book_Text']
    
    # TF-IDF ==================================================================
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
    # fit the vectorizer on the training text only, to avoid test-set leakage
    vectorizer.fit(train_text)
    
    x_train = vectorizer.transform(train_text)
    y_train = train.drop(labels = ['Book_Text'], axis=1)
    
    x_test = vectorizer.transform(test_text)
    y_test = test.drop(labels = ['Book_Text'], axis=1)
    
    # using classifier chains
    from skmultilearn.problem_transform import ClassifierChain
    from sklearn.linear_model import LogisticRegression

    # initialize classifier chains multi-label classifier
    classifier = ClassifierChain(LogisticRegression())
    
    # Training logistic regression model on train data
    classifier.fit(x_train, y_train)
    
    # predict
    predictions = classifier.predict(x_test)
    
    # accuracy
    print("Accuracy = ",accuracy_score(y_test,predictions))
    print("\n")
Example #10
class Multi_labeling:
    def __init__(self, label_dict, train_labels, train_data, test_labels, test_data):
        self.label_dict = label_dict
        self.train_labels = train_labels
        self.train_data = train_data
        self.test_labels = test_labels
        self.test_data = test_data

    def classify(self):
        from skmultilearn.problem_transform import ClassifierChain
        from sklearn.svm import SVC,LinearSVC
        import sklearn.metrics as metrics

        # =============================
        #      ClassifierChain        #
        # =============================
        from sklearn.multiclass import OneVsRestClassifier
        # from sklearn.multioutput import ClassifierChain
        from sklearn.linear_model import LogisticRegression
        # cc = ClassifierChain(LogisticRegression())
        self.cc = ClassifierChain(LinearSVC())
        self.cc.fit(self.train_data, self.train_labels)
        # y_pred = self.cc.predict(self.test_data)
        # cc_art_f1 = metrics.f1_score(self.test_labels, y_pred, average='micro')



        # # initialize Classifier Chain multi-label classifier
        # # with an SVM classifier
        # # SVM in scikit only supports the X matrix in sparse representation
        # classifier = ClassifierChain(
        #     classifier=SVC(),
        #     require_dense=[False, True]
        # )
        # # train
        # classifier.fit(self.train_data, self.train_labels)
        # # predict
        # predictions = classifier.predict(self.test_data)
        # print(predictions)
        # art_f1 = metrics.f1_score(self.test_labels, predictions, average='macro')
        # return art_f1




        # =============================
        #    KNeighborsClassifier     #
        # =============================
        from sklearn.neighbors import KNeighborsClassifier
        knc = KNeighborsClassifier()

        knc.fit(self.train_data, self.train_labels)
        # Y_pred = knc.predict(self.test_data)
        # knc_art_f1 = metrics.f1_score(self.test_labels, Y_pred, average='micro')




        # =============================
        #           SGDClassifier     #
        # =============================
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.linear_model import SGDClassifier
        sgd = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=0, max_iter=6, tol=None)
        clf = OneVsRestClassifier(sgd)
        clf.fit(self.train_data, self.train_labels)
        # y_pred = clf.predict(self.test_data)
        # sgd_art_f1 = metrics.f1_score(self.test_labels, y_pred, average='micro')
        # return cc_art_f1, knc_art_f1, sgd_art_f1

    def pred_all_other(self, input_data):
        y_pred = self.cc.predict(input_data)
        return y_pred
Example #11
    res = res / y_pred.shape[0]

    return np.round(res, 2)


logs = []  # list of per-label models, assumed initialized here in the original
for i in range(5):
    log = LogisticRegression()
    log.fit(np.hstack((X, Y[:, 0:i])), Y[:, i])  # each model trains on X plus all earlier label columns
    logs.append(log)

results = []
for i in range(5):
    res = logs[i].predict(np.hstack((X, Y[:, 0:i])))
    results.append(res)

fres = []
for i in range(len(results[0])):
    a = [
        results[0][i], results[1][i], results[2][i], results[3][i],
        results[4][i]
    ]
    fres.append(a)

fres = np.matrix(fres)
print(accuracy_score(fres, Y))
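One caveat about the hand-rolled chain above: at prediction time it feeds the ground-truth columns Y[:, 0:i] back into each model, which is only possible when the labels are already known. True chain inference feeds each link the previous links' predictions instead; a sketch reusing the fitted logs list:

def chain_predict(logs, X):
    # sequentially append each link's own prediction as an extra feature column
    preds = np.empty((X.shape[0], 0))
    for log in logs:
        p = log.predict(np.hstack((X, preds)))
        preds = np.hstack((preds, p.reshape(-1, 1)))
    return preds

print(accuracy_score(Y, chain_predict(logs, X)))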
test = datasets.make_multilabel_classification()
# check the result against the ready-made classifier chain implementation
cl = ClassifierChain(LogisticRegression())
cl.fit(data[0], data[1])
pred = cl.predict(test[0])
print(accuracy_score(pred, test[1]))
Example #12
        y_test = y_test.values

        #n-gram
        #tfidf = TfidfVectorizer(ngram_range = (1,1), stop_words = 'english')
        #tfidf = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,1), norm='l2')
        tfidf = CountVectorizer()
        tfidf.fit(x_train)
        x_train = tfidf.transform(x_train)
        x_test = tfidf.transform(x_test)

        # train
        #classifier = BinaryRelevance(GaussianNB())
        classifier.fit(x_train, y_train)

        # predict
        predictions = classifier.predict(x_test)

        y_pred = []
        for i in predictions:
            y_pred.append(list(i.A[0]))

        #print(y_pred)
        y_test = y_test.tolist()
        #print(len(y_test))
        y_pred_dataframe = pd.DataFrame(y_pred, columns=categories)
        y_test_dataframe = pd.DataFrame(y_test, columns=categories)
        #print(len(y_pred_dataframe))
        this_pred_list = y_pred_dataframe[category].tolist()
        this_test_list = y_test_dataframe[category].tolist()

        this_accuracy = accuracy_score(this_test_list, this_pred_list)
Example #13
                                          solver='liblinear'))

        j = 0  # count of exact-match predictions
        for i in range(0, 47):
            X_copy = X_orig[(i):(
                i + 1)]  #Slice the ith element from the numpy array
            y_copy = y_orig[(i):(i + 1)]
            X_model = X_orig
            y_model = y_orig
            X_model = np.delete(
                X_model, i, axis=0
            )  #Create a new array to train the model with slicing out the ith item for LOOCV
            y_model = np.delete(y_model, i, axis=0)
            classifier.fit(X_model, y_model)
            prediction = classifier.predict(X_copy)
            equal = prediction.toarray()
            print(equal, y_copy)
            if np.array_equal(equal, y_copy):
                j = j + 1
        print(j / 47)  # exact-match rate over the 47 leave-one-out iterations
        #prediction = classifier.predict(X_test)
        #print(prediction.toarray())

#classifier.fit(X_train, y_train)
#predictions = classifier.predict(X_test)
#ans_formatted = predictions.toarray()
    result = {"accuracy:": acc, "hamming_score": ham}
    return result


clf_chain_model = build_model(MultinomialNB(), ClassifierChain, X_train,
                              y_train, X_test, y_test)

clf_chain_model

clf = ClassifierChain(MultinomialNB())
clf.fit(X_train, y_train)

# x = [ 'how to write ml code in python and java i have data but do not know how to do it','java data but do not know how to do it']
x = ['how to write code python']
xt = tfidf.transform(x)
multilabel.inverse_transform(clf.predict(xt))
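For context, tfidf and multilabel are presumably a fitted TfidfVectorizer and MultiLabelBinarizer from earlier in the notebook; inverse_transform maps the predicted indicator matrix back to tag names. A sketch of how they might have been prepared (the column names are assumptions):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(df['tags'])   # df['tags'] holding lists of tags is an assumption
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['title'])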
"""#### LabelPowerset
![](https://github.com/Jcharis/Python-Machine-Learning/blob/master/Multi_Label_Text_Classification_with_Skmultilearn/labelPowerset_multilabel_ml_jcharistech.png?raw=1)
"""

clf_labelP_model = build_model(MultinomialNB(), LabelPowerset, X_train,
                               y_train, X_test, y_test)

clf_labelP_model

### Apply On A Simple Title/Question

ex1 = df['title'].iloc[0]
ex1

# Vectorized
Example #15
        else:
            y[i2].append(0)


from skmultilearn.problem_transform import ClassifierChain
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
# initialize classifier chains multi-label classifier
# with a gaussian naive bayes base classifier
classifier = ClassifierChain(GaussianNB())
X = np.array(X)
y = np.array(y)
# train
classifier.fit(X, y)

# predict
predictions = classifier.predict(X[1].reshape(1, -1))  # a single sample needs a 2-D row

pred = predictions.toarray()
result = list(np.where(pred == 1)[1])
print('\n\nPrediction:')
for r in result:
    print('\t*',genre_unique[r])

joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(classifier, 'model.pkl')

with open("genres_files.txt", "w") as file:
    file.write(str(genre_unique))
#print(metrics.accuracy_score(y,predictions))
Example #16
        L1.append(d1[k])

    X = np.array(L)
    Y = np.array(L1)
    X_s, Y_s = shuffle(X, Y)
    size = [0.2]
    Mic = []
    Mac = []
    Wt = []
    Acc = []
    if mic >= thres:
        break
    for j in range(0, len(size)):
        X_train, X_test, Y_train, Y_test = train_test_split(X_s,
                                                            Y_s,
                                                            test_size=size[j])
        #k_fold = KFold(len(Y), n_folds=10, shuffle=True, random_state=0)
        clf = ClassifierChain(LogisticRegression())
        #clf = tree.DecisionTreeClassifier()
        #clf=RandomForestClassifier()
        clf.fit(X_train, Y_train)
        Y_predicted = clf.predict(X_test)
        Mic.append(f1_score(Y_test, Y_predicted, average='micro'))
        Mac.append(f1_score(Y_test, Y_predicted, average='macro'))
        Wt.append(f1_score(Y_test, Y_predicted, average='weighted'))
        Acc.append((accuracy_score(Y_test, Y_predicted)))
        mic += Mic[j]
        mac += Mac[j]
print "Micro-F1=", float(mic) / float(len(size)) * 1.0
print "Macro-F1=", float(mac) / float(len(size)) * 1.0
Example #17
d = d.to_numpy()  # as_matrix() was removed in pandas 1.0
#The results might vary due to the usage of random state with train and test split
X_train, X_test, y_train, y_test = train_test_split(d,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

# The classifier instance with the classifier as
# RandomForestClassifier
clf_cc = ClassifierChain(
    RandomForestClassifier(n_estimators=100, max_depth=200))

#fitting the model for the classification into the labels
clf_cc.fit(X_train, y_train.astype(float))
#predictions
predictions_cc = clf_cc.predict(X_test)
pred_prob = clf_cc.predict_proba(X_test)

#Finding the evaluation metrics
# micro recall, macro recall, micro precision, macro precision
# micro f1, macro f1, hamming loss
r1 = recall_score(y_true=y_test, y_pred=predictions_cc, average='micro')
r2 = recall_score(y_true=y_test, y_pred=predictions_cc, average='macro')
p1 = precision_score(y_true=y_test, y_pred=predictions_cc, average='micro')
p2 = precision_score(y_true=y_test, y_pred=predictions_cc, average='macro')
f1 = f1_score(y_true=y_test, y_pred=predictions_cc, average='micro')
f2 = f1_score(y_true=y_test, y_pred=predictions_cc, average='macro')
Score_cc_ham = hamming_loss(y_test, predictions_cc)

# Printing the evaluation metrics
print "Hamming Loss for classifier chains", Score_cc_ham
#Hamming Loss for Binary Relevance
hamm_loss_binary = hamming_loss(y_test, predictions_binary)

print("Hamming Loss:", hamm_loss_binary)

print("\n\n\nTraining data with Classifier Chains using Gaussian Naive Bayes")

#initialize Classifier Chains multi-label classifier
#with a gaussian naive bayes base classifier
classifier_cc = ClassifierChain(GaussianNB())

# train for Classifier Chaines
classifier_cc.fit(X_train, y_train)

# predict for Classifier Chains
predictions_cc = classifier_cc.predict(X_test)

#Hamming Loss for Classifier Chaines
hamm_loss_cc = hamming_loss(y_test, predictions_cc)

print("Hamming Loss:", hamm_loss_cc)

print("\n\n\nTraining data with Label Powerset using Gaussian Naive Bayes")

#initialize Label Powerset multi-label classifier
#with a gaussian naive bayes base classifier
classifier_lp = LabelPowerset(GaussianNB())

# train for Label Powerset
classifier_lp.fit(X_train, y_train)
Example #19
#strip <number> tokens from the data
train_data = train_data.iloc[:, 0].str.replace(r'<\d+>', '', regex=True)
test_data = test_data.iloc[:, 0].str.replace(r'<\d+>', '', regex=True)

#count the frequency of every word in vocabulary in each document
vectorizer = CountVectorizer()
train_data_vector = vectorizer.fit_transform(train_data)
test_data_vector = vectorizer.transform(test_data)

#train the classifier
model = ClassifierChain(RandomForestClassifier(n_jobs=-1, verbose=1))
model.fit(train_data_vector, train_labels)

#test the classifier
predicted_labels = model.predict(test_data_vector)
predicted_labels_train = model.predict(train_data_vector)
predicted_probabilities = model.predict_proba(test_data_vector)

#test accuracy
#~7% with random forest and binary relevance
#~7% with random forest and classifier chain
#~5% with random forest and label powerset
#~4% with multilabel knn
test_acc = accuracy_score(test_labels, predicted_labels)
train_acc = accuracy_score(train_labels, predicted_labels_train)
test_hamm_loss = hamming_loss(test_labels, predicted_labels)
test_cov_err = coverage_error(test_labels, predicted_probabilities.toarray())
test_rank_loss = label_ranking_loss(test_labels,
                                    predicted_probabilities.toarray())
test_avr_prec = label_ranking_average_precision_score(test_labels,
                                                      predicted_probabilities.toarray())
Example #20
# In[68]:

log_classifier.fit(x_train, y_train)
print('Accuracy_score using LabelPowerset is ',
      round(accuracy_score(y_test, log_classifier.predict(x_test)) * 100, 1),
      '%')
print('-------------------------------------------------')
print('roc_auc_score using LabelPowerset is ',
      roc_auc_score(y_test,
                    log_classifier.predict_proba(x_test).toarray()))

# # ClassifierChain
# * This method uses a chain of binary classifiers
# * Each new Classifier uses the predictions of all previous classifiers
# * This way the correlation b/w labels is taken into account

# In[69]:

chain = ClassifierChain(LogisticRegression())

# In[70]:

chain.fit(x_train, y_train)
print('Accuracy_score using ClassifierChain is ',
      round(accuracy_score(y_test, chain.predict(x_test)) * 100, 1), '%')
print('-------------------------------------------------')
print('roc_auc_score using ClassifierChain is ',
      roc_auc_score(y_test,
                    chain.predict_proba(x_test).toarray()))
Example #21
# # Accuracy
# print("Accuracy : {}".format(accuracy_score(Y_test,predict)*100))
# # Create and save with pickle
# save_mydocuments = open("pickled_algos/MultilabelBinaryRelevanceWithGausseanNB.pickle","wb")
# pickle.dump(clf, save_mydocuments)
# save_mydocuments.close()
# print("BR Method with GausseanNB classifier is done, time--- %s seconds ---" % (time.time() - start_time))

# 5. Classifier chain with MultinomialNB classifier (from scikit-multilearn)
# create and fit classifier
from skmultilearn.problem_transform import ClassifierChain
ClassifierChainMultinomialNB_classifier = ClassifierChain(MultinomialNB())
ClassifierChainMultinomialNB_classifier.fit(X_train, Y_train)

# Predictions
predictions = ClassifierChainMultinomialNB_classifier.predict(X_test)

# Accuracy
print("Accuracy : {}".format(accuracy_score(Y_test,predict)*100))
# Create and save with pickle
save_mydocuments = open("pickled_algos/MultilabelClassifierchainWithMultinomialNB.pickle","wb")
pickle.dump(ClassifierChainMultinomialNB_classifier, save_mydocuments)
save_mydocuments.close()
print("Classifier chain with MultinomialNB classifier is done, time--- %s seconds ---" % (time.time() - start_time))

# 6. Label Powerset with MultinomialNB classifier (from scikit-multilearn)
# create and fit classifier
from skmultilearn.problem_transform import LabelPowerset
LabelPowersetMultinomialNB_classifier = LabelPowerset(MultinomialNB())
LabelPowersetMultinomialNB_classifier.fit(X_train, Y_train)
Example #22
Y_train = train[:, 0:label_data.shape[1]]
X_train = train[:, label_data.shape[1]:]

Y_test = test[:, 0:label_data.shape[1]]
X_test = test[:, label_data.shape[1]:]

# ### Gaussian Naive Bayesian + Classifier Chain

# In[26]:

classifier = ClassifierChain(GaussianNB())

classifier.fit(X_train, Y_train)

predictions = classifier.predict(X_test)

accuracy_score(Y_test, predictions)

# ### Neural Network + Classifier Chain

# In[44]:

mlp = MLPClassifier(solver='lbfgs',
                    activation='relu',
                    alpha=1e-4,
                    hidden_layer_sizes=(50, 50),
                    random_state=1,
                    max_iter=1000,
                    verbose=10,
                    learning_rate_init=.1)
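The snippet breaks off before the chain is actually built around the network; presumably the MLP is wrapped the same way as the GaussianNB above. A sketch under that assumption:

classifier = ClassifierChain(mlp)
classifier.fit(X_train, Y_train)
predictions = classifier.predict(X_test)
accuracy_score(Y_test, predictions)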
Example #23
class MultiLabelClassifier(object):
    def __init__(self):
        self.total_data_df = pd.read_csv(os.path.join("data",
                                                      "cleaned_data.csv"),
                                         encoding="ISO-8859-1")
        self.data_df = self.total_data_df[~self.total_data_df.Tags.isnull()]
        self.total_records = len(self.data_df.index)
        self.train_df = self.data_df.tail(int(self.total_records * .67))
        self.test_df = self.data_df.head(int(self.total_records * .23))
        self.total_tag_list = self.get_tag_list()
        self.total_word_list = self.get_word_list()
        self.modified_train_df = pd.DataFrame()
        self.modified_test_df = pd.DataFrame()
        self.classifier = BernoulliNB()
        self.classifier_multilabel = ClassifierChain(BernoulliNB())
        self.classifier_dt = DecisionTreeRegressor(max_depth=2000)
        self.classifier_random_forest = RandomForestRegressor(max_depth=100)
        self.classifier_svm = svm.SVC(kernel='linear')

        self.test_tags = pd.DataFrame()

    def get_tag_list(self):
        tag_set = set()
        for tags in self.train_df.Tags:
            if tags is not nan:
                tag_set.update(tags.split(','))
        return sorted(list(tag_set))

    def get_word_list(self):
        word_set = set()
        for words in self.train_df.stemmed_words:
            if words is not nan:
                word_set.update(words.split(' '))
        return sorted(list(word_set))

    def setup_data_frame(self):
        for each in self.total_word_list:
            self.modified_train_df[each] = pd.Series([
                1 if each in words.split(' ') else 0
                for words in self.train_df.stemmed_words
            ],
                                                     index=self.train_df.index)
            self.modified_test_df[each] = pd.Series([
                1 if each in words.split(' ') else 0
                for words in self.test_df.stemmed_words
            ],
                                                    index=self.test_df.index)
        for tag in self.total_tag_list:
            self.modified_train_df[tag] = pd.Series([
                1 if tag in tags.split(',') else 0
                for tags in self.train_df.Tags
            ],
                                                    index=self.train_df.index)
            self.test_tags[tag] = pd.Series([
                1 if tag in tags.split(',') else 0
                for tags in self.test_df.Tags
            ],
                                            index=self.test_df.index)
        pca = PCA(n_components=966)
        principal = pca.fit(self.modified_train_df)
        # self.modified_train_df = principal
        return self.modified_train_df

    def multi_label_naive_bayes_classifier(self):
        test_rows = self.modified_test_df.values
        self.modified_test_df['predicted_labels'] = pd.Series(
            ['' for each in self.modified_test_df.index],
            index=self.modified_test_df.index)
        for tag in self.total_tag_list:
            self.classifier.fit(
                self.modified_train_df[self.total_word_list].values,
                self.modified_train_df[tag].tolist())
            self.modified_test_df[tag] = pd.Series(
                self.classifier.predict(test_rows),
                index=self.modified_test_df.index)
            self.modified_test_df['predicted_labels'] = pd.Series(
                [
                    each + ',' + tag if value == 1 else each
                    for each, value in zip(
                        self.modified_test_df.predicted_labels,
                        self.modified_test_df[tag])  # column lookup by the loop variable, not attribute access
                ],
                index=self.modified_test_df.index)

    def multi_label_naive_bayes_classifier_sklearn(self):
        test_rows = self.modified_test_df.values
        self.classifier_multilabel.fit(
            self.modified_train_df[self.total_word_list].values,
            self.modified_train_df[self.total_tag_list])
        c = self.classifier_multilabel.predict(test_rows)

        print(c.shape)
        print(sps.csc_matrix(self.test_tags.values).shape)
        print(accuracy_score(sps.csc_matrix(self.test_tags.values), c))

    def multi_label_decision_tree_regressor(self):
        test_rows = self.modified_test_df.values
        self.classifier_dt.fit(
            self.modified_train_df[self.total_word_list].values,
            self.modified_train_df[self.total_tag_list])
        predictions = self.classifier_dt.predict(test_rows)
        temp_df = pd.DataFrame(predictions, columns=self.total_tag_list)
        self.test_df['predicted_labels'] = pd.Series(
            ['' for each in self.modified_test_df.index],
            index=self.modified_test_df.index)
        for tag in self.total_tag_list:
            self.test_df['predicted_labels'] = pd.Series(
                [
                    each + ',' + tag if value == 1 else each for each, value in
                    zip(self.test_df.predicted_labels, temp_df[tag])
                ],
                index=self.test_df.index)
        self.test_df[['stemmed_words', 'Tags', 'predicted_labels'
                      ]].to_csv(os.path.join("data",
                                             "decision_tree_result.csv"),
                                index=False)

    def multi_label_random_forest(self):
        test_rows = self.modified_test_df.values
        self.classifier_random_forest.fit(
            self.modified_train_df[self.total_word_list].values,
            self.modified_train_df[self.total_tag_list])
        predictions = self.classifier_random_forest.predict(test_rows)
        temp_df = pd.DataFrame(predictions, columns=self.total_tag_list)
        self.test_df['predicted_labels'] = pd.Series(
            ['' for each in self.modified_test_df.index],
            index=self.modified_test_df.index)
        for tag in self.total_tag_list:
            self.test_df['predicted_labels'] = pd.Series(
                [
                    each + ',' + tag if value == 1 else each for each, value in
                    zip(self.test_df.predicted_labels, temp_df[tag])
                ],
                index=self.test_df.index)
        self.test_df[['stemmed_words', 'Tags', 'predicted_labels'
                      ]].to_csv(os.path.join("data",
                                             "random_forest_result.csv"),
                                index=False)

    def multi_label_svm(self):
        test_rows = self.modified_test_df.values
        tags = array(self.modified_train_df[self.total_tag_list])
        temp_df = pd.DataFrame()
        for col in range(tags.shape[1]):
            self.classifier_svm.fit(
                self.modified_train_df[self.total_word_list].values, tags[:,
                                                                          col])
            predictions = self.classifier_svm.predict(test_rows)
            temp_df[self.total_tag_list[col]] = pd.Series(predictions)
        #temp_df = pd.DataFrame(predictions, columns=self.total_tag_list)
        self.test_df['predicted_labels'] = pd.Series(
            ['' for each in self.modified_test_df.index],
            index=self.modified_test_df.index)
        for tag in self.total_tag_list:
            self.test_df['predicted_labels'] = pd.Series(
                [
                    each + ',' + tag if value == 1 else each for each, value in
                    zip(self.test_df.predicted_labels, temp_df[tag])
                ],
                index=self.test_df.index)
        self.test_df[['stemmed_words', 'Tags', 'predicted_labels'
                      ]].to_csv(os.path.join("data", "linear_svm.csv"),
                                index=False)
Example #24
def movie5_2():
    def clear_folders(my_path):
        """
		if len(os.listdir(my_path)) == 0:
			print('Empty:',my_path)
		else:
			print('Clearing:',my_path)
		"""
        filesToRemove = [os.path.join(my_path, f) for f in os.listdir(my_path)]
        for f in filesToRemove:
            os.remove(f)
        return

    import os
    my_path = os.getcwd()

    clear_folders(os.path.join(my_path, 'values'))
    import pandas as pd
    inputCSVfile = "IMDB-Movie-Data (1).csv"
    # reading csv file
    #print('Reading file:',inputCSVfile)
    try:
        my_data = pd.read_csv(inputCSVfile)
    except FileNotFoundError:
        #print('Error!\n',inputCSVfile,'doesnt exist.')
        exit()
    else:
        #print(my_data.head())
        newData = my_data.head()

    available_fields = my_data.columns.values.tolist()

    num_fields = len(available_fields)
    num_values = len(my_data['Rank'])

    #print('\nTotal number of fields:',num_fields)

    #for count,f in enumerate(available_fields):
    #	print(count+1,f)

    #print(num_values,' values.')

    #### Preprocessing
    """	print('\nChecking for empty fields in ',inputCSVfile,end='.\n')
		for f in available_fields:
			s = my_data[f].isnull().sum()
			print('Checking ',f,end='\t')
			if s == 0:
				#No missing fields
				print('OK')
			else:
				print('ERROR.',s,' values missing')
				#print('Removing Empty rows')
				#my_data = my_data.dropna()"""

    #print('\nFetching genre names.')

    genres = my_data['Genre']
    genre_list = list(genres)

    genre_all = []
    for my_genre in genre_list:
        g = my_genre.split(',')
        g_stripped = [x.strip() for x in g]  # remove white space
        genre_all.extend(g_stripped)

    #print('\nDetecting unique genre names.')

    genre_unique = list(set(genre_all))
    genre_unique = sorted(genre_unique)

    #for my_genre in genre_unique:
    #    	print('\t* ',my_genre)

    num_genre = len(genre_unique)
    #print('Number of genres:',num_genre)

    ###########
    #print('Creating label matrix.',end='\t')
    y = [[] for _ in range(num_values)]
    for i2 in range(num_values):
        for g2 in genre_unique:
            if (genres[i2].find(g2) != -1):
                y[i2].append(1)
            else:
                y[i2].append(0)
    #print('Done.\n')
    #print(y)
    ###########
    #print('\nCreating noun-verb dictionaries.')
    for g in genre_unique:

        gv = g + '_verb.txt'
        gn = g + '_noun.txt'

        pathV = os.path.join(my_path, 'values', gv)
        pathN = os.path.join(my_path, 'values', gn)

        # create/empty the per-genre dictionary files (and close the handles)
        open(pathV, "w+").close()
        open(pathN, "w+").close()

    def text_preprocessor(text_data, word_type):
        # Remove regular expressions and numbers
        contents = re.sub(r'[\W]', ' ', text_data)
        contents = re.sub("\d+", "", contents)

        # Remove short words
        shortword = re.compile(r'\W*\b\w{1,3}\b')
        contents = shortword.sub('', contents)

        # Tokenization
        txt_tokenized = word_tokenize(contents)
        # print(txt_tokenized)
        if word_type == 'verb':
            # POS tagging
            txt_pos = [
                token for token, pos in pos_tag(txt_tokenized)
                if pos.startswith('V')
            ]
        elif word_type == 'noun':
            # POS tagging
            txt_pos = [
                token for token, pos in pos_tag(txt_tokenized)
                if pos.startswith('N')
            ]
        # print(pos_tag(txt_tokenized))
        # print(txt_pos)

        # Stop words elimination
        stop_words = set(stopwords.words('english'))
        filtered_sentence = [w for w in txt_pos if not w in stop_words]

        # Stemming
        ps = PorterStemmer()
        stemmed_out = [ps.stem(w) for w in filtered_sentence]

        # print(filtered_sentence)
        return stemmed_out

    plot_data = my_data['Description']
    #feat_all = []
    for i in range(num_values):

        my_plot = plot_data[i]
        my_title = my_data['Title'].iloc[i]
        my_genres = genre_list[i]

        g = my_genres.split(',')
        g_stripped = [x.strip() for x in g]  # remove white space

        featN = text_preprocessor(my_plot, 'noun')
        featV = text_preprocessor(my_plot, 'verb')

        fN = '\n'.join(featN)
        fV = '\n'.join(featV)

        # Creating dictionary
        for gg in g_stripped:
            fileN_to_open = os.path.join(my_path, 'values', (gg + '_noun.txt'))
            fileV_to_open = os.path.join(my_path, 'values', (gg + '_verb.txt'))

            with open(fileN_to_open, 'a+') as fileN:
                fileN.write("\n")
                fileN.write(fN)

            with open(fileV_to_open, 'a+') as fileV:
                fileV.write("\n")
                fileV.write(fV)

            if gg == 'Fantasy':
                #print(fN,fV)
                a = 0
        """	if i<5:#displaying some values.
				print('\n\n',my_title.upper(),end=':\n')
				for f in featN:
						print(f,end='  ')
				
				for f in featV:
					print(f,end='  ')"""

    #feat_all.append(featN)
    '''
	# Select a genre to see its word cloud
	my_genre = 'Sport'
	print('\nShowing WordCloud for:\n\t\t\t',my_genre)
	to_show = []
	for n in range(num_values):
	    g = genres[n]
	    if my_genre in g.split(','):
		to_show.append(corpus[n])

	to_show = " ".join(to_show)

	from wordcloud import WordCloud, STOPWORDS 
	import matplotlib.pyplot as plt

	wc = WordCloud(width = 800, height = 800, 
		        background_color ='white', 
		        min_font_size = 10)
	wordcloud = wc.generate(to_show) 
	  
	# plot the WordCloud image                        
	plt.figure(figsize = (8, 8), facecolor = None) 
	plt.imshow(wordcloud) 
	plt.axis("off") 
	plt.tight_layout(pad = 0) 
	  
	plt.show()'''
    #########################################
    import os

    def computeTF(wordDict, bow):
        tfDict = {}
        bowCount = len(bow)
        for word, count in wordDict.items():
            tfDict[word] = count / float(bowCount)
        return tfDict

    fullname = ""
    my_path = os.getcwd()
    #print('\nReading from saved data:')
    data_dir = os.path.join(my_path, 'values')
    onlyfiles = [f for f in os.listdir(data_dir)]
    num_files = len(onlyfiles)
    wordSet = set([])
    wordDicts = []
    bow_vals = []

    for i3, f3 in enumerate(onlyfiles):
        #print('Fetching data from ',f3)
        fullname = os.path.join(data_dir, f3)
        with open(fullname, "r") as f:
            contents = f.read()
        doc = contents.strip()

        bow = doc.split("\n")
        w = ""
        if w in bow:
            bow.remove("")

        bow_vals.append(bow)

        for b in bow:
            wordSet.add(b)
        info = doc.split('\n')
        wordDicts.append(dict.fromkeys(wordSet, 0))

        for word in bow:
            wordDicts[i3][word] += 1

    # print(wordDicts)
    import pandas as pd
    pd.DataFrame(wordDicts)

    tf_idf_vals = []
    tf_vals = []
    idf_vals = []

    for i4 in range(num_files):
        tfBow = computeTF(wordDicts[i4], bow_vals[i4])
        tf_vals.append(tfBow)

    #print('\nFinding unique words...')
    words_uniq = set([])
    for tf in tf_vals:
        for k in tf.keys():
            words_uniq.add(k)

    words_uniq = list(words_uniq)
    #print('\nCreating feature matrix from dictionary...')
    feat_all = [[] for _ in range(num_values)]
    #print(feat_all)
    for i4 in (range(num_values)):
        #print(i4,end=' ')
        my_plot = plot_data[i4]
        featN = text_preprocessor(my_plot, 'noun')
        featV = text_preprocessor(my_plot, 'verb')
        fff = 0
        f5 = []
        flag = 0
        for w in words_uniq:
            flag = 0
            if w in featN:
                #for f_n in featN:
                #if f_n in words_uniq:
                for my_dict in tf_vals:
                    if w in my_dict.keys():
                        result = my_dict[w]
                        feat_all[i4].append(result)
                        break
            else:
                feat_all[i4].append(0)
    #feat_all[i4] = f5
    #print(feat_all)
    #print(fff)

    # print(y)

    X = feat_all

    #print('Performing classification.',end='\t')

    from skmultilearn.problem_transform import ClassifierChain
    from sklearn.naive_bayes import GaussianNB
    from sklearn import metrics
    from sklearn.metrics import accuracy_score
    # initialize classifier chains multi-label classifier
    # with a gaussian naive bayes base classifier
    classifier = ClassifierChain(GaussianNB())
    X = np.array(X)
    y = np.array(y)
    # train
    classifier.fit(X, y)

    # predict
    predictions = classifier.predict(X[1].reshape(1, -1))  # a single sample needs a 2-D row

    pred = predictions.toarray()
    result = list(np.where(pred == 1)[1])
    """print('\n\nPrediction:')
	for r in result:
	    	print('\t*',genre_unique[r])"""

    #joblib.dump(vectorizer, 'vectorizer.pkl')
    joblib.dump(classifier, 'model.pkl')

    with open("genres_files.txt", "w") as file:
        file.write(str(genre_unique))

    with open("words_file.txt", "w") as file:
        file.write(str(words_uniq))
    #print(metrics.accuracy_score(y,predictions))

    ################################################
    ###############################################

    #print('\n\n\t****Single File Test****\n')

    filename = "endgame_1.txt"  ####
    f = open(filename, "r")
    #print('Input File:\n\t',filename)

    with open("genres_files.txt", "r") as file:
        my_genres = eval(file.readline())

    with open("words_file.txt", "r") as file:
        words_uniq = eval(file.readline())

    contents = f.read()
    contents = contents.strip()

    #data = text_preprocessor(contents,'noun')
    #data = " ".join(data)
    #############################

    featN = text_preprocessor(contents, 'noun')
    fff = 0
    f5 = []
    flag = 0
    feat_all = []
    for w in words_uniq:
        flag = 0
        if w in featN:
            #for f_n in featN:
            #if f_n in words_uniq:
            for my_dict in tf_vals:
                if w in my_dict.keys():
                    result = my_dict[w]
                    feat_all.append(result)
                    break
        else:
            feat_all.append(0)

    #############################
    X = np.array(feat_all).reshape(1, -1)  # one sample, 2-D as classifiers expect

    ####X1 = X.todense()

    y_pred = classifier.predict(X)
    pred = y_pred.toarray()
    result = list(np.where(pred == 1)[1])
    #print('\n\nPrediction:')
    filename = "output_1.txt"  ####
    fo = open(filename, "w+")
    for r in result:
        #print('\t*',my_genres[r])
        fo.write(my_genres[r])
        fo.write("   ")
Example #25
Y_train = train.iloc[:,4:].values
Y_test = test.iloc[:,4:].values

print (Y_test)




"""
Naive Bayes Classifier
"""
#naiveBayes = GaussianNB()

classifier = ClassifierChain(GaussianNB())
classifier.fit(X_train_idf,Y_train)
predictions = classifier.predict(X_test_idf)
print (accuracy_score(Y_test,predictions))


"""
Get training and test dataset
"""

"""
naiveBayes.fit(X_train_idf,Y_train[:,97:98].flatten())
y_pred = naiveBayes.predict(X_test_idf)
"""



#print (naiveBayes.score(X_test_idf,Y_test[:,1:2].flatten()))
Example #26
tmp = np.zeros((19,3))
print(tmp.shape)
tmp = tmp.astype(int)
for i in range(len(train_labels)):
    tmp[i][train_labels[i]] = 1



###################################################################################      Multilabel Classifier     ######################################################################################

from skmultilearn.problem_transform import ClassifierChain
classifier = ClassifierChain(svm.SVC(decision_function_shape='ovo'))
classifier.fit(train_features,tmp)

p=classifier.predict(test_features)
print(p)



from skmultilearn.adapt import MLkNN
clsfr= MLkNN(k=1)
clsfr.fit(train_features,tmp)

p=clsfr.predict(test_features)
print(p)


###########################################################################      Search for videos with similar tags   ##################################################################################

import urllib
Example #27
def binary_relevance(train_data, test_data):
    """
    Runs and predicts without errors.
    Uses binary relevance.
    Only a single label result is kept, reducing the problem to multi-class
    single-label classification, although the real problem is multi-class
    multi-label.
    :param train_data:
    :param test_data:
    :return:
    """

    from skmultilearn.problem_transform import BinaryRelevance
    from skmultilearn.problem_transform import ClassifierChain
    from sklearn.naive_bayes import GaussianNB
    # use a Gaussian naive Bayes base classifier
    # classifier = BinaryRelevance(GaussianNB())  # initialize a binary relevance multi-label classifier
    classifier = ClassifierChain(GaussianNB())
    #X_train = train
    X_train, y_train = train_data.iloc[:, [0]], train_data.iloc[:, list(range(1, 21))]
    X_test, y_test = test_data.iloc[:, [0]], test_data.iloc[:, list(range(1, 21))]
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    # train
    temp = X_train.values.tolist()
    X = []
    for i in range(len(temp)):
        X.append(temp[i][0])
    x = tfidf.transform(X)
    y = y_train.values.tolist()
    Y = []  # single label index derived from the 20 indicator columns
    for j in range(len(y)):
        if "1" in y[j]:
            indexs = y[j].index("1")
            Y.append(indexs+1)
        else:
            # print("0")
            Y.append(0)  # effectively 21 classes, since empty label rows map to 0
    Y = np.array(Y)
    # can Y not hold multiple values??
    classifier.fit(x, Y)  # predicting plain class numbers?
    """
    Error raised: raise TypeError('no supported conversion for types: %r' % (args,))
    TypeError: no supported conversion for types: (dtype('O'),)
    Could that be the cause??
    """

    # predict
    temp = X_test.values.tolist()
    X_ts = []
    for i in range(len(temp)):
        X_ts.append(temp[i][0])
    x_test = tfidf.transform(X_ts)

    y_test = y_test.values.tolist()
    Y_test = []
    for j in range(len(y_test)):
        if "1" in y_test[j]:
            indexs = y_test[j].index("1")
            Y_test.append(indexs + 1)
        else:
            # print("0")
            Y_test.append(0)  # effectively 21 classes, since empty label rows map to 0
    Y_test = np.array(Y_test)  # as a single array
    unique_test, counts_test = np.unique(Y_test, return_counts=True)
    print("truth=", dict(zip(unique_test, counts_test)))

    predictions = classifier.predict(x_test)  # a csr_matrix at this point
    predictions = predictions.toarray()
    # does it contain any zeros??
    unique, counts = np.unique(predictions, return_counts=True)
    print("preditions=", dict(zip(unique, counts)))
    from sklearn.metrics import accuracy_score
    score = accuracy_score(Y_test, predictions)
    print(score)
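As the docstring concedes, collapsing each row of 20 indicator columns into one class index turns this into single-label classification, which defeats the point of ClassifierChain. A hedged sketch of the genuinely multi-label variant, keeping the full binary matrix (the string-to-int conversion is an assumption about this dataset's encoding; x and x_test are the TF-IDF matrices built in the function):

# keep all 20 indicator columns as the label matrix instead of one index
Y_multi = y_train.astype(int).values            # shape (n_samples, 20)
classifier = ClassifierChain(GaussianNB())
classifier.fit(x, Y_multi)
predictions = classifier.predict(x_test).toarray()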
Example #28
print('Performing classification.', end='\t')

from skmultilearn.problem_transform import ClassifierChain
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.metrics import accuracy_score
# initialize classifier chains multi-label classifier
# with a gaussian naive bayes base classifier
classifier = ClassifierChain(GaussianNB())
X = np.array(X)
y = np.array(y)
# train
classifier.fit(X, y)

# predict
predictions = classifier.predict(X[3].reshape(1, -1))  # a single sample needs a 2-D row

pred = predictions.toarray()
result = list(np.where(pred == 1)[1])
print('\n\nPrediction:')
for r in result:
    print('\t*', genre_unique[r])

#joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(classifier, 'model.pkl')

with open("genres_files.txt", "w") as file:
    file.write(str(genre_unique))

with open("words_file.txt", "w") as file:
    file.write(str(words_uniq))