Exemple #1
0
 def train(self,
           data: DataFrame,
           X_column: str,
           y_columns: List[str] = None):
     """Fit a TF-IDF + Label Powerset (LinearSVC) text classifier and persist it.

     The frame is split 80/20 with a fixed seed; only the training part is
     used here. The text column is cleaned with
     ``nlp_preprocess.Preprocess().clean_text``, vectorised with TF-IDF and
     the label columns are binarised with ``MultiLabelBinarizer``.

     :param data: frame holding the text column and the label columns
     :param X_column: name of the text column
     :param y_columns: label columns; defaults to every column except
         ``X_column``
     :return: ``(fitted LabelPowerset model, fitted TfidfVectorizer)``

     Side effects: writes ``powersetsvc.pickle`` and ``vec.pickle`` to the
     current working directory.
     """
     if y_columns is None:
         # Keep the frame's own column order so the default is deterministic
         # (the former set difference had arbitrary ordering).
         y_columns = [name for name in data.columns.to_list()
                      if name != X_column]
     X = data[X_column]
     y: DataFrame = data.drop(X_column, axis=1)
     # The held-out 20% is intentionally unused when training.
     xtrain, _xtest, ytrain, _ytest = train_test_split(X,
                                                       y,
                                                       random_state=42,
                                                       test_size=0.2)
     mlb = MultiLabelBinarizer()
     train_labels = mlb.fit_transform(ytrain[y_columns].values)
     train_cleaned = xtrain.copy(deep=True).apply(
         nlp_preprocess.Preprocess().clean_text)
     vectorizer = TfidfVectorizer()
     vectorised_train_documents = vectorizer.fit_transform(train_cleaned)
     powersetsvc = LabelPowerset(LinearSVC())
     powersetsvc.fit(vectorised_train_documents, train_labels)
     # Use context managers for both artefacts so the handles are always
     # closed, even if serialisation fails.
     with open("powersetsvc.pickle", "wb") as model_file:
         dump(powersetsvc, model_file)
     with open('vec.pickle', 'wb') as f1:
         dump(vectorizer, f1)
     return powersetsvc, vectorizer
def reduce_dimension(data1, label1, dimension_num, estimators=100):
    """Rank features by random-forest importance and split them at a cut-off.

    A ``LabelPowerset`` wrapper around a ``RandomForestClassifier`` is fitted
    on sparse copies of the inputs; the importances are then read from the
    forest instance that was handed to the wrapper.

    :param data1: 2-D feature matrix (samples x features)
    :param label1: 2-D label indicator matrix (samples x labels)
    :param dimension_num: number of top-ranked features to keep
    :param estimators: number of trees in the forest
    :return: ``(kept_indices, dropped_indices, sorted_importances)`` where the
        indices are ordered by decreasing importance
    """
    n_samples = label1.shape[0]

    # Matrix widths follow the inputs instead of the former hard-coded
    # 85 labels / 4189 features, so any dataset shape is accepted.
    y_train = sparse.lil_matrix((n_samples, np.shape(label1)[1]))
    y_train[:, :] = label1

    X_train = sparse.lil_matrix((n_samples, np.shape(data1)[1]))
    X_train[:, :] = data1

    forest = RandomForestClassifier(n_estimators=estimators, random_state=1)
    powerset = LabelPowerset(classifier=forest, require_dense=[False, True])
    powerset.fit(X_train, y_train)

    importances = forest.feature_importances_
    # Descending order: most important feature first.
    indices = np.argsort(importances)[::-1]
    features_importances = importances[indices]

    return indices[:dimension_num], indices[
        dimension_num:], features_importances
class MyLabelPowerSetFeatureSelect():
    """Label Powerset (Gaussian NB) model combined with chi2 feature selection.

    ``fit`` trains the Label Powerset model, converts the labels with the
    model's own ``transform`` and uses the result as the target for a
    ``SelectKBest(chi2, k=2)`` selection. ``transform`` then keeps only the
    selected columns.
    """

    def fit(self, X, y):
        """Fit the classifier and the feature selector; return ``self``."""
        self.LabelPowerSetObject = LabelPowerset(GaussianNB())
        self.LabelPowerSetObject.fit(X, y)

        # Labels as produced by the Label Powerset transformation.
        powerset_targets = self.LabelPowerSetObject.transform(y)

        # Select the two best-scoring features against those targets.
        self.X_new = SelectKBest(chi2, k=2)
        self.X_transformed = self.X_new.fit_transform(X, powerset_targets)

        # Remember which columns survived so transform() can reuse them.
        self.selected_attributes_indices = self.X_new.get_support(
            indices=True)
        return self

    def transform(self, X):
        """Return ``X`` restricted to the columns chosen during ``fit``."""
        kept_columns = self.selected_attributes_indices
        return X[:, kept_columns]

    def predict(self, X):
        """Delegate to the fitted Label Powerset model's ``predict``."""
        return self.LabelPowerSetObject.predict(X)

    def predict_proba(self, X):
        """Delegate to the fitted Label Powerset model's ``predict_proba``."""
        return self.LabelPowerSetObject.predict_proba(X)
Exemple #4
0
    def labelSet(self):
        """Fit Label Powerset (Gaussian NB) on the training split and print
        the ``accuracy_score`` obtained on the test split."""
        model = LabelPowerset(GaussianNB())
        model.fit(self.X_train, self.y_train)

        # Score the test-split predictions and report them.
        test_predictions = model.predict(self.X_test)
        print(accuracy_score(self.y_test, test_predictions))
def logistic_regression_classifier(train_x, train_y):
    """Fit and return a Label Powerset model over L1 logistic regression.

    :param train_x: training feature matrix
    :param train_y: training label indicator matrix
    :return: the fitted ``LabelPowerset`` model
    """
    from sklearn.linear_model import LogisticRegression
    from skmultilearn.problem_transform import LabelPowerset

    # The L1 penalty needs a solver that supports it. liblinear was the
    # implicit default when this was written; the current default (lbfgs)
    # raises ValueError for penalty='l1', so the solver is pinned explicitly.
    model = LabelPowerset(
        LogisticRegression(penalty='l1', solver='liblinear'))
    model.fit(train_x, train_y)
    return model
def naive_bayes_classifier(train_x, train_y):
    """Fit and return a Label Powerset model over Gaussian naive Bayes.

    :param train_x: training feature matrix
    :param train_y: training label indicator matrix
    :return: the fitted ``LabelPowerset`` model
    """
    from skmultilearn.problem_transform import (BinaryRelevance,
                                                LabelPowerset,
                                                ClassifierChain)
    from sklearn.naive_bayes import GaussianNB

    # Alternatives that were tried with the same base estimator:
    #    ClassifierChain(GaussianNB())
    #    BinaryRelevance(GaussianNB())
    model = LabelPowerset(GaussianNB())
    model.fit(train_x, train_y)
    return model
    def powerset(self):
        """Fit Label Powerset (logistic regression) on ``x_data``/``y_data``
        and return accuracy and micro-averaged F1 measured on the test data."""
        model = LabelPowerset(LogisticRegression())
        model.fit(self.x_data, self.y_data)
        predicted = model.predict(self.x_test)

        accuracy = accuracy_score(self.y_test, predicted)
        micro_f1 = f1_score(self.y_test, predicted, average='micro')
        return {'accuracy': accuracy, 'f1_score': micro_f1}
Exemple #8
0
def buildLBClassifier(xTrain, yTrain):
    """Fit and return a Label Powerset classifier over Gaussian naive Bayes.

    Both inputs are converted to C-contiguous arrays before fitting.
    """
    model = LabelPowerset(GaussianNB())
    features = np.ascontiguousarray(xTrain)
    targets = np.ascontiguousarray(yTrain)
    model.fit(features, targets)
    return model
Exemple #9
0
def classifiers(X_train, Y_train, X_test):
    """Fit three problem-transformation models over Gaussian NB and predict.

    :return: predictions of Binary Relevance, Classifier Chain and Label
        Powerset, in that order
    """
    models = (
        BinaryRelevance(GaussianNB()),
        ClassifierChain(GaussianNB()),
        LabelPowerset(GaussianNB()),
    )

    # Fit every model first, then predict, in the same order as the tuple.
    for model in models:
        model.fit(X_train, Y_train)

    br_pred, cc_pred, lp_pred = [model.predict(X_test) for model in models]
    return br_pred, cc_pred, lp_pred
Exemple #10
0
class LP():
    '''
        Label Powerset Method
    '''

    h = None

    def __init__(self, h=None):
        '''
            Wrap base classifier h in a LabelPowerset transformation.

            h defaults to a fresh LogisticRegression(). The default is
            created per instance: an estimator built in the signature would
            be constructed once and shared (and re-fitted) by every LP().
        '''
        if h is None:
            h = LogisticRegression()
        self.h = LabelPowerset(h)

    def fit(self, X, Y):
        '''
            Train the model on training data X,Y
        '''
        return self.h.fit(X, Y)

    def predict(self, X):
        '''
            Return predictions Y, given X
        '''
        return self.h.predict(X)

    def predict_proba(self, X):
        '''
            Return matrix P, where P[i,j] = P(Y[i,j] = 1 | X[i])
            (where i-th row/example, and j-th label)
        '''
        return self.h.predict_proba(X)
Exemple #11
0
def LabelPowerset_method(X_train, y_train, samples_leaf, samples_split):
    """
    Problem transformation --> Label Powerset method.

    :param X_train: input data
    :param y_train: corresponding label data
    :param samples_leaf: ``min_samples_leaf`` of the decision tree (cast to int)
    :param samples_split: ``min_samples_split`` of the decision tree (cast to int)
    :return: the fitted classifier, or ``None`` if anything failed (a warning
        is printed in that case)
    """
    fitted = None
    try:
        model = LabelPowerset(
            DecisionTreeClassifier(min_samples_leaf=int(samples_leaf),
                                   min_samples_split=int(samples_split)))
        model.fit(X_train, y_train)
        fitted = model
    except Exception as e:
        # Best-effort by design: report the problem and fall through to None.
        print("warning----标签Powerset|LabelPowerset_method----" + str(e))
    return fitted
Exemple #12
0
def runSet(model, x, y):
    """Cross-validate ``LabelPowerset(model)`` with ``KFold(splitNo)``.

    :return: ``(mean accuracy, mean squared error)`` averaged over the folds
    """
    fold_accuracy = []
    fold_mse = []
    splitter = KFold(n_splits=splitNo)
    for train_idx, test_idx in splitter.split(x):
        fold_model = LabelPowerset(model)
        fold_model.fit(x[train_idx], y[train_idx])
        predicted = fold_model.predict(x[test_idx])
        fold_accuracy.append(accuracy_score(y[test_idx], predicted))
        # The prediction is densified before computing the squared error.
        fold_mse.append(mean_squared_error(y[test_idx], predicted.toarray()))

    mean_accuracy = np.mean(np.array(fold_accuracy))
    mean_mse = np.mean(np.array(fold_mse))
    return mean_accuracy, mean_mse
Exemple #13
0
def evaluate_verse(embedding, labels, number_shuffles=10, train_perc=0.1):
    """Score an embedding with Label Powerset logistic regression.

    Runs ``number_shuffles`` stratified shuffle splits, training on a
    ``train_perc`` fraction each time.

    :return: ``(micro, macro)`` lists with one F1 value per split
    """
    from skmultilearn.problem_transform import LabelPowerset

    splitter = StratifiedShuffleSplit(n_splits=number_shuffles,
                                      test_size=1 - train_perc)
    micro, macro = [], []
    for train_index, test_index in splitter.split(embedding, labels):
        train_x, test_x = embedding[train_index], embedding[test_index]
        train_y, truth = labels[train_index], labels[test_index]

        model = LabelPowerset(LogisticRegression())
        model.fit(train_x, train_y)
        predicted = model.predict(test_x)

        micro.append(f1_score(truth, predicted, average='micro'))
        macro.append(f1_score(truth, predicted, average='macro'))
    return (micro, macro)
def get_train_test_lda(topic):
    """Build LDA-topic features for the train and test sets.

    Images returned by ``load()`` are scaled by 1/255 and passed through a
    headless, average-pooled VGG16. For every ``k`` in ``topic`` an LDA model
    with ``k`` topics is fitted on ``y_train``; its discretised document-topic
    matrix is appended to the training features, and a Label Powerset random
    forest (trained on the features as they were before that append) predicts
    the same columns for the test set.

    :param topic: iterable of topic counts, one LDA model per entry
    :return: ``(X_train[:, -28:], y_train, X_test[:, -28:], y_test)`` as arrays
    """
    model = VGG16(include_top=False, pooling='avg')

    x_train, y_train, x_test, y_test = load()

    x_train = x_train.astype('float32')
    x_train /= 255

    y_train = y_train.astype('int64')

    x_test = x_test.astype('float32')
    x_test /= 255
    y_test = y_test.astype('float32')

    X_train = model.predict(x_train)
    print(X_train.shape)
    X_test = model.predict(x_test)
    # X_train = model.predict(x_train)
    # X_test = model.predict(x_test)

    for k in topic:
        # Snapshot of the training features before this round's columns are
        # appended; the classifier below is trained on this snapshot.
        X_iter = X_train

        model_label = lda.LDA(n_topics=k, n_iter=1000)
        model_label.fit(y_train)
        doc_topic = model_label.doc_topic_
        x2 = doc_topic

        x = x2
        x = discretization_doc_topic(x)
        X_train = np.hstack((X_train, x))

        # multi-label learning to get x2
        classifier = LabelPowerset(RandomForestClassifier())
        classifier.fit(X_iter, x)

        # Densify the predicted topic indicators for the test set.
        x = np.array(sp.csr_matrix(classifier.predict(X_test)).toarray())
        # print(x)
        # x = alpha * x1 + (1-alpha) * x2
        # x = self.discretization_doc_topic(x)
        X_test = np.hstack((X_test, x))

    # NOTE(review): only the last 28 columns are returned -- presumably the
    # total number of appended topic columns; confirm against the caller.
    return np.array(X_train)[:, -28:], np.array(y_train), np.array(
        X_test)[:, -28:], np.array(y_test)
Exemple #15
0
def cross_validation_fold(index, splits_in, splits_out):
    """
    Run a single fold of k-fold cross-validation: split <index> is held out
    for validation and every other split is used for training.
    :param index: Index of the split to use as validation data
    :param splits_in: List of splits of the original dataset inputs
    :param splits_out: List of splits of the original dataset outputs
    :return: The result of ``validate`` for a Label Powerset LinearSVC trained
    on all the splits except <index> and then validated on split <index>
    """
    held_out_in = splits_in[index]
    held_out_out = splits_out[index]
    model = LabelPowerset(LinearSVC())

    # Everything except the held-out split forms the training set.
    training_in = splits_in[:index] + splits_in[index + 1:]
    training_out = splits_out[:index] + splits_out[index + 1:]
    model.fit(np.vstack(training_in), sparse_vstack(training_out))

    return validate(model,
                    held_out_in,
                    held_out_out,
                    return_predictions=False)
# The matrices are initially in lil_matrix format.
# The feature matrices are converted to compressed sparse row format; the
# label matrices are densified so they can be column-sliced below.

X_train = X_train.tocsr()
y_train = y_train.todense()
X_test = X_test.tocsr()
y_test = y_test.todense()

# Label columns kept for this experiment (same values as a set and a list).
label_set = set([0, 3, 4, 7, 8, 9, 12, 15, 17, 19, 20, 21])
label_list = [0, 3, 4, 7, 8, 9, 12, 15, 17, 19, 20, 21]

y_train = y_train[:, label_list]
y_test = y_test[:, label_list]

# Time training plus prediction (CPU time of this process).
start_time = time.process_time()
# classifier = LabelPowerset(RandomForestClassifier(random_state=0, n_estimators=10, n_jobs=-1))

# classifier = RandomForestClassifier(random_state=0, n_estimators=10)
# classifier = BinaryRelevance(classifier = LinearSVC(), require_dense = [False, True])
classifier = LabelPowerset(SGDClassifier(penalty='l2', alpha=0.01))
classifier.fit(X_train, y_train)
y_predicted = classifier.predict(X_test)
total_time = time.process_time() - start_time

print("Total time taken is : " + str(total_time))

# NOTE(review): jaccard_similarity_score was removed from scikit-learn in
# 0.23 (jaccard_score replaces it) -- confirm the pinned version.
print("Jaccard Similarity Score is : " +
      str(jaccard_similarity_score(y_test, y_predicted)))
print("Hamming Loss is : " + str(hamming_loss(y_test, y_predicted)))
# print("F1_Similarity score is : "+str(f1_score(y_test,y_predicted,average='macro')))
Exemple #17
0
# X_train, X_test, y_train, y_test = train_test_split(X, y)

# In[4]:

# Time-based hold-out: rows with Year > 2016 and Month > 8 form the test set,
# everything else is training data. Columns 5..11 are used as features.
X_test = weather[weather['Year'] > 2016]
X_test = X_test[X_test['Month'] > 8].iloc[:, 5:12]
X_train = weather[~weather.index.isin(X_test.index)].iloc[:, 5:12]
y_test = y[y.index.isin(X_test.index)]
y_train = y[~y.index.isin(X_test.index)]

# In[5]:

# model = LabelPowerset(GaussianNB())
model = LabelPowerset(KNeighborsClassifier(n_neighbors=20))
# model = MLkNN(k=18)
model.fit(X_train.values, y_train.values)
# NOTE(review): fit() receives raw arrays (.values) while predict() receives
# the DataFrame itself -- confirm the mismatch is intentional.
predictions = model.predict(X_test)
# Densify the prediction and label its columns with `description`.
result = predictions.toarray()
predicted = pd.DataFrame(result, columns=description)
print(accuracy_score(y_test, predictions))

# In[6]:


def columnName(row):
    """Concatenate the index labels of all entries equal to 1.

    Every label is followed by a single space, so a non-empty result ends
    with a trailing space; an empty selection yields ``''``.
    """
    flagged_labels = row[row == 1].index
    return ''.join(label + ' ' for label in flagged_labels)
# predict for Classifier Chains
predictions_cc = classifier_cc.predict(X_test)

# Hamming loss for Classifier Chains
hamm_loss_cc = hamming_loss(y_test, predictions_cc)

print("Hamming Loss:", hamm_loss_cc)

print("\n\n\nTraining data with Label Powerset using Gaussian Naive Bayes")

# initialize Label Powerset multi-label classifier
# with a Gaussian naive Bayes base classifier
classifier_lp = LabelPowerset(GaussianNB())

# train for Label Powerset
classifier_lp.fit(X_train, y_train)

# predict for Label Powerset
predictions_lp = classifier_lp.predict(X_test)

# Hamming loss for Label Powerset
hamm_loss_lp = hamming_loss(y_test, predictions_lp)

print("Hamming Loss:", hamm_loss_lp)

# Side-by-side summary of the three methods.
# (hamm_loss_binary is computed outside this fragment.)
print("\n\n\nAll hamming loss:")
print("Binary Relevance:\n", hamm_loss_binary)
print("Classifier Chains:\n", hamm_loss_cc)
print("Label Powerset:\n", hamm_loss_lp)

# Method names, in the same order as the summary above.
objects = ('BinaryRelevance', 'ClassifierChain', 'LabelPowerset')
Exemple #19
0
def train_SVC_LP(vec, label):
    """Fit and return a Label Powerset model over ``LinearSVC``.

    The wrapper is configured with ``require_dense=[False, True]``.
    """
    model = LabelPowerset(classifier=LinearSVC(), require_dense=[False, True])
    model.fit(vec, label)
    return model
Exemple #20
0
# NOTE(review): this fragment looks truncated -- the triple quotes below are
# unbalanced within the visible text, so which spans are live code and which
# are quoted-out cannot be determined from here. Left byte-identical.
X_train = [X_train[index] for index in range(0,len(X_train)) if X_train[index] != ['"']]
X_train = [X_train[index] for index in range(0,len(X_train)) if X_train[index] != ['']]
test_data = [test_data[index] for index in range(0,len(test_data)) if test_data[index] != ['"']]
test_data = [test_data[index] for index in range(0,len(test_data)) if test_data[index] != ['']]

with open(train, 'r') as file:
    for line in file.readlines():
        index = line.index(',')'''

# initialize Label Powerset multi-label classifier
# with a gaussian naive bayes base classifier
classifier = LabelPowerset(GaussianNB())

# train
classifier.fit(X, y)

# predict
predictions = classifier.predict(data)

accuracy_score(meta, predictions)
'''classifier = MLkNN(k=20)

# train
classifier.fit(data, meta)

# predict
predictions = classifier.predict(X)

accuracy_score(Y,predictions)'''
# Names of the label columns.
targets = y.columns.values

# Random 70/30 train/test split (no fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

#classifier = BinaryRelevance(GaussianNB())
#classifier = BinaryRelevance(tree.DecisionTreeClassifier())
#classifier = ClassifierChain(tree.DecisionTreeClassifier())
classifier = LabelPowerset(tree.DecisionTreeClassifier())
#classifier = MLkNN(k=5)
#classifier = BRkNNaClassifier(k=5)
#ptclassifier = LabelPowerset(tree.DecisionTreeClassifier())
#clusterer = IGraphLabelCooccurenceClusterer('fastgreedy', weighted=True, include_self_edges=True)
#classifier = LabelSpacePartitioningClassifier(ptclassifier, clusterer)

# NOTE(review): DataFrame.as_matrix() was removed in pandas 1.0 (use .values
# or .to_numpy()) -- confirm the pinned pandas version.
classifier.fit(X_train.as_matrix(), y_train.as_matrix())

predictions = classifier.predict(X_test.as_matrix())

loss = hamming_loss(y_test.as_matrix(), predictions)
# Python 2 print statement: this fragment does not parse under Python 3.
print 'Hamming loss: ', loss

#acc = accuracy_score(y_test.as_matrix(), predictions)
#print 'accuracy: ', acc

#lrloss = label_ranking_loss(y_test, predictions.toarray())
#lrap = label_ranking_average_precision_score(y_test, predictions.toarray())
#print "LRLOSS: best value 0: ", lrloss
#print "LRAP: best value 1: ", lrap

#macro_score = f1_score(y_test, predictions.toarray(), average='macro')
Exemple #22
0
# Fit the previously configured model and score it on the test split.
model.fit(X_train, y_train)

predicted = model.predict(X_test)
accuracy_score(y_test, predicted)

# Score of the predictions stored in dtree_predictions (computed elsewhere).
accuracy_score(y_test, dtree_predictions)

from skmultilearn.problem_transform import LabelPowerset
from sklearn.naive_bayes import GaussianNB

# initialize Label Powerset multi-label classifier
# with a LinearSVC base classifier
classifier = LabelPowerset(LinearSVC())

# train
classifier.fit(train_X, train_label)

# predict
predictions = classifier.predict(valid_X)
accuracy_score(valid_label, predictions)

from sklearn.metrics import accuracy_score
# NOTE(review): `predictions` was produced from valid_X but is compared with
# y_test here -- confirm the two refer to the same samples.
accuracy_score(y_test, predictions)

##
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                              GradientBoostingClassifier)

from sklearn.multiclass import OutputCodeClassifier
Exemple #23
0
                  metrics=['accuracy'])
    return model


def create_model_multiclass(input_dim, output_dim):
    """Build and compile a two-layer softmax network.

    :param input_dim: number of input features
    :param output_dim: number of output classes
    :return: compiled ``Sequential`` model (8-unit ReLU layer followed by a
        softmax layer, categorical cross-entropy loss, Adam optimizer)
    """
    network = Sequential()
    hidden_layer = Dense(8, input_dim=input_dim, activation='relu')
    network.add(hidden_layer)
    output_layer = Dense(output_dim, activation='softmax')
    network.add(output_layer)

    compile_options = {
        'loss': 'categorical_crossentropy',
        'optimizer': 'adam',
        'metrics': ['accuracy'],
    }
    network.compile(**compile_options)
    return network


# Keyword arguments handed to the Keras wrapper below.
KERAS_PARAMS = dict(epochs=10, batch_size=100, verbose=0)

# clf = BinaryRelevance(classifier=Keras(create_model_single_class, False, KERAS_PARAMS), require_dense=[True,True])
# clf.fit(X_train, y_train)
# result = clf.predict(X_test)
# print(result)

# Label Powerset over the multi-class Keras model; both inputs are required
# to be dense.
# NOTE(review): the second positional argument (True here, False in the
# commented Binary Relevance variant) presumably flags a multi-class model --
# confirm against the Keras wrapper's signature.
clf = LabelPowerset(classifier=Keras(create_model_multiclass, True,
                                     KERAS_PARAMS),
                    require_dense=[True, True])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy = %0.3f\n' % accuracy)
print(type(accuracy))
Exemple #24
0
    return np.round(res, 2)


# Ensemble of Label Powerset models, each trained on a random subset of k
# label columns; per-label votes are averaged and thresholded at 0.5.
X = data[0]
Y = data[1]
k = 3
n = 2 * len(Y[0, :])
# Label indices follow the label matrix instead of a hard-coded [0..4].
indices = list(range(Y.shape[1]))
lbs = []
index_store = []
# np.int was removed in NumPy 1.24; the builtin int is the same dtype.
recode_cnt = np.zeros(len(indices), dtype=int)
for i in range(n):
    np.random.shuffle(indices)
    # Slicing copies, so later shuffles do not alter the stored subset.
    index = indices[0:k]
    recode_cnt[index] = recode_cnt[index] + 1
    index_store.append(index)
    yt = Y[:, index]
    lb = LabelPowerset(LogisticRegression())
    lb.fit(X, yt)
    lbs.append(lb)

# Accumulate each model's predictions into the columns it was trained on.
result = np.zeros(Y.shape)
for i in range(n):
    res = lbs[i].predict(X)
    index = index_store[i]
    result[:, index] = result[:, index] + res
# assert(i in index_store != 0)
# A label that was never drawn has a zero count; its 0/0 becomes NaN, which
# compares False against 0.5 and therefore ends up predicted as 0.
result = result / recode_cnt
pred = (result > 0.5) + 0
print(accuary(pred, Y))
Exemple #25
0
    for p in range(0, 49):
        X_copy = X_orig[(p):(p +
                             1)]  #Slice the ith element from the numpy array
        y_copy = y_orig[(p):(p + 1)]
        X_model = X_orig
        y_model = y_orig  #Set X and y equal to samples and labels

        X_model = np.delete(
            X_model, p, axis=0
        )  #Create a new array to train the model with slicing out the ith item for LOOCV
        y_model = np.delete(y_model, p, axis=0)

        train_set = np.concatenate((X_model, y_model),
                                   axis=1)  #combine numpy matrices

        classifier.fit(X_model, y_model)
        prediction = classifier.predict(X_copy)
        #print(prediction.toarray(), y_copy)
        results = np.append(results, np.array(prediction.toarray()), axis=0)
        if np.array_equal(y_copy, prediction.toarray()):
            j = j + 1
            #print(y_copy, prediction.toarray())
        else:
            #print(y_copy, prediction.toarray())
            pass
    print(j / 49)

# Split the accumulated predictions by column.
# NOTE(review): `att` uses an integer column index while the other three use
# one-column slices, so their dimensionality differs -- confirm intended.
att = results[:, 0]
esc = results[:, 1:2]
ns = results[:, 2:3]
tang = results[:, 3:4]
def train_model(X_train=None, y_train=None, clf=None, X=None, y=None, cross_validate=False, k=3, load_model=False,
                tune_params=False, verbose=1):
    """
    Trains and returns classifier.

    Three modes, selected by the flags:

    * ``cross_validate`` and ``tune_params``: grid search over ``parameters``
      on ``X``/``y``; returns ``(best_score, best_estimator)``.
    * ``cross_validate`` only: k-fold ``cross_val_predict`` on ``X``/``y``,
      then a final fit on all data; returns ``(weighted F1, classifier)``.
    * otherwise: fit on ``X_train``/``y_train``, dump the model to disk and
      return the classifier alone.

    :param verbose: Verbosity
    :param tune_params: Flag indication whether to tune hyperparameters
    :param k: Number of folds for cross validation
    :param cross_validate: Flag indicating whether to use k-fold cross validation
    :param y: Array of full labels
    :param X: Matrix of predictors
    :param load_model: Flag indicating whether to load pre-trained model instead of re-training it
    :param clf: Classifier to train
    :param X_train: Features of training set
    :param y_train: Labels of training set
    :return: Trained classifier, or a ``(score, classifier)`` tuple when
        ``cross_validate`` is set
    """

    # Parameters for Grid Search
    parameters = [
        # {
        #     'classifier': [LinearSVC(class_weight='balanced', max_iter=10000)],
        #     'classifier__C': [1, 10],
        # },
        {
            'classifier': [SVC(class_weight='balanced', max_iter=10000)],
            'classifier__C': [1, 10],
            'classifier__gamma': ['scale'],
            'classifier__kernel': ['rbf']
        },
        {
            'classifier': [LogisticRegression(max_iter=10000, class_weight='balanced')],
            'classifier__C': [1, 10]
        },
    ]

    # If model needs to be retrained or trained for the first time
    if not load_model:
        # classifier = OneVsRestClassifier(clf)
        # classifier = BinaryRelevance(clf)
        classifier = LabelPowerset(clf)
        # classifier = ClassifierChain(clf)

    # If trained model can be loaded from file
    else:
        # NOTE(review): loads from '.../data/models/trained_model.joblib' but
        # the dump at the end of this function writes to
        # '.../data/trained_model.joblib' -- confirm which path is intended.
        classifier = load(os.path.join(ROOT_DIR, 'DataCollection/data/models/trained_model.joblib'))

    if cross_validate:
        if tune_params:
            print('Starting cross-validated parameter tuning ...')
            # The grid supplies the base classifier itself, so a bare
            # LabelPowerset() is searched and `clf` is not used in this branch.
            grid_search_clf = GridSearchCV(LabelPowerset(), parameters, cv=k, scoring='f1_weighted', verbose=verbose,
                                           n_jobs=multiprocessing.cpu_count())
            grid_search_clf.fit(X.astype(float), y.astype(float))
            cross_val_accuracy = grid_search_clf.best_score_
            classifier = grid_search_clf.best_estimator_
            print('Configuration results:')
            results = pd.DataFrame(grid_search_clf.cv_results_)[['params', 'mean_test_score', 'std_test_score']]
            for i, row in results.iterrows():
                print('Result for parameter setting %s:' % row['params'])
                print('Mean test score: %g' % row['mean_test_score'])
                print('Standard deviation test score: %g' % row['std_test_score'])
                print()

            print('Best found classifier:')
            print(classifier)

            return cross_val_accuracy, classifier

        else:
            print('Starting Cross Validation using %s ...' % str(classifier))
            predictions = cross_val_predict(classifier, X.astype(float), y.astype(float), cv=k,
                                            n_jobs=multiprocessing.cpu_count(),
                                            verbose=2)
            # Despite the variable name, this is a weighted F1 score.
            cross_val_accuracy = metrics.f1_score(y, predictions, average='weighted')  # TODO: Change back to 'samples'

            # Fit classifier to all available data
            # NOTE(review): unlike the other fit calls, X and y are not cast
            # to float here -- confirm this is intentional.
            classifier.fit(X, y)

            return cross_val_accuracy, classifier


    else:
        classifier.fit(X_train.astype(float), y_train.astype(float))
        dump(classifier, os.path.join(ROOT_DIR, 'DataCollection/data/trained_model.joblib'))

    return classifier
print('-------------------------------------------------')
# ROC AUC of the model held in `classifier`, using densified probabilities.
print('roc_auc_score using BinaryRelevance is ',
      roc_auc_score(y_test,
                    classifier.predict_proba(x_test).toarray()))

# # Label Powerset
# * Label Powerset creates a unique class for every label combination that is present in the training set; this way it makes use of label correlation.
# * The only problem with this method is that its computational complexity increases as the number of classes increases.

# In[67]:

log_classifier = LabelPowerset(LogisticRegression())

# In[68]:

log_classifier.fit(x_train, y_train)
# Accuracy is reported as a percentage rounded to one decimal place.
print('Accuracy_score using LabelPowerset is ',
      round(accuracy_score(y_test, log_classifier.predict(x_test)) * 100, 1),
      '%')
print('-------------------------------------------------')
print('roc_auc_score using LabelPowerset is ',
      roc_auc_score(y_test,
                    log_classifier.predict_proba(x_test).toarray()))

# # ClassifierChain
# * This method uses a chain of binary classifiers.
# * Each new classifier uses the predictions of all previous classifiers.
# * This way the correlation between labels is taken into account.

# In[69]:
# Vectorise each metadata column and append the resulting feature columns to
# the `plot` frame.
hdrs = ['Release Year', 'Origin/Ethnicity', 'Director', 'Cast', 'Title']
for hd in hdrs:
    if hd == 'Title':
        cv = TfidfVectorizer(analyzer='word',
                             stop_words=stopwords)  # TF-IDF for titles
    else:
        cv = CountVectorizer(analyzer='word',
                             min_df=2)  # determine whether or not
        # a specific director (actor, year, country) belongs to a specific film
    c = cv.fit_transform(x[hd].map(lambda pl: str(pl)))
    # NOTE(review): get_feature_names() was removed in scikit-learn 1.2
    # (get_feature_names_out() replaces it) -- confirm the pinned version.
    p = pandas.DataFrame(c.todense(),
                         index=x.index,
                         columns=cv.get_feature_names())
    plot = pandas.concat([plot, p], axis=1)
x_train, x_test, y_train, y_test = train_test_split(plot,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)
classif = LabelPowerset(LogisticRegression())
classif.fit(x_train, y_train)
pr = classif.predict(x_test)
print("Accuracy = ", accuracy_score(
    y_test, pr))  # share of films whose genres were predicted exactly
print("Jaccard similarity score =", jaccard_similarity_score(
    y_test, pr))  # similarity measure between the true and predicted genre sets
# precision - share of correct assignments of a given genre
# recall - ability to find the films of a given genre
# f1 - harmonic mean of precision and recall
# support - number of films of each genre in y_test
print(classification_report(y_test, pr, target_names=list(y.head())))
Exemple #29
0
def _sentence_slices(sent_index, *streams):
    """Yield, per sentence, the leading ``int(i) + 1`` items of every stream.

    After each yield the consumed prefix is dropped from every stream, so the
    next sentence starts where the previous one ended.
    """
    for last_token in sent_index:
        size = int(last_token) + 1
        yield tuple(stream[:size] for stream in streams)
        streams = tuple(stream[size:] for stream in streams)


def training_phase(filename):
    """Train the scope detector on a tab-separated corpus and pickle it.

    Steps, in order:

    1. Read ``filename``; every line becomes a list of tab-separated columns.
    2. For rows with at least 8 columns derive a cue flag (column 3 appears
       among the cue columns 7, 10, 13, ...) and the lower-cased POS tag
       (column 5).
    3. One-hot encode the POS tags into vectors of width
       ``max(100, number of distinct tags)``.
    4. For every cue build a feature sequence and the matching scope labels,
       pad both to length 100 and print their shapes.
    5. Fit ``LabelPowerset(RandomForestClassifier(n_estimators=50))`` on 60%
       of the samples, print the weighted F1 on the remaining 40% and dump
       the model to ``scope_detector.sav``.

    :param filename: path of the tab-separated training corpus
    :return: ``None``
    """
    # Context manager: the original left this handle open.
    with open(filename, 'r') as fp:
        data = fp.readlines()
    corpus = [raw.replace('\n', '').split('\t') for raw in data]

    # feature extraction
    cue_ground_truths = []
    postag = []
    for columns in corpus:
        width = len(columns)
        if width > 8:
            count = int((width - 7) / 3)
            cue_words = [columns[7 + (j * 3)] for j in range(count)]
            cue_ground_truths.append(1 if columns[3] in cue_words else 0)
            postag.append(columns[5].lower())
        elif width == 8:
            cue_ground_truths.append(0)
            postag.append(columns[5].lower())

    # Distinct tags in first-seen order (dicts preserve insertion order).
    pos_tags = list(dict.fromkeys(postag))
    tag_position = {tag: position for position, tag in enumerate(pos_tags)}

    # one-hot-encoding the postags, zero-padded up to width 100
    vector_width = max(len(pos_tags), 100)
    one_hot_postag = []
    for tag in postag:
        seq = [0.0] * vector_width
        seq[tag_position[tag]] = 1.0
        one_hot_postag.append(seq)

    # A single-column row ends a sentence; column 2 of the row before it is
    # used as the index of the sentence's last token.
    sent_index = [
        corpus[i - 1][2] for i, columns in enumerate(corpus)
        if len(columns) == 1
    ]
    corpus = [columns for columns in corpus if len(columns) != 1]

    # (A commented-out alternative extractor that used a window of POS tags
    # around the cue was removed here as dead code.)

    # Per cue: alternate the cue's POS vector with a one-hot of the cue
    # position, once per token of the sentence.
    # NOTE(review): the POS vector appended is always the cue's own
    # (target_pos[j]), not the k-th token's -- confirm this is intended.
    cue_postag_features = []
    for target_cues, target_pos in _sentence_slices(
            sent_index, cue_ground_truths, one_hot_postag):
        for j, is_cue in enumerate(target_cues):
            if is_cue != 1:
                continue
            features = []
            for _ in range(len(target_pos)):
                features.append(target_pos[j])
                features.append(
                    [1.0 if l == j else 0.0 for l in range(100)])
            cue_postag_features.append(features)

    cue_postag_features = keras.preprocessing.sequence.pad_sequences(
        cue_postag_features, maxlen=100)

    cue_postag_features = np.array(cue_postag_features)
    print(cue_postag_features.shape)

    # Per cue: one 0/1 scope label per token of the sentence, read from the
    # column right after the last non-'_' cue column of the cue row.
    ground_scope = []
    for target_sentence, target_cues in _sentence_slices(
            sent_index, corpus, cue_ground_truths):
        for j, is_cue in enumerate(target_cues):
            if is_cue != 1:
                continue
            cue_count = int((len(target_sentence[j]) - 7) / 3)
            paks = [target_sentence[j][7 + (k * 3)] for k in range(cue_count)]
            indi = 0
            for k in range(cue_count):
                if paks[k] != '_':
                    indi = k
            scope_column = 7 + (indi * 3) + 1
            ground_scope.append([
                1.0 if token[scope_column] != '_' else 0.0
                for token in target_sentence
            ])

    ground_scope = keras.preprocessing.sequence.pad_sequences(ground_scope,
                                                              maxlen=100)
    ground_scope = np.array(ground_scope)
    print(ground_scope.shape)

    X_train = cue_postag_features
    nsamples, col, vec = X_train.shape

    # Flatten each sample's (col, vec) feature grid into a single row.
    X_train_2d_bef = X_train.reshape((nsamples, col * vec))
    y_train_2d_bef = np.array(ground_scope, dtype=float)

    X_train_2d, X_validate_2d, y_train, y_validate = train_test_split(
        X_train_2d_bef, y_train_2d_bef, test_size=0.4, random_state=101)

    # Scores noted for the alternatives that were tried:
    # classifier = LabelPowerset(GaussianNB())#0.49
    # classifier = LabelPowerset(RandomForestClassifier(n_estimators=25)).577
    # classifier = LabelPowerset(MLPClassifier()) # 0.58
    # classifier = LabelPowerset(MLPClassifier(hidden_layer_sizes = 200, verbose=True)) #0.589
    # classifier = LabelPowerset(MLPClassifier(hidden_layer_sizes = 200, verbose=True, max_iter = 400)) #.584
    # classifier = LabelPowerset(GaussianNB())0.18
    # classifier = ClassifierChain(DecisionTreeClassifier()) 0.17
    # classifier = ClassifierChain(GaussianNB())0.02
    # classifier = ClassifierChain(GaussianNB())#0.2
    # classifier = BinaryRelevance(GaussianNB()).35 withoutcue
    # classifier = BinaryRelevance(DecisionTreeClassifier())0.15
    # classifier = BinaryRelevance(OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1))0.41
    # classifier = BinaryRelevance(RandomForestClassifier(n_estimators=25))0.29
    # classifier = BinaryRelevance(RandomForestClassifier(n_estimators=100))0.24
    # classifier = BinaryRelevance(RandomForestClassifier(n_estimators=3)).2900
    # classifier = BinaryRelevance(MLPClassifier()).22
    classifier = LabelPowerset(RandomForestClassifier(n_estimators=50))  #.586
    classifier.fit(X_train_2d, y_train)
    y_predict = classifier.predict(X_validate_2d)
    f1_measure = f1_score(
        y_validate,
        y_predict,
        average="weighted",
    )
    print(f1_measure)
    # Context manager: the original never closed the output handle.
    with open("scope_detector.sav", 'wb') as model_file:
        pickle.dump(classifier, model_file)
class LMWrapper(Model):
    """Baseline classifier: unigram term weights fed to a label-powerset NB.

    Each instance is a sequence of fields, each field a sequence of tokens.
    Tokens are rendered as ``w_<token>`` strings, vectorised with a
    ``TfidfVectorizer`` (unigrams, no normalisation), passed through a
    ``SelectKBest(k='all')`` selector and classified by
    ``LabelPowerset(MultinomialNB())``.
    """

    def __init__(self, C=1.0, use_idf=False, filename=None, **kwargs):
        """Create the untrained pipeline, optionally restoring it from disk.

        Args:
            C: Accepted for interface compatibility; not used by this class.
            use_idf: Forwarded to ``TfidfVectorizer(use_idf=...)``.
            filename: If not ``None``, the state written by :meth:`save` is
                loaded from this path, replacing the fresh components.
            **kwargs: Accepted and ignored.
        """
        # NOTE(review): the base-class initialiser is not invoked here --
        # confirm that `Model` needs no set-up of its own.
        self.lm = LabelPowerset(MultinomialNB())
        self.vect1 = TfidfVectorizer(norm=None,
                                     use_idf=use_idf,
                                     min_df=0.0,
                                     ngram_range=(1, 1))
        self.selector = sklearn.feature_selection.SelectKBest(k='all')
        # Number of label columns seen in fit(); used to pad predictions.
        self.output_dim = 0
        if filename is not None:
            self.load(filename)

    def build_representation(self, x, y=None, fit=False):
        """Turn token sequences into a dense feature matrix.

        Args:
            x: Iterable of instances; each instance is an iterable of fields
                and each field an iterable of tokens.
            y: Label rows; required when ``fit`` is true. The selector is
                fitted on ``argmax`` of each row.
            fit: When true, the vectorizer and the selector are fitted on
                this data before transforming it.

        Returns:
            The selected features as a dense matrix (``.todense()``).

        Raises:
            TypeError: If ``fit`` is true and ``y`` is ``None``.
        """
        if fit and y is None:
            # Fail before touching the vectorizer instead of part-way through.
            raise TypeError("y is required when fit=True")

        # Tokens equal to 0 are dropped (presumably padding -- TODO confirm).
        # Fields are separated by ' \n ', tokens by a single space.
        auxX = [
            ' \n '.join(
                ' '.join('w_' + str(token) for token in field if token != 0)
                for field in instance)
            for instance in x
        ]
        if fit:
            self.vect1.fit(auxX)
        auxX = self.vect1.transform(auxX)
        if fit:
            self.selector.fit(auxX, np.array([np.argmax(i) for i in y]))
        auxX = self.selector.transform(auxX)
        return auxX.todense()

    def _pad_to_output_dim(self, auxY):
        """Right-pad ``auxY`` with zero columns up to ``self.output_dim``."""
        missing = self.output_dim - auxY.shape[1]
        if missing <= 0:
            return auxY
        # NOTE(review): np.pad expects an array-like; confirm what type the
        # wrapped classifier actually returns from predict/predict_proba.
        return np.pad(auxY,
                      pad_width=((0, 0), (0, missing)),
                      mode='constant',
                      constant_values=0)

    def fit(self, x, y, validation_data=None):
        """Fit the vectorizer, selector and classifier on ``(x, y)``.

        Args:
            x: Training instances (see :meth:`build_representation`).
            y: Label matrix; ``y.shape[1]`` is recorded as ``output_dim``.
            validation_data: Optional ``(x_val, y_val)`` pair; when given,
                its accuracy is printed after training.

        Returns:
            Always ``None``.
        """
        auxY = y
        print('Build representation...')
        auxX = self.build_representation(x, auxY, fit=True)
        print('auxX shape:', auxX.shape)
        print('Fit model...')
        self.lm.fit(auxX, auxY)
        self.output_dim = auxY.shape[1]
        if validation_data is None:
            return None
        res = self.evaluate(validation_data[0], validation_data[1])
        print("Accuracy in validation data =", res)
        return None

    def predict(self, x):
        """Predict labels for ``x``.

        Returns:
            A three-element list ``[predictions, [], []]`` where the
            predictions are padded to ``output_dim`` columns if narrower.
        """
        auxX = self.build_representation(x, fit=False)
        print('Predicting baseline...')
        auxY = self._pad_to_output_dim(self.lm.predict(auxX))
        return [auxY, [], []]

    def predict_prob(self, x):
        """Predict label probabilities for ``x``.

        Returns:
            A three-element list ``[probabilities, [], []]`` where the
            probabilities are padded to ``output_dim`` columns if narrower.
        """
        auxX = self.build_representation(x, fit=False)
        print('Predicting baseline...')
        auxY = self._pad_to_output_dim(self.lm.predict_proba(auxX))
        return [auxY, [], []]

    def evaluate(self, x, y):
        """Return the accuracy score of the model on ``(x, y)``.

        ``y`` is reduced to one class index per row with ``argmax`` before
        scoring.
        """
        auxX = self.build_representation(x, fit=False)
        # NOTE(review): y_true is a vector of argmax indices while y_pred is
        # the raw output of self.lm.predict -- confirm the two are comparable.
        auxY = np.array([np.argmax(i) for i in y])
        return sklearn.metrics.accuracy_score(y_true=auxY,
                                              y_pred=self.lm.predict(auxX))

    def save(self, filename):
        """Pickle the classifier, vectorizer, selector and output_dim.

        The four objects are written back to back, in that order, with
        pickle protocol 4; :meth:`load` reads them in the same order.
        """
        # The context manager closes the file even if a dump raises.
        with open(filename, "wb") as f:
            pickle.dump(self.lm, f, protocol=4)
            pickle.dump(self.vect1, f, protocol=4)
            pickle.dump(self.selector, f, protocol=4)
            pickle.dump(self.output_dim, f, protocol=4)

    def load(self, filename):
        """Restore the state written by :meth:`save` from ``filename``."""
        # SECURITY: pickle.load can execute arbitrary code; only load files
        # that come from a trusted source.
        with open(filename, "rb") as f:
            self.lm = pickle.load(f)
            self.vect1 = pickle.load(f)
            self.selector = pickle.load(f)
            self.output_dim = pickle.load(f)