def train(self, data: DataFrame, X_column: str, y_columns: List[str] = None):
    """Fit a TF-IDF + Label Powerset (LinearSVC) text classifier and pickle it.

    The frame is split 80/20 with a fixed seed and only the 80% part is used
    for fitting; the held-out part is discarded here.

    :param data: Frame holding the text column and the label columns.
    :param X_column: Name of the column containing the raw text.
    :param y_columns: Label columns to learn. Defaults to every column of
        ``data`` except ``X_column``.
    :return: Tuple ``(fitted LabelPowerset model, fitted TfidfVectorizer)``.

    Side effects: writes ``powersetsvc.pickle`` and ``vec.pickle`` using
    relative paths.
    """
    if y_columns is None:
        # Keep the frame's own column order (deduplicated) so the selection
        # is deterministic instead of depending on set iteration order.
        y_columns = list(
            dict.fromkeys(col for col in data.columns.to_list() if col != X_column))

    X = data[X_column]
    y: DataFrame = data.drop(X_column, axis=1)

    # The split is kept so that training still sees the same 80% subset;
    # the test portions are intentionally unused during training.
    xtrain, _xtest, ytrain, _ytest = train_test_split(
        X, y, random_state=42, test_size=0.2)

    # NOTE(review): the fitted binarizer is neither returned nor persisted,
    # so callers cannot map predictions back to label names from here.
    mlb = MultiLabelBinarizer()
    train_labels = mlb.fit_transform(ytrain[y_columns].values)

    train_cleaned = xtrain.copy(deep=True).apply(
        nlp_preprocess.Preprocess().clean_text)

    vectorizer = TfidfVectorizer()
    vectorised_train_documents = vectorizer.fit_transform(train_cleaned)

    powersetsvc = LabelPowerset(LinearSVC())
    powersetsvc.fit(vectorised_train_documents, train_labels)

    # Context managers guarantee both files are flushed and closed.
    with open("powersetsvc.pickle", "wb") as model_file:
        dump(powersetsvc, model_file)
    with open('vec.pickle', 'wb') as f1:
        dump(vectorizer, f1)
    return powersetsvc, vectorizer
def reduce_dimension(data1, label1, dimension_num, estimators=100):
    """Rank features by random-forest importance under a Label Powerset model.

    :param data1: Two-dimensional feature matrix, one row per sample.
    :param label1: Two-dimensional label matrix with the same number of rows.
    :param dimension_num: Number of top-ranked features to select.
    :param estimators: Number of trees in the random forest.
    :return: Tuple ``(selected, rest, features_importances)`` where
        ``selected`` holds the indices of the ``dimension_num`` most
        important features, ``rest`` the remaining indices, and
        ``features_importances`` the importances sorted in descending order.
    """
    n_samples = np.shape(label1)[0]

    # Matrix widths come from the inputs instead of fixed constants, so the
    # function works for any number of labels and features.
    y_train = sparse.lil_matrix((n_samples, np.shape(label1)[1]))
    y_train[:, :] = label1
    X_train = sparse.lil_matrix((n_samples, np.shape(data1)[1]))
    X_train[:, :] = data1

    forest = RandomForestClassifier(n_estimators=estimators, random_state=1)
    powerset = LabelPowerset(classifier=forest, require_dense=[False, True])
    powerset.fit(X_train, y_train)

    # Importances are read from the forest instance handed to the wrapper.
    # NOTE(review): this relies on LabelPowerset fitting that instance in
    # place rather than a clone - confirm for the installed version.
    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]  # most important first
    features_importances = importances[indices]
    return indices[:dimension_num], indices[dimension_num:], features_importances
class MyLabelPowerSetFeatureSelect:
    """Label Powerset classifier bundled with a chi-squared feature selector.

    ``fit`` trains a Gaussian naive Bayes Label Powerset model on the data
    and, separately, selects the two best features against the
    powerset-transformed labels.
    """

    def fit(self, X, y):
        """Train the classifier and the feature selector; return ``self``."""
        # Label Powerset with a Gaussian naive Bayes base classifier.
        self.LabelPowerSetObject = LabelPowerset(GaussianNB())
        model = self.LabelPowerSetObject
        model.fit(X, y)

        # Feature scores are computed against the transformed labels.
        powerset_labels = model.transform(y)

        self.X_new = SelectKBest(chi2, k=2)
        selector = self.X_new
        self.X_transformed = selector.fit_transform(X, powerset_labels)

        # Remember which columns survived the selection.
        self.selected_attributes_indices = selector.get_support(indices=True)
        return self

    def transform(self, X):
        """Return only the columns of ``X`` chosen during ``fit``."""
        kept_columns = self.selected_attributes_indices
        return X[:, kept_columns]

    def predict(self, X):
        """Delegate prediction to the fitted Label Powerset model."""
        model = self.LabelPowerSetObject
        return model.predict(X)

    def predict_proba(self, X):
        """Delegate probability prediction to the fitted Label Powerset model."""
        model = self.LabelPowerSetObject
        return model.predict_proba(X)
def labelSet(self):
    """Fit a Gaussian naive Bayes Label Powerset model and print its accuracy.

    Trains on ``self.X_train`` / ``self.y_train`` and scores the predictions
    for ``self.X_test`` against ``self.y_test``. Nothing is returned.
    """
    model = LabelPowerset(GaussianNB())
    model.fit(self.X_train, self.y_train)

    predicted = model.predict(self.X_test)
    print(accuracy_score(self.y_test, predicted))
def logistic_regression_classifier(train_x, train_y):
    """Fit a Label Powerset model with an L1-penalised logistic regression.

    :param train_x: Training feature matrix.
    :param train_y: Training label indicator matrix.
    :return: The fitted ``LabelPowerset`` model.
    """
    from sklearn.linear_model import LogisticRegression
    from skmultilearn.problem_transform import LabelPowerset

    # The L1 penalty needs a solver that supports it. 'liblinear' is named
    # explicitly because releases that default to 'lbfgs' reject
    # penalty='l1', while older releases used liblinear by default.
    model = LabelPowerset(LogisticRegression(penalty='l1', solver='liblinear'))
    model.fit(train_x, train_y)
    return model
def naive_bayes_classifier(train_x, train_y):
    """Fit and return a Label Powerset model over Gaussian naive Bayes.

    ``ClassifierChain`` and ``BinaryRelevance`` are imported as drop-in
    alternatives for the wrapper but are not used.
    """
    from skmultilearn.problem_transform import BinaryRelevance
    from skmultilearn.problem_transform import LabelPowerset
    from skmultilearn.problem_transform import ClassifierChain
    from sklearn.naive_bayes import GaussianNB

    base_estimator = GaussianNB()
    model = LabelPowerset(base_estimator)
    model.fit(train_x, train_y)
    return model
def powerset(self):
    """Train a logistic-regression Label Powerset model and score it.

    Fits on ``self.x_data`` / ``self.y_data`` and evaluates on
    ``self.x_test`` / ``self.y_test``.

    :return: Dict with keys ``'accuracy'`` and ``'f1_score'`` (micro-averaged).
    """
    model = LabelPowerset(LogisticRegression())
    model.fit(self.x_data, self.y_data)
    predicted = model.predict(self.x_test)

    accuracy = accuracy_score(self.y_test, predicted)
    micro_f1 = f1_score(self.y_test, predicted, average='micro')
    return {'accuracy': accuracy, 'f1_score': micro_f1}
def buildLBClassifier(xTrain, yTrain):
    """Return a Gaussian naive Bayes Label Powerset model fitted on the data.

    Both inputs are converted to C-contiguous arrays before fitting.
    """
    model = LabelPowerset(GaussianNB())
    features = np.ascontiguousarray(xTrain)
    targets = np.ascontiguousarray(yTrain)
    model.fit(features, targets)
    return model
def classifiers(X_train, Y_train, X_test):
    """Fit three problem-transformation models and predict with each.

    All three wrap a Gaussian naive Bayes base classifier.

    :return: Tuple of predictions in the order
        (BinaryRelevance, ClassifierChain, LabelPowerset).
    """
    models = (
        BinaryRelevance(GaussianNB()),
        ClassifierChain(GaussianNB()),
        LabelPowerset(GaussianNB()),
    )
    # Fit every model first, then predict, in the same order.
    for model in models:
        model.fit(X_train, Y_train)
    relevance_pred, chain_pred, powerset_pred = [
        model.predict(X_test) for model in models
    ]
    return relevance_pred, chain_pred, powerset_pred
class LP():
    '''
        Label Powerset Method

        Thin wrapper that holds a ``LabelPowerset`` model in ``self.h`` and
        forwards ``fit`` / ``predict`` / ``predict_proba`` to it.
    '''

    h = None

    def __init__(self, h=None):
        '''
            :param h: Base classifier to wrap. When omitted, a new
                ``LogisticRegression()`` is created for this instance.
        '''
        # Build the default here, not in the signature: a default evaluated
        # in the signature is created once and would be shared (and refitted)
        # by every LP constructed without an argument.
        if h is None:
            h = LogisticRegression()
        self.h = LabelPowerset(h)

    def fit(self, X, Y):
        '''
            Train the model on training data X,Y
        '''
        return self.h.fit(X, Y)

    def predict(self, X):
        '''
            Return predictions Y, given X
        '''
        return self.h.predict(X)

    def predict_proba(self, X):
        '''
            Return matrix P, where P[i,j] = P(Y[i,j] = 1 | X[i])
            (where i-th row/example, and j-th label)
        '''
        return self.h.predict_proba(X)
def LabelPowerset_method(X_train, y_train, samples_leaf, samples_split):
    """
    Problem transformation --> Label Powerset method.

    Fits a Label Powerset model over a decision tree. Any exception raised
    while building or fitting the model is printed as a warning and turned
    into a ``None`` result.

    :param X_train: input data
    :param y_train: corresponding label data
    :param samples_leaf: value for ``min_samples_leaf`` (converted with ``int``)
    :param samples_split: value for ``min_samples_split`` (converted with ``int``)
    :return: the fitted classifier, or ``None`` on failure
    """
    try:
        model = LabelPowerset(
            DecisionTreeClassifier(min_samples_leaf=int(samples_leaf),
                                   min_samples_split=int(samples_split)))
        model.fit(X_train, y_train)
    except Exception as e:
        print("warning----标签Powerset|LabelPowerset_method----" + str(e))
        return None
    else:
        return model
def runSet(model, x, y):
    """Cross-validate ``model`` wrapped in Label Powerset.

    Uses ``KFold`` with the module-level ``splitNo`` number of splits.

    :return: Tuple ``(mean accuracy, mean squared error)`` across the folds.
    """
    fold_mse = []
    fold_accuracy = []
    for train_idx, test_idx in KFold(n_splits=splitNo).split(x):
        wrapped = LabelPowerset(model)
        wrapped.fit(x[train_idx], y[train_idx])
        predicted = wrapped.predict(x[test_idx])

        expected = y[test_idx]
        fold_accuracy.append(accuracy_score(expected, predicted))
        # mean_squared_error is given the dense form of the predictions.
        fold_mse.append(mean_squared_error(y[test_idx], predicted.toarray()))

    mean_mse = np.mean(np.array(fold_mse))
    mean_accuracy = np.mean(np.array(fold_accuracy))
    return mean_accuracy, mean_mse
def evaluate_verse(embedding, labels, number_shuffles=10, train_perc=0.1):
    """Score an embedding with repeated stratified shuffle splits.

    For every split a logistic-regression Label Powerset model is fitted on
    the training part and evaluated on the rest.

    :param embedding: Feature matrix, indexable by an index array.
    :param labels: Label matrix aligned with ``embedding``.
    :param number_shuffles: Number of shuffle splits.
    :param train_perc: Fraction of the samples used for training.
    :return: Tuple ``(micro, macro)`` of per-split F1 score lists.
    """
    from skmultilearn.problem_transform import LabelPowerset

    splitter = StratifiedShuffleSplit(
        n_splits=number_shuffles, test_size=1 - train_perc)

    micro, macro = [], []
    for train_index, test_index in splitter.split(embedding, labels):
        X_train, y_train = embedding[train_index], labels[train_index]
        X_test, y_test = embedding[test_index], labels[test_index]

        clf = LabelPowerset(LogisticRegression())
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)

        for scores, averaging in ((micro, 'micro'), (macro, 'macro')):
            scores.append(f1_score(y_test, preds, average=averaging))
    return (micro, macro)
def get_train_test_lda(topic):
    """Build LDA-topic features for the train and test sets.

    Image features come from a headless, average-pooled VGG16. For each
    topic count ``k`` in ``topic``, an LDA model is fitted on the training
    labels, its discretised document-topic matrix is appended to the training
    features, and a random-forest Label Powerset model (trained on the
    features as they were before that append) predicts the same columns for
    the test set.

    :param topic: Iterable of topic counts.
    :return: ``(X_train[:, -28:], y_train, X_test[:, -28:], y_test)`` as
        numpy arrays, i.e. only the last 28 feature columns are kept.
    """
    extractor = VGG16(include_top=False, pooling='avg')
    x_train, y_train, x_test, y_test = load()

    # Scale the inputs to [0, 1] as float32.
    x_train = x_train.astype('float32') / 255
    y_train = y_train.astype('int64')
    x_test = x_test.astype('float32') / 255
    y_test = y_test.astype('float32')

    X_train = extractor.predict(x_train)
    print(X_train.shape)
    X_test = extractor.predict(x_test)

    for k in topic:
        topic_model = lda.LDA(n_topics=k, n_iter=1000)
        topic_model.fit(y_train)
        train_topics = discretization_doc_topic(topic_model.doc_topic_)

        # Multi-label learning: predict the topic columns for the test set
        # from the features available before this iteration's columns.
        features_before = X_train
        X_train = np.hstack((X_train, train_topics))
        powerset = LabelPowerset(RandomForestClassifier())
        powerset.fit(features_before, train_topics)
        test_topics = np.array(
            sp.csr_matrix(powerset.predict(X_test)).toarray())
        X_test = np.hstack((X_test, test_topics))

    return (np.array(X_train)[:, -28:], np.array(y_train),
            np.array(X_test)[:, -28:], np.array(y_test))
def cross_validation_fold(index, splits_in, splits_out):
    """
    Run one "fold" of k-fold cross-validation: one split is the validation
    set and all the other splits together form the training data.

    :param index: Index of the split to use as validation data
    :param splits_in: List of splits of the original dataset inputs
    :param splits_out: List of splits of the original dataset outputs
    :return: The result of ``validate`` for a LinearSVC Label Powerset model
        trained on all the splits except <index> and then validated on
        split <index>
    """
    held_out_in = splits_in[index]
    held_out_out = splits_out[index]

    # Everything except split <index> is used for training.
    training_in = splits_in[:index] + splits_in[index + 1:]
    training_out = splits_out[:index] + splits_out[index + 1:]

    model = LabelPowerset(LinearSVC())
    model.fit(np.vstack(training_in), sparse_vstack(training_out))

    return validate(model, held_out_in, held_out_out, return_predictions=False)
# The matrices are initially in lil_matrix format # Converting them to compressed row matrix format X_train = X_train.tocsr() y_train = y_train.todense() X_test = X_test.tocsr() y_test = y_test.todense() label_set = set([0, 3, 4, 7, 8, 9, 12, 15, 17, 19, 20, 21]) label_list = [0, 3, 4, 7, 8, 9, 12, 15, 17, 19, 20, 21] y_train = y_train[:, label_list] y_test = y_test[:, label_list] start_time = time.process_time() # classifier = LabelPowerset(RandomForestClassifier(random_state=0, n_estimators=10, n_jobs=-1)) # classifier = RandomForestClassifier(random_state=0, n_estimators=10) # classifier = BinaryRelevance(classifier = LinearSVC(), require_dense = [False, True]) classifier = LabelPowerset(SGDClassifier(penalty='l2', alpha=0.01)) classifier.fit(X_train, y_train) y_predicted = classifier.predict(X_test) total_time = time.process_time() - start_time print("Total time taken is : " + str(total_time)) print("Jaccard Similarity Score is : " + str(jaccard_similarity_score(y_test, y_predicted))) print("Hamming Loss is : " + str(hamming_loss(y_test, y_predicted))) # print("F1_Similarity score is : "+str(f1_score(y_test,y_predicted,average='macro')))
# X_train, X_test, y_train, y_test = train_test_split(X, y) # In[4]: X_test = weather[weather['Year'] > 2016] X_test = X_test[X_test['Month'] > 8].iloc[:, 5:12] X_train = weather[~weather.index.isin(X_test.index)].iloc[:, 5:12] y_test = y[y.index.isin(X_test.index)] y_train = y[~y.index.isin(X_test.index)] # In[5]: # model = LabelPowerset(GaussianNB()) model = LabelPowerset(KNeighborsClassifier(n_neighbors=20)) # model = MLkNN(k=18) model.fit(X_train.values, y_train.values) predictions = model.predict(X_test) result = predictions.toarray() predicted = pd.DataFrame(result, columns=description) print(accuracy_score(y_test, predictions)) # In[6]: def columnName(row): name = '' idx = row[row == 1].index for i in range(len(idx)): name += (idx[i] + ' ') return name
# predict for Classifier Chains predictions_cc = classifier_cc.predict(X_test) #Hamming Loss for Classifier Chaines hamm_loss_cc = hamming_loss(y_test, predictions_cc) print("Hamming Loss:", hamm_loss_cc) print("\n\n\nTraining data with Label Powerset using Gaussian Naive Bayes") #initialize Label Powerset multi-label classifier #with a gaussian naive bayes base classifier classifier_lp = LabelPowerset(GaussianNB()) # train for Label Powerset classifier_lp.fit(X_train, y_train) # predict for Label Powerset predictions_lp = classifier_lp.predict(X_test) #Hamming Loss for Label PowerSet hamm_loss_lp = hamming_loss(y_test, predictions_lp) print("Hamming Loss:", hamm_loss_lp) print("\n\n\nAll hamming loss:") print("Binary Relevance:\n", hamm_loss_binary) print("Classifier Chains:\n", hamm_loss_cc) print("Label Powerset:\n", hamm_loss_lp) objects = ('BinaryRelevance', 'ClassifierChain', 'LabelPowerset')
def train_SVC_LP(vec, label):
    """Fit and return a LinearSVC Label Powerset model.

    The wrapper is configured with ``require_dense=[False, True]``.

    :param vec: Training feature matrix.
    :param label: Training label matrix.
    """
    model = LabelPowerset(classifier=LinearSVC(), require_dense=[False, True])
    model.fit(vec, label)
    return model
X_train = [X_train[index] for index in range(0,len(X_train)) if X_train[index] != ['"']] X_train = [X_train[index] for index in range(0,len(X_train)) if X_train[index] != ['']] test_data = [test_data[index] for index in range(0,len(test_data)) if test_data[index] != ['"']] test_data = [test_data[index] for index in range(0,len(test_data)) if test_data[index] != ['']] with open(train, 'r') as file: for line in file.readlines(): index = line.index(',')''' # initialize Label Powerset multi-label classifier # with a gaussian naive bayes base classifier classifier = LabelPowerset(GaussianNB()) # train classifier.fit(X, y) # predict predictions = classifier.predict(data) accuracy_score(meta, predictions) '''classifier = MLkNN(k=20) # train classifier.fit(data, meta) # predict predictions = classifier.predict(X) accuracy_score(Y,predictions)'''
# Script fragment (Python 2 print syntax): hold out 30% of the data, fit a
# decision-tree Label Powerset model and print its Hamming loss. X and y are
# defined before this excerpt.
targets = y.columns.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Alternative classifiers that were tried:
#classifier = BinaryRelevance(GaussianNB())
#classifier = BinaryRelevance(tree.DecisionTreeClassifier())
#classifier = ClassifierChain(tree.DecisionTreeClassifier())
classifier = LabelPowerset(tree.DecisionTreeClassifier())
#classifier = MLkNN(k=5)
#classifier = BRkNNaClassifier(k=5)
#ptclassifier = LabelPowerset(tree.DecisionTreeClassifier())
#clusterer = IGraphLabelCooccurenceClusterer('fastgreedy', weighted=True, include_self_edges=True)
#classifier = LabelSpacePartitioningClassifier(ptclassifier, clusterer)

# NOTE(review): DataFrame.as_matrix() may not exist in newer pandas
# releases - confirm the pinned pandas version.
classifier.fit(X_train.as_matrix(), y_train.as_matrix())
predictions = classifier.predict(X_test.as_matrix())

loss = hamming_loss(y_test.as_matrix(), predictions)
print 'Hamming loss: ', loss

# Further metrics, currently disabled:
#acc = accuracy_score(y_test.as_matrix(), predictions)
#print 'accuracy: ', acc
#lrloss = label_ranking_loss(y_test, predictions.toarray())
#lrap = label_ranking_average_precision_score(y_test, predictions.toarray())
#print "LRLOSS: best value 0: ", lrloss
#print "LRAP: best value 1: ", lrap
#macro_score = f1_score(y_test, predictions.toarray(), average='macro')
# Notebook-style fragment. Bare expressions such as accuracy_score(...) only
# display a value in an interactive session; their results are not stored.
# model, dtree_predictions and the data sets are defined before this excerpt.
model.fit(X_train, y_train)
predicted = model.predict(X_test)
accuracy_score(y_test, predicted)
accuracy_score(y_test, dtree_predictions)

from skmultilearn.problem_transform import LabelPowerset
from sklearn.naive_bayes import GaussianNB

# initialize Label Powerset multi-label classifier
# with a LinearSVC base classifier (GaussianNB is imported above but not
# used here)
classifier = LabelPowerset(LinearSVC())

# train
classifier.fit(train_X, train_label)

# predict
predictions = classifier.predict(valid_X)
accuracy_score(valid_label, predictions)

from sklearn.metrics import accuracy_score
# NOTE(review): these predictions were made for valid_X but are scored
# against y_test here - confirm the two refer to the same samples.
accuracy_score(y_test, predictions)

##
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                              GradientBoostingClassifier)
from sklearn.multiclass import OutputCodeClassifier
# NOTE(review): the next two lines are the tail of a model-building function
# whose beginning lies before this excerpt; they are kept unchanged.
                  metrics=['accuracy'])
    return model


def create_model_multiclass(input_dim, output_dim):
    """Build and compile a small dense softmax network.

    The network has one hidden ``Dense`` layer of 8 ReLU units followed by a
    softmax output layer, compiled with categorical cross-entropy loss, the
    Adam optimizer and the accuracy metric.

    :param input_dim: Input dimension of the hidden layer.
    :param output_dim: Number of units in the softmax output layer.
    :return: The compiled ``Sequential`` model.
    """
    # create model
    model = Sequential()
    model.add(Dense(8, input_dim=input_dim, activation='relu'))
    model.add(Dense(output_dim, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model


# Keyword arguments handed to the Keras wrapper below.
KERAS_PARAMS = dict(epochs=10, batch_size=100, verbose=0)

# Binary Relevance variant, currently disabled:
# clf = BinaryRelevance(classifier=Keras(create_model_single_class, False, KERAS_PARAMS), require_dense=[True,True])
# clf.fit(X_train, y_train)
# result = clf.predict(X_test)
# print(result)

# Label Powerset turns the task into one multi-class problem, so the
# multi-class model builder is used.
clf = LabelPowerset(classifier=Keras(create_model_multiclass, True, KERAS_PARAMS),
                    require_dense=[True, True])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print('Accuracy = %0.3f\n' % accuracy)
print(type(accuracy))
return np.round(res, 2) X = data[0] Y = data[1] k = 3 n = 2 * len(Y[0, :]) indices = [0, 1, 2, 3, 4] lbs = [] index_store = [] recode_cnt = np.linspace(0, 0, 5, dtype=np.int) for i in range(n): np.random.shuffle(indices) index = indices[0:3] recode_cnt[index] = recode_cnt[index] + 1 index_store.append(index) yt = Y[:, index] lb = LabelPowerset(LogisticRegression()) lb.fit(X, yt) lbs.append(lb) result = np.zeros(Y.shape) for i in range(n): res = lbs[i].predict(X) index = index_store[i] result[:, index] = result[:, index] + res # assert(i in index_store != 0) result = result / recode_cnt pred = (result > 0.5) + 0 print(accuary(pred, Y))
for p in range(0, 49): X_copy = X_orig[(p):(p + 1)] #Slice the ith element from the numpy array y_copy = y_orig[(p):(p + 1)] X_model = X_orig y_model = y_orig #Set X and y equal to samples and labels X_model = np.delete( X_model, p, axis=0 ) #Create a new array to train the model with slicing out the ith item for LOOCV y_model = np.delete(y_model, p, axis=0) train_set = np.concatenate((X_model, y_model), axis=1) #combine numpy matrices classifier.fit(X_model, y_model) prediction = classifier.predict(X_copy) #print(prediction.toarray(), y_copy) results = np.append(results, np.array(prediction.toarray()), axis=0) if np.array_equal(y_copy, prediction.toarray()): j = j + 1 #print(y_copy, prediction.toarray()) else: #print(y_copy, prediction.toarray()) pass print(j / 49) att = results[:, 0] esc = results[:, 1:2] ns = results[:, 2:3] tang = results[:, 3:4]
def train_model(X_train=None, y_train=None, clf=None, X=None, y=None, cross_validate=False, k=3, load_model=False,
                tune_params=False, verbose=1):
    """
    Train a Label Powerset classifier, optionally with cross validation or a grid search.

    :param X_train: Features of training set (plain training only)
    :param y_train: Labels of training set (plain training only)
    :param clf: Base classifier to wrap in Label Powerset
    :param X: Matrix of predictors (cross-validated modes)
    :param y: Array of full labels (cross-validated modes)
    :param cross_validate: Flag indicating whether to use k-fold cross validation
    :param k: Number of folds for cross validation
    :param load_model: Flag indicating whether to load a pre-trained model instead of wrapping ``clf``
    :param tune_params: Flag indicating whether to tune hyperparameters with a grid search
    :param verbose: Verbosity of the grid search
    :return: ``(cross_val_accuracy, classifier)`` when ``cross_validate`` is set, otherwise only the
        trained classifier
    """
    # Search space for the grid search.
    parameters = [
        # {
        #     'classifier': [LinearSVC(class_weight='balanced', max_iter=10000)],
        #     'classifier__C': [1, 10],
        # },
        {
            'classifier': [SVC(class_weight='balanced', max_iter=10000)],
            'classifier__C': [1, 10],
            'classifier__gamma': ['scale'],
            'classifier__kernel': ['rbf']
        },
        {
            'classifier': [LogisticRegression(max_iter=10000, class_weight='balanced')],
            'classifier__C': [1, 10]
        },
    ]

    if load_model:
        # Reuse a previously trained model from disk.
        # NOTE(review): this reads from 'DataCollection/data/models/...' while the plain-training branch
        # below writes to 'DataCollection/data/...' - confirm which location is intended.
        classifier = load(os.path.join(ROOT_DIR, 'DataCollection/data/models/trained_model.joblib'))
    else:
        # Other wrappers that were tried: OneVsRestClassifier(clf), BinaryRelevance(clf), ClassifierChain(clf)
        classifier = LabelPowerset(clf)

    if not cross_validate:
        # Plain training on the given training set; the result is persisted.
        classifier.fit(X_train.astype(float), y_train.astype(float))
        dump(classifier, os.path.join(ROOT_DIR, 'DataCollection/data/trained_model.joblib'))
        return classifier

    if not tune_params:
        print('Starting Cross Validation using %s ...' % str(classifier))
        predictions = cross_val_predict(classifier, X.astype(float), y.astype(float), cv=k,
                                        n_jobs=multiprocessing.cpu_count(), verbose=2)
        cross_val_accuracy = metrics.f1_score(y, predictions, average='weighted')  # TODO: Change back to 'samples'
        # Fit classifier to all available data
        classifier.fit(X, y)
        return cross_val_accuracy, classifier

    print('Starting cross-validated parameter tuning ...')
    grid_search_clf = GridSearchCV(LabelPowerset(), parameters, cv=k, scoring='f1_weighted', verbose=verbose,
                                   n_jobs=multiprocessing.cpu_count())
    grid_search_clf.fit(X.astype(float), y.astype(float))
    cross_val_accuracy = grid_search_clf.best_score_
    classifier = grid_search_clf.best_estimator_

    print('Configuration results:')
    results = pd.DataFrame(grid_search_clf.cv_results_)[['params', 'mean_test_score', 'std_test_score']]
    for _, setting in results.iterrows():
        print('Result for parameter setting %s:' % setting['params'])
        print('Mean test score: %g' % setting['mean_test_score'])
        print('Standard deviation test score: %g' % setting['std_test_score'])
        print()
    print('Best found classifier:')
    print(classifier)
    return cross_val_accuracy, classifier
# Notebook export: finish reporting the previous model, then fit and score a
# logistic-regression Label Powerset model. `classifier` and the train/test
# data are defined before this excerpt.
print('-------------------------------------------------')
print('roc_auc_score using BinaryRelevance is ',
      roc_auc_score(y_test, classifier.predict_proba(x_test).toarray()))

# # Label Powerset
# * Label Powerset creates a unique class for every label combination that
#   is present in the training set; this way it makes use of label correlation.
# * The drawback of this method is that its computational complexity grows as
#   the number of classes increases.

# In[67]:

log_classifier = LabelPowerset(LogisticRegression())

# In[68]:

log_classifier.fit(x_train, y_train)
# Accuracy is printed as a percentage rounded to one decimal place.
print('Accuracy_score using LabelPowerset is ',
      round(accuracy_score(y_test, log_classifier.predict(x_test)) * 100, 1), '%')
print('-------------------------------------------------')
print('roc_auc_score using LabelPowerset is ',
      roc_auc_score(y_test, log_classifier.predict_proba(x_test).toarray()))

# # ClassifierChain
# * This method uses a chain of binary classifiers.
# * Each new classifier uses the predictions of all previous classifiers.
# * This way the correlation between labels is taken into account.

# In[69]:
# Script fragment: vectorise several metadata columns, append them to the
# `plot` feature frame, then train and evaluate a logistic-regression Label
# Powerset model. `x`, `y`, `plot` and `stopwords` are defined before this
# excerpt.
hdrs = ['Release Year', 'Origin/Ethnicity', 'Director', 'Cast', 'Title']
for hd in hdrs:
    if hd == 'Title':
        cv = TfidfVectorizer(analyzer='word',
                             stop_words=stopwords)  # TF-IDF for the titles
    else:
        # Determine whether or not a particular director (actor, year,
        # country) belongs to a particular film.
        cv = CountVectorizer(analyzer='word', min_df=2)
    # Every value is converted to a string before vectorising.
    c = cv.fit_transform(x[hd].map(lambda pl: str(pl)))
    # NOTE(review): get_feature_names() may not exist in newer scikit-learn
    # releases - confirm the pinned version.
    p = pandas.DataFrame(c.todense(),
                         index=x.index,
                         columns=cv.get_feature_names())
    plot = pandas.concat([plot, p], axis=1)

x_train, x_test, y_train, y_test = train_test_split(plot,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)
classif = LabelPowerset(LogisticRegression())
classif.fit(x_train, y_train)
pr = classif.predict(x_test)

print("Accuracy = ", accuracy_score(
    y_test, pr))  # share of films whose genres were predicted exactly
print("Jaccard similarity score =", jaccard_similarity_score(
    y_test, pr))  # similarity of the original and predicted genre sets

# precision - share of correct assignments of a given genre
# recall - ability to find the films of a given genre
# f1 - harmonic mean of precision and recall
# support - number of films of each genre in y_test
print(classification_report(y_test, pr, target_names=list(y.head())))
def training_phase(filename):
    """Train a scope detector from a tab-separated corpus and pickle it.

    Each non-blank line of ``filename`` is split on tabs. Column 3 is
    compared with the cue columns (7, 10, 13, ...) to decide whether the
    token is a cue, column 5 (lower-cased) is one-hot encoded, and column 2
    of the line preceding each blank line is used as the index of the last
    token of a sentence. For every cue token one feature sequence and one
    0/1 scope vector are built, both padded/truncated to length 100, and a
    random-forest Label Powerset model is fitted on a 60/40 split.

    NOTE(review): the input looks like a CoNLL-style negation corpus
    (word / POS / cue-scope-event triples) - confirm against the data.
    NOTE(review): the loop nesting was reconstructed from a
    whitespace-collapsed source - confirm against the original file.

    :param filename: Path of the annotated corpus.
    :return: None. Prints the feature/label shapes and the weighted F1 score
        and writes the model to ``scope_detector.sav``.
    """
    cue_ground_truths = []
    # Read everything up front; the context manager closes the file.
    with open(filename, 'r') as fp:
        data = fp.readlines()
    corpus = []
    for i in data:
        i = i.replace('\n', '')
        corpus.append(i.split('\t'))

    postag = []
    # feature extraction
    for line in range(len(corpus)):
        if (len(corpus[line]) > 8):
            # One (cue, scope, event) triple per annotation after column 7.
            count = int((len(corpus[line]) - 7) / 3)
            pakodi = []
            for j in range(count):
                word = corpus[line][7 + (j * 3)]
                pakodi.append(word)
            if (corpus[line][3] in pakodi):
                cue_ground_truths.append(1)
                postag.append(corpus[line][5].lower())
            else:
                cue_ground_truths.append(0)
                postag.append(corpus[line][5].lower())
        elif (len(corpus[line]) == 8):
            cue_ground_truths.append(0)
            postag.append(corpus[line][5].lower())

    # Distinct tags in order of first appearance.
    pos_tags = []
    for i in postag:
        if i not in pos_tags:
            pos_tags.append(i)

    # one-hot-encoding the postags, zero-padded up to width 100
    one_hot_postag = []
    for i in range(len(postag)):
        seq = []
        if (postag[i] in pos_tags):
            req = pos_tags.index(postag[i])
            for j in range(len(pos_tags)):
                if (j == req):
                    seq.append(1.0)
                else:
                    seq.append(0.0)
            remaining = 100 - len(pos_tags)
            for j in range(remaining):
                seq.append(0.0)
        one_hot_postag.append(seq)

    # A single-field line marks a sentence boundary; column 2 of the line
    # before it is recorded as the sentence's last token index.
    sent_index = []
    for i in range(len(corpus)):
        if (len(corpus[i]) == 1):
            sent_index.append(corpus[i - 1][2])

    # Drop the boundary lines from the corpus.
    temp = corpus
    corpus = []
    for i in range(len(temp)):
        if (len(temp[i]) == 1):
            continue
        else:
            corpus.append(temp[i])

    # NOTE: a disabled variant that used only a fixed window of POS-tag
    # vectors before and after each cue (zero-padded at the sentence edges)
    # used to live here as commented-out code.

    # --- Features: one sequence per cue token --------------------------------
    temp_corpus = corpus
    temp_cue_ground_truths = cue_ground_truths
    temp_one_hot_postag = one_hot_postag
    features = []
    feature1 = []
    cue_postag_features = []
    for i in sent_index:
        i = int(i)
        # Consume one sentence from the front of each running list.
        target_sentence = temp_corpus[:i + 1]
        target_cues = temp_cue_ground_truths[:i + 1]
        target_pos = temp_one_hot_postag[:i + 1]
        temp_corpus = temp_corpus[i + 1:]
        temp_cue_ground_truths = temp_cue_ground_truths[i + 1:]
        temp_one_hot_postag = temp_one_hot_postag[i + 1:]
        for j in range(len(target_cues)):
            if (target_cues[j] == 1):
                # NOTE(review): this appends the cue's own tag vector
                # (index j) once per token rather than each token's vector
                # (index k) - confirm this is intended.
                for k in range(len(target_pos)):
                    features.append(target_pos[j])
                # One-hot vector of width 100 marking the cue position.
                for l in range(100):
                    if l == j:
                        feature1.append(1.0)
                    else:
                        feature1.append(0.0)
                features.append(feature1)
                feature1 = []
                cue_postag_features.append(features)
                features = []
    cue_postag_features = keras.preprocessing.sequence.pad_sequences(
        cue_postag_features, maxlen=100)
    cue_postag_features = np.array(cue_postag_features)
    print(cue_postag_features.shape)

    # --- Labels: one 0/1 scope vector per cue token --------------------------
    temp_corpus = corpus
    temp_cue_ground_truths = cue_ground_truths
    temp_one_hot_postag = one_hot_postag
    features = []
    ground_scope = []
    for i in sent_index:
        i = int(i)
        target_sentence = temp_corpus[:i + 1]
        target_cues = temp_cue_ground_truths[:i + 1]
        target_pos = temp_one_hot_postag[:i + 1]
        temp_corpus = temp_corpus[i + 1:]
        temp_cue_ground_truths = temp_cue_ground_truths[i + 1:]
        temp_one_hot_postag = temp_one_hot_postag[i + 1:]
        for j in range(len(target_cues)):
            if (target_cues[j] == 1):
                cue_count = int((len(target_sentence[j]) - 7) / 3)
                paks = []
                for k in range(cue_count):
                    word = target_sentence[j][7 + (k * 3)]
                    paks.append(word)
                # Use the last annotation whose cue column is filled.
                indi = 0
                for k in range(cue_count):
                    if (paks[k] != '_'):
                        indi = k
                # A token is in scope when the column right after that cue
                # column is not '_'.
                for k in range(len(target_sentence)):
                    thing = target_sentence[k][7 + (indi * 3) + 1]
                    if (thing != '_'):
                        features.append(1.0)
                    else:
                        features.append(0.0)
                ground_scope.append(features)
                features = []
    ground_scope = keras.preprocessing.sequence.pad_sequences(ground_scope,
                                                              maxlen=100)
    ground_scope = np.array(ground_scope)
    print(ground_scope.shape)

    # --- Training -----------------------------------------------------------
    # Flatten every (sequence, vector) sample into one row.
    X_train = cue_postag_features
    nsamples, col, vec = X_train.shape
    X_train_2d_bef = X_train.reshape((nsamples, col * vec))
    y_train_2d_bef = np.array(ground_scope, dtype=float)
    X_train_2d, X_validate_2d, y_train, y_validate = train_test_split(
        X_train_2d_bef, y_train_2d_bef, test_size=0.4, random_state=101)

    # Scores noted by the original author for the alternatives tried:
    #   LabelPowerset(RandomForestClassifier(n_estimators=25))        .577
    #   LabelPowerset(RandomForestClassifier(n_estimators=50))        .586
    #   LabelPowerset(MLPClassifier())                                0.58
    #   LabelPowerset(MLPClassifier(hidden_layer_sizes=200))          0.589
    #   LabelPowerset(MLPClassifier(hidden_layer_sizes=200,
    #                               max_iter=400))                    .584
    #   LabelPowerset(GaussianNB())                                   0.49 / 0.18
    #   ClassifierChain(DecisionTreeClassifier())                     0.17
    #   ClassifierChain(GaussianNB())                                 0.02 / 0.2
    #   BinaryRelevance(GaussianNB())                                 .35 (without cue)
    #   BinaryRelevance(DecisionTreeClassifier())                     0.15
    #   BinaryRelevance(OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)) 0.41
    #   BinaryRelevance(RandomForestClassifier(n_estimators=3/25/100)) .29 / 0.29 / 0.24
    #   BinaryRelevance(MLPClassifier())                              .22
    classifier = LabelPowerset(RandomForestClassifier(n_estimators=50))
    classifier.fit(X_train_2d, y_train)
    y_predict = classifier.predict(X_validate_2d)
    f1_measure = f1_score(
        y_validate,
        y_predict,
        average="weighted",
    )
    print(f1_measure)

    # Persist the model; the context manager flushes and closes the file.
    with open("scope_detector.sav", 'wb') as model_file:
        pickle.dump(classifier, model_file)
class LMWrapper(Model):
    """Bag-of-words baseline: TF-IDF features fed to a multinomial naive
    Bayes Label Powerset model.

    Inputs are nested sequences (instance -> field -> token); tokens equal
    to 0 are ignored when the text representation is built.
    """

    def __init__(self, C=1.0, use_idf=False, filename=None, **kwargs):
        """Create the pipeline, or restore it from ``filename`` if given.

        :param C: Accepted but not used by this class.
        :param use_idf: Passed to the ``TfidfVectorizer``.
        :param filename: Optional path of a file written by ``save``.
        :param kwargs: Accepted but not used by this class.
        """
        self.lm = LabelPowerset(MultinomialNB())
        self.vect1 = TfidfVectorizer(norm=None,
                                     use_idf=use_idf,
                                     min_df=0.0,
                                     ngram_range=(1, 1))
        self.selector = sklearn.feature_selection.SelectKBest(k='all')
        self.output_dim = 0
        if filename is not None:
            self.load(filename)

    def build_representation(self, x, y=None, fit=False):
        """Turn token sequences into a dense feature matrix.

        Each token becomes the word ``w_<token>``; the words of a field are
        joined by spaces and the fields of an instance by ``' \\n '``.

        :param x: Nested sequences instance -> field -> token.
        :param y: Label rows; only read when ``fit`` is true, where the
            argmax of each row is the selector's target.
        :param fit: Fit the vectorizer and the selector before transforming.
        :return: Dense matrix of the selected features.
        """
        auxX = [
            ' \n '.join([
                ' '.join(['w_' + str(token) for token in field if token != 0])
                for field in instance
            ]) for instance in x
        ]
        if fit:
            self.vect1.fit(auxX)
        auxX = self.vect1.transform(auxX)
        if fit:
            self.selector.fit(auxX, np.array([np.argmax(i) for i in y]))
        auxX = self.selector.transform(auxX)
        return auxX.todense()

    def fit(self, x, y, validation_data=None):
        """Fit the whole pipeline; optionally print validation accuracy.

        :param validation_data: Optional pair ``(x_val, y_val)``.
        :return: Always ``None``.
        """
        auxY = y
        print('Build representation...')
        auxX = self.build_representation(x, auxY, fit=True)
        print('auxX shape:', auxX.shape)
        print('Fit model...')
        self.lm.fit(auxX, auxY)
        # Remembered so predictions can be padded to the training width.
        self.output_dim = auxY.shape[1]
        if validation_data is None:
            return None
        res = self.evaluate(validation_data[0], validation_data[1])
        print("Accuracy in validation data =", res)
        return None

    def predict(self, x):
        """Return ``[predictions, [], []]`` for ``x``.

        Predictions narrower than the training label width are zero-padded
        on the right.
        """
        auxX = self.build_representation(x, fit=False)
        print('Predicting baseline...')
        auxY = self.lm.predict(auxX)
        #auxY = to_categorical(auxY)
        if auxY.shape[1] < self.output_dim:
            npad = ((0, 0), (0, self.output_dim - auxY.shape[1]))
            auxY = np.pad(auxY,
                          pad_width=npad,
                          mode='constant',
                          constant_values=0)
        return [auxY, [], []]

    def predict_prob(self, x):
        """Return ``[probabilities, [], []]`` for ``x``, padded like ``predict``."""
        auxX = self.build_representation(x, fit=False)
        print('Predicting baseline...')
        auxY = self.lm.predict_proba(auxX)
        if auxY.shape[1] < self.output_dim:
            npad = ((0, 0), (0, self.output_dim - auxY.shape[1]))
            auxY = np.pad(auxY,
                          pad_width=npad,
                          mode='constant',
                          constant_values=0)
        return [auxY, [], []]

    def evaluate(self, x, y):
        """Return the accuracy of the model's predictions for ``x``.

        The reference labels are the argmax of each row of ``y``.
        NOTE(review): the predictions are passed to ``accuracy_score`` as
        returned by the Label Powerset model, without an argmax - confirm
        the two label formats are comparable.
        """
        auxX = self.build_representation(x, fit=False)
        auxY = y
        auxY = np.array([np.argmax(i) for i in auxY])
        return sklearn.metrics.accuracy_score(y_true=auxY,
                                              y_pred=self.lm.predict(auxX))

    def save(self, filename):
        """Pickle the model, vectorizer, selector and output width to ``filename``."""
        # The context manager closes the file even if pickling raises.
        with open(filename, "wb") as f:
            pickle.dump(self.lm, f, protocol=4)
            pickle.dump(self.vect1, f, protocol=4)
            pickle.dump(self.selector, f, protocol=4)
            pickle.dump(self.output_dim, f, protocol=4)

    def load(self, filename):
        """Restore the state written by ``save`` (same object order)."""
        # NOTE: unpickling executes code embedded in the file; only load
        # files from a trusted source.
        with open(filename, "rb") as f:
            self.lm = pickle.load(f)
            self.vect1 = pickle.load(f)
            self.selector = pickle.load(f)
            self.output_dim = pickle.load(f)