def fasttextTest2mlp(self, fasttext_test_file, max_sequence_length, train_tokenizer, label_index_vector, vectorization_type):
    '''Prepare the test set in MLP format.'''
    test_corpus_df = utils_nb.get_articles_from_folder(fasttext_test_file)
    # Keep only the deweys that are also present in the training set.
    validDeweys = utils_nb.findValidDeweysFromTrain(test_corpus_df['dewey'], label_index_vector)
    test_corpus_df = test_corpus_df.loc[test_corpus_df['dewey'].isin(validDeweys)]
    self.y_test = test_corpus_df['dewey']
    self.x_test = test_corpus_df['text']
    self.correct_deweys = self.y_test.values
    test_labels = []
    for dewey in self.y_test:
        test_labels.append(label_index_vector[dewey.strip()])
    # Mirror the training-time transform: tokenize, vectorize, then pad/truncate
    # to max_sequence_length columns.
    test_sequences = train_tokenizer.texts_to_sequences(self.x_test)
    test_sequence_matrix = train_tokenizer.sequences_to_matrix(test_sequences, mode=vectorization_type)
    x_test = pad_sequences(test_sequence_matrix, maxlen=max_sequence_length)
    y_test = to_categorical(np.asarray(test_labels))
    return x_test, y_test
def testFolder2Fasttext(self, pathToTestSet):
    test_corpus_df = utils_nb.get_articles_from_folder(pathToTestSet)
    # Keep only articles whose dewey is among the valid deweys from training.
    test_corpus_df = test_corpus_df.loc[test_corpus_df['dewey'].isin(self.validDeweys)]
    self.y_test = test_corpus_df['dewey'].values
    self.x_test = test_corpus_df['text'].values
    self.correct_deweys = self.y_test
def fit(self):
    self.fasttext2sklearn()
    if self.vectorizationType == "tfidf":
        vectorizer = TfidfVectorizer()
        print("Starting vectorization")
        x_train_vectorized = vectorizer.fit_transform(self.x_train)
    elif self.vectorizationType == "count":
        vectorizer = CountVectorizer()
        x_train_vectorized = vectorizer.fit_transform(self.x_train)
    else:
        raise ValueError("Unknown vectorizationType: " + str(self.vectorizationType))
    self.vectorizer = vectorizer
    print("Vectorization done")
    test_corpus_df = utils_nb.get_articles_from_folder(self.test_set)
    test_corpus_df = test_corpus_df.loc[test_corpus_df['dewey'].isin(self.validDeweys)]
    self.y_test = test_corpus_df['dewey']
    self.x_test = test_corpus_df['text']
    self.correct_deweys = test_corpus_df['dewey'].values
    self.x_test = vectorizer.transform(self.x_test)
    print("Starting training")
    mod = LogisticRegression()
    mod.fit(x_train_vectorized, self.y_train)
    self.model = mod
    self.saveModel()
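# A minimal usage sketch for fit(). The class name and constructor arguments
# below are assumptions for illustration; the constructor is not shown in this
# section:
#
#   clf = SklearnClassifier(training_set='corpus/train', test_set='corpus/test',
#                           vectorizationType='tfidf', minNumArticlesPerDewey=5,
#                           strictArticleSelection=False)
#   clf.fit()                 # vectorizes, trains a LogisticRegression, saves the model
#   preds = clf.model.predict(clf.x_test)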
def fasttext2sklearn(self):
    corpus_df = utils_nb.get_articles_from_folder(self.training_set)
    # Drop deweys with fewer than minNumArticlesPerDewey articles.
    corpus_df = corpus_df.groupby('dewey')[['text', 'file_name', 'dewey']].filter(
        lambda x: len(x) >= self.minNumArticlesPerDewey)
    if self.strictArticleSelection:
        corpus_df = utils_nb.getStrictArticleSelection(corpus_df, self.minNumArticlesPerDewey)
    print(corpus_df.describe())
    self.y_train = corpus_df['dewey']
    self.x_train = corpus_df['text']
    self.findValidDeweysSklearn()
def fit_w_tuning(self):
    print("Tuning hyperparameters")
    self.fasttext2sklearn()
    if self.vectorizationType == "tfidf":
        vectorizer = TfidfVectorizer()
        print("Starting vectorization")
        x_train_vectorized = vectorizer.fit_transform(self.x_train)
    elif self.vectorizationType == "count":
        vectorizer = CountVectorizer()
        x_train_vectorized = vectorizer.fit_transform(self.x_train)
    else:
        raise ValueError("Unknown vectorizationType: " + str(self.vectorizationType))
    self.vectorizer = vectorizer
    print("Vectorization done")
    test_corpus_df = utils_nb.get_articles_from_folder(self.test_set)
    test_corpus_df = test_corpus_df.loc[test_corpus_df['dewey'].isin(self.validDeweys)]
    self.y_test = test_corpus_df['dewey']
    self.x_test = test_corpus_df['text']
    self.correct_deweys = test_corpus_df['dewey'].values
    self.x_test = vectorizer.transform(self.x_test)
    print("Starting training")
    optimization_params = {
        'C': [1, 10, 100],
        'penalty': ['l1', 'l2'],
        'class_weight': [None, 'balanced'],
        'multi_class': ['ovr', 'multinomial']
    }
    # saga handles both l1 and l2 penalties as well as the multinomial objective.
    mod = LogisticRegression(solver='saga')
    grid = GridSearchCV(mod, optimization_params, cv=4, scoring='accuracy')
    best_model = grid.fit(x_train_vectorized, self.y_train)
    # Report the best hyperparameters found by the grid search.
    print('Best penalty:', best_model.best_estimator_.get_params()['penalty'])
    print('Best C:', best_model.best_estimator_.get_params()['C'])
    print('Best class_weight:', best_model.best_estimator_.get_params()['class_weight'])
    print('Best multi_class:', best_model.best_estimator_.get_params()['multi_class'])
    self.model = best_model
    self.saveModel()
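# Note: GridSearchCV also exposes the winning combination and its score directly:
#
#   print(best_model.best_params_)   # e.g. {'C': 10, 'class_weight': None, ...}
#   print(best_model.best_score_)    # mean cross-validated accuracy for that combination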
def trainFolder2fasttext(self):
    corpus_df = utils_nb.get_articles_from_folder(self.trainingSetPath)
    # Drop deweys with fewer than minNumArticlesPerDewey articles.
    corpus_df = corpus_df.groupby('dewey')[['text', 'file_name', 'dewey']].filter(
        lambda x: len(x) >= self.minNumArticlesPerDewey)
    if self.strictArticleSelection:
        corpus_df = utils_nb.getStrictArticleSelection(corpus_df, self.minNumArticlesPerDewey)
    print(corpus_df.describe())
    self.y_train = corpus_df["dewey"].values
    self.x_train = corpus_df["text"].values
    print(len(self.y_train))
    print(len(self.x_train))
    self.findValidDeweysFT()
    # Write the training data in fasttext supervised format, one article per line.
    with open(self.tmp_ft_file_path, "w") as fasttextInputFile:
        for i in range(len(self.y_train)):
            fasttextInputFile.write("__label__" + str(self.y_train[i]) + " " + str(self.x_train[i]) + '\n')
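# The file written above uses fasttext's supervised-learning input format:
# each line starts with "__label__<dewey>" followed by the article text.
# With made-up deweys and text, two lines might look like:
#
#   __label__641 recipes for traditional dishes ...
#   __label__839 a novel about a family on the west coast ...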
def fasttextTrain2mlp(self, FASTTEXT_TRAIN_FILE, MAX_SEQUENCE_LENGTH, VOCAB_SIZE, VECTORIZATION_TYPE, minNumArticlesPerDewey):
    '''Convert the training set from fasttext format to MLP format.'''
    corpus_df = utils_nb.get_articles_from_folder(FASTTEXT_TRAIN_FILE)
    # Drop deweys with fewer than minNumArticlesPerDewey articles.
    corpus_df = corpus_df.groupby('dewey')[['text', 'file_name', 'dewey']].filter(
        lambda x: len(x) >= minNumArticlesPerDewey)
    if self.strictArticleSelection:
        corpus_df = utils_nb.getStrictArticleSelection(corpus_df, self.minNumArticlesPerDewey)
    print(corpus_df.describe())
    y_train = corpus_df['dewey']
    x_train = corpus_df['text']
    print(len(y_train))
    print(len(x_train))
    # Map each dewey to an integer label id, then index the training labels.
    labels_index = {}
    labels = []
    for dewey in set(y_train):
        labels_index[dewey] = len(labels_index)
    for dewey in y_train:
        labels.append(labels_index[dewey])
    print("Number of label indexes: {}".format(len(labels_index)))
    print("Number of labels: {}".format(len(labels)))
    num_classes = len(set(y_train))
    # Prepare the training set: tokenize, vectorize, then pad/truncate.
    tokenizer = Tokenizer(num_words=VOCAB_SIZE)
    tokenizer.fit_on_texts(x_train)
    sequences = tokenizer.texts_to_sequences(x_train)
    sequence_matrix = tokenizer.sequences_to_matrix(sequences, mode=VECTORIZATION_TYPE)
    x_train = pad_sequences(sequence_matrix, maxlen=MAX_SEQUENCE_LENGTH)
    y_train = to_categorical(np.asarray(labels))
    print(y_train.shape)
    return x_train, y_train, tokenizer, num_classes, labels_index
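# A standalone toy sketch of the label-indexing and one-hot step above
# (the dewey values are made up for illustration):
#
#   import numpy as np
#   from keras.utils import to_categorical
#
#   y_train = ['641', '839', '641', '004']
#   labels_index = {dewey: i for i, dewey in enumerate(set(y_train))}
#   labels = [labels_index[d] for d in y_train]
#   one_hot = to_categorical(np.asarray(labels))
#   # one_hot.shape == (4, 3): one row per article, one column per dewey class.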
def fasttextTrain2CNN(self, training_set, max_sequence_length, vocab_size, minNumArticlesPerDewey):
    '''Transform the training set from fasttext format to CNN format.'''
    corpus_df = utils_nb.get_articles_from_folder(training_set)
    # Drop deweys with fewer than minNumArticlesPerDewey articles.
    corpus_df = corpus_df.groupby('dewey')[['text', 'file_name', 'dewey']].filter(
        lambda x: len(x) >= minNumArticlesPerDewey)
    if self.strictArticleSelection:
        corpus_df = utils_nb.getStrictArticleSelection(corpus_df, self.minNumArticlesPerDewey)
    print(corpus_df.describe())
    self.y_train = corpus_df['dewey'].values
    self.x_train = corpus_df['text'].values
    print(len(self.y_train))
    print(len(self.x_train))
    # Map each dewey to an integer label id, then index the training labels.
    labels_index = {}
    labels = []
    for dewey in set(self.y_train):
        labels_index[dewey] = len(labels_index)
    for dewey in self.y_train:
        labels.append(labels_index[dewey])
    num_classes = len(set(self.y_train))
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(self.x_train)
    sequences = tokenizer.texts_to_sequences(self.x_train)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    x_train = pad_sequences(sequences, maxlen=max_sequence_length)
    y_train = to_categorical(np.asarray(labels))
    return x_train, y_train, word_index, labels_index, tokenizer, num_classes
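# Unlike fasttextTrain2mlp, which collapses each article into a fixed-width
# bag-of-words vector with sequences_to_matrix, this method preserves word
# order: each article becomes a padded sequence of word indices. A toy sketch
# with made-up sentences:
#
#   from keras.preprocessing.text import Tokenizer
#   from keras.preprocessing.sequence import pad_sequences
#
#   tokenizer = Tokenizer(num_words=1000)
#   tokenizer.fit_on_texts(['the cat sat', 'the dog barked'])
#   seqs = tokenizer.texts_to_sequences(['the cat sat'])   # -> [[1, 2, 3]]
#   pad_sequences(seqs, maxlen=5)                          # -> [[0, 0, 1, 2, 3]]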
def findFeatureImportance(self):
    self.fasttext2sklearn()
    if self.vectorizationType == "tfidf":
        vectorizer = TfidfVectorizer()
        print("Starting vectorization")
        x_train_vectorized = vectorizer.fit_transform(self.x_train)
    elif self.vectorizationType == "count":
        vectorizer = CountVectorizer(min_df=10)
        x_train_vectorized = vectorizer.fit_transform(self.x_train)
    else:
        raise ValueError("Unknown vectorizationType: " + str(self.vectorizationType))
    print("Vectorization done")
    test_corpus_df = utils_nb.get_articles_from_folder(self.test_set)
    test_corpus_df = test_corpus_df.loc[test_corpus_df['dewey'].isin(self.validDeweys)]
    self.y_test = test_corpus_df['dewey']
    self.x_test = test_corpus_df['text']
    self.correct_deweys = test_corpus_df['dewey'].values
    self.x_test = vectorizer.transform(self.x_test)
    print("Starting training")
    forest = ExtraTreesClassifier(n_estimators=250, random_state=0, max_features=20)
    forest.fit(x_train_vectorized, self.y_train)
    importances = forest.feature_importances_
    # Standard deviation of the importances across trees (useful as error bars).
    std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]
    # Write the feature ranking, most important first, to file.
    print("Feature ranking:")
    feature_names = vectorizer.get_feature_names()
    with open("/home/ubuntu/PycharmProjects_saved/tgpl_w_oop/randFeaturesImportance.txt", "w") as feature_importance_file:
        for f in range(x_train_vectorized.shape[1]):
            feature_importance_file.write("%d. feature %s (%f)\n"
                                          % (f + 1, feature_names[indices[f]], importances[indices[f]]))
    print(feature_names)
    self.model = forest
def predict(self, test_set):
    '''Test module for the CNN.'''
    test_corpus_df = utils_nb.get_articles_from_folder(test_set)
    k_top_labels = self.kPreds
    # Load the trained model.
    model = load_model(self.modelDir + '/model.bin')
    # Load the tokenizer fitted on the training set.
    with open(self.modelDir + '/tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
    # Load the dewey -> label id mapping.
    with open(self.modelDir + '/label_indexes.pickle', 'rb') as handle:
        labels_index = pickle.load(handle)
    # Keep only the deweys that are also present in the training set.
    validDeweys = utils_nb.findValidDeweysFromTrain(test_corpus_df['dewey'], labels_index)
    test_corpus_df = test_corpus_df.loc[test_corpus_df['dewey'].isin(validDeweys)]
    print(test_corpus_df.describe())
    self.y_test = test_corpus_df['dewey']
    self.x_test = test_corpus_df['text']
    self.correct_deweys = self.y_test.values
    test_labels = []
    for dewey in self.y_test:
        test_labels.append(labels_index[dewey])
    test_sequences = tokenizer.texts_to_sequences(self.x_test)
    self.x_test = pad_sequences(test_sequences, maxlen=self.maxSequenceLength)
    self.y_test = to_categorical(test_labels)
    test_score, self.accuracy = model.evaluate(self.x_test, self.y_test,
                                               batch_size=self.batchSize, verbose=1)
    self.predictions = utils_nb.prediction(model, self.x_test, k_top_labels, labels_index)
    # Append the results to a text file.
    with open(self.modelDir + "/result.txt", 'a') as result_file:
        result_file.write('test_set:' + test_set + '\n' +
                          'Test_accuracy:' + str(self.accuracy) + '\n\n')
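# Hypothetical usage, assuming an instance whose modelDir contains the files
# read above (model.bin, tokenizer.pickle, label_indexes.pickle):
#
#   cnn.predict('corpus/test')
#   print(cnn.accuracy)          # keras accuracy over the filtered test set
#   print(cnn.predictions[:3])   # top-k dewey predictions for the first articles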