Example #1
    def fasttextTest2mlp(self, fasttext_test_file, max_sequence_length, train_tokenizer, label_index_vector,
                         vectorization_type):
        '''Prepare the test set for the MLP'''
        test_corpus_df = utils_nb.get_articles_from_folder(fasttext_test_file)
        self.x_test = test_corpus_df['text']
        self.y_test = test_corpus_df['dewey']
        #self.correct_deweys = test_corpus_df['dewey'].values
        validDeweys = utils_nb.findValidDeweysFromTrain(self.y_test, label_index_vector)

        test_corpus_df = test_corpus_df.loc[test_corpus_df['dewey'].isin(validDeweys)]
        #print(test_corpus_df.describe())

        self.y_test = test_corpus_df['dewey']
        self.x_test = test_corpus_df['text']
        self.correct_deweys = self.y_test.values

        test_labels = []
        for dewey in self.y_test:
            test_labels.append(label_index_vector[dewey.strip()])

        test_sequences = train_tokenizer.texts_to_sequences(self.x_test)
        test_sequence_matrix = train_tokenizer.sequences_to_matrix(test_sequences, mode=vectorization_type)

        x_test = pad_sequences(test_sequence_matrix, maxlen=max_sequence_length)
        y_test = to_categorical(np.asarray(test_labels))

        return x_test, y_test
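The core of this conversion is plain Keras preprocessing. A minimal, self-contained sketch of the same steps (the texts, labels, and the dewey-to-index map below are invented for illustration):

    import numpy as np
    from keras.preprocessing.text import Tokenizer
    from keras.utils import to_categorical

    texts = ["norsk økonomi og renter", "nevrale nett for tekst"]
    label_index = {"330": 0, "006": 1}      # hypothetical dewey -> index map
    labels = [label_index[d] for d in ["330", "006"]]

    tokenizer = Tokenizer(num_words=50)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    x = tokenizer.sequences_to_matrix(sequences, mode="count")  # shape (2, 50)
    y = to_categorical(np.asarray(labels))                      # shape (2, 2)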
Example #2
    def testFolder2Fasttext(self, pathToTestSet):
        test_corpus_df = utils_nb.get_articles_from_folder(pathToTestSet)
        # Keep only articles whose dewey label is in the valid set from training
        test_corpus_df = test_corpus_df.loc[test_corpus_df['dewey'].isin(self.validDeweys)]
        self.y_test = test_corpus_df['dewey'].values
        self.x_test = test_corpus_df['text'].values
        self.correct_deweys = self.y_test
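The dewey filtering above is ordinary pandas boolean indexing with isin; a toy illustration with made-up rows:

    import pandas as pd

    df = pd.DataFrame({"dewey": ["004", "330", "839"], "text": ["a", "b", "c"]})
    valid_deweys = {"004", "330"}           # hypothetical valid-dewey set
    df = df.loc[df["dewey"].isin(valid_deweys)]   # drops the "839" row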
Example #3
    def fit(self):
        self.fasttext2sklearn()
        #tfidf = TfidfVectorizer(norm = 'l2', min_df = 2, use_idf = True, smooth_idf= False, sublinear_tf = True, ngram_range = (1,4),
        #                        max_features = 20000)

        if self.vectorizationType == "tfidf":
            vectorizer = TfidfVectorizer()
            print("Starting transformation")
            x_train_vectorized = vectorizer.fit_transform(self.x_train)
        elif self.vectorizationType == "count":
            vectorizer = CountVectorizer()
            x_train_vectorized = vectorizer.fit_transform(self.x_train)
        else:
            raise ValueError("Unknown vectorization type: " + self.vectorizationType)
        self.vectorizer = vectorizer
        print("Transformation complete")
        test_corpus_df = utils_nb.get_articles_from_folder(self.test_set)
        test_corpus_df = test_corpus_df.loc[test_corpus_df['dewey'].isin(
            self.validDeweys)]

        self.y_test = test_corpus_df['dewey']
        self.x_test = test_corpus_df['text']
        self.correct_deweys = test_corpus_df['dewey'].values

        x_test_vectorized = vectorizer.transform(self.x_test)
        self.x_test = x_test_vectorized
        print("Starter trening")

        mod = LogisticRegression()
        mod.fit(x_train_vectorized, self.y_train)
        #self.model = logMod
        self.model = mod
        self.saveModel()
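Stripped of the corpus loading, fit() is a standard scikit-learn text pipeline: vectorize the training text, fit a classifier, and transform the test set with the same fitted vectorizer. A runnable sketch with invented documents and labels:

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression

    x_train = ["norsk økonomi", "nevrale nett", "pengepolitikk og renter"]
    y_train = ["330", "006", "330"]         # invented dewey labels

    vectorizer = TfidfVectorizer()
    mod = LogisticRegression()
    mod.fit(vectorizer.fit_transform(x_train), y_train)
    print(mod.predict(vectorizer.transform(["renter og økonomi"])))  # e.g. ['330']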
Example #4
    def fasttext2sklearn(self):

        corpus_df = utils_nb.get_articles_from_folder(self.training_set)
        # Filter out deweys with fewer than minNumArticlesPerDewey articles
        corpus_df = corpus_df.groupby('dewey')[[
            'text', 'file_name',
            'dewey']].filter(lambda x: len(x) >= self.minNumArticlesPerDewey)
        if self.strictArticleSelection:
            corpus_df = utils_nb.getStrictArticleSelection(
                corpus_df, self.minNumArticlesPerDewey)
        print(corpus_df.describe())
        self.y_train = corpus_df['dewey']
        self.x_train = corpus_df['text']
        self.findValidDeweysSklearn()
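The frequency cut-off uses GroupBy.filter, which keeps or drops whole dewey groups at once. A small illustration with fabricated rows:

    import pandas as pd

    corpus_df = pd.DataFrame({
        "dewey": ["330", "330", "006"],
        "text": ["t1", "t2", "t3"],
        "file_name": ["a", "b", "c"],
    })
    # groups with fewer than 2 articles are dropped, so the lone "006" row goes
    corpus_df = corpus_df.groupby("dewey")[["text", "file_name", "dewey"]].filter(
        lambda x: len(x) >= 2)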
Example #5
    def fit_w_tuning(self):
        print("Her vil det tunes")
        self.fasttext2sklearn()
        #tfidf = TfidfVectorizer(norm = 'l2', min_df = 2, use_idf = True, smooth_idf= False, sublinear_tf = True, ngram_range = (1,4),
        #                        max_features = 20000)

        if self.vectorizationType == "tfidf":
            vectorizer = TfidfVectorizer()
            print("Starting transformation")
            x_train_vectorized = vectorizer.fit_transform(self.x_train)
        elif self.vectorizationType == "count":
            vectorizer = CountVectorizer()
            x_train_vectorized = vectorizer.fit_transform(self.x_train)
        else:
            raise ValueError("Unknown vectorization type: " + self.vectorizationType)
        print("Transformation complete")
        test_corpus_df = utils_nb.get_articles_from_folder(self.test_set)
        test_corpus_df = test_corpus_df.loc[test_corpus_df['dewey'].isin(
            self.validDeweys)]

        self.y_test = test_corpus_df['dewey']
        self.x_test = test_corpus_df['text']
        self.correct_deweys = test_corpus_df['dewey'].values

        x_test_vectorized = vectorizer.transform(self.x_test)
        self.x_test = x_test_vectorized
        print("Starter trening")
        optimization_params = {
            'C': [1, 10, 100, 1000],
            'penalty': ['l1', 'l2'],
            'class_weight': [None, 'balanced'],
            'multi_class': ['ovr', 'multinomial']
        }
        # saga is the one solver that supports every combination in this grid
        # (both l1/l2 penalties and multinomial multi_class)
        mod = LogisticRegression(solver='saga')
        grid = GridSearchCV(mod, optimization_params, cv=4, scoring='accuracy')
        best_model = grid.fit(x_train_vectorized, self.y_train)

        # View best hyperparameters
        print('Best Penalty:',
              best_model.best_estimator_.get_params()['penalty'])
        print('Best C:', best_model.best_estimator_.get_params()['C'])
        print('Best class weight:',
              best_model.best_estimator_.get_params()['class_weight'])
        print('Best multi_class:',
              best_model.best_estimator_.get_params()['multi_class'])
        #mod.fit(x_train_vectorized, self.y_train)
        #self.model = logMod
        self.model = best_model
        self.saveModel()
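fit_w_tuning() wraps the same pipeline in an exhaustive grid search. A self-contained sketch of the GridSearchCV mechanics on a stock dataset (iris stands in for the vectorized corpus):

    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV

    X, y = load_iris(return_X_y=True)
    params = {"C": [0.1, 1, 10], "class_weight": [None, "balanced"]}
    grid = GridSearchCV(LogisticRegression(solver="saga", max_iter=5000),
                        params, cv=4, scoring="accuracy")
    best_model = grid.fit(X, y)
    print(best_model.best_params_)          # the winning hyperparameters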
Example #6
    def trainFolder2fasttext(self):
        corpus_df = utils_nb.get_articles_from_folder(self.trainingSetPath)
        # Filter out deweys with fewer than minNumArticlesPerDewey articles
        corpus_df = corpus_df.groupby('dewey')[['text', 'file_name', 'dewey']].filter(
                    lambda x: len(x) >= self.minNumArticlesPerDewey)

        if self.strictArticleSelection:
            corpus_df = utils_nb.getStrictArticleSelection(corpus_df, self.minNumArticlesPerDewey)
        print(corpus_df.describe())
        self.y_train = corpus_df["dewey"].values
        self.x_train = corpus_df["text"].values
        print(len(self.y_train))
        print(len(self.x_train))
        self.findValidDeweysFT()
        # Write one "__label__<dewey> <text>" line per article; the with-block
        # guarantees the file is closed
        with open(self.tmp_ft_file_path, "w") as fasttextInputFile:
            for label, text in zip(self.y_train, self.x_train):
                fasttextInputFile.write("__label__" + str(label) + " " + str(text) + '\n')
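Once the "__label__<dewey> <text>" file exists, it can be fed straight to fastText's supervised trainer. A hedged sketch assuming the fasttext Python package and an already-written input file ("train.ft" is a placeholder for self.tmp_ft_file_path):

    import fasttext

    model = fasttext.train_supervised(input="train.ft", epoch=25)
    print(model.predict("norsk økonomi og renter", k=3))  # top-3 dewey guesses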
Example #7
    def fasttextTrain2mlp(self, FASTTEXT_TRAIN_FILE, MAX_SEQUENCE_LENGTH, VOCAB_SIZE, VECTORIZATION_TYPE, minNumArticlesPerDewey):
        '''Convert the training set from fastText format to MLP format'''

        corpus_df = utils_nb.get_articles_from_folder(FASTTEXT_TRAIN_FILE)

        corpus_df = corpus_df.groupby('dewey')[['text', 'file_name', 'dewey']].filter(lambda x: len(x) >= minNumArticlesPerDewey)
        if self.strictArticleSelection:
            corpus_df = utils_nb.getStrictArticleSelection(corpus_df, self.minNumArticlesPerDewey)

        print(corpus_df.describe())

        y_train = corpus_df['dewey']
        x_train = corpus_df['text']
        print(len(y_train))
        print(len(x_train))
        labels_index = {}
        labels = []
        for dewey in set(y_train):
            label_id = len(labels_index)
            labels_index[dewey] = label_id
        for dewey in y_train:
            labels.append(labels_index[dewey])
        print("length of labels indexes: {} ".format(len(labels_index)))
        # print(labels_index)
        print("Length of labels:{}".format(len(labels)))
        num_classes = len(set(y_train))
        # Preparing_training_set
        tokenizer = Tokenizer(num_words=VOCAB_SIZE)
        tokenizer.fit_on_texts(x_train)
        sequences = tokenizer.texts_to_sequences(x_train)
        sequence_matrix = tokenizer.sequences_to_matrix(sequences, mode=VECTORIZATION_TYPE)

        # sequences_to_matrix already returns rows of width VOCAB_SIZE, so
        # pad_sequences here left-pads or left-truncates each row to
        # MAX_SEQUENCE_LENGTH columns
        data = pad_sequences(sequence_matrix, maxlen=MAX_SEQUENCE_LENGTH)

        labels = to_categorical(np.asarray(labels))

        print(labels.shape)

        x_train = data
        y_train = labels

        return x_train, y_train, tokenizer, num_classes, labels_index  # x_test, y_test, num_classes
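The only non-obvious step above is pad_sequences, which left-pads short rows with zeros and left-truncates long ones. A two-line demonstration:

    from keras.preprocessing.sequence import pad_sequences

    print(pad_sequences([[3, 7], [5, 1, 4, 9]], maxlen=3))
    # [[0 3 7]
    #  [1 4 9]]   (padding and truncating are 'pre' by default)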
Example #8
    def fasttextTrain2CNN(self, training_set, max_sequence_length, vocab_size, minNumArticlesPerDewey):
        '''Transforming training set from fasttext format to CNN format.'''
        corpus_df = utils_nb.get_articles_from_folder(training_set)
        # Filter out deweys with fewer than minNumArticlesPerDewey articles
        corpus_df = corpus_df.groupby('dewey')[['text', 'file_name', 'dewey']].filter(lambda x: len(x) >= minNumArticlesPerDewey)
        if self.strictArticleSelection:
            corpus_df = utils_nb.getStrictArticleSelection(corpus_df, self.minNumArticlesPerDewey)
        print(corpus_df.describe())

        self.y_train = corpus_df['dewey'].values
        self.x_train = corpus_df['text'].values
        print(len(self.y_train))
        print(len(self.x_train))
        labels_index = {}
        labels = []
        for dewey in set(self.y_train):
            label_id = len(labels_index)
            labels_index[dewey] = label_id
        for dewey in self.y_train:
            labels.append(labels_index[dewey])

        num_classes = len(set(corpus_df['dewey'].values))

        tokenizer = Tokenizer(num_words=vocab_size)
        tokenizer.fit_on_texts(self.x_train)
        sequences = tokenizer.texts_to_sequences(self.x_train)

        #print(sequences)
        word_index = tokenizer.word_index
        print('Found %s unique tokens.' % len(word_index))

        x_train = pad_sequences(sequences, maxlen=max_sequence_length)

        y_train = to_categorical(np.asarray(labels))

        return x_train, y_train, word_index, labels_index, tokenizer, num_classes
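word_index maps each token to a frequency-ranked integer id, which is what the padded sequences contain. A tiny example:

    from keras.preprocessing.text import Tokenizer

    tokenizer = Tokenizer(num_words=1000)
    tokenizer.fit_on_texts(["the cat sat", "the dog sat"])
    print(tokenizer.word_index)             # {'the': 1, 'sat': 2, 'cat': 3, 'dog': 4}
    print(tokenizer.texts_to_sequences(["the cat"]))  # [[1, 3]]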
Example #9
    def findFeatureImportance(self):
        self.fasttext2sklearn()
        #tfidf = TfidfVectorizer(norm = 'l2', min_df = 2, use_idf = True, smooth_idf= False, sublinear_tf = True, ngram_range = (1,4),
        #                        max_features = 20000)

        if self.vectorizationType == "tfidf":
            vectorizer = TfidfVectorizer()
            print("Starting transformation")
            x_train_vectorized = vectorizer.fit_transform(self.x_train)
        elif self.vectorizationType == "count":
            vectorizer = CountVectorizer(min_df=10)
            x_train_vectorized = vectorizer.fit_transform(self.x_train)
        else:
            raise ValueError("Unknown vectorization type: " + self.vectorizationType)
        print("Transformation complete")
        test_corpus_df = utils_nb.get_articles_from_folder(self.test_set)
        test_corpus_df = test_corpus_df.loc[test_corpus_df['dewey'].isin(
            self.validDeweys)]

        self.y_test = test_corpus_df['dewey']
        self.x_test = test_corpus_df['text']
        self.correct_deweys = test_corpus_df['dewey'].values

        x_test_vectorized = vectorizer.transform(self.x_test)
        self.x_test = x_test_vectorized
        print("Starter trening")

        forest = ExtraTreesClassifier(n_estimators=250,
                                      random_state=0,
                                      max_features=20)
        forest.fit(x_train_vectorized, self.y_train)
        print("most important features + \n")

        importances = forest.feature_importances_
        std = np.std(
            [tree.feature_importances_ for tree in forest.estimators_], axis=0)
        indices = np.argsort(importances)[::-1]

        # Print the feature ranking
        print("Feature ranking:")
        feature_importance_file = open(
            "/home/ubuntu/PycharmProjects_saved/tgpl_w_oop/randFeaturesImportance.txt",
            "w")
        # get_feature_names() was renamed get_feature_names_out() in scikit-learn 1.0
        feature_names = vectorizer.get_feature_names()
        for f in range(x_train_vectorized.shape[1]):
            #print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

            feature_importance_file.write(
                "%d. feature %s (%f) \n" %
                (f + 1, feature_names[indices[f]], importances[indices[f]]))
        print(feature_names)
        # Plot the feature importances of the forest
        # plt.figure()
        # plt.title("Feature importances")
        # plt.bar(range(x_train_vectorized.shape[1]), importances[indices],
        #         color="r", yerr=std[indices], align="center")
        # plt.xticks(range(x_train_vectorized.shape[1]), indices)
        # plt.xlim([-1, x_train_vectorized.shape[1]])
        # plt.show()

        #file.write(str(mod.feature_importances_))
        #print(mod.feature_importances_)
        #self.model = logMod
        self.model = forest
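The ranking logic (argsort the importances in descending order, then map indices back to names) works on any fitted forest. A compact sketch on a stock dataset:

    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.ensemble import ExtraTreesClassifier

    X, y = load_iris(return_X_y=True)
    forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
    forest.fit(X, y)
    importances = forest.feature_importances_
    for rank, idx in enumerate(np.argsort(importances)[::-1], start=1):
        print("%d. feature %d (%f)" % (rank, idx, importances[idx]))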
Example #10
    def predict(self, test_set):
        '''Evaluate the trained CNN on a test set'''
        test_corpus_df = utils_nb.get_articles_from_folder(test_set)
        k_top_labels = self.kPreds
        #Loading model


        model = load_model(self.modelDir+'/model.bin')

        # loading tokenizer
        with open(self.modelDir+'/tokenizer.pickle', 'rb') as handle:
            tokenizer = pickle.load(handle)
        # loading label indexes
        with open(self.modelDir + '/label_indexes.pickle', 'rb') as handle:
            labels_index = pickle.load(handle)

        # Loading parameters like max_sequence_length, vocabulary_size and vectorization_type
        # with open(self.modelDir+'/model_stats', 'r') as params_file:
        #     params_data = params_file.read()
        #
        # re_max_seq_length = re.search('length:(.+?)\n', params_data)
        # if re_max_seq_length:
        #         self.maxSequenceLength = int(re_max_seq_length.group(1))
        #         print("Max sequence length: {}".format(MAX_SEQUENCE_LENGTH))
        # re_vocab_size = re.search('size:(.+?)\n', params_data)
        # if re_vocab_size:
        #     vocab_size = int(re_vocab_size.group(1))
        #     print("The vocabulary size: {}".format(vocab_size))
        # if isMajority_rule == True:
        #     predictions, test_accuracy = cnn_majority_rule_test(test_set_dewey=test_set, MODEL=model,
        #                                                         MAX_SEQUENCE_LENGTH = MAX_SEQUENCE_LENGTH,
        #                                                         TRAIN_TOKENIZER = tokenizer, LABEL_INDEX_VECTOR = labels_index,
        #                                                         k_output_labels=k_top_labels)


        # test_labels = []
        # valid_deweys = set()
        # # Finding valid deweys based on training set
        # for dewey in self.y_test:
        #      ##If statement to ensure that you have the same deweys in
        #      if labels_index[dewey]:
        #         #test_labels.append(labels_index[dewey])
        #         valid_deweys.update(dewey)
        self.y_test = test_corpus_df['dewey']
        self.x_test = test_corpus_df['text']

        validDeweys = utils_nb.findValidDeweysFromTrain(self.y_test, labels_index)

        #test_corpus_df = test_corpus_df[test_corpus_df['dewey'].isin(validDeweys)]
        test_corpus_df = test_corpus_df.loc[test_corpus_df['dewey'].isin(validDeweys)]
        print(test_corpus_df.describe())

        self.y_test = test_corpus_df['dewey']
        self.x_test = test_corpus_df['text']
        self.correct_deweys = self.y_test.values
        test_labels = []
        for dewey in self.y_test:
            test_labels.append(labels_index[dewey])

        test_sequences = tokenizer.texts_to_sequences(self.x_test)
        test_word_index = tokenizer.word_index
        self.x_test = pad_sequences(test_sequences, maxlen=self.maxSequenceLength)

        self.y_test = to_categorical(test_labels)

        test_score, self.accuracy = model.evaluate(self.x_test, self.y_test, batch_size=self.batchSize, verbose=1)
        self.predictions = utils_nb.prediction(model, self.x_test, k_top_labels, labels_index)

        #Writing results to txt-file.
        with open(self.modelDir+"/result.txt",'a') as result_file:
            result_file.write('test_set:'+test_set+'\n'+
                              #'Test_score:'+ str(test_score)+ '\n'
                              'Test_accuracy:' + str(self.accuracy)+'\n\n')
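predict() hinges on load_model plus evaluate. A minimal stand-in model shows the same save/load/evaluate round trip (the architecture and the random data are invented for illustration):

    import numpy as np
    from keras.models import Sequential, load_model
    from keras.layers import Dense

    model = Sequential([Dense(2, activation="softmax", input_shape=(4,))])
    model.compile(loss="categorical_crossentropy", optimizer="adam",
                  metrics=["accuracy"])
    model.save("model.bin")                 # matches the model.bin name used above

    x = np.random.rand(8, 4)
    y = np.eye(2)[np.random.randint(0, 2, 8)]
    model = load_model("model.bin")
    loss, accuracy = model.evaluate(x, y, batch_size=4, verbose=1)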