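# Imports reconstructed for context (a best-effort sketch): the stdlib and
# scikit-learn names below are all referenced in the code, while the
# project-internal modules (AbstractController, DB, Data_Handler, Author,
# AuthorFeatures, Result_Container, Classifier_Result, PerformanceMeasures)
# ship with the original codebase and their import paths are assumptions.
# Any classifier named in classifiers_with_parameters_dict must also be
# imported so that eval() in get_classifier_instance can resolve it.
import csv
import logging
import uuid
from datetime import datetime
from unittest import TestCase

import joblib  # older scikit-learn versions used `from sklearn.externals import joblib`
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import (accuracy_score, precision_recall_fscore_support,
                             roc_auc_score)
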
class Data_Handler_Unittests(TestCase):
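    """Unit tests for Data_Handler: building labeled feature dataframes,
    removing/selecting features by prefix, filling or dropping NaN columns,
    and splitting the dataset into k train/test fragments."""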
    def setUp(self):
        self._db = DB()
        self._db.setUp()
        self._data_handler = Data_Handler(self._db, 'author_type')
        self._authors_to_author_features_dict = {}

        self._fill_empty = True
        self._remove_features = []
        self._select_features = []
        self._label_text_to_value = {'good': 0, 'bad': 1}

    def tearDown(self):
        self._db.session.close()
        self._db.deleteDB()
        self._db.session.close()

    def test_basic_case(self):
        self._create_author_with_features('1', 'good', (10, 11, 12, 13, 14, 15, 16))
        self._create_author_with_features('2', 'bad', (20, 21, 22, 23, 24, 25, 26))
        self._create_author_with_features('3', 'good', (30, 31, 32, 33, 34, 35, 36))
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(
            self._remove_features, self._select_features, self._label_text_to_value)

        # Three authors with seven features each are expected to come back intact.
        self.assertEqual((3, 7), authors_features_dataframe.shape)
        self.assertEqual(3, len(authors_labels))

    def test_remove_by_prefix(self):
        self._create_author('123', 'bad')
        self._create_author_feature_with_name('123', 3, 'feature_test')
        self._create_author_feature_with_name('123', 5, 'feature_test2')
        self._create_author_feature_with_name('123', 5, 'bla_bla')
        self._create_author_feature_with_name('123', 5, 'dada')
        self._create_author_feature_with_name('123', 6, 'bla_bli')
        self._data_handler._remove_features_by_prefix = ['feature_test', 'bla']
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(
            self._remove_features, self._select_features, self._label_text_to_value)
        feature_num = len(authors_features_dataframe.columns)
        self.assertEqual(1, feature_num)


    def test_remove_by_prefix_2(self):
        self._create_author('123', 'bad')
        self._create_author_feature_with_name('123', 3, 'feature_test')
        self._create_author_feature_with_name('123', 5, 'feature_test2')
        self._create_author_feature_with_name('123', 5, 'bla_bla')
        self._create_author_feature_with_name('123', 5, 'dada')
        self._create_author_feature_with_name('123', 6, 'bla_bli')
        self._data_handler._remove_features_by_prefix = ['feature_test']
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(
            self._remove_features, self._select_features, self._label_text_to_value)
        feature_num = len(authors_features_dataframe.columns)
        self.assertEqual(3, feature_num)

    def test_select_by_prefix(self):
        self._create_author('123', 'bad')
        self._create_author_feature_with_name('123', 3, 'feature_test')
        self._create_author_feature_with_name('123', 5, 'feature_test2')
        self._create_author_feature_with_name('123', 5, 'bla_bla')
        self._create_author_feature_with_name('123', 5, 'dada')
        self._create_author_feature_with_name('123', 6, 'bla_bli')
        self._data_handler._select_features_by_prefix = ['feature_test']
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(
            self._remove_features, self._select_features, self._label_text_to_value)
        feature_num = len(authors_features_dataframe.columns)
        self.assertEqual(2, feature_num)

    def test_select_by_prefix2(self):
        self._create_author('123', 'bad')
        self._create_author_feature_with_name('123', 3, 'feature_test')
        self._create_author_feature_with_name('123', 5, 'feature_test2')
        self._create_author_feature_with_name('123', 5, 'bla_bla')
        self._create_author_feature_with_name('123', 5, 'bloom_bla')
        self._create_author_feature_with_name('123', 5, 'blada')
        self._create_author_feature_with_name('123', 6, 'bla_bli')
        self._data_handler._select_features_by_prefix = ['bla']
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(
            self._remove_features, self._select_features, self._label_text_to_value)
        feature_num = len(authors_features_dataframe.columns)
        self.assertEqual(3, feature_num)

    def test_fill_and_drop_nan(self):
        self._create_author_with_features('1', 'good', (10, None, 12, None))
        self._create_author_with_features('2', 'bad', (20, 24, 22, None))
        self._create_author_with_features('3', 'bad', (30, 34, 32, None))
        self._data_handler._fill_empty = 'zero'
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(
            self._remove_features, self._select_features, self._label_text_to_value)
        null_val = authors_features_dataframe.iloc[0][u'1']
        self.assertEqual(null_val, 0)
        did_remove_empty_column = u'3' not in authors_features_dataframe.columns
        self.assertTrue(did_remove_empty_column)
        self._data_handler._fill_empty = 'mean'
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(
            self._remove_features, self._select_features, self._label_text_to_value)
        null_val = authors_features_dataframe.iloc[0][u'1']
        self.assertEqual(null_val, (24 + 34) / 2)

    def test_get_split(self):
        self._auto_create_authors(4, 7)
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(
            self._remove_features, self._select_features, self._label_text_to_value)
        test_set, train_set, test_labels, train_labels = self._data_handler.get_the_k_fragment_from_dataset(
            authors_features_dataframe, authors_labels, 0, 2)
        self.assertEqual(test_set.iloc[0][u'0'], 11)
        self.assertEqual(test_set.iloc[1][u'0'], 21)
        test_set, train_set, test_labels, train_labels = self._data_handler.get_the_k_fragment_from_dataset(
            authors_features_dataframe, authors_labels, 1, 2)
        self.assertEqual(test_set.iloc[0][u'0'], 31)
        self.assertEqual(test_set.iloc[1][u'0'], 41)

    def test_train_and_test_differ(self):
        author_number = 7
        self._auto_create_authors(author_number, 9)
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(
            self._remove_features, self._select_features, self._label_text_to_value)
        test_set, train_set, test_labels, train_labels = self._data_handler.get_the_k_fragment_from_dataset(
            authors_features_dataframe, authors_labels, 0, 7)
        for num in range(author_number):
            author_guid = (num + 1) * 10 + 1
            in_test = self._is_val_in_dataframe(test_set, author_guid)
            in_train = self._is_val_in_dataframe(train_set, author_guid)
            if in_test == in_train:
                logging.info("in both or in neither: " + str(author_guid))
            # Each value must land in exactly one of the two sets.
            self.assertNotEqual(in_test, in_train)

        test_set, train_set, test_labels, train_labels = self._data_handler.get_the_k_fragment_from_dataset(
            authors_features_dataframe, authors_labels, 6, 7)
        for num in range(author_number):
            author_guid = (num + 1) * 10 + 1
            in_test = self._is_val_in_dataframe(test_set, author_guid)
            in_train = self._is_val_in_dataframe(train_set, author_guid)
            if in_test == in_train:
                logging.info("in both or in neither: " + str(author_guid))
            self.assertNotEqual(in_test, in_train)

    def _auto_create_authors(self, author_num, num_of_features):
        for num in range(author_num):
            author_name = num + 1
            features = []
            for feature_name in range(num_of_features):
                features.append(str(author_name * 10 + feature_name + 1))
            author_type = str(author_name * 1000 + author_name)
            self._create_author_with_features(str(author_name), author_type, features)

    def _compare_authors_features_to_author(self):
        pass

    def _create_author_with_features(self, author_guid, author_type, feature_values):
        self._create_author(author_guid, author_type)
        for feature_value in feature_values:
            self._create_author_feature(author_guid, feature_value)
        self._db.session.commit()

    def _create_author(self, guid, author_type):
        author = Author()
        author.name = str(guid)
        author.author_guid = str(guid)
        author.author_screen_name = u'TestUser1'
        author.author_type = author_type
        author.domain = u'Restaurant'
        author.author_osn_id = 1

        self._authors_to_author_features_dict[author.author_guid] = []
        self._db.add_author(author)

    def _create_author_feature(self, author_guid, value):
        feature_name = str(len(self._authors_to_author_features_dict[author_guid]))
        self._create_author_feature_with_name(author_guid, value, feature_name)

    def _create_author_feature_with_name(self, author_guid, value, feature_name):
        author_feature = AuthorFeatures()
        author_feature.author_guid = author_guid
        # The window bounds are parsed with datetime.strptime here; the
        # original called a `date(...)` helper assumed to do the same.
        author_feature.window_start = datetime.strptime('2010-01-01 00:00:00', '%Y-%m-%d %H:%M:%S')
        author_feature.window_end = datetime.strptime('2020-01-01 23:59:59', '%Y-%m-%d %H:%M:%S')
        author_feature.attribute_name = feature_name
        author_feature.attribute_value = value
        self._authors_to_author_features_dict[author_guid].append(author_feature)
        self._db.update_author_features(author_feature)
        self._db.session.commit()

    def _is_val_in_dataframe(self, df, value):
        for row in range(df.shape[0]):
            for col in range(df.shape[1]):
                if df.iloc[row, col] == value:
                    return True
        return False

    def _get_random_guid(self):
        return str(uuid.uuid4())
class Classifier_Runner(AbstractController):
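    """Applies a previously trained (pickled) classifier to all unlabeled
    authors and writes the predicted labels to a CSV file."""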
    def __init__(self, db):
        AbstractController.__init__(self, db)
        self._target_field = self._config_parser.eval(self.__class__.__name__,
                                                      "target_field")
        self._data_handler = Data_Handler(
            db, targeted_class_name=self._target_field)
        self._classifier_file_path_and_name = self._config_parser.eval(
            self.__class__.__name__, "classifier_file_path_and_name")
        self._selected_feature_file_path_and_name = self._config_parser.eval(
            self.__class__.__name__, "selected_feature_file_path_and_name")
        self._saved_prediction_file_path_and_name = self._config_parser.eval(
            self.__class__.__name__, "saved_prediction_file_path_and_name")

        self._fill_empty = self._config_parser.eval(self.__class__.__name__,
                                                    "fill_empty")
        self._remove_features = self._config_parser.eval(
            self.__class__.__name__, "remove_features")
        self._select_features = self._config_parser.eval(
            self.__class__.__name__, "select_features")
        self._label_text_to_value = self._config_parser.eval(
            self.__class__.__name__, "label_text_to_value")

        self._classifier = self._load_file(self._classifier_file_path_and_name)

    def predict_on_all_unlabled(self):
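        """Builds the unlabeled-author feature dataframe, restricts it to the
        feature names saved at training time, fills missing values, predicts
        with the loaded classifier, and saves the predictions to file."""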
        dataframe = self._data_handler.get_unlabeled_authors_feature_dataframe_for_classification(
            self._fill_empty, self._remove_features, self._select_features)
        features_names = self._load_file(
            self._selected_feature_file_path_and_name)
        dataframe = dataframe[features_names]
        dataframe = self._data_handler._replace_missing_values(dataframe)
        labels = self._classifier.predict(dataframe)
        self.save_prediction_to_file(dataframe, labels,
                                     self._label_text_to_value)

    def save_prediction_to_file(self, dataframe, labels, optional_classes):
        """Writes one CSV row per prediction: the dataframe index value, the
        numeric label, and its textual label recovered from the inverted
        label_text_to_value mapping."""
        inv_dict = self._invert_dict(optional_classes)
        index_field = dataframe.index
        tuples = [(index_field[i], labels[i], inv_dict[labels[i]])
                  for i in range(len(labels))]
        try:
            with open(self._saved_prediction_file_path_and_name, 'w',
                      newline='') as csv_file:
                wr = csv.writer(csv_file, delimiter=',')
                wr.writerow(('index_field_value', 'label', 'textual_label'))
                for row in tuples:
                    wr.writerow(row)
        except Exception as e:
            logging.info(
                "could not open the prediction file, additional info: " + str(e))

    def _load_file(self, file_name):
        try:
            with open(file_name, 'rb') as f:
                clf = joblib.load(f)
                return clf
        except Exception as e:
            logging.info("could not open the file: " + file_name +
                         "\n additional info: " + str(e))

    def _invert_dict(self, mapping):
        inv_dict = {v: k for k, v in mapping.items()}
        if len(inv_dict) != len(mapping):
            # Duplicate values collapse under inversion, so the text-to-value
            # mapping must be one-to-one for textual labels to be recoverable.
            logging.info("verbal to numeral classification is not injective")
        return inv_dict
class Classifier_Trainer(AbstractController):
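    """Trains and cross-validates the configured classifiers over several
    feature-selection sizes, saves per-experiment metrics to CSV, and pickles
    the best-performing classifier with its selected feature names."""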
    def __init__(self, db):
        AbstractController.__init__(self, db)
        self._target_field = self._config_parser.eval(self.__class__.__name__,
                                                      "target_field")
        self._select_features = self._config_parser.eval(
            self.__class__.__name__, "select_features")
        self._remove_features = self._config_parser.eval(
            self.__class__.__name__, "remove_features")
        self._k = self._config_parser.eval(self.__class__.__name__, "k")
        self._num_of_features_to_select = self._config_parser.eval(
            self.__class__.__name__, "num_of_features_to_select")
        self._classifiers_with_parameters_dict = self._config_parser.eval(
            self.__class__.__name__, "classifiers_with_parameters_dict")
        self._compare_metrics_by_order = self._config_parser.eval(
            self.__class__.__name__, "compare_matrics_by_order")
        self._label_text_to_value = self._config_parser.eval(
            self.__class__.__name__, "label_text_to_value")
        self._results_file_path = "data/output/expermintal_environment/classification_results_refactored.csv"
        self._feature_importance_file = "data/output/expermintal_environment/feature_importance.csv"
        self._saved_classifier_path = self._config_parser.eval(
            self.__class__.__name__, "saved_classifier_path")
        self._results_dictionary = {}
        self._classifiers_by_name = {}
        self._data_handler = Data_Handler(
            db, targeted_class_name=self._target_field)

    def execute(self, window_start=None):
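        """Runs the full training grid: for every feature-selection size, every
        configured classifier, and every k in self._k, trains and evaluates on
        k train/test fragments, averages the results, writes them to CSV, and
        finally retrains and pickles the best classifier."""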
        logging.info("started training classifiers")
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(
            self._remove_features, self._select_features,
            self._label_text_to_value)
        # self._calculate_feature_variance(authors_features_dataframe)  # kept for debugging
        self.calculate_feature_importance(authors_features_dataframe,
                                          authors_labels)
        print("==============================")
        for num_of_features in self._num_of_features_to_select:
            reduced_authors_features_dataframe, selected_feature_names = self._find_k_best_and_reduce_dimensions(
                num_of_features, authors_features_dataframe, authors_labels)
            for classifier_dictionary in self._classifiers_with_parameters_dict:
                for number_of_fragments in self._k:
                    classifier_name = classifier_dictionary["name"]
                    msg = "\r Classifier name: {0}, Number of features to select {1}, k = {2}".format(
                        classifier_name, num_of_features, number_of_fragments)
                    print(msg, end="")
                    current_experiment_result_sum = Result_Container()
                    for fragment_index in range(number_of_fragments):
                        current_experiment_result_sum = self.run_experiment_on_fragment(
                            authors_labels, classifier_dictionary,
                            current_experiment_result_sum,
                            reduced_authors_features_dataframe, fragment_index,
                            number_of_fragments)
                    classifier_result = Classifier_Result(
                        number_of_fragments, num_of_features,
                        len(authors_features_dataframe.columns),
                        classifier_dictionary["name"],
                        classifier_dictionary["params"],
                        selected_feature_names)
                    self.calculate_average_result_of_classification_on_fragments(
                        classifier_dictionary, current_experiment_result_sum,
                        number_of_fragments, classifier_result,
                        num_of_features)
        self._save_results_to_csv(self._results_file_path)
        self.summarize_and_score_best_classifier(authors_features_dataframe,
                                                 authors_labels)
        exit(0)

    def summarize_and_score_best_classifier(self, authors_features_dataframe,
                                            authors_labels):
        best_classifier_name, best_classifier_results = self.find_best_classifier()
        print("\n".join(["=" * 46] * 3))
        print("best_classifier= " + best_classifier_results.classifier_name +
              " \nresults: " + best_classifier_results.to_string())
        reduced_authors_features_dataframe, selected_feature_names = self._find_k_best_and_reduce_dimensions(
            best_classifier_results.num_of_feature_selected,
            authors_features_dataframe, authors_labels)
        classifier = self._classifiers_by_name[best_classifier_name].fit(
            reduced_authors_features_dataframe, authors_labels)
        logging.info("saving best classifier pickle")
        self._save_to_disk(classifier,
                           best_classifier_results.classifier_name + '.pkl')
        selected_features_names = best_classifier_results.selected_features_names
        self._save_to_disk(selected_features_names,
                           'selected_features_names.pkl')

    def calculate_average_result_of_classification_on_fragments(
            self, classifier_dictionary, current_experiment_result_sum, k,
            classifier_results, num_of_features):
        experiment_name = self.get_experiment_name(
            k, classifier_dictionary["name"], classifier_dictionary["params"],
            num_of_features)
        current_experiment_avg_result = current_experiment_result_sum.divide_by_scalar(k)
        classifier_results.set_results(current_experiment_avg_result)
        logging.info("current result:" + classifier_results.to_string())
        self._classifiers_by_name[experiment_name] = self.get_classifier_instance(
            classifier_dictionary)
        self._results_dictionary[experiment_name] = classifier_results

    def run_experiment_on_fragment(self, authors_labels, classifier_dictionary,
                                   current_experiment_result_sum, data_frame,
                                   i, k):
        test_set, train_set, test_labels, train_labels = self._data_handler.get_the_k_fragment_from_dataset(
            data_frame, authors_labels, i, k)
        fitted_classifier = self.run_classifier(classifier_dictionary,
                                                train_set, train_labels)
        current_results = self.evaluate_classifier(fitted_classifier, test_set,
                                                   test_labels)
        current_experiment_result_sum = current_experiment_result_sum.add(
            current_results)
        return current_experiment_result_sum

    def get_experiment_name(self, k, classifier, parameters, num_of_features,
                            **kwargs):
        experiment_str = "K: " + str(k) + "_classifierName:" + str(
            classifier) + "_parameters:" + str(
                parameters) + "_numOfFeatures:" + str(num_of_features)
        for arg in kwargs:
            experiment_str += " " + str(arg)
        return experiment_str

    def run_classifier(self, classifier_dict, X, Y):
        classifier = self.get_classifier_instance(classifier_dict)
        classifier.fit(X, Y)
        return classifier

    def get_classifier_instance(self, classifier_dict):
        # Builds the classifier from its configured name and parameter string,
        # e.g. (an illustrative guess) {"name": "RandomForestClassifier",
        # "params": "n_estimators=100"}. eval() is acceptable here only
        # because both strings come from the local configuration file.
        classifier_name = classifier_dict["name"]
        params = classifier_dict["params"]
        classifier = eval(classifier_name + "(" + params + ")")
        return classifier

    def evaluate_classifier(self, classifier, test_features, test_labels):
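        """Computes weighted precision/recall/F1, accuracy, and (binary) ROC
        AUC on the held-out fragment; AUC falls back to -1 when it cannot be
        computed, e.g. with more than two classes."""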
        predicted = classifier.predict(test_features)
        actual = test_labels

        performance_report = precision_recall_fscore_support(
            actual, predicted, average='weighted')
        precision = performance_report[0]
        recall = performance_report[1]
        f1 = performance_report[2]
        accuracy = accuracy_score(actual, predicted)
        try:
            # roc_auc_score handles the binary case here; a multi-class setup
            # would need _calculate_weighted_auc with per-class probabilities.
            auc = roc_auc_score(actual, predicted)
        except Exception:
            auc = -1
        results = Result_Container(precision, recall, accuracy, auc, f1)
        return results

    def find_best_classifier(self):
        best_classifier_name_and_scores = ('empty_classifier',
                                           Classifier_Result())
        for classifier_name, classifier_results in self._results_dictionary.items():
            current_classifier_result_tuple = (classifier_name,
                                               classifier_results)
            best_classifier_name_and_scores = self._compare_results(
                best_classifier_name_and_scores,
                current_classifier_result_tuple)
        best_classifier, classifier_results = best_classifier_name_and_scores
        return best_classifier, classifier_results

    def _find_k_best_and_reduce_dimensions(self, num_of_features,
                                           labeled_author_features_dataframe,
                                           targeted_class_series):
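        """Selects the num_of_features best columns by ANOVA F-score
        (SelectKBest with f_classif) and returns the reduced dataframe plus
        the surviving column names; the special value 'all' skips selection."""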
        if num_of_features == 'all':
            return labeled_author_features_dataframe, labeled_author_features_dataframe.columns
        k_best_classifier = SelectKBest(score_func=f_classif,
                                        k=num_of_features)

        # fit_transform both fits the selector and returns the reduced feature
        # matrix, so no separate fit() call is needed.
        k_best_features = k_best_classifier.fit_transform(
            labeled_author_features_dataframe, targeted_class_series)
        reduced_dataframe_column_names = self._get_k_best_feature_names(
            k_best_classifier, labeled_author_features_dataframe)

        print("Best features found are: ")
        print(', '.join(reduced_dataframe_column_names))

        reduced_dataframe = pd.DataFrame(
            k_best_features, columns=reduced_dataframe_column_names)

        return reduced_dataframe, reduced_dataframe_column_names

    def _compare_results(self, result1, result2):
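        """Compares two (name, Classifier_Result) tuples lexicographically on
        the two metrics named in the compare-metrics config entry and returns
        the better one; ties on the first metric are broken by the second."""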
        result_1_first_val, result_1_second_val, result_2_first_val, result_2_second_val = self.get_comparable_results_fields(
            result1, result2)
        if result_1_first_val == result_2_first_val:
            if result_1_second_val > result_2_second_val:
                return result1
            else:
                return result2
        elif result_1_first_val > result_2_first_val:
            return result1
        return result2

    def _get_k_best_feature_names(self, k_best_classifier, original_dataframe):
        # get_support() returns a boolean mask over the original columns.
        mask = k_best_classifier.get_support()
        column_names = list(original_dataframe.columns.values)
        best_feature_names = [
            feature_name
            for boolean_value, feature_name in zip(mask, column_names)
            if boolean_value
        ]
        return best_feature_names

    def get_comparable_results_fields(self, classifier_1_result,
                                      classifier_2_result):
        first_param = self._compare_metrics_by_order[0]
        second_param = self._compare_metrics_by_order[1]
        result1 = classifier_1_result[1].result_container
        result2 = classifier_2_result[1].result_container
        result_1_first_val = getattr(result1, first_param)
        result_2_first_val = getattr(result2, first_param)
        result_1_second_val = getattr(result1, second_param)
        result_2_second_val = getattr(result2, second_param)
        return result_1_first_val, result_1_second_val, result_2_first_val, result_2_second_val

    def _save_results_to_csv(self, results_file_path):
        with open(results_file_path, 'w', newline='') as csv_file:
            wr = csv.writer(csv_file, delimiter=',')
            wr.writerow(Classifier_Result.columns)
            for cdr in self._results_dictionary.values():
                row = cdr.to_list()
                wr.writerow(row)

    def _save_to_disk(self, classifier, classifier_name):
        file_path = self._saved_classifier_path + "/" + classifier_name
        with open(file_path, 'wb') as fid:
            joblib.dump(classifier, fid)

    def _calculate_feature_variance(self, labeled_author_features_dataframe):
        variance_list = []
        for column in labeled_author_features_dataframe.columns:
            value = labeled_author_features_dataframe[column].var()
            variance_list.append(value)
        print(variance_list)
        return variance_list

    def _calculate_weighted_auc(self, performance_measure_name, real_array,
                                predictions_proba):
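        """One-vs-rest weighted AUC for multi-class problems: for each class,
        binarizes the true labels, scores roc_auc_score against that class's
        predicted-probability column, and weights by class prevalence."""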
        num_of_classes = len(set(real_array))
        total = len(predictions_proba)
        weighted_auc_score = 0.0
        performance_measure_score = 0
        for i in range(num_of_classes):
            curr_real = []
            curr_preds = []
            pos_count = 0.0
            for j, value in enumerate(real_array):
                if value == i:
                    curr_real.append(1)
                    pos_count += 1
                else:
                    curr_real.append(0)
                curr_preds.append(predictions_proba[j][i])
            weight = pos_count / total
            if performance_measure_name == PerformanceMeasures.AUC:
                performance_measure_score = roc_auc_score(curr_real,
                                                          curr_preds,
                                                          average="weighted")
            weighted_auc_score += performance_measure_score * weight
        return weighted_auc_score

    def calculate_feature_importance(self, X, y):
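        """Fits an ExtraTreesClassifier on the labeled data, writes each
        feature's score (the standard deviation of its importance across the
        trees) to the feature-importance CSV, and plots the scores as a bar
        chart."""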

        # Build a forest and compute the feature importances
        column_headers = list(X.columns.values)
        forest = ExtraTreesClassifier(n_estimators=len(column_headers),
                                      random_state=0,
                                      criterion='entropy')
        forest.fit(X, y)

        # Mean impurity-based importance per feature; the score written below
        # is the standard deviation of the per-tree importances instead.
        importances = forest.feature_importances_
        std = np.std([
            tree.tree_.compute_feature_importances(normalize=False)
            for tree in forest.estimators_
        ], axis=0)

        # Print the feature ranking and persist it to CSV.
        with open(self._feature_importance_file, 'w', newline='') as csv_file:
            wr = csv.writer(csv_file, delimiter=',')
            wr.writerow(['feature', 'score'])

            print("Feature ranking:")

            for f in range(X.shape[1]):
                wr.writerow([column_headers[f], std[f]])
                print(str(f + 1) + " feature " + str(column_headers[f]) +
                      "   " + str(std[f]))
        # Plot the feature importances of the forest
        plt.figure()
        plt.title("Feature importances")
        plt.bar(list(range(X.shape[1])),
                std,
                color="r",
                yerr=std,
                align="center")
        plt.xticks(list(range(X.shape[1])), column_headers)
        plt.xlim([-1, X.shape[1]])