    def get_training_data_matrix(self, normalize, ablation_features=(), toExclude=()):
        """Process the training data.

        Args:
        - normalize: a boolean flag; if True, feature values are normalized in the feature matrix;
        - ablation_features: a list/tuple of features to exclude during the ablation study;
        - toExclude: a list/tuple of paradigms that must not be used in the training data (for cross-validation).

        Returns:
        - headlines;
        - a sparse scipy training matrix;
        - a list of targets.
        """
        assert isinstance(normalize, bool)
        self._check_if_ablation_appropriate(ablation_features)

        # additional data initialization:
        self._read_category_val_alternations(self.categoryPath)  # self.categoryDescription
        self._read_paradigm_lengths()  # self.pLengths
        # build the feature table itself:
        setParadigms = set()
        with codecs.open(self.MLDataPath, 'r', 'utf-8-sig') as f:
            data = json.loads(f.read())
            processedData = []
            targets = []
            for lexeme in data:
                if lexeme["paradigm"] in toExclude:
                    continue
                else: setParadigms.add(lexeme["paradigm"])

                lexemeFeatureDic = self._convert_lexeme_to_feature_dic(lexeme, ablation_features)
                processedData.append(lexemeFeatureDic)

                sampleEval = FeatureExtractor.is_positive_example(lexeme)
                targets.append(sampleEval)

            headlines, matrix = self._dic_list_to_matrix(processedData, normalize)
            if setParadigms:
                logging.info("Training set paradigms: %s", u" ".join(list(setParadigms)))
            else:
                logging.critical("Training set is empty.")

            return headlines, matrix, targets
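    # Example usage (a minimal sketch; `extractor`, the paradigm name, and the
    # logistic-regression classifier are assumptions, not part of this module):
    #
    #   headlines, matrix, targets = extractor.get_training_data_matrix(
    #       normalize=True,
    #       ablation_features=(),
    #       toExclude=(u"some-paradigm",),  # hypothetical paradigm name
    #   )
    #   from sklearn.linear_model import LogisticRegression
    #   clf = LogisticRegression().fit(matrix, targets)  # sklearn accepts sparse input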

    def _category_entropy_variance(self, lexeme):
        return FeatureExtractor.category_entropy_variance(lexeme, self.categoryDescription)

    def _number_of_one_value_categories(self, lexeme):
        return FeatureExtractor.number_of_one_value_categories(lexeme, self.categoryDescription)

    def _entropy_to_paradigm_length(self, lexeme):
        return FeatureExtractor.entropy_to_paradigm_length(lexeme, self.pLengths)

    def _part_of_found_gramm(self, lexeme):
        return FeatureExtractor.part_of_found_grammars(lexeme, self.pLengths)

    def _part_of_found_flex(self, lexeme):
        return FeatureExtractor.part_of_found_flex(lexeme, self.pLengths)

    def _min_category_entropy(self, lexeme):
        return FeatureExtractor.min_category_entropy(lexeme, self.categoryDescription)
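
    # The wrappers above bind FeatureExtractor's static feature functions to the
    # instance state loaded earlier (self.categoryDescription, self.pLengths),
    # so every feature can be computed uniformly by name. One plausible dispatch
    # pattern (an assumption; the actual lookup lives in
    # _convert_lexeme_to_feature_dic, which is not shown here):
    #
    #   value = getattr(self, "_" + featureName)(lexeme)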