Example #1
0
    def train(cls, ptr: ReferencePointer, config: Mapping[str, Any], data_service: DataService,
              files: Iterator[File], **data) -> FormatModel:
        """
        Train a model given the files available.

        :param ptr: Git repository state pointer.
        :param config: configuration dict.
        :param data: contains "files" - the list of files in the pointed state.
        :param data_service: connection to the Lookout data retrieval service.
        :param files: iterator of File records from the data service.
        :return: AnalyzerModel containing the learned rules, per language.
        """
        _log = logging.getLogger(cls.__name__)
        train_config = cls._load_config(config)["train"]
        _log.info("train %s %s %s %s", __version__, ptr.url, ptr.commit,
                  pformat(train_config, width=4096, compact=True))
        model = FormatModel().generate(cls, ptr)
        for language, files in files_by_language(files).items():
            try:
                lang_config = train_config[language]
            except KeyError:
                _log.warning("language %s is not supported, skipped", language)
                continue
            _log.info("effective train config for %s:\n%s", language,
                      pformat(lang_config, width=120, compact=True))
            random_state = lang_config["random_state"]
            files = filter_files(
                files, lang_config["line_length_limit"], lang_config["overall_size_limit"],
                random_state, _log)
            submit_event("%s.train.%s.files" % (cls.name, language), len(files))
            if len(files) == 0:
                _log.info("zero files after filtering, language %s is skipped.", language)
                continue
            try:
                fe = FeatureExtractor(language=language, **lang_config["feature_extractor"])
            except ImportError:
                _log.warning("skipped %d %s files - not supported", len(files), language)
                continue
            else:
                _log.info("training on %d %s files", len(files), language)
            train_files, test_files = FormatAnalyzer.split_train_test(
                files, lang_config["test_dataset_ratio"], random_state=random_state)
            # ensure that the features are reproducible
            train_files = sorted(train_files, key=lambda x: x.path)
            test_files = sorted(test_files, key=lambda x: x.path)
            X_train, y_train, _ = fe.extract_features(train_files)
            X_train, selected_features = fe.select_features(X_train, y_train)
            if test_files:
                X_test, y_test, _ = fe.extract_features(test_files)
            if lang_config["test_dataset_ratio"]:
                _log.debug("Real test ratio is %.3f",
                           X_test.shape[0] / (X_test.shape[0] + X_train.shape[0])
                           if test_files else 0)
            lang_config["feature_extractor"]["selected_features"] = selected_features
            lang_config["feature_extractor"]["label_composites"] = fe.labels_to_class_sequences
            lower_bound_instances = lang_config["lower_bound_instances"]
            if X_train.shape[0] < lower_bound_instances:
                _log.warning("skipped %d %s files: too few samples (%d/%d)",
                             len(files), language, X_train.shape[0], lower_bound_instances)
                continue
            _log.info("extracted %d samples to train, searching for the best hyperparameters",
                      X_train.shape[0])
            optimizer = Optimizer(**lang_config["optimizer"], random_state=random_state)
            best_score, best_params = optimizer.optimize(X_train, y_train)
            if _log.isEnabledFor(logging.DEBUG):
                _log.debug("score of the best estimator found: %.6f", best_score)
                _log.debug("params of the best estimator found: %s", str(best_params))
                _log.debug("training the model with complete data")
            else:
                _log.info("finished hyperopt at %.6f, training the full model", -best_score)
            lang_config["trainable_rules"].update(best_params)
            trainable_rules = TrainableRules(**lang_config["trainable_rules"],
                                             random_state=random_state,
                                             origin_config=lang_config)
            trainable_rules.fit(X_train, y_train)
            importances = trainable_rules.feature_importances_
            _log.debug(
                "feature importances from %s:\n\t%s",
                lang_config["trainable_rules"]["base_model_name"],
                "\n\t".join("%-55s %.5E" % (fe.feature_names[i], importances[i])
                            for i in numpy.argsort(-importances)[:25] if importances[i] > 1e-5))
            trainable_rules.prune_categorical_attributes(fe)
            _log.info("obtained %d rules, generating the classification report",
                      len(trainable_rules.rules))
            trainable_rules.rules.generate_classification_report(
                X_train, y_train, "train", fe.composite_class_representations)
            if test_files:
                trainable_rules.rules.generate_classification_report(
                    X_test, y_test, "test", fe.composite_class_representations)
            submit_event("%s.train.%s.rules" % (cls.name, language), len(trainable_rules.rules))
            if trainable_rules.rules.rules:
                model[language] = trainable_rules.rules
            else:
                _log.warning("model for %s has 0 rules. Skipped.", language)
        _log.info("trained %s", model)
        return model
Example #2
0
    def train(cls, ptr: ReferencePointer, config: Mapping[str, Any],
              data_service: DataService, **data) -> FormatModel:
        """
        Train a model given the files available.

        :param ptr: Git repository state pointer.
        :param config: configuration dict.
        :param data: contains "files" - the list of files in the pointed state.
        :param data_service: connection to the Lookout data retrieval service.
        :return: AnalyzerModel containing the learned rules, per language.
        """
        _log = logging.getLogger(cls.__name__)
        _log.info("train %s %s %s", ptr.url, ptr.commit,
                  pformat(config, width=4096, compact=True))
        model = FormatModel().construct(cls, ptr)
        config = cls._load_train_config(config)
        for language, files in files_by_language(data["files"]).items():
            try:
                lang_config = config[language]
            except KeyError:
                _log.warning("language %s is not supported, skipped", language)
                continue
            files = filter_files(files, lang_config["line_length_limit"], _log)
            submit_event("%s.train.%s.files" % (cls.name, language),
                         len(files))
            if len(files) == 0:
                _log.info(
                    "zero files after filtering, language %s is skipped.",
                    language)
                continue
            try:
                fe = FeatureExtractor(language=language,
                                      **lang_config["feature_extractor"])
            except ImportError:
                _log.warning("skipped %d %s files - not supported", len(files),
                             language)
                continue
            else:
                _log.info("training on %d %s files", len(files), language)
            # we sort to make the features reproducible
            X, y, _ = fe.extract_features(sorted(files, key=lambda x: x.path))
            X, selected_features = fe.select_features(X, y)
            lang_config["feature_extractor"][
                "selected_features"] = selected_features
            lang_config["feature_extractor"][
                "label_composites"] = fe.labels_to_class_sequences
            lower_bound_instances = lang_config["lower_bound_instances"]
            if X.shape[0] < lower_bound_instances:
                _log.warning("skipped %d %s files: too few samples (%d/%d)",
                             len(files), language, X.shape[0],
                             lower_bound_instances)
                continue
            _log.debug("training the rules model")
            optimizer = Optimizer(
                n_jobs=lang_config["n_jobs"],
                n_iter=lang_config["n_iter"],
                cv=lang_config["cv"],
                random_state=lang_config["trainable_rules"]["random_state"])
            best_score, best_params = optimizer.optimize(X, y)
            _log.debug("score of the best estimator found: %.6f", best_score)
            _log.debug("params of the best estimator found: %s",
                       str(best_params))
            _log.debug("training the model with complete data")
            lang_config["trainable_rules"].update(best_params)
            trainable_rules = TrainableRules(**lang_config["trainable_rules"],
                                             origin_config=lang_config)
            trainable_rules.fit(X, y)
            importances = trainable_rules.feature_importances_
            _log.debug(
                "feature importances from %s:\n\t%s",
                lang_config["trainable_rules"]["base_model_name"], "\n\t".join(
                    "%-55s %.5E" % (fe.feature_names[i], importances[i])
                    for i in numpy.argsort(-importances)[:25]
                    if importances[i] > 1e-5))
            submit_event("%s.train.%s.rules" % (cls.name, language),
                         len(trainable_rules.rules))
            # TODO(vmarkovtsev): save the achieved precision, recall, etc. to the model
            # throw away imprecise classes
            if trainable_rules.rules.rules:
                model[language] = trainable_rules.rules
            else:
                _log.warning("model for %s has 0 rules. Skipping.", language)
        _log.info("trained %s", model)
        return model