def test_save_and_load(self):
    fm1 = FormatModel()
    fm1["js"] = self.rules
    fm1["js2"] = self.rules
    fm1["js3"] = self.rules
    with tempfile.NamedTemporaryFile(prefix="lookout-") as f:
        fm1.save(f.name)
        fm2 = FormatModel().load(f.name)
        compare_models(self, fm1, fm2)

def analyze_files(
        analyzer_type: Type[FormatAnalyzer],
        config: dict,
        model_path: str,
        language: str,
        bblfsh_addr: str,
        input_pattern: str,
        log: logging.Logger,
) -> List[Comment]:
    """Run the model, record the fixes for each file and return them."""
    class FakePointer:
        def to_pb(self):
            return None

    model = FormatModel().load(model_path)
    if language not in model:
        raise NotFittedError()
    rules = model[language]
    client = bblfsh.BblfshClient(bblfsh_addr)
    files = parse_files(filepaths=glob.glob(input_pattern, recursive=True),
                        line_length_limit=rules.origin_config["line_length_limit"],
                        overall_size_limit=rules.origin_config["overall_size_limit"],
                        client=client, language=language, log=log)
    log.info("Model parameters: %s" % rules.origin_config)
    log.info("Rules stats: %s" % rules)
    log.info("Number of files: %s" % (len(files)))
    return analyzer_type(model, input_pattern, config).analyze(
        FakePointer(), None, data_service=FakeDataService(client, files, []))

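# A minimal invocation sketch for analyze_files() above; the model path, the
# Babelfish address and the glob pattern are hypothetical placeholders, not
# values taken from this repository.
comments = analyze_files(
    analyzer_type=FormatAnalyzer,
    config={},
    model_path="model.asdf",
    language="javascript",
    bblfsh_addr="0.0.0.0:9432",
    input_pattern="project/**/*.js",
    log=logging.getLogger("analyze_files"),
)
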
def test_len(self):
    fm = FormatModel()
    self.assertEqual(len(fm), 0)
    fm["js"] = self.rules
    self.assertEqual(len(fm), 1)
    fm["js2"] = self.rules
    self.assertEqual(len(fm), 2)

def test_dump(self):
    fm = FormatModel()
    self.assertEqual(fm.dump(),
                     "<unknown name>/[1, 0, 0] <unknown url> <unknown commit>")
    DUMP = """style.format.analyzer.FormatAnalyzer/[3] https://github.com/jquery/jquery c2026b117d1ca5b2e42a52c7e2a8ae8988cf0d4b

# javascript
1269 rules, avg.len. 19.1"""  # noqa
    self.assertEqual(self.fm.dump(), DUMP)

def return_features() -> Response:
    """Featurize the given code."""
    body = request.get_json()
    code = body["code"]
    babelfish_address = body["babelfish_address"]
    language = body["language"]
    client = BblfshClient(babelfish_address)
    res = client.parse(filename="", contents=code.encode(), language=language)
    if res.status != 0:
        abort(500)
    model = FormatModel().load(str(Path(__file__).parent / "models" / "model.asdf"))
    if language not in model:
        raise NotFittedError()
    rules = model[language]
    file = UnicodeFile(content=code, uast=res.uast, language="javascript", path="path")
    config = rules.origin_config["feature_extractor"]
    config["return_sibling_indices"] = True
    fe = FeatureExtractor(language=language, **config)
    res = fe.extract_features([file])
    if res is None:
        abort(500)
    X, y, (vnodes_y, vnodes, vnode_parents, node_parents, sibling_indices) = res
    y_pred, rule_winners, rules, grouped_quote_predictions = rules.predict(
        X=X, vnodes_y=vnodes_y, vnodes=vnodes, feature_extractor=fe)
    refuse_to_predict = y_pred < 0
    checker = UASTStabilityChecker(fe)
    _, _, _, _, safe_preds = checker.check(
        y=y, y_pred=y_pred, vnodes_y=vnodes_y, vnodes=vnodes, files=[file],
        stub=client._stub, vnode_parents=vnode_parents, node_parents=node_parents,
        rule_winners=rule_winners, grouped_quote_predictions=grouped_quote_predictions)
    break_uast = [False] * X.shape[0]
    for wrong_pred in set(range(X.shape[0])).difference(safe_preds):
        break_uast[wrong_pred] = True
    labeled_indices = {id(vnode): i for i, vnode in enumerate(vnodes_y)}
    app.logger.info("returning features of shape %d, %d" % X.shape)
    app.logger.info("length of rules: %d", len(rules))
    return jsonify({
        "code": code,
        "features": _input_matrix_to_descriptions(X, fe),
        "ground_truths": y.tolist(),
        "predictions": y_pred.tolist(),
        "refuse_to_predict": refuse_to_predict.tolist(),
        "sibling_indices": sibling_indices,
        "rules": _rules_to_jsonable(rules, fe),
        "winners": rule_winners.tolist(),
        "break_uast": break_uast,
        "feature_names": fe.feature_names,
        "class_representations": fe.composite_class_representations,
        "class_printables": fe.composite_class_printables,
        "vnodes": list(map(partial(_vnode_to_jsonable, labeled_indices=labeled_indices),
                           vnodes)),
        "config": _mapping_to_jsonable(rules.origin_config)})

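# A sketch of how a client could call the endpoint above, assuming it is
# registered at "/" of a locally running Flask app; the URL, the port and the
# Babelfish address are assumptions made for illustration.
import requests

reply = requests.post("http://localhost:5000/", json={
    "code": "var x = 1;",
    "babelfish_address": "0.0.0.0:9432",
    "language": "javascript",
}).json()
print(reply["predictions"], reply["ground_truths"])
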
def test_train_cutoff_labels(self):
    self.data_service = FakeDataService(
        self.bblfsh_client, files=self.base_files.values(), changes=[])
    model1 = FormatAnalyzer.train(self.ptr, get_config(), self.data_service)
    self.assertIsInstance(model1, FormatModel)
    self.assertIn("javascript", model1, str(model1))
    model2 = FormatAnalyzer.train(self.ptr, get_config(), self.data_service)
    self.assertEqual(model1["javascript"].rules, model2["javascript"].rules)
    self.assertGreater(len(model1["javascript"]), 5)
    # Check that the model can be saved without problems and then loaded back.
    with TemporaryFile(prefix="analyzer_model-", suffix=".asdf") as f:
        model2.save(f)
        f.seek(0)
        model3 = FormatModel().load(f)
        compare_models(self, model2, model3)

def train(cls, ptr: ReferencePointer, config: Mapping[str, Any],
          data_service: DataService, **data) -> FormatModel:
    """
    Train a model given the files available, or load an existing model.

    If config["model"] is set to a path in the file system, the model is loaded from
    there; otherwise, a model is trained in the regular way.

    :param ptr: Git repository state pointer.
    :param config: Configuration dict.
    :param data: Contains "files" - the list of files in the pointed state.
    :param data_service: Connection to the Lookout data retrieval service.
    :return: FormatModel containing the learned rules, per language.
    """
    return FormatModel().load(config["model"]) if "model" in config else \
        super().train(ptr, config, data_service)

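# A minimal sketch of the shortcut implemented above, assuming hypothetical
# `ptr` and `data_service` objects: when "model" is present in the config the
# analyzer loads a previously saved FormatModel instead of training one.
loaded = FormatAnalyzer.train(ptr, {"model": "model.asdf"}, data_service)
trained = FormatAnalyzer.train(ptr, {}, data_service)  # the regular training path
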
def setUp(self):
    (self.train_x, self.test_x, self.train_y, self.test_y), _, _ = load_abalone_data()
    self.config = {
        "trainable_rules": {
            "base_model_name": "sklearn.tree.DecisionTreeClassifier",
            "prune_branches_algorithms": [],
            "prune_attributes": False,
            "min_samples_leaf": 26,
            "random_state": 1989,
        },
    }
    trainer = TrainableRules(**self.config["trainable_rules"], origin_config=self.config)
    trainer.fit(self.test_x, self.test_y)
    self.rules = trainer.rules
    self.fm = FormatModel().load(
        os.path.join(os.path.dirname(__file__), "model_jquery.asdf"))
    self.maxDiff = None

def analyze_files(analyzer_type: Type[FormatAnalyzer], config: dict, model_path: str,
                  language: str, bblfsh: str, input_pattern: str, log: logging.Logger,
                  ) -> List[FileFix]:
    """Run the model, record the fixes for each file and return them."""
    class FakePointer:
        def to_pb(self):
            return None

    model = FormatModel().load(model_path)
    if language not in model:
        raise NotFittedError()
    rules = model[language]
    client = BblfshClient(bblfsh)
    files = prepare_files(glob.glob(input_pattern, recursive=True), client, language)
    log.info("Model parameters: %s" % rules.origin_config)
    log.info("Rules stats: %s" % rules)
    log.info("Number of files: %s" % (len(files)))
    return analyzer_type(model, input_pattern, config).analyze(
        FakePointer(), None, data_service=FakeDataService(client, files, []))

def train(cls, ptr: ReferencePointer, config: Dict[str, Any],
          data_request_stub: DataStub, **data) -> AnalyzerModel:
    """
    Train a model given the files available.

    :param ptr: Git repository state pointer.
    :param config: configuration dict.
    :param data: contains "files" - the list of files in the pointed state.
    :param data_request_stub: connection to the Lookout data retrieval service, not used.
    :return: AnalyzerModel containing the learned rules, per language.
    """
    config = cls._load_train_config(config)
    cls.log.info("train %s %s %s", ptr.url, ptr.commit,
                 pformat(config, width=4096, compact=True))
    files_by_language = cls._files_by_language(data["files"])
    model = FormatModel().construct(cls, ptr)
    for language, files in files_by_language.items():
        language = language.lower()
        try:
            fe = FeatureExtractor(language=language,
                                  siblings_window=config["siblings_window"],
                                  parents_depth=config["parents_depth"])
        except ImportError:
            cls.log.warning("skipped %d %s files - not supported", len(files), language)
            continue
        else:
            cls.log.info("training on %d %s files", len(files), language)
        # we sort to make the features reproducible
        X, y, _ = fe.extract_features(f[1] for f in sorted(files.items()))
        lower_bound_instances = config["lower_bound_instances"]
        if X.shape[0] < lower_bound_instances:
            cls.log.warning("skipped %d %s files: too few samples (%d/%d)",
                            len(files), language, X.shape[0], lower_bound_instances)
            continue
        cls.log.debug("training the rules model")
        bscv = BayesSearchCV(
            TrainableRules(
                prune_branches_algorithms=config["prune_branches_algorithms"],
                prune_attributes=config["prune_attributes"],
                top_down_greedy_budget=config["top_down_greedy_budget"],
                uncertain_attributes=config["uncertain_attributes"],
                prune_dataset_ratio=config["prune_dataset_ratio"],
                n_estimators=config["n_estimators"],
                random_state=config["random_state"]),
            {"base_model_name": Categorical(["sklearn.ensemble.RandomForestClassifier",
                                             "sklearn.tree.DecisionTreeClassifier"]),
             "max_depth": Categorical([None, 5, 10]),
             "max_features": Categorical([None, "auto"]),
             "min_samples_split": Integer(2, 20),
             "min_samples_leaf": Integer(1, 20)},
            n_jobs=-1,
            n_iter=config["n_iter"],
            random_state=config["random_state"])
        bscv.fit(X, y)
        cls.log.debug("score of the best estimator found: %.3f", bscv.best_score_)
        cls.log.debug("params of the best estimator found: %s", str(bscv.best_params_))
        cls.log.debug("training the model with complete data")
        trainable_rules = TrainableRules(prune_branches_algorithms=["reduced-error"],
                                         prune_attributes=True, random_state=42,
                                         uncertain_attributes=True, **bscv.best_params_)
        trainable_rules.fit(X, y)
        model[language] = trainable_rules.rules
    cls.log.info("trained %s", model)
    return model

def train(cls, ptr: ReferencePointer, config: Mapping[str, Any],
          data_service: DataService, files: Iterator[File], **data) -> FormatModel:
    """
    Train a model given the files available.

    :param ptr: Git repository state pointer.
    :param config: configuration dict.
    :param data: contains "files" - the list of files in the pointed state.
    :param data_service: connection to the Lookout data retrieval service.
    :param files: iterator of File records from the data service.
    :return: AnalyzerModel containing the learned rules, per language.
    """
    _log = logging.getLogger(cls.__name__)
    train_config = cls._load_config(config)["train"]
    _log.info("train %s %s %s %s", __version__, ptr.url, ptr.commit,
              pformat(train_config, width=4096, compact=True))
    model = FormatModel().generate(cls, ptr)
    for language, files in files_by_language(files).items():
        try:
            lang_config = train_config[language]
        except KeyError:
            _log.warning("language %s is not supported, skipped", language)
            continue
        _log.info("effective train config for %s:\n%s", language,
                  pformat(lang_config, width=120, compact=True))
        random_state = lang_config["random_state"]
        files = filter_files(files, lang_config["line_length_limit"],
                             lang_config["overall_size_limit"], random_state, _log)
        submit_event("%s.train.%s.files" % (cls.name, language), len(files))
        if len(files) == 0:
            _log.info("zero files after filtering, language %s is skipped.", language)
            continue
        try:
            fe = FeatureExtractor(language=language, **lang_config["feature_extractor"])
        except ImportError:
            _log.warning("skipped %d %s files - not supported", len(files), language)
            continue
        else:
            _log.info("training on %d %s files", len(files), language)
        train_files, test_files = FormatAnalyzer.split_train_test(
            files, lang_config["test_dataset_ratio"], random_state=random_state)
        # ensure that the features are reproducible
        train_files = sorted(train_files, key=lambda x: x.path)
        test_files = sorted(test_files, key=lambda x: x.path)
        X_train, y_train, _ = fe.extract_features(train_files)
        X_train, selected_features = fe.select_features(X_train, y_train)
        if test_files:
            X_test, y_test, _ = fe.extract_features(test_files)
        if lang_config["test_dataset_ratio"]:
            _log.debug("Real test ratio is %.3f",
                       X_test.shape[0] / (X_test.shape[0] + X_train.shape[0])
                       if test_files else 0)
        lang_config["feature_extractor"]["selected_features"] = selected_features
        lang_config["feature_extractor"]["label_composites"] = \
            fe.labels_to_class_sequences
        lower_bound_instances = lang_config["lower_bound_instances"]
        if X_train.shape[0] < lower_bound_instances:
            _log.warning("skipped %d %s files: too few samples (%d/%d)",
                         len(files), language, X_train.shape[0], lower_bound_instances)
            continue
        _log.info("extracted %d samples to train, searching for the best hyperparameters",
                  X_train.shape[0])
        optimizer = Optimizer(**lang_config["optimizer"], random_state=random_state)
        best_score, best_params = optimizer.optimize(X_train, y_train)
        if _log.isEnabledFor(logging.DEBUG):
            _log.debug("score of the best estimator found: %.6f", best_score)
            _log.debug("params of the best estimator found: %s", str(best_params))
            _log.debug("training the model with complete data")
        else:
            _log.info("finished hyperopt at %.6f, training the full model", -best_score)
        lang_config["trainable_rules"].update(best_params)
        trainable_rules = TrainableRules(**lang_config["trainable_rules"],
                                         random_state=random_state,
                                         origin_config=lang_config)
        trainable_rules.fit(X_train, y_train)
        importances = trainable_rules.feature_importances_
        _log.debug(
            "feature importances from %s:\n\t%s",
            lang_config["trainable_rules"]["base_model_name"],
            "\n\t".join("%-55s %.5E" % (fe.feature_names[i], importances[i])
                        for i in numpy.argsort(-importances)[:25]
                        if importances[i] > 1e-5))
        trainable_rules.prune_categorical_attributes(fe)
        _log.info("obtained %d rules, generating the classification report",
                  len(trainable_rules.rules))
        trainable_rules.rules.generate_classification_report(
            X_train, y_train, "train", fe.composite_class_representations)
        if test_files:
            trainable_rules.rules.generate_classification_report(
                X_test, y_test, "test", fe.composite_class_representations)
        submit_event("%s.train.%s.rules" % (cls.name, language),
                     len(trainable_rules.rules))
        if trainable_rules.rules.rules:
            model[language] = trainable_rules.rules
        else:
            _log.warning("model for %s has 0 rules. Skipped.", language)
    _log.info("trained %s", model)
    return model

def dump_rule_entry(model, hash):
    """Command-line entry for "tool rule"."""
    model = FormatModel().load(model)
    dump_rule(model, hash)

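# A hypothetical call of the entry point above; the model file name and the
# rule hash are placeholders.
dump_rule_entry("model.asdf", "8a92ab2c")
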
def quality_report_noisy(bblfsh: str, language: str, confidence_threshold: float,
                         support_threshold: int, precision_threshold: float,
                         dir_output: str, repos: Optional[str] = None) -> None:
    """
    Generate a quality report on the artificial noisy dataset including a \
    precision-recall curve.

    :param bblfsh: Babelfish client. Babelfish server should be started accordingly.
    :param language: Language to consider, others will be discarded.
    :param confidence_threshold: Confidence threshold to filter relevant rules.
    :param support_threshold: Support threshold to filter relevant rules.
    :param precision_threshold: Precision threshold tolerated by the model. \
           Limit drawn as a red horizontal line on the figure.
    :param dir_output: Path to the output directory where to store the quality report \
           in Markdown and the precision-recall curve in png format.
    :param repos: Input list of urls or paths to the repositories to analyze. \
           Should be strings separated by newlines.
    """
    log = logging.getLogger("quality_report_noisy")

    repo_names = []
    last_accepted_rule = {}
    precisions, recalls, accepted_rules = (defaultdict(list) for _ in range(3))
    n_mistakes, prec_max_rec, confidence_threshold_exp, max_rec, \
        n_rules, n_rules_filtered = ({} for _ in range(6))
    if repos is None:
        repos = REPOSITORIES
    try:
        client = BblfshClient(bblfsh)
        log.info("Repositories: %s", repos)
        with tempfile.TemporaryDirectory() as tmpdirname:
            for repo_path in repos.splitlines():
                repo = repo_path.split("/")[-1]
                if repo_path.startswith("https://github.com"):
                    log.info("Fetching %s", repo_path)
                    git_dir = os.path.join(tmpdirname, repo)
                    git_dir_noisy = os.path.join(tmpdirname, repo + "_noisy")
                    cmd1 = "git clone --single-branch --branch master %s %s" % (
                        repo_path, git_dir)
                    cmd2 = "git clone --single-branch --branch style-noise-1-per-file " \
                           "%s %s" % (repo_path, git_dir_noisy)
                    try:
                        for cmd in (cmd1, cmd2):
                            log.debug("Running: %s", cmd)
                            subprocess.check_call(cmd.split())
                    except subprocess.CalledProcessError as e:
                        raise ConnectionError("Unable to fetch repository %s"
                                              % repo_path) from e
                    input_pattern = os.path.join(git_dir, "**", "*.js")
                    input_pattern_noisy = os.path.join(git_dir_noisy, "**", "*.js")
                    model_path = os.path.join(git_dir_noisy, "style-analyzer-model",
                                              "model.asdf")
                else:
                    input_pattern = os.path.join(repo_path, "**", "*.js")
                    input_pattern_noisy = os.path.join(repo_path + "_noisy", "**", "*.js")
                    model_path = os.path.join(repo_path, "model.asdf")

                true_content = get_content_from_repo(input_pattern)
                noisy_content = get_content_from_repo(input_pattern_noisy)
                true_files, noisy_files, start_changes = get_difflib_changes(
                    true_content, noisy_content)
                if not true_files:
                    raise ValueError("Noisy repo should count at least one artificial "
                                     "mistake")
                log.info("Number of files modified by adding style noise: %d / %d",
                         len(true_files), len(true_content))
                del true_content, noisy_content

                analyzer = FormatModel().load(model_path)
                rules = analyzer[language]
                feature_extractor = FeatureExtractor(
                    language=language, **rules.origin_config["feature_extractor"])
                vnodes_y_true = files2vnodes(true_files, feature_extractor, client)
                mispreds_noise = files2mispreds(noisy_files, feature_extractor, rules,
                                                client, log)
                diff_mispreds = get_diff_mispreds(mispreds_noise, start_changes)
                changes_count = len(start_changes)
                n_rules[repo] = len(rules.rules)
                rules_id = [(i, r.stats.conf) for i, r in enumerate(rules.rules)
                            if r.stats.conf > confidence_threshold
                            and r.stats.support > support_threshold]
                rules_id = sorted(rules_id, key=lambda k: k[1], reverse=True)
                for i in range(len(rules_id)):
                    filtered_mispreds = {k: m for k, m in diff_mispreds.items()
                                         if any(r[0] == m.rule for r in rules_id[:i + 1])}
                    style_fixes = get_style_fixes(filtered_mispreds, vnodes_y_true,
                                                  true_files, noisy_files,
                                                  feature_extractor)
                    precision, recall, f1_score = compute_metrics(
                        changes_count=changes_count,
                        predictions_count=len(filtered_mispreds),
                        true_positive=len(style_fixes))
                    precisions[repo].append(round(precision, 3))
                    recalls[repo].append(round(recall, 3))
                    log.info([m.node.path for m in style_fixes])
                print("recall x:", recalls[repo])
                print("precision y:", precisions[repo])

                # compute some stats and quality metrics for the model's evaluation
                repo_names.append(repo)
                n_mistakes[repo] = len(true_files)
                prec_max_rec[repo] = precisions[repo][-1]
                max_rec[repo] = max(recalls[repo])
                n_rules_filtered[repo] = len(rules_id)
                # compute the confidence and recall limit for the given precision
                # threshold
                for i, (prec, rec) in enumerate(zip(precisions[repo], recalls[repo])):
                    if prec >= precision_threshold:
                        accepted_rules[repo].append((i, rules_id[i][1], rec))
                last_accepted_rule[repo] = min(accepted_rules[repo], key=itemgetter(1))
                confidence_threshold_exp[repo] = (last_accepted_rule[repo][0],
                                                  last_accepted_rule[repo][1])
    finally:
        client._channel.close()

    # compute the index of the last accepted rule according to the maximum confidence
    # threshold
    limit_conf_id = {}
    max_confidence_threshold_exp = max(confidence_threshold_exp.values(),
                                       key=itemgetter(1))
    for repo, rules in accepted_rules.items():
        for rule in rules:
            if rule[1] < max_confidence_threshold_exp[1]:
                break
            limit_conf_id[repo] = rule[0]

    # compile the precision-recall curves
    path_to_figure = os.path.join(dir_output, "pr_curves.png")
    plot_curve(repo_names, recalls, precisions, precision_threshold, limit_conf_id,
               path_to_figure)

    # compile the markdown template for the report through jinja2
    loader = jinja2.FileSystemLoader(
        (os.path.join(os.path.dirname(__file__), "..", "templates"),),
        followlinks=True)
    env = jinja2.Environment(trim_blocks=True, lstrip_blocks=True,
                             keep_trailing_newline=True)
    env.globals.update(range=range)
    template = loader.load(env, "noisy_quality_report.md.jinja2")
    report = template.render(repos=repo_names, n_mistakes=n_mistakes,
                             prec_max_rec=prec_max_rec,
                             confidence_threshold_exp=round(
                                 max_confidence_threshold_exp[1], 2),
                             max_rec=max_rec,
                             confidence_threshold=confidence_threshold,
                             support_threshold=support_threshold,
                             n_rules=n_rules,
                             n_rules_filtered=n_rules_filtered,
                             path_to_figure=path_to_figure)

    # write the quality report
    repo_pathrt = os.path.join(dir_output, "report_noise.md")
    with open(repo_pathrt, "w", encoding="utf-8") as f:
        f.write(report)

def visualize(input_filename: str, bblfsh: str, language: str, model_path: str) -> None:
    """Visualize the errors made on a single file."""
    model = FormatModel().load(model_path)
    rules = model[language]
    print("Model parameters: %s" % rules.origin)
    print("Stats about rules: %s" % rules)
    client = BblfshClient(bblfsh)
    file = prepare_file(input_filename, client, language)

    fe = FeatureExtractor(language=language, **rules.origin_config["feature_extractor"])
    X, y, vnodes_y, vnodes = fe.extract_features([file])
    y_pred, _, _ = rules.predict(X, vnodes_y, vnodes, fe)

    # collect lines with mispredictions - could be removed
    mispred_lines = set()
    lines = set()
    for gt, pred, node in zip(y, y_pred, vnodes_y):
        lines.add((node.path, node.start.line))
        if gt != pred:
            mispred_lines.add((node.path, node.start.line))
    print("Number of lines with mispredictions: %s out of %s" %
          (len(mispred_lines), len(lines)))

    # collect mispredictions and all other predictions for each line with a mistake
    mispred = defaultdict(list)
    for gt, pred, node in zip(y, y_pred, vnodes_y):
        if (node.path, node.start.line) in mispred_lines:
            mispred[(node.path, node.start.line)].append(Misprediction(gt, pred, node))

    # sort each line
    for value in mispred.values():
        value.sort(key=lambda k: k.node.start.offset)

    # final mispredictions
    final_mispred = []
    for line in sorted(mispred):
        gt = [m.y for m in mispred[line]]
        pred = [m.pred for m in mispred[line]]
        s = SequenceMatcher(None, gt, pred)
        blocks = s.get_matching_blocks()
        if blocks[0].a != 0:
            # mispredictions before the first matching block
            final_mispred.extend(mispred[line][:blocks[0].a])
        for i in range(len(blocks) - 1):
            final_mispred.extend(mispred[line][blocks[i].a:blocks[i + 1].a])
        if blocks[-1].a != len(mispred[line]):
            # mispredictions after the last matching block
            final_mispred.extend(mispred[line][blocks[-1].a:])

    mispred = sorted([misp for misp in final_mispred if misp.y != misp.pred],
                     key=lambda r: r.node.start.offset)

    new_content = ENDC
    old_content = file.content.decode("utf-8")
    for i in range(len(mispred)):
        wrong = mispred[i]
        start = wrong.node.start.offset
        end = wrong.node.end.offset
        if end == start:
            end = start + len(wrong.node.value)
        if i == 0 and start != 0:
            new_content += old_content[:start]
        new_content += GREEN + CLASSES[wrong.y] + RED + CLASSES[wrong.pred] + ENDC
        if i == len(mispred) - 1:
            if end != len(old_content):
                new_content += old_content[end:]
        else:
            new_content += old_content[end:mispred[i + 1].node.start.offset]
    print("Visualization:\n" + new_content)

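# A hypothetical invocation of visualize(); the input file, the Babelfish
# address and the model path are placeholders for a local setup.
visualize("project/src/main.js", "0.0.0.0:9432", "javascript", "model.asdf")
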
def test_dump(self):
    fm = FormatModel()
    self.assertEqual(fm.dump(), "generic/[1, 0, 0] <unknown url> <unknown commit>")
    DUMP = """code-format/[1] file:///var/folders/kw/93jybvs16_954hytgsq6ld7r0000gn/T/top-repos-quality-repos-jigt1n8g/jquery dae5f3ce3d2df27873d01f0d9682f6a91ad66b87

# javascript
1159 rules, avg.len. 12.7
## train
PPCR: 0.993413
### report
macro
{'f1-score': 0.7270769669476458, 'precision': 0.8106858458605273, 'recall': 0.7061608014058862, 'support': 163931}
micro
{'f1-score': 0.9704570825530253, 'precision': 0.9704570825530253, 'recall': 0.9704570825530253, 'support': 163931}
weighted
{'f1-score': 0.9682573644719648, 'precision': 0.9688067324990776, 'recall': 0.9704570825530253, 'support': 163931}
### report_full
macro
{'f1-score': 0.7207757082876136, 'precision': 0.8106858458605273, 'recall': 0.6958571203075536, 'support': 165018}
micro
{'f1-score': 0.9672502424387974, 'precision': 0.9704570825530253, 'recall': 0.9640645262941012, 'support': 165018}
weighted
{'f1-score': 0.964254372281313, 'precision': 0.967999533892513, 'recall': 0.9640645262941012, 'support': 165018}
## test
PPCR: 0.992673
### report
macro
{'f1-score': 0.670106403195044, 'precision': 0.7675483060510728, 'recall': 0.667193540547618, 'support': 39563}
micro
{'f1-score': 0.9646134014104087, 'precision': 0.9646134014104087, 'recall': 0.9646134014104087, 'support': 39563}
weighted
{'f1-score': 0.9623528642977015, 'precision': 0.964064574202937, 'recall': 0.9646134014104087, 'support': 39563}
### report_full
macro
{'f1-score': 0.6645299678762785, 'precision': 0.7675483060510728, 'recall': 0.6569592424077594, 'support': 39855}
micro
{'f1-score': 0.961066760683976, 'precision': 0.9646134014104087, 'recall': 0.9575461046292811, 'support': 39855}
weighted
{'f1-score': 0.9579239894541836, 'precision': 0.9627543953777487, 'recall': 0.9575461046292811, 'support': 39855}"""  # noqa
    self.assertEqual(self.fm.dump(), DUMP)

def train(cls, ptr: ReferencePointer, config: Mapping[str, Any],
          data_service: DataService, **data) -> FormatModel:
    """
    Train a model given the files available.

    :param ptr: Git repository state pointer.
    :param config: configuration dict.
    :param data: contains "files" - the list of files in the pointed state.
    :param data_service: connection to the Lookout data retrieval service.
    :return: AnalyzerModel containing the learned rules, per language.
    """
    _log = logging.getLogger(cls.__name__)
    _log.info("train %s %s %s", ptr.url, ptr.commit,
              pformat(config, width=4096, compact=True))
    model = FormatModel().construct(cls, ptr)
    config = cls._load_train_config(config)
    for language, files in files_by_language(data["files"]).items():
        try:
            lang_config = config[language]
        except KeyError:
            _log.warning("language %s is not supported, skipped", language)
            continue
        files = filter_files(files, lang_config["line_length_limit"], _log)
        submit_event("%s.train.%s.files" % (cls.name, language), len(files))
        if len(files) == 0:
            _log.info("zero files after filtering, language %s is skipped.", language)
            continue
        try:
            fe = FeatureExtractor(language=language, **lang_config["feature_extractor"])
        except ImportError:
            _log.warning("skipped %d %s files - not supported", len(files), language)
            continue
        else:
            _log.info("training on %d %s files", len(files), language)
        # we sort to make the features reproducible
        X, y, _ = fe.extract_features(sorted(files, key=lambda x: x.path))
        X, selected_features = fe.select_features(X, y)
        lang_config["feature_extractor"]["selected_features"] = selected_features
        lang_config["feature_extractor"]["label_composites"] = \
            fe.labels_to_class_sequences
        lower_bound_instances = lang_config["lower_bound_instances"]
        if X.shape[0] < lower_bound_instances:
            _log.warning("skipped %d %s files: too few samples (%d/%d)",
                         len(files), language, X.shape[0], lower_bound_instances)
            continue
        _log.debug("training the rules model")
        optimizer = Optimizer(n_jobs=lang_config["n_jobs"],
                              n_iter=lang_config["n_iter"],
                              cv=lang_config["cv"],
                              random_state=lang_config["trainable_rules"]["random_state"])
        best_score, best_params = optimizer.optimize(X, y)
        _log.debug("score of the best estimator found: %.6f", best_score)
        _log.debug("params of the best estimator found: %s", str(best_params))
        _log.debug("training the model with complete data")
        lang_config["trainable_rules"].update(best_params)
        trainable_rules = TrainableRules(**lang_config["trainable_rules"],
                                         origin_config=lang_config)
        trainable_rules.fit(X, y)
        importances = trainable_rules.feature_importances_
        _log.debug(
            "feature importances from %s:\n\t%s",
            lang_config["trainable_rules"]["base_model_name"],
            "\n\t".join("%-55s %.5E" % (fe.feature_names[i], importances[i])
                        for i in numpy.argsort(-importances)[:25]
                        if importances[i] > 1e-5))
        submit_event("%s.train.%s.rules" % (cls.name, language),
                     len(trainable_rules.rules))
        # TODO(vmarkovtsev): save the achieved precision, recall, etc. to the model
        # throw away imprecise classes
        if trainable_rules.rules.rules:
            model[language] = trainable_rules.rules
        else:
            _log.warning("model for %s has 0 rules. Skipping.", language)
    _log.info("trained %s", model)
    return model