def generate_file_fixes(self, data_service: DataService, changes: Sequence[Change],
                        ) -> Iterator[FileFix]:
    """
    Produce the per-file fix data consumed by later processing stages.

    Head files are grouped by language; languages without rules in ``self.model`` are skipped
    with a warning. For every remaining file that passes ``filter_files`` the changed lines
    are computed against the base revision (when one exists), features are extracted and
    token fixes are generated. Per-language counters of processed files and produced fixes
    are submitted as events once the iteration is exhausted.

    :param data_service: Connection to the Lookout data retrieval service; used here to \
                         obtain the Babelfish client passed to the token fix generation.
    :param changes: The list of changes in the pointed state.
    :return: Iterator with unrendered data per file.
    """
    logger = self._log
    base_by_lang = files_by_language(change.base for change in changes)
    head_by_lang = files_by_language(change.head for change in changes)
    files_seen = defaultdict(int)
    fixes_seen = defaultdict(int)

    for lang, head_files in head_by_lang.items():
        if lang not in self.model:
            logger.warning("skipped %d written in %s. Rules for %s do not exist in model",
                           len(head_files), lang, lang)
            continue

        lang_rules = self.model[lang]
        config = self.analyze_config[lang]
        confident_rules = lang_rules.filter_by_confidence(config["confidence_threshold"])
        rules = confident_rules.filter_by_support(config["support_threshold"])

        accepted_files = filter_files(
            head_files,
            rules.origin_config["line_length_limit"],
            rules.origin_config["overall_size_limit"],
            log=logger,
        )
        for file in accepted_files:
            files_seen[lang] += 1
            try:
                prev_file = base_by_lang[lang][file.path]
            except KeyError:
                # The path has no counterpart among the base files of this language.
                prev_file = lines = None
            else:
                lines = sorted(chain(
                    find_new_lines(prev_file, file),
                    find_deleted_lines(prev_file, file),
                ))
            logger.debug("%s %s", file.path, lines)

            fe = FeatureExtractor(language=lang, **rules.origin_config["feature_extractor"])
            extracted = fe.extract_features([file], [lines])
            if extracted is not None:
                fixes, file_vnodes, y_pred_pure, y = self._generate_token_fixes(
                    file, fe, extracted, data_service.get_bblfsh(), rules)
                logger.debug("%s %d fixes", file.path, len(fixes))
                fixes_seen[lang] += len(fixes)
                yield FileFix(error="", head_file=file, language=lang,
                              feature_extractor=fe, base_file=prev_file,
                              file_vnodes=file_vnodes, line_fixes=fixes,
                              y_pred_pure=y_pred_pure, y=y)
                continue

            # Feature extraction returned nothing: always count it, report it only on demand.
            submit_event("%s.analyze.%s.parse_failures" % (self.name, lang), 1)
            if config["report_parse_failures"]:
                logger.warning("Failed to parse %s", file.path)
                yield FileFix(error="Failed to parse", head_file=file, language=lang,
                              feature_extractor=fe, base_file=prev_file, file_vnodes=[],
                              line_fixes=[], y_pred_pure=None, y=None)

    for lang, count in files_seen.items():
        submit_event("%s.analyze.%s.files" % (self.name, lang), count)
    for lang, count in fixes_seen.items():
        submit_event("%s.analyze.%s.fixes" % (self.name, lang), count)
def generate_typos_fixes(self, changes: Sequence[Change]) -> Iterator[TypoFix]:
    """
    Yield one `TypoFix` per suggested correction of a newly introduced identifier.

    An identifier is considered new when it sits on a changed line of a head file and its
    token is not among the identifier tokens of the base revision of the same file. Nodes
    with the IMPORT role and nodes with an empty token are ignored.

    :param changes: The list of changes in the pointed state.
    :return: Iterator with unrendered data per comment.
    """
    def is_plain_identifier(node):
        # Truthy for non-import identifier nodes which carry a non-empty token.
        return (bblfsh.role_id("IDENTIFIER") in node.roles
                and bblfsh.role_id("IMPORT") not in node.roles
                and node.token)

    base_by_lang = files_by_language(change.base for change in changes)
    head_by_lang = files_by_language(change.head for change in changes)

    for lang, head_files in head_by_lang.items():
        accepted_files = filter_files(
            files=head_files,
            line_length_limit=self.config["line_length_limit"],
            overall_size_limit=self.config["overall_size_limit"],
            log=self._log,
        )
        for file in accepted_files:
            try:
                prev_file = base_by_lang[lang][file.path]
            except KeyError:
                # No base revision for this path: nothing to diff against.
                lines, old_identifiers = [], set()
            else:
                lines = find_new_lines(prev_file, file)
                old_identifiers = {
                    node.token
                    for node in uast2sequence(prev_file.uast)
                    if is_plain_identifier(node)
                }

            new_identifiers = [
                node
                for node in extract_changed_nodes(file.uast, lines)
                if is_plain_identifier(node) and node.token not in old_identifiers
            ]
            if not new_identifiers:
                continue

            suggestions = self.check_identifiers([node.token for node in new_identifiers])
            # `suggestions` is keyed by the position of the identifier in `new_identifiers`.
            for index, corrections in suggestions.items():
                for options in corrections.values():
                    suspect = new_identifiers[index]
                    yield TypoFix(
                        head_file=file,
                        token=suspect.token,
                        candidates=[Candidate(*option[:2]) for option in options],
                        line_number=suspect.start_position.line,
                    )
def train(cls, ptr: ReferencePointer, config: Mapping[str, Any], data_service: DataService,
          files: Iterator[File], **data) -> IdTyposModel:
    """
    Build a new typos model from the identifiers found in the given files.

    Every file which passes ``filter_files`` contributes the tokens of the nodes returned by
    ``cls._get_identifiers`` to ``model.identifiers``.

    :param ptr: Git repository state pointer.
    :param config: Configuration of the training of unspecified structure.
    :param data_service: The channel to the data service in Lookout server to query for \
                         UASTs, file contents, etc. Not used directly by this method.
    :param files: iterator of File records from the data service.
    :param data: Extra data passed into the method. Used by the decorators to simplify \
                 the data retrieval.
    :return: The populated `IdTyposModel`.
    """
    logger = logging.getLogger(cls.__name__)
    train_config = cls._load_config(config)
    logger.info("train %s %s %s %s", __version__, ptr.url, ptr.commit,
                pformat(train_config, width=4096, compact=True))

    model = IdTyposModel()
    # The language itself is irrelevant here, only the per-language file groups are needed.
    for lang_files in files_by_language(files).values():
        accepted_files = filter_files(
            files=lang_files,
            line_length_limit=train_config["line_length_limit"],
            overall_size_limit=train_config["overall_size_limit"],
            log=logger,
        )
        for file in accepted_files:
            file_tokens = {node.token for node in cls._get_identifiers(file.uast, [])}
            model.identifiers.update(file_tokens)
    return model
def check_training_required(
        cls, old_model: FormatModel, ptr: ReferencePointer, config: Mapping[str, Any],
        data_service: "lookout.core.data_requests.DataService", **data) -> bool:
    """
    Decide whether the format model has to be trained again.

    For each configured language the number of changed lines (new and deleted lines of \
    modified files plus all the lines of files absent from the old state) is divided by the \
    total number of lines in the head files. Training is requested as soon as one language \
    exceeds its ``lines_ratio_train_trigger``.

    :param old_model: Current FormatModel; its ``ptr`` is the state to compare against.
    :param ptr: Git repository state pointer.
    :param config: configuration dict.
    :param data_service: connection to the Lookout data retrieval service.
    :param data: extra keyword arguments; not read by this method.
    :return: True if at least one language triggers the training; otherwise, False.
    """
    logger = logging.getLogger(cls.__name__)
    # NOTE(review): the changes are requested with uast=False - confirm that
    # files_by_language() does not depend on the UASTs being present.
    changes = list(request_changes(
        data_service.get_data(), old_model.ptr, ptr, contents=True, uast=False))
    base_by_lang = files_by_language(change.base for change in changes)
    head_by_lang = files_by_language(change.head for change in changes)
    config = cls._load_config(config)

    for language, head_files in head_by_lang.items():
        try:
            lang_config = config["train"][language]
        except KeyError:
            logger.warning("language %s is not supported, skipped", language)
            continue

        total_lines = 0
        touched_lines = 0
        accepted_files = filter_files(head_files, lang_config["line_length_limit"],
                                      lang_config["overall_size_limit"], log=logger)
        for file in accepted_files:
            file_lines = len(file.content.splitlines())
            total_lines += file_lines
            try:
                prev_file = base_by_lang[language][file.path]
            except KeyError:
                # No base revision of the file: every line counts as changed.
                touched_lines += file_lines
            else:
                touched_lines += len(find_new_lines(prev_file, file))
                touched_lines += len(find_deleted_lines(prev_file, file))

        # `or 1` protects from the division by zero when no file passed the filter.
        ratio = touched_lines / (total_lines or 1)
        logger.debug("check %s ratio: %.3f", language, ratio)
        if ratio > lang_config["lines_ratio_train_trigger"]:
            logger.info("%s triggers the training with changes ratio %.3f", language, ratio)
            return True
    return False
def test_files_by_language(self):
    """
    Check that `files_by_language` groups the files by their lower-cased language.

    The number of files per language is compared as a mapping, so the check does not depend
    on the order in which the languages are iterated.

    :return: The grouping produced by `files_by_language` for the generated files.
    """
    file_stats = {"js": 2, "ruby": 7, "Python": 5}
    files = []
    for language, n_files in file_stats.items():
        for i in range(n_files):
            # The UAST gets a single child node; paths are unique per language.
            files.append(File(language=language, uast=Node(children=[Node()]),
                              path=language + str(i)))
    result = files_by_language(files)
    # "Python" is expected to be reported under the lower-cased key "python".
    self.assertEqual({"python": 5, "js": 2, "ruby": 7},
                     {lang: len(lang_files) for lang, lang_files in result.items()})
    # NOTE(review): the return value is kept for backward compatibility; test methods
    # normally return None - confirm whether any caller still relies on it.
    return result
def generate_typos_fixes(self, changes: Sequence[Change]) -> Iterator[TypoFix]:
    """
    Yield a `TypoFix` for every new identifier which has at least one differing suggestion.

    For each head file which passes ``filter_files`` the identifiers on the new lines are
    collected, those listed in ``self.allowed_identifiers`` are dropped and the rest is
    checked with ``self.check_identifiers``. The token-level candidates are merged into
    whole-identifier suggestions, their confidences are normalized and the suggestions equal
    to the original identifier are discarded.

    :param changes: The list of changes in the pointed state.
    :return: Iterator with unrendered data per comment.
    """
    base_files_by_lang = files_by_language(c.base for c in changes)
    head_files_by_lang = files_by_language(c.head for c in changes)
    for lang, head_files in head_files_by_lang.items():
        for file in filter_files(
                files=head_files,
                line_length_limit=self.config["line_length_limit"],
                overall_size_limit=self.config["overall_size_limit"],
                log=self._log):
            try:
                prev_file = base_files_by_lang[lang][file.path]
            except KeyError:
                # No base revision for this path: there are no lines to diff.
                lines = []
            else:
                lines = self._find_new_lines(prev_file.content, file.content)
            identifiers = self._get_identifiers(file.uast, lines)
            new_identifiers = [
                node for node in identifiers
                if node.token not in self.allowed_identifiers
            ]
            if not new_identifiers:
                continue
            # Lazy logging arguments: the message is only formatted if DEBUG is enabled.
            self._log.debug("found %d new identifiers", len(new_identifiers))
            suggestions = self.check_identifiers([n.token for n in new_identifiers])
            if not suggestions:
                continue
            # Loop invariant: the same value is attached to every fix of this file.
            identifiers_number = len({n.token for n in new_identifiers})
            # `suggestions` is keyed by the position of the identifier in `new_identifiers`.
            for index, token_suggestions in suggestions.items():
                node = new_identifiers[index]
                identifier = node.token
                candidates = {
                    token: [Candidate(*sugg) for sugg in token_candidates]
                    for token, token_candidates in token_suggestions.items()
                }
                ranked = list(self.generate_identifier_suggestions(candidates, identifier))
                confidences = self._normalize_confidences([conf for _, conf in ranked])
                # The confidences are normalized over all the suggestions, including the
                # one equal to the identifier itself, which is filtered out afterwards.
                identifier_candidates = [
                    Candidate(suggestion, confidence)
                    for (suggestion, _), confidence in zip(ranked, confidences)
                    if suggestion != identifier
                ]
                if identifier_candidates:
                    yield TypoFix(
                        content=file.content.decode("utf-8", "replace"),
                        path=file.path,
                        identifier=identifier,
                        line_number=node.start_position.line,
                        candidates=identifier_candidates,
                        identifiers_number=identifiers_number,
                    )
def train(cls, ptr: ReferencePointer, config: Mapping[str, Any], data_service: DataService,
          files: Iterator[File], **data) -> FormatModel:
    """
    Learn the formatting rules for every supported language found in the files.

    Per language: the files are filtered and split into train and test parts, the features
    are extracted and selected, the hyperparameters are optimized, the rules are fitted and
    pruned, and the classification reports are generated. A language is skipped when it is
    missing from the train config, no files remain after filtering, the feature extractor
    cannot be imported, or fewer than ``lower_bound_instances`` samples were extracted.

    :param ptr: Git repository state pointer.
    :param config: configuration dict.
    :param data_service: connection to the Lookout data retrieval service.
    :param files: iterator of File records from the data service.
    :param data: extra keyword arguments; not read by this method.
    :return: AnalyzerModel containing the learned rules, per language.
    """
    logger = logging.getLogger(cls.__name__)
    train_config = cls._load_config(config)["train"]
    logger.info("train %s %s %s %s", __version__, ptr.url, ptr.commit,
                pformat(train_config, width=4096, compact=True))
    model = FormatModel().generate(cls, ptr)

    for language, lang_files in files_by_language(files).items():
        try:
            lang_config = train_config[language]
        except KeyError:
            logger.warning("language %s is not supported, skipped", language)
            continue
        logger.info("effective train config for %s:\n%s", language,
                    pformat(lang_config, width=120, compact=True))

        random_state = lang_config["random_state"]
        lang_files = filter_files(
            lang_files, lang_config["line_length_limit"], lang_config["overall_size_limit"],
            random_state, logger)
        n_files = len(lang_files)
        submit_event("%s.train.%s.files" % (cls.name, language), n_files)
        if n_files == 0:
            logger.info("zero files after filtering, language %s is skipped.", language)
            continue

        try:
            fe = FeatureExtractor(language=language, **lang_config["feature_extractor"])
        except ImportError:
            logger.warning("skipped %d %s files - not supported", n_files, language)
            continue
        logger.info("training on %d %s files", n_files, language)

        train_files, test_files = FormatAnalyzer.split_train_test(
            lang_files, lang_config["test_dataset_ratio"], random_state=random_state)
        # ensure that the features are reproducible
        train_files = sorted(train_files, key=lambda f: f.path)
        test_files = sorted(test_files, key=lambda f: f.path)

        X_train, y_train, _ = fe.extract_features(train_files)
        X_train, selected_features = fe.select_features(X_train, y_train)
        if test_files:
            # X_test and y_test only exist when there is at least one test file.
            X_test, y_test, _ = fe.extract_features(test_files)
        if lang_config["test_dataset_ratio"]:
            if test_files:
                real_ratio = X_test.shape[0] / (X_test.shape[0] + X_train.shape[0])
            else:
                real_ratio = 0
            logger.debug("Real test ratio is %.3f", real_ratio)

        # The extractor settings are stored back into the config kept with the rules.
        fe_config = lang_config["feature_extractor"]
        fe_config["selected_features"] = selected_features
        fe_config["label_composites"] = fe.labels_to_class_sequences

        n_samples = X_train.shape[0]
        lower_bound_instances = lang_config["lower_bound_instances"]
        if n_samples < lower_bound_instances:
            logger.warning("skipped %d %s files: too few samples (%d/%d)",
                           n_files, language, n_samples, lower_bound_instances)
            continue
        logger.info("extracted %d samples to train, searching for the best hyperparameters",
                    n_samples)

        optimizer = Optimizer(**lang_config["optimizer"], random_state=random_state)
        best_score, best_params = optimizer.optimize(X_train, y_train)
        if not logger.isEnabledFor(logging.DEBUG):
            logger.info("finished hyperopt at %.6f, training the full model", -best_score)
        else:
            logger.debug("score of the best estimator found: %.6f", best_score)
            logger.debug("params of the best estimator found: %s", str(best_params))
            logger.debug("training the model with complete data")

        rules_config = lang_config["trainable_rules"]
        rules_config.update(best_params)
        trainable_rules = TrainableRules(**rules_config, random_state=random_state,
                                         origin_config=lang_config)
        trainable_rules.fit(X_train, y_train)

        importances = trainable_rules.feature_importances_
        base_model_name = lang_config["trainable_rules"]["base_model_name"]
        # At most 25 features with the highest importance, omitting the negligible ones.
        importance_report = "\n\t".join(
            "%-55s %.5E" % (fe.feature_names[i], importances[i])
            for i in numpy.argsort(-importances)[:25]
            if importances[i] > 1e-5)
        logger.debug("feature importances from %s:\n\t%s", base_model_name, importance_report)

        trainable_rules.prune_categorical_attributes(fe)
        rules = trainable_rules.rules
        logger.info("obtained %d rules, generating the classification report", len(rules))
        rules.generate_classification_report(
            X_train, y_train, "train", fe.composite_class_representations)
        if test_files:
            rules.generate_classification_report(
                X_test, y_test, "test", fe.composite_class_representations)
        submit_event("%s.train.%s.rules" % (cls.name, language), len(rules))

        if rules.rules:
            model[language] = rules
        else:
            logger.warning("model for %s has 0 rules. Skipped.", language)

    logger.info("trained %s", model)
    return model
def analyze(self, ptr_from: ReferencePointer, ptr_to: ReferencePointer,
            data_service: DataService, **data) -> [Comment]:
    """
    Return the list of `Comment`-s - found typo corrections.

    A comment is produced for every corrected token of every identifier which appears on the
    new lines of a head file and whose token is absent from the base revision of that file.
    The confidence of a comment is taken from the first candidate of the correction.

    :param ptr_from: The Git revision of the fork point. Not read by this method.
    :param ptr_to: The Git revision to analyze. Not read by this method.
    :param data_service: The channel to the data service in Lookout server. Not read by \
                         this method.
    :param data: Extra data passed into the method; "changes" must hold the changes to \
                 analyze.
    :return: List of found review suggestions.
    """
    def is_plain_identifier(node):
        # Truthy for non-import identifier nodes which carry a non-empty token.
        return (bblfsh.role_id("IDENTIFIER") in node.roles
                and bblfsh.role_id("IMPORT") not in node.roles
                and node.token)

    logger = self.log
    comments = []
    changes = list(data["changes"])
    base_by_lang = files_by_language(change.base for change in changes)
    head_by_lang = files_by_language(change.head for change in changes)
    line_length = self.config.get("line_length_limit", self.DEFAULT_LINE_LENGTH_LIMIT)

    for lang, head_files in head_by_lang.items():
        for file in filter_files(head_files, line_length, logger):
            try:
                prev_file = base_by_lang[lang][file.path]
            except KeyError:
                # No base revision for this path: nothing to diff against.
                lines, old_identifiers = [], set()
            else:
                lines = find_new_lines(prev_file, file)
                old_identifiers = {
                    node.token
                    for node in uast2sequence(prev_file.uast)
                    if is_plain_identifier(node)
                }

            new_identifiers = [
                node
                for node in extract_changed_nodes(file.uast, lines)
                if is_plain_identifier(node) and node.token not in old_identifiers
            ]
            if not new_identifiers:
                continue

            suggestions = self.check_identifiers([node.token for node in new_identifiers])
            # `suggestions` is keyed by the position of the identifier in `new_identifiers`.
            for index, corrections in suggestions.items():
                for options in corrections.values():
                    suspect = new_identifiers[index]
                    comment = Comment()
                    comment.file = file.path
                    # Each candidate is rendered as "<text> (<confidence in percents>%)".
                    rendered_options = ", ".join(
                        "%s (%d%%)" % (option[0], int(option[1] * 100))
                        for option in options)
                    corrections_line = " " + rendered_options
                    headline = """ Possible typo in \"%s\". Suggestions: """.strip() \
                        % suspect.token
                    comment.text = headline + corrections_line
                    comment.line = suspect.start_position.line
                    comment.confidence = int(options[0][1] * 100)
                    comments.append(comment)
    return comments
def train(cls, ptr: ReferencePointer, config: Mapping[str, Any], data_service: DataService,
          **data) -> FormatModel:
    """
    Learn the formatting rules for every supported language found in ``data["files"]``.

    Per language: the files are filtered, the features are extracted from the files sorted
    by path and then selected, the hyperparameters are optimized and the rules are fitted on
    all the samples. A language is skipped when it is missing from the train config, no
    files remain after filtering, the feature extractor cannot be imported, or fewer than
    ``lower_bound_instances`` samples were extracted.

    :param ptr: Git repository state pointer.
    :param config: configuration dict.
    :param data_service: connection to the Lookout data retrieval service.
    :param data: contains "files" - the list of files in the pointed state.
    :return: AnalyzerModel containing the learned rules, per language.
    """
    logger = logging.getLogger(cls.__name__)
    logger.info("train %s %s %s", ptr.url, ptr.commit,
                pformat(config, width=4096, compact=True))
    model = FormatModel().construct(cls, ptr)
    config = cls._load_train_config(config)

    for language, lang_files in files_by_language(data["files"]).items():
        try:
            lang_config = config[language]
        except KeyError:
            logger.warning("language %s is not supported, skipped", language)
            continue

        lang_files = filter_files(lang_files, lang_config["line_length_limit"], logger)
        n_files = len(lang_files)
        submit_event("%s.train.%s.files" % (cls.name, language), n_files)
        if n_files == 0:
            logger.info("zero files after filtering, language %s is skipped.", language)
            continue

        try:
            fe = FeatureExtractor(language=language, **lang_config["feature_extractor"])
        except ImportError:
            logger.warning("skipped %d %s files - not supported", n_files, language)
            continue
        logger.info("training on %d %s files", n_files, language)

        # we sort to make the features reproducible
        ordered_files = sorted(lang_files, key=lambda f: f.path)
        X, y, _ = fe.extract_features(ordered_files)
        X, selected_features = fe.select_features(X, y)

        # The extractor settings are stored back into the config kept with the rules.
        fe_config = lang_config["feature_extractor"]
        fe_config["selected_features"] = selected_features
        fe_config["label_composites"] = fe.labels_to_class_sequences

        n_samples = X.shape[0]
        lower_bound_instances = lang_config["lower_bound_instances"]
        if n_samples < lower_bound_instances:
            logger.warning("skipped %d %s files: too few samples (%d/%d)",
                           n_files, language, n_samples, lower_bound_instances)
            continue

        logger.debug("training the rules model")
        optimizer = Optimizer(
            n_jobs=lang_config["n_jobs"],
            n_iter=lang_config["n_iter"],
            cv=lang_config["cv"],
            random_state=lang_config["trainable_rules"]["random_state"],
        )
        best_score, best_params = optimizer.optimize(X, y)
        logger.debug("score of the best estimator found: %.6f", best_score)
        logger.debug("params of the best estimator found: %s", str(best_params))
        logger.debug("training the model with complete data")

        rules_config = lang_config["trainable_rules"]
        rules_config.update(best_params)
        trainable_rules = TrainableRules(**rules_config, origin_config=lang_config)
        trainable_rules.fit(X, y)

        importances = trainable_rules.feature_importances_
        base_model_name = lang_config["trainable_rules"]["base_model_name"]
        # At most 25 features with the highest importance, omitting the negligible ones.
        importance_report = "\n\t".join(
            "%-55s %.5E" % (fe.feature_names[i], importances[i])
            for i in numpy.argsort(-importances)[:25]
            if importances[i] > 1e-5)
        logger.debug("feature importances from %s:\n\t%s", base_model_name, importance_report)

        rules = trainable_rules.rules
        submit_event("%s.train.%s.rules" % (cls.name, language), len(rules))
        # TODO(vmarkovtsev): save the achieved precision, recall, etc. to the model
        # throw away imprecise classes
        if rules.rules:
            model[language] = rules
        else:
            logger.warning("model for %s has 0 rules. Skipping.", language)

    logger.info("trained %s", model)
    return model