Example #1
 def analyze(self, ptr_from: ReferencePointer, ptr_to: ReferencePointer,  # noqa: D
             data_service: DataService, changes: Iterable[Change]) -> [Comment]:
     self._log.info("analyze %s %s", ptr_from.commit, ptr_to.commit)
     comments = []
     parser = TokenParser(stem_threshold=100, single_shot=True)
     # Remember the stock dictionary so it can be restored in the finally block below.
     words = autocorrect.word.KNOWN_WORDS.copy()
     try:
         for name in self.model.names:
             if len(name) >= 3:
                 autocorrect.word.KNOWN_WORDS.add(name)
         for change in changes:
             suggestions = defaultdict(list)
             new_lines = set(find_new_lines(change.base, change.head))
             for node in bblfsh.filter(change.head.uast, "//*[@roleIdentifier]"):
                 if node.start_position is not None and node.start_position.line in new_lines:
                     for part in parser.split(node.token):
                         if part not in self.model.names:
                             fixed = autocorrect.spell(part)
                             if fixed != part:
                                 suggestions[node.start_position.line].append(
                                     (node.token, part, fixed))
             for line, s in suggestions.items():
                 comment = Comment()
                 comment.file = change.head.path
                 comment.text = "\n".join("`%s`: %s > %s" % fix for fix in s)
                 comment.line = line
                 comment.confidence = 100
                 comments.append(comment)
     finally:
         autocorrect.word.KNOWN_WORDS = words
     return comments
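
A minimal, self-contained sketch of the same flow: split an identifier into parts, skip parts that are known vocabulary, and propose a correction for the rest. split_identifier and the difflib-based lookup are simplified stand-ins for TokenParser and autocorrect.spell, and VOCABULARY is an invented word list, not the analyzer's model.

import difflib
import re

VOCABULARY = {"request", "changes", "iterator", "comment"}


def split_identifier(token: str) -> list:
    # Split camelCase / snake_case identifiers into lowercase parts.
    return [part.lower() for part in re.findall(r"[A-Za-z][a-z]*", token)]


def suggest_fixes(token: str) -> list:
    fixes = []
    for part in split_identifier(token):
        if part in VOCABULARY:
            continue
        close = difflib.get_close_matches(part, VOCABULARY, n=1, cutoff=0.8)
        if close and close[0] != part:
            fixes.append((token, part, close[0]))
    return fixes


print(suggest_fixes("requset_changes"))  # [('requset_changes', 'requset', 'request')]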
Example #2
    def generate_file_fixes(self, data_service: DataService, changes: Sequence[Change],
                            ) -> Iterator[FileFix]:
        """
        Generate all data required for any type of further processing.

        Further processing can be comment generation or performance report generation.

        :param data_service: Connection to the Lookout data retrieval service.
        :param changes: The list of changes in the pointed state.
        :return: Iterator with unrendered data per comment.
        """
        log = self._log
        base_files_by_lang = files_by_language(c.base for c in changes)
        head_files_by_lang = files_by_language(c.head for c in changes)
        processed_files_counter = defaultdict(int)
        processed_fixes_counter = defaultdict(int)
        for lang, head_files in head_files_by_lang.items():
            if lang not in self.model:
                log.warning("skipped %d written in %s. Rules for %s do not exist in model",
                            len(head_files), lang, lang)
                continue
            rules = self.model[lang]
            config = self.analyze_config[lang]
            rules = rules.filter_by_confidence(config["confidence_threshold"]) \
                .filter_by_support(config["support_threshold"])
            for file in filter_files(head_files, rules.origin_config["line_length_limit"],
                                     rules.origin_config["overall_size_limit"], log=log):
                processed_files_counter[lang] += 1
                try:
                    prev_file = base_files_by_lang[lang][file.path]
                except KeyError:
                    prev_file = None
                    lines = None
                else:
                    lines = sorted(chain.from_iterable((
                        find_new_lines(prev_file, file),
                        find_deleted_lines(prev_file, file),
                    )))
                log.debug("%s %s", file.path, lines)
                fe = FeatureExtractor(language=lang, **rules.origin_config["feature_extractor"])
                feature_extractor_output = fe.extract_features([file], [lines])
                if feature_extractor_output is None:
                    submit_event("%s.analyze.%s.parse_failures" % (self.name, lang), 1)
                    if config["report_parse_failures"]:
                        log.warning("Failed to parse %s", file.path)
                        yield FileFix(error="Failed to parse", head_file=file, language=lang,
                                      feature_extractor=fe, base_file=prev_file, file_vnodes=[],
                                      line_fixes=[], y_pred_pure=None, y=None)
                else:
                    fixes, file_vnodes, y_pred_pure, y = self._generate_token_fixes(
                        file, fe, feature_extractor_output, data_service.get_bblfsh(), rules)
                    log.debug("%s %d fixes", file.path, len(fixes))
                    processed_fixes_counter[lang] += len(fixes)
                    yield FileFix(error="", head_file=file, language=lang, feature_extractor=fe,
                                  base_file=prev_file, file_vnodes=file_vnodes, line_fixes=fixes,
                                  y_pred_pure=y_pred_pure, y=y)
        for key, val in processed_files_counter.items():
            submit_event("%s.analyze.%s.files" % (self.name, key), val)
        for key, val in processed_fixes_counter.items():
            submit_event("%s.analyze.%s.fixes" % (self.name, key), val)
Example #3
    def generate_typos_fixes(self,
                             changes: Sequence[Change]) -> Iterator[TypoFix]:
        """
        Generate all data about typo fixes required for any type of further processing.

        The processing can be comment generation or performance report generation.

        :param changes: The list of changes in the pointed state.
        :return: Iterator with unrendered data per comment.
        """
        base_files_by_lang = files_by_language(c.base for c in changes)
        head_files_by_lang = files_by_language(c.head for c in changes)
        for lang, head_files in head_files_by_lang.items():
            for file in filter_files(
                    files=head_files,
                    line_length_limit=self.config["line_length_limit"],
                    overall_size_limit=self.config["overall_size_limit"],
                    log=self._log):
                try:
                    prev_file = base_files_by_lang[lang][file.path]
                except KeyError:
                    lines = []
                    old_identifiers = set()
                else:
                    lines = find_new_lines(prev_file, file)
                    old_identifiers = {
                        node.token
                        for node in uast2sequence(prev_file.uast)
                        if bblfsh.role_id("IDENTIFIER") in node.roles
                        and bblfsh.role_id("IMPORT") not in node.roles
                        and node.token
                    }
                changed_nodes = extract_changed_nodes(file.uast, lines)
                new_identifiers = [
                    node for node in changed_nodes
                    if bblfsh.role_id("IDENTIFIER") in node.roles
                    and bblfsh.role_id("IMPORT") not in node.roles
                    and node.token and node.token not in old_identifiers
                ]
                if not new_identifiers:
                    continue
                suggestions = self.check_identifiers(
                    [n.token for n in new_identifiers])
                for index, corrections in suggestions.items():
                    for token in corrections:
                        yield TypoFix(
                            head_file=file,
                            token=new_identifiers[index].token,
                            candidates=[Candidate(*c[:2]) for c in corrections[token]],
                            line_number=new_identifiers[index].start_position.line,
                        )
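
The old-versus-new identifier selection is the heart of this example; reduced to plain sets of tokens (the tokens below are invented), it amounts to:

# old_tokens plays the role of the identifiers collected from the base UAST,
# head_tokens the identifiers found on changed lines of the head revision.
old_tokens = {"request_changes", "data_service", "comment"}
head_tokens = ["request_changes", "dta_service", "new_helper"]

new_identifiers = [t for t in head_tokens if t and t not in old_tokens]
print(new_identifiers)  # ['dta_service', 'new_helper'] - only these are spell-checked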
Example #4
    def check_training_required(
            cls, old_model: FormatModel, ptr: ReferencePointer, config: Mapping[str, Any],
            data_service: "lookout.core.data_requests.DataService", **data) -> bool:
        """
        Return True if the format model needs to be refreshed; otherwise, False.

        We calculate the ratio of the number of changed lines to the overall number of lines.
        If it is bigger than lines_ratio_train_trigger, the model needs to be retrained.

        :param old_model: Current FormatModel.
        :param ptr: Git repository state pointer.
        :param config: Configuration dict.
        :param data: Contains "files" - the list of files in the pointed state.
        :param data_service: Connection to the Lookout data retrieval service.
        :return: True or False.
        """
        _log = logging.getLogger(cls.__name__)
        changes = list(request_changes(
            data_service.get_data(), old_model.ptr, ptr, contents=True, uast=False))
        base_files_by_lang = files_by_language(c.base for c in changes)
        head_files_by_lang = files_by_language(c.head for c in changes)
        config = cls._load_config(config)
        for language, head_files in head_files_by_lang.items():
            try:
                lang_config = config["train"][language]
            except KeyError:
                _log.warning("language %s is not supported, skipped", language)
                continue
            overall_lines = changed_lines = 0
            for file in filter_files(head_files, lang_config["line_length_limit"],
                                     lang_config["overall_size_limit"], log=_log):
                head_lines = len(file.content.splitlines())
                overall_lines += head_lines
                try:
                    prev_file = base_files_by_lang[language][file.path]
                except KeyError:
                    changed_lines += head_lines
                else:
                    changed_lines += len(find_new_lines(prev_file, file))
                    changed_lines += len(find_deleted_lines(prev_file, file))
            ratio = changed_lines / (overall_lines or 1)
            _log.debug("check %s ratio: %.3f", language, ratio)
            if ratio > lang_config["lines_ratio_train_trigger"]:
                _log.info("%s triggers the training with changes ratio %.3f", language, ratio)
                return True
        return False
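
The trigger check boils down to simple arithmetic; a tiny standalone sketch with made-up numbers:

# Suppose the filtered files for one language total 1000 lines, of which
# 120 were added and 30 deleted (all numbers invented for illustration).
overall_lines = 1000
changed_lines = 120 + 30
ratio = changed_lines / (overall_lines or 1)     # 0.15
lines_ratio_train_trigger = 0.2                  # example threshold from the config
print(ratio, ratio > lines_ratio_train_trigger)  # 0.15 False - no retraining yet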
Example #5
 def test_find_modified_lines(self):
     text_base = """
     Lorem ipsum dolor sit amet, consectetur adipiscing elit.
     Maecenas volutpat dui id ipsum cursus, sit amet accumsan nisl ornare.
     Vivamus euismod lorem viverra semper dictum.
     Nam consectetur enim eget elementum mattis.
     Ut condimentum metus vehicula tellus tempus, vel ultricies lectus dapibus.
     Etiam vitae nisi at ante pretium lacinia et eu massa."""
     # Inserted lines: 3 and 6; modified line: 4 (counting from 1; the leading
     # newline in the literals makes line 1 empty).
     text_head = """
     Lorem ipsum dolor sit amet, consectetur adipiscing elit.
     Curabitur congue libero vitae quam venenatis, tristique commodo diam lacinia.
     Mecenas volutpat dui id ipsum cursus, sit amet accumsan nisl ornare.
     Vivamus euismod lorem viverra semper dictum.
     Praesent eu ipsum sit amet elit aliquam laoreet.
     Nam consectetur enim eget elementum mattis.
     Ut condimentum metus vehicula tellus tempus, vel ultricies lectus dapibus.
     Etiam vitae nisi at ante pretium lacinia et eu massa."""
     new_line_indices = find_new_lines(text_base, text_head)
     self.assertEqual(new_line_indices, [3, 4, 6])
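
find_new_lines is imported from lookout in this test; as an approximation of the behavior the assertion expects (an assumption, not the actual implementation), a difflib-based sketch returns exactly [3, 4, 6] for the two literals above:

import difflib


def find_new_lines_sketch(base: str, head: str):
    # 1-based indices of head lines that were inserted or replaced relative to base.
    matcher = difflib.SequenceMatcher(a=base.splitlines(), b=head.splitlines())
    new_lines = []
    for tag, _, _, j1, j2 in matcher.get_opcodes():
        if tag in ("insert", "replace"):
            new_lines.extend(range(j1 + 1, j2 + 1))
    return new_lines


print(find_new_lines_sketch("a\nb\nc", "a\nx\nb\nc\nd"))  # [2, 5]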
Example #6
 def _find_new_lines(self, prev_content: str, content: str) -> List[int]:
     return find_new_lines(prev_content, content)
Example #7
    def analyze(self, ptr_from: ReferencePointer, ptr_to: ReferencePointer,
                data_service: DataService, **data) -> [Comment]:
        """
        Return the list of `Comment`-s - found typo corrections.

        :param ptr_from: The Git revision of the fork point. Exists in both the original and \
                         the forked repositories.
        :param ptr_to: The Git revision to analyze. Exists only in the forked repository.
        :param data_service: The channel to the data service in Lookout server to query for \
                             UASTs, file contents, etc.
        :param data: Extra data passed into the method. Used by the decorators to simplify \
                     the data retrieval.
        :return: List of found review suggestions. Refer to \
                 lookout/core/server/sdk/service_analyzer.proto.
        """
        log = self.log
        comments = []
        changes = list(data["changes"])
        base_files_by_lang = files_by_language(c.base for c in changes)
        head_files_by_lang = files_by_language(c.head for c in changes)
        line_length = self.config.get("line_length_limit",
                                      self.DEFAULT_LINE_LENGTH_LIMIT)
        for lang, head_files in head_files_by_lang.items():
            for file in filter_files(head_files, line_length, log):
                try:
                    prev_file = base_files_by_lang[lang][file.path]
                except KeyError:
                    lines = []
                    old_identifiers = set()
                else:
                    lines = find_new_lines(prev_file, file)
                    old_identifiers = {
                        node.token
                        for node in uast2sequence(prev_file.uast)
                        if bblfsh.role_id("IDENTIFIER") in node.roles
                        and bblfsh.role_id("IMPORT") not in node.roles
                        and node.token
                    }
                changed_nodes = extract_changed_nodes(file.uast, lines)
                new_identifiers = [
                    node for node in changed_nodes
                    if bblfsh.role_id("IDENTIFIER") in node.roles
                    and bblfsh.role_id("IMPORT") not in node.roles
                    and node.token and node.token not in old_identifiers
                ]
                if not new_identifiers:
                    continue
                suggestions = self.check_identifiers(
                    [n.token for n in new_identifiers])
                for index, corrections in suggestions.items():
                    for token in corrections:
                        comment = Comment()
                        comment.file = file.path
                        corrections_line = " " + ", ".join(
                            "%s (%d%%)" % (candidate[0], int(candidate[1] * 100))
                            for candidate in corrections[token])
                        comment.text = ("Possible typo in \"%s\". Suggestions:"
                                        % new_identifiers[index].token) + corrections_line
                        comment.line = new_identifiers[index].start_position.line
                        comment.confidence = int(corrections[token][0][1] * 100)
                        comments.append(comment)
        return comments
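
The comment text assembled above is easier to follow on plain data; the candidate words and confidences below are invented for illustration:

token = "dta_service"
corrections = [("data_service", 0.85), ("date_service", 0.10)]

corrections_line = " " + ", ".join(
    "%s (%d%%)" % (candidate, int(confidence * 100))
    for candidate, confidence in corrections)
text = "Possible typo in \"%s\". Suggestions:" % token + corrections_line
print(text)
# Possible typo in "dta_service". Suggestions: data_service (85%), date_service (10%)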
Example #8
    def generate_typos_fixes(self,
                             changes: Sequence[Change]) -> Iterator[TypoFix]:
        """
        Generate all data about typo fixes required for any type of further processing.

        The processing can be comment generation or performance report generation.

        :param changes: The list of changes in the pointed state.
        :return: Iterator with unrendered data per comment.
        """
        base_files_by_lang = files_by_language(c.base for c in changes)
        head_files_by_lang = files_by_language(c.head for c in changes)
        for lang, head_files in head_files_by_lang.items():
            for file in filter_files(
                    files=head_files,
                    line_length_limit=self.config["line_length_limit"],
                    overall_size_limit=self.config["overall_size_limit"],
                    log=self._log):
                try:
                    prev_file = base_files_by_lang[lang][file.path]
                except KeyError:
                    lines = []
                else:
                    lines = find_new_lines(prev_file.content, file.content)
                identifiers = self._get_identifiers(file.uast, lines)
                new_identifiers = [
                    node for node in identifiers
                    if node.token not in self.allowed_identifiers
                ]
                if not new_identifiers:
                    continue
                self._log.debug("found %d new identifiers" %
                                len(new_identifiers))
                suggestions = self.check_identifiers(
                    [n.token for n in new_identifiers])
                if not suggestions:
                    continue
                for index in suggestions:
                    identifier = new_identifiers[index].token
                    candidates = {
                        token: [
                            Candidate(*sugg)
                            for sugg in suggestions[index][token]
                        ]
                        for token in suggestions[index]
                    }
                    sugg_identifiers, id_confidences = [], []
                    for final_sugg, conf in self.generate_identifier_suggestions(
                            candidates, identifier):
                        sugg_identifiers.append(final_sugg)
                        id_confidences.append(conf)

                    identifier_candidates = [
                        Candidate(i, c) for i, c in zip(
                            sugg_identifiers,
                            self._normalize_confidences(id_confidences),
                        ) if i != identifier
                    ]
                    if identifier_candidates:
                        yield TypoFix(
                            content=file.content.decode("utf-8", "replace"),
                            path=file.path,
                            identifier=identifier,
                            line_number=new_identifiers[index].start_position.line,
                            candidates=identifier_candidates,
                            identifiers_number=len(identifiers),
                        )
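
generate_identifier_suggestions and _normalize_confidences are analyzer methods not shown here; the final filtering step can be sketched with plain tuples, assuming a simple sum-normalization of confidences:

identifier = "dta_service"
suggested = [("dta_service", 0.5), ("data_service", 1.5)]  # invented raw scores

# Normalize the scores so they sum to 1, then drop the unchanged identifier itself.
total = sum(conf for _, conf in suggested) or 1.0
candidates = [(name, conf / total) for name, conf in suggested if name != identifier]
print(candidates)  # [('data_service', 0.75)] - only real corrections are yielded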