def test_extract_features_all_lines(self):
        file = File(content=bytes(self.contents, "utf-8"), uast=self.uast)
        files = [file, file]

        self.check_X_y(*self.extractor.extract_features(
            files, [list(range(1,
                               self.contents.count("\n") + 1))] * 2))
def prepare_files(filenames: Iterable[str], client: BblfshClient,
                  language: str) -> Iterable[File]:
    """
    Prepare the given folder for analysis by extracting UASTs and creating the gRPC wrappers.

    :param filenames: List of paths to files to analyze.
    :param client: Babelfish client. Babelfish server should be started accordingly.
    :param language: Language to consider. Will discard the other languages.
    :return: Iterator of File-s with content, uast, path and language set.
    """
    files = []
    for file in tqdm(filter_filepaths(list(filenames))):
        try:
            res = client.parse(file)
        except NonUTF8ContentException:
            # Skip files that cannot be parsed because of UTF-8 decoding errors.
            continue
        if res.status == 0 and res.language.lower() == language.lower():
            uast = res.uast
            path = file
            with open(file) as f:
                content = f.read().encode("utf-8")
            files.append(
                File(content=content,
                     uast=uast,
                     path=path,
                     language=res.language.lower()))
    return files
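
A minimal usage sketch for prepare_files; the server address and the glob pattern are assumptions, not part of the original snippet:

from glob import glob

client = BblfshClient("0.0.0.0:9432")  # hypothetical local Babelfish server
js_files = prepare_files(glob("project/**/*.js", recursive=True),
                         client, "javascript")
for file in js_files:
    print(file.path, file.language, len(file.content))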
 def setUpClass(cls):
     config = FormatAnalyzer._load_train_config(merge_dicts(
         get_train_config(), {
             "javascript": {
                 "feature_extractor": {
                     "left_siblings_window": 1,
                     "right_siblings_window": 1,
                     "parents_depth": 1,
                     "node_features": ["start_line", "reserved", "roles"],
                 },
             },
         }))
     base = Path(__file__).parent
     with lzma.open(str(base / "benchmark.js.xz"), mode="rt") as fin:
         contents = fin.read()
     with lzma.open(str(base / "benchmark.uast.xz")) as fin:
         uast = bblfsh.Node.FromString(fin.read())
     file = File(content=bytes(contents, "utf-8"), uast=uast)
     files = [file, file]
     cls.fe = FeatureExtractor(language="javascript",
                               **config["javascript"]["feature_extractor"])
     cls.fe.extract_features(files)
     cls.class_representations = cls.fe.composite_class_representations
     cls.n_classes = len(cls.fe.labels_to_class_sequences)
     cls.ordinal = cls.return_node_feature(FeatureId.start_line)
     cls.categorical = cls.return_node_feature(FeatureId.reserved)
     cls.bag = cls.return_node_feature(FeatureId.roles)
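
The override above relies on merge_dicts doing a deep merge; a hedged illustration of that behavior, assuming it recursively merges nested mappings (the keys reuse names from the config above):

defaults = {"javascript": {"feature_extractor": {"parents_depth": 2,
                                                 "left_siblings_window": 5}}}
override = {"javascript": {"feature_extractor": {"parents_depth": 1}}}
merged = merge_dicts(defaults, override)
# merged["javascript"]["feature_extractor"]
# == {"parents_depth": 1, "left_siblings_window": 5}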
    def run(self,
            ptr_from: ReferencePointer,
            data_service_head: DataService,
            data_service_base: Optional[DataService] = None
            ) -> Iterable[FileFix]:
        """
        Run `generate_file_fixes` for all files in ptr_from revision.

        :param ptr_from: Git repository state pointer to the base revision.
        :param data_service_head: Connection to the Lookout data retrieval service to get \
                                  the new files.
        :param data_service_base: Connection to the Lookout data retrieval service to get \
                                  the initial files. If it is None, we assume the empty contents.
        :return: Generator of fixes for each file.
        """
        files_head = list(
            request_files(data_service_head.get_data(),
                          ptr_from,
                          contents=True,
                          uast=True,
                          unicode=True))

        if data_service_base is not None:
            files_base = list(
                request_files(data_service_base.get_data(),
                              ptr_from,
                              contents=True,
                              uast=True,
                              unicode=True))
        else:
            files_base = [File(path=f.path) for f in files_head]
        return self.generate_file_fixes(
            data_service_head,
            [self.Changes(f1, f2) for f1, f2 in zip(files_base, files_head)])
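
A hedged illustration of the base-file fallback above: when there is no base revision, each head file is paired with an empty File of the same path (protobuf fields default to empty values):

files_head = [File(path="a.js", content=b"var a = 1;\n")]
files_base = [File(path=f.path) for f in files_head]
assert files_base[0].content == b""  # protobuf bytes fields default to empty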
    def generate_local_test(mcs, case_name, uast, contents):
        fe_config = FormatAnalyzer._load_config(
            get_config())["train"]["javascript"]
        feature_extractor = FeatureExtractor(language="javascript",
                                             label_composites=label_composites,
                                             **fe_config["feature_extractor"])
        file = File(content=bytes(contents, "utf-8"), uast=uast)
        _, _, (vnodes_y, _, _, _) = feature_extractor.extract_features([file])
        offsets, y_pred, result = cases[case_name]

        def _test(self):
            y_cur = deepcopy(self.y)
            for offset, yi in zip(offsets, y_pred):
                i = None
                for i, vnode in enumerate(vnodes_y):  # noqa: B007
                    if offset == vnode.start.offset:
                        break
                y_cur[i] = yi
            code_generator = CodeGenerator(self.feature_extractor)
            pred_vnodes = code_generator.apply_predicted_y(
                self.vnodes, self.vnodes_y, list(range(len(self.vnodes_y))),
                FakeRules(y_cur))
            generated_file = code_generator.generate(pred_vnodes)
            self.assertEqual(generated_file, result)

        return _test
    def test_extract_features(self):
        file = File(content=bytes(self.contents, "utf-8"), uast=self.uast)
        files = [file, file]

        res = self.extractor.extract_features(files)
        self.assertIsNotNone(res, "Failed to parse files.")
        self.check_X_y(*res)
Example #7
    def test_filter_files_by_overall_size(self):
        files = {
            "one.py": File(content=b"hello"),
            "two.py": File(content=b"world" * 100)
        }

        def getter(key):
            return files[key].content

        filtered = list(
            filter_files_by_overall_size(files.keys(), getter, 1000000))
        self.assertEqual(len(filtered), 2)
        filtered = list(filter_files_by_overall_size(files.keys(), getter, 1))
        self.assertEqual(len(filtered), 0)
        filtered = list(
            filter_files_by_overall_size(files.keys(), getter, 5 * 100))
        self.assertEqual(len(filtered), 1)
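
One implementation consistent with the three assertions above (a sketch, not necessarily the library's code): keep the smallest files while the cumulative size stays within the limit:

def filter_files_by_overall_size_sketch(filenames, content_getter, limit):
    total = 0
    for name in sorted(filenames, key=lambda n: len(content_getter(n))):
        size = len(content_getter(name))
        if total + size > limit:
            break  # everything remaining is at least this big
        total += size
        yield name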
Example #8
    def test_filter_files(self):
        files = [File(path="one", content=b"hello"), File(path="two", content=b"world" * 100)]
        files = {file.path: file for file in files}
        logged = False

        class Log:
            def debug(self, *args, **kwargs):
                nonlocal logged
                logged = True

        bblfsh_client = BblfshClient("0.0.0.0:9432")
        try:
            filtered = filter_files(files=files, line_length_limit=80,
                                    overall_size_limit=5 << 20, log=Log())
            self.assertEqual(len(filtered), 1)
            self.assertEqual(filtered[0].content, b"hello")
            self.assertTrue(logged)
        finally:
            bblfsh_client._channel.close()
Example #9
 def test_files_by_language(self):
     file_stats = {"js": 2, "ruby": 7, "Python": 5}
     files = []
     for language, n_files in file_stats.items():
         for i in range(n_files):
             files.append(File(language=language, uast=Node(children=[Node()]),
                               path=language + str(i)))
     result = files_by_language(files)
     self.assertEqual([("python", 5), ("js", 2), ("ruby", 7)],
                      [(k, len(v)) for k, v in result.items()])
     return result
 def get_class_sequences_from_code(
         code: str) -> Sequence[Tuple[int, ...]]:
     uast = client.parse(filename="",
                         language="javascript",
                         contents=code.encode()).uast
     extractor = FeatureExtractor(language="javascript", **config)
     result = extractor.extract_features(
         [File(content=code.encode(), uast=uast, path="")])
     if result is None:
         self.fail("Could not parse test code.")
     _, _, (vnodes_y, _, _, _) = result
     return [vnode.y for vnode in vnodes_y]
Example #11
 def setUpClass(cls):
     base = Path(__file__).parent
     # str() is needed for Python 3.5
     with lzma.open(str(base / "benchmark.js.xz"), mode="rt") as fin:
         contents = fin.read()
     with lzma.open(str(base / "benchmark.uast.xz")) as fin:
         uast = bblfsh.Node.FromString(fin.read())
     file = File(content=bytes(contents, "utf-8"), uast=uast)
     cls.files = [file]
     config = FormatAnalyzer._load_config(get_config())["train"]
     cls.extractor = FeatureExtractor(
         language="javascript", **config["javascript"]["feature_extractor"])
Example #12
 def setUpClass(cls):
     base = Path(__file__).parent
     # str() is needed for Python 3.5
     with lzma.open(str(base / "benchmark.js.xz"), mode="rt") as fin:
         contents = fin.read()
     with lzma.open(str(base / "benchmark.uast.xz")) as fin:
         uast = bblfsh.Node.FromString(fin.read())
     file = File(content=bytes(contents, "utf-8"), uast=uast)
     cls.files = [file]
     cls.extractor = FeatureExtractor("javascript",
                                      parents_depth=2,
                                      siblings_window=5)
Example #13
 def test_vnode_positions(self):
     test_js_code_filepath = Path(__file__).parent / "jquery.layout.js"
     with open(str(test_js_code_filepath), mode="rb") as f:
         code = f.read()
     uast = bblfsh.BblfshClient("0.0.0.0:9432").parse(
         filename="", language="javascript", contents=code).uast
     file = BytesToUnicodeConverter.convert_file(
         File(content=code, uast=uast, language="javascript", path="test.js"))
     annotated_data = AnnotationManager.from_file(file)
     self.extractor._parse_file(annotated_data)
     # Just should not fail
     self.extractor._classify_vnodes(annotated_data)
Example #14
 def test_find_deleted_lines(self):
     text_base = """
     Lorem ipsum dolor sit amet, consectetur adipiscing elit.
     Maecenas volutpat dui id ipsum cursus, sit amet accumsan nisl ornare.
     Vivamus euismod lorem viverra semper dictum.
     Nam consectetur enim eget elementum mattis.
     Ut condimentum metus vehicula tellus tempus, vel ultricies lectus dapibus.
     Etiam vitae nisi at ante pretium lacinia et eu massa."""
     base_lines_number = text_base.count("\n") + 1
     # Delete first line
     new_line_indices = find_deleted_lines(
         File(content=bytes(text_base, "utf-8")),
         File(content=bytes("\n".join(text_base.split("\n")[1:]), "utf-8")))
     self.assertEqual(new_line_indices, [1])
     # Delete first two lines
     new_line_indices = find_deleted_lines(
         File(content=bytes(text_base, "utf-8")),
         File(content=bytes("\n".join(text_base.split("\n")[2:]), "utf-8")))
     self.assertEqual(new_line_indices, [1])
     # Delete last line
     new_line_indices = find_deleted_lines(
         File(content=bytes(text_base, "utf-8")),
          File(content=bytes("\n".join(text_base.split("\n")[:-1]), "utf-8")))
     self.assertEqual(new_line_indices, [base_lines_number - 1])
     # Delete last two lines
     new_line_indices = find_deleted_lines(
         File(content=bytes(text_base, "utf-8")),
          File(content=bytes("\n".join(text_base.split("\n")[:-2]), "utf-8")))
     self.assertEqual(new_line_indices, [base_lines_number - 2])
     # Delete line in the middle
     middle = 3
     text_head = text_base.split("\n")
     text_head.pop(middle)
     text_head = "\n".join(text_head)
     new_line_indices = find_deleted_lines(
         File(content=bytes(text_base, "utf-8")),
         File(content=bytes(text_head, "utf-8")))
     self.assertEqual(new_line_indices, [middle, middle + 1])
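
A difflib-based sketch consistent with all five assertions above (not the library's implementation): report the head lines adjacent to each deleted block, 1-based and clipped to the file:

import difflib

def find_deleted_lines_sketch(base: File, head: File) -> list:
    old = base.content.decode("utf-8", "replace").splitlines()
    new = head.content.decode("utf-8", "replace").splitlines()
    matcher = difflib.SequenceMatcher(a=old, b=new, autojunk=False)
    lines = set()
    for tag, _, _, j1, j2 in matcher.get_opcodes():
        if tag == "delete":
            # Head lines surrounding the deleted block, kept in valid range.
            lines.update(j for j in (j1, j1 + 1) if 1 <= j <= len(new))
    return sorted(lines)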
Example #15
    def test_extract_features_some_lines(self):
        file = File(content=bytes(self.contents, "utf-8"), uast=self.uast)
        files = [file]

        X1, y1, vn1 = self.extractor.extract_features(
            files, [list(range(1,
                               self.contents.count("\n") // 2 + 1))] * 2)
        self.check_X_y(X1, y1, vn1)
        X2, y2, vn2 = self.extractor.extract_features(files)
        self.assertTrue((X1 == X2[:len(X1)]).all())
        self.assertTrue((y1 == y2[:len(y1)]).all())
        self.assertTrue(vn1 == vn2[:len(vn1)])
        self.assertLess(len(y1), len(y2))
Example #16
 def setUpClass(cls):
     logging.basicConfig(level=logging.INFO)
     logging.getLogger("IdTyposAnalyzer").setLevel(logging.DEBUG)
     base = Path(__file__).parent
     # str() is needed for Python 3.5
     cls.bblfsh_client = bblfsh.BblfshClient("0.0.0.0:9432")
     with lzma.open(str(base / "test_base_file.js.xz")) as fin:
         contents = fin.read()
         uast = cls.bblfsh_client.parse("test_base_file.js",
                                        contents=contents).uast
         cls.base_files = [
             File(path="test_file.js",
                  content=contents,
                  uast=uast,
                  language="Javascript")
         ]
     with lzma.open(str(base / "test_head_file.js.xz")) as fin:
         contents = b"var print_tipe = 0;\n" + fin.read()
         uast = cls.bblfsh_client.parse("test_head_file.js",
                                        contents=contents).uast
         cls.head_files = [
             File(path="test_file.js",
                  content=contents,
                  uast=uast,
                  language="Javascript")
         ]
     cls.ptr = ReferencePointer("someurl", "someref", "somecommit")
     cls.config = {
         "model": MODEL_PATH,
         "confidence_threshold": 0.0,
         "n_candidates": 3,
         "check_all_identifiers": True,
         "analyze": {
             "filepath": cls.base_files[0].path,
             "wrong_id": "print_tipe",
             "line": 0
         }
     }
 def setUpClass(cls):
     logging.basicConfig(level=logging.INFO)
     logging.getLogger("IdTyposAnalyzer").setLevel(logging.DEBUG)
     base = Path(__file__).parent
     # str() is needed for Python 3.5
     client = bblfsh.BblfshClient("0.0.0.0:9432")
     with lzma.open(str(base / "test_base_file.py.xz")) as fin:
         uast = client.parse("test_base_file.py", contents=fin.read()).uast
         cls.base_files = [
             File(path="test_base_file.py",
                  content=fin.read(),
                  uast=uast,
                  language="Python")
         ]
     with lzma.open(str(base / "test_head_file.py.xz")) as fin:
         uast = client.parse("test_head_file.py", contents=fin.read()).uast
         cls.head_files = [
             File(path="test_head_file.py",
                  content=fin.read(),
                  uast=uast,
                  language="Python")
         ]
     cls.ptr = ReferencePointer("someurl", "someref", "somecommit")
Example #18
 def test_find_modified_lines(self):
     text_base = """
     Lorem ipsum dolor sit amet, consectetur adipiscing elit.
     Maecenas volutpat dui id ipsum cursus, sit amet accumsan nisl ornare.
     Vivamus euismod lorem viverra semper dictum.
     Nam consectetur enim eget elementum mattis.
     Ut condimentum metus vehicula tellus tempus, vel ultricies lectus dapibus.
     Etiam vitae nisi at ante pretium lacinia et eu massa."""
     # inserted lines: 3 and 6 (counting from 1 with a new line at the start)
     # modified line: 4
     text_head = """
     Lorem ipsum dolor sit amet, consectetur adipiscing elit.
     Curabitur congue libero vitae quam venenatis, tristique commodo diam lacinia.
     Mecenas volutpat dui id ipsum cursus, sit amet accumsan nisl ornare.
     Vivamus euismod lorem viverra semper dictum.
     Praesent eu ipsum sit amet elit aliquam laoreet.
     Nam consectetur enim eget elementum mattis.
     Ut condimentum metus vehicula tellus tempus, vel ultricies lectus dapibus.
     Etiam vitae nisi at ante pretium lacinia et eu massa."""
     new_line_indices = find_new_lines(
         File(content=bytes(text_base, "utf-8")),
         File(content=bytes(text_head, "utf-8")))
     self.assertEqual(new_line_indices, [3, 4, 6])
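
A difflib-based sketch that reproduces the expected output above; the real find_new_lines may differ, this only pins down the semantics (1-based line numbers in the head file that were inserted or modified):

import difflib

def find_new_lines_sketch(base: File, head: File) -> list:
    old = base.content.decode("utf-8", "replace").splitlines()
    new = head.content.decode("utf-8", "replace").splitlines()
    matcher = difflib.SequenceMatcher(a=old, b=new, autojunk=False)
    lines = []
    for tag, _, _, j1, j2 in matcher.get_opcodes():
        if tag in ("insert", "replace"):
            lines.extend(range(j1 + 1, j2 + 1))
    return lines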
 def test_files_by_language(self):
     file_stats = {"js": 2, "Python": 5, "ruby": 7}
     files = []
     for language, n_files in file_stats.items():
         for i in range(n_files):
             files.append(
                 File(language=language, uast=self.uast, path=str(i)))
     result = FormatAnalyzer._files_by_language(files)
     self.assertEqual({"js": 2, "python": 5, "ruby": 7},
                      {k: len(v) for k, v in result.items()})
     return result
    def test_extract_features_some_lines(self):
        file = File(content=bytes(self.contents, "utf-8"), uast=self.uast)
        files = [file]

        X1_csr, y1, (vn1_y, vn1, vn1_parents,
                     n1_parents) = self.extractor.extract_features(
                         files,
                         [list(range(1,
                                     self.contents.count("\n") // 2 + 1))] * 2)
        self.check_X_y(X1_csr, y1, (vn1_y, vn1, vn1_parents, n1_parents))
        X2_csr, y2, (vn2_y, vn2, _, _) = self.extractor.extract_features(files)
        X1, X2 = X1_csr.toarray(), X2_csr.toarray()
        self.assertTrue((X1 == X2[:len(X1)]).all())
        self.assertTrue((y1 == y2[:len(y1)]).all())
        self.assertTrue(vn1_y == vn2_y[:len(vn1_y)])
        self.assertLess(len(y1), len(y2))
Example #21
 def setUpClass(cls):
     cls.maxDiff = None
     base = Path(__file__).parent
     # str() is needed for Python 3.5
     with lzma.open(str(base / "benchmark_small.js.xz"), mode="rt") as fin:
         contents = fin.read()
     with lzma.open(str(base / "benchmark_small.js.uast.xz")) as fin:
         uast = bblfsh.Node.FromString(fin.read())
     config = FormatAnalyzer._load_train_config(get_train_config())
     fe_config = config["javascript"]
     cls.feature_extractor = FeatureExtractor(
         language="javascript",
         label_composites=label_composites,
         **fe_config["feature_extractor"])
     cls.file = File(content=bytes(contents, "utf-8"), uast=uast)
     cls.X, cls.y, (cls.vnodes_y, cls.vnodes, cls.vnode_parents, cls.node_parents) = \
         cls.feature_extractor.extract_features([cls.file])
Example #22
 def get_files_from_tar(tar_path: str) -> Dict[str, File]:
     files = defaultdict(lambda: [None, None])
     with tarfile.open(tar_path) as tar:
         for member in tar:
             name = member.name
             if name == ".":
                 continue
             file = tar.extractfile(member)
             # bool doubles as list index: files[name] == [content, uast]
             uast = name.endswith(".uast")
             content = file.read()
             if uast:
                 name = name[:-5]
                 content = bblfsh.Node.FromString(content)
             files[name][uast] = content
     for key, (content, uast) in files.items():
         files[key] = File(path=key, content=content, uast=uast, language="JavaScript")
     return files
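
Hypothetical usage of the helper above; it expects each source entry (e.g. lib/a.js) to sit next to its serialized UAST (lib/a.js.uast), and the tar path here is an assumption:

files = get_files_from_tar("fixtures/sources.tar")
for path, file in files.items():
    print(path, len(file.content), len(file.uast.children))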
Example #23
def prepare_file(filename: str, client: BblfshClient, language: str) -> File:
    """
    Prepare the given file for analysis by extracting UAST and creating the gRPC wrapper.

    :param filename: Path to the filename to analyze.
    :param client: Babelfish client. Babelfish server should be started accordingly.
    :param language: Language to consider. Will discard the other languages.
    :return: File with content, uast and path set.
    """
    assert os.path.isfile(filename), "\"%s\" should be a file" % filename
    res = client.parse(filename, language)
    assert res.status == 0, "Parse returned status %s for file %s" % (
        res.status, filename)
    error_log = "Language for % should be %s instead of %s"
    assert res.language.lower() == language.lower(), error_log % (
        filename, language, res.language)

    with open(filename) as f:
        content = f.read().encode("utf-8")

    return File(content=content, uast=res.uast, path=filename)
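
A matching single-file sketch (server address and file path are assumptions):

client = BblfshClient("0.0.0.0:9432")
file = prepare_file("src/index.js", client, "javascript")
print(file.path, len(file.uast.children))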
Example #24
 def test_multiple_files(self):
     data = [
         ("var a = 0", {1: (CLS_NOOP,)}),
         ("var b = 123", {4: (CLS_NOOP,)}),
     ]
     files = []
     for i, (code, _) in enumerate(data):
         uast, errors = parse_uast(self.stub,
                                   code,
                                   filename="",
                                   language=self.language)
         if errors:
             self.fail("Could not parse the testing code.")
         files.append(
             File(content=code.encode(), uast=uast,
                  path="test_file_%d" % i))
     X, y, (vnodes_y, vnodes, vnode_parents,
            node_parents) = self.fe.extract_features(files)
     y_pred = y.copy()
     rule_winners = numpy.zeros(y.shape)
     for (_, modif) in data:
         for i in modif:
             y_pred[i] = self._to_label(modif[i])
     checker = UASTStabilityChecker(self.fe)
     new_y, new_y_pred, new_vnodes_y, new_rule_winners, safe_preds = checker.check(
         y,
         y_pred,
         vnodes_y,
         vnodes,
         files,
         self.stub,
         vnode_parents,
         node_parents,
         rule_winners,
         grouped_quote_predictions={})
     self.assertEqual(list(safe_preds), [0, 2, 3, 4, 5, 6, 7, 8])
Example #25
    def run(self, ptr: ReferencePointer,
            data_service: DataService) -> Iterable[TypoFix]:
        """
        Run `generate_typos_fixes` for all lines and all files in `ptr_from` revision.

        :param ptr: Git repository state pointer to the revision that should be analyzed.
        :param data_service: Connection to the Lookout data retrieval service to get the files.
        :return: Generator of fixes for each file.
        """
        for file in request_files(data_service.get_data(),
                                  ptr,
                                  contents=True,
                                  uast=True,
                                  unicode=False):
            if file.path == self.config["filepath_to_analyze"]:
                break
        else:
            raise ValueError("No such file %s in %s" %
                             (self.config["filepath_to_analyze"], ptr))

        typos_fixes = list(
            self.generate_typos_fixes([
                UnicodeChange(head=file,
                              base=File(path=file.path,
                                        language=file.language))
            ]))
        if typos_fixes:
            return typos_fixes
        identifiers_number = len(self._get_identifiers(file.uast, []))
        if not identifiers_number:
            raise ValueError("No identifiers for file %s in %s" %
                             (self.config["filepath_to_analyze"], ptr))
        return [
            TypoFix(content=file.content.decode("utf-8", "replace"),
                    path=file.path,
                    line_number=0,
                    identifier="",
                    candidates=[],
                    identifiers_number=identifiers_number)
        ]
Example #26
 def edit_and_test(self,
                   code: str,
                   modifs: Mapping[int, Sequence[str]],
                   *,
                   quote_indices: Optional[Tuple[int, ...]] = None,
                   bad_indices: Optional[FrozenSet[int]] = None) -> None:
     uast, errors = parse_uast(self.stub,
                               code,
                               filename="",
                               language=self.language)
     if errors:
         self.fail("Could not parse the testing code.")
     file = File(content=code.encode(), uast=uast, path="test_file")
     X, y, (vnodes_y, vnodes, vnode_parents,
            node_parents) = self.fe.extract_features([file])
     y_pred = y.copy()
     rule_winners = numpy.zeros(y.shape)
     for index, classes in modifs.items():
         y_pred[index] = self._to_label(classes)
     checker = UASTStabilityChecker(self.fe)
     grouped_quote_predictions = self._grouped_predictions_mapping(
         vnodes, quote_indices)
     new_y, new_y_pred, new_vnodes_y, new_rule_winners, safe_preds = checker.check(
         y,
         y_pred,
         vnodes_y,
         vnodes, [file],
         self.stub,
         vnode_parents,
         node_parents,
         rule_winners,
         grouped_quote_predictions=grouped_quote_predictions)
     bad_preds = set(range(y.shape[0])) - set(safe_preds)
     bad = modifs.keys() if bad_indices is None else bad_indices
     self.assertEqual(bad_preds, bad)
     self.assertEqual(len(y) - len(bad), len(new_y))
     self.assertEqual(len(y_pred) - len(bad), len(new_y_pred))
     self.assertEqual(len(vnodes_y) - len(bad), len(new_vnodes_y))
     self.assertEqual(len(rule_winners) - len(bad), len(new_rule_winners))
Example #27
    def test_extended_roles(self):
        file = File(content=bytes(self.contents, "utf-8"), uast=self.uast)
        X, _, vns = self.extractor.extract_features([file])
        # last columns are only roles
        last_columns = self.extractor.parents_depth + self.extractor.siblings_window
        self.assertGreater(
            numpy.count_nonzero(X[:, -last_columns:] > len(ROLE_INDEX)), 0)
        col_role_left_sibling = (
            self.extractor.count_features(FeatureType.node) +
            self.extractor.count_features(FeatureType.left_siblings) - 1)

        def get_ext_role(role_index):
            return RESERVED[role_index - len(ROLE_INDEX)]

        for i, (x, vn) in enumerate(zip(X, vns)):
            start = vn.start.offset
            # Don't test the first two nodes, they might not have a left sibling
            if i < 2:
                continue
            role_index_left = x[col_role_left_sibling]
            if role_index_left >= len(ROLE_INDEX):
                role_left = get_ext_role(role_index_left)
                self.assertEqual(self.contents[start - len(role_left):start],
                                 role_left)
 def test_extract_features_exact_match(self):
     file = File(content=bytes(self.contents, "utf-8"), uast=self.uast)
     files = [file]
     X, y, (vnodes_y, vnodes, _, _) = self.extractor.extract_features(files)
     self.assertEqual("".join(vnode.value for vnode in vnodes),
                      self.contents)
def return_features() -> Response:
    """Featurize the given code."""
    body = request.get_json()
    code = body["code"]
    babelfish_address = body["babelfish_address"]
    language = body["language"]
    client = BblfshClient(babelfish_address)
    res = client.parse(filename="", contents=code.encode(), language=language)
    if res.status != 0:
        abort(500)
    model = FormatModel().load(
        str(Path(__file__).parent / "models" / "model.asdf"))
    if language not in model:
        raise NotFittedError()
    rules = model[language]
    file = File(content=code.encode(), uast=res.uast, language="javascript")
    config = rules.origin_config["feature_extractor"]
    config["return_sibling_indices"] = True
    fe = FeatureExtractor(language=language, **config)
    res = fe.extract_features([file])
    if res is None:
        abort(500)
    X, y, (vnodes_y, vnodes, vnode_parents, node_parents,
           sibling_indices) = res
    y_pred, rule_winners, rules, grouped_quote_predictions = rules.predict(
        X=X, vnodes_y=vnodes_y, vnodes=vnodes, feature_extractor=fe)
    refuse_to_predict = y_pred < 0
    _, _, _, _, safe_preds = filter_uast_breaking_preds(
        y=y,
        y_pred=y_pred,
        vnodes_y=vnodes_y,
        vnodes=vnodes,
        files={file.path: file},
        feature_extractor=fe,
        stub=client._stub,
        vnode_parents=vnode_parents,
        node_parents=node_parents,
        rule_winners=rule_winners,
        grouped_quote_predictions=grouped_quote_predictions)
    break_uast = [False] * X.shape[0]
    for wrong_pred in set(range(X.shape[0])).difference(safe_preds):
        break_uast[wrong_pred] = True
    labeled_indices = {id(vnode): i for i, vnode in enumerate(vnodes_y)}
    app.logger.info("returning features of shape %d, %d" % X.shape)
    app.logger.info("length of rules: %d", len(rules))
    return jsonify({
        "code": code,
        "features": _input_matrix_to_descriptions(X, fe),
        "ground_truths": y.tolist(),
        "predictions": y_pred.tolist(),
        "refuse_to_predict": refuse_to_predict.tolist(),
        "sibling_indices": sibling_indices,
        "rules": _rules_to_jsonable(rules, fe),
        "winners": rule_winners.tolist(),
        "break_uast": break_uast,
        "feature_names": fe.feature_names,
        "class_representations": fe.composite_class_representations,
        "class_printables": fe.composite_class_printables,
        "vnodes": list(map(partial(_vnode_to_jsonable, labeled_indices=labeled_indices),
                           vnodes)),
        "config": _mapping_to_jsonable(rules.origin_config),
    })
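
A hypothetical client call for the endpoint above; the route and port are assumptions, since the snippet does not show the Flask routing:

import requests

resp = requests.post("http://localhost:5000/features", json={
    "code": "var a = 0;\n",
    "babelfish_address": "0.0.0.0:9432",
    "language": "javascript",
})
resp.raise_for_status()
print(resp.json()["predictions"])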
Example #30
 def create_files():
     files = [File(path="one", content=b"hello"),
              File(path="two", content=b"world" * 100)] * 1000
     files = random.sample(files, k=len(files))  # note: no need to set the seed
     return {file.path: file for file in files}