Beispiel #1
0
 def setUp(self):
     """Build the JavaScript feature extractor and the annotated file fixture.

     Reads the "train" -> "javascript" section of the loaded analyzer config,
     constructs a ``FeatureExtractor`` from its "feature_extractor" options and
     wraps ``self.file`` into an ``AnnotationManager``.
     """
     config = FormatAnalyzer._load_config(get_config())
     self.final_config = config["train"]["javascript"]
     self.extractor = FeatureExtractor(
         language="javascript", **self.final_config["feature_extractor"])
     # NOTE(review): self.file is not assigned in this method; presumably it is
     # provided at class level (e.g. in setUpClass) - confirm.
     self.annotated_file = AnnotationManager.from_file(self.file)
 def _annotate_files(
         self,
         files: Iterable[UnicodeFile],
         lines: Optional[List[List[int]]] = None
 ) -> List[AnnotationManager]:
     """Wrap each file into an AnnotationManager and run the annotation steps.

     :param files: Files to annotate.
     :param lines: Optional per-file line collections, indexed in the same \
                   order as `files`. A non-None entry is attached to the \
                   matching file as a `LinesToCheckAnnotation` spanning the \
                   whole file.
     :return: The annotated files that were parsed successfully. Files whose \
              parsing raised `AssertionError` are logged and left out.
     """
     annotated_files = []
     for index, source in enumerate(files):
         # Remember the path up front: it is only needed for the warning.
         source_path = source.path
         annotated = AnnotationManager.from_file(source)
         if lines is not None and lines[index] is not None:
             whole_file_stop = len(annotated)
             selected_lines = frozenset(lines[index])
             annotated.add(
                 LinesToCheckAnnotation(0, whole_file_stop, selected_lines))
         try:
             self._parse_file(annotated)
         except AssertionError as e:
             self._log.warning("could not parse %s: error '%s', skipping",
                               source_path, e)
             if self.debug_parsing:
                 # Interactive aid: show the traceback and wait for the user.
                 import traceback
                 traceback.print_exc()
                 input("Press Enter to continue…")
         else:
             self._classify_vnodes(annotated)
             self._merge_classes_to_composite_labels(annotated)
             self._add_noops(annotated)
             annotated_files.append(annotated)
             self._fill_vnode_parents(annotated)
     vnodes_parsed_number = 0
     for annotated in annotated_files:
         vnodes_parsed_number += annotated.count(TokenAnnotation)
     self._log.debug("Parsed %d vnodes", vnodes_parsed_number)
     return annotated_files
Beispiel #3
0
 def test_classify_vnodes_with_trailing_space(self):
     """Classify a file that ends with an extra space and check the result.

     Verifies that atomic tokens reproduce the contents exactly, that the
     nested annotations are contiguous from 0 to the end of the contents, and
     that the expected classes occur.
     """
     source_text = self.contents + " "
     raw_file = File(content=source_text.encode(),
                     uast=self.uast,
                     language="javascript",
                     path="test")
     annotated = AnnotationManager.from_file(
         BytesToUnicodeConverter.convert_file(raw_file))
     self.extractor._parse_file(annotated)
     self.extractor._classify_vnodes(annotated)

     # Concatenated atomic tokens must give back the original text.
     atomic_tokens = annotated.iter_by_type(AtomicTokenAnnotation)
     rebuilt = "".join(annotated[tok.span] for tok in atomic_tokens)
     self.assertEqual(rebuilt, source_text)

     class_histogram = Counter()
     previous_stop = 0
     nested = annotated.iter_by_type_nested(
         AtomicTokenAnnotation, ClassAnnotation)
     for group in nested:
         # Each annotation has to start exactly where the previous one ended.
         self.assertEqual(previous_stop, group.start)
         if ClassAnnotation in group:
             class_histogram.update(
                 CLASSES[cls_index]
                 for cls_index in group[ClassAnnotation].cls)
         previous_stop = group.stop
     self.assertEqual(len(source_text), previous_stop)

     increases = class_histogram[CLS_SPACE_INC]
     self.assertEqual(increases, class_histogram[CLS_SPACE_DEC] + 1)
     self.assertGreater(increases, 0)
     for expected_cls in (CLS_SPACE, CLS_NEWLINE, CLS_SINGLE_QUOTE):
         self.assertGreater(class_histogram[expected_cls], 0)
     # Single quotes are counted in pairs.
     self.assertTrue(class_histogram[CLS_SINGLE_QUOTE] % 2 == 0)
Beispiel #4
0
 def test_positions(self):
     """Check that parsed node start positions never decrease.

     Parses "browser-policy-content.js" (located next to this test module)
     through the client at 0.0.0.0:9432 and compares each node with its
     successor by start line and start offset.
     """
     fixture_path = Path(__file__).parent / "browser-policy-content.js"
     code = fixture_path.read_bytes()
     client = bblfsh.BblfshClient("0.0.0.0:9432")
     response = client.parse(filename="",
                             language="javascript",
                             contents=code)
     uast = response.uast
     converter = BytesToUnicodeConverter(code)
     unicode_code = converter.convert_content()
     unicode_uast = converter.convert_uast(uast)
     annotated_data = AnnotationManager.from_file(
         UnicodeFile(content=unicode_code,
                     uast=unicode_uast,
                     language="javascript",
                     path="test.js"))
     self.extractor._parse_file(annotated_data)
     nodes, _ = file_to_old_parse_file_format(annotated_data)
     for index, pair in enumerate(zip(nodes, nodes[1:])):
         earlier, later = pair
         pair_ids = (index, index + 1)
         self.assertLessEqual(
             earlier.start.line, later.start.line,
             "Start line position decrease for %d, %d nodes" % pair_ids)
         self.assertLessEqual(
             earlier.start.offset, later.start.offset,
             "Start offset position decrease for %d, %d nodes" % pair_ids)
Beispiel #5
0
 def test_vnode_positions(self):
     """Smoke test: parsing and classifying "jquery.layout.js" must not raise."""
     fixture_path = Path(__file__).parent / "jquery.layout.js"
     code = fixture_path.read_bytes()
     client = bblfsh.BblfshClient("0.0.0.0:9432")
     uast = client.parse(
         filename="", language="javascript", contents=code).uast
     raw_file = File(
         content=code, uast=uast, language="javascript", path="test.js")
     annotated_data = AnnotationManager.from_file(
         BytesToUnicodeConverter.convert_file(raw_file))
     self.extractor._parse_file(annotated_data)
     # No assertions on purpose: the call only has to complete.
     self.extractor._classify_vnodes(annotated_data)
Beispiel #6
0
 def test_parse_file_comment_after_regexp(self):
     """Raw tokens of a snippet mixing a line comment and a regexp rebuild it.

     The concatenation of all RawTokenAnnotation spans must equal the decoded
     source.
     """
     code = b"x = // comment\n/<regexp>/;"
     client = bblfsh.BblfshClient("0.0.0.0:9432")
     response = client.parse(filename="",
                             language="javascript",
                             contents=code)
     raw_file = File(
         uast=response.uast, content=code, language="javascript", path="")
     annotated_file = AnnotationManager.from_file(
         BytesToUnicodeConverter.convert_file(raw_file))
     self.extractor._parse_file(annotated_file)
     raw_tokens = annotated_file.iter_by_type(RawTokenAnnotation)
     rebuilt = "".join(annotated_file[tok.span] for tok in raw_tokens)
     self.assertEqual(rebuilt, code.decode())
Beispiel #7
0
 def test_parse_file_with_trailing_space(self):
     """Parse contents with an extra trailing space and check the last node.

     The last node must end at the end of the contents and on its last line,
     and the node values joined together must equal the contents.
     """
     source_text = self.contents + " "
     raw_file = File(content=source_text.encode(),
                     uast=self.uast,
                     language="javascript",
                     path="test")
     annotated_data = AnnotationManager.from_file(
         BytesToUnicodeConverter.convert_file(raw_file))
     self.extractor._parse_file(annotated_data)
     nodes, _ = file_to_old_parse_file_format(annotated_data)
     end_offset, end_line, _end_col = nodes[-1].end
     self.assertEqual(len(source_text), end_offset)
     # The trailing space does not open a new line, so the line count holds.
     self.assertEqual(len(source_text.splitlines()), end_line)
     rebuilt = "".join(node.value for node in nodes)
     self.assertEqual(rebuilt, source_text)
Beispiel #8
0
 def test_parse_file_exact_match(self):
     """Parsing the xz-compressed fixture must reproduce its text exactly.

     Checked twice: through the node values of the old parse format and
     through the spans of all RawTokenAnnotation instances.
     """
     archive_path = str(Path(__file__).parent / "for_parse_test.js.xz")
     with lzma.open(archive_path, mode="rb") as archive:
         code = archive.read()
     client = bblfsh.BblfshClient("0.0.0.0:9432")
     response = client.parse(filename="",
                             language="javascript",
                             contents=code)
     raw_file = File(
         uast=response.uast, content=code, language="javascript", path="")
     annotated_file = AnnotationManager.from_file(
         BytesToUnicodeConverter.convert_file(raw_file))
     self.extractor._parse_file(annotated_file)
     nodes, _ = file_to_old_parse_file_format(annotated_file)
     from_nodes = "".join(node.value for node in nodes)
     self.assertEqual(from_nodes, code.decode())
     raw_tokens = annotated_file.iter_by_type(RawTokenAnnotation)
     from_raw_tokens = "".join(annotated_file[tok.span] for tok in raw_tokens)
     self.assertEqual(from_raw_tokens, code.decode())