def _find_parent(
        search_start_offset: int,
        file: AnnotationManager,
        closest_left_node_id: int,
    ) -> Optional[bblfsh.Node]:
        """
        Compute the UAST parent of the `TokenAnnotation` as the LCA of the closest left and right \
        Babelfish nodes.

        :param search_start_offset: Offset of the current node.
        :param file: Source code annotated with `UASTAnnotation` and `TokenAnnotation`.
        :param closest_left_node_id: bblfsh node of the closest parent already gone through.
        :return: The `bblfsh.Node` of the found parent or None if no parent was found.
        """
        parents = file.get(UASTAnnotation).parents
        # Collect the ids of every ancestor of the closest left node.
        left_ancestor_ids = set()
        ancestor_id = closest_left_node_id
        while ancestor_id in parents:
            ancestor_id = id(parents[ancestor_id])
            left_ancestor_ids.add(ancestor_id)

        # Find the first annotation to the right that is backed by a UAST node.
        right_vnode = None
        for candidate in file.iter_by_type(
                TokenAnnotation, start_offset=search_start_offset):
            if candidate.has_node:
                right_vnode = candidate
                break
        if right_vnode is None:
            return None
        # Climb from the right node until we reach an ancestor shared with the
        # left side — that is the lowest common ancestor.
        ancestor_id = id(right_vnode.node)
        while ancestor_id in parents:
            ancestor = parents[ancestor_id]
            if id(ancestor) in left_ancestor_ids:
                return ancestor
            ancestor_id = id(ancestor)
        return None
def file_to_old_parse_file_format(
    file: AnnotationManager
) -> Tuple[List["VirtualNode"], Dict[int, bblfsh.Node]]:
    """
    Convert `AnnotationManager` instance to the deprecated output format of \
    `FeatureExtractor._parse_file()`.

    The function exists for backward compatibility and should be removed after the refactoring is \
    finished.

    :param file: file annotated with `UASTAnnotation`, `PathAnnotation` and `RawTokenAnnotation`. \
                 It is expected to be the output of  `FeatureExtractor._parse_file()`.
    :return: The old `FeatureExtractor._parse_file()` output format, that is \
             Tuple with `VirtualNode`-s and `bbfsh.Node` id to parent mapping.
    """
    path = file.get(PathAnnotation).path
    lines = file.sequence.splitlines(keepends=True)
    # Prefix sums of line lengths give the start offset of each line; the
    # final +1 mirrors the historical end-of-file handling.
    lens = [0] + [len(line) for line in lines]
    lens[-1] += 1
    line_starts = numpy.array(lens).cumsum()
    vnodes = [
        VirtualNode(
            file[annotation.span],
            _to_position(lines, line_starts, annotation.start),
            _to_position(lines, line_starts, annotation.stop),
            is_accumulated_indentation=False,
            path=path,
            node=annotation.node,
            y=None,
        )
        for annotation in file.iter_by_type(RawTokenAnnotation)
    ]
    return vnodes, file.get(UASTAnnotation).parents
Beispiel #3
0
    def test_add(self):
        """`add` rejects overlapping annotations and accepts disjoint ones."""
        manager = AnnotationManager("0123456789")
        initial = [Annotation(0, 3), Annotation(3, 3), Annotation(3, 4)]
        manager.add(*initial)
        manager.add(AnotherAnnotation(0, 9))

        # Re-adding the existing spans or any span crossing them must fail.
        clashing = initial + [
            Annotation(0, 1),
            Annotation(1, 4),
            Annotation(3, 3),
            Annotation(2, 5),
        ]
        for bad in clashing:
            with self.assertRaises(ValueError):
                manager.add(bad)
        # Zero-length or non-crossing spans are accepted.
        for good in (Annotation(0, 0), Annotation(4, 4), Annotation(9, 11),
                     Annotation(4, 9)):
            manager.add(good)
Beispiel #4
0
 def setUp(self):
     """Build the feature extractor and the annotated file used by the tests."""
     config = FormatAnalyzer._load_config(get_config())
     self.final_config = config["train"]["javascript"]
     self.extractor = FeatureExtractor(
         language="javascript", **self.final_config["feature_extractor"])
     # The original code constructed this annotation manager twice (before and
     # after the extractor); once is sufficient and halves the fixture cost.
     self.annotated_file = AnnotationManager.from_file(self.file)
Beispiel #5
0
 def test_get(self):
     """`get` raises while the requested annotation is absent or ambiguous."""
     manager = AnnotationManager("0123456789")
     # Nothing was added yet, so there is nothing to return.
     with self.assertRaises(Exception):
         manager.get(Annotation)
     manager.add(Annotation(4, 7))
     # Providing the exact span makes the lookup unambiguous.
     manager.get(Annotation, (4, 7))
     # Without a span the call still raises.
     with self.assertRaises(Exception):
         manager.get(Annotation)
Beispiel #6
0
    def test_getitem(self):
        """Indexing an `AnnotationManager` mirrors indexing its sequence."""
        manager = AnnotationManager("")
        self.assertEqual(manager[:1], "")
        with self.assertRaises(IndexError):
            manager[0]

        text = "0123456789"
        manager = AnnotationManager(text)
        self.assertEqual(manager[0], text[0])
        self.assertEqual(manager[0:5], text[0:5])
        self.assertEqual(manager[(5, 7)], text[5:7])  # a pair works as a span
        self.assertEqual(manager[7:5], text[7:5])  # inverted slice -> ""
        with self.assertRaises(IndexError):
            manager[len(text) + 1]
 def _annotate_files(
         self,
         files: Iterable[UnicodeFile],
         lines: Optional[List[List[int]]] = None
 ) -> List[AnnotationManager]:
     """
     Run the full annotation pipeline over a sequence of files.

     Files that fail to parse are logged and skipped.

     :param files: Files to process.
     :param lines: Optional per-file lists of line numbers to restrict the check to.
     :return: `AnnotationManager` for every successfully parsed file.
     """
     annotated = []
     for index, raw_file in enumerate(files):
         path = raw_file.path
         file = AnnotationManager.from_file(raw_file)
         if lines is not None and lines[index] is not None:
             file.add(
                 LinesToCheckAnnotation(0, len(file), frozenset(lines[index])))
         try:
             self._parse_file(file)
         except AssertionError as e:
             self._log.warning("could not parse %s: error '%s', skipping",
                               path, e)
             if self.debug_parsing:
                 import traceback
                 traceback.print_exc()
                 input("Press Enter to continue…")
             continue
         self._classify_vnodes(file)
         self._merge_classes_to_composite_labels(file)
         self._add_noops(file)
         annotated.append(file)
         self._fill_vnode_parents(file)
     total = sum(file.count(TokenAnnotation) for file in annotated)
     self._log.debug("Parsed %d vnodes", total)
     return annotated
Beispiel #8
0
 def test_classify_vnodes_with_trailing_space(self):
     """Classification must cover the full file even with a trailing space."""
     contents = self.contents + " "
     file = BytesToUnicodeConverter.convert_file(
         File(content=contents.encode(),
              uast=self.uast,
              language="javascript",
              path="test"))
     annotated = AnnotationManager.from_file(file)
     self.extractor._parse_file(annotated)
     self.extractor._classify_vnodes(annotated)
     # The atomic tokens must reassemble the exact source text.
     reassembled = "".join(
         annotated[token.span]
         for token in annotated.iter_by_type(AtomicTokenAnnotation))
     self.assertEqual(reassembled, contents)
     class_counts = Counter()
     previous_stop = 0
     for annotations in annotated.iter_by_type_nested(
             AtomicTokenAnnotation, ClassAnnotation):
         # Tokens must be contiguous: each starts where the previous stopped.
         self.assertEqual(previous_stop, annotations.start)
         if ClassAnnotation in annotations:
             class_counts.update(
                 map(CLASSES.__getitem__, annotations[ClassAnnotation].cls))
         previous_stop = annotations.stop
     self.assertEqual(len(contents), previous_stop)
     # Every indentation increase pairs with a decrease, plus the extra one
     # introduced by the appended trailing space.
     self.assertEqual(class_counts[CLS_SPACE_INC],
                      class_counts[CLS_SPACE_DEC] + 1)
     for cls in (CLS_SPACE_INC, CLS_SPACE, CLS_NEWLINE, CLS_SINGLE_QUOTE):
         self.assertGreater(class_counts[cls], 0)
     self.assertTrue(class_counts[CLS_SINGLE_QUOTE] % 2 == 0)
Beispiel #9
0
 def test_positions(self):
     """Parsed vnodes must be ordered by start line and start offset."""
     js_path = Path(__file__).parent / "browser-policy-content.js"
     with open(str(js_path), mode="rb") as f:
         code = f.read()
     uast = bblfsh.BblfshClient("0.0.0.0:9432").parse(filename="",
                                                      language="javascript",
                                                      contents=code).uast
     converter = BytesToUnicodeConverter(code)
     file = UnicodeFile(content=converter.convert_content(),
                        uast=converter.convert_uast(uast),
                        language="javascript",
                        path="test.js")
     annotated = AnnotationManager.from_file(file)
     self.extractor._parse_file(annotated)
     nodes, _ = file_to_old_parse_file_format(annotated)
     # Compare each node with its successor: positions never go backwards.
     for index, (left, right) in enumerate(zip(nodes, nodes[1:])):
         self.assertLessEqual(
             left.start.line, right.start.line,
             "Start line position decrease for %d, %d nodes" %
             (index, index + 1))
         self.assertLessEqual(
             left.start.offset, right.start.offset,
             "Start offset position decrease for %d, %d nodes" %
             (index, index + 1))
    def _merge_classes_to_composite_labels(self,
                                           file: AnnotationManager) -> None:
        """
        Build "composite" `TokenAnnotation` and `LabelAnnotation` from predictable atomic tokens.

        :param file: Source code annotated with `AtomicTokenAnnotation`, `ClassAnnotation`, \
                     `AccumulatedIndentationAnnotation`.
        """
        def _class_seq_to_annotations(start, stop, current_class_seq):
            # Flush an accumulated run of classes as one token/label pair, or
            # two pairs split at the first newline class when the run begins
            # with non-newline classes.
            if NEWLINE_INDEX not in current_class_seq or \
                    current_class_seq[0] == NEWLINE_INDEX:
                # if there are no trailing whitespaces or tabs
                yield TokenAnnotation(start, stop)
                yield LabelAnnotation(start, stop, tuple(current_class_seq))
            else:
                index = current_class_seq.index(NEWLINE_INDEX)
                middle = start + index
                yield TokenAnnotation(start, middle)
                yield TokenAnnotation(middle, stop)
                yield LabelAnnotation(start, middle,
                                      tuple(current_class_seq[:index]))
                yield LabelAnnotation(middle, stop,
                                      tuple(current_class_seq[index:]))

        # Accumulator for the current run of consecutive predictable tokens.
        start, stop, current_class_seq = None, None, []

        for annotations in file.iter_by_type_nested(
                AtomicTokenAnnotation, ClassAnnotation,
                AccumulatedIndentationAnnotation):
            has_target = ClassAnnotation in annotations
            acc_indent = AccumulatedIndentationAnnotation in annotations
            # Unlabeled non-indentation tokens and quote classes break the
            # run: flush the accumulator, then copy the token (and its label,
            # if any) through unchanged.
            if (not has_target and not acc_indent or
                (has_target
                 and annotations[ClassAnnotation].cls[0] in QUOTES_INDEX)):
                if current_class_seq:
                    file.add(*_class_seq_to_annotations(
                        start, stop, current_class_seq))
                    start, stop, current_class_seq = None, None, []
                file.add(
                    annotations[AtomicTokenAnnotation].to_token_annotation())
                if ClassAnnotation in annotations:
                    file.add(
                        annotations[ClassAnnotation].to_target_annotation())
            else:
                # Extend the run; accumulated indentation widens the span but
                # contributes no classes.
                if not current_class_seq:
                    start = annotations.start
                stop = annotations.stop
                if not acc_indent:
                    current_class_seq.extend(annotations[ClassAnnotation].cls)
        # Flush the run still open at the end of the file, if any.
        if current_class_seq:
            file.add(
                *_class_seq_to_annotations(start, stop, current_class_seq))
def _file_to_vnodes_and_parents(
    file: AnnotationManager
) -> Tuple[List["VirtualNode"], Dict[int, bblfsh.Node]]:
    """
    Convert one `AnnotationManager` instance to the deprecated format of \
    `FeatureExtractor._annotate_files()` (`_parse_vnodes()` before refactoring).

    The old format is a sequence of vnodes and vnodes parents mapping. Used by
    `files_to_old_parse_file_format` to generate the old `_parse_vnodes`-like output format for a
    sequence of `AnnotationManager`-s. This function is different from
    `file_to_old_parse_file_format()` because it is created for `_parse_vnodes()` backward
    compatibility and `file_to_old_parse_file_format()` for `_parse_file()` backward compatibility.

    The function exists for backward compatibility and should be removed after the refactoring is \
    finished.

    :param file: file annotated with `Path`-, `Token`-, `Label`-, `TokenParent`- `Annotation`.
    :return: Tuple with `VirtualNode`-s and node id to parents mapping.
    """
    path = file.get(PathAnnotation).path
    lines = file.sequence.splitlines(keepends=True)
    # Prefix sums of line lengths -> start offset of each line; the final +1
    # mirrors the historical end-of-file handling.
    lens = [0] + [len(line) for line in lines]
    lens[-1] += 1
    line_starts = numpy.array(lens).cumsum()
    vnodes = []
    vnode_parents = {}
    for annotations in file.iter_by_type_nested(TokenAnnotation,
                                                LabelAnnotation,
                                                TokenParentAnnotation):
        label = (annotations[LabelAnnotation].label
                 if LabelAnnotation in annotations else None)
        vnode = VirtualNode(
            file[annotations.span],
            _to_position(lines, line_starts, annotations.start),
            _to_position(lines, line_starts, annotations.stop),
            is_accumulated_indentation=False,
            path=path,
            node=annotations[TokenAnnotation].node,
            y=label,
        )
        vnodes.append(vnode)
        parent = (annotations[TokenParentAnnotation].parent
                  if TokenParentAnnotation in annotations else None)
        vnode_parents[id(vnode)] = parent
    return vnodes, vnode_parents
Beispiel #12
0
 def test_find_overlapping_annotation(self):
     """`find_overlapping_annotation` resolves spans to stored annotations."""
     manager = AnnotationManager("0123456789")
     annotations = [Annotation(0, 3), Annotation(3, 3), Annotation(3, 4)]
     manager.add(*annotations[::-1])
     # Exact spans resolve to the annotation that covers them.
     for annotation in annotations:
         self.assertEqual(
             manager.find_overlapping_annotation(
                 Annotation, *annotation.span), annotation)
     # Partially-overlapping queries resolve to an overlapping annotation.
     for span, expected in (((1, 2), annotations[0]),
                            ((3, 5), annotations[2]),
                            ((2, 4), annotations[0])):
         self.assertEqual(
             manager.find_overlapping_annotation(Annotation, *span), expected)
     # Spans past the annotated region raise.
     for span in ((4, 4), (4, 5), (5, 5)):
         with self.assertRaises(NoAnnotation):
             manager.find_overlapping_annotation(Annotation, *span)
Beispiel #13
0
 def test_vnode_positions(self):
     """Smoke test: classification of a real-world file must not raise."""
     js_path = Path(__file__).parent / "jquery.layout.js"
     with open(str(js_path), mode="rb") as f:
         code = f.read()
     uast = bblfsh.BblfshClient("0.0.0.0:9432").parse(
         filename="", language="javascript", contents=code).uast
     file = BytesToUnicodeConverter.convert_file(
         File(content=code, uast=uast, language="javascript", path="test.js"))
     annotated = AnnotationManager.from_file(file)
     self.extractor._parse_file(annotated)
     # Just should not fail
     self.extractor._classify_vnodes(annotated)
Beispiel #14
0
    def test_iter_annotations(self):
        """`iter_by_type_nested` groups secondary annotations under the primary type."""
        code = "0123456789"
        manager = AnnotationManager(code)
        spans = [Annotation(0, 3), Annotation(3, 3), Annotation(3, 4)]
        outer = AnotherAnnotation(0, len(code) - 1)
        manager.add(*spans[::-1])
        manager.add(outer)
        # Iterating by `Annotation` yields one group per span, each carrying
        # the enclosing `AnotherAnnotation` as well.
        expected = [
            AnnotationsSpan(*annotation.span, {
                Annotation: annotation,
                AnotherAnnotation: outer,
            })
            for annotation in spans
        ]
        self.assertEqual(
            list(manager.iter_by_type_nested(Annotation, AnotherAnnotation)),
            expected)

        # With `AnotherAnnotation` as primary there is a single group.
        result = list(
            manager.iter_by_type_nested(AnotherAnnotation, Annotation))
        self.assertEqual(
            result,
            [AnnotationsSpan(0,
                             len(code) - 1,
                             {AnotherAnnotation: outer})])
Beispiel #15
0
    def test_iter_annotation(self):
        """`iter_by_type` yields annotations sorted by span."""
        manager = AnnotationManager("0123456789")
        # Unknown annotation types raise.
        with self.assertRaises(KeyError):
            list(manager.iter_by_type(Annotation))

        first_batch = [Annotation(0, 3), Annotation(3, 3), Annotation(3, 4)]
        manager.add(*first_batch[::-1])
        self.assertEqual(list(manager.iter_by_type(Annotation)), first_batch)
        second_batch = [Annotation(0, 0), Annotation(4, 4), Annotation(4, 7)]
        manager.add(*second_batch)
        self.assertEqual(
            list(manager.iter_by_type(Annotation)),
            sorted(first_batch + second_batch, key=lambda x: x.span))
Beispiel #16
0
 def test_parse_file_comment_after_regexp(self):
     """A regexp literal following a comment must not confuse the parser."""
     code = b"x = // comment\n/<regexp>/;"
     uast = bblfsh.BblfshClient("0.0.0.0:9432").parse(filename="",
                                                      language="javascript",
                                                      contents=code).uast
     file = BytesToUnicodeConverter.convert_file(
         File(uast=uast, content=code, language="javascript", path=""))
     annotated = AnnotationManager.from_file(file)
     self.extractor._parse_file(annotated)
     # The raw tokens must reassemble the full source text.
     reassembled = "".join(
         annotated[token.span]
         for token in annotated.iter_by_type(RawTokenAnnotation))
     self.assertEqual(reassembled, code.decode())
Beispiel #17
0
 def test_parse_file_with_trailing_space(self):
     """Parsing must keep a trailing space and position its end correctly."""
     contents = self.contents + " "
     file = BytesToUnicodeConverter.convert_file(
         File(content=contents.encode(),
              uast=self.uast,
              language="javascript",
              path="test"))
     annotated = AnnotationManager.from_file(file)
     self.extractor._parse_file(annotated)
     nodes, _ = file_to_old_parse_file_format(annotated)
     offset, line, col = nodes[-1].end
     self.assertEqual(len(contents), offset)
     # Space token always ends on the same line
     self.assertEqual(len(contents.splitlines()), line)
     self.assertEqual("".join(node.value for node in nodes), contents)
Beispiel #18
0
 def test_parse_file_exact_match(self):
     """Both the vnode values and the raw tokens must reproduce the source."""
     code_path = str(Path(__file__).parent / "for_parse_test.js.xz")
     with lzma.open(code_path, mode="rb") as f:
         code = f.read()
     uast = bblfsh.BblfshClient("0.0.0.0:9432").parse(filename="",
                                                      language="javascript",
                                                      contents=code).uast
     file = BytesToUnicodeConverter.convert_file(
         File(uast=uast, content=code, language="javascript", path=""))
     annotated = AnnotationManager.from_file(file)
     self.extractor._parse_file(annotated)
     nodes, _ = file_to_old_parse_file_format(annotated)
     decoded = code.decode()
     self.assertEqual("".join(node.value for node in nodes), decoded)
     self.assertEqual(
         "".join(annotated[token.span]
                 for token in annotated.iter_by_type(RawTokenAnnotation)),
         decoded)
 def _fill_vnode_parents(self, file: AnnotationManager):
     """
     Attach a `TokenParentAnnotation` to every `TokenAnnotation` in `file`.

     Tokens backed by a UAST node take that node's recorded parent; tokens
     without a node get the LCA computed by `_find_parent`, falling back to
     the UAST root when no parent is found.
     """
     uast_annotation = file.get(UASTAnnotation)
     closest_left_node_id = None
     for annotation in file.iter_by_type(TokenAnnotation):
         if annotation.has_node:
             # Remember the node so the next node-less token can use it as
             # the left anchor of the LCA search.
             closest_left_node_id = id(annotation.node)
             parent = uast_annotation.parents[closest_left_node_id]
         else:
             parent = (self._find_parent(annotation.stop, file,
                                         closest_left_node_id)
                       or uast_annotation.uast)
         file.add(TokenParentAnnotation(*annotation.span, parent))
Beispiel #20
0
 def test_check_interval_crossing(self):
     """Exercise `_check_spans_overlap` on symmetric and degenerate spans."""
     cases = [
         ((9, 19), (19, 20), False),
         ((19, 20), (9, 19), False),
         ((1, 3), (2, 4), True),
         ((2, 4), (1, 3), True),
         ((-2, 4), (1, 3), True),
         ((-2, 3), (1, 3), True),
         ((1, 3), (1, 3), True),
         ((1, 3), (6, 7), False),
         ((10, 30), (6, 7), False),
         ((10, 10), (10, 10), True),
         ((10, 30), (10, 10), False),
         ((10, 10), (10, 30), False),
         ((10, 10), (5, 30), True),
         ((5, 30), (10, 10), True),
     ]
     for i, (span1, span2, expected) in enumerate(cases):
         self.assertEqual(
             AnnotationManager._check_spans_overlap(*span1, *span2),
             expected, "Case # %d" % i)
Beispiel #21
0
    def test_count(self):
        """`count` tracks each annotation type independently."""
        manager = AnnotationManager("0123456789")

        self.assertEqual(manager.count(Annotation), 0)
        manager.add(Annotation(0, 1))
        self.assertEqual(manager.count(Annotation), 1)
        manager.add(Annotation(4, 5), Annotation(3, 4), Annotation(4, 4))
        self.assertEqual(manager.count(Annotation), 4)
        # Counts of one type are unaffected by additions of another type.
        self.assertEqual(manager.count(AnotherAnnotation), 0)
        manager.add(AnotherAnnotation(4, 8))
        self.assertEqual(manager.count(Annotation), 4)
        self.assertEqual(manager.count(AnotherAnnotation), 1)
        manager.add(AnotherAnnotation(0, 3), AnotherAnnotation(3, 4),
                    AnotherAnnotation(8, 8))
        self.assertEqual(manager.count(Annotation), 4)
        self.assertEqual(manager.count(AnotherAnnotation), 4)
    def _parse_file(self, file: AnnotationManager) -> None:
        """
        Annotate source code with `RawTokenAnnotation`-s.

        Given the source text and the corresponding UAST this function covers all code with a
        `RawTokenAnnotation`-s.

        :param file: Source code annotated with `UASTAnnotation`.
        """
        # TODO(zurk): rename this function when the refactoring is finished.
        contents = file.sequence
        # build the line mapping
        lines = contents.splitlines(keepends=True)
        # Check if there is a newline in the end of file. Yes, you can just check
        # lines[-1][-1] == "\n" but if someone decide to use weird '\u2028' unicode character for
        # new line this condition gives wrong result.
        eof_new_line = lines[-1].splitlines()[0] != lines[-1]
        if eof_new_line:
            # We add last line as empty one because it actually exists, but .splitlines() does not
            # return it.
            lines.append("")
        # line_offsets[i] is the offset of line i's first character; the final
        # entry is one past the end of the text.
        line_offsets = numpy.zeros(len(lines) + 1, dtype=numpy.int32)
        pos = 0
        for i, line in enumerate(lines):
            line_offsets[i] = pos
            pos += len(line)
        line_offsets[-1] = pos + 1

        # walk the tree: collect nodes with assigned tokens
        # (DFS; per-type fixtures are applied to each node before descending)
        node_tokens = []
        queue = [file.get(UASTAnnotation).uast]
        while queue:
            node = queue.pop()
            if node.internal_type in self.node_fixtures:
                self.node_fixtures[node.internal_type](node)
            queue.extend(node.children)
            # Keep nodes that carry a token, or childless nodes spanning a
            # non-empty position range.
            if (node.token or node.start_position and node.end_position
                    and node.start_position != node.end_position
                    and not node.children):
                node_tokens.append(node)
        node_tokens.sort(key=lambda n: n.start_position.offset)
        # Sentinel node at end-of-text so the gap after the last real node is
        # also filled below.
        sentinel = bblfsh.Node()
        sentinel.start_position.offset = len(contents)
        sentinel.start_position.line = len(lines)
        node_tokens.append(sentinel)

        # scan `node_tokens` and fill the gaps with imaginary nodes
        pos = 0
        parser = self.tokens.PARSER
        for node in node_tokens:
            # Skip nodes that overlap text we have already covered.
            if node.start_position.offset < pos:
                continue
            if node.start_position.offset > pos:
                # Tokenize the uncovered gap and annotate each lexeme.
                sumlen = 0
                diff = contents[pos:node.start_position.offset]
                for match in parser.finditer(diff):
                    offsets = []
                    for suboff in (match.start(), match.end()):
                        offsets.append(pos + suboff)
                    token = match.group()
                    sumlen += len(token)
                    file.add(RawTokenAnnotation(*offsets))
                # The lexemes must cover the gap exactly, or we lost text.
                assert sumlen == node.start_position.offset - pos, \
                    "missed some imaginary tokens: \"%s\"" % diff
            if node is sentinel:
                break
            uast_node_annot = list(
                VirtualNode.from_node(node, contents, self.token_unwrappers))
            file.add(*uast_node_annot)
            pos = node.end_position.offset
    def _add_noops(self, file: AnnotationManager) -> None:
        """
        Add `TokenAnnotation` with zero length in between `TokenAnnotation` without labeled nodes.

        Such zero length annotations means that some formatting sequence can be inserted to the
        annotation position.

        :param file: Source code annotated with `TokenAnnotation` and `LabelAnnotation`.
        """
        noop_target = (CLASS_INDEX[CLS_NOOP], )
        # Nothing to do for an empty file.
        if not len(file):
            return

        prev_annotations = None
        for i, annotations in enumerate(
                file.iter_by_type_nested(TokenAnnotation, LabelAnnotation)):
            if i == 0:
                # Insert a noop before the first token if it is unlabeled.
                if LabelAnnotation not in annotations:
                    file.add(TokenAnnotation(0, 0))
                    file.add(LabelAnnotation(0, 0, noop_target))
            else:
                # Insert a noop between two consecutive unlabeled tokens.
                if LabelAnnotation not in prev_annotations and \
                        LabelAnnotation not in annotations:
                    file.add(
                        TokenAnnotation(annotations.start, annotations.start))
                    file.add(
                        LabelAnnotation(annotations.start, annotations.start,
                                        noop_target))
            prev_annotations = annotations

        # Insert a trailing noop after the last token if it is unlabeled.
        # NOTE(review): `annotations` is the loop variable; if a non-empty file
        # contained no `TokenAnnotation` at all this would raise NameError —
        # presumably `_parse_file` guarantees at least one token; confirm.
        if LabelAnnotation not in annotations:
            file.add(TokenAnnotation(annotations.stop, annotations.stop))
            file.add(
                LabelAnnotation(annotations.stop, annotations.stop,
                                noop_target))
    def _classify_vnodes(self, file: AnnotationManager) -> None:
        """
        Annotate source code with `AtomicTokenAnnotation`, `ClassAnnotation` and \
        `AccumulatedIndentationAnnotation`.

        `ClassAnnotation` contains the index of the corresponding class to predict.
        We detect indentation changes, so several whitespace nodes are merged together.

        :param file: Source code annotated with `RawTokenAnnotation`.
        """
        # Characters (spaces/tabs) making up the current indentation level.
        indentation = []
        for token in file.iter_by_type(RawTokenAnnotation):
            token_value = file[token.span]
            if token.has_node:
                # Node-backed tokens pass through without a class.
                file.add(token.to_atomic_token_annotation())
                continue
            if not token_value.isspace():
                # Non-whitespace virtual tokens: only quotes get a class.
                if token_value == "'":
                    file.add(
                        ClassAnnotation(*token.span,
                                        (CLASS_INDEX[CLS_SINGLE_QUOTE], )))
                elif token_value == '"':
                    file.add(
                        ClassAnnotation(*token.span,
                                        (CLASS_INDEX[CLS_DOUBLE_QUOTE], )))
                file.add(token.to_atomic_token_annotation())
                continue
            # Pure-whitespace token: split into lines to detect newlines and
            # indentation changes.
            lines = token_value.splitlines(keepends=True)
            if lines[-1].splitlines()[0] != lines[-1]:
                # We add last line as empty one to mimic .split("\n") behaviour
                lines.append("")
            if len(lines) == 1:
                # only tabs and spaces are possible
                for i, char in enumerate(token_value):
                    if char == "\t":
                        cls = (CLASS_INDEX[CLS_TAB], )
                    else:
                        cls = (CLASS_INDEX[CLS_SPACE], )
                    offset = token.start
                    file.add(ClassAnnotation(offset + i, offset + i + 1, cls))
                    file.add(AtomicTokenAnnotation(offset + i, offset + i + 1))
                continue
            line_offset = 0
            # NOTE(review): "traling" is a typo for "trailing"; name kept
            # unchanged in this documentation-only pass.
            traling_chars = lines[0].splitlines()[0]
            if traling_chars:
                # node contains trailing whitespaces from the previous line
                assert set(traling_chars) <= {" ", "\t"}
                file.add(
                    ClassAnnotation(
                        token.start, token.start + len(traling_chars),
                        tuple(CLASS_INDEX[CLS_SPACE if yi == " " else CLS_TAB]
                              for yi in traling_chars)))
                file.add(
                    AtomicTokenAnnotation(token.start,
                                          token.start + len(traling_chars)))

                lines[0] = lines[0][len(traling_chars):]
                line_offset += len(traling_chars)

            # Each complete line of the whitespace token becomes one newline
            # annotation.
            for line in lines[:-1]:
                # `line` ends with \r\n, we prepend \r to the newline node
                start_offset = token.start + line_offset
                file.add(
                    ClassAnnotation(start_offset, start_offset + len(line),
                                    (NEWLINE_INDEX, )))
                file.add(
                    AtomicTokenAnnotation(start_offset,
                                          start_offset + len(line)))
                line_offset += len(line)
            # `line` is now the indentation of the following code line.
            line = lines[-1].splitlines()[0] if lines[-1] else ""
            my_indent = list(line)
            offset = token.stop
            offset -= len(line)
            try:
                # Remove the previous indentation characters; whatever remains
                # in `my_indent` is the increase. A failed remove() means the
                # indentation did not grow monotonically.
                for ws in indentation:
                    my_indent.remove(ws)
            except ValueError:
                if my_indent:
                    # mixed tabs and spaces, do not classify
                    file.add(AtomicTokenAnnotation(offset, token.stop))
                    continue
                # indentation decreases
                if indentation[:len(line)]:
                    file.add(AtomicTokenAnnotation(offset, token.stop))
                    file.add(
                        AccumulatedIndentationAnnotation(offset, token.stop))
                dec_class = []
                for char in indentation[len(line):]:
                    if char == "\t":
                        cls = CLASS_INDEX[CLS_TAB_DEC]
                    else:
                        cls = CLASS_INDEX[CLS_SPACE_DEC]
                    dec_class.append(cls)

                file.add(AtomicTokenAnnotation(token.stop, token.stop))
                # It is not possible to have multiple zero-length intervals so we can only add it
                # with joined class
                file.add(
                    ClassAnnotation(token.stop, token.stop, tuple(dec_class)))
                indentation = indentation[:len(line)]
            else:
                # indentation is stable or increases
                if indentation:
                    file.add(
                        AtomicTokenAnnotation(offset,
                                              offset + len(indentation)))
                    file.add(
                        AccumulatedIndentationAnnotation(
                            offset, offset + len(indentation)))
                offset += len(indentation)
                for char in my_indent:
                    indentation.append(char)
                # Each newly added indentation character is one INC class.
                for i, char in enumerate(my_indent):
                    if char == "\t":
                        cls = (CLASS_INDEX[CLS_TAB_INC], )
                    else:
                        cls = (CLASS_INDEX[CLS_SPACE_INC], )
                    file.add(AtomicTokenAnnotation(offset + i, offset + i + 1))
                    file.add(ClassAnnotation(offset + i, offset + i + 1, cls))
                offset += len(my_indent)
Beispiel #25
0
 def test_len(self):
     """The manager's length equals the length of the wrapped sequence."""
     for text in ("", 100 * "1"):
         self.assertEqual(len(text), len(AnnotationManager(text)))