Example #1
0
    def test_iter_annotations(self):
        """Check `iter_by_type_nested` with each annotation type used as the main type."""
        code = "0123456789"
        manager = AnnotationManager(code)
        spans = [(0, 3), (3, 3), (3, 4)]
        annotations = [Annotation(*span) for span in spans]
        another_annotation = AnotherAnnotation(0, len(code) - 1)
        # The annotations are inserted in reverse order; the expected result below is listed
        # in ascending span order.
        manager.add(*reversed(annotations))
        manager.add(another_annotation)

        # `Annotation` is the main type: every span also reports the `AnotherAnnotation`.
        expected = [
            AnnotationsSpan(start, stop, {
                Annotation: annotation,
                AnotherAnnotation: another_annotation,
            })
            for (start, stop), annotation in zip(spans, annotations)
        ]
        nested = list(
            manager.iter_by_type_nested(Annotation, AnotherAnnotation))
        self.assertEqual(nested, expected)

        # `AnotherAnnotation` is the main type: the expected span holds only itself.
        nested = list(
            manager.iter_by_type_nested(AnotherAnnotation, Annotation))
        expected = [
            AnnotationsSpan(0, len(code) - 1,
                            {AnotherAnnotation: another_annotation}),
        ]
        self.assertEqual(nested, expected)
Example #2
0
 def test_get(self):
     """Check `get` both without a span and with an explicit span."""
     manager = AnnotationManager("0123456789")
     # Nothing has been added yet, so the lookup without a span must fail.
     with self.assertRaises(Exception):
         manager.get(Annotation)
     manager.add(Annotation(4, 7))
     # Looking up the exact span of the added annotation must not raise.
     manager.get(Annotation, (4, 7))
     # The lookup without a span is still expected to fail after adding `Annotation(4, 7)`.
     with self.assertRaises(Exception):
         manager.get(Annotation)
Example #3
0
    def test_add(self):
        """Check which annotations `add` rejects with `ValueError` and which it accepts."""
        code = "0123456789"
        manager = AnnotationManager(code)
        initial = [Annotation(0, 3), Annotation(3, 3), Annotation(3, 4)]
        another = AnotherAnnotation(0, len(code) - 1)
        manager.add(*initial)
        manager.add(another)

        # Re-adding the initial annotations, as well as each of the extra spans below, must fail.
        rejected_spans = [(0, 1), (1, 4), (3, 3), (2, 5)]
        rejected = [*initial, *(Annotation(*span) for span in rejected_spans)]
        for candidate in rejected:
            with self.assertRaises(ValueError):
                manager.add(candidate)

        # Each of these must be accepted, one after another, without raising.
        accepted_spans = [(0, 0), (4, 4), (9, 11), (4, 9)]
        accepted = [Annotation(*span) for span in accepted_spans]
        for candidate in accepted:
            manager.add(candidate)
Example #4
0
 def _fill_vnode_parents(self, file: AnnotationManager):
     """
     Add a `TokenParentAnnotation` for every `TokenAnnotation` in the file.

     A token that has a node takes its parent from the `parents` mapping of the
     `UASTAnnotation`, keyed by `id()` of that node. A token without a node takes the result
     of `self._find_parent()`, called with the id of the closest preceding token node, and
     falls back to the UAST stored in the `UASTAnnotation` when that result is falsy.

     :param file: Source code annotated with `UASTAnnotation` and `TokenAnnotation`.
     """
     uast = file.get(UASTAnnotation)
     last_node_id = None
     for token in file.iter_by_type(TokenAnnotation):
         if not token.has_node:
             parent = self._find_parent(token.stop, file, last_node_id)
             if not parent:
                 parent = uast.uast
             file.add(TokenParentAnnotation(*token.span, parent))
             continue
         # Remember this node: it is the reference point for the following node-less tokens.
         last_node_id = id(token.node)
         file.add(
             TokenParentAnnotation(*token.span, uast.parents[last_node_id]))
Example #5
0
 def test_find_covering_annotation(self):
     """Check `find_covering_annotation` for exact, inner and uncovered spans."""
     manager = AnnotationManager("0123456789")
     annotations = [Annotation(0, 3), Annotation(3, 3), Annotation(3, 4)]
     manager.add(*reversed(annotations))

     # Querying with an annotation's own span must return that annotation.
     for expected in annotations:
         found = manager.find_covering_annotation(Annotation, *expected.span)
         self.assertEqual(found, expected)

     # Spans inside the (0, 3) annotation resolve to it.
     for inner_span in ((1, 2), (1, 1)):
         found = manager.find_covering_annotation(Annotation, *inner_span)
         self.assertEqual(found, annotations[0])

     # None of these spans is expected to have a covering annotation.
     for uncovered_span in ((4, 4), (4, 5), (5, 5), (3, 5), (2, 4)):
         with self.assertRaises(NoAnnotation):
             manager.find_covering_annotation(Annotation, *uncovered_span)
Example #6
0
    def test_iter_annotation(self):
        """Check that `iter_by_type` yields annotations ordered by span."""
        manager = AnnotationManager("0123456789")
        # Iterating a type that was never added is expected to raise `KeyError`.
        with self.assertRaises(KeyError):
            list(manager.iter_by_type(Annotation))

        first_batch = [Annotation(0, 3), Annotation(3, 3), Annotation(3, 4)]
        manager.add(*reversed(first_batch))
        iterated = list(manager.iter_by_type(Annotation))
        self.assertEqual(iterated, first_batch)

        second_batch = [
            Annotation(0, 0),
            Annotation(4, 4),
            Annotation(4, 7),
        ]
        manager.add(*second_batch)
        iterated = list(manager.iter_by_type(Annotation))
        expected = sorted(first_batch + second_batch, key=lambda x: x.span)
        self.assertEqual(iterated, expected)
Example #7
0
    def _merge_classes_to_composite_labels(self,
                                           file: AnnotationManager) -> None:
        """
        Merge runs of classified atomic tokens into "composite" `TokenAnnotation`-s and \
        `LabelAnnotation`-s.

        Atomic tokens that carry a non-quote `ClassAnnotation` or an
        `AccumulatedIndentationAnnotation` are accumulated into a run. Any other atomic token
        ends the current run and is converted on its own.

        :param file: Source code annotated with `AtomicTokenAnnotation`, `ClassAnnotation`, \
                     `AccumulatedIndentationAnnotation`.
        """
        def _run_to_annotations(begin, end, class_seq):
            """Build the token and label annotations for one accumulated run."""
            if NEWLINE_INDEX not in class_seq or class_seq[0] == NEWLINE_INDEX:
                # No whitespace classes precede the first newline: keep the run in one piece.
                return [
                    TokenAnnotation(begin, end),
                    LabelAnnotation(begin, end, tuple(class_seq)),
                ]
            # Otherwise split the run right before its first newline class.
            split_at = class_seq.index(NEWLINE_INDEX)
            middle = begin + split_at
            return [
                TokenAnnotation(begin, middle),
                TokenAnnotation(middle, end),
                LabelAnnotation(begin, middle, tuple(class_seq[:split_at])),
                LabelAnnotation(middle, end, tuple(class_seq[split_at:])),
            ]

        run_start, run_stop, run_classes = None, None, []

        for spans in file.iter_by_type_nested(
                AtomicTokenAnnotation, ClassAnnotation,
                AccumulatedIndentationAnnotation):
            has_class = ClassAnnotation in spans
            is_acc_indent = AccumulatedIndentationAnnotation in spans
            if has_class:
                # Quote classes are never merged into a run.
                mergeable = spans[ClassAnnotation].cls[0] not in QUOTES_INDEX
            else:
                mergeable = is_acc_indent

            if mergeable:
                if not run_classes:
                    run_start = spans.start
                run_stop = spans.stop
                if not is_acc_indent:
                    run_classes.extend(spans[ClassAnnotation].cls)
                continue

            # A standalone token: flush the pending run first, then convert the token itself.
            if run_classes:
                file.add(
                    *_run_to_annotations(run_start, run_stop, run_classes))
                run_start, run_stop, run_classes = None, None, []
            file.add(spans[AtomicTokenAnnotation].to_token_annotation())
            if has_class:
                file.add(spans[ClassAnnotation].to_target_annotation())

        if run_classes:
            file.add(*_run_to_annotations(run_start, run_stop, run_classes))
Example #8
0
    def test_count(self):
        """Check that `count` tracks each annotation type independently."""
        manager = AnnotationManager("0123456789")

        # Only `Annotation`-s are added at first.
        self.assertEqual(manager.count(Annotation), 0)
        manager.add(Annotation(0, 1))
        self.assertEqual(manager.count(Annotation), 1)
        manager.add(Annotation(4, 5), Annotation(3, 4), Annotation(4, 4))
        self.assertEqual(manager.count(Annotation), 4)

        # Adding `AnotherAnnotation`-s must leave the `Annotation` count untouched.
        self.assertEqual(manager.count(AnotherAnnotation), 0)
        manager.add(AnotherAnnotation(4, 8))
        self.assertEqual(manager.count(Annotation), 4)
        self.assertEqual(manager.count(AnotherAnnotation), 1)
        manager.add(
            AnotherAnnotation(0, 3),
            AnotherAnnotation(3, 4),
            AnotherAnnotation(8, 8),
        )
        self.assertEqual(manager.count(Annotation), 4)
        self.assertEqual(manager.count(AnotherAnnotation), 4)
Example #9
0
    def _parse_file(self, file: AnnotationManager) -> None:
        """
        Annotate source code with `RawTokenAnnotation`-s.

        Given the source text and the corresponding UAST, this function covers all the code
        with `RawTokenAnnotation`-s. UAST nodes that carry a token, or that are childless with
        a non-empty position range, are converted through `VirtualNode.from_node()`. The text
        between them is split with `self.tokens.PARSER`.

        :param file: Source code annotated with `UASTAnnotation`.
        :raises AssertionError: if the parser does not cover a whole gap between two nodes.
        """
        # TODO(zurk): rename this function when the refactoring is finished.
        contents = file.sequence
        lines = contents.splitlines(keepends=True)
        # Append an empty last line when the text is empty or ends with a line break:
        # it exists, but .splitlines() does not return it. Comparing the last line with its own
        # .splitlines() result instead of checking lines[-1][-1] == "\n" also handles unusual
        # line separators such as '\u2028'. The `not lines` guard avoids an IndexError on empty
        # contents.
        if not lines or lines[-1].splitlines()[0] != lines[-1]:
            lines.append("")

        # walk the tree: collect nodes with assigned tokens
        node_tokens = []
        queue = [file.get(UASTAnnotation).uast]
        while queue:
            node = queue.pop()
            if node.internal_type in self.node_fixtures:
                self.node_fixtures[node.internal_type](node)
            queue.extend(node.children)
            if (node.token or node.start_position and node.end_position
                    and node.start_position != node.end_position
                    and not node.children):
                node_tokens.append(node)
        node_tokens.sort(key=lambda n: n.start_position.offset)
        # The sentinel starts at the end of the text so that the trailing gap is processed too.
        sentinel = bblfsh.Node()
        sentinel.start_position.offset = len(contents)
        sentinel.start_position.line = len(lines)
        node_tokens.append(sentinel)

        # scan `node_tokens` and fill the gaps with imaginary nodes
        pos = 0
        parser = self.tokens.PARSER
        for node in node_tokens:
            node_start = node.start_position.offset
            if node_start < pos:
                # The node starts before the end of the previously emitted one: skip it.
                continue
            if node_start > pos:
                diff = contents[pos:node_start]
                sumlen = 0
                for match in parser.finditer(diff):
                    file.add(
                        RawTokenAnnotation(pos + match.start(),
                                           pos + match.end()))
                    sumlen += match.end() - match.start()
                assert sumlen == node_start - pos, \
                    "missed some imaginary tokens: \"%s\"" % diff
            if node is sentinel:
                break
            file.add(
                *VirtualNode.from_node(node, contents, self.token_unwrappers))
            pos = node.end_position.offset
Example #10
0
    def _add_noops(self, file: AnnotationManager) -> None:
        """
        Add `TokenAnnotation` with zero length in between `TokenAnnotation` without labeled nodes.

        Such zero length annotations means that some formatting sequence can be inserted to the
        annotation position. Each of them is paired with a zero length `LabelAnnotation` that
        carries the no-op class. They are inserted:

        * at offset 0 when the first token has no label;
        * at the start of every token when neither it nor the previous token has a label;
        * at the stop of the last token when it has no label.

        :param file: Source code annotated with `TokenAnnotation` and `LabelAnnotation`.
        """
        noop_target = (CLASS_INDEX[CLS_NOOP], )
        if not len(file):
            return

        def add_noop(position):
            """Insert a zero length token with the no-op label at `position`."""
            file.add(TokenAnnotation(position, position))
            file.add(LabelAnnotation(position, position, noop_target))

        prev_annotations = None
        for annotations in file.iter_by_type_nested(TokenAnnotation,
                                                    LabelAnnotation):
            if LabelAnnotation not in annotations:
                if prev_annotations is None:
                    # The very first token: the no-op goes to the beginning of the file.
                    add_noop(0)
                elif LabelAnnotation not in prev_annotations:
                    add_noop(annotations.start)
            prev_annotations = annotations

        # `prev_annotations` stays None when nothing was iterated; checking it avoids reading
        # the unbound loop variable in that case.
        if prev_annotations is not None and \
                LabelAnnotation not in prev_annotations:
            add_noop(prev_annotations.stop)
Example #11
0
    def _classify_vnodes(self, file: AnnotationManager) -> None:
        """
        Annotate source code with `AtomicTokenAnnotation`, `ClassAnnotation` and \
        `AccumulatedIndentationAnnotation`.

        `ClassAnnotation` contains the index of the corresponding class to predict.
        We detect indentation changes, so several whitespace nodes are merged together.

        Each `RawTokenAnnotation` is handled as one of the following cases:

        * a token with a node, or a non-whitespace token, becomes a single atomic token; a token
          that is exactly one quote character additionally gets the matching quote class;
        * whitespace without a line break is split into one-character atomic tokens, each
          classified as a tab or a space;
        * whitespace with line breaks is split into the trailing whitespace of the previous
          line, the newline tokens and the indentation of the last line. That indentation is
          compared with the indentation accumulated so far and classified as an increase or a
          decrease.

        :param file: Source code annotated with `RawTokenAnnotation`.
        """
        # The whitespace characters that form the current indentation, carried across tokens.
        indentation = []
        for token in file.iter_by_type(RawTokenAnnotation):
            token_value = file[token.span]
            if token.has_node:
                # Tokens with a node are never classified.
                file.add(token.to_atomic_token_annotation())
                continue
            if not token_value.isspace():
                # Only single and double quote characters get a class here.
                if token_value == "'":
                    file.add(
                        ClassAnnotation(*token.span,
                                        (CLASS_INDEX[CLS_SINGLE_QUOTE], )))
                elif token_value == '"':
                    file.add(
                        ClassAnnotation(*token.span,
                                        (CLASS_INDEX[CLS_DOUBLE_QUOTE], )))
                file.add(token.to_atomic_token_annotation())
                continue
            # From here on `token_value` is non-empty whitespace: `str.isspace()` is False for
            # an empty string, so `lines` has at least one element.
            lines = token_value.splitlines(keepends=True)
            if lines[-1].splitlines()[0] != lines[-1]:
                # The token ends with a line break: add the last line as an empty one to mimic
                # .split("\n") behaviour
                lines.append("")
            if len(lines) == 1:
                # No line break in the token, only tabs and spaces are possible:
                # one class annotation and one atomic token per character.
                for i, char in enumerate(token_value):
                    if char == "\t":
                        cls = (CLASS_INDEX[CLS_TAB], )
                    else:
                        cls = (CLASS_INDEX[CLS_SPACE], )
                    offset = token.start
                    file.add(ClassAnnotation(offset + i, offset + i + 1, cls))
                    file.add(AtomicTokenAnnotation(offset + i, offset + i + 1))
                continue
            # `line_offset` is the number of characters of the token consumed so far.
            line_offset = 0
            # The content of the first line without its line break.
            traling_chars = lines[0].splitlines()[0]
            if traling_chars:
                # node contains trailing whitespaces from the previous line;
                # they become one atomic token whose class tuple has one entry per character
                assert set(traling_chars) <= {" ", "\t"}
                file.add(
                    ClassAnnotation(
                        token.start, token.start + len(traling_chars),
                        tuple(CLASS_INDEX[CLS_SPACE if yi == " " else CLS_TAB]
                              for yi in traling_chars)))
                file.add(
                    AtomicTokenAnnotation(token.start,
                                          token.start + len(traling_chars)))

                lines[0] = lines[0][len(traling_chars):]
                line_offset += len(traling_chars)

            for line in lines[:-1]:
                # Every line except the last one becomes a single newline token that covers the
                # whole `line`, so a \r\n line break is covered by one annotation.
                start_offset = token.start + line_offset
                file.add(
                    ClassAnnotation(start_offset, start_offset + len(line),
                                    (NEWLINE_INDEX, )))
                file.add(
                    AtomicTokenAnnotation(start_offset,
                                          start_offset + len(line)))
                line_offset += len(line)
            # The last line holds the indentation of the code that follows the token.
            line = lines[-1].splitlines()[0] if lines[-1] else ""
            my_indent = list(line)
            # `offset` is the position where that indentation starts.
            offset = token.stop
            offset -= len(line)
            try:
                # Consume the current indentation from the new one; what is left in `my_indent`
                # afterwards is the increase. `list.remove` raises ValueError as soon as the new
                # indentation lacks a character of the current one.
                for ws in indentation:
                    my_indent.remove(ws)
            except ValueError:
                if my_indent:
                    # mixed tabs and spaces, do not classify
                    file.add(AtomicTokenAnnotation(offset, token.stop))
                    continue
                # indentation decreases
                if indentation[:len(line)]:
                    # Part of the previous indentation is kept: mark it as accumulated.
                    file.add(AtomicTokenAnnotation(offset, token.stop))
                    file.add(
                        AccumulatedIndentationAnnotation(offset, token.stop))
                # One decrease class for every indentation character that was dropped.
                dec_class = []
                for char in indentation[len(line):]:
                    if char == "\t":
                        cls = CLASS_INDEX[CLS_TAB_DEC]
                    else:
                        cls = CLASS_INDEX[CLS_SPACE_DEC]
                    dec_class.append(cls)

                file.add(AtomicTokenAnnotation(token.stop, token.stop))
                # It is not possible to have multiple zero-length intervals so we can only add it
                # with joined class
                file.add(
                    ClassAnnotation(token.stop, token.stop, tuple(dec_class)))
                indentation = indentation[:len(line)]
            else:
                # indentation is stable or increases
                if indentation:
                    # The part equal in length to the current indentation is marked as
                    # accumulated and gets no class.
                    file.add(
                        AtomicTokenAnnotation(offset,
                                              offset + len(indentation)))
                    file.add(
                        AccumulatedIndentationAnnotation(
                            offset, offset + len(indentation)))
                offset += len(indentation)
                # The leftover characters extend the current indentation ...
                for char in my_indent:
                    indentation.append(char)
                # ... and each of them is classified as an indentation increase.
                for i, char in enumerate(my_indent):
                    if char == "\t":
                        cls = (CLASS_INDEX[CLS_TAB_INC], )
                    else:
                        cls = (CLASS_INDEX[CLS_SPACE_INC], )
                    file.add(AtomicTokenAnnotation(offset + i, offset + i + 1))
                    file.add(ClassAnnotation(offset + i, offset + i + 1, cls))
                # NOTE(review): `offset` is reassigned before its next use in the visible code,
                # so this update looks like it has no effect - confirm before removing.
                offset += len(my_indent)