def unpack_annotations(body, labels):
    """
    Use information from ast package to strip type annotation from function body
    :param body:
    :param labels: DataFrame with information about type annotations
    :return: Trimmed body and list of annotations.
    """
    if labels is None:
        return [], []

    # `remove_default` is a module-level flag that controls whether default
    # values are stripped along with the annotations
    global remove_default

    variables = []
    annotations = []

    for ind, row in labels.iterrows():
        if row['name'] == "annotation":
            variables.append(
                (row['var_line'], row['var_end_line'], row['var_col_offset'],
                 row['var_end_col_offset'], 'variable'))
            annotations.append(
                (row['line'], row['end_line'], row['col_offset'],
                 row['end_col_offset'], 'annotation'))

    # as_bytes is most likely unnecessary here because non-ASCII characters
    # usually appear inside strings, while type annotations appear at the end
    # of a signature or at the beginning of a line
    variables = to_offsets(body, variables, as_bytes=True)
    annotations = to_offsets(body, annotations, as_bytes=True)
    defaults_spans = get_defaults_spans(body)

    cuts = []
    vars = []

    for offset_ann, offset_var in zip(annotations, variables):
        beginning = offset_ann[0]
        end = offset_ann[1]

        head = body[:offset_ann[0]]
        orig_len = len(head)
        head = head.rstrip()
        stripped_len = len(head)

        annsymbol = ":"
        assert head.endswith(annsymbol)
        beginning = beginning - (orig_len - stripped_len) - len(annsymbol)
        cuts.append((beginning, end))

        assert offset_var[0] != len(head)
        vars.append((offset_var[0], beginning,
                     preprocess(body[offset_ann[0]:offset_ann[1]])))

    if remove_default:
        cuts.extend(defaults_spans)

    return vars, cuts
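
# A minimal, hedged usage sketch for unpack_annotations (not part of the
# original source). Assumes this module is importable as a whole, that the
# module-level `remove_default` flag is set, and that the labels DataFrame
# uses the 0-based ast-style line/column convention seen elsewhere here.
import pandas as pd

example_body = "def f(x: int):\n    return x\n"
example_labels = pd.DataFrame([{
    "name": "annotation",
    # span of the annotation expression `int`
    "line": 0, "end_line": 0, "col_offset": 9, "end_col_offset": 12,
    # span of the annotated variable `x`
    "var_line": 0, "var_end_line": 0,
    "var_col_offset": 6, "var_end_col_offset": 7,
}])
# variables, cuts = unpack_annotations(example_body, example_labels)
# `cuts` would contain the span of ": int" so it can be removed from the body.
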
def get_descendants(function, children):
    """

    :param function: function string
    :param children: List of targets.
    :return: Offsets for attributes or names that are used as target for assignment operation. Subscript, Tuple and List
    targets are skipped.
    """
    descendants = []

    for node in children:
        if isinstance(node, (ast.Attribute, ast.Name)):
            offset = to_offsets(
                function, [(node.lineno - 1, node.end_lineno - 1,
                            node.col_offset, node.end_col_offset, "new_var")],
                as_bytes=True)
            descendants.append(
                (function[offset[-1][0]:offset[-1][1]], offset[-1]))
        elif isinstance(node, (ast.Subscript, ast.Tuple, ast.List)):
            pass  # skip for now
        else:
            raise Exception(
                f"Unexpected assignment target: {type(node).__name__}")

    return descendants
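
# A hedged usage sketch for get_descendants (not part of the original source;
# assumes this module's to_offsets is importable alongside it).
import ast

example_src = "a = 1\nself.b = 2\n"
example_targets = [
    stmt.targets[0] for stmt in ast.parse(example_src).body
    if isinstance(stmt, ast.Assign)
]
# get_descendants(example_src, example_targets) would return
# [("a", ...), ("self.b", ...)]: the source text of each Name/Attribute
# target paired with its character offsets.
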
def get_docstring(body: str):
    """
    Get docstring ranges
    :param body:
    :return:
    """
    body_lines = body.split("\n")

    docstring_ranges = []

    for node in ast.walk(ast.parse(body)):
        try:
            docstring = ast.get_docstring(node)
        except TypeError:  # this node type cannot carry a docstring
            continue
        else:
            if docstring is not None:
                docstring_ranges.append((
                    node.body[0].lineno - 1,
                    node.body[0].end_lineno - 1,  # first line, last line
                    0,
                    len(body_lines[
                        node.body[0].end_lineno -
                        1]),  # beginning of first line, end of last line
                    "docstring"))

    # as_bytes is not needed because the offsets are computed with len()
    # rather than taken from the ast package
    return to_offsets(body, docstring_ranges, as_bytes=False)
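
# A hedged usage sketch for get_docstring (not part of the original source):
example_src = 'def f():\n    """Docs."""\n    return 1\n'
# get_docstring(example_src) would return one (start, end, "docstring")
# offset triple covering the full line that holds the docstring.
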
def get_mentions(function, root, mention):
    """
    Find all mentions of a variable in the function's body
    :param function: string that contains function's body
    :param root: body parsed with ast package
    :param mention: the name of a variable to look for
    :return: list of offsets where the variable is mentioned
    """
    mentions = []

    for node in ast.walk(root):
        if isinstance(node, ast.Name):  # a variable or another name
            if node.id == mention:
                offset = to_offsets(
                    function,
                    [(node.lineno - 1, node.end_lineno - 1, node.col_offset,
                      node.end_col_offset, "mention")],
                    as_bytes=True)

                mentions.extend(offset)

    # hack for deduplication
    # the origin of the duplicates is still unknown
    # it appears that mentions contain false positives
    mentions = resolve_self_collision(mentions)

    return mentions
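
# A hedged usage sketch for get_mentions (not part of the original source):
import ast

example_src = "def f(x):\n    y = x + x\n    return y\n"
example_root = ast.parse(example_src)
# get_mentions(example_src, example_root, "x") would return two offsets:
# the parameter itself is an ast.arg rather than an ast.Name, so only the
# two uses of `x` on the second line are found.
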
    def parse_as_expression(self, node, *args, **kwargs):
        offset = to_offsets(
            self.full_source,
            [(node.lineno - 1, node.end_lineno - 1, node.col_offset,
              node.end_col_offset, "expression")],
            as_bytes=True)
        offset, = offset
        line = self.full_source[offset[0]:offset[1]].replace("@", "##at##")
        name = GNode(name=line, type="Name")
        expr = GNode(name="Expression" + "_" + str(hex(int(time_ns()))),
                     type="mention")
        edges = [
            {
                "scope": copy(self.scope[-1]),
                "src": name,
                "dst": expr,
                "type": "local_mention",
                "line": node.lineno - 1,
                "end_line": node.end_lineno - 1,
                "col_offset": node.col_offset,
                "end_col_offset": node.end_col_offset
            },
        ]

        return edges, expr
def into_offset(range):
    # `body`, `cum_lens`, and `byte2char` are captured from the enclosing
    # scope; this helper is a nested function in the original code
    try:
        return to_offsets(body, [(*range, None)],
                          cum_lens=cum_lens,
                          b2c=byte2char,
                          as_bytes=True)[-1][:2]
    except Exception:
        return None
def get_defaults_spans(body):
    """Find spans of default values in the function signature, extended to include the '=' sign."""
    root = ast.parse(body)
    defaults_offsets = to_offsets(
        body, [(arg.lineno - 1, arg.end_lineno - 1, arg.col_offset,
                arg.end_col_offset, "default")
               for arg in root.body[0].args.defaults],
        as_bytes=True)

    extended = []
    for start, end, label in defaults_offsets:
        while body[start] != "=":
            start -= 1
        extended.append((start, end))
    return extended
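
# A hedged usage sketch for get_defaults_spans (not part of the original
# source):
example_src = "def f(a, b=1, c='x'):\n    pass\n"
# get_defaults_spans(example_src) would return spans extended back to the
# "=" sign, i.e. covering "=1" and "='x'", so that default values can be
# cut out of the signature cleanly.
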
def get_declarations(function_):
    """

    :param function:
    :return:
    """
    function = function_.lstrip()
    initial_strip = function_[:len(function_) - len(function)]

    root = ast.parse(function)

    declarations = {}
    added = set()

    for node in ast.walk(root):
        if isinstance(node, ast.arg):  # function argument
            # TODO
            # not quite sure why this if statement was needed, but there should be no annotations in the code
            if node.annotation is None:
                offset = to_offsets(
                    function, [(node.lineno - 1, node.end_lineno - 1,
                                node.col_offset, node.end_col_offset, "arg")],
                    as_bytes=True)

                assert function[offset[-1][0]:offset[-1][
                    1]] == node.arg, f"{function[offset[-1][0]:offset[-1][1]]} != {node.arg}"

                declarations[offset[-1]] = get_mentions(
                    function, root, node.arg)
                added.add(node.arg)  # mark variable name as seen
        elif isinstance(node, ast.Assign):
            desc = get_descendants(function, node.targets)

            for d in desc:
                if d[0] not in added:
                    mentions = get_mentions(function, root, d[0])
                    valid_mentions = list(
                        filter(lambda mention: mention[0] >= d[1][0],
                               mentions))
                    declarations[d[1]] = valid_mentions
                    added.add(d[0])

    initial_strip_len = len(initial_strip)
    declarations = {
        adjust_offsets2([key], initial_strip_len)[0]:
        adjust_offsets2(val, initial_strip_len)
        for key, val in declarations.items()
    }

    return declarations
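
# A hedged usage sketch for get_declarations (not part of the original
# source):
example_src = "def f(x):\n    y = x + 1\n    return y\n"
# get_declarations(example_src) would map the offsets of the two
# declarations (the argument `x` and the assignment target `y`) to the
# offsets of their later mentions in the body.
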
def process_body(body, local_occurrences, nodeid2name, f_id, f_start):
    """
    Extract the list
    :param body:
    :param local_occurrences:
    :param nodeid2name:
    :param f_id:
    :param f_start:
    :return:
    """
    body_lines = body.split("\n")

    local_occurrences = sort_occurrences(local_occurrences)

    list_of_replacements = []

    for occ_ind, occurrence in local_occurrences.iterrows():
        if occurrence.start_line == occurrence.end_line:

            curr_line = occurrence.start_line - 1 - f_start

            if curr_line >= len(body_lines):
                continue

            start_col = occurrence.start_column - 1
            end_col = occurrence.end_column

            extended_range, sourcetrail_name = get_range_for_replacement(
                occurrence, start_col, end_col, body_lines[curr_line],
                nodeid2name)

            if extended_range is not None:
                occ_col_start, occ_col_end = extended_range

                # (start_line, end_line, start_col, end_col)
                list_of_replacements.append(
                    (curr_line, curr_line, occ_col_start, occ_col_end,
                     sourcetrail_name))

    list_of_replacements = list(set(list_of_replacements))
    list_of_replacements = to_offsets(body,
                                      list_of_replacements,
                                      as_bytes=False)

    return {
        "id": f_id,
        "body": body,
        "docstring": get_docstring_ast(body),
        "replacement_list": list_of_replacements,
    }
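
# A hedged usage sketch for process_body (not part of the original source).
# Assumptions: `local_occurrences` is a DataFrame with 1-based start_line /
# end_line / start_column / end_column columns, and `nodeid2name` maps
# occurrence node ids to replacement names; the id and line values below
# are hypothetical.
# record = process_body(body, local_occurrences, nodeid2name,
#                       f_id=42, f_start=10)
# record["replacement_list"] then holds deduplicated character-offset spans.
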
def get_function_body(file_content, file_id, start, end, s_col, e_col) -> str:
    # need to extract using offsets because the last line can have extra content
    offsets = [(start, end, s_col, e_col, "body")]
    offsets = to_offsets(file_content[file_id], offsets)

    source_lines = file_content[file_id].split("\n")

    if start == end:  # handle situations when the entire function takes only one line
        body_lines = [source_lines[start]]
    else:
        body_lines = source_lines[start:end]

    initial_strip = body_lines[0][0:len(body_lines[0]) -
                                  len(body_lines[0].lstrip())]
    body = initial_strip + file_content[file_id][offsets[0][0]:offsets[0][1]]

    return body
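
# A hedged usage sketch for get_function_body (not part of the original
# source). Assumption: `file_content` maps file ids to full source strings,
# and start/end are 0-based line numbers as used above.
example_files = {0: "class C:\n    def f(self):\n        return 1\n"}
# get_function_body(example_files, 0, 1, 2, 4, 16) would return the text of
# the method, from "def f(self):" through "return 1", prefixed with the
# original indentation of its first line.
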
def unpack_returns(body: str, labels: pd.DataFrame):
    """
    Use information from ast package to strip return type annotation from function body
    :param body:
    :param labels: DataFrame with information about return type annotation
    :return: Trimmed body and list of return types (normally one).
    """
    if labels is None:
        return [], []

    returns = []

    for ind, row in labels.iterrows():
        if row['name'] == "returns":
            returns.append((row['line'], row['end_line'], row['col_offset'],
                            row['end_col_offset'], "returns"))

    # as_bytes is most likely unnecessary here because non-ASCII characters
    # usually appear inside strings, while type annotations appear at the end
    # of a signature or at the beginning of a line
    return_offsets = to_offsets(body, returns, as_bytes=True)

    cuts = []
    ret = []

    for offset in return_offsets:
        beginning = offset[0]
        end = offset[1]

        head = body[:offset[0]]
        orig_len = len(head)
        head = head.rstrip()
        head = head.rstrip("\\")
        head = head.rstrip()
        stripped_len = len(head)

        fannsymbol = "->"
        assert head.endswith(fannsymbol)
        beginning = beginning - (orig_len - stripped_len) - len(fannsymbol)
        cuts.append((beginning, end))
        ret.append(preprocess(body[offset[0]:offset[1]]))

    return ret, cuts
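
# A hedged usage sketch for unpack_returns (not part of the original source),
# again assuming 0-based ast-style spans in the labels DataFrame.
import pandas as pd

example_body = "def f() -> int:\n    return 1\n"
example_labels = pd.DataFrame([{
    "name": "returns",
    # span of the return annotation `int`
    "line": 0, "end_line": 0, "col_offset": 11, "end_col_offset": 14,
}])
# ret, cuts = unpack_returns(example_body, example_labels)
# `ret` would hold the preprocessed text "int"; `cuts` would span "-> int"
# so the return annotation can be removed.
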
def _get_from_ast(bodies,
                  node_resolver,
                  bpe_tokenizer_path=None,
                  create_subword_instances=True,
                  connect_subwords=False):
    ast_edges = None

    bodies_with_replacements = {}

    subword_tokenizer = make_tokenizer(load_bpe_model(bpe_tokenizer_path)) \
        if bpe_tokenizer_path else None

    tokenizer = RegexpTokenizer(r"\w+|[^\w\s]")

    for ind_bodies, (_, row) in custom_tqdm(enumerate(bodies.iterrows()),
                                            message="Extracting AST edges",
                                            total=len(bodies)):
        orig_body = row['body_with_random_replacements']
        if not isinstance(orig_body, str):
            continue

        srctrl2original = get_srctrl2original_replacements(row)

        c = orig_body.lstrip()
        strip_len = len(orig_body) - len(c)

        try:
            ast.parse(c)
        except SyntaxError as e:
            print(e)
            continue

        replacements = row['random_2_srctrl']

        g = AstGraphGenerator(c)

        edges = g.get_edges()

        if len(edges) == 0:
            continue

        replacements_lookup = lambda x: \
            GNode(name=random_replacement_lookup(x.name, x.type, replacements, tokenizer),
                  type=x.type) if "@" not in x.name else \
                GNode(name=random_replacement_lookup(x.name.split("@")[0], x.type, replacements, tokenizer) +
                           "@" + x.name.split("@")[1],
                      type=x.type)

        edges['src'] = edges['src'].apply(replacements_lookup)
        edges['dst'] = edges['dst'].apply(replacements_lookup)

        resolve = lambda node: node_resolver.resolve(node, srctrl2original)

        edges['src'] = edges['src'].apply(resolve)
        edges['dst'] = edges['dst'].apply(resolve)

        edges = replace_mentions_with_subword_instances(
            edges,
            subword_tokenizer,
            create_subword_instances=create_subword_instances,
            connect_subwords=connect_subwords)

        resolve_node_id = lambda node: node_resolver.resolve_node_id(
            node, row['id'])

        edges['src'] = edges['src'].apply(resolve_node_id)
        edges['dst'] = edges['dst'].apply(resolve_node_id)

        extract_id = lambda node: node.id
        edges['src'] = edges['src'].apply(extract_id)
        edges['dst'] = edges['dst'].apply(extract_id)

        edges = edges.drop_duplicates(subset=["src", "dst", "type"])

        edges['id'] = 0

        ast_nodes = resolve_self_collision(
            filter_nodes(
                adjust_offsets(
                    to_offsets(c, get_ast_nodes(edges), as_bytes=True),
                    -strip_len), orig_body))

        srctrl_nodes = list(
            map(
                lambda x: (x[0], x[1],
                           node_resolver.resolve(GNode(name=x[2], type="Name"),
                                                 srctrl2original).global_id),
                to_offsets(row['body_with_random_replacements'],
                           format_replacement_offsets(
                               row['replacement_list']))))

        all_offsets = join_offsets(sorted(ast_nodes, key=lambda x: x[0]),
                                   sorted(srctrl_nodes, key=lambda x: x[0]))

        bodies_with_replacements[row['id']] = all_offsets

        edges['mentioned_in'] = row['id']
        ast_edges = append_edges(ast_edges=ast_edges, new_edges=edges)
        # print("\r%d/%d" % (ind_bodies, len(bodies['body_normalized'])), end="")

    # print(" " * 30, end="\r")

    bodies['graph_node_replacements'] = bodies['id'].apply(
        lambda id_: bodies_with_replacements.get(id_, None))

    ast_edges = ast_edges.append(node_resolver.get_mention_edges())
    ast_edges['id'] = 0

    ast_nodes = node_resolver.new_nodes_for_write()
    ast_edges = ast_edges.rename(
        {
            'src': 'source_node_id',
            'dst': 'target_node_id'
        }, axis=1).astype({'mentioned_in': 'Int32'})

    # assert leaf_nodes_are_leaf_types(ast_nodes, ast_edges)
    leaf_nodes_are_leaf_types(ast_nodes, ast_edges)

    return ast_nodes, ast_edges, bodies
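
# A hedged usage sketch for _get_from_ast (not part of the original source).
# Assumptions: `bodies` is a DataFrame with at least 'id',
# 'body_with_random_replacements', 'random_2_srctrl' and 'replacement_list'
# columns, and `node_resolver` implements the resolve / resolve_node_id /
# get_mention_edges / new_nodes_for_write interface used above; the
# tokenizer path below is hypothetical.
# ast_nodes, ast_edges, bodies = _get_from_ast(
#     bodies, node_resolver, bpe_tokenizer_path="bpe.model")
# Returns node and edge tables for the ASTs plus `bodies` augmented with a
# 'graph_node_replacements' column of per-body offsets.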