def create_vocabulary_from_corpus(corpus_path, output_token_path=None):
    """
    Build a subtoken vocabulary from every graph in the corpus and optionally pickle it.
    """
    all_sub_tokens = []
    node_types = get_used_nodes_type()

    # Extract all subtokens from all nodes of the appropriate type using all graphs in the corpus
    for dirpath, dirs, files in os.walk(corpus_path):
        for filename in files:
            if filename.endswith('proto'):
                fname = os.path.join(dirpath, filename)

                with open(fname, "rb") as f:
                    g = Graph()
                    g.ParseFromString(f.read())

                    for n in g.node:
                        if n.type in node_types:
                            all_sub_tokens += split_identifier_into_parts(n.contents)

    all_sub_tokens = list(set(all_sub_tokens))
    all_sub_tokens.append('<SLOT>')
    all_sub_tokens.append('sos_token')
    all_sub_tokens.sort()

    vocabulary = __create_voc_from_tokens(all_sub_tokens)

    # Save the vocabulary
    if output_token_path is not None:
        with open(output_token_path, "wb") as fp:
            pickle.dump(vocabulary, fp)

    return vocabulary
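
# Usage sketch (the corpus directory and output file below are placeholders):
# build the vocabulary once, then reload the pickled copy instead of
# re-walking the corpus on every run.
vocab = create_vocabulary_from_corpus("corpus/graphs", output_token_path="vocab.pkl")

with open("vocab.pkl", "rb") as fp:
    reloaded_vocab = pickle.load(fp)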
def main(path):
    with open(path, "rb") as f:
        g = Graph()
        g.ParseFromString(f.read())
        token_count = len(list(filter(lambda n: n.type in
                                      (FeatureNode.TOKEN,
                                       FeatureNode.IDENTIFIER_TOKEN), g.node)))
        print("%s contains %d tokens" % (g.sourceFile, token_count))
def open_proto(filename):

    print(filename)

    with open(filename, "rb") as f:

        g = Graph()
        g.ParseFromString(f.read())

        get_info(g)
Example #4
def compute_corpus_stats(corpus_path):
    """
    Scan a corpus of graph protos and report the longest node, the longest
    variable name, and the largest number of usages of any variable.
    """

    max_node_len, max_var_len, max_var_usage = 0, 0, 0

    for dirpath, dirs, files in os.walk(corpus_path):
        for filename in files:
            if filename.endswith('proto'):

                fname = os.path.join(dirpath, filename)

                with open(fname, "rb") as f:

                    g = Graph()
                    g.ParseFromString(f.read())

                    var_node_usages = {}
                    # Use a set so the membership test in the edge loop below is O(1)
                    identifier_node_ids = set()

                    for node in g.node:

                        if node.type not in get_used_nodes_type() \
                                and node.type != FeatureNode.SYMBOL_VAR:
                            continue

                        node_len = len(
                            split_identifier_into_parts(node.contents))

                        if node_len > max_node_len:
                            max_node_len = node_len

                        if node.type == FeatureNode.SYMBOL_VAR:

                            var_node_usages[node.id] = 0

                            if node_len > max_var_len:
                                max_var_len = node_len

                        elif node.type == FeatureNode.IDENTIFIER_TOKEN:
                            identifier_node_ids.add(node.id)

                    for edge in g.edge:

                        if edge.sourceId in var_node_usages and edge.destinationId in identifier_node_ids:
                            var_node_usages[edge.sourceId] += 1

                    if var_node_usages:
                        var_usage = max(var_node_usages.values())
                    else:
                        var_usage = 0

                    if var_usage > max_var_usage:
                        max_var_usage = var_usage

    print("Longest node length: ", max_node_len)
    print("Longest variable length: ", max_var_len)
    print("Largest variable usage: ", max_var_usage)
Example #5
def runAnalysis(fileLocation):
    with open(fileLocation, "rb") as f:
        if verbose:
            print("Opening " + fileLocation, end='')

        g = Graph()
        g.ParseFromString(f.read())
        logs = detectLogs(g)
        if verbose:
            print(" ------- Number of logs found: " + str(len(logs)))
        return logs
Example #6
def get_n_tokens(path, n):
    with open(path, "rb") as f:
        g = Graph()
        g.ParseFromString(f.read())
        to_print_len = min(len(g.node), n)
        token_list = []
        for node in filter(isToken, g.node):
            token_list.append(node.contents)
            to_print_len -= 1
            if to_print_len <= 0:
                break
        return token_list
Example #7
def get_types_and_dependencies(file_name, max_time=30):
    with open(file_name, "rb") as f:
        graph = Graph()
        graph.ParseFromString(f.read())
        id_mapping = get_id_to_node_graph(graph)
        source_mapping = get_source_dict_graph(graph)
        global START_TIME, MAX_TIME
        START_TIME = time.time()
        MAX_TIME = max_time
        variable_types = get_type_mapping(graph, id_mapping, source_mapping)
        dependencies = get_all_dependencies(graph, id_mapping, source_mapping,
                                            variable_types)
        return variable_types, dependencies
    def compute_var_usages(self):

        if self.num_usages is not None: return self.num_usages

        with open(self.fname, "rb") as f:

            g = Graph()
            g.ParseFromString(f.read())

            n_usages = get_var_usages(g, self.node_id)

        self.num_usages = n_usages

        return n_usages
    def compute_var_type(self):

        if self.type != self.empty_type: return self.type

        with open(self.fname, "rb") as f:

            g = Graph()
            g.ParseFromString(f.read())

            var_type = get_var_type(g, self.node_id, self.empty_type)

        self.type = var_type

        return var_type
def get_file_methods_data(file):
    """
    Extract the source code tokens, identifier names and graph for methods in a source file.
    Identifier tokens are split into subtokens. Constructors are not included in the methods.
    :param file: path object (e.g. pathlib.Path) pointing to the graph proto of a source file
    :return: (methods_source, methods_names, methods_graph) where methods_source[i] is a list of the tokens for
    the source of ith method in the file, methods_names[i] is a list of tokens for name of the
    ith method in the file, and methods_graph[i] is the subtree of the file parse tree starting
    from the method node.
    """
    adj_list, nodes, edges = get_file_graph(file)

    with file.open('rb') as f:
        class_name = file.name.split('.')[0]

        g = Graph()
        g.ParseFromString(f.read())
        methods_source = []
        methods_names = []
        methods_graph = []
        # class_name_node = get_class_name_node(g)

        for node in g.node:
            if node.contents == "METHOD":
                method_name_node = get_method_name_node(g, node)

                # If the method name matches the class name, this method is a
                # constructor, so skip it
                if method_name_node.contents == class_name:
                    continue

                method_edges, method_nodes, non_tokens_nodes_features = get_method_edges(
                    node.id, adj_list, nodes)
                methods_graph.append((method_edges, non_tokens_nodes_features))
                methods_names.append(
                    split_identifier_into_parts(method_name_node.contents))

                method_source = []

                for other_node in method_nodes.values():
                    if other_node.id == method_name_node.id:
                        # Replace method name with '_' in method source code
                        method_source.append('_')
                    elif other_node.type == FeatureNode.TOKEN or other_node.type == \
                            FeatureNode.IDENTIFIER_TOKEN:
                        method_source.append(other_node.contents)

                methods_source.append(method_source)

        return methods_source, methods_names, methods_graph
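
# Usage sketch (path is a placeholder; `file` is assumed to be a pathlib.Path):
# the three returned lists are parallel, so index i describes the same method in each.
from pathlib import Path

sources, names, graphs = get_file_methods_data(Path("Example.java.proto"))
for source_tokens, name_tokens in zip(sources, names):
    print(" ".join(name_tokens), "->", len(source_tokens), "source tokens")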
def load_data_file(file_path: str) -> Iterable[List[str]]:
    """
    Load a single data file, returning token streams.

    Args:
        file_path: The path to a data file.

    Returns:
        Iterable of lists of strings, each a list of tokens observed in the data.
    """
    with open(file_path, "rb") as f:
        g = Graph()
        g.ParseFromString(f.read())

        return method_split_tokens(g)
Example #12
    def create_samples(self, filepath):

        with open(filepath, "rb") as f:

            g = Graph()
            g.ParseFromString(f.read())

            true_labels = []

            max_path_len = 8

            graph_samples = log_graph_processing.get_log_samples(
                g, max_path_len, self.max_node_seq_len, self.pad_token_id,
                self.vocabulary)

            samples, labels, slot_labels = [], [], []

            try:
                with open(
                        os.path.splitext(filepath)[0].replace("java", "json"),
                        "r") as f_:
                    labels, slot_labels = json.load(f_)
                    if len(slot_labels) != len(graph_samples) or len(
                            labels) != sum(slot_labels):
                        print(
                            "Error: labels and samples don't match, num of labels: %d, num of samples: %d, filename: %s"
                            % (len(labels), len(graph_samples), filepath))
                        os.system("rm -f " + filepath[:-11] + "*")
            except FileNotFoundError:
                print(
                    "Warning: label file not found. This is fine if you are not training the model"
                )
                labels = [self.label_kind - 1] * len(graph_samples)
                slot_labels = [1] * len(graph_samples)

            count = 0

            for i in range(len(graph_samples)):
                if slot_labels[i] != 0:
                    new_sample = self.create_sample(*(graph_samples[i]),
                                                    labels[count])
                    samples.append(new_sample)
                    true_labels.append(labels[count])
                    count += 1

            return samples, true_labels
def count_one(path):
    with open(path, "rb") as f:
        g = Graph()
        g.ParseFromString(f.read())
        token_count = len(
            list(
                filter(
                    lambda n: n.type in
                    (FeatureNode.TOKEN, FeatureNode.IDENTIFIER_TOKEN),
                    g.node)))
        # max_line = max(list(map(lambda n: n.endLineNumber, g.node)))
        max_line = g.ast_root.endLineNumber
        javadoc_comments = len(
            list(
                filter(lambda n: n.type == FeatureNode.COMMENT_JAVADOC,
                       g.node)))
        return token_count, max_line, javadoc_comments
def get_nx_graph(file):
    """
    Get networkx graph corresponding to a file.
    """
    nx_graph = nx.DiGraph()
    with file.open('rb') as f:
        g = Graph()
        g.ParseFromString(f.read())

        for edge in g.edge:
            # Recover the FeatureEdge enum name that matches this edge's numeric type
            edge_type = [
                name for name, value in list(vars(FeatureEdge).items())[8:]
                if value == edge.type
            ][0]
            nx_graph.add_edge(edge.sourceId,
                              edge.destinationId,
                              edge_type=edge_type)
    return nx_graph
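
# Usage sketch (path is a placeholder): the result is a plain networkx DiGraph,
# so standard graph queries apply directly.
from pathlib import Path

nx_graph = get_nx_graph(Path("Example.java.proto"))
print(nx_graph.number_of_nodes(), "nodes,", nx_graph.number_of_edges(), "edges")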
Example #15
    def load_data_file(_, file_path: str) -> Iterable[Tuple[str, bool]]:
        """
        Load a single data file, returning lower-cased tokens flagged as identifiers or not.

        Args:
            file_path: The path to a data file.

        Returns:
            Iterable of (token, is_identifier) pairs, one per token node observed in the data.
        """
        with open(file_path, "rb") as f:
            g = Graph()
            g.ParseFromString(f.read())
            v = [(n.contents.lower(), n.type == FeatureNode.IDENTIFIER_TOKEN) for n in g.node
                    if n.type in
                        [FeatureNode.TOKEN, FeatureNode.IDENTIFIER_TOKEN]
                    ]
            return v
Example #16
def modifyGraphFile(graphFile, rootId):
    g = Graph()
    g.ParseFromString(graphFile.read())
    nodes = g.node
    edges = g.edge

    # get all the relevant log nodes
    allLogNodes, baseNodeIndex, \
    lastLogNodeIndex, lastNodeEndLineNumber, lastNodeEndPosition = retrieveAllLogsNodes(nodes, rootId)
    # get the relevant node ids
    allLogNodesIds = list(map(lambda node: node.id, allLogNodes))

    # STEP 1) Remove any edge that both originates from and targets nodes within our log statement
    edges = removeLogEdges(edges, allLogNodesIds)
    # STEP 2) Modify any edge that links to one of our log nodes within the statement. These edges will now point to the
    # root LOG node.
    edges = adjustOutsideEdges(edges, allLogNodesIds)
    # STEP 3) Modify all the log nodes. Modify root node to be special LOG node, delete rest.
    nodes = modifyNodes(nodes, baseNodeIndex, lastLogNodeIndex,
                        lastNodeEndLineNumber, lastNodeEndPosition)

    # create a new Graph file to return for writing, using all the modified nodes and edges
    returnGraph = Graph()
    for node in nodes:
        graphNode = graph_pb2.FeatureNode()
        graphNode.id = node.id
        graphNode.type = node.type
        graphNode.contents = removeImportLeaks(node.contents)
        graphNode.startPosition = node.startPosition
        graphNode.endPosition = node.endPosition
        graphNode.startLineNumber = node.startLineNumber
        graphNode.endLineNumber = node.endLineNumber
        # append our node to the new graph file
        returnGraph.node.append(graphNode)
    for edge in edges:
        graphEdge = graph_pb2.FeatureEdge()
        graphEdge.sourceId = edge.sourceId
        graphEdge.destinationId = edge.destinationId
        graphEdge.type = edge.type
        # append our edge to the graph file
        returnGraph.edge.append(graphEdge)

    graphFile.close()
    return returnGraph
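
# Usage sketch (the file name and rootId are placeholders): rewrite one graph and
# serialize the modified copy with the standard protobuf API.
with open("Example.java.proto", "rb") as graphFile:
    modified = modifyGraphFile(graphFile, rootId=42)
with open("Example.java.modified.proto", "wb") as out:
    out.write(modified.SerializeToString())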
    def create_samples(self, filepath):

        with open(filepath, "rb") as f:

            g = Graph()
            g.ParseFromString(f.read())

            max_path_len = 8

            # Select sample parsing strategy depending on the specified model task
            if self.task_type == 0:
                graph_samples, slot_node_ids = graph_processing.get_usage_samples(
                    g, max_path_len, self.max_slots, self.max_node_seq_len,
                    self.pad_token_id, self.slot_id, self.vocabulary)

            elif self.task_type == 1:
                graph_samples, slot_node_ids = graph_processing.get_usage_samples(
                    g, max_path_len, self.max_slots, self.max_node_seq_len,
                    self.pad_token_id, self.slot_id, self.vocabulary, True)

            elif self.task_type == 2:
                graph_samples, slot_node_ids = graph_processing.get_method_body_samples(
                    g, self.max_node_seq_len, self.pad_token_id, self.slot_id,
                    self.vocabulary)

            else:
                raise ValueError("Invalid task id...")

            samples, labels = [], []

            for sample in graph_samples:
                new_sample, new_label = self.create_sample(*sample)
                samples.append(new_sample)
                labels.append(new_label)

            # Save sample meta-information
            samples_meta_inf = []

            for slot_node_id in slot_node_ids:
                new_inf = SampleMetaInformation(filepath, slot_node_id)
                samples_meta_inf.append(new_inf)

            return samples, labels, samples_meta_inf
def get_file_graph(file):
    """
    Compute graph for the given file.
    """
    with file.open('rb') as f:
        g = Graph()
        g.ParseFromString(f.read())
        node_ids = [node.id for node in g.node]
        edges = [(e.sourceId, e.destinationId, e.type) for e in g.edge]

        adj_list = {node: [] for node in node_ids}
        for edge in edges:
            adj_list[edge[0]].append({
                'destination': edge[1],
                'edge_type': edge[2]
            })

        nodes = {node.id: node for node in g.node}

        return adj_list, nodes, edges
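
# Usage sketch (path is a placeholder): the adjacency list maps node id -> list of
# {'destination', 'edge_type'} dicts, so per-node fan-out is a simple comprehension.
from pathlib import Path

adj_list, nodes, edges = get_file_graph(Path("Example.java.proto"))
out_degree = {node_id: len(targets) for node_id, targets in adj_list.items()}
print("max fan-out:", max(out_degree.values()))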
Example #19
def main():
    with open(str(graphLocation), "rb") as graphFile:
        g = Graph()
        g.ParseFromString(graphFile.read())
        logIds = []
        with open(graphLocation.name + ".dot", "w") as f:
            # write the first line of the dot
            f.write("digraph G {\n")
            # for each node, write out the node with the contents as its label
            for node in g.node:
                # the first line writes the node type as the label, the second the node contents. Second is better.
                # f.write(str(node.id) + ' [ label="' + toNodeText(node) + '" ];\n')
                f.write(str(node.id) + ' [ label="' + re.escape(node.contents) + '" ];\n')
                if (toNodeText(node) == "LOG"):
                    logIds.append(node.id)
            # for each edge, write out the edge as a link between the source and the destination
            for edge in g.edge:
                f.write((str(edge.sourceId) + " -> " + str(edge.destinationId) + "\n"))
            # Pretify any special LOG nodes by making them distinct (Square with diamond inside)
            for logId in logIds:
                f.write(str(logId) + " [shape=Msquare];")
            f.write("}\n")
            f.close()
Example #20
def load_data_file(file_path: str) -> Iterable[List[str]]:
    """
    Load a single data file, returning token streams.

    Args:
        file_path: The path to a data file.

    Returns:
        Iterable of lists of strings, each a list of tokens observed in the data.
    """
    #TODO 2# Insert your data parsing code here
    # Helper that checks whether a node is a METHOD AST element
    def isMethod(node):
        return node.type == FeatureNode.AST_ELEMENT and node.contents == "METHOD"

    # Helper that decides whether a node is a token
    def isToken(node):
        return node.type in (FeatureNode.TOKEN, FeatureNode.IDENTIFIER_TOKEN)

    # Retrieve token leaf nodes, by DFS
    def get_leaf_nodes(nodeId, sourceDict, nodeDict, visited):
        if nodeId in visited:
            return []
        visited.add(nodeId)
        if nodeId is None or nodeDict.get(nodeId) is None:
            return []
        if nodeDict.get(nodeId).type in [FeatureNode.TOKEN,
                                         FeatureNode.IDENTIFIER_TOKEN]:
            return [nodeDict.get(nodeId)]
        edgeTo = sourceDict.get(nodeId)
        if edgeTo is None:
            return []
        to_return = []
        for edge in edgeTo:
            to_return += get_leaf_nodes(edge.destinationId, sourceDict, nodeDict, visited)
        return to_return

    # Reorder leaf nodes from top to bottom
    def reorder_leaves(leaves_arr, sourceDict, nodeDict):
        leaves_map = dict()
        for (index, node) in enumerate(leaves_arr):
            leaves_map[node.id] = index
        length = len(leaves_arr)
        index_sum = int(((length - 1) * length) / 2)
        for node in leaves_arr:
            if (node.id in sourceDict) and ((sourceDict[node.id][0]).destinationId in leaves_map):
                index_sum -= leaves_map[(sourceDict[node.id][0]).destinationId]
        current = leaves_arr[index_sum]
        to_return = []
        for _ in range(length):
            to_return.append(current)
            if current.id in sourceDict:
                current = nodeDict[(sourceDict[current.id][0]).destinationId]
            else:
                break
        return to_return

    # Get tokens for given file
    with open(file_path, "rb") as f:
        g = Graph()
        g.ParseFromString(f.read())
        idsInNode = dict()
        sourceIdsInEdge = dict()
        for node in g.node:
            idsInNode[node.id] = node
        for edge in g.edge:
            cur = sourceIdsInEdge.get(edge.sourceId, [])
            cur.append(edge)
            sourceIdsInEdge[edge.sourceId] = cur
        all_results = []
        for node in g.node:
            if isMethod(node):
                initial_leaves = reorder_leaves(get_leaf_nodes(node.id, sourceIdsInEdge, idsInNode, set()), \
                                                sourceIdsInEdge, idsInNode)
                correct = [str(n.contents).lower() for n in filter(isToken, initial_leaves)]
                all_results.append(correct)
        return all_results
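
# Usage sketch (path is a placeholder): each element of the result is the ordered,
# lower-cased token stream of one method in the file.
for method_tokens in load_data_file("Example.java.proto"):
    print(" ".join(method_tokens))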
def get_source_dict(path):
    with open(path, "rb") as f:
        g = Graph()
        g.ParseFromString(f.read())
        return get_source_dict_graph(g)
def tokenize_methods_for_file(path, full=False):
    with open(path, "rb") as f:
        g = Graph()
        g.ParseFromString(f.read())
        return tokenize_methods_for_graph(g, full)
def get_id_to_node(path):
    with open(path, "rb") as f:
        g = Graph()
        g.ParseFromString(f.read())
        return get_id_to_node_graph(g)
Example #24
def load_data_file_methods(file_path: str):
    """
    Load a single data file, returning methods code and JavaDoc comments.
    """

    methods_code = []
    methods_comments = []
    graphs = []

    g = Graph()
    with open(file_path, "rb") as f:
        g.ParseFromString(f.read())

    # Build a dictionary of nodes indexed by id
    # by start position and end position
    nodes_dict = {}
    tokens_by_start_pos = {}
    tokens_by_end_pos = {}
    # A list of methods root nodes
    methods = []
    for n in g.node:
        nodes_dict[n.id] = n
        if n.contents == 'METHOD':
            methods.append(n)
        if n.type in (FeatureNode.TOKEN, FeatureNode.IDENTIFIER_TOKEN):
            tokens_by_start_pos[n.startPosition] = n
            tokens_by_end_pos[n.endPosition] = n

    # Build a dictionary of edges indexed by source id
    edges_dict = {}
    for e in g.edge:
        if e.sourceId in edges_dict:
            edges_dict[e.sourceId].append(e)
        else:
            edges_dict[e.sourceId] = [e]

    for m in methods:
        # Start with the token node that starts at the same position as the
        # method's start position
        nid = tokens_by_start_pos[m.startPosition].id
        tokens = []
        comment = ""

        # Follow the 'next token' edges up to the token finishing at the method's end position
        while nid != tokens_by_end_pos[m.endPosition].id:
            tokens.append(nodes_dict[nid].contents.lower())
            if nid in edges_dict:
                for e in edges_dict[nid]:
                    if e.type == FeatureEdge.NEXT_TOKEN:
                        nid = e.destinationId

        for n in g.node:
            if n.type == FeatureNode.COMMENT_JAVADOC and m.id == edges_dict[
                    n.id][0].destinationId:
                comment = format_comment_to_plain_text(n.contents)

        # Keep only non-empty methods that have a comment, have a body (an 'lbrace'
        # token, i.e. are not abstract), and are shorter than 200 tokens.
        if len(tokens) > 0 and len(comment) > 0 and 'lbrace' in tokens and len(
                tokens) < 200:
            methods_code.append(tokens)
            methods_comments.append(comment)

            methods_edges, nodes_features = get_method_graph(
                m, nodes_dict, edges_dict)

            graph = {
                'Target': word_tokenize(comment),
                'Source_len': len(tokens),
                'graph': {
                    'node_features': nodes_features,
                    'adjacency_lists': methods_edges
                }
            }
            if len(nodes_features) < 300:
                graphs.append(graph)

            # print(graph)

    return methods_code, methods_comments, graphs
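
# Usage sketch (path is a placeholder): the three return values are parallel, and
# each graph dict carries the fields built above ('Target', 'Source_len', 'graph').
methods_code, methods_comments, graphs = load_data_file_methods("Example.java.proto")
if methods_code:
    print(" ".join(methods_code[0]))
    print(methods_comments[0])
    print(graphs[0]["Source_len"])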
def obfuscate_graph(g, precomputed_name_files):
    id_mapping = get_id_to_node_graph(g)
    source_mapping = get_source_dict_graph(g)
    start_node = g.ast_root
    initialPath = [start_node]
    to_obfuscate = get_obfuscation_names(start_node.id, id_mapping,
                                         source_mapping, set(), initialPath)

    new_names = get_new_names(len(to_obfuscate), precomputed_name_files)
    new_names_mapping = create_names_mapping(to_obfuscate, new_names)
    substitute_all(g.node, new_names_mapping)
    return g


if __name__ == "__main__":
    filePath = sys.argv[1]
    precomputed_name_files = "precomputed_names.txt"
    with open(filePath, "rb") as f:
        untouched = Graph()
        untouched.ParseFromString(f.read())
        obfuscated_graph = obfuscate_path(filePath, precomputed_name_files)
        before = tokenize_methods_for_graph(untouched)
        after = tokenize_methods_for_graph(obfuscated_graph)

        print("BEFORE:")
        print(before)

        print("AFTER:")
        print(after)
def obfuscate_path(path, precomputed_name_files):
    with open(path, "rb") as f:
        g = Graph()
        g.ParseFromString(f.read())
        return obfuscate_graph(g, precomputed_name_files)
def convertGraph(graphLoc, severity, msgToken):
    # return JSON structure visualized.
    returnJSON = {
        "backbone_sequence": [],
        "node_labels": [],
        "edges": {},
        "method_name": [],
        "log_node": -1
    }
    with open(graphLoc, "rb") as graphFile:
        g = Graph()
        g.ParseFromString(graphFile.read())
        nodes = g.node
        edges = g.edge
        # make a map of all the backbone nodes by looping over the edges and getting any node that
        # has a NEXT_TOKEN pointing to/from it
        backboneNodes = {}
        for edge in edges:
            if edge.type == 2:  # edge type 2 is NEXT_TOKEN
                # overwriting an existing entry here is fine
                backboneNodes[edge.destinationId] = True
                backboneNodes[edge.sourceId] = True
        # Map of nodeId->Index. Used in step 3
        IdIndexDict = {}
        # Used to get the index of our special node to put it to the JSON
        specialLogNodeIndex = -1
        for index, node in enumerate(nodes):
            # got the special node
            if node.type == 17:
                specialLogNodeIndex = index
            # add the node's index to the id map
            IdIndexDict[node.id] = index
            # STEP 1) Create backbone_sequence by first checking if the node is a backbone node and then appending its
            # index to the array
            if node.id in backboneNodes:
                returnJSON["backbone_sequence"].append(index)

            # STEP 2) Create node_labels by appending the node contents to the array
            returnJSON["node_labels"].append(node.contents)
        # STEP 3) Create edges by parsing the edge data into the correct format
        returnDict = {}
        for edge in edges:
            edge_type_name = EdgeType(edge.type).name
            if edge_type_name not in returnDict:
                returnDict[edge_type_name] = []
            sourceIndex = IdIndexDict[edge.sourceId]
            destinationIndex = IdIndexDict[edge.destinationId]
            returnDict[edge_type_name].append([sourceIndex, destinationIndex])
        returnJSON["edges"] = returnDict
        # STEP 4) If we are trying to predict the logging statement, add the tokenized msg to the
        # prediction variable (method_name). Also put the logging level inside as well.
        # If we are trying to predict the severity, add the severity to the prediction variable only.
        if args.statement_generation:
            returnJSON["method_name"] = msgToken
            # TODO check if severity should be here
            #returnJSON["severity"] = severity
        else:
            returnJSON["method_name"].append(severity)
        # STEP 5) Add the index of the log node to the JSON
        returnJSON["log_node"] = specialLogNodeIndex
        return json.dumps(returnJSON)
    def parse_file(self, file_name, only_javadoc=True, lowercase_api=True, should_subtokenize=False):
        """
        Extracts features from a single protobuf file.
        """

        names = []
        apis = []
        javadocs = []
        tokens = []
        method_bodies = []

        with open(file_name, 'rb') as proto_file:
            g = Graph()

            # Parse the protobuf file as a graph, skipping any file that fails to parse.
            try:
                g.ParseFromString(proto_file.read())
            except Exception:
                print('Error parsing: {0}'.format(file_name))
                return tokens, apis, names, javadocs, method_bodies

            code_graph = CodeGraph(g)

            # We either extract features from all methods or only from those which
            # have an associated Javadoc comment.
            method_dict = code_graph.methods if only_javadoc else code_graph.all_methods

            for method in method_dict.values():

                # Omit methods which are below the defined threshold
                if method.num_lines <= self.line_threshold:
                    continue

                # Parse method name tokens
                method_name_tokens = self.text_filter.apply_to_method_name(method.method_name)

                # Parse API invocations
                method_invocations = self._get_method_invocations(method.method_block, code_graph)
                api_call_tokens = []
                for invocation in method_invocations:
                    parsed_invocation = self._parse_method_invocation(invocation, code_graph).strip()
                    api_call_tokens.append(parsed_invocation)

                obj_init_tokens = self._get_object_inits(method.method_block, code_graph)
                api_call_tokens += obj_init_tokens
                api_call_tokens = self.text_filter.apply_to_api_calls(api_call_tokens, lowercase_api,
                                                                      should_subtokenize=should_subtokenize)

                # Parse Javadoc comments. We check that the method has an associated
                # Javadoc comment, as methods used during testing may have none.
                javadoc_tokens = []
                if method.javadoc:
                    javadoc_tokens = self.text_filter.apply_to_javadoc(method.javadoc.contents)

                # Parse method tokens
                method_tokens = self._get_method_tokens(method.method_block, code_graph)
                method_tokens = self.text_filter.apply_to_token_lst(method_tokens)

                # Extract the entire method body. This field is used during searching.
                method_str = self._method_to_str(method.method_block, code_graph)

                # During testing, we only omit methods for which there is no proper method body
                if not only_javadoc and len(method_str.strip()) > 0 and len(method_name_tokens) > 0:
                    # Tokens in the output files are separated by spaces
                    names.append(' '.join(method_name_tokens))
                    apis.append(' '.join(api_call_tokens))
                    tokens.append(' '.join(method_tokens))
                    javadocs.append(' '.join(javadoc_tokens))
                    method_bodies.append(method_str)

                # During training, we only omit methods which have no name or javadoc description
                if only_javadoc and len(javadoc_tokens) > 0 and len(method_name_tokens) > 0:
                    # Tokens in the output files are separated by spaces
                    names.append(' '.join(method_name_tokens))
                    apis.append(' '.join(api_call_tokens))
                    tokens.append(' '.join(method_tokens))
                    javadocs.append(' '.join(javadoc_tokens))
                    method_bodies.append(method_str)

        return tokens, apis, names, javadocs, method_bodies
def compute_names_and_types(nodes, id_mapping, source_mapping):
    mapping = dict()
    for node in nodes:
        name = get_variable_name(node, id_mapping, source_mapping)
        var_type = get_variable_type(node, id_mapping, source_mapping)
        if '' in (name, var_type):
            continue
        mapping[name] = var_type
    return mapping


def get_type_mapping(g, id_mapping=None, source_mapping=None):
    if id_mapping is None:
        id_mapping = get_id_to_node_graph(g)
    if source_mapping is None:
        source_mapping = get_source_dict_graph(g)
    root = g.ast_root
    all_members = get_variables(root, id_mapping, source_mapping)
    all_members.extend(get_classes(root, id_mapping, source_mapping))
    all_members.extend(get_methods(root, id_mapping, source_mapping))
    return compute_names_and_types(all_members, id_mapping, source_mapping)


if __name__ == "__main__":
    filePath = sys.argv[1]
    with open(filePath, "rb") as f:
        graph = Graph()
        graph.ParseFromString(f.read())
        type_mapping = get_type_mapping(graph)
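        # Hypothetical follow-up: print the inferred name -> type mapping so the
        # script produces visible output when run directly.
        print(type_mapping)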