Example #1
def main():
    import argparse
    from os.path import join
    parser = argparse.ArgumentParser()
    parser.add_argument("working_directory")
    parser.add_argument("function_id", type=int)
    parser.add_argument("output_path")

    args = parser.parse_args()

    nodes_path = join(args.working_directory, "common_nodes.bz2")
    edges_path = join(args.working_directory, "common_edges.bz2")

    nodes, edges = load_data(nodes_path, edges_path, rename_columns=False)

    fn_edges = edges.query(f"mentioned_in == {args.function_id}")
    fn_node_ids = set(fn_edges["source_node_id"]) | set(fn_edges["target_node_id"])
    registered_edges = set(fn_edges["id"])
    # Edges from the rest of the graph that touch the function's nodes
    # (the commented-out clause would exclude the function's own edges).
    extra_edges = edges[
        edges["source_node_id"].apply(lambda id_: id_ in fn_node_ids)
        | edges["target_node_id"].apply(lambda id_: id_ in fn_node_ids)
        # & edges["id"].apply(lambda id_: id_ not in registered_edges)
    ]
    fn_nodes = nodes[
        nodes["id"].apply(lambda id_: id_ in fn_node_ids)
    ]
    visualize(fn_nodes, fn_edges, args.output_path)  # the function's node subset, not the full table
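The per-row apply(...) membership tests above can also be written with pandas' vectorized Series.isin, which is shorter and faster on large frames. A minimal sketch, assuming the same column names as in the example:

# Sketch: vectorized equivalent of the apply-based filters above.
extra_edges = edges[
    edges["source_node_id"].isin(fn_node_ids)
    | edges["target_node_id"].isin(fn_node_ids)
]
fn_nodes = nodes[nodes["id"].isin(fn_node_ids)]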
Example #2
import os

def main(args):
    nodes, edges = load_data(args.nodes, args.edges)

    hierarchy_detector = HierarchyDetector(nodes, edges)
    hierarchy_levels = hierarchy_detector.assign_hierarchy_levels()

    persist(hierarchy_levels,
            os.path.join(os.path.dirname(args.nodes), "hierarchies.csv"))

    print(
        hierarchy_levels.groupby("hierarchy_level")
            .count().rename({'id': 'count'}, axis=1)
            .sort_values(by='count', ascending=False).to_string()
    )
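The final print renders a frequency table: how many nodes sit on each hierarchy level, sorted by count. A small self-contained demo of the same chain on a made-up frame (the real one comes from HierarchyDetector):

import pandas as pd

# Hypothetical stand-in for the output of assign_hierarchy_levels().
hierarchy_levels = pd.DataFrame({
    "id": [1, 2, 3, 4, 5],
    "hierarchy_level": [0, 0, 1, 1, 1],
})

print(
    hierarchy_levels.groupby("hierarchy_level")
        .count().rename({'id': 'count'}, axis=1)
        .sort_values(by='count', ascending=False).to_string()
)
# Prints level 1 with count 3, then level 0 with count 2.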
Example #3
def main():
    import argparse
    import json
    from os.path import join

    parser = argparse.ArgumentParser()
    parser.add_argument("working_directory")
    parser.add_argument("output")
    args = parser.parse_args()

    nodes, edges = load_data(join(args.working_directory, "nodes.bz2"),
                             join(args.working_directory, "edges.bz2"))
    type_annotated = set(
        unpersist(join(args.working_directory,
                       "type_annotations.bz2"))["src"].tolist())
    arguments = set(nodes.query("type == 'arg'")["id"].tolist())
    mentions = set(nodes.query("type == 'mention'")["id"].tolist())

    edges["in_mentions"] = edges["src"].apply(lambda src: src in mentions)

    edges["in_args"] = edges["dst"].apply(lambda dst: dst in arguments)

    edges = edges.query("in_mentions == True and in_args == True")

    # Map each argument node to its mention; warn on conflicting mappings.
    mapping = {}
    for src, dst in edges[["src", "dst"]].values:
        if dst in mapping:
            print(f"Warning: argument {dst} is mapped to more than one mention")
        mapping[dst] = src

    with open(args.output, "w") as sink:
        with open(join(args.working_directory,
                       "function_annotations.jsonl")) as fa:
            for line in fa:
                entry = json.loads(line)
                new_repl = [[s, e, int(mapping.get(r, r))]
                            for s, e, r in entry["replacements"]]
                entry["replacements"] = new_repl

                sink.write(f"{json.dumps(entry)}\n")

    print()
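The rewrite loop swaps each replacement target r for the mention node it maps to, leaving r unchanged when no mapping exists. A tiny worked example with made-up ids:

import json

mapping = {17: 3}  # hypothetical: argument node 17 belongs to mention node 3

entry = {"replacements": [[0, 4, 17], [5, 9, 42]]}
entry["replacements"] = [[s, e, int(mapping.get(r, r))]
                         for s, e, r in entry["replacements"]]
print(json.dumps(entry))
# {"replacements": [[0, 4, 3], [5, 9, 42]]}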
Example #4
import os
import argparse

import pandas as pd

parser = argparse.ArgumentParser()
parser.add_argument('--nodes_path', dest='nodes_path', default=None, help='')
parser.add_argument('--edges_path', dest='edges_path', default=None, help='')
parser.add_argument('--held_path',
                    dest='held_path',
                    default=None,
                    help='Path to the file with holdout edges')
parser.add_argument('--entities_path',
                    dest='entities_path',
                    default=None,
                    help='')
parser.add_argument('--entities_emb_path',
                    dest='entities_emb_path',
                    default=None,
                    help='')
parser.add_argument('--out_path', dest='out_path', default=None, help='')

args = parser.parse_args()

ent_map, new_embs = load_npy(args.entities_path, args.entities_emb_path)

nodes, edges = load_data(args.nodes_path, args.edges_path)
pd.read_csv(args.held_path).to_csv(os.path.join(args.out_path, "held.csv"),
                                   index=False)

nodes['global_graph_id'] = nodes['id'].apply(lambda x: ent_map[x])

# splits = get_train_val_test_indices(nodes.index)
from SourceCodeTools.code.data.sourcetrail.sourcetrail_types import node_types
splits = SourceGraphDataset.get_train_val_test_indices(
    nodes.query(f"type_backup == '{node_types[4096]}'").index)

# nodes, edges, held = SourceGraphDataset.holdout(nodes, edges, 0.001)
# nodes['label'] = nodes['type']

# from SourceCodeTools.code.data.dataset.Dataset import create_train_val_test_masks
# def add_splits(nodes, splits):
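get_train_val_test_indices is a project helper, but the underlying technique is a shuffled proportional split of an index. A minimal numpy sketch, assuming 80/10/10 fractions (the helper's actual ratios and signature may differ):

import numpy as np

def train_val_test_indices(index, val_frac=0.1, test_frac=0.1, seed=42):
    # Shuffle positions, then carve off validation and test slices.
    rng = np.random.default_rng(seed)
    order = rng.permutation(len(index))
    n_val = int(len(index) * val_frac)
    n_test = int(len(index) * test_frac)
    val = index[order[:n_val]]
    test = index[order[n_val:n_val + n_test]]
    train = index[order[n_val + n_test:]]
    return train, val, test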
Example #5
def main():
    import argparse
    from os.path import join

    import networkx as nx
    from tqdm import tqdm
    parser = argparse.ArgumentParser()
    parser.add_argument("working_directory")
    parser.add_argument("k_hops", type=int)
    parser.add_argument("output")

    args = parser.parse_args()

    nodes, edges = load_data(join(args.working_directory, "common_nodes.bz2"),
                             join(args.working_directory, "common_edges.bz2"))

    # Edge-type lookup and adjacency lists for the recursive expansion below.
    edge_types = {}
    edge_lists = {}
    for s, d, t in edges[["src", "dst", "type"]].values:
        edge_types[(s, d)] = t
        if s not in edge_lists:
            edge_lists[s] = []
        edge_lists[s].append(d)

    # DiGraph view of the edges; only the commented-out nx variant below uses it.
    g = nx.from_pandas_edgelist(edges,
                                source="src",
                                target="dst",
                                create_using=nx.DiGraph,
                                edge_attr="type")

    # def expand_edges(node_id, view, edge_prefix, level=0):
    #     edges = []
    #     if level <= args.k_hops:
    #         if edge_prefix != "":
    #             edge_prefix += "|"
    #         for e in view:
    #             edges.append((node_id, e, edge_prefix + view[e]["type"]))
    #             edges.extend(expand_edges(node_id, g[e], edge_prefix + view[e]["type"], level=level+1))
    #     return edges
    #
    # edges = []
    # for node in tqdm(g.nodes):
    #     edges.extend(expand_edges(node, g[node], "", level=0))

    def expand_edges(node_id, s, dlist, edge_prefix, level=0):
        edges = []
        if level <= args.k_hops:
            if edge_prefix != "":
                edge_prefix += "|"
            for d in dlist:
                etype = edge_prefix + edge_types[(s, d)]
                edges.append((node_id, d, etype))
                edges.extend(
                    expand_edges(node_id,
                                 d,
                                 edge_lists.get(d, []),  # d may have no outgoing edges
                                 etype,
                                 level=level + 1))
        return edges

    edges = []
    for node in tqdm(edge_lists):
        edges.extend(expand_edges(node, node, edge_lists[node], "", level=0))

    print()
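expand_edges enumerates every path of up to k_hops + 1 edges out of a start node, labelling each reachable node with the |-joined sequence of edge types along the path. A tiny worked example on a two-edge chain (ids and type names are made up):

# Chain 1 -calls-> 2 -defines-> 3, expanded with k_hops = 1.
edge_types = {(1, 2): "calls", (2, 3): "defines"}
edge_lists = {1: [2], 2: [3]}
k_hops = 1

def expand_edges(node_id, s, dlist, edge_prefix, level=0):
    edges = []
    if level <= k_hops:
        if edge_prefix != "":
            edge_prefix += "|"
        for d in dlist:
            etype = edge_prefix + edge_types[(s, d)]
            edges.append((node_id, d, etype))
            edges.extend(expand_edges(node_id, d, edge_lists.get(d, []),
                                      etype, level=level + 1))
    return edges

print(expand_edges(1, 1, edge_lists[1], ""))
# [(1, 2, 'calls'), (1, 3, 'calls|defines')]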
Example #6
def main():
    import argparse
    import os
    from os.path import join

    import pandas as pd

    parser = argparse.ArgumentParser(description='Process some integers.')
    parser.add_argument('dataset_path',
                        default=None,
                        help='Path to the dataset')
    parser.add_argument('output_path', default=None, help='')
    parser.add_argument("--extra_objectives",
                        action="store_true",
                        default=False)
    parser.add_argument("--eval_frac",
                        dest="eval_frac",
                        default=0.05,
                        type=float)

    args = parser.parse_args()

    nodes_path, edges_path, extra_paths = get_paths(
        args.dataset_path, use_extra_objectives=args.extra_objectives)

    nodes, edges = load_data(nodes_path, edges_path)
    nodes, edges, holdout = SourceGraphDataset.holdout(nodes, edges)
    edges = edges.astype({
        "src": 'str',
        "dst": "str",
        "type": 'str'
    })[['src', 'dst', 'type']]
    holdout = holdout.astype({
        "src": 'str',
        "dst": "str",
        "type": 'str'
    })[['src', 'dst', 'type']]

    node2graph_id = compact_property(nodes['id'])
    nodes['global_graph_id'] = nodes['id'].apply(lambda x: node2graph_id[x])

    node_ids = set(nodes['id'].unique())

    if args.extra_objectives:
        for objective_path in extra_paths:
            data = unpersist(objective_path)
            data = filter_relevant(data, node_ids)
            data["type"] = objective_path.split(".")[0]
            # DataFrame.append was removed in pandas 2.0; use concat instead.
            edges = pd.concat([edges, data])

    if not os.path.isdir(args.output_path):
        os.mkdir(args.output_path)

    edges = edges[['src', 'dst', 'type']]
    eval_sample = edges.sample(frac=args.eval_frac)

    persist(nodes, join(args.output_path, "nodes_dglke.csv"))
    persist(edges,
            join(args.output_path, "edges_train_dglke.tsv"),
            header=False,
            sep="\t")
    persist(edges,
            join(args.output_path, "edges_train_node2vec.tsv"),
            header=False,
            sep=" ")
    persist(eval_sample,
            join(args.output_path, "edges_eval_dglke.tsv"),
            header=False,
            sep="\t")
    persist(eval_sample,
            join(args.output_path, "edges_eval_node2vec.tsv"),
            header=False,
            sep=" ")
    persist(holdout,
            join(args.output_path, "edges_eval_dglke_10000.tsv"),
            header=False,
            sep="\t")
    persist(holdout,
            join(args.output_path, "edges_eval_node2vec_10000.tsv"),
            header=False,
            sep=" ")
Example #7
import sys

import numpy as np


def load_w2v(path):
    # Loads embeddings in word2vec text format: a "<count> <dims>" header
    # line, then one "<token> <v1> ... <vn>" entry per line.
    id_map = {}
    vecs = []
    with open(path) as emb_file:
        _, n_dims = map(int, emb_file.readline().split())
        for line in emb_file:
            elements = line.rstrip().split(" ")
            if len(elements) != n_dims + 1:
                # assumed guard against malformed lines (original condition truncated)
                continue
            id_ = int(elements[0])  # assumed: tokens are integer node ids

            vec = list(map(float, elements[1:]))
            assert len(vec) == n_dims
            id_map[id_] = len(vecs)
            vecs.append(vec)
    vecs = np.array(vecs)
    return id_map, vecs


emb_path = sys.argv[1]
nodes_path = sys.argv[2]
edges_path = sys.argv[3]
out_path = sys.argv[4]

nodes, edges = load_data(nodes_path, edges_path)

# splits = get_train_val_test_indices(nodes.index)
from SourceCodeTools.code.data.sourcetrail.sourcetrail_types import node_types
splits = SourceGraphDataset.get_train_val_test_indices(
    nodes.query(f"type_backup == '{node_types[4096]}'").index)

id_map, vecs = load_w2v(emb_path)

nodes['global_graph_id'] = nodes['id'].apply(lambda x: id_map[x])

# nodes, edges, held = SourceGraphDataset.holdout(nodes, edges, 0.001)
# nodes['label'] = nodes['type']

# emb = Embedder.load_word2vec(emb_path)
emb = Embedder(id_map, vecs)
Example #8
def main(args):
    from neo4j import GraphDatabase

    nodes, edges = load_data(args.node_path, args.edge_path)

    print()

    uri = f"neo4j://{args.host}:{args.port}"
    driver = GraphDatabase.driver(uri,
                                  auth=(f"{args.user}", f"{args.password}"))

    def create_node(tx, id_, name_, node_type):
        # Labels cannot be parameterized in Cypher, so the (sanitized) type is
        # interpolated; id and name go through query parameters so that quotes
        # in names cannot break the statement.
        tx.run(
            f"CREATE (n:{node_type.replace('#', '_')} {{id: $id, name: $name}})",
            id=int(id_), name=str(name_))

    def create_edge(tx, src, dst, edge_type):
        # Relationship types cannot be parameterized either.
        tx.run(
            f"MATCH (s {{id: $src}}), (d {{id: $dst}}) "
            f"MERGE (s)-[:{edge_type}]->(d)",
            src=int(src), dst=int(dst))

    def create_indexes(tx, types):
        # Legacy index syntax (pre-Neo4j 4.x); newer servers use
        # CREATE INDEX FOR (n:Label) ON (n.prop).
        for node_type in types:
            tx.run(f"create index on :{node_type.replace('#', '_')}(id)")
            tx.run(f"create index on :{node_type.replace('#', '_')}(name)")

    with driver.session() as session:
        # https://neo4j.com/docs/api/python-driver/current/api.html#explicit-transactions

        print("Importing nodes")

        # Commit in batches of args.batch_size to keep transactions small.
        tx = session.begin_transaction()
        for ind, row in nodes.iterrows():
            create_node(tx, row['id'], row['name'], row['type'])
            if (ind + 1) % args.batch_size == 0:
                print(f"{ind}/{len(nodes)}", end="\r")
                tx.commit()
                tx.close()
                tx = session.begin_transaction()
        tx.commit()
        tx.close()

        print()

        print("Creating indexes")
        tx = session.begin_transaction()
        create_indexes(tx, types=nodes['type'].unique().tolist())
        tx.commit()
        tx.close()

        print("Importing edges")

        tx = session.begin_transaction()
        for ind, row in edges.iterrows():
            create_edge(tx, row['src'], row['dst'], row['type'])
            if (ind + 1) % args.batch_size == 0:
                print(f"{ind}/{len(edges)}", end="\r")
                tx.commit()
                tx.close()
                tx = session.begin_transaction()
        tx.commit()
        tx.close()

        print()

    driver.close()
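main expects an args object with connection and path settings. A minimal harness with flag names inferred from the attributes used above (node_path, edge_path, host, port, user, password, batch_size); the original script's actual flags may differ:

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("node_path")
    parser.add_argument("edge_path")
    parser.add_argument("--host", default="localhost")
    parser.add_argument("--port", default=7687, type=int)  # default bolt port
    parser.add_argument("--user", default="neo4j")
    parser.add_argument("--password", required=True)
    parser.add_argument("--batch_size", default=1000, type=int)

    main(parser.parse_args())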