Example No. 1
0
def print_num_nodes_all(native_dir, interface_dir, complement_dir):
    """
    Print node counts for every native/interface/complement graph triple.

    Iterates over the .json graph files in native_dir and, for each file
    that has matching interface and complement graphs, calls
    print_num_nodes on the three loaded graphs.

    :param native_dir: directory of native .json graphs (drives iteration)
    :param interface_dir: directory of matching interface graphs
    :param complement_dir: directory of matching complement graphs
    """
    for graph_file in os.listdir(native_dir):
        if '.json' not in graph_file:
            continue

        # Skip (rather than abort) when any counterpart is missing, so one
        # incomplete triple does not stop the whole run.  (The original code
        # used `break` for a missing interface graph, which silently dropped
        # every remaining file.)
        try:
            interface = load_json(os.path.join(interface_dir, graph_file))
        except FileNotFoundError:
            print('\nWARNING, interface graph not found for: ',
                    graph_file, '\n')
            continue
        try:
            complement = load_json(os.path.join(complement_dir, graph_file))
        except FileNotFoundError:
            print('\nWARNING, complement graph not found for: ',
                    graph_file, '\n\n')
            continue
        try:
            native = load_json(os.path.join(native_dir, graph_file))
        except FileNotFoundError:
            print('\nWARNING, native graph not found for: ',
                    graph_file, '\n\n')
            continue
        print_num_nodes(native, interface, complement)

    return
Example No. 2
0
def remove_graphs_p(graph_dir, interaction, threshold=1):
    """
    Delete binding-free graph files until the dataset reaches a target ratio.

    First pass counts total and binding nodes (for the given interaction)
    over every graph in graph_dir.  Second pass removes graph files with
    zero binding nodes, stopping once binding_nodes * threshold exceeds
    the remaining non-binding count.

    :param graph_dir: directory of .json graph files (files may be deleted)
    :param interaction: interaction name; a node is "binding" when its
        'binding_<interaction>' attribute is not None
    :param threshold: target ratio of non-binding to binding nodes
    :return: (remaining non-binding node count, binding node count)
    """
    num_nodes = 0
    binding_nodes = 0
    # First pass: count binding and non-binding nodes across all graphs.
    for graph_file in tqdm(listdir_fullpath(graph_dir)):
        g = load_json(graph_file)
        g_binding_nodes = len([n for n, d in g.nodes.data()
                               if d['binding_' + interaction] is not None])
        binding_nodes += g_binding_nodes
        num_nodes += len(g.nodes)

    num_NB = num_nodes - binding_nodes
    # Second pass: delete graphs that carry no binding nodes at all,
    # stopping as soon as the balance target is met.
    for graph_file in tqdm(listdir_fullpath(graph_dir)):
        if binding_nodes * threshold > num_NB:
            break
        g = load_json(graph_file)
        g_binding_nodes = len([n for n, d in g.nodes.data()
                               if d['binding_' + interaction] is not None])
        if g_binding_nodes == 0:
            os.remove(graph_file)
            num_NB -= len(g.nodes)

    return num_NB, binding_nodes
Example No. 3
0
def remove_nodes(graph_dir,
                 interaction,
                 num_NB,
                 num_binding,
                 threshold,
                 remove_size=0.3):
    """
    Undersample non-binding nodes from graphs to produce a balanced dataset.

    Repeatedly sweeps over the graphs in graph_dir, removing a random
    sample of non-binding nodes from each sufficiently large graph and
    rewriting it in place, until num_binding * threshold >= num_NB or no
    graph can make further progress.

    :param graph_dir: directory of .json graph files (modified in place)
    :param interaction: interaction name; a node is non-binding when its
        'binding_<interaction>' attribute is None
    :param num_NB: current number of non-binding nodes across the dataset
    :param num_binding: number of binding nodes across the dataset
    :param threshold: target ratio of non-binding to binding nodes
    :param remove_size: ignored -- immediately recomputed from the dataset
        totals below (parameter kept for backward compatibility)
    """
    # Per-graph removal count derived from the excess of non-binding nodes,
    # deliberately overriding the remove_size argument.
    remove_size = int((num_NB - num_binding) / len(os.listdir(graph_dir)))
    remove_size -= 10
    # Guard: remove at least one node per eligible graph; a non-positive
    # remove_size would make `trash` empty and the while loop spin forever.
    remove_size = max(remove_size, 1)

    while num_binding * threshold < num_NB:
        print(f"Binding: {num_binding} \t Non-Binding {num_NB}")
        small = 0
        for graph_file in tqdm(listdir_fullpath(graph_dir)):
            g = load_json(graph_file)
            NB_nodes = [n for n, d in g.nodes.data()
                        if d['binding_' + interaction] is None]
            # Leave small graphs, and graphs with no non-binding nodes, alone.
            if len(g.nodes) < 20 or len(NB_nodes) == 0:
                small += 1
                continue
            shuffle(NB_nodes)
            trash = NB_nodes[:remove_size]
            g.remove_nodes_from(trash)
            num_NB -= len(trash)
            # NOTE(review): argument order here differs from the
            # dump_json(obj, path) calls elsewhere in this file -- verify
            # the helper's signature before changing either side.
            dump_json(graph_file, g)
        # Stop when every graph was skipped, i.e. no progress is possible.
        # (Original compared with '>', which can never be true since
        # `small` is at most the number of files -- infinite-loop risk.)
        if small >= len(os.listdir(graph_dir)):
            break

    print(f"DONE \nBinding: {num_binding} \t Non-Binding {num_NB}")
Example No. 4
0
def balance_complement_all(interface_dir, complement_dir, output_dir):
    """
    Balance every complement graph against its interface graph.

    For each .json graph in interface_dir with a matching file in
    complement_dir, runs balance_complement and writes the balanced
    complement to output_dir under the same file name.  Interface graphs
    without a complement counterpart are reported and skipped.

    :param interface_dir: directory of interface .json graphs
    :param complement_dir: directory of matching complement graphs
    :param output_dir: directory to write balanced complements to
    """
    for graph_file in os.listdir(interface_dir):
        if '.json' not in graph_file:
            continue

        # read input and compute function
        try:
            interface = load_json(os.path.join(interface_dir, graph_file))
            complement = load_json(os.path.join(complement_dir, graph_file))
            print('Balancing', graph_file, '...')
        except FileNotFoundError:
            print('\nWARNING, complement graph not found for: ', graph_file, '\n\n')
            continue
        balanced_complement = balance_complement(interface, complement)

        # Write output
        dump_json(balanced_complement, os.path.join(output_dir, graph_file))
Example No. 5
0
def connect_all(input_dir, native_dir, output_dir):
    """
    Run connect_components on all graphs in input_dir and write the
    resulting connected graphs to output_dir.

    Each connected component is dumped as '<pbid>_<k>.json' where pbid is
    the first four characters of the source file name (the PDB id) and k
    is the component index.

    :param input_dir: directory of .json graphs to split into components
    :param native_dir: directory of matching native graphs
    :param output_dir: directory to write the connected graphs to
    """
    for graph_file in os.listdir(input_dir):
        if '.json' not in graph_file:
            continue

        # read input and compute function
        g = load_json(os.path.join(input_dir, graph_file))
        g_native = load_json(os.path.join(native_dir, graph_file))
        connected_graphs = connect_components(g, g_native)

        # Write output, one file per connected component
        pbid = graph_file[:4]
        for i, h in enumerate(connected_graphs):
            dump_json(h, os.path.join(output_dir, (pbid + '_' + str(i) + '.json') ))
            print_component_info(h, i)
Example No. 6
0
def connect_and_balance_all(interface_dir, native_dir, complement_dir, output_dir,
                            quiet=False):
    """
    Connect components and balance complements for every interface graph.

    For each .json graph in interface_dir: splits the interface and its
    complement into connected components (guided by the native graph),
    balances each interface component against a complement component
    (cycling through complements when there are fewer of them), and writes
    the pairs to output_dir, with complements in an inner 'complement'
    directory.  Output files are named '<pbid>_<k>.json'.
    """
    # Make a directory inside output for the complements
    try:
        os.mkdir(os.path.join(output_dir, 'complement'))
    except FileExistsError:
        print('complement directory already exists! make sure you are not overwriting')
    comp_dir = os.path.join(output_dir, 'complement')

    for graph_file in os.listdir(interface_dir):
        # Loop control (debug only; kept disabled like the sibling
        # functions -- the previously active version also shared its
        # counter with the enumerate loop below, so the cap was erratic)
        # if i == 30: break
        # i += 1
        if '.json' not in graph_file: continue

        if not quiet: print("Connecting and Balancing graph", graph_file )
        # read interface, complement and native graphs
        g = load_json(os.path.join(interface_dir, graph_file))
        g_native = load_json(os.path.join(native_dir, graph_file))
        g_complement = load_json(os.path.join(complement_dir, graph_file))

        # Connect the components into a set of graphs
        interface_graphs = connect_components(g, g_native)
        complement_graphs = connect_components(g_complement, g_native, trim_dangles=False)
        num_comps = len(complement_graphs)

        # Balance and write output
        pbid = graph_file[:4]
        for comp_idx, h in enumerate(interface_graphs):
            # Wrap around the complement list when there are fewer
            # complement components than interface components.
            balanced_comp = balance_complement(h, complement_graphs[comp_idx % num_comps])
            dump_json(h, os.path.join(output_dir, (pbid + '_' + str(comp_idx) + '.json') ))
            dump_json(balanced_comp, os.path.join(comp_dir,
                                                        (pbid + '_' + str(comp_idx) + '.json') ))
Example No. 7
0
def remove_graphs(graph_dir, interaction):
    """
    Delete every graph file that has no binding nodes for the interaction.

    :param graph_dir: directory of .json graph files (files may be deleted)
    :param interaction: interaction name; a node is "binding" when its
        'binding_<interaction>' attribute is not None
    :return: (non-binding node count over kept graphs, binding node count)
    """
    kept_nodes = 0
    total_binding = 0
    attr = 'binding_' + interaction
    for path in tqdm(listdir_fullpath(graph_dir)):
        graph = load_json(path)
        n_binding = sum(1 for _, data in graph.nodes.data()
                        if data[attr] is not None)
        total_binding += n_binding
        if n_binding == 0:
            # Graph contributes nothing to the positive class: drop the
            # file and leave its nodes out of the kept-node total.
            os.remove(path)
        else:
            kept_nodes += len(graph.nodes)

    return kept_nodes - total_binding, total_binding
Example No. 8
0
def main():
    """Compute per-dataset graph statistics and append them to a CSV file."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'input_dir', help='input_directory containing graphs and complement')
    parser.add_argument('output', help='csv output file')
    args = parser.parse_args()

    # Insertion order matters: it defines the CSV column order.
    stats = {
        'Graphs': len(os.listdir(args.input_dir)),
        'Nodes': 0,
        'Edges': 0,
        'Protein Binding': 0,
        'Small-Mol. Binding': 0,
        'Ion Binding': 0,
    }

    # Accumulate node, edge and per-interaction binding counts.
    binding_columns = (('Protein Binding', 'binding_protein'),
                       ('Small-Mol. Binding', 'binding_small-molecule'),
                       ('Ion Binding', 'binding_ion'))
    for graph_file in tqdm(listdir_fullpath(args.input_dir)):
        g = load_json(graph_file)
        stats['Nodes'] += len(g.nodes)
        stats['Edges'] += len(g.edges)
        for column, attr in binding_columns:
            stats[column] += sum(1 for _, d in g.nodes.data()
                                 if d[attr] is not None)

    stats['Avg Nodes'] = int(stats['Nodes'] / stats['Graphs'])
    stats['Avg Edges'] = int(stats['Edges'] / stats['Graphs'])

    # Only emit the header row when the CSV does not exist yet.
    header = not os.path.exists(args.output)

    name = '_'.join(args.input_dir.split('/')[-2:])

    with open(args.output, 'a') as f:
        writer = csv.writer(f, delimiter=',')
        if header:
            writer.writerow(['Dataset'] + list(stats.keys()))
        writer.writerow([name] + list(stats.values()))
Example No. 9
0
    def __getitem__(self, idx):
        """
        Load the idx-th graph, encode its edge labels and protein-interface
        labels, and return it as a DGL graph.

        :param idx: index into self.all_graphs
        :return: (dgl_graph, ring) where ring is the sorted list of
            (node, data-at-self.level) pairs when self.node_simfunc is set,
            otherwise (dgl_graph, 0)
        :raises ValueError: if self.directed is True but the stored graph
            is undirected
        """
        g_path = os.path.join(self.path, self.all_graphs[idx])
        graph = graph_io.load_json(g_path)

        # We can go from directed to undirected
        if self.directed and not nx.is_directed(graph):
            raise ValueError(
                f"The loader is asked to produce a directed graph from {g_path} that is undirected"
            )
        if not self.directed:
            graph = nx.to_undirected(graph)

        # This is a weird call but necessary for DGL as it only deals
        #   with undirected graphs that have both directed edges
        # The error raised above ensures that we don't have a discrepancy *
        #   between the attribute directed and the graphs :
        #   One should not explicitly ask to make the graphs directed in the learning as it is done by default but when
        #   directed graphs are what we want, we should use the directed annotation rather than the undirected.
        graph = nx.to_directed(graph)
        # Map each edge's label (attribute self.label) through self.edge_map
        # to an integer tensor stored under 'one_hot'.
        one_hot = {
            edge: torch.tensor(self.edge_map[label])
            for edge, label in (
                nx.get_edge_attributes(graph, self.label)).items()
        }
        nx.set_edge_attributes(graph, name='one_hot', values=one_hot)
        # Per-node protein-interface labels, stored under 'interface'.
        interface = get_labels(graph, interaction='protein')
        nx.set_node_attributes(graph, name='interface', values=interface)

        # Careful ! When doing this, the graph nodes get sorted.
        g_dgl = dgl.from_networkx(nx_graph=graph,
                                  edge_attrs=['one_hot'],
                                  node_attrs=['interface'])

        if self.node_simfunc is not None:
            ring = list(sorted(graph.nodes(data=self.level)))
            return g_dgl, ring
        else:
            return g_dgl, 0