Code Example #1
def getRecTFs(rec_tfs_file, costs=False):
    """ Get the receptors and TFs from a file. 
    uniprot_accession_number should be in the first column, with node_type (receptor or tf) in the second.
    """
    if costs is True:
        lines = utils.readColumns(rec_tfs_file, 1, 2, 3, 4)
        receptors = set([
            acc for acc, node_type, cost, zscore in lines
            if node_type == 'receptor'
        ])
        tfs = set([
            acc for acc, node_type, cost, zscore in lines if node_type == 'tf'
        ])
        costs = {acc: float(cost) for acc, node_type, cost, zscore in lines}
        zscores = {
            acc: float(zscore)
            for acc, node_type, cost, zscore in lines
        }
        return receptors, tfs, costs, zscores
    else:
        lines = utils.readColumns(rec_tfs_file, 1, 2)
        receptors = set(
            [acc for acc, node_type in lines if node_type == 'receptor'])
        tfs = set([acc for acc, node_type in lines if node_type == 'tf'])
        return receptors, tfs
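These snippets all lean on a project-level utils module (readColumns, readDict, readItemSet, readItemList) that is not shown on this page. Below is only a hypothetical sketch of what those helpers presumably do, assuming tab-delimited files, 1-indexed column numbers, and '#' comment lines; the real module may differ.

import gzip

def readColumns(file_name, *cols):
    """ Return a list of tuples holding the requested 1-indexed columns of each non-comment line. """
    open_func = gzip.open if file_name.endswith('.gz') else open
    lines = []
    with open_func(file_name, 'rt') as f:
        for line in f:
            if line.startswith('#'):
                continue
            fields = line.rstrip('\n').split('\t')
            if len(fields) < max(cols):
                continue
            lines.append(tuple(fields[c - 1] for c in cols))
    return lines

def readDict(file_name, key_col=1, val_col=2):
    """ Return a dict mapping the key column to the value column. """
    return dict(readColumns(file_name, key_col, val_col))

def readItemSet(file_name, col=1):
    """ Return the set of values in a single column. """
    return set(item for (item,) in readColumns(file_name, col))

def readItemList(file_name, col=1):
    """ Return the values of a single column as a list, in file order. """
    return [item for (item,) in readColumns(file_name, col)]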
Code Example #2
def main(args):
    #global PARENTNODES

    opts, args = parseArgs(args)

    #PARENTNODES = opts.include_parent_nodes

    # Set of edges from another source such as a pathway
    lines = utils.readColumns(opts.edges,1,2)
    prededges = set(lines)

    node_labels = {} 
    if opts.mapping_file is not None:
        node_labels = utils.readDict(opts.mapping_file, 1, 2)

    # get attributes of nodes and edges from the graph_attr file
    graph_attr = {}
    attr_desc = {} 
    if opts.graph_attr:
        graph_attr, attr_desc = readGraphAttr(opts.graph_attr)

    if opts.net is not None:
        # add the edge weight from the network to attr_desc which will be used for the popup
        edge_weights = {(u,v):float(w) for u,v,w in utils.readColumns(opts.net,1,2,3)}
        for e in prededges:
            if e not in attr_desc:
                attr_desc[e] = {}
            attr_desc[e]["edge weight"] = edge_weights[e]

    # set the width of the edges by the network weight
    if opts.net is not None and opts.set_edge_width:
        graph_attr = set_edge_width(prededges, edge_weights, graph_attr, a=1, b=12)

    # TODO build the popups here. That way the popup building logic can be separated from the
    # GSGraph building logic
    popups = {}
    prednodes = set([n for edge in prededges for n in edge])
    for n in prednodes:
        popups[n] = buildNodePopup(n, attr_val=attr_desc)
    for u,v in prededges:
        popups[(u,v)] = buildEdgePopup(u,v, node_labels=node_labels, attr_val=attr_desc)

    # Now post to graphspace!
    G = constructGraph(prededges, node_labels=node_labels, graph_attr=graph_attr, popups=popups)

    # TODO add an option to build the 'graph information' tab legend/info
    # build the 'Graph Information' metadata
    desc = buildGraphDescription(opts.edges, opts.net)
    metadata = {'description':desc,'tags':[], 'title':''}
    if opts.tag:
        metadata['tags'] = opts.tag
    G.set_data(metadata)
    G.set_name(opts.graph_name)

    post_graph_to_graphspace(G, opts.username, opts.password, opts.graph_name, apply_layout=opts.apply_layout, layout_name=opts.layout_name,
                             group=opts.group, make_public=opts.make_public)
Code Example #3
def get_ctd_support(chemical, prednodes, ctd_support_file):
    print("Getting CTD support counts from %s" % (ctd_support_file))
    num_intxs_per_gene = {}
    num_intxs_per_node = {}
    for cas, gene, interaction_action, pubmedids in utils.readColumns(
            ctd_support_file, 3, 4, 10, 11):
        if chemIDtoCAS[chemical] != cas:
            continue
        if 'phosphorylation' in interaction_action:
            if gene not in num_intxs_per_gene:
                num_intxs_per_gene[gene] = 0
            num_intxs_per_gene[gene] += 1

    for n in prednodes:
        gene = uniprot_to_gene[n]
        if gene in num_intxs_per_gene:
            # for now, take the node in the family node with the maximum support
            num_intxs_per_node[n] = num_intxs_per_gene[gene]
    print(
        "\nOf the %d prots with (de)phosphorylation evidence in CTD, %d of the %d net nodes overlap"
        % (len(num_intxs_per_gene), len(num_intxs_per_node), len(prednodes)))

    # # write these counts to a table
    # out_file = "%s-ctd-support.txt" % (opts.node_overlap)
    # print("Writing support counts to %s" % (out_file))
    # with open(out_file, 'w') as out:
    #     out.write("#uniprot\tgene\tmax_num_phospho_intxs\n")
    #     # write the sorted results to a file
    #     out.write('\n'.join(["%s\t%s\t%d" % (N, uniprot_to_gene[N], num_intxs_per_node[N]) for N in sorted(num_intxs_per_node, key=num_intxs_per_node.get, reverse=True)]) + '\n')
    return num_intxs_per_node
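Note that get_ctd_support relies on two lookups built elsewhere in the script: chemIDtoCAS (chemical ID to CAS number) and uniprot_to_gene (UniProt accession to gene name). A hedged sketch of how they would presumably be populated, with hypothetical file paths:

# chemIDtoCAS = utils.readDict("inputs/chemical-map.txt", 1, 2)         # chemical ID -> CAS number
# uniprot_to_gene = utils.readDict("inputs/uniprot-to-gene.txt", 1, 2)  # UniProt accession -> gene name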
Code Example #4
def getPvals(resultsprefix, scope, sig_cutoff_type='FDR'):
    """ Function to retreive the pvalues for each chemical automatically.
        Currently only supports k 200 with FDR and BF pval corrections
    """
    print("Getting p-values for scope '%s'" % (scope))
    # get the significant chemicals
    # TODO add a k option and get the column automatically from the header line
    k = 200
    print("Using k %d" % (k))
    pvals_file = "%s/stats/stat-sig-%s/bfcorr_pval_qval.txt" % (resultsprefix,
                                                                scope)
    with open(pvals_file, 'r') as file_handle:
        # example header line:
        #chemical   k10-BFcorr-pval k10-qval    k25-BFcorr-pval k25-qval    k50-BFcorr-pval k50-qval
        header = file_handle.readline().rstrip().split('\t')
    # TODO add an option to get the uncorrected pvals
    if sig_cutoff_type == 'BF':
        #pval_col = 10
        pval_col = header.index("k%d-BFcorr-pval" % k) + 1
    elif sig_cutoff_type == 'FDR':
        #pval_col = 11
        pval_col = header.index("k%d-qval" % k) + 1
    else:
        # TODO add the non-corrected p-value as an option
        print(
            "please enter a valid value for --sig-cutoff-type. Valid options are: 'BF', 'FDR'"
        )
        sys.exit(1)

    chemical_pvals = {
        chemical: float(k_pval)
        for chemical, k_pval in utils.readColumns(pvals_file, 1, pval_col)
    }
    return chemical_pvals
Code Example #5
File: alg_utils.py    Project: ofaruqi/FastSinkSource
def setup_sparse_network(network_file, node2idx_file=None, forced=False):
    """
    Takes a network file and converts it to a sparse matrix
    """
    sparse_net_file = network_file.replace('.txt', '.npz')
    if node2idx_file is None:
        node2idx_file = sparse_net_file + "-node-ids.txt"
    if forced is False and (os.path.isfile(sparse_net_file)
                            and os.path.isfile(node2idx_file)):
        print("Reading network from %s" % (sparse_net_file))
        W = sparse.load_npz(sparse_net_file)
        print("\t%d nodes and %d edges" % (W.shape[0], len(W.data) / 2))
        print("Reading node names from %s" % (node2idx_file))
        node2idx = {
            n: int(n2)
            for n, n2 in utils.readColumns(node2idx_file, 1, 2)
        }
        idx2node = {n2: n for n, n2 in node2idx.items()}
        prots = [idx2node[n] for n in sorted(idx2node)]
    elif os.path.isfile(network_file):
        print("Reading network from %s" % (network_file))
        u, v, w = [], [], []
        # TODO make sure the network is symmetrical
        with open(network_file, 'r') as f:
            # total is a rough line-count estimate used only for the tqdm progress bar
            for line in tqdm(f, total=120000000):
                line = line.rstrip().split('\t')
                u.append(line[0])
                v.append(line[1])
                w.append(float(line[2]))
        print("\tconverting uniprot ids to node indexes / ids")
        # first convert the uniprot ids to node indexes / ids
        prots = sorted(set(list(u)) | set(list(v)))
        node2idx = {prot: i for i, prot in enumerate(prots)}
        i = [node2idx[n] for n in u]
        j = [node2idx[n] for n in v]
        print("\tcreating sparse matrix")
        #print(i,j,w)
        W = sparse.coo_matrix((w, (i, j)),
                              shape=(len(prots), len(prots))).tocsr()
        # make sure it is symmetric
        if (W.T != W).nnz != 0:
            print("### Matrix not symmetric!")
            W = W + W.T
            print("### Matrix converted to symmetric.")
        #name = os.path.basename(net_file)
        print("\twriting sparse matrix to %s" % (sparse_net_file))
        sparse.save_npz(sparse_net_file, W)
        print("\twriting node2idx labels to %s" % (node2idx_file))
        with open(node2idx_file, 'w') as out:
            out.write(''.join(
                ["%s\t%d\n" % (prot, i) for i, prot in enumerate(prots)]))
    else:
        print("Network %s not found. Quitting" % (network_file))
        sys.exit(1)

    return W, prots
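A minimal usage sketch (the file name is hypothetical; the network file is expected to be tab-delimited with node1, node2 and weight columns):

# the first call parses the edge list and caches the matrix (.npz) and the node-index mapping
W, prots = setup_sparse_network("inputs/2018_01-toxcast/net.txt")
# later calls load the cached files; pass forced=True to rebuild them from the text file
W, prots = setup_sparse_network("inputs/2018_01-toxcast/net.txt", forced=True)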
Code Example #6
def getProteins(paths='',
                ranked_edges='',
                max_k=200,
                max_prots=None,
                ties=False):
    """ get the proteins of a network from the paths or ranked edges file
    The *ties* option  can only be used with the paths file
    The *ties* option will find the path score at the max_k, and then continue until that path score is passed
    *rec_tfs* option will include the receptors and TFs in the paths. Only works for the paths option
    """
    # get the proteins of the top k paths, or up to a certain number of proteins
    proteins = set()
    sources = set()
    targets = set()
    # keep track of the k or number of paths if ties is used
    num_paths = 0
    if paths:
        paths = getPaths(paths, max_k, ties)
        num_paths = len(paths)
        for path in paths:
            path = path.split('|')
            # add the start and end of the path as source/target
            sources.add(path[0])
            targets.add(path[-1])
            # add each of the proteins in the path to the set of proteins
            proteins = proteins.union(set(path))
    else:
        # use the ranked edges file
        for p1, p2, k in utils.readColumns(ranked_edges, 1, 2, 3):
            if int(k) > max_k or (max_prots is not None
                                  and len(proteins) > max_prots):
                break
            proteins.add(p1)
            proteins.add(p2)

    # TODO add an option to also return family nodes


#    # remove the source/target family nodes from the set of proteins
#    for s in sources:
#        if len(s.split(',')) > 1:
#            proteins.remove(s)
#    for t in targets:
#        if len(t.split(',')) > 1:
#            proteins.remove(t)
#
#    # split family nodes into individual nodes
#    split_family_proteins = set()
#    for p in proteins:
#        split_family_proteins.update(set(p.split(',')))
#    proteins = split_family_proteins

    if ties:
        # return the total number of paths from keeping the ties
        return proteins, num_paths
    else:
        return proteins
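A usage sketch with hypothetical file names. The paths file is expected to have three tab-delimited columns (k, path score, and the path as '|'-joined accessions); the ranked-edges file has tail, head and k:

# with ties=True the function also returns the number of paths kept
prots, num_paths = getProteins(paths="outputs/chemX-paths.txt", max_k=200, ties=True)
prots = getProteins(ranked_edges="outputs/chemX-ranked-edges.txt", max_k=200, max_prots=500)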
Code Example #7
def readGraphAttr(graph_attr_file):
    """ 
    Read attributes of nodes and edges from the graph_attr file
    Must have 4 tab-delimited columns. 
    1: Style name
    2: Style value
    3: Nodes/Edges (joined by '|') to apply the style to
    4: This is intended to be either a popup or part of the Graph Description / Legend, but it isn't built yet

    # example node attribute:
    color blue    p1|p2|p3  - 
    # example edge attribute:
    edge_style dotted    p1-p2|p2-p3  - 
    # example compound node. Here p1, p2, and p3 will have the parent attribute set to 'parent1' (i.e. they will belong to the same compound node parent1)
    parent    parent1  p1|p2|p3  - 
    # then to set the attributes of 'parent1', specify it as the node
    color blue    parent1  -
    """
    graph_attr = {}
    # description of a style, style_attr tuple
    # can also contain edge-str: name: value
    # which can be used when building popups
    attr_desc = {}

    # keep the order of the pathways by order of highest posterior probability
    #pathway_colors = collections.OrderedDict()
    print(
        "Adding graph attributes from '%s' (must have 3 tab-delimited columns)"
        % (graph_attr_file))
    # TODO the last column (here always '-') can be given as a description
    #lines = utils.readColumns(graph_attr_file, 1,2,3,4)
    lines = utils.readColumns(graph_attr_file, 1, 2, 3)
    print("\tread %d lines" % (len(lines)))
    # reverse the lines so the pathways at the top of the file will overwrite the pathways at the bottom
    #for style, style_attr, items, desc in lines[::-1]:
    for style, style_attr, items in lines[::-1]:
        for item in items.split('|'):
            # if this is an edge, then split it by the '-'
            if len(item.split('-')) == 2:
                item = tuple(item.split('-'))
            elif len(item.split('-')) > 2:
                print(
                    "Error: '-' found in node name for edge: %s. '-' is used to split an edge."
                    % (item))
                sys.exit(1)

            if item not in graph_attr:
                graph_attr[item] = {}
            graph_attr[item][style] = style_attr
        #attr_desc[(style, style_attr)] = desc
        #graph_attributes[group_number] = {"style": style, "style_attr": style_attr, "prots": prots.split(','), "desc":desc}

    return graph_attr, attr_desc
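For example, a hypothetical 3-column graph_attr file based on the docstring above would be read as follows:

# example-attr.txt (tab-delimited):
#   color        blue      p1|p2|p3
#   edge_style   dotted    p1-p2|p2-p3
#   parent       parent1   p1|p2|p3
# readGraphAttr("example-attr.txt") would then return roughly:
#   graph_attr == {'p1': {'parent': 'parent1', 'color': 'blue'}, 'p2': {...}, 'p3': {...},
#                  ('p1', 'p2'): {'edge_style': 'dotted'}, ('p2', 'p3'): {'edge_style': 'dotted'}}
#   attr_desc == {}   # the description column is not read yet (see the TODO above)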
Code Example #8
def get_sig_chemicals(chemical_pvals, pval_col=5, sig_cutoff=0.05):
    """ *chemical_pvals* is the file output by compute_stat_sig.py
    """
    print(
        "Getting the significant chemicals with a pval cutoff of %s from %s" %
        (str(sig_cutoff), chemical_pvals))
    sig_chemicals = []
    for chemical, xk_pval in utils.readColumns(chemical_pvals, 1, pval_col):
        if float(xk_pval) < sig_cutoff:
            sig_chemicals.append(chemical)
    print("%d chemicals are significant" % (len(sig_chemicals)))
    return sig_chemicals
Code Example #9
def readNetwork(paths=None, ranked_edges=None, k_limit=200, no_k=False):
    """ Read the PathLinker paths or ranked_edges output. 
        Get all of the edges that have a k less than the k_limit.
    """
    prededges = {}
    if no_k is False:
        if paths is not None:
            # Predicted paths from pathlinker
            lines = utils.readColumns(paths, 1, 2, 3)
            prededges = {}
            edges = set()
            for k, path_score, path in lines:
                # get all of the edges in the paths that have a k less than the k_limit
                if int(k) > k_limit:
                    break

                path = path.split('|')

                for i in range(len(path) - 1):
                    edge = (path[i], path[i + 1])
                    if edge not in edges:
                        edges.add(edge)
                        prededges[edge] = int(k)

        if ranked_edges is not None:
            # Predicted edges from pathlinker
            lines = utils.readColumns(ranked_edges, 1, 2, 3)
            # get all of the edges that have a k less than the k_limit
            prededges = {(u, v): int(k)
                         for u, v, k in lines if int(k) <= k_limit}
    else:
        if ranked_edges:
            # Set of edges from another source such as a pathway
            lines = utils.readColumns(ranked_edges, 1, 2)
            # keep the edges in a dictionary to work with the rest of the code
            prededges = {(u, v): None for u, v in lines}

    return prededges
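Typical calls, with hypothetical file names:

prededges = readNetwork(paths="chemX-paths.txt", k_limit=200)                 # {(u, v): k of the first path containing the edge}
prededges = readNetwork(ranked_edges="chemX-ranked-edges.txt", k_limit=200)   # {(u, v): k}
prededges = readNetwork(ranked_edges="pathway-edges.txt", no_k=True)          # {(u, v): None}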
Code Example #10
def get_already_run_terms(alg_runners, **kwargs):
    # for each alg, taxon and go term pair, see which already exist and skip them
    alg_taxon_terms_to_skip = defaultdict(dict)
    for run_obj in alg_runners:
        alg = run_obj.name
        # define the output file path to see if it already exists
        #exp_type="%sloso" % ("all-sp-" if kwargs['keep_ann'] else '')
        exp_type = "loso"
        out_file = "%s/%s%s%s.txt" % (run_obj.out_dir, exp_type,
                                      run_obj.params_str,
                                      kwargs.get("postfix", ""))

        if os.path.isfile(out_file) and kwargs['forcealg']:
            print(
                "Removing %s as results will be appended to it for each taxon"
                % (out_file))
            os.remove(out_file)
            # the ranks file is for sinksource_bounds
            ranks_file = out_file.replace('.txt', '-ranks.txt')
            if '_bounds' in alg and os.path.isfile(ranks_file):
                print("\tAlso removing %s" % (ranks_file))
                os.remove(ranks_file)
            stats_file = out_file.replace('.txt', '-stats.txt')
            if os.path.isfile(stats_file):
                print("\tAlso removing %s" % (stats_file))
                os.remove(stats_file)
        # if the output file already exists, skip the terms that are already there
        # unless --write-prec-rec is specified with a single term.
        # then only the full prec_rec file will be written
        elif kwargs['write_prec_rec'] and len(kwargs['goterm']) == 1:
            pass
        elif os.path.isfile(out_file):
            print("WARNING: %s results file already exists. Appending to it" %
                  (out_file))
            # check which results already exist and append to the rest
            print("Reading results from %s " % (out_file))
            taxon_terms_completed = utils.readColumns(out_file, 1, 2)
            alg_taxon_terms_to_skip[alg] = {
                taxon: set()
                for taxon, term in taxon_terms_completed
            }
            for taxon, term in taxon_terms_completed:
                alg_taxon_terms_to_skip[alg][taxon].add(term)
            print("\t%d taxon - term pairs already finished" %
                  (len(taxon_terms_completed)))
    return alg_taxon_terms_to_skip
Code Example #11
def getPaths(paths_file, max_k=200, ties=False, scores=False):
    paths = []
    if scores:
        paths = {}
    last_score = None
    for k, score, path in utils.readColumns(paths_file, 1, 2, 3):
        # use 6 decimal places
        score = "%0.6f" % (float(score))
        if int(k) > max_k:
            # if this path has the same path score as the previous path, then keep adding its proteins
            if ties and last_score == score:
                pass
            else:
                break
        #path = path.split('|')
        if scores:
            paths[path] = score
        else:
            paths.append(path)
        last_score = score
    return paths
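A small sketch of how the ties option behaves, assuming a paths file whose columns are k, path score, and the '|'-joined path:

# paths.txt:
#   1   0.90    A|B|C
#   2   0.75    A|D|C
#   3   0.75    A|E|C
# getPaths("paths.txt", max_k=2)              -> ['A|B|C', 'A|D|C']
# getPaths("paths.txt", max_k=2, ties=True)   -> ['A|B|C', 'A|D|C', 'A|E|C']   # path 3 ties path 2's score
# getPaths("paths.txt", max_k=2, scores=True) -> {'A|B|C': '0.900000', 'A|D|C': '0.750000'}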
Code Example #12
def getFamilyNodes(version, interactome_file=None):
    """ Get the set of family nodes present in the interactome
    If the family-nodes.txt file already exists, then just read from it
    Otherwise read the interactome_file and write the family nodes to family-nodes.txt
    *returns*: the set of family nodes in the interactome
    """
    family_nodes_file = "inputs/%s/family-nodes.txt" % (version)
    if not os.path.isfile(family_nodes_file):
        print(
            "%s does not exist. Getting the family nodes from the interactome"
            % (family_nodes_file))
        # get the set of family nodes from the interactome
        print("Reading the interactome from %s" % (interactome_file))
        family_nodes = set([
            N for U, V in utils.readColumns(interactome_file, 1, 2)
            for N in (U, V) if len(N.split(',')) > 1
        ])
        print("Writing family nodes to %s" % (family_nodes_file))
        with open(family_nodes_file, 'w') as out:
            out.write('\n'.join(family_nodes))
    else:
        family_nodes = utils.readItemSet(family_nodes_file)

    return family_nodes
Code Example #13
def splitRecTFsFamilyNodes(chemicals, version, interactome_file):
    """
    """
    # leave some nodes as family nodes as that's how they are in the toxcast data
    map_family_to_prot = {
        # FOS,JUN,FOSL1,FOSL2,JUNB,JUND,FOSB: FOS,JUN
        "P01100,P05412,P15407,P15408,P17275,P17535,P53539": ["P01100,P05412"],
        # FOS,JUN,SP1: FOS,JUN
        "P01100,P05412,P08047": ["P01100,P05412"],
        # FOS,JUN: FOS,JUN
        "P01100,P05412": ["P01100,P05412"],
        # TCF7,TCF7L1,TCF7L2,LEF1: TCF7,TCF7L1,TCF7L2,LEF1
        "P36402,Q9HCS4,Q9NQB0,Q9UJU2": ["P36402,Q9HCS4,Q9NQB0,Q9UJU2"],
        # FOXO3,FOXO4,FOXO1: FOXO3,FOXO4,FOXO1
        "O43524,P98177,Q12778": ["O43524,P98177,Q12778"],
    }

    rec_tfs_file = "inputs/%s/rec-tfs/%%s-rec-tfs.txt" % (version)
    interactomes_dir = "inputs/%s" % (version)
    t_utils.checkDir(interactomes_dir)
    new_interactome_file = "%s/%s-interactome.txt" % (interactomes_dir,
                                                      version)
    # get the set of family nodes from the interactome
    print("Reading the interactome from %s" % (interactome_file))
    lines = utils.readColumns(interactome_file, 1, 2, 3)
    family_nodes = set(
        [N for U, V, w in lines for N in (U, V) if len(N.split(',')) > 1])
    print(
        "Splitting the source/target family nodes of all chemicals in the interactome and writing to %s"
        % (new_interactome_file))
    # set of family nodes to split from all chemicals
    family_to_split = {}
    for chemical in tqdm(chemicals):
        rec, tfs = t_utils.getRecTFs(rec_tfs_file % (chemical))
        for N in family_nodes:
            for n in rec.union(tfs):
                # check if this rec/tf is one of the members of the family node
                if n in N.split(','):
                    if N not in family_to_split:
                        family_to_split[N] = set()
                    family_to_split[N].add(n)
    # leave some tfs as family nodes because that's how they're listed in toxcast
    family_to_split.update(map_family_to_prot)

    split_rec = set()
    split_tfs = set()
    new_interactome = []
    all_new_edges = set()
    # it's a bit ad hoc because the weight of the family edge is the max of the individual edges,
    # and now we're setting the edge weight of the split edges to be the max of the individual edges and the family edge
    new_edge_weights = {}
    #new_edge_ev = {}
    # there could be multiple family edges contributing to a single edge
    for U, V, w in lines:
        new_edges = set()
        # split up the rec/tf family nodes
        if U in family_to_split and V in family_to_split:
            split_rec.add(U)
            split_tfs.add(V)
            for u in family_to_split[U]:
                for v in family_to_split[V]:
                    new_edges.add((u, v))
        elif U in family_to_split:
            split_rec.add(U)
            for u in family_to_split[U]:
                new_edges.add((u, V))
        elif V in family_to_split:
            split_tfs.add(V)
            for v in family_to_split[V]:
                new_edges.add((U, v))
        # otherwise leave the edge as it is
        else:
            new_interactome.append((U, V, w))
            continue

        all_new_edges.update(new_edges)
        for (u, v) in new_edges:
            if (u, v) not in new_edge_weights:
                new_edge_weights[(u, v)] = set()
            new_edge_weights[(u, v)].add(float(w))
            # for now, don't write the evidence to each of the new networks to save on space
            # the evidence is present in the original interactome and the evidence file
            #if (u,v) not in new_edge_ev:
            #    new_edge_ev[(u,v)] = set()
            #new_edge_ev[(u,v)].update(set(ev.split('|')))

    for u, v in all_new_edges:
        w = max(new_edge_weights[(u, v)])
        #ev = '|'.join(new_edge_ev[(u,v)])
        new_interactome.append((u, v, "%0.6f" % w))

    # now write the new interactome
    print("Writing the new interactome with rec/tf family nodes split to %s" %
          (new_interactome_file))
    with open(new_interactome_file, 'w') as out:
        out.write('\n'.join(['\t'.join(line)
                             for line in new_interactome]) + '\n')

    # also write the family nodes that were split
    mapping = getUniprotToGeneMapping(version)
    # also write the mapping from the rec/tf family node to the proteins it came from
    out_file = "inputs/%s/family-split-rec-tfs.txt" % (version)
    print(
        "Writing a mapping of the split family rec/tfs and the protein hits they came from to: %s"
        % (out_file))
    with open(out_file, 'w') as out:
        out.write('\n'.join([
            "%s\t%s\t%s\t%s" %
            (N, '|'.join(family_to_split[N]), mapping[N],
             '|'.join([mapping[n] for n in family_to_split[N]]))
            for N in sorted(family_to_split)
        ]) + '\n')

    print("A total of %d family nodes were split" % (len(family_to_split)))

    # add the zscore penalty to the few family nodes in the ToxCast data
    toxcast_family_nodes = [N[0] for N in map_family_to_prot.values()]
    addRecTFsFamilyNodes(chemicals,
                         version,
                         family_nodes=toxcast_family_nodes,
                         costs=True)
Code Example #14
def setup_post_to_graphspace(config_map,
                             selected_goid,
                             alg='fastsinksource',
                             name_postfix='',
                             tags=None,
                             taxon=None,
                             goid_summary_file=None,
                             num_neighbors=1,
                             nodes_to_post=None,
                             **kwargs):

    input_settings, alg_settings, \
            output_settings, out_pref, kwargs = \
            plot_utils.setup_variables(
                    config_map, **kwargs)

    input_dir = input_settings['input_dir']
    dataset = input_settings['datasets'][0]
    for arg in [
            'ssn_target_only', 'ssn_target_ann_only', 'ssn_only',
            'string_target_only', 'string_nontarget_only',
            'limit_to_taxons_file', 'add_target_taxon', 'oracle_weights',
            'rem_neg_neighbors', 'youngs_neg', 'sp_leaf_terms_only'
    ]:
        kwargs[arg] = dataset.get(arg)
    uniprot_taxon_file = "%s/%s" % (input_dir, dataset['taxon_file'])

    # don't need it since we are re-running the alg anyway
    #    # predictions file:
    #    results_dir = "%s/%s/%s" % (
    #        output_settings['output_dir'], dataset['net_version'], dataset['exp_name'])
    #    alg_params = alg_settings[alg]
    #    combos = [dict(zip(alg_params.keys(), val))
    #        for val in itertools.product(
    #            *(alg_params[param] for param in alg_params))]
    #    # TODO allow for multiple
    #    if len(combos) > 1:
    #        print("%d combinations for %s. Using the first one" % (len(combos), alg))
    #    param_combo = combos[0]
    #    # first get the parameter string for this runner
    #    params_str = runner.get_runner_params_str(alg, dataset, param_combo)
    #    prec_rec_str = "prec-rec%s-%s" % (taxon, selected_goid)
    #    exp_type = 'loso'
    #    pred_file = "%s/%s/%s%s%s%s.txt" % (results_dir, alg, exp_type, params_str, kwargs.get('postfix',''), prec_rec_str)
    #    if not os.path.isfile(pred_file):
    #        print("\tPredictions file not found: %s. Quitting" % (pred_file))
    #        sys.exit(1)
    #    print("\treading %s" % (pred_file))
    #    df = pd.read_csv(pred_file, sep='\t')
    #    print(df.head())

    out_dir = "outputs/viz/graphspace/%s-%s/" % (dataset['net_version'].split(
        '/')[-1], dataset['exp_name'].split('/')[-1])
    os.makedirs(out_dir, exist_ok=True)
    print("storing net and ann files to %s" % (out_dir))

    # TODO allow posting without STRING
    net_obj, new_net_obj, ann_obj, eval_ann_obj, species_to_uniprot_idx = \
            load_net_ann_datasets(
        out_dir, taxon,
        dataset, input_settings, alg_settings,
        uniprot_taxon_file, **kwargs)
    W = new_net_obj.W
    prots = ann_obj.prots

    # also run the alg to get the full prediction scores
    # TODO get them from a file?
    alg_settings = {alg: alg_settings[alg]}
    alg_settings[alg]['should_run'] = [True]
    kwargs['verbose'] = True
    alg_runners = run_eval_algs.setup_runners(alg_settings, new_net_obj,
                                              ann_obj,
                                              output_settings['output_dir'],
                                              **kwargs)
    run_obj = alg_runners[0]
    run_obj.goids_to_run = [selected_goid]

    train_ann_mat, test_ann_mat, sp_goterms = eval_loso.leave_out_taxon(
        taxon,
        ann_obj,
        species_to_uniprot_idx,
        eval_ann_obj=eval_ann_obj,
        **kwargs)
    # now run the loso evaluation for this term, and get the scores back
    eval_loso.run_and_eval_algs(run_obj,
                                ann_obj,
                                train_ann_mat,
                                test_ann_mat,
                                taxon=taxon,
                                **kwargs)
    term_scores = np.ravel(
        run_obj.goid_scores[ann_obj.goid2idx[selected_goid]].toarray())
    print("top 10 scores for %s, %s:" % (taxon, selected_goid))
    taxon_prots_idx = list(species_to_uniprot_idx[taxon])
    taxon_prots = [prots[i] for i in taxon_prots_idx]
    taxon_term_scores = term_scores[taxon_prots_idx]
    print('\n'.join(["%s\t%0.4e" % (
        ann_obj.prots[taxon_prots_idx[i]], taxon_term_scores[i]) \
            for i in np.argsort(taxon_term_scores)[::-1][:10]]))

    pos_neg_file = "%s/%s" % (input_dir, dataset['pos_neg_file'])
    #selected_goid = "15643"  # toxic substance binding
    #selected_goid = "9405"  # pathogenesis
    #selected_goid = "98754"  # detoxification
    selected_goname = None
    # build a dictionary of the evidencecode for each prot
    uniprot_to_evidencecode = defaultdict(set)
    annotated_prots = set()
    neg_prots = set()
    if goid_summary_file is None:
        goid_summary_file = pos_neg_file.replace("bp-", '').replace("mf-", '')
        if '-list' in pos_neg_file:
            goid_summary_file = goid_summary_file.replace(
                "-list", "-summary-stats")
        elif '.gz' in pos_neg_file:
            goid_summary_file = goid_summary_file.replace(
                ".tsv.gz", "-summary-stats.tsv")
        else:
            goid_summary_file = goid_summary_file.replace(
                ".tsv", "-summary-stats.tsv")
    df_summary = pd.read_csv(goid_summary_file, sep='\t')
    goid_names = dict(zip(df_summary['GO term'], df_summary['GO term name']))
    #goid_num_anno = dict(zip(df_summary['GO term'], df_summary['# positive examples']))
    print("GO name: %s" % (goid_names[selected_goid]))
    selected_goname = goid_names[selected_goid].replace(' ', '-')[0:20]
    # load the GAIN propagation to get the evidence code
    ev_codes_file = dataset.get('ev_codes_file')
    if ev_codes_file is not None:
        for orf, goid, goname, hierarchy, evidencecode, annotation_type in utils.readColumns(
                ev_codes_file, 1, 2, 3, 4, 5, 6):
            if selected_goid[:3] == "GO:":
                goid = "GO:" + "0" * (7 - len(goid)) + goid
            if goid != selected_goid:
                continue
            selected_goname = goname.replace(' ', '-')[0:20]
            if annotation_type != '1':
                continue

            uniprot_to_evidencecode[orf].add(evidencecode)
    # limit it to the current taxon
    if taxon is not None:
        print("Getting species of each prot from %s" % (uniprot_taxon_file))
        #print("Limiting the prots to those for taxon %s (%s)" % (taxon, selected_species[taxon]))
        print("Limiting the prots to those for taxon %s" % (taxon))
        # for each of the 19 species, leave out their annotations
        # and see how well we can retrieve them
        uniprot_to_species = utils.readDict(uniprot_taxon_file, 1, 2)
        if taxon not in species_to_uniprot_idx:
            print("Error: taxon ID '%d' not found" % (taxon))
            sys.exit()
        # also limit the proteins to those in the network
        print("\t%d prots for taxon %s." % (len(taxon_prots_idx), taxon))
        goid_idx = ann_obj.goid2idx[selected_goid]
        pos, neg = alg_utils.get_goid_pos_neg(train_ann_mat, goid_idx)
        non_taxon_annotated_prots = set([prots[i] for i in pos])
        non_taxon_neg_prots = set([prots[i] for i in neg])
        print("\t%d non-taxon pos, %d non-taxon neg" %
              (len(non_taxon_annotated_prots), len(non_taxon_neg_prots)))
        pos, neg = alg_utils.get_goid_pos_neg(test_ann_mat, goid_idx)
        annotated_prots = set([prots[i] for i in pos])
        neg_prots = set([prots[i] for i in neg])
        print("\t%d taxon pos, %d taxon neg" %
              (len(annotated_prots), len(neg_prots)))

    print("\t%d annotated prots for %s (%s)" %
          (len(annotated_prots), selected_goname, selected_goid))

    #conf_cutoff = 0.2
    conf_cutoff = -1
    predicted_prots = set()
    ranks = {}
    scores = {}
    first_zero_rank = None
    for i, idx in enumerate(np.argsort(taxon_term_scores)[::-1]):
        rank = i + 1
        prot = prots[taxon_prots_idx[idx]]
        predicted_prots.add(prot)
        score = taxon_term_scores[idx]
        scores[prot] = score
        if taxon is not None:
            ranks[prot] = rank
            if score == 0 and first_zero_rank is None:
                first_zero_rank = rank
        else:
            ranks[prot] = rank

            # move the score between 0 and 1 if it's genemania (normally between -1 and 1)
            # as the score is used to set the opacity
            # TODO fix genemania
            #if alg == "genemania":
            #    pred_cut_conf[gene] = local_conf
            #    local_conf = ((float(local_conf) - -1) / float(1--1)) * (1-0) + 0
            #pred_local_conf[gene] = local_conf

    print("\t%d prots with a score" % (len(taxon_term_scores)))
    print("Rank of first zero score: %d" % (first_zero_rank))
    print("Ranks of left-out positives:")
    for gene in sorted(annotated_prots, key=ranks.get):
        print("%s\t%d" % (gene, ranks[gene]))
    print("Including top 30 ranked-proteins of left-out species")
    top_30 = sorted(set(taxon_prots) & set(ranks.keys()), key=ranks.get)[:30]
    if ev_codes_file is not None:
        print("Evidence codes of top 30:")
        for i, gene in enumerate(top_30):
            if gene in uniprot_to_evidencecode:
                print("%s\t%s\t%s" % (i, gene, uniprot_to_evidencecode[gene]))
    top_30 = set(top_30)

    if taxon is not None:
        print(
            "Getting the induced subgraph of the neighbors of the %d annotated nodes"
            % (len(annotated_prots)))
        prededges = set()
        if nodes_to_post is not None:
            print("Getting neighbors of %s" % (', '.join(nodes_to_post)))
            nodes_to_add_neighbors = set(nodes_to_post)
        else:
            nodes_to_add_neighbors = annotated_prots.copy() | top_30
        node2idx = ann_obj.node2idx
        for i in range(num_neighbors):
            #print("Adding neighbors %d" % (i+1))
            curr_nodes_to_add_neighbors = nodes_to_add_neighbors.copy()
            nodes_to_add_neighbors = set()
            print("adding %sneighbors of %d nodes" %
                  ("positive ", len(curr_nodes_to_add_neighbors)))
            for u in curr_nodes_to_add_neighbors:
                #neighbors = set(nx.all_neighbors(G, u))
                neighbors = set(
                    [prots[v] for v in get_mat_neighbors(W, node2idx[u])])
                if nodes_to_post is None:
                    # UPDATE 2018-10: try adding just the positive neighbors of the node
                    # TODO make this a command-line option
                    neighbors = neighbors & (non_taxon_annotated_prots
                                             | annotated_prots | top_30)
                #if len(neighbors) > 15 and nodes_to_post is None:
                #    print("\tskipping adding neighbors of %s. len(neighbors): %d" % (u, len(neighbors)))
                #    continue
                nodes_to_add_neighbors.update(neighbors)
                prededges.update(set([(u, v) for v in neighbors]))
    else:
        print(
            "Getting the induced subgraph of the %d annotated and %d predicted proteins"
            % (len(annotated_prots), len(predicted_prots)))
        print("not yet implemented. quitting")
        sys.exit()
    #    prededges = set(G.subgraph(annotated_prots.union(predicted_prots)).edges())
    prededges = set([tuple(sorted((u, v))) for u, v in prededges])
    # TODO I should also show the disconnected nodes
    prednodes = set([n for edge in prededges for n in edge])

    print("\t%d nodes, %d edges" % (len(prednodes), len(prededges)))
    if len(prededges) > 1000 or len(prednodes) > 500:
        print("\nToo many nodes/edges. Not posting to GraphSpace. Quitting")
        sys.exit()

    #graph_attr_file = ""
    #graph_attr, attr_desc = readGraphAttr()
    # add the edge weight from the network to attr_desc which will be used for the popup
    # set the edges as the neighbors of the annotated genes
    #prededges = set()
    # get the induced subgraph of the annotated nodes and predicted nodes
    #for n in func_prots:
    #    if not G.has_node(n):
    #        continue
    #    for neighbor in G.neighbors(n):
    #        prededges.add((n, neighbor))

    graph_attr = {n: {} for n in prednodes}
    attr_desc = {n: {} for n in prednodes}

    print("Reading gene names and species for each protein from %s" %
          (uniprot_taxon_file))
    #prot_species = utils.readDict(uniprot_taxon_file, 1, 2)
    uniprot_to_gene = utils.readDict(uniprot_taxon_file, 1, 4)
    # there can be multiple gene names. Just show the first one for now
    uniprot_to_gene = {
        n: gene.split(' ')[0]
        for n, gene in uniprot_to_gene.items()
    }
    node_labels = {}

    print("building graphspace object")
    # get the abbreviation of the species names
    species_names, net_taxons = eval_loso.get_selected_species(
        species_to_uniprot_idx, kwargs['limit_to_taxons_file'])
    sp_abbrv = {
        t: ''.join(subs[0] for subs in sp_name.split(' ')[:2])
        for t, sp_name in species_names.items()
    }
    # for each node, add the prediction values
    for n in tqdm(prednodes):
        # set the name of the node to be the gene name and add the k to the label
        gene_name = uniprot_to_gene.get(n, n)
        curr_taxon = uniprot_to_species[n]
        species_short_name = sp_abbrv[curr_taxon]
        # add the species to the front of the gene name
        label = "%s-%s" % (species_short_name, gene_name)
        uniprot_to_gene[n] = label
        #node_labels[n] = "%s\n%d" % (label, min(ranks[n], 43)) if n in annotated_prots else label
        node_labels[n] = "%s\n%d" % (
            label, ranks[n] if ranks[n] < first_zero_rank else
            first_zero_rank) if n in taxon_prots else label

        # maybe put the labels below the nodes?
        # helps with visualizing the background opacity
        graph_attr[n]['text-valign'] = 'bottom'
        # add the strain name to the popup
        attr_desc[n]['Strain'] = species_names[curr_taxon]
        if n in predicted_prots:
            # don't need to normalize because the confidence values are already between 0 and 1
            if taxon and (n in non_taxon_annotated_prots
                          or n in non_taxon_neg_prots):
                pass
            else:
                # UPDATE: use the node rank instead of the node score
                #graph_attr[n]['background-opacity'] = pred_local_conf[n]
                if n not in ranks:
                    graph_attr[n]['background-opacity'] = scores[n]
                else:
                    #graph_attr[n]['background-opacity'] = scores[n]
                    graph_attr[n]['background-opacity'] = max([
                        0.9 - (ranks[n] / float(first_zero_rank)),
                        float(scores[n])
                    ])
                    attr_desc[n]["%s rank" % (alg_names[alg])] = ranks[n]
            attr_desc[n]["%s prediction score" %
                         (alg_names[alg])] = "%0.4f" % (scores[n])
        #elif n in annotated_prots or (taxon and (n in non_taxon_annotated_prots or n in non_taxon_neg_prots)) \
        #     or n in neg_prots:
        #if n in pred_local_conf:
        #    graph_attr[n]['background-opacity'] = pred_local_conf[n]
        #    attr_desc[n]["Local prediction confidence"] = pred_local_conf[n]
        # also add the annotation to the popup
        if n in uniprot_to_evidencecode:
            codes = uniprot_to_evidencecode[n]
            # TODO add bullet points to the list
            #attr_desc[n]["Evidence code"] = ''.join(["%s (%s)\n" % (c, evidence_code_name[c]) for c in codes])
            # order it by exp, comp, then elec
            evidence_codes = ''.join([
                "<li>%s (%s)</li>" % (c, evidence_code_name[c]) for c in codes
                if evidence_code_type[c] == 'experimental'
            ])
            evidence_codes += ''.join([
                "<li>%s (%s)</li>" % (c, evidence_code_name[c]) for c in codes
                if evidence_code_type[c] == 'computational'
            ])
            evidence_codes += ''.join([
                "<li>%s (%s)</li>" % (c, evidence_code_name[c]) for c in codes
                if evidence_code_type[c] == 'electronic'
            ])
            attr_desc[n]["Evidence code"] = "<ul>%s</ul>" % (evidence_codes)

    # set the width of the edges by the network weight
    edge_weights = defaultdict(float)
    for u, v in tqdm(prededges):
        e = (u, v)
        if e not in attr_desc:
            attr_desc[e] = {}
        if e not in graph_attr:
            graph_attr[e] = {}
        #attr_desc[e]["edge weight"] = G.adj[u][v]]['weight']
        if net_obj.multi_net:
            #attr_desc[e]["Final edge weight"] = "%0.1f" % (W[node2idx[u]][:,node2idx[v]].A.flatten()[0])
            edge_type_weights = []
            # add the weights for the individual string networks
            for i in range(len(net_obj.net_names)):
                net_name = net_obj.net_names[i]
                net_name = "SSN (E-value <= 0.1)" if 'eval-e0_1' in net_name else net_name
                net = net_obj.sparse_networks[i]
                w = net[node2idx[u]][:, node2idx[v]].A.flatten()[0]

                if w != 0:
                    #attr_desc[e][net_name] = "%0.1f" % (w)
                    edge_type_weights.append("<li>%s: %0.1f</li>" %
                                             (net_name, w))
                    edge_weights[e] += w * net_obj.swsn_weights[i]
            attr_desc[e]["Edge weights by type"] = "<ul>%s</ul>" % (''.join(
                sorted(edge_type_weights)))
        else:
            attr_desc[e]["Edge weight"] = "%0.1f" % (
                W[node2idx[u]][:, node2idx[v]].A.flatten()[0])
        # make the edges somewhat opaque for a better visual style
        graph_attr[e]['opacity'] = 0.7

    # set the width of the edges by the network weight
    #edge_weights = {(u,v): float(W[node2idx[u]][:,node2idx[v]].A.flatten()[0]) for u,v in prededges}
    for e, w in edge_weights.items():
        attr_desc[e]["Final edge weight"] = "%0.1f" % (w)
    # TODO set the min and max as parameters or something
    #max_weight = 180
    if net_obj.multi_net:
        max_weight = net_obj.swsn_weights[0] * 180
        print(max_weight)
    else:
        max_weight = 180
    for e in edge_weights:
        if edge_weights[e] > max_weight:
            edge_weights[e] = max_weight
    graph_attr = gs.set_edge_width(prededges,
                                   edge_weights,
                                   graph_attr,
                                   a=1,
                                   b=12,
                                   min_weight=1,
                                   max_weight=max_weight)

    H = nx.Graph()
    H.add_edges_from(prededges)

    # see which DB the edge came from to set the edge color
    print("Getting the edge type from networks")
    if net_obj.multi_net:
        print("\tFrom both STRING and SEQ_SIM")
        seq_sim_edges = set()
        for u, v in prededges:
            # get the SSN weight of this edge. Should be the first network
            net = net_obj.sparse_networks[0]
            w = net[node2idx[u]][:, node2idx[v]].A.flatten()[0]
            if w != 0:
                # these are all undirected, so just store the sorted version
                u, v = tuple(sorted((u, v)))
                # give these the default color
                graph_attr[(u, v)]['color'] = edge_type_color['default']
                seq_sim_edges.add((u, v))

#        string_edges = set()
#        temp_version = '2017_10-string'
#        net = f_settings.NETWORK_template % (temp_version, temp_version)
#        for u,v in utils.readColumns(net, 1, 2):
#            #if (u,v) not in prededges:
#            if not H.has_edge(u,v):
#                continue
#            # give these the default color
#            u,v = tuple(sorted((u,v)))
#            graph_attr[(u,v)]['color'] = edge_type_color['string']
#            string_edges.add((u,v))
        string_edges = prededges.difference(seq_sim_edges)
        print("\t%d edges from seq-sim, %d edges from STRING" %
              (len(seq_sim_edges), len(string_edges)))
        # set the color to STRING if it didn't come from sequence similarity
        for e in string_edges:
            #if 'color' not in graph_attr[e]:
            graph_attr[e]['color'] = edge_type_color['string']

    #elif 'STRING' in f_settings.NETWORK_VERSION_INPUTS[version]:
    #    for e in graph_attr:
    #        graph_attr[e]['color'] = edge_type_color['string']
    else:
        for e in graph_attr:
            graph_attr[e]['color'] = edge_type_color['default']

    # apply the evidence code style to each protein
    for n in prednodes:
        if n in annotated_prots:
            graph_attr[n]['color'] = node_type_color['annotation']
        elif taxon and n in non_taxon_annotated_prots:
            graph_attr[n]['color'] = node_type_color['non-taxon-annotation']
        elif taxon and n in non_taxon_neg_prots:
            graph_attr[n]['color'] = node_type_color[
                'non-taxon-neg-annotation']
        elif n in neg_prots:
            graph_attr[n]['color'] = node_type_color['neg-annotation']
        elif n in predicted_prots:
            graph_attr[n]['color'] = node_type_color['prediction']
        if n in uniprot_to_evidencecode:
            curr_style = ""
            for evidencecode in uniprot_to_evidencecode[n]:
                curr_type = evidence_code_type[evidencecode]
                if curr_type == "experimental":
                    curr_style = annotation_type_styles[curr_type]
                    break
                elif curr_style == "computational":
                    continue
                else:
                    curr_style = annotation_type_styles[curr_type]
            graph_attr[n].update(curr_style)
        # temporary fix to get the non-target positive examples
        if n in non_taxon_annotated_prots:
            graph_attr[n].update(annotation_type_styles['experimental'])

    # TODO build the popups here. That way the popup building logic can be separated from the
    # GSGraph building logic
    popups = {}
    prednodes = set([n for edge in prededges for n in edge])
    for n in prednodes:
        popups[n] = gs.buildNodePopup(n, attr_val=attr_desc)
    for u, v in prededges:
        popups[(u, v)] = gs.buildEdgePopup(u,
                                           v,
                                           node_labels=uniprot_to_gene,
                                           attr_val=attr_desc)

    # Now post to graphspace!
    print("Building GraphSpace graph")
    G = gs.constructGraph(prededges,
                          node_labels=node_labels,
                          graph_attr=graph_attr,
                          popups=popups)

    # TODO add an option to build the 'graph information' tab legend/info
    # build the 'Graph Information' metadata
    #desc = gs.buildGraphDescription(opts.edges, opts.net)
    desc = ''
    metadata = {'description': desc, 'tags': [], 'title': ''}
    if tags is not None:
        metadata['tags'] = tags
    G.set_data(metadata)
    if 'graph_exp_name' in dataset:
        graph_exp_name = dataset['graph_exp_name']
    else:
        graph_exp_name = "%s-%s" % (dataset['exp_name'].split('/')[-1],
                                    dataset['net_version'].split('/')[-1])
    graph_name = "%s-%s-%s-%s%s" % (selected_goname, selected_goid, alg,
                                    graph_exp_name, name_postfix)
    G.set_name(graph_name)

    # rather than call it from here and repeat all the options, return G, and then call this after
    #post_graph_to_graphspace(G, opts.username, opts.password, opts.graph_name, apply_layout=opts.apply_layout, layout_name=opts.layout_name,
    #                         group=opts.group, make_public=opts.make_public)
    return G, graph_name
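setup_post_to_graphspace above also calls a get_mat_neighbors helper that is not shown on this page. A hypothetical sketch, assuming W is a scipy.sparse CSR matrix indexed by the same node indexes as node2idx; the project's own version may differ:

from scipy import sparse

def get_mat_neighbors(W, idx):
    """ Return the column indexes of the non-zero entries in row idx of a CSR matrix,
    i.e. the neighbor indexes of the node with index idx. """
    return W.indices[W.indptr[idx]:W.indptr[idx + 1]]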
Code Example #15
def get_summary_stats(version="2018_01-toxcast-d2d-p1_5-u1_25",
                      summary_file="network_summaries.csv",
                      scope="permute-dir-undir",
                      forced=False):
    """ Function to aggregate summary statistics for every network
    returns a dataframe containing the counted metrics for each chemical
    """
    TOXCAST_DATA = t_utils.loadToxcastData(t_settings.INTERACTOMES[version])
    #inputs_dir = "inputs/%s/" % (version)
    t_settings.set_version(version)
    inputs_dir = t_settings.INPUTSPREFIX
    outputs_dir = "outputs/%s/weighted" % (version)
    chemicals = utils.readItemList("%s/chemicals.txt" % (inputs_dir), 1)
    #hits_template = "%s/hit-prots/%%s-hit-prots.txt" % (inputs_dir)
    #nonhits_template = "%s/hit-prots/%%s-nonhit-prots.txt" % (inputs_dir)
    #rec_tfs_template = "%s/rec-tfs/%%s-rec-tfs.txt" % (inputs_dir)
    chem_rec, chem_tfs = TOXCAST_DATA.chemical_rec, TOXCAST_DATA.chemical_tfs
    chem_prot_hit_vals = TOXCAST_DATA.chemical_protein_hit
    paths_dir = "%s/edgelinker" % (outputs_dir)
    paths_template = "%s/%%s-paths.txt" % (paths_dir)

    out_dir = "%s/stats/summary" % outputs_dir
    t_utils.checkDir(out_dir)
    summary_file = "%s/%s" % (out_dir, summary_file)
    if os.path.isfile(summary_file) and not forced:
        print(
            "Reading network summary stats from '%s'. Set forced to True to overwrite it."
            % (summary_file))
        df = pd.read_csv(summary_file, index_col=0)
    else:
        print("Reading in the stats from the response networks in", paths_dir)
        chemical_names, chemical_name_to_id = t_utils.getChemicalNameMaps()
        chemical_names = {
            chemical: chemical_names[chemical]
            for chemical in chemicals
        }
        chemical_prots = {}
        chemical_num_paths = {}
        chemical_num_edges = {}
        chemical_avg_path_lengths = {}
        chemical_rec = {}
        chemical_tfs = {}
        chemical_net_rec = {}
        chemical_net_tfs = {}
        chemical_hits = {}
        chemical_nonhits = {}
        chemical_net_hits = {}
        chemical_net_nonhits = {}
        chemical_inter_hits = {}
        chemical_inter_nonhits = {}
        chemical_inter_net_hits = {}
        chemical_inter_net_nonhits = {}
        # also get the q-value for each chemical
        chemical_pvals = {}
        pvals_file = "%s/stats/stat-sig-%s/gpd-pval.txt" % (outputs_dir, scope)
        if os.path.isfile(pvals_file):
            with open(pvals_file, 'r') as file_handle:
                header = file_handle.readline().rstrip().split('\t')
            pval_col = header.index("200") + 1
            chemical_pvals = {
                chem: pval
                for chem, pval in utils.readColumns(pvals_file, 1, pval_col)
            }
        chemical_qvals = {}
        qvals_file = "%s/stats/stat-sig-%s/bfcorr_pval_qval.txt" % (
            outputs_dir, scope)
        if os.path.isfile(qvals_file):
            chemical_qvals = t_utils.getPvals(outputs_dir,
                                              scope,
                                              sig_cutoff_type="FDR")
        for chemical in tqdm(chemicals):
            #prots, paths = getProteins(paths=paths_template % chemical, max_k=200, ties=True)
            paths = t_utils.getPaths(paths_template % chemical,
                                     max_k=200,
                                     ties=True)
            prots = set()
            num_paths = len(paths)
            edges = set()
            path_lengths = []
            for path in paths:
                path = path.split('|')
                # path length is the number of edges in a path
                path_lengths.append(len(path) - 1)
                prots = prots.union(set(path))
                for i in range(len(path) - 1):
                    edges.add((path[i], path[i + 1]))

            chemical_prots[chemical] = len(prots)
            chemical_num_paths[chemical] = len(paths)
            chemical_avg_path_lengths[chemical] = np.mean(path_lengths)
            chemical_num_edges[chemical] = len(edges)
            #rec, tfs = t_utils.getRecTFs(rec_tfs_template % chemical)
            rec, tfs = chem_rec[chemical], chem_tfs[chemical]
            chemical_rec[chemical] = len(rec)
            chemical_tfs[chemical] = len(tfs)
            chemical_net_rec[chemical] = len(prots.intersection(rec))
            chemical_net_tfs[chemical] = len(prots.intersection(tfs))
            # read the hits and nonhits for each chemical to calculate how many of them are in the network
            #hits = utils.readItemSet(hits_template % chemical, 1)
            #nonhits = utils.readItemSet(nonhits_template % chemical, 1)
            hits = set([p for p, hit_val in chem_prot_hit_vals[chemical].items() \
                    if hit_val == 1])
            nonhits = set([p for p, hit_val in chem_prot_hit_vals[chemical].items() \
                    if hit_val == 0])
            chemical_hits[chemical] = len(hits)
            chemical_nonhits[chemical] = len(nonhits)
            chemical_net_hits[chemical] = len(hits.intersection(prots))
            chemical_net_nonhits[chemical] = len(nonhits.intersection(prots))
            # subtract the rec and tfs to get just the intermediate hits and nonhits
            chemical_inter_hits[chemical] = len(hits.difference(
                rec.union(tfs)))
            chemical_inter_nonhits[chemical] = len(
                nonhits.difference(rec.union(tfs)))
            chemical_inter_net_hits[chemical] = len(
                hits.intersection(prots).difference(rec.union(tfs)))
            chemical_inter_net_nonhits[chemical] = len(
                nonhits.intersection(prots).difference(rec.union(tfs)))

        # write these metrics to a file
        df = pd.DataFrame({
            "name": chemical_names,
            "prots": chemical_prots,
            "num_paths": chemical_num_paths,
            "pvals": chemical_pvals,
            "qvals": chemical_qvals,
            "num_edges": chemical_num_edges,
            "avg_path_lengths": chemical_avg_path_lengths,
            "net_rec": chemical_net_rec,
            "net_tfs": chemical_net_tfs,
            "hit_rec": chemical_rec,
            "hit_tfs": chemical_tfs,
            "net_hits": chemical_net_hits,
            "net_nonhits": chemical_net_nonhits,
            'hits': chemical_hits,
            'nonhits': chemical_nonhits,
            "inter_net_hits": chemical_inter_net_hits,
            "inter_net_nonhits": chemical_inter_net_nonhits,
            "inter_hits": chemical_inter_hits,
            "inter_nonhits": chemical_inter_nonhits,
        })
        print("Writing: ", summary_file)
        df.to_csv(summary_file,
                  header=True,
                  columns=[
                      'name', 'prots', 'num_paths', 'num_edges',
                      'avg_path_lengths', 'hits', 'nonhits', 'net_hits',
                      'net_nonhits', 'hit_rec', 'hit_tfs', 'net_rec',
                      'net_tfs', 'inter_net_hits', 'inter_net_nonhits',
                      'inter_hits', 'inter_nonhits', 'pvals', 'qvals'
                  ])

    # change the index or chemical id to unicode (string)
    #df.index = df.index.map(unicode)

    return df
Code Example #16
def build_graph_and_post(
        version,
        interactome,
        rec_tfs_file,
        RESULTSPREFIX,
        paths_file,
        chemical,
        max_k=200,
        graph_name="test",
        #postfix='-'+version, tag=version, chemical_color_file=)
        graph_attr_file=None,
        ev_file=None,
        datadir="/home/jeffl/svnrepo/data",
        **kwargs):
    # get the "evidence version" which is used to get the CSBDB related files
    #ev_version = t_utils.get_ev_version(version)

    PPI = interactome
    lines = utils.readColumns(PPI, 1, 2, 3)
    global PPIEDGES, PPIWEIGHTS
    PPIEDGES = [(u, v) for u, v, w in lines]
    PPIWEIGHTS = {(u, v): float(w) for u, v, w in lines}

    prededges = readNetwork(paths=paths_file, k_limit=max_k)

    sources = set()
    targets = set()
    lines = utils.readColumns(rec_tfs_file, 1, 2)
    sources = set([
        acc for acc, node_type in lines
        if node_type.lower() in ['source', 'receptor']
    ])
    targets = set([
        acc for acc, node_type in lines
        if node_type.lower() in ['target', 'tf']
    ])
    # human_rec = set()
    # human_tfs = set()
    # if opts.human_rec_tfs is not None:
    #     # also get the human rec and tfs
    #     lines = utils.readColumns(opts.human_rec_tfs,1,2)
    #     human_rec = set([acc for acc, node_type in lines if node_type.lower() in ['source', 'receptor']])
    #     human_tfs = set([acc for acc, node_type in lines if node_type.lower() in ['target', 'tf']])

    # TODO temporary fix for family nodes
    prednodes = set([t for t, h in prededges]).union(
        set([h for t, h in prededges]))

    global uniprot_to_gene
    #uniprot_to_gene = utils.readDict(getev.getMappingFile(ev_version, datadir), 1, 2)
    uniprot_to_gene = utils.readDict(kwargs['mapping_file'], 1, 2)

    # get attributes of nodes and edges from the graph_attr file
    graph_attr = {}
    # description of a style, style_attr tuple
    attr_desc = {}
    if graph_attr_file is not None:
        graph_attr, attr_desc = gs_base.readGraphAttr(graph_attr_file)

    # if opts.chemicalID:
    #     global CHEMICALS
    #     CHEMICALS = t_utils.loadChemicalMap(PPI)

    if kwargs.get('ctd_support_file'):
        num_intxs_per_node = get_ctd_support(chemical, prednodes,
                                             kwargs['ctd_support_file'])
        # give nodes with any CTD support a double maroon border
        for n in num_intxs_per_node:
            # make sure the node has an entry in graph_attr before styling it
            graph_attr.setdefault(n, {})
            graph_attr[n]['style'] = 'double'
            graph_attr[n]['border_color'] = 'maroon'
            graph_attr[n]['border_width'] = 10

    # set the case study colors - all nodes are gray by default
    if kwargs.get('case_study') is True:
        if graph_attr_file is not None:
            # if there are other colors present, then make the sources and targets gray by default because they could have other colors
            NODE_COLORS.update(CASESTUDY_NODE_COLORS)
            EDGE_COLORS.update(CASESTUDY_EDGE_COLORS)

    # get the evidence supporting each edge
    evidence, edge_types, edge_dir = gs_utils.getEvidence(
        prededges.keys(), evidence_file=ev_file)

    # Now post to graphspace!
    #G = gs.constructGraph(pred_edges, node_labels=uniprot_to_gene, graph_attr=graph_attr, popups=popups)
    #G = gs_base.constructGraph(prededges, node_labels=uniprot_to_gene, graph_attr=graph_attr, attr_desc=attr_desc)
    G = constructGraph(prededges,
                       sources,
                       targets,
                       node_labels=uniprot_to_gene,
                       evidence=evidence,
                       edge_types=edge_types,
                       edge_dir=edge_dir,
                       graph_attr=graph_attr,
                       attr_desc=attr_desc,
                       **kwargs)
    print("Graph has %d nodes and %d edges" %
          (G.number_of_nodes(), G.number_of_edges()))

    # put the parent nodes and the nodes in the parent nodes in a grid layout automatically
    print("Setting the x and y coordinates of each node in a grid layout")
    # relabel the nodes to their names
    graph_attr = {
        uniprot_to_gene.get(n, n): attr
        for n, attr in graph_attr.items()
    }
    layout = gs_utils.grid_layout(G, graph_attr)
    for node, (x, y) in layout.items():
        G.set_node_position(node_name=node, x=x, y=y)

    # before posting, see if we want to write the Graph's JSON to a file
    if kwargs.get('out_pref') is not None:
        print(
            "Writing graph and style JSON files to:\n\t%s-graph.json \n\t%s-style.json"
            % (kwargs['out_pref'], kwargs['out_pref']))
        with open(kwargs['out_pref'] + "-graph.json", 'w') as out:
            json.dump(G.get_graph_json(), out, indent=2)
        with open(kwargs['out_pref'] + "-style.json", 'w') as out:
            json.dump(G.get_style_json(), out, indent=2)

    G.set_tags(kwargs.get('tags', []))
    G.set_name(graph_name)

    gs_base.post_graph_to_graphspace(G,
                                     kwargs['username'],
                                     kwargs['password'],
                                     graph_name,
                                     apply_layout=kwargs['apply_layout'],
                                     layout_name=kwargs['layout_name'],
                                     group=kwargs['group'],
                                     make_public=kwargs['make_public'])
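As a usage sketch only (all paths, credentials, and tag values below are placeholders, not values from this repository): build_graph_and_post expects the GraphSpace credentials and layout options to be present in kwargs, since they are accessed with kwargs[...] rather than kwargs.get().

# Hypothetical invocation; every literal below is a placeholder.
build_graph_and_post(
    version="2018_01pathlinker",
    interactome="inputs/interactome.txt",
    rec_tfs_file="inputs/rec-tfs/CHEM-rec-tfs.txt",
    RESULTSPREFIX="outputs",
    paths_file="outputs/CHEM/paths.txt",
    chemical="CHEM",
    max_k=200,
    graph_name="CHEM-k200",
    graph_attr_file=None,
    ev_file=None,
    mapping_file="inputs/uniprot-to-gene.txt",
    username="user@example.com",
    password="changeme",
    apply_layout=False,
    layout_name=None,
    group=None,
    make_public=False,
    tags=["example"],
)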
コード例 #17
0
def addEdgeDir(interactome_file,
               dir_trumps_undir=True,
               evidence_file=None,
               new_interactome_file=None):
    """ Add T/F for if the edge is directed or not as a 4th column 
    """
    if new_interactome_file is None:
        new_interactome_file = interactome_file
        print("Re-writing %s with edge direction as a fourth column" %
              (interactome_file))
    else:
        print("Reading %s and adding edge direction as a fourth column to %s" %
              (interactome_file, new_interactome_file))
    edges = set(utils.readColumns(interactome_file, 1, 2, 3))

    # ensure there are no self edges
    num_edges = len(edges)
    edges = [(u, v, w) for u, v, w in edges if u != v]
    if len(edges) != num_edges:
        print("%d self-edges removed" % (num_edges - len(edges)))

    edge_weights = {(u, v): w for u, v, w in edges}
    # also add the direction to the interactome
    print("Getting the edge direction of all edges in the interactome")
    if evidence_file is None:
        # try to infer the evidence version from the interactome file name
        if '2018_01' in interactome_file:
            evidence_version = "2018_01pathlinker"
        elif '2017_01' in interactome_file:
            evidence_version = "2017_01pathlinker"
        else:
            evidence_version = "2016_05pathlinker"
        evidence_file = getev.getEvidenceFile(evidence_version,
                                              t_settings.DATADIR)
    # the evidence file has if the edges are directed or not.
    # get that information here using get_interaction_evidence.py
    edge_dir = getev.getEdgeDir(edge_weights.keys(),
                                evidence_file,
                                split_family_nodes=True,
                                add_ev_to_family_edges=True)

    # UPDATE 2017-12-07: After splitting the family receptors and TFs,
    # some of the undirected edges that are trumped by directed edges are back again. Make sure they're removed
    G = nx.Graph()
    dirG = nx.DiGraph()
    for u, v in edge_weights:
        if edge_dir[(u, v)] is True:
            dirG.add_edge(u, v)
        else:
            G.add_edge(u, v)

    # now remove trumped edges
    trumped_edges = 0
    for u, v in dirG.edges():
        if G.has_edge(u, v):
            trumped_edges += 1
            G.remove_edge(u, v)

    print("%d undir edges trumped by dir edges" % (trumped_edges))
    print("%d directed, %d undirected edges" %
          (dirG.number_of_edges(), G.number_of_edges()))
    combG = nx.DiGraph()
    combG.add_edges_from(dirG.edges(), type="Dir")
    combG.add_edges_from(G.to_directed().edges(), type="Undir")
    print("%d total edges" % (combG.number_of_edges()))

    #new_edges = dirG.edges()
    #for u,v in G.edges():
    #    new_edges.append((u,v))
    #    new_edges.append((v,u))
    #if len(new_edges) != len(set(new_edges)):
    #    print("ERROR: there are duplicates")
    if combG.number_of_edges() != (dirG.number_of_edges() +
                                   (G.number_of_edges() * 2)):
        print(
            "ERROR: combG.number_of_edges() != dirG.number_of_edges() + (G.number_of_edges()*2):"
        )
        print("%d != %d + (%d*2)" %
              (combG.number_of_edges(), dirG.number_of_edges(),
               G.number_of_edges()))
        sys.exit()

    # now write the interactome file again
    with open(new_interactome_file, 'w') as out:
        for u, v in combG.edges():
            #dir_string = "Dir" if edge_dir[(u,v)] else "Undir"
            w = edge_weights[(u, v)] if (u, v) in edge_weights \
                else edge_weights[(v, u)]
            out.write("%s\t%s\t%s\t%s\n" % (u, v, w, combG[u][v]['type']))
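A minimal usage sketch (the file names are assumptions): the function only needs the input interactome and, optionally, a separate output path if the original file should not be overwritten.

# Hypothetical file names.
addEdgeDir("inputs/2018_01-interactome.txt",
           dir_trumps_undir=True,
           new_interactome_file="inputs/2018_01-interactome-dir.txt")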
コード例 #18
0
                  help='Also store a pdf of the figures')
(opts, args) = parser.parse_args()

for i, version in enumerate(opts.version):
    print("")
    print("-" * 30)
    t_settings.set_version(version)
    chemicals = sorted(
        utils.readItemList("%s/chemicals.txt" % (t_settings.INPUTSPREFIX)))

    interactome = t_settings.INTERACTOME

    print(
        "Getting the weight, cost and direction of each edge from the interactome %s"
        % (interactome))
    lines = utils.readColumns(interactome, 1, 2, 3, 4)
    edge_weights = {(u, v): float(w) for u, v, w, d in lines}
    edge_dir = {(u, v): d.lower() in ['true', 't', 'dir', 'directed']
                for u, v, w, d in lines}

    # get the evidence from get_interaction_evidence.py
    #evidence_file = getev.getEvidenceFile(evidence_version, t_settings.DATADIR)
    #edge_dir = getev.getEdgeDir(edge_weights.keys(), evidence_file, split_family_nodes=False, add_ev_to_family_edges=False)
    # TODO try just the directed edges
    #weights = [edge_weights[e] for e in edge_weights if edge_dir[e] is True]
    weights = edge_weights.values()
    # for now, don't show the costs from the edges of weight 0
    costs = [-log(max(0.000001, w)) for w in weights if w != 0]
    #costs = [cost for cost in costs if cost < 4.5]
    # also incorporate the penalty into the plot
    penalty = log(t_settings.EDGE_PENALTIES.get(version, 1))
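The snippet above stops right after computing the per-edge costs and the version's edge penalty; a minimal sketch of what the figure step could look like, assuming the costs, penalty, and version variables from the loop above (matplotlib usage and the output file name are assumptions, not code from this script):

import matplotlib
matplotlib.use('Agg')  # assumed non-interactive backend
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.hist(costs, bins=50)
# mark the (log-transformed) edge penalty applied to this version
ax.axvline(penalty, color='red', linestyle='--', label='edge penalty')
ax.set_xlabel('edge cost: -log(weight)')
ax.set_ylabel('# of edges')
ax.legend()
fig.savefig("%s-edge-cost-dist.pdf" % version)  # placeholder output name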
コード例 #19
0
def permute_and_run_edgelinker(opts, random_index):
    if opts.write_score_counts:
        rand_scores_k = "%s/rand-networks/rand-%d-med-scores-k.txt" % (
            opts.write_score_counts, random_index)
        # if the final score counts file already exists, then don't do anything
        if os.path.isfile(rand_scores_k) and not opts.forced:
            print("%s already exists. Skipping." % (rand_scores_k))
            return
        chemical_k_scores = "%s/chemical-k-median-scores.txt" % (
            opts.write_score_counts)
        if not os.path.isfile(chemical_k_scores):
            print(
                "Error: %s does not exist. Run compute_stat_sig.py with the --write-counts option to write it. Quitting"
                % (chemical_k_scores))
            return

    t_utils.checkDir("%s/networks" % (opts.out_dir))
    rec_tfs_file_template = "%s/rec-tfs/%%s-rec-tfs.txt" % (opts.inputs_dir)
    chemicals = sorted(
        utils.readItemList("%s/chemicals.txt" % opts.inputs_dir, col=1))
    if opts.single_chem:
        chemicals = opts.single_chem

    if opts.permute_rec_tfs is not None:
        # if specified, "permute" the sets of receptors and tfs for each chemical instead of the interactome
        print("Writing random sets of rec/tfs for each chemical to %s" %
              (opts.out_dir))
        rec_tfs_file_template = "%s/%%s/%d-random-rec-tfs.txt" % (opts.out_dir,
                                                                  random_index)
        all_rec, all_tfs = t_utils.getRecTFs(opts.permute_rec_tfs)
        #chemical_num_rectfs_file = "%s/chemical_num_rectfs.txt" % (opts.inputs_dir)
        #lines = utils.readColumns(chemical_num_rectfs_file, 2, 3, 4)
        #for chem, num_rec, num_tfs in tqdm(lines):
        for chemical in tqdm(chemicals, disable=opts.verbose):
            out_file = rec_tfs_file_template % (chemical)
            if not os.path.isfile(out_file) or opts.forced:
                rec, tfs, costs, zscores = t_utils.getRecTFs(
                    t_settings.REC_TFS_FILE % (opts.inputs_dir, chemical),
                    costs=True)
                rec = list(rec)
                tfs = list(tfs)

                out_dir = "%s/%s" % (opts.out_dir, chemical)
                t_utils.checkDir(out_dir)
                random_rec = random.sample(list(all_rec), len(rec))
                # apply the costs to the random rec and tfs
                for i in range(len(rec)):
                    costs[random_rec[i]] = costs[rec[i]]
                    zscores[random_rec[i]] = zscores[rec[i]]
                random_tfs = random.sample(list(all_tfs), len(tfs))
                for i in range(len(tfs)):
                    costs[random_tfs[i]] = costs[tfs[i]]
                    zscores[random_tfs[i]] = zscores[tfs[i]]
                t_utils.writeRecTFs(out_file,
                                    random_rec,
                                    random_tfs,
                                    costs=costs,
                                    zscores=zscores)
        # use the original interactome
        permuted_network_out_file = opts.interactome
        print("Using the original interactome %s" %
              (permuted_network_out_file))
    else:
        # default is to permute the interactome
        permuted_network_out_file = '%s/networks/permuted-network%d.txt' % (
            opts.out_dir, random_index)
        if not os.path.isfile(permuted_network_out_file) or opts.forced:
            # don't log transform. The weights will be log transformed by the edgelinker code
            #G = cycLinker.readNetwork(opts.interactome, weight=True, logtransform=False)
            # UPDATE: 2017-12-07: try using the direction of the edges from the fourth column of the interactome instead of splitting based on if the edge is bidirected or not
            G = nx.DiGraph()
            dir_edges = []
            undir_edges = []
            lines = utils.readColumns(opts.interactome, 1, 2, 3, 4)
            if len(lines) == 0:
                print(
                    "ERROR: interactome should have 4 columns: a, b, w, and True/False for directed/undirected. Quitting"
                )
                sys.exit()
            for u, v, w, directed in lines:
                G.add_edge(u, v, weight=float(w))
                if directed.lower() in ["true", "t", "dir", 'directed']:
                    dir_edges.append((u, v))
                elif directed.lower() not in [
                        "false", 'f', 'undir', 'undirected'
                ]:
                    print(
                        "ERROR: Unknown directed edge type '%s'. 4th column should be T/F to indicate directed/undirected"
                        % (directed.lower()))
                    print("Quitting.")
                    sys.exit()
                elif u < v:
                    undir_edges.append((u, v))

            if opts.undirected:
                # swap all edges as undirected edges
                permG = permute_network.permute_network(
                    G.to_undirected(), num_iterations=opts.num_iterations)
                permG = permG.to_directed()
            elif opts.split_by_weight:
                # split the edges into bins by weight and swap the directed and undirected edges separately
                # if specified by the user
                permG = permute_network.permute_network(
                    G,
                    swap_phys_sig_sep=opts.swap_phys_sig_sep,
                    split_weight=opts.split_by_weight,
                    num_iterations=opts.num_iterations)
            elif opts.swap_phys_sig_sep:
                # swap the directed and undirected edges separately
                permG = permute_network.permute_network(
                    G,
                    swap_phys_sig_sep=opts.swap_phys_sig_sep,
                    num_iterations=opts.num_iterations,
                    edge_lists=(undir_edges, dir_edges))
            else:
                # if none of the options are specified, then swap everything as directed edges
                permG = permute_network.permute_network(
                    G, num_iterations=opts.num_iterations)
            print("Writing %s" % (permuted_network_out_file))
            nx.write_weighted_edgelist(permG,
                                       permuted_network_out_file,
                                       comments='#',
                                       delimiter='\t')
        else:
            print("Using %s" % (permuted_network_out_file))

    # now run edgelinker on each of the chemicals using the permuted network
    # if version is netpath, use the different type of input file
    # TODO fix this
    # PATHLINKERDATAVERSIONS
    #if 'kegg' in opts.inputs_dir or 'netpath' in opts.inputs_dir:
    #    rec_tfs_file_template = "%s/rec-tfs/%%s-nodes.txt" % (opts.inputs_dir)
    in_files = []
    out_files = []
    for chemical in tqdm(chemicals, disable=opts.verbose):
        rec_tfs_file = rec_tfs_file_template % (chemical)
        in_files.append(os.path.abspath(rec_tfs_file))
        out_dir = "%s/%s" % (opts.out_dir, chemical)
        t_utils.checkDir(out_dir)
        out_pref = "%s/%d-random" % (out_dir, random_index)
        out_files.append(os.path.abspath(out_pref))
        # python implementation of edgelinker is taking too long. Switching to java for now.
        #run_write_edgelinker(permG, rec_tfs_file, opts.k, out_pref)
        # run the java implementation of edgelinker below

    # write the in and out files to the networks dir
    edgelinker_in_files = '%s/networks/permuted-network%d-infiles.txt' % (
        opts.out_dir, random_index)
    with open(edgelinker_in_files, 'w') as out:
        out.write('\n'.join(in_files))
    edgelinker_out_files = '%s/networks/permuted-network%d-outfiles.txt' % (
        opts.out_dir, random_index)
    with open(edgelinker_out_files, 'w') as out:
        out.write('\n'.join(out_files))
    print("Running edgelinker on chemical %s: %s" % (chemical, out_pref))
    run_edgelinker.runEdgeLinker(permuted_network_out_file,
                                 cyclinker_in_files,
                                 cyclinker_out_files,
                                 opts.k,
                                 edge_penalty=EDGE_PENALTY,
                                 rec_tfs_penalty=REC_TFS_PENALTY,
                                 multi_run=True)

    if opts.write_score_counts:
        # now that edgelinker has been run on all of the chemical sources/targets,
        # get the path counts for the chemical network's path scores
        # import compute_stat_sig.py and run the code directly. This avoids the issues of re-importing the libraries from baobab
        print(
            "Writing the counts for each of the scores for random index: '%d'"
            % (random_index))
        stat_sig = compute_stat_sig.StatSig(random_paths_dir=opts.out_dir,
                                            k_limit=opts.k,
                                            num_random=(random_index,
                                                        random_index),
                                            out_dir=opts.write_score_counts)
        stat_sig.write_rand_counts(chemicals=chemicals, forced=opts.forced)
#        cmd = "python src/compute_stat_sig.py " + \
#              " --chemicals %s/chemicals.txt " % (opts.inputs_dir) + \
#              " --random-paths-dir %s/ " % (opts.out_dir) + \
#              " -P --k-limit %d " % (opts.k) + \
#              " --num-random %d %d" % (random_index, random_index) + \
#              " --group-by-prob " + \
#              " --write-rand-counts " + \
#              " --out-dir %s " % (opts.write_score_counts)
#        if opts.forced:
#            cmd += " --forced "
#        print(cmd)
#        subprocess.check_call(cmd.split())

#if opts.run_mgsa_random:
#    run_mgsa_random(random_index)

    if opts.cleanup:
        print(
            "Deleting the generated permuted network and the edgelinker output files"
        )
        if permuted_network_out_file != opts.interactome:
            os.remove(permuted_network_out_file)
        os.remove(edgelinker_in_files)
        # remove the individual output files
        for cyc_out_file in out_files:
            # # 2017-02-17 - temporarily don't remove the paths file for running MGSA
            os.remove(cyc_out_file + "-paths.txt")
            os.remove(cyc_out_file + "-ranked-edges.txt")
        os.remove(edgelinker_out_files)
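A rough driver sketch (the number of permutations is an arbitrary assumption, and opts comes from this script's own option parser): the function is written to be called once per random index, so generating many permuted runs is just a loop.

# Hypothetical driver; 100 permutations chosen arbitrarily.
for random_index in range(1, 101):
    permute_and_run_edgelinker(opts, random_index)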