Example 1
def main(config_map, **kwargs):
    """
    *config_map*: everything in the config file
    *kwargs*: all of the options passed into the script
    """
    # extract the general variables from the config map
    input_settings, input_dir, output_dir, alg_settings, kwargs \
        = config_utils.setup_config_variables(config_map, **kwargs)

    uniprot_to_gene = {}

    # or we could get a distribution of distances for each virus node
    # load human-virus ppis
    df = pd.read_csv(kwargs['sarscov2_human_ppis'], sep='\t')
    edges = zip(df[df.columns[0]], df[df.columns[1]])
    edges = [(v.replace("SARS-CoV2 ", ""), h) for v, h in edges]
    virus_nodes = [v for v, h in edges]
    krogan_nodes = [h for v, h in edges]
    virhost_edges = edges

    # for each dataset, extract the path(s) to the prediction files,
    # read in the predictions, and test for the statistical significance of overlap
    for dataset in input_settings['datasets']:
        dataset_name = config_utils.get_dataset_name(dataset)
        print("Loading data for %s" % (dataset['net_version']))
        # load the network and the positive examples for each term
        net_obj, ann_obj, eval_ann_obj = run_eval_algs.setup_dataset(
            dataset, input_dir, **kwargs)
        prots, node2idx = net_obj.nodes, net_obj.node2idx
        print("\t%d total prots" % (len(prots)))
        # TODO this is used for the SARS-CoV-2 project,
        # but it should really be a general-purpose script
        # that works on any number of terms
        orig_pos_idx, _ = alg_utils.get_term_pos_neg(ann_obj.ann_matrix, 0)
        orig_pos = [prots[p] for p in orig_pos_idx]
        print("\t%d original positive examples" % (len(orig_pos)))

        # convert the krogan nodes and drugs to ids
        #drug_nodes_idx = [node2idx[d] for d in drug_nodes if d in node2idx]
        krogan_nodes_idx = [node2idx[n] for n in orig_pos if n in node2idx]

        # TODO add the diffusion option as well
        if kwargs.get('analysis_type') == 'diffusion_analysis':
            eval_diffusion(dataset, net_obj, krogan_nodes_idx, **kwargs)

        elif kwargs.get('analysis_type') == 'shortest_paths':
            eval_shortest_paths(dataset, net_obj, krogan_nodes_idx, **kwargs)

        elif kwargs.get('analysis_type') == 'degrees':
            plot_degrees(net_obj.W, krogan_nodes, dataset_name, **kwargs)
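
Each of these entry points receives the parsed config map plus the parsed command-line options as keyword arguments. A minimal driver sketch, assuming a YAML config as config_utils.setup_config_variables implies (the file name and option values below are hypothetical):

import yaml

if __name__ == "__main__":
    # hypothetical config path; the real scripts would parse it from the command line
    with open("config.yaml") as f:
        config_map = yaml.safe_load(f)
    # kwargs mirrors the parsed command-line options
    main(config_map,
         analysis_type="shortest_paths",  # or 'diffusion_analysis' / 'degrees'
         sarscov2_human_ppis="datasets/sarscov2-human-ppis.tsv")  # hypothetical path
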
Example 2
def main(config_map_list, **kwargs):
    """
    *config_map_list*: list of config maps, one from each config file
    *kwargs*: all of the options passed into the script
    """
    input_settings, input_dir, output_dir, alg_settings, kwargs \
        = config_utils.setup_config_variables(config_map_list[0], **kwargs)

    alphas = []
    alpha_settings = alg_settings['genemaniaplus']['alpha'][0]
    if isinstance(alpha_settings, Iterable):
        for a in alpha_settings:
            alphas.append(a)
    else:
        alphas.append(alpha_settings)

    for alpha in alphas:
        interactors = []
        interactor_values = []
        rand_values = []

        uniprot_to_gene = {}
        if kwargs.get('id_mapping_file'):
            print("Reading %s" % (kwargs['id_mapping_file']))
            df = pd.read_csv(kwargs['id_mapping_file'], sep='\t', header=0)
            ## keep only the first gene for each UniProt ID
            uniprot_to_gene = {
                p: genes.split(' ')[0]
                for p, genes in zip(df['Entry'], df['Gene names'].astype(str))
            }

        for config_map in config_map_list:
            """
      *config_map*: everything in the config file
      *kwargs*: all of the options passed into the script
      """
            # extract the general variables from the config map
            input_settings, input_dir, output_dir, alg_settings, kwargs \
                = config_utils.setup_config_variables(config_map, **kwargs)

            # for each dataset, extract the path(s) to the prediction files,
            # read in the predictions, and test for the statistical significance of overlap
            for dataset in input_settings['datasets']:
                dataset_name = config_utils.get_dataset_name(dataset)
                print("Loading data for %s" % (dataset['net_version']))
                # load the network and the positive examples for each term
                net_obj, ann_obj, eval_ann_obj = run_eval_algs.setup_dataset(
                    dataset, input_dir, **kwargs)
                prots, node2idx = net_obj.nodes, net_obj.node2idx
                print("\t%d total prots" % (len(prots)))

                orig_pos_idx, _ = alg_utils.get_term_pos_neg(
                    ann_obj.ann_matrix, 0)
                orig_pos = [prots[p] for p in orig_pos_idx]
                print("\t%d original positive examples" % (len(orig_pos)))
                # convert the virus interactor nodes to ids
                interactor_nodes_idx = [
                    node2idx[n] for n in orig_pos if n in node2idx
                ]

                interactor_value, rand_value = eval_diffusion_cross_validation(
                    dataset, net_obj, interactor_nodes_idx, uniprot_to_gene,
                    alpha, **kwargs)

                interactors.append(dataset['exp_name'])
                interactor_values.append(interactor_value)
                rand_values.append(rand_value)

        plot_diffusion_cross_validation(interactors, interactor_values,
                                        rand_values, alpha, **kwargs)
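
The alpha handling at the top of this example normalizes a config value that may be either a scalar or a list. A standalone sketch of that pattern; note that Iterable must come from collections.abc, and that a bare string is also Iterable, so it is excluded explicitly here:

from collections.abc import Iterable

def as_list(setting):
    """Normalize a config value that may be a scalar or a list of values."""
    # strings are Iterable too, but should stay a single value
    if isinstance(setting, Iterable) and not isinstance(setting, str):
        return list(setting)
    return [setting]

assert as_list(0.5) == [0.5]
assert as_list([0.5, 0.9]) == [0.5, 0.9]
assert as_list("0.5") == ["0.5"]
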
Example 3
def main(config_map, **kwargs):
    """
    *config_map*: everything in the config file
    *kwargs*: all of the options passed into the script
    """
    # extract the general variables from the config map
    input_settings, input_dir, output_dir, alg_settings, kwargs \
        = config_utils.setup_config_variables(config_map, **kwargs)
    algs = config_utils.get_algs_to_run(alg_settings, **kwargs)
    print("algs: %s" % (str(algs)))
    del kwargs['algs']

    # load the namespace mappings
    uniprot_to_gene = None
    if kwargs.get('id_mapping_file'):
        uniprot_to_gene = enrichment.load_gene_names(
            kwargs.get('id_mapping_file'))
        kwargs['uniprot_to_gene'] = uniprot_to_gene

    # genesets_to_test = config_map.get('genesets_to_test')
    # if genesets_to_test is None or len(genesets_to_test) == 0:
    #     print("ERROR: no genesets specified to test for overlap. " +
    #           "Please add them under 'genesets_to_test'. \nQuitting")
    #     sys.exit()

    # # first load the gene sets
    # geneset_groups = {}
    # for geneset_to_test in genesets_to_test:
    #     name = geneset_to_test['name']
    #     gmt_file = "%s/genesets/%s/%s" % (
    #         input_dir, name, geneset_to_test['gmt_file'])
    #     if not os.path.isfile(gmt_file):
    #         print("WARNING: %s not found. skipping" % (gmt_file))
    #         sys.exit()

    #     geneset_groups[name] = utils.parse_gmt_file(gmt_file)

    # store all the enriched terms in a single dataframe
    all_dfs = {g: pd.DataFrame() for g in ['BP', 'CC', 'MF']}

    num_algs_with_results = 0
    # for each dataset, extract the path(s) to the prediction files,
    # read in the predictions, and test for the statistical significance of overlap
    for dataset in input_settings['datasets']:
        print("Loading data for %s" % (dataset['net_version']))
        base_out_dir = "%s/enrichment/%s/%s" % (
            output_dir, dataset['net_version'], dataset['exp_name'])
        # load the network and the positive examples for each term
        net_obj, ann_obj, _ = run_eval_algs.setup_dataset(
            dataset, input_dir, **kwargs)
        prots = net_obj.nodes
        prot_universe = set(prots)
        print("\t%d prots in universe" % (len(prot_universe)))
        # TODO this is used for the SARS-CoV-2 project,
        # but it should really be a general-purpose script
        # that works on any number of terms
        orig_pos_idx, _ = alg_utils.get_term_pos_neg(ann_obj.ann_matrix, 0)
        orig_pos = [prots[p] for p in orig_pos_idx]
        #print("\t%d original positive examples" % (len(orig_pos)))
        if kwargs.get('add_orig_pos_to_prot_universe'):
            pos_neg_file = "%s/%s" % (input_dir, dataset['pos_neg_file'])
            df = pd.read_csv(pos_neg_file, sep='\t')
            orig_pos = df[df['2020-03-sarscov2-human-ppi'] == 1]['prots']
            print("\t%d original positive examples" % (len(orig_pos)))
            prot_universe = set(prots) | set(orig_pos)
            print("\t%d prots in universe after adding them to the universe" %
                  (len(prot_universe)))

        # now load the predictions, test at the various k values, and TODO plot
        k_to_test = enrichment.get_k_to_test(dataset, **kwargs)
        print("\ttesting %d k value(s): %s" %
              (len(k_to_test), ", ".join([str(k) for k in k_to_test])))

        # now load the prediction scores
        dataset_name = config_utils.get_dataset_name(dataset)
        alg_pred_files = config_utils.get_dataset_alg_prediction_files(
            output_dir, dataset, alg_settings, algs, **kwargs)
        for alg, pred_file in alg_pred_files.items():
            if not os.path.isfile(pred_file):
                print("Warning: %s not found. skipping" % (pred_file))
                continue
            num_algs_with_results += 1
            print("reading: %s" % (pred_file))
            df = pd.read_csv(pred_file, sep='\t')
            # remove the original positives
            df = df[~df['prot'].isin(orig_pos)]
            df.reset_index(inplace=True, drop=True)
            #df = df[['prot', 'score']]
            df.sort_values(by='score', ascending=False, inplace=True)
            if kwargs.get('stat_sig_cutoff'):
                df = config_utils.get_pvals_apply_cutoff(
                    df, pred_file, **kwargs)
            # write these results to file
            pred_filtered_file = "%s/%s/%s-filtered%s.tsv" % (
                base_out_dir, alg, os.path.basename(pred_file).split('.')[0],
                "-p%s" % str(kwargs['stat_sig_cutoff']).replace('.', '_')
                if kwargs.get('stat_sig_cutoff') else "")
            os.makedirs(os.path.dirname(pred_filtered_file), exist_ok=True)
            if kwargs.get(
                    'force_run') or not os.path.isfile(pred_filtered_file):
                print("writing %s" % (pred_filtered_file))
                df.to_csv(pred_filtered_file, sep='\t', index=None)

            for k in k_to_test:
                topk_predictions = list(df.iloc[:k]['prot'])

                # now run clusterProfiler from R
                out_dir = pred_filtered_file.split('.')[0]
                bp_df, mf_df, cc_df = enrichment.run_clusterProfiler_GO(
                    topk_predictions,
                    out_dir,
                    prot_universe=prot_universe,
                    forced=kwargs.get('force_run'),
                    **kwargs)
                for ont, df in [('BP', bp_df), ('MF', mf_df), ('CC', cc_df)]:
                    # make it into a multi-column-level dataframe
                    tuples = [(dataset_name, alg, col) for col in df.columns]
                    index = pd.MultiIndex.from_tuples(tuples)
                    df.columns = index
                    all_dfs[ont] = pd.concat([all_dfs[ont], df], axis=1)

    if num_algs_with_results == 0:
        print("No results found. Quitting")
        sys.exit()

    if kwargs.get('compare_krogan_terms'):
        krogan_dir = kwargs['compare_krogan_terms']
        for geneset, g_df in all_dfs.items():
            #if geneset == 'GO':
            #    for ont in ['BP', 'MF', 'CC']:
            # load the enriched terms for the krogan nodes
            out_file = "%s/enrich-%s.csv" % (krogan_dir, geneset)
            if not os.path.isfile(out_file):
                print("ERROR: %s not found. Quitting" % (out_file))
                sys.exit()
            print("\treading %s" % (out_file))
            df = pd.read_csv(out_file, index_col=0)
            # drop the terms that are neither significant (p.adjust < cutoff)
            # nor present in the FSS results
            terms_to_keep = set(list(g_df.index.values)) | set(
                list(df[df['p.adjust'] < kwargs.get('pval_cutoff', 0.01)]
                     ['ID'].values))
            print("\t%d krogan terms to keep" % (len(terms_to_keep)))
            df = df[df['ID'].isin(terms_to_keep)]
            # also apply the
            tuples = [('Krogan', '-', col) for col in df.columns]
            index = pd.MultiIndex.from_tuples(tuples)
            df.columns = index
            all_dfs[geneset] = pd.concat([all_dfs[geneset], df], axis=1)

    # now write the combined df to a file
    out_pref = kwargs.get('out_pref')
    if out_pref is None:
        pval_str = str(kwargs.get('pval_cutoff', 0.01)).replace('.', '_')
        out_pref = "%s/enrichment/combined%s-%s/%s-" % (
            output_dir, "-krogan" if kwargs.get('compare_krogan_terms') else
            "", pval_str, os.path.basename(kwargs['config']).split('.')[0])
    for geneset, df in all_dfs.items():
        if kwargs.get('file_per_alg'):
            df = df.swaplevel(0, 1, axis=1)
            for alg, df_alg in df.groupby(level=0, axis=1):
                df_alg.dropna(how='all', inplace=True)
                # TODO add back the krogan terms
                #if kwargs.get('compare_krogan_terms') and :
                print(df_alg.head())
                out_file = "%s%s-k%s-%s.csv" % (out_pref, alg, k_to_test[0],
                                                geneset)
                write_combined_table(df_alg, out_file, dataset_level=1)
        else:
            out_file = "%sk%s-%s.csv" % (out_pref, k_to_test[0], geneset)
            write_combined_table(df, out_file, dataset_level=0)
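
The per-algorithm enrichment tables above are merged under three-level column headers (dataset, algorithm, column) so that results from different runs align by GO term. A toy sketch of that pattern with made-up values:

import pandas as pd

# toy per-run enrichment tables indexed by GO term ID
df_a = pd.DataFrame({'p.adjust': [0.001, 0.020]}, index=['GO:0006954', 'GO:0045087'])
df_b = pd.DataFrame({'p.adjust': [0.005, 0.300]}, index=['GO:0006954', 'GO:0016032'])

combined = pd.DataFrame()
for dataset_name, alg, df in [('net1', 'algA', df_a), ('net1', 'algB', df_b)]:
    # lift each table's columns into a (dataset, alg, column) MultiIndex
    df = df.copy()
    df.columns = pd.MultiIndex.from_tuples(
        [(dataset_name, alg, col) for col in df.columns])
    # outer-join on the term index; terms missing from a run become NaN
    combined = pd.concat([combined, df], axis=1)
print(combined)
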
Example 4
def main(config_map, **kwargs):
    """
    *config_map*: everything in the config file
    *kwargs*: all of the options passed into the script
    """
    # extract the general variables from the config map
    input_settings, input_dir, output_dir, alg_settings, kwargs \
        = config_utils.setup_config_variables(config_map, **kwargs)
    algs = config_utils.get_algs_to_run(alg_settings, **kwargs)
    del kwargs['algs']

    node_list = set()
    if kwargs.get('node_list_file'):
        print("Reading %s" % (kwargs['node_list_file']))
        node_list = set(pd.read_csv(kwargs['node_list_file'], sep='\t', comment='#', header=None, squeeze=True).tolist())
    if kwargs.get('drug_list_file'):
        print("Reading %s" % (kwargs['drug_list_file']))
        drug_list = set(pd.read_csv(kwargs['drug_list_file'], sep='\t', comment='#', header=None, usecols=[0], squeeze=True).tolist())
        #node_list |= drug_list
    if kwargs.get('node_to_post'):
        node_list |= set(kwargs['node_to_post'])

    # this dictionary will hold all the styles 
    graph_attr = defaultdict(dict)
    attr_desc = defaultdict(dict)
    if kwargs.get('graph_attr_file'):
        graph_attr, attr_desc = gs.readGraphAttr(kwargs['graph_attr_file'])
    # load the namespace mappings
    uniprot_to_gene = {}
    # also add the protein name
    uniprot_to_prot_names = None
    # these 
    node_desc = defaultdict(dict)
    if kwargs.get('id_mapping_file'):
        df = pd.read_csv(kwargs['id_mapping_file'], sep='\t', header=0) 
        ## keep only the first gene for each UniProt ID
        uniprot_to_gene = {p: genes.split(' ')[0] for p, genes in zip(df['Entry'], df['Gene names'].astype(str))}
        if 'Protein names' in df.columns:
            uniprot_to_prot_names = dict(zip(df['Entry'], df['Protein names'].astype(str)))
            node_desc = {n: {'Protein names': uniprot_to_prot_names[n]} for n in uniprot_to_prot_names}
    drug_nodes = None
    if kwargs.get('drug_id_mapping_file'):
        print("Reading %s" % (kwargs['drug_id_mapping_file']))
        df = pd.read_csv(kwargs['drug_id_mapping_file'], sep='\t', header=0) 
        # add the drug name mapping to the mapping dictionary already in place,
        # cleaning up each drug name; only the drug entries are touched,
        # so the gene names from the id mapping file are left as-is
        uniprot_to_gene.update({
            d: fix_drug_name(name)
            for d, name in zip(df['drugbank_id'], df['name'].astype(str))})
        # now get extra drug info
        #uniprot_to_prot_names.update({d: group_nodes for d, group_nodes in zip(df['drugbank_id'], df['group_nodes'].astype(str))})
        drug_nodes = set(list(df['drugbank_id'].values))
    drugG = None 
    if kwargs.get('drug_targets_file'):
        print("Reading %s" % (kwargs['drug_targets_file']))
        df = pd.read_csv(kwargs['drug_targets_file'], sep='\t', header=None) 
        drugG = nx.from_pandas_edgelist(df, source=0, target=1)
    if kwargs.get('drug_target_info_file'):
        print("Reading %s" % (kwargs['drug_target_info_file']))
        df = pd.read_csv(kwargs['drug_target_info_file'], sep='\t') 
        # also get the pmid and references from the table to add as popup info
        drug_target_pmids = {}
        drug_target_action = {}
        pmid_citations = {}
        for d, p, action, pmids, citations in df[['drugbank_id', 'uniprot_id', 'actions', 'pubmed_ids', 'citations']].values:
            drug_target_action[(d,p)] = action
            if pd.isnull(pmids):
                continue
            drug_target_pmids[(d,p)] = str(pmids).split('|')
            pmid_citations.update(dict(zip(str(pmids).split('|'), str(citations).split('|'))))
    if kwargs.get('enriched_terms_file'):
        print("Reading %s" % (kwargs['enriched_terms_file']))
        df = pd.read_csv(kwargs['enriched_terms_file'], header=[0,1,2], index_col=0) 
        df2 = df.copy()
        print(df.head())
        # get the prots per term
        df2.columns = df2.columns.droplevel([0,1])
        df2 = df2['geneID']
        df2.columns = list(range(len(df2.columns)))
        for i in range(1,len(df2.columns)):
            df2[0] = df2[0] + '/' + df2[i]
        print(df2.head())
        #print(df2['geneID'].head())
        term_ann = dict(zip(df2.index, df2[0].values))
        term_ann = {t: str(ann).split('/') for t,ann in term_ann.items()}
        print("\t%d terms, %s" % (
            len(term_ann),
            ", ".join("%s: %d ann" % (t, len(term_ann[t])) for t in kwargs['term_to_highlight']))) 
        # also get the term name, and the enrichment p-value(?)
        term_names = dict(zip(df.index, df[('Description', 'Unnamed: 1_level_1', 'Unnamed: 1_level_2')]))
        nodes = set()
        # setup their graph_attributes for posting
        # reverse so that if a gene is annotated to multiple terms, the first term gets priority as its parent
        for i, t in enumerate(kwargs['term_to_highlight'][::-1]):
            name = term_names[t]
            color = GO_term_colors[i]
            graph_attr[name]['color'] = color
            # add this node as the parent to the other nodes
            for n in term_ann[t]:
                graph_attr[n]['parent'] = name
                graph_attr[n]['color'] = color
                nodes.add(n)
            # TODO add the link
            # this is used to make the popup
            attr_desc[('parent', name)] = t
        # if the node list file is not specified, then set the annotated nodes as the node list
        if not kwargs.get('node_list_file'):
            node_list = nodes
    if kwargs.get('drug_targets_only'):
        new_node_list = set([n for n in node_list if drugG.has_node(n)])
        if len(new_node_list) > 2:
            node_list = new_node_list

    # load human-virus ppis
    df = pd.read_csv(kwargs['sarscov2_human_ppis'], sep='\t')
    edges = zip(df[df.columns[0]], df[df.columns[1]])
    edges = [(v.replace("SARS-CoV2 ",""), h) for v,h in edges]
    virus_nodes = [v for v,h in edges]
    krogan_nodes = [h for v,h in edges]
    virhost_edges = edges 

#    genesets_to_test = config_map.get('genesets_to_test')
#    if genesets_to_test is None or len(genesets_to_test) == 0:
#        print("ERROR: no genesets specified to test for overlap. " +
#              "Please add them under 'genesets_to_test'. \nQuitting") 
#        sys.exit()
#
#    # first load the gene sets
#    # TODO use these
#    geneset_group_nodes = {}
#    for geneset_to_test in genesets_to_test:
#        name = geneset_to_test['name'] 
#        gmt_file = "%s/genesets/%s/%s" % (
#            input_dir, name, geneset_to_test['gmt_file'])
#        if not os.path.isfile(gmt_file):
#            print("WARNING: %s not found. skipping" % (gmt_file))
#            sys.exit()
#
#        geneset_group_nodes[name] = setup_datasets.parse_gmt_file(gmt_file)  

    # for each dataset, extract the path(s) to the prediction files,
    # read in the predictions, and test for the statistical significance of overlap 
    for dataset in input_settings['datasets']:
        print("Loading data for %s" % (dataset['net_version']))
        # load the network and the positive examples for each term
        net_obj, ann_obj, eval_ann_obj = run_eval_algs.setup_dataset(
            dataset, input_dir, **kwargs) 
        # find the shortest path(s) from each drug node to any virus node
        if kwargs.get('edge_weight_cutoff'):
            print("\tapplying edge weight cutoff %s" % (kwargs['edge_weight_cutoff']))
            W = net_obj.W
            W = W.multiply((W > kwargs['edge_weight_cutoff']).astype(int))
            net_obj.W = W
            num_nodes = np.count_nonzero(W.sum(axis=0))  # count the nodes with at least one edge
            # since the network is symmetric, the number of undirected edges
            # is the number of stored entries divided by 2
            num_edges = len(W.data) / 2
            print("\t%d nodes and %d edges" % (num_nodes, num_edges))
        prots = net_obj.nodes
        print("\t%d total prots" % (len(prots)))
        # TODO this is used for the SARS-CoV-2 project,
        # but it should really be a general-purpose script
        # that works on any number of terms
        orig_pos_idx, _ = alg_utils.get_term_pos_neg(ann_obj.ann_matrix, 0)
        orig_pos = [ann_obj.prots[p] for p in orig_pos_idx]
        print("pos & krogan: %d; pos - krogan: %d; krogan - pos: %d" % (
            len(set(orig_pos) & set(krogan_nodes)),
            len(set(orig_pos) - set(krogan_nodes)),
            len(set(krogan_nodes) - set(orig_pos))))
        print("\t%d original positive examples" % (len(orig_pos)))
        #pos_neg_file = "%s/%s" % (input_dir, dataset['pos_neg_file'])
        #df = pd.read_csv(pos_neg_file, sep='\t')
        #orig_pos = df[df['2020-03-sarscov2-human-ppi'] == 1]['prots']
        #print("\t%d original positive examples" % (len(orig_pos)))

        # now load the predictions, test at the various k values, and TODO plot
        k_to_test = get_k_to_test(dataset, **kwargs)
        print("\tposting %d k values: %s" % (len(k_to_test), ", ".join([str(k) for k in k_to_test])))

        # now load the prediction scores
        alg_pred_files = config_utils.get_dataset_alg_prediction_files(
            output_dir, dataset, alg_settings, algs, use_alg_name=False, **kwargs)
        for alg, pred_file in alg_pred_files.items():
            if not os.path.isfile(pred_file):
                print("Warning: %s not found. skipping" % (pred_file))
                continue
            print("reading: %s" % (pred_file))
            df = pd.read_csv(pred_file, sep='\t')
            scores = dict(zip(df['prot'], df['score']))
            # remove the original positives
            df = df[~df['prot'].isin(orig_pos)]
            df.reset_index(inplace=True, drop=True)
            df = df[['prot', 'score']]
            curr_scores = dict(zip(df['prot'], df['score']))
            df.sort_values(by='score', ascending=False, inplace=True)
            #print(df.head())
            if len(node_list) == 0:
                node_list = set(list(df[:k_to_test[0]]['prot']))
            pred_nodes = set(node_list)  # copy so the in-place updates below don't mutate node_list
            if kwargs.get('paths_to_virus'):
                pred_nodes = get_paths_to_virus_nodes(node_list, net_obj, virhost_edges, **kwargs)
            node_types = {} 
            # if the drug nodes were part of the network, then get the top predicted drugs from the prediction scores
            if drug_nodes is not None and drugG is None:
                if len(node_list) > 0:
                    top_k_drug_nodes = node_list
                elif kwargs.get('paths_to_virus'):
                    top_k_drug_nodes = list(df[df['prot'].isin(drug_nodes)][:k_to_test[0]]['prot'])
                pred_nodes = get_paths_to_virus_nodes(top_k_drug_nodes, net_obj, virhost_edges, **kwargs)
                node_types = {d: 'drug' for d in drug_nodes}

            pred_nodes -= (set(virus_nodes) | set(krogan_nodes))
            all_nodes = set(pred_nodes) | set(krogan_nodes)

            # build the network to post
            pred_edges, graph_attr, attr_desc2, node_type_rank = build_subgraph(
                alg, pred_nodes, curr_scores, all_nodes,
                net_obj, graph_attr, node_types,
                min_edge_width=2 if not kwargs.get('edge_weight_cutoff') else 3,
                max_edge_width=8 if not kwargs.get('edge_weight_cutoff') else 6,
                **kwargs)
            attr_desc.update(attr_desc2)

            # add the node rank to the name of the node
            for n, rank in node_type_rank.items():
                # fall back to the ID itself if there's no gene name mapping
                uniprot_to_gene[n] = uniprot_to_gene.get(n, n) + "\n%s" % rank

            # now also add the virus edges for the human prots that interact with predicted prots
            net_nodes = set([n for e in pred_edges for n in e])
            # add the drugs that target the predicted nodes, if specified
            if drugG is not None:
                drugs_skipped = set()
                before = len(pred_edges)
                if len(node_list) > 0:
                    drugs_with_target = [n for n in node_list if drugG.has_node(n)]
                else:
                    drugs_with_target = [n for n in net_nodes if drugG.has_node(n)]
                for n in drugs_with_target:
                    for d in drugG.neighbors(n):
                        if drugG.degree[d] >= kwargs.get('degree_cutoff',1000):
                            drugs_skipped.add(d)
                        elif kwargs.get('drug_list_file') and d not in drug_list:
                            continue
                        else:
                            pred_edges.add((d,n))
                            graph_attr[d].update(drug_node_styles)
                            graph_attr[(d,n)].update(drug_edge_styles) 
                            # also add a popup with the # targets
                            attr_desc[d]['# targets'] = drugG.degree[d]
                print("\tadded %d drug-target edges" % (len(pred_edges) - before))
                if len(drugs_skipped) > 0:
                    print("\t%d drug-target edges skipped from drug with > %s targets: %s" % (len(drugs_skipped), kwargs.get('degree_cutoff',100), ', '.join(drugs_skipped)))
            pred_edges.update(set([(v,h) for v,h in virhost_edges if h in net_nodes]))
            net_nodes = set([n for e in pred_edges for n in e])
            print("\t%d edges, %d nodes" % (len(pred_edges), len(net_nodes)))

            # add the styles for the virus and krogan nodes
            for n in net_nodes:
                if n in virus_nodes:
                    # styles for the virus nodes
                    graph_attr[n].update(virus_node_styles)
                    #graph_attr[n]['group'] = virus_group
                elif n in krogan_nodes:
                    # styles for the human nodes
                    graph_attr[n].update(krogan_node_styles)
                    #graph_attr[n]['group'] = 
                #elif drug_nodes is not None and n in drug_nodes:
                #    graph_attr[n]['group'] = "DrugBank Drugs"
                #else: 
                #    graph_attr[n]['group'] = "Human Proteins"
            for e in virhost_edges:
                graph_attr[e].update(virhost_edge_styles)

            evidence = None
            if kwargs.get('edge_evidence_file'):
                evidence, _,_ = gs_utils.getEvidence(pred_edges, evidence_file=kwargs['edge_evidence_file'])
            if kwargs.get('drug_target_info_file'):
                evidence = defaultdict(dict) if evidence is None else evidence
                for (drug, target), pmids in drug_target_pmids.items():
                    references = [{'pmid': pmid, 'text': pmid_citations[pmid]} for pmid in pmids]
                    evidence[(drug, target)]['DrugBank'] = references

            # Now post to graphspace!
            print("Building GraphSpace graph")
            popups = {}
            for i, n in enumerate(net_nodes):
                if n in virus_nodes:
                    continue
                if node_desc is not None and n in node_desc:
                    attr_desc[n].update(node_desc[n])
                node_type = 'drugbank' if drug_nodes and n in drug_nodes else 'uniprot'
                popups[n] = gs.buildNodePopup(n, node_type=node_type, attr_val=attr_desc)
            for u,v in pred_edges:
                popups[(u,v)] = gs.buildEdgePopup(u,v, node_labels=uniprot_to_gene, attr_val=attr_desc, evidence=evidence)
            G = gs.constructGraph(pred_edges, node_labels=uniprot_to_gene, graph_attr=graph_attr, popups=popups)
            
            # set of group nodes to add to the graph
            if kwargs.get('parent_nodes'):
                #group_nodes_to_add = [virus_group, krogan_group, drug_group, human_group]
                group_nodes_to_add = set(attr['parent'] for n, attr in graph_attr.items() if 'parent' in attr) 
                add_group_nodes(G, group_nodes_to_add, graph_attr, attr_desc)

            # TODO add an option to build the 'graph information' tab legend/info
            # build the 'Graph Information' metadata
            #desc = gs.buildGraphDescription(opts.edges, opts.net)
            desc = ''
            metadata = {'description':desc,'tags':kwargs.get('tags',[]), 'title':''}
            G.set_data(metadata)
            if 'graph_exp_name' in dataset:
                graph_exp_name = dataset['graph_exp_name']
            else:
                #graph_exp_name = config_utils.get_dataset_name(dataset) 
                graph_exp_name = "%s-%s" % (dataset['exp_name'].split('/')[-1], dataset['net_version'].split('/')[-1])
            graph_name = "%s-%s-k%s%s" % (
                alg, graph_exp_name, k_to_test[0], kwargs.get('name_postfix',''))
                #"test","", "")

            if kwargs.get('term_to_highlight'):
                graph_name += "-%s-%s" % (kwargs['term_to_highlight'][0], term_names[kwargs['term_to_highlight'][0]].replace(' ','-')[:25])
            if kwargs.get('node_to_post') is not None:
                graph_name += '-' + '-'.join(kwargs['node_to_post'])
            G.set_name(graph_name)
            # also set the legend
            G = set_legend(G)
            # write the posted network to a file if specified
            if kwargs.get('out_pref'):
                out_file = "%s%s.txt" % (kwargs['out_pref'], graph_name)
                os.makedirs(os.path.dirname(out_file), exist_ok=True)
                print("writing network to %s" % (out_file))
                # remove any newlines from the node name if they're there
                node_labels = {n: n.replace('\n','-') for n in G.nodes(data=False)}
                # TODO write the node data as well
                G2 = nx.relabel_nodes(G, node_labels, copy=True)
                nx.write_edgelist(G2, out_file)

            gs.post_graph_to_graphspace(
                    G, kwargs['username'], kwargs['password'], graph_name, 
                    apply_layout=kwargs['apply_layout'], layout_name=kwargs['layout_name'],
                    group=kwargs['group'], make_public=kwargs['make_public'])
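
A self-contained sketch of the edge-weight cutoff and node/edge counting used above, on a toy symmetric matrix; the explicit eliminate_zeros() call makes sure len(W2.data) counts only the surviving entries:

import numpy as np
from scipy import sparse

W = sparse.csr_matrix([[0.0, 0.9, 0.2],
                       [0.9, 0.0, 0.0],
                       [0.2, 0.0, 0.0]])
cutoff = 0.5
# zero out the edges at or below the cutoff, keeping the matrix sparse
W2 = sparse.csr_matrix(W.multiply((W > cutoff).astype(int)))
W2.eliminate_zeros()  # ensure no explicit zeros are counted as edges
num_nodes = np.count_nonzero(W2.sum(axis=0))  # nodes with at least one edge
num_edges = len(W2.data) // 2                 # symmetric: each edge is stored twice
print(num_nodes, num_edges)                   # -> 2 1
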
Example 5
def main(config_map, **kwargs):
    """
    *config_map*: everything in the config file
    *kwargs*: all of the options passed into the script
    """
    # extract the general variables from the config map
    input_settings, input_dir, output_dir, alg_settings, kwargs \
        = config_utils.setup_config_variables(config_map, **kwargs)
    #algs = config_utils.get_algs_to_run(alg_settings, **kwargs)
    #del kwargs['algs']
    for dataset in input_settings['datasets']:
        print("Loading data for %s" % (dataset['net_version']))
        # load the network and the positive examples for each term
        net_obj, ann_obj, eval_ann_obj = run_eval_algs.setup_dataset(
            dataset, input_dir, **kwargs)
        # find the shortest path(s) from each drug node to any virus node
        if kwargs.get('edge_weight_cutoff'):
            print("\tapplying edge weight cutoff %s" %
                  (kwargs['edge_weight_cutoff']))
            W = net_obj.W
            W = W.multiply((W > kwargs['edge_weight_cutoff']).astype(int))
            net_obj.W = W
            num_nodes = np.count_nonzero(
                W.sum(axis=0))  # count the nodes with at least one edge
            num_edges = (
                len(W.data) / 2
            )  # since the network is symmetric, the number of undirected edges is the number of entries divided by 2
            print("\t%d nodes and %d edges" % (num_nodes, num_edges))

    graph_attr = defaultdict(dict)
    attr_desc = defaultdict(dict)
    if kwargs.get('graph_attr_file'):
        graph_attr, attr_desc = gs.readGraphAttr(kwargs['graph_attr_file'])
    # load the namespace mappings
    uniprot_to_gene = None
    # also add the protein name
    uniprot_to_prot_names = None
    # these
    node_desc = defaultdict(dict)
    if kwargs.get('id_mapping_file'):
        df = pd.read_csv(kwargs['id_mapping_file'], sep='\t', header=0)
        ## keep only the first gene for each UniProt ID
        uniprot_to_gene = {
            p: genes.split(' ')[0]
            for p, genes in zip(df['Entry'], df['Gene names'].astype(str))
        }
        if 'Protein names' in df.columns:
            uniprot_to_prot_names = dict(
                zip(df['Entry'], df['Protein names'].astype(str)))
            node_desc = {
                n: {
                    'Protein names': uniprot_to_prot_names[n]
                }
                for n in uniprot_to_prot_names
            }

    # load human-virus ppis
    print("reading %s" % (kwargs['sarscov2_human_ppis']))
    df = pd.read_csv(kwargs['sarscov2_human_ppis'], sep='\t')
    edges = zip(df[df.columns[0]], df[df.columns[1]])
    #edges = [(v.replace("SARS-CoV2 ",""), uniprot_to_gene[h]) for v,h in edges]
    edges = [(v.replace("SARS-CoV2 ", ""), h) for v, h in edges]
    virus_nodes = [v for v, h in edges]
    krogan_nodes = [h for v, h in edges]
    virhost_edges = edges

    print("reading %s" % (kwargs['simplified_terms_file']))
    df = pd.read_csv(kwargs['simplified_terms_file'], sep='\t')
    kwargs['terms_to_highlight'] = df[df.columns[0]]
    term_names = dict(zip(df[df.columns[0]], df[df.columns[1]]))
    term_names = {
        t: name.replace('\\n', '\n')
        for t, name in term_names.items()
    }
    # each term's color comes from the term_colors column of the simplified terms file
    term_colors = dict(zip(df[df.columns[0]], df['term_colors']))
    print(df.head())

    # read the enrichment results
    df, term_ann = load_enrichment_file(**kwargs)
    # make a parent node per term in the simplified file
    # also get the term name, and the enrichment p-value(?)
    #term_names = dict(zip(df.index, df[('Description', 'Unnamed: 1_level_1', 'Unnamed: 1_level_2')]))
    pred_nodes = set()
    # setup their graph_attributes for posting
    covered_prots = set()
    prot_per_parent_term = defaultdict(set)
    # TODO need to order these correctly so that each term will have prots
    for i, t in enumerate(kwargs['terms_to_highlight']):
        name = term_names[t]
        color = term_colors[t]
        # UPDATE, make the terms be regular nodes
        graph_attr[name].update(group_node_styles)
        graph_attr[name]["background-opacity"] = 0.8
        graph_attr[name]['color'] = color
        graph_attr[name]['shape'] = 'ellipse'
        graph_attr[name]['text-valign'] = 'bottom'
        # add this node as the parent to the other nodes
        if not kwargs.get('virus_nodes'):
            #for n in set(term_ann[t]) - covered_prots:
            for n in set(term_ann[t]) & set(krogan_nodes):
                graph_attr[n]['parent'] = name
                graph_attr[n]['color'] = color
                pred_nodes.add(n)
                covered_prots.add(n)
                prot_per_parent_term[name].add(n)
        # TODO add the link
        # this is used to make the popup
        #attr_desc[('parent', name)] = t
    for parent, prots in prot_per_parent_term.items():
        print("%d prots for %s" % (len(prots), parent))
    # if the node list file is not specified, then set the annotated nodes as the node list
    #if not kwargs.get('node_list_file'):
    #    node_list = nodes

    # make the group node be a name, and the individual nodes be the terms(?)
    # for now, just leave the terms as regular nodes
    # make the size of the node be the number of proteins annotated
    #setup_term_nodes(term_ann, term_names)
    term_num_ann = {
        term_names[t]: len(ann)
        for t, ann in term_ann.items() if t in term_names
    }
    graph_attr = gs.set_node_size(
        term_names.values(),
        term_num_ann,
        graph_attr,
    )  # a=20, b=80, min_weight=None, max_weight=None)
    # add the number of ann to the popup, and the GO ID
    attr_desc.update({('# ann', t): num_ann
                      for t, num_ann in term_num_ann.items()})
    attr_desc.update({('GO ID', term_names[t]): t for t in term_names})

    krogan_attr = setup_krogan_group_nodes(virhost_edges,
                                           graph_attr=graph_attr,
                                           **kwargs)
    for n in krogan_attr:
        graph_attr[n].update(krogan_attr[n])
    # add an edge from each virus group node to the GO term nodes, and weight by...
    curr_pred_nodes, pred_edges = setup_virus_to_term_edges(
        net_obj, virhost_edges, term_ann, term_names)
    if not kwargs.get('virus_nodes'):
        pred_nodes.update(
            set([n for n in curr_pred_nodes if n in set(krogan_nodes)]))
    pred_nodes.update(
        set([t for t, h in pred_edges]).union(set([h for t, h in pred_edges])))
    # make sure all the terms are added as nodes
    pred_nodes.update(set(term_names.values()))

    # also add styles to the edges
    for e in pred_edges:
        graph_attr[e]['color'] = dark_gray
        graph_attr[e]['width'] = 5
        graph_attr[e]['opacity'] = 0.5
    graph_attr = gs.set_edge_width(pred_edges,
                                   pred_edges,
                                   graph_attr,
                                   a=kwargs.get('min_edge_width', 2),
                                   b=kwargs.get('max_edge_width', 12))

    #print(graph_attr['Q92769'])
    popups = {}
    for n in pred_nodes:
        if n in attr_desc:
            popups[n] = gs.buildNodePopup(n, attr_val=attr_desc)
            #popups[n] = gs.buildNodePopup(n, node_type=node_type, attr_val=attr_desc)
    node_labels = {}
    G = gs.constructGraph(pred_edges,
                          prednodes=pred_nodes,
                          node_labels=uniprot_to_gene,
                          graph_attr=graph_attr,
                          popups=popups)

    # set of group nodes to add to the graph
    #if kwargs.get('parent_nodes'):
    #group_nodes_to_add = [virus_group, krogan_group, drug_group, human_group]
    #group_nodes_to_add = set(attr['parent'] for n, attr in graph_attr.items() if 'parent' in attr)
    #add_group_nodes(G, group_nodes_to_add, graph_attr, attr_desc)
    desc = ''
    metadata = {
        'description': desc,
        'tags': kwargs.get('tags', []),
        'title': ''
    }
    G.set_data(metadata)
    if 'graph_exp_name' in dataset:
        graph_exp_name = dataset['graph_exp_name']
    else:
        #graph_exp_name = config_utils.get_dataset_name(dataset)
        graph_exp_name = "%s-%s" % (dataset['exp_name'].split('/')[-1],
                                    dataset['net_version'].split('/')[-1])
    #graph_name = "%s-k%s%s" % (
    #    graph_exp_name, k_to_test[0], kwargs.get('name_postfix',''))
    graph_name = "%s%s" % (graph_exp_name, kwargs.get('name_postfix', ''))
    #"test","", "")

    if kwargs.get('terms_to_highlight') is not None:
        graph_name += "-%sterms" % (len(kwargs['terms_to_highlight']))
    G.set_name(graph_name)
    # also set the legend
    #G = set_legend(G)
    # write the posted network to a file if specified
    if kwargs.get('out_pref'):
        out_file = "%s%s.txt" % (kwargs['out_pref'], graph_name)
        os.makedirs(os.path.dirname(out_file), exist_ok=True)
        print("writing network to %s" % (out_file))
        net_data = nx.node_link_data(G)
        net_data.update(G.get_style_json())
        #print(list(net_data.items())[:10])
        print("writing %s" % (out_file.replace('.txt', '.json')))
        with open(out_file.replace('.txt', '.json'), 'w') as outfile:
            json.dump(net_data, outfile, indent=4, sort_keys=True)
        # remove any newlines from the node name if they're there
        node_labels = {n: n.replace('\n', '-') for n in G.nodes(data=False)}
        # TODO write the node data as well
        G2 = nx.relabel_nodes(G, node_labels, copy=True)
        nx.write_edgelist(G2, out_file)
        sys.exit()

    # put the parent nodes and the nodes in the parent nodes in a grid layout automatically
    print("Setting the x and y coordinates of each node in a grid layout")
    # relabel the nodes to their names
    graph_attr = {
        uniprot_to_gene.get(n, n): attr
        for n, attr in graph_attr.items()
    }
    layout = gs_utils.grid_layout(G, graph_attr)
    for node, (x, y) in layout.items():
        G.set_node_position(node_name=node, x=x, y=y)

    print("%d nodes and %d edges to post" %
          (G.number_of_nodes(), G.number_of_edges()))
    gs.post_graph_to_graphspace(G,
                                kwargs['username'],
                                kwargs['password'],
                                graph_name,
                                apply_layout=kwargs['apply_layout'],
                                layout_name=kwargs['layout_name'],
                                group=kwargs['group'],
                                make_public=kwargs['make_public'])
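
The write-to-file branch above serializes the graph with networkx's node-link format before dumping it to JSON. A stripped-down sketch of that round trip (node IDs hypothetical):

import json
import networkx as nx

G = nx.Graph()
G.add_edge('P12345', 'Q67890')  # hypothetical UniProt-style node IDs

# node_link_data converts the graph into a JSON-serializable dict
net_data = nx.node_link_data(G)
with open('network.json', 'w') as outfile:
    json.dump(net_data, outfile, indent=4, sort_keys=True)

# the same dict can be loaded back into a graph
with open('network.json') as infile:
    G2 = nx.node_link_graph(json.load(infile))
assert G2.number_of_edges() == G.number_of_edges()
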
Example 6
def main(config_map, **kwargs):
    """
    *config_map*: everything in the config file
    *kwargs*: all of the options passed into the script
    """
    # extract the general variables from the config map
    input_settings, input_dir, output_dir, alg_settings, kwargs \
        = config_utils.setup_config_variables(config_map, **kwargs)

    # load the namespace mappings
    uniprot_to_gene = None
    if kwargs.get('id_mapping_file'):
        uniprot_to_gene = load_gene_names(kwargs.get('id_mapping_file'))
        kwargs['uniprot_to_gene'] = uniprot_to_gene

    genesets_to_test = config_map.get('genesets_to_test')
    if genesets_to_test is None or len(genesets_to_test) == 0:
        print("ERROR: no genesets specified to test for overlap. " +
              "Please add them under 'genesets_to_test'. \nQuitting")
        sys.exit()

    # first load the gene sets
    geneset_groups = {}
    for geneset_to_test in genesets_to_test:
        name = geneset_to_test['name']
        gmt_file = "%s/genesets/%s/%s" % (input_dir, name,
                                          geneset_to_test['gmt_file'])
        if not os.path.isfile(gmt_file):
            print("WARNING: %s not found. skipping" % (gmt_file))
            sys.exit()

        geneset_groups[name] = utils.parse_gmt_file(gmt_file)

    df = pd.read_csv(kwargs['prot_list_file'], sep='\t', header=None)
    prots_to_test = list(df[df.columns[0]])
    print("%d prots for which to test enrichment. (top 10: %s)" %
          (len(prots_to_test), prots_to_test[:10]))
    prot_universe = None
    # load the protein universe file
    if kwargs.get('prot_universe_file') is not None:
        df = pd.read_csv(kwargs['prot_universe_file'], sep='\t', header=None)
        prot_universe = df[df.columns[0]]
        print("\t%d prots in universe" % (len(prot_universe)))
        if kwargs.get('add_prot_list_to_prot_universe'):
            # make sure the list of proteins passed in are in the universe
            size_prot_universe = len(prot_universe)
            prot_universe = set(prots_to_test) | set(prot_universe)
            if len(prot_universe) != size_prot_universe:
                print(
                    "\t%d prots from the prots_to_test added to the universe" %
                    (len(prot_universe) - size_prot_universe))

    out_pref = kwargs.get('out_pref')
    if out_pref is None:
        out_pref = "outputs/enrichment/%s" % (os.path.basename(
            kwargs['prot_list_file']).split('.')[0])

    bp_df, mf_df, cc_df = run_clusterProfiler_GO(
        prots_to_test,
        out_pref,
        prot_universe=prot_universe,
        forced=kwargs.get('force_run'),
        **kwargs)
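
utils.parse_gmt_file is a repo utility not shown in this excerpt, but the GMT gene-set format itself is standard: one set per tab-separated line, with the set name, a description, and then the gene IDs. A minimal parser would look roughly like this sketch:

def parse_gmt_file(gmt_file):
    """Minimal GMT parser: each line is name, description, then gene IDs, tab-separated."""
    geneset_genes = {}
    with open(gmt_file) as f:
        for line in f:
            fields = line.rstrip('\n').split('\t')
            if len(fields) < 3:
                continue  # skip malformed lines
            geneset_genes[fields[0]] = set(fields[2:])
    return geneset_genes
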
Example 7
def main(config_map, **kwargs):
    """
    *config_map*: everything in the config file
    *kwargs*: all of the options passed into the script
    """
    # extract the general variables from the config map
    input_settings, input_dir, output_dir, alg_settings, kwargs \
        = config_utils.setup_config_variables(config_map, **kwargs)

    uniprot_to_gene = {}
    if kwargs.get('id_mapping_file'):
        print("Reading %s" % (kwargs['id_mapping_file']))
        df = pd.read_csv(kwargs['id_mapping_file'], sep='\t', header=0)
        ## keep only the first gene for each UniProt ID
        uniprot_to_gene = {
            p: genes.split(' ')[0]
            for p, genes in zip(df['Entry'], df['Gene names'].astype(str))
        }

    # # or we could get a distribution of distances for each virus node
    # # load human-virus ppis
    # df = pd.read_csv(kwargs['sarscov2_human_ppis'], sep='\t')
    # edges = zip(df[df.columns[0]], df[df.columns[1]])
    # edges = [(v.replace("SARS-CoV2 ",""), h) for v,h in edges]
    # virus_nodes = [v for v,h in edges]
    # krogan_nodes = [h for v,h in edges]
    # virhost_edges = edges

    # for each dataset, extract the path(s) to the prediction files,
    # read in the predictions, and test for the statistical significance of overlap
    for dataset in input_settings['datasets']:
        dataset_name = config_utils.get_dataset_name(dataset)
        print("Loading data for %s" % (dataset['net_version']))
        # load the network and the positive examples for each term
        net_obj, ann_obj, eval_ann_obj = run_eval_algs.setup_dataset(
            dataset, input_dir, **kwargs)
        prots, node2idx = net_obj.nodes, net_obj.node2idx
        print("\t%d total prots" % (len(prots)))
        # TODO this is used for the SARS-CoV-2 project,
        # but it should really be a general-purpose script
        # that works on any number of terms
        orig_pos_idx, _ = alg_utils.get_term_pos_neg(ann_obj.ann_matrix, 0)
        orig_pos = [prots[p] for p in orig_pos_idx]
        print("\t%d original positive examples" % (len(orig_pos)))

        # convert the krogan nodes and drugs to ids
        #drug_nodes_idx = [node2idx[d] for d in drug_nodes if d in node2idx]
        krogan_nodes_idx = [node2idx[n] for n in orig_pos if n in node2idx]

        # load the top predictions
        alg_pred_files = config_utils.get_dataset_alg_prediction_files(
            output_dir, dataset, alg_settings, ['genemaniaplus'], **kwargs)

        k = kwargs.get('k', 332)
        cutoff = kwargs.get('cutoff', 0.01)
        # get the alpha values to use
        alphas = alg_settings['genemaniaplus']['alpha']
        sig_cutoff = kwargs.get('stat_sig_cutoff')
        # suffix for output file names when a significance cutoff is applied;
        # defined up front since it's also used after the loop below
        sig_str = "-sig%s" % (str(sig_cutoff).replace('.', '_')) if sig_cutoff else ""
        alpha_frac_main_contr_nonnbrs = {}
        alpha_nodes_pos_nbr_dfsn = {}
        alpha_dfs = {}
        for alpha, alg in zip(alphas, alg_pred_files):
            print("Setting alpha=%s" % (alpha))
            pred_file = alg_pred_files[alg]
            if not os.path.isfile(pred_file):
                print("Warning: %s not found. skipping" % (pred_file))
                continue
            print("reading %s for alpha=%s" % (pred_file, alpha))
            df = pd.read_csv(pred_file, sep='\t')
            # remove the original positives
            df = df[~df['prot'].isin(orig_pos)]
            df.reset_index(inplace=True, drop=True)

            if sig_cutoff:
                df = config_utils.get_pvals_apply_cutoff(
                    df, pred_file, **kwargs)

            if k > len(df['prot']):
                print("ERROR: k %s > num predictions %s. Quitting" %
                      (k, len(df['prot'])))
                sys.exit()

            diff_mat_file = "%sdiffusion-mat-a%s.npy" % (
                net_obj.out_pref, str(alpha).replace('.', '_'))

            pred_scores = np.zeros(len(net_obj.nodes))
            df = df[:k]
            top_k_pred = df['prot']
            top_k_pred_idx = [net_obj.node2idx[n] for n in top_k_pred]
            pred_scores[top_k_pred_idx] = df['score'].values

            sig_str = "-sig%s" % (str(sig_cutoff).replace(
                '.', '_')) if sig_cutoff else ""
            out_pref = "outputs/viz/%s/%s/diffusion-analysis/cutoff%s-k%s-a%s%s" % (
                dataset['net_version'], dataset['exp_name'], cutoff, k, alpha,
                sig_str)

            M_inv = gm.get_diffusion_matrix(net_obj.W,
                                            alpha=alpha,
                                            diff_mat_file=diff_mat_file)

            frac_main_contr_nonnbrs, nodes_pos_nbr_dfsn = get_effective_diffusion_score(
                pred_scores,
                M_inv,
                net_obj,
                krogan_nodes_idx,
                alpha=alpha,
                diff_mat_file=diff_mat_file,
                out_pref=out_pref,
                **kwargs)
            # # exploratory analysis:
            # dist_main_contributors(pred_scores, M_inv, krogan_nodes_idx, k, net_obj.W, prots, uniprot_to_gene)

            alpha_frac_main_contr_nonnbrs[alpha] = frac_main_contr_nonnbrs
            # convert the node IDs to protein IDs
            nodes_pos_nbr_dfsn = {
                prots[n]: val
                for n, val in nodes_pos_nbr_dfsn.items()
            }
            alpha_nodes_pos_nbr_dfsn[alpha] = nodes_pos_nbr_dfsn
            alpha_dfs[alpha] = df

        #out_pref = "outputs/viz/%s/%s/diffusion-comp/%s-%s-rand-set-diffusion-comp.pdf" % (
        out_pref = "outputs/viz/%s/%s/diffusion-analysis/cutoff%s-k%s%s" % (
            dataset['net_version'], dataset['exp_name'], cutoff, k, sig_str)
        os.makedirs(os.path.dirname(out_pref), exist_ok=True)

        # now plot
        f, axes = plt.subplots(ncols=len(alphas),
                               figsize=(max([6, len(alphas) * 3]), 6),
                               sharey=True)
        if len(alphas) == 1:
            axes = [axes]
        for alpha, ax in zip(alphas, axes):
            plot_fracs(alpha_frac_main_contr_nonnbrs[alpha].values(),
                       alpha,
                       ax=ax,
                       cutoff=kwargs.get('cutoff', 0.5))
        #for i, alpha in enumerate(alphas):
        #    frac_main_contr_nonnbrs = frac_main_contr_nonnbrs_list
        out_file = "%s-effective-diffusion.pdf" % (out_pref)
        #print(out_file)
        # plt.savefig(out_file, bbox_inches='tight')
        plt.close()

        out_pref = "outputs/viz/%s/%s/diffusion-analysis/k%s%s" % (
            dataset['net_version'], dataset['exp_name'], k, sig_str)
        out_file = "%s-frac-nonnbr-dfsn.pdf" % (out_pref)

        df = pd.DataFrame(alpha_nodes_pos_nbr_dfsn)
        #print(df)
        print("median effective diffusion values:")
        print(df.median())
        ylabel = 'Fraction Non-Nbr Diffusion'
        plot_effective_diffusion(df,
                                 out_file,
                                 xlabel="Alpha",
                                 ylabel=ylabel,
                                 title="")

        ## also make a scatterplot of the nodes rank with the effective diffusion
        for alpha in alphas:
            pred_df = alpha_dfs[alpha]
            pred_df.set_index('prot', inplace=True)
            #alpha_diff_df = df[alpha]
            df['score'] = pred_df[pred_df.index.isin(df.index)]['score']
            #print(df.head())

            # out_file = "%s-alpha%s-eff-diff-by-score.pdf" % (out_pref, alpha)
            # plot_eff_diff_by_rank(df, alpha, out_file, xlabel="Score", ylabel="Effective Diffusion", title="Alpha=%s, k=%s" % (alpha, k))

            if kwargs.get('terms_file'):
                out_file = "%s-alpha%s-eff-diff-per-term.pdf" % (out_pref,
                                                                 alpha)
                plot_eff_diff_per_term(df[alpha],
                                       kwargs['terms_file'],
                                       out_file,
                                       title="Alpha=%s, k=%s" % (alpha, k))

            # TODO count the distance away of the main contributors
            # count the # krogan proteins each prediction is connected to(?)
            out_file = "%s-alpha%s-num-pos-nbrs.pdf" % (out_pref, alpha)
            plot_num_krogan_nbrs(df, alpha, krogan_nodes_idx, net_obj,
                                 out_file)
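
gm.get_diffusion_matrix caches the dense inverse that the effective-diffusion analysis scores against, but its exact kernel isn't shown in this excerpt. The sketch below assumes a regularized-Laplacian form, a common choice for GeneMANIA-style propagation, purely for illustration:

import numpy as np
from scipy import sparse
import scipy.sparse.linalg as spla

def diffusion_matrix_sketch(W, alpha=1.0):
    """Assumed kernel (I + alpha*L)^-1; gm.get_diffusion_matrix may differ."""
    W = sparse.csr_matrix(W)
    degrees = np.asarray(W.sum(axis=1)).flatten()
    L = sparse.diags(degrees) - W  # unnormalized graph Laplacian
    I = sparse.eye(W.shape[0])
    # a dense inverse is fine for small networks; prefer linear solves at scale
    return spla.inv((I + alpha * L).tocsc())

With such a matrix, propagating from a 0/1 vector y of positives reduces to M_inv @ y, giving each node a diffusion score.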