def write_global_rec_tfs(self, rec_tfs_file, out_file):
        """ write the complete list of receptors and tfs from the intput file that are in the interactome
        """
        print("\t Creating the '%s' file set of receptors and tfs from %s" %
              (out_file, rec_tfs_file))
        if not self.net:
            self.net = self.build_network(self.interactome)

        # first get the list of all human rec and tfs
        receptors, tfs = t_utils.getRecTFs(rec_tfs_file)

        # all of the rec and tfs should be in the interactome AND rec have outgoing edges, tfs incoming edges
        # No protein should be both a rec and tf
        # remove receptors and tfs that aren't in the interactome
        receptors = set([
            r for r in receptors
            if r in self.net and len(self.net.out_edges(r)) > 0
        ])
        tfs = set([
            tf for tf in tfs
            if tf in self.net and len(self.net.in_edges(tf)) > 0
        ])
        # resolve proteins that appear in both sets: drop them from the receptors (they are kept as tfs)
        receptors = set([r for r in receptors if r not in tfs])
        tfs = set([tf for tf in tfs if tf not in receptors])

        # now write the output file
        t_utils.checkDir(os.path.dirname(out_file))
        t_utils.writeRecTFs(out_file, receptors, tfs)
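# A minimal, self-contained sketch of the filtering in write_global_rec_tfs
# above, using a toy networkx DiGraph (node names here are made up; t_utils
# and the real interactome are not needed):
import networkx as nx

def _filter_rec_tfs_sketch(net, receptors, tfs):
    """Keep receptors with outgoing edges and tfs with incoming edges;
    a protein in both sets is dropped from receptors and kept as a tf."""
    receptors = {r for r in receptors if r in net and net.out_degree(r) > 0}
    tfs = {t for t in tfs if t in net and net.in_degree(t) > 0}
    receptors = {r for r in receptors if r not in tfs}
    return receptors, tfs

# e.g. R1 -> X -> T1, with P1 assayed as both a receptor and a tf
_g = nx.DiGraph([("R1", "X"), ("X", "T1"), ("X", "P1"), ("P1", "X")])
print(_filter_rec_tfs_sketch(_g, {"R1", "P1"}, {"T1", "P1"}))
# -> ({'R1'}, {'P1', 'T1'}) (set order may vary)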
# Example 2
def call_post_to_graphspace(version, chemicals, **kwargs):
    INPUTSPREFIX, RESULTSPREFIX, interactome = t_settings.set_version(version)
    t_utils.checkDir("%s/graphspace" % (RESULTSPREFIX))
    # write the color files of each chemical and return a dictionary of the chemical and its color file
    #if kwargs['revigo_colors']:
    chemical_color_files = None
    if kwargs.get('revigo_file') or kwargs.get('term_counts_file'):
        chemical_color_files, function_colors = write_revigo_color_files(
            chemicals, RESULTSPREFIX, forced=kwargs['forcepost'], **kwargs)
        kwargs['function_colors'] = function_colors
        print(chemical_color_files)
    kwargs['tags'] = kwargs['tags'] + [version] if kwargs.get('tags') else [
        version
    ]
    # post everything to graphspace!
    for chemical in chemicals:
        # get the chemical name. make sure it doesn't have any '%' or '[',']' as that will break posting
        chemName = chemIDtoName[chemical].replace('%', '').replace('[', '').replace(']', '')
        rec_tfs_file = t_settings.REC_TFS_FILE % (INPUTSPREFIX, chemical)
        edgelinker_output_file = "%s/edgelinker/%s-paths.txt" % (RESULTSPREFIX,
                                                                 chemical)
        output_json = "%s/graphspace/%s-graph%s.json" % (
            RESULTSPREFIX, chemical, kwargs.get('name_postfix', ''))
        proteins, num_paths = t_utils.getProteins(paths=edgelinker_output_file,
                                                  max_k=kwargs['k_to_post'],
                                                  ties=True)
        if not kwargs['forcepost'] and os.path.isfile(output_json):
            print("%s already exists. Use --forcepost to overwrite it" %
                  (output_json))
        else:
            build_graph_and_post(
                version,
                interactome,
                rec_tfs_file,
                RESULTSPREFIX,
                edgelinker_output_file,
                chemical,
                max_k=num_paths,
                graph_name="%s-%s-%s%s" %
                (chemName, chemical, version, kwargs.get('name_postfix', '')),
                #name_postfix='-'+version, tag=version, chemical_color_file=)
                graph_attr_file=chemical_color_files.get(chemical)
                if chemical_color_files is not None else None,
                ev_file=kwargs['evidence_file'],
                out_pref="%s/graphspace/%s%s" %
                (RESULTSPREFIX, chemical, kwargs.get('name_postfix', '')),
                **kwargs)
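# Quick check of the graph-name sanitizing above ('%', '[' and ']' break
# posting; the chemical name is a made-up example):
print("Bisphenol A [50%]".replace('%', '').replace('[', '').replace(']', ''))
# -> 'Bisphenol A 50'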
    def write_assayed_rec_tfs(self, rec_tfs_files, out_file):
        """ write the complete list of receptors and tfs from the intput file that are in the interactome
        Should only be called after each chemical's list of rec and tfs have been written
        """
        print(
            "\tCreating the '%s' file set of receptors and tfs from %d rec_tfs_files"
            % (out_file, len(rec_tfs_files)))
        if not self.net:
            self.net = self.build_network(self.interactome)

        receptors = set()
        tfs = set()
        for rec_tfs_file in rec_tfs_files:
            # first get all of the receptors and tfs
            rec, t = t_utils.getRecTFs(rec_tfs_file)
            receptors.update(set(rec))
            tfs.update(set(t))

        # all of the rec and tfs should be in the interactome
        # No protein should be both a rec and tf
        # remove receptors and tfs that aren't in the interactome
        orig_len_rec = len(receptors)
        orig_len_tfs = len(tfs)
        receptors = set([
            r for r in receptors
            if r in self.net and len(self.net.out_edges(r)) > 0
        ])
        tfs = set([
            tf for tf in tfs
            if tf in self.net and len(self.net.in_edges(tf)) > 0
        ])
        print("\tRemoved %d recptors and %d tfs not in the interactome" %
              (orig_len_rec - len(receptors), orig_len_tfs - len(tfs)))
        # resolve proteins that appear in both sets: drop them from the receptors (they are kept as tfs)
        orig_len_rec = len(receptors)
        orig_len_tfs = len(tfs)
        receptors = set([r for r in receptors if r not in tfs])
        tfs = set([tf for tf in tfs if tf not in receptors])
        print(
            "\tRemoved %d recptors and %d tfs that were in both the receptors and tfs sets"
            % (orig_len_rec - len(receptors), orig_len_tfs - len(tfs)))

        # now write the output file
        t_utils.checkDir(os.path.dirname(out_file))
        t_utils.writeRecTFs(out_file, receptors, tfs)
    def write_all_rec_tfs(self, out_file):
        """ write the complete list of receptors and tfs perturbed by any chemical as well as the assay name
        """
        print("\t Creating the '%s' file set of receptors and tfs" %
              (out_file))
        if not self.net:
            self.net = self.build_network(self.interactome)

        t_utils.checkDir(os.path.dirname(out_file))
        out = open(out_file, 'w')
        out.write("#uniprot_acc\tnode_type\tassay_name\n")
        for r in self.receptor_assays:
            for acc in self.assayNametoAccListHuman[r]:
                if acc in self.net:
                    out.write(acc + '\t' + 'receptor' + '\t' + r + '\n')
        for tf in self.tf_assays:
            for acc in self.assayNametoAccListHuman[tf]:
                if acc in self.net:
                    out.write(acc + '\t' + 'tf' + '\t' + tf + '\n')
        out.close()
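# The file written above is a 3-column TSV with a '#'-commented header
# (uniprot_acc, node_type, assay_name). A minimal sketch of reading it back
# with pandas (the helper name is hypothetical):
import pandas as pd

def read_all_rec_tfs_sketch(rec_tfs_file):
    df = pd.read_csv(rec_tfs_file, sep='\t', comment='#',
                     names=["uniprot_acc", "node_type", "assay_name"])
    receptors = set(df.loc[df["node_type"] == "receptor", "uniprot_acc"])
    tfs = set(df.loc[df["node_type"] == "tf", "uniprot_acc"])
    return receptors, tfs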
# Example 5
def load_prots(chemicals, paths_dir, out_dir, k_limit=200, **kwargs):
    if kwargs['run_on_hits']:
        chem_prots = toxcast_data.chemical_protein_hit
        reports_dir = "%s/chemical-hits-reports/" % (out_dir)
    else:
        # one chemical and protein ID pair on each line
        prots_file = "%s/chem-prots.txt" % (out_dir)
        if not kwargs['forced'] and os.path.isfile(prots_file):
            print("reading %s. Use --forced to overwrite" % (prots_file))
            # .squeeze("columns") replaces the deprecated read_csv squeeze=True
            s = pd.read_csv(prots_file,
                            sep='\t',
                            index_col=0,
                            header=None).squeeze("columns")
            # now convert it back to a dictionary
            chem_prots = {
                chem: prots.to_list()
                for chem, prots in s.groupby(s.index)
            }
        else:
            # load the proteins in each chemical's network
            edgelinker_output = paths_dir + '/%s-paths.txt'
            print("Reading paths for each chemical network from: %s" %
                  (edgelinker_output))
            chem_prots = {}
            for chemical in chemicals:
                proteins = t_utils.getProteins(paths=edgelinker_output %
                                               chemical,
                                               max_k=k_limit)
                chem_prots[chemical] = list(proteins)
            s = pd.Series(chem_prots).explode()
            print("writing %s" % (prots_file))
            s.to_csv(prots_file, sep='\t', header=False)

        reports_dir = "%s/chemical-reports/" % (out_dir)
    t_utils.checkDir(os.path.dirname(reports_dir))
    return chem_prots, reports_dir
    # first plot the distribution of edge weights in the response networks
    # get all of the response network edges and their edge weight in the interactome
    network_weights = []
    cyclinker_file = "%s/cyclinker/%%s-paths.txt" % (t_settings.RESULTSPREFIX)
    for chemical in tqdm(chemicals):
        edges = t_utils.getEdges(paths_file=cyclinker_file % (chemical),
                                 max_k=200,
                                 ties=True)
        #tqdm.write("%d edges" % len(edges))
        for edge in edges:
            network_weights.append(edge_weights[edge])

    # now plot the distribution of edge weights
    #out_file_name = "response-network-weight-dist-dir-k500-%s.png" % (version)
    out_file_name = "response-network-weight-dist-k200-%s.png" % (version)
    out_dir = "%s/plots/edge-weights/" % (t_settings.RESULTSPREFIX)
    t_utils.checkDir(out_dir)
    out_file = "%s/%s" % (out_dir, out_file_name)
    if opts.compare_versions:
        out_dir_compare_versions = "viz/version_plots/edge-weights/"
        t_utils.checkDir(out_dir_compare_versions)
        out_file_compare_versions = "%s/%s" % (out_dir_compare_versions,
                                               out_file_name)

    print("Plotting response network edge weight histogram to %s" % (out_file))

    fig, ax = plt.subplots()
    print("%s of edges with a weight > %s" %
          (len([w for w in network_weights if w > 0.95]) /
           float(len(network_weights)), 0.95))

    ax.hist(network_weights, bins=30)
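    # The fragment ends before the figure is written; a hedged completion
    # (the axis labels are assumptions, out_file is built above):
    ax.set_xlabel("edge weight")
    ax.set_ylabel("# of edges")
    fig.savefig(out_file)
    if opts.compare_versions:
        fig.savefig(out_file_compare_versions)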
# Example 7
def main(chemicals,
         paths_dir,
         out_dir,
         k_limit=200,
         forced=False,
         pval_cutoff=0.05,
         corr_type="BF",
         **kwargs):

    t_utils.checkDir(out_dir)

    # one chemical and protein ID pair on each line
    prots_file = "%s/chem-prots.txt" % (out_dir)

    if kwargs['run_on_hits']:
        toxcast_data = t_utils.loadToxcastData()
        chem_prots = toxcast_data.chemical_protein_hit
        reports_dir = "%s/chemical-hits-reports/" % (out_dir)
    else:
        if not forced and os.path.isfile(prots_file):
            print("reading %s. Use --forced to overwrite" % (prots_file))
            # .squeeze("columns") replaces the deprecated read_csv squeeze=True
            s = pd.read_csv(prots_file,
                            sep='\t',
                            index_col=0,
                            header=None).squeeze("columns")
            # now convert it back to a dictionary
            chem_prots = {
                chem: prots.to_list()
                for chem, prots in s.groupby(s.index)
            }
        else:
            # load the proteins in each chemical's network
            cyclinker_output = paths_dir + '/%s-paths.txt'
            print("Reading paths for each chemical network from: %s" %
                  (cyclinker_output))
            chem_prots = {}
            for chemical in chemicals:
                proteins = t_utils.getProteins(paths=cyclinker_output %
                                               chemical,
                                               max_k=k_limit)
                chem_prots[chemical] = list(proteins)
            s = pd.Series(chem_prots).explode()
            print("writing %s" % (prots_file))
            s.to_csv(prots_file, sep='\t', header=False)

        reports_dir = "%s/chemical-reports/" % (out_dir)
    t_utils.checkDir(os.path.dirname(reports_dir))
    # run the DAVID analysis on each of them
    client = None
    # the client is set up once (for the first chemical) and reused after that
    for chem in tqdm(chemicals):
        chart_file = "%s/%s.txt" % (reports_dir, chem)
        if not forced and os.path.isfile(chart_file):
            print("%s already exists. Use --forced to overwrite" %
                  (chart_file))
            continue
        if client is None:
            print("Setting up david client")
            client = david_client.DAVIDClient()
            client.set_category('GOTERM_BP_DIRECT')
        print(chem)
        prots = chem_prots[chem]
        # pass the list of proteins
        client.setup_inputs(','.join(prots),
                            idType='UNIPROT_ACCESSION',
                            listName=chem)
        # make sure we're using the right list
        #print(client.client.service.getCurrentList())
        # build the functional annotation chart
        client.build_functional_ann_chart()

        # and write each to a file
        #print("writing %s" % (chart_file))
        client.write_functional_ann_chart(chart_file)

    pval_col = "Pvalue"
    if corr_type == "BF":
        pval_col = "Bonferroni"
    elif corr_type == "BH":
        pval_col = "Benjamini"
    # now read each of them and write a combined file
    dfs = []
    for chem in chemicals:
        chart_file = "%s/%s.txt" % (reports_dir, chem)
        if not os.path.isfile(chart_file):
            print("%s doesn't exist. Skipping" % (chart_file))
            continue
        df = pd.read_csv(chart_file, sep='\t')
        # apply the p-value cutoff
        df = df[df[pval_col] < pval_cutoff]
        df = df[['Term', pval_col]]
        # split the name and id
        df['GOID'] = df['Term'].apply(lambda x: x.split('~')[0])
        df['Term'] = df['Term'].apply(lambda x: x.split('~')[1])
        df['Chemical'] = chem
        print(len(df))
        dfs.append(df)

    df_all = pd.concat(dfs)
    print(df_all.head())
    all_terms_file = "%s/%schemical%s-sig-terms-%s-c%s.tsv" % (
        out_dir, len(chemicals), "-hits" if kwargs['run_on_hits'] else "s",
        pval_col.lower(), str(pval_cutoff).replace('.', '_'))
    df_all.to_csv(all_terms_file,
                  sep='\t',
                  index=False,
                  columns=['Chemical', 'Term', 'GOID', pval_col])

    # now compare the overlap of the enriched terms!
    #df_all.groupby('Term').value_counts()
    counts = df_all[['Term', 'GOID']].value_counts()
    print(counts)
    counts_file = "%s/%schemicals-sig-terms-%s-c%s-counts.tsv" % (
        out_dir, len(chemicals), pval_col.lower(), str(pval_cutoff).replace(
            '.', '_'))
    print("writing to %s" % (counts_file))
    counts.to_csv(counts_file, header=False, sep='\t')
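# The DAVID chart's 'Term' column packs "GOID~name"; a quick illustration of
# the split used above (the term string is a real GO term used as an example):
term = "GO:0006915~apoptotic process"
goid, name = term.split('~')
print(goid, '|', name)   # -> GO:0006915 | apoptotic process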
    def write_chemical_perturbed_rec_tfs(self,
                                         chemicals_file,
                                         rec_tfs_dir,
                                         include_zscore_weight=False):
        """ write the chemicals file as well as the perturbed rec and tfs for each chemical
        We are writing a single file for each chemical so we can use each file for running pathlinker/cyclinker
        """
        print(
            "\tWriting the chemicals file (%s) as well as the perturbed rec and tfs for each chemical in %s"
            % (chemicals_file, rec_tfs_dir))
        t_utils.checkDir(rec_tfs_dir)

        # first write a general file with the hit rec and tf per chemical
        out_file = "%s/../chem_rec_tfs.gmt" % (rec_tfs_dir)
        self.write_chem_rec_tfs(out_file)
        # also write a table with the number of hit rec and tfs per chemical
        out_file = "%s/../chemical_num_rectfs.txt" % (rec_tfs_dir)
        self.write_chem_num_rectfs(out_file)

        # keep track of all of the receptors and tfs and write them to a file as well
        all_rec = set()
        all_tfs = set()

        chemicals = self.chemical_rec.keys()

        # first write the chemicals
        with open(chemicals_file, 'w') as out:
            out.write('\n'.join([
                "%s\t%s" % (chemical, self.chemIDtoName[chemical])
                for chemical in chemicals
            ]))

        if include_zscore_weight is True:
            zscores = []
            for chem, prots in self.chemical_rec.items():
                zscores += [
                    self.chemical_protein_zscore[chem][p] for p in prots
                ]
            for chem, prots in self.chemical_tfs.items():
                zscores += [
                    self.chemical_protein_zscore[chem][p] for p in prots
                ]
            # use the maximum to normalize the zscores
            max_zscore = max(zscores)
            print("max zscore is: %0.2f" % max_zscore)

        for chem in tqdm(chemicals):
            # some of the self.chemicals have spaces in their names, so use the ID rather than the name.
            rec = set(self.chemical_rec[chem])
            tfs = set(self.chemical_tfs[chem])
            all_rec.update(rec)
            all_tfs.update(tfs)
            if include_zscore_weight is False:
                t_utils.writeRecTFs(
                    "%s/%s-rec-tfs.txt" % (rec_tfs_dir, chem), rec, tfs)
            else:
                # convert the zscore to a cost by taking 1 - (zscore / max zscore)
                # the lower the zscore, the higher the cost will be
                zscores = {}
                curr_zscores = self.chemical_protein_zscore[chem]
                for prots in (self.chemical_rec[chem],
                              self.chemical_tfs[chem]):
                    for p in prots:
                        zscore = curr_zscores[p]
                        zscores[p] = zscore if not pd.isnull(zscore) else 0
                costs = {
                    p: 1 - (zscore / float(max_zscore))
                    for p, zscore in zscores.items()
                }
                t_utils.writeRecTFs("%s/%s-rec-tfs.txt" % (rec_tfs_dir, chem),
                                    rec,
                                    tfs,
                                    costs=costs,
                                    zscores=zscores)

        out_file = "%s/all-rec-tfs.txt" % (rec_tfs_dir)
        print("Writing all of the assayed receptors and tfs to the file: %s" %
              (out_file))
        t_utils.writeRecTFs(out_file, all_rec, all_tfs)
    def __init__(self,
                 include_nuclear_receptors=False,
                 forced=False,
                 verbose=False):
        # boolean value to either include or exclude nuclear receptors
        self.include_nuclear_receptors = include_nuclear_receptors
        # option to print various parsing statistics
        self.verbose = verbose
        self.forced = forced
        # inputs dir
        # 2019-08: Updating to use toxcast v3 data
        self.input_dir = "inputs/toxcast-tox21-v3"
        # date present in the file names of the files for this version
        self.version_date = "190708"

        # input files
        self.assay_file = "%s/Assay_Summary_%s.csv" % (self.input_dir,
                                                       self.version_date)
        self.s2_assay_file = "%s/S2-ToxCast-Assays.tsv" % (self.input_dir)
        self.chemical_summary_file = "%s/Chemical_Summary_%s.csv" % (
            self.input_dir, self.version_date)
        self.zscore_file = "%s/zscore_Matrix_%s.csv" % (self.input_dir,
                                                        self.version_date)
        self.hitc_file = "%s/hitc_Matrix_%s.csv" % (self.input_dir,
                                                    self.version_date)
        #self.chemical_types_file = "%s/chemical_types.tsv" % (self.input_dir)

        # output files
        self.parsed_dir = "%s/parsed" % (self.input_dir)
        t_utils.checkDir(self.parsed_dir)
        self.chem_rec_tfs_file = "%s/chem_rec_tfs.gmt" % (self.parsed_dir)
        self.chem_hits_file = "%s/chem_prot_hits.csv" % (self.parsed_dir)
        self.chem_zscore_file = "%s/chem_prot_zscores.csv" % (self.parsed_dir)

        # mapping dictionaries
        self.chemIDtoName = {}
        self.chemNametoID = {}
        self.chemIDtoTYPE = {}
        self.chemTYPEtoID = defaultdict(set)
        self.assayNametoAccHuman = {}
        self.assayAcctoNameHuman = defaultdict(set)
        self.assayNametoAccListHuman = {}
        self.assayNametoType = {}

        # assay type analysis
        self.assay_types = []
        # key is the assay type.
        # each assay type has a list of the proteins perturbed for each chemical
        self.assay_type_hits = {}
        # key is chemical ID, assay results list (0,1,-1 or NA) is the value
        self.chemical_assay_hits = {}
        # key is chemical ID, list of z-scores is the value
        self.chemical_assay_zscores = {}
        # list of assays in the hits file
        self.hit_assays = []
        # these next two are from the Assay_Summary or S2_Assay_Summary
        # key is assay, value is type_sub
        self.intended_target_type_sub = {}
        # key is assay, value is family
        self.intended_target_family = {}
        # receptor and tf assays
        # key is assay, value is acc
        self.receptor_assays = {}
        self.tf_assays = {}

        # the sets of hit receptors and TFs (from the Assay summary file) for each chemical
        self.chemical_rec = defaultdict(set)
        self.chemical_tfs = defaultdict(set)
        # proteins stored as uniprot accession IDs
        # prot: 0 or 1. Each protein is labelled a 'hit' if any of the assays are 'hit'
        self.chemical_protein_hit = defaultdict(dict)
        # prot: zscore. This is the largest zscore value of any of the hit (1) assays
        self.chemical_protein_zscore = defaultdict(dict)
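# Worked example of the zscore-to-cost conversion used in
# write_chemical_perturbed_rec_tfs above (toy numbers): the largest zscore
# maps to cost 0, and a missing (NaN -> 0) zscore maps to cost 1.
max_zscore = 4.0
for z in (4.0, 2.0, 0.0):
    print(z, "->", 1 - z / float(max_zscore))   # 0.0, 0.5, 1.0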
# Example 10
def write_revigo_color_files(chemicals, RESULTSPREFIX, **kwargs):
    """
    If a file downloaded from REVIGO is passed in, then use that to set the term colors and boxes.
    Otherwise, just remove the most frequent term. Hopefully there isn't too much overlap in the remaining terms.
        TODO Another possible strategy is to cluster the terms myself and select a single term per cluster.
    """

    # assign a color to each term
    out_dir = "%s/graphspace/colors" % (RESULTSPREFIX)
    t_utils.checkDir(out_dir)
    #print("Writing REVIGO colors to %s for %d chemicals. (limit of %d colors)" % (out_dir, len(chemicals), len(colors)))

    chem_color_files = {}

    for chemical in chemicals:
        out_prefix = "%s/%s" % (out_dir, chemical)
        out_file = "%s-colors.tsv" % (out_prefix)
        # first read the david results file
        david_file = "%s/stats/go-analysis/chemical-reports/%s.txt" % (
            RESULTSPREFIX, chemical)
        print("reading %s" % (david_file))
        df = pd.read_csv(david_file, sep='\t')
        print(df.head())
        # get just the term ids
        df['term'] = df['Term'].apply(lambda x: x[:x.find('~')])
        df['name'] = df['Term'].apply(lambda x: x.split('~')[1])
        # build a dictionary from the term to the prots
        orig_term_names = dict(zip(df['term'], df['name']))
        term_prots = {
            t: prots.replace(', ', '|')
            for t, prots in zip(df['term'], df['Genes'])
        }
        term_pvals = dict(zip(df['term'], df['Bonferroni']))
        term_names = dict(zip(df['term'], df['name']))
        name_to_term = dict(zip(df['name'], df['term']))

        # read the revigo file and extract the GO term info
        if kwargs.get('revigo_file'):
            if not os.path.isfile(kwargs['revigo_file']):
                print("ERROR: --revigo-file '%s' not found." %
                      (kwargs['revigo_file']))
                sys.exit()
            print("reading %s" % (kwargs['revigo_file']))
            df_r = pd.read_csv(kwargs['revigo_file'], sep=',')
            print(df_r.head())
            # sort by pval
            #df_r = df_r.sort_values("log10 p-value")
            term_names = dict(zip(df_r['term_ID'], df_r['description']))
            selected_terms = list(term_names.keys())
        elif kwargs.get('term_counts_file'):
            if not os.path.isfile(kwargs['term_counts_file']):
                print("ERROR: --term-counts-file '%s' not found." %
                      (kwargs['term_counts_file']))
                sys.exit()
            print("reading %s" % (kwargs['term_counts_file']))
            df_r = pd.read_csv(kwargs['term_counts_file'],
                               sep='\t',
                               names=['term_name', 'count'])
            print(df_r.head())
            freq_cutoff = kwargs.get('freq_cutoff', .75)
            print("applying a frequency cutoff of %s" % (freq_cutoff))
            df_r['freq'] = df_r['count'] / df_r['count'].max()
            term_freq = dict(zip(df_r['term_name'], df_r['freq']))
            # sort by pval
            df = df.sort_values("Bonferroni")
            # apply a cutoff of 0.01
            df = df[df['Bonferroni'] < 0.01]
            selected_terms = []
            for name in df['name']:
                if term_freq[name] < freq_cutoff:
                    selected_terms.append(name_to_term[name])

        term_popups = {}
        link_template = "<a style=\"color:blue\" href=\"https://www.ebi.ac.uk/QuickGO/GTerm?id=%s\" target=\"DB\">%s</a>"
        for term in selected_terms:
            term_link = link_template % (term, term)
            popup = "<b>QuickGO</b>: %s" % (term_link)
            popup += "<br><b>p-value</b>: %0.2e" % (float(term_pvals[term]))
            term_popups[term] = popup

        function_colors = write_colors_file(out_file, selected_terms,
                                            term_names, term_prots,
                                            term_popups)
        chem_color_files[chemical] = out_file

        new_func_colors = defaultdict(dict)
        for term in function_colors:
            new_func_colors[term]['prots'] = term_prots[term]
            new_func_colors[term]['color'] = function_colors[term]
        new_func_colors[term]['link'] = "https://www.ebi.ac.uk/QuickGO/GTerm?id=%s" % (term)
            new_func_colors[term]['name'] = orig_term_names[term]
            # if uid in pathway_colors[pathway]['prots']:
            #     pathway_link = '<a style="color:%s" href="%s">%s</a>' % (pathway_colors[pathway]['color'], pathway_colors[pathway]['link'], pathway)
    return chem_color_files, new_func_colors
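# Toy illustration of the term-frequency filter above: terms enriched in a
# large fraction of chemicals (freq >= cutoff) are dropped as uninformative.
# The counts here are made up:
term_counts = {"response to stress": 90, "axon guidance": 5}
max_count = max(term_counts.values())
freq_cutoff = 0.75
print([t for t, c in term_counts.items() if c / float(max_count) < freq_cutoff])
# -> ['axon guidance']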
# Example 11
def main(chemicals,
         paths_dir,
         out_dir,
         pval_cutoff=0.05,
         corr_type="BF",
         **kwargs):

    global toxcast_data, uniprot_to_gene
    toxcast_data = t_utils.loadToxcastData()
    chemIDtoCAS, chemCAStoID = get_chemical_map(toxcast_data)
    uniprot_to_gene_df = pd.read_csv(kwargs['mapping_file'],
                                     sep='\t',
                                     header=None)
    uniprot_to_gene = dict(uniprot_to_gene_df.values)
    t_utils.checkDir(out_dir)

    chem_prots, reports_dir = load_prots(chemicals, paths_dir, out_dir,
                                         **kwargs)

    ctd_genes, ctd_chem_itxs = load_ctd_data(kwargs['ctd_file'], chemCAStoID)

    # To get the background set of genes for the hypergeometric test,
    # get the proteins that are both in CTD and in the interactome
    print("reading %s" % (kwargs['interactome']))
    df = pd.read_csv(kwargs['interactome'], sep='\t', comment='#', header=None)
    ppi_prots = set(df[0]) | set(df[1])
    # map both to the gene name space
    ppi_genes = set(uniprot_to_gene[p] for p in ppi_prots)
    print("\t%d interactome_genes" % (len(ppi_genes)))

    background_genes = ppi_genes & ctd_genes
    print("%d genes both in the interactome and in CTD" %
          (len(background_genes)))
    print(
        "limiting CTD phosphorylation interactions to those in the interactome"
    )
    ctd_chem_itxs = {c: p & ppi_genes for c, p in ctd_chem_itxs.items()}
    # also add the other chemicals
    for c in chemicals:
        if c not in ctd_chem_itxs:
            ctd_chem_itxs[c] = set()

    chem_pval = {}
    chem_net_prots_with_ctd = {}
    # TODO try both making random subsets and the hypergeometric test
    pop_size = len(background_genes)
    #num_success_states_in_pop = len(set(p for c,p in ctd_itxs.items()))
    for chem, prots in chem_prots.items():
        genes = set(uniprot_to_gene[p] for p in prots)
        #if len(genes) != len(prots):
        #    print("Warning: %s: num genes != num prots! (%s, %s)" % (chem, len(genes), len(prots)))
        num_genes_with_ctd = len(genes & ctd_chem_itxs[chem])
        chem_net_prots_with_ctd[chem] = num_genes_with_ctd
        # number of draws is the # genes in the network
        num_draws = len(genes)
        # number of successes is the # genes in the network with a CTD interaction
        num_successes = num_genes_with_ctd
        # number of success states in the population is the number of phosphorylation interactions of this chemical
        num_success_states_in_pop = len(ctd_chem_itxs[chem])
        M, n, N, k = pop_size, num_success_states_in_pop, num_draws, num_successes
        # Use k-1 since the survival function (sf) gives 1-cdf. The cdf at k gives the probability of drawing k or fewer. The sf at k is the probability of drawing k+1 or more
        # https://blog.alexlenail.me/understanding-and-implementing-the-hypergeometric-test-in-python-a7db688a7458
        # https://github.com/scipy/scipy/issues/7837
        pval = hypergeom.sf(k - 1, M, n, N)
        chem_pval[chem] = pval

    # now write to a file
    out_file = "%s/CTD-stat-sig.tsv" % (out_dir)
    print("writing %s" % (out_file))
    with open(out_file, 'w') as out:
        header_line = '\t'.join([
            "ChemID", "ChemName", "# net prots", "# CTD phospho prots",
            "# overlap", "pval", "BF corr-pval"
        ])
        out.write(header_line + '\n')
        for chem, prots in chem_prots.items():
            name = toxcast_data.chemIDtoName[chem]
            out.write('\t'.join(
                str(x) for x in [
                    chem, name,
                    len(prots),
                    len(ctd_chem_itxs[chem]), chem_net_prots_with_ctd[chem],
                    chem_pval[chem], chem_pval[chem] * len(chemicals)
                ]) + '\n')
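# Sanity check of the hypergeometric call above with toy numbers:
# hypergeom.sf(k - 1, M, n, N) is P(X >= k), the chance of seeing at least k
# CTD-overlap genes in a network of N genes drawn from M background genes,
# n of which have a CTD interaction.
from scipy.stats import hypergeom

M, n, N, k = 100, 20, 10, 5   # toy values
pval = hypergeom.sf(k - 1, M, n, N)
tail = sum(hypergeom.pmf(x, M, n, N) for x in range(k, min(n, N) + 1))
assert abs(pval - tail) < 1e-12
print(pval)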
def get_summary_stats(version="2018_01-toxcast-d2d-p1_5-u1_25",
                      summary_file="network_summaries.csv",
                      scope="permute-dir-undir",
                      forced=False):
    """ Function to aggregate summary statistics for every network
    returns a dataframe containing the counted metrics for each chemical
    """
    TOXCAST_DATA = t_utils.loadToxcastData(t_settings.INTERACTOMES[version])
    #inputs_dir = "inputs/%s/" % (version)
    t_settings.set_version(version)
    inputs_dir = t_settings.INPUTSPREFIX
    outputs_dir = "outputs/%s/weighted" % (version)
    chemicals = utils.readItemList("%s/chemicals.txt" % (inputs_dir), 1)
    #hits_template = "%s/hit-prots/%%s-hit-prots.txt" % (inputs_dir)
    #nonhits_template = "%s/hit-prots/%%s-nonhit-prots.txt" % (inputs_dir)
    #rec_tfs_template = "%s/rec-tfs/%%s-rec-tfs.txt" % (inputs_dir)
    chem_rec, chem_tfs = TOXCAST_DATA.chemical_rec, TOXCAST_DATA.chemical_tfs
    chem_prot_hit_vals = TOXCAST_DATA.chemical_protein_hit
    paths_dir = "%s/edgelinker" % (outputs_dir)
    paths_template = "%s/%%s-paths.txt" % (paths_dir)

    out_dir = "%s/stats/summary" % outputs_dir
    t_utils.checkDir(out_dir)
    summary_file = "%s/%s" % (out_dir, summary_file)
    if os.path.isfile(summary_file) and not forced:
        print(
            "Reading network summary stats from '%s'. Set forced to True to overwrite it."
            % (summary_file))
        df = pd.read_csv(summary_file, index_col=0)
    else:
        print("Reading in the stats from the response networks in", paths_dir)
        chemical_names, chemical_name_to_id = t_utils.getChemicalNameMaps()
        chemical_names = {
            chemical: chemical_names[chemical]
            for chemical in chemicals
        }
        chemical_prots = {}
        chemical_num_paths = {}
        chemical_num_edges = {}
        chemical_avg_path_lengths = {}
        chemical_rec = {}
        chemical_tfs = {}
        chemical_net_rec = {}
        chemical_net_tfs = {}
        chemical_hits = {}
        chemical_nonhits = {}
        chemical_net_hits = {}
        chemical_net_nonhits = {}
        chemical_inter_hits = {}
        chemical_inter_nonhits = {}
        chemical_inter_net_hits = {}
        chemical_inter_net_nonhits = {}
        # also get the q-value for each chemical
        chemical_pvals = {}
        pvals_file = "%s/stats/stat-sig-%s/gpd-pval.txt" % (outputs_dir, scope)
        if os.path.isfile(pvals_file):
            with open(pvals_file, 'r') as file_handle:
                header = file_handle.readline().rstrip().split('\t')
            pval_col = header.index("200") + 1
            chemical_pvals = {
                chem: pval
                for chem, pval in utils.readColumns(pvals_file, 1, pval_col)
            }
        chemical_qvals = {}
        qvals_file = "%s/stats/stat-sig-%s/bfcorr_pval_qval.txt" % (
            outputs_dir, scope)
        if os.path.isfile(qvals_file):
            chemical_qvals = t_utils.getPvals(outputs_dir,
                                              scope,
                                              sig_cutoff_type="FDR")
        for chemical in tqdm(chemicals):
            #prots, paths = getProteins(paths=paths_template % chemical, max_k=200, ties=True)
            paths = t_utils.getPaths(paths_template % chemical,
                                     max_k=200,
                                     ties=True)
            prots = set()
            num_paths = len(paths)
            edges = set()
            path_lengths = []
            for path in paths:
                path = path.split('|')
                # path length is the number of edges in a path
                path_lengths.append(len(path) - 1)
                prots = prots.union(set(path))
                for i in range(len(path) - 1):
                    edges.add((path[i], path[i + 1]))

            chemical_prots[chemical] = len(prots)
            chemical_num_paths[chemical] = len(paths)
            chemical_avg_path_lengths[chemical] = np.mean(path_lengths)
            chemical_num_edges[chemical] = len(edges)
            #rec, tfs = t_utils.getRecTFs(rec_tfs_template % chemical)
            rec, tfs = chem_rec[chemical], chem_tfs[chemical]
            chemical_rec[chemical] = len(rec)
            chemical_tfs[chemical] = len(tfs)
            chemical_net_rec[chemical] = len(prots.intersection(rec))
            chemical_net_tfs[chemical] = len(prots.intersection(tfs))
            # read the hits and nonhits for each chemical to calculate how many of them are in the network
            #hits = utils.readItemSet(hits_template % chemical, 1)
            #nonhits = utils.readItemSet(nonhits_template % chemical, 1)
            hits = set([p for p, hit_val in chem_prot_hit_vals[chemical].items() \
                    if hit_val == 1])
            nonhits = set([p for p, hit_val in chem_prot_hit_vals[chemical].items() \
                    if hit_val == 0])
            chemical_hits[chemical] = len(hits)
            chemical_nonhits[chemical] = len(nonhits)
            chemical_net_hits[chemical] = len(hits.intersection(prots))
            chemical_net_nonhits[chemical] = len(nonhits.intersection(prots))
            # subtract the rec and tfs to get just the intermediate hits and nonhits
            chemical_inter_hits[chemical] = len(hits.difference(
                rec.union(tfs)))
            chemical_inter_nonhits[chemical] = len(
                nonhits.difference(rec.union(tfs)))
            chemical_inter_net_hits[chemical] = len(
                hits.intersection(prots).difference(rec.union(tfs)))
            chemical_inter_net_nonhits[chemical] = len(
                nonhits.intersection(prots).difference(rec.union(tfs)))

        # write these metrics to a file
        df = pd.DataFrame({
            "name": chemical_names,
            "prots": chemical_prots,
            "num_paths": chemical_num_paths,
            "pvals": chemical_pvals,
            "qvals": chemical_qvals,
            "num_edges": chemical_num_edges,
            "avg_path_lengths": chemical_avg_path_lengths,
            "net_rec": chemical_net_rec,
            "net_tfs": chemical_net_tfs,
            "hit_rec": chemical_rec,
            "hit_tfs": chemical_tfs,
            "net_hits": chemical_net_hits,
            "net_nonhits": chemical_net_nonhits,
            'hits': chemical_hits,
            'nonhits': chemical_nonhits,
            "inter_net_hits": chemical_inter_net_hits,
            "inter_net_nonhits": chemical_inter_net_nonhits,
            "inter_hits": chemical_inter_hits,
            "inter_nonhits": chemical_inter_nonhits,
        })
        print("Writing: ", summary_file)
        df.to_csv(summary_file,
                  header=True,
                  columns=[
                      'name', 'prots', 'num_paths', 'num_edges',
                      'avg_path_lengths', 'hits', 'nonhits', 'net_hits',
                      'net_nonhits', 'hit_rec', 'hit_tfs', 'net_rec',
                      'net_tfs', 'inter_net_hits', 'inter_net_nonhits',
                      'inter_hits', 'inter_nonhits', 'pvals', 'qvals'
                  ])

    # change the index or chemical id to unicode (string)
    #df.index = df.index.map(unicode)

    return df
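# The edgelinker paths parsed above are '|'-delimited node lists; a quick toy
# example of the path-length and edge bookkeeping (the path string is made up):
path = "R1|A|B|T1".split('|')
print(len(path) - 1)   # 3 edges in the path
print([(path[i], path[i + 1]) for i in range(len(path) - 1)])
# -> [('R1', 'A'), ('A', 'B'), ('B', 'T1')]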
# Example 13
def splitRecTFsFamilyNodes(chemicals, version, interactome_file):
    """
    """
    # leave some nodes as family nodes as that's how they are in the toxcast data
    map_family_to_prot = {
        # FOS,JUN,FOSL1,FOSL2,JUNB,JUND,FOSB: FOS,JUN
        "P01100,P05412,P15407,P15408,P17275,P17535,P53539": ["P01100,P05412"],
        # FOS,JUN,SP1: FOS,JUN
        "P01100,P05412,P08047": ["P01100,P05412"],
        # FOS,JUN: FOS,JUN
        "P01100,P05412": ["P01100,P05412"],
        # TCF7,TCF7L1,TCF7L2,LEF1: TCF7,TCF7L1,TCF7L2,LEF1
        "P36402,Q9HCS4,Q9NQB0,Q9UJU2": ["P36402,Q9HCS4,Q9NQB0,Q9UJU2"],
        # FOXO3,FOXO4,FOXO1: FOXO3,FOXO4,FOXO1
        "O43524,P98177,Q12778": ["O43524,P98177,Q12778"],
    }

    rec_tfs_file = "inputs/%s/rec-tfs/%%s-rec-tfs.txt" % (version)
    interactomes_dir = "inputs/%s" % (version)
    t_utils.checkDir(interactomes_dir)
    new_interactome_file = "%s/%s-interactome.txt" % (interactomes_dir,
                                                      version)
    # get the set of family nodes from the interactome
    print("Reading the interactome from %s" % (interactome_file))
    lines = utils.readColumns(interactome_file, 1, 2, 3)
    family_nodes = set(
        [N for U, V, w in lines for N in (U, V) if len(N.split(',')) > 1])
    print(
        "Splitting the source/target family nodes of all chemicals in the interactome and writing to %s"
        % (new_interactome_file))
    # set of family nodes to split from all chemicals
    family_to_split = {}
    for chemical in tqdm(chemicals):
        rec, tfs = t_utils.getRecTFs(rec_tfs_file % (chemical))
        for N in family_nodes:
            for n in rec.union(tfs):
                if n in N:
                    if N not in family_to_split:
                        family_to_split[N] = set()
                    family_to_split[N].add(n)
    # leave some tfs as family nodes because that's how they're listed in toxcast
    family_to_split.update(map_family_to_prot)

    split_rec = set()
    split_tfs = set()
    new_interactome = []
    all_new_edges = set()
    # it's a bit ad hoc because the weight of the family edge is the max of the individual edges,
    # and now we're setting the edge weight of the split edges to be the max of the individual edges and the family edge
    new_edge_weights = {}
    #new_edge_ev = {}
    # there could be multiple family edges contributing to a single edge
    for U, V, w in lines:
        new_edges = set()
        # split up the rec/tf family nodes
        if U in family_to_split and V in family_to_split:
            split_rec.add(U)
            split_tfs.add(V)
            for u in family_to_split[U]:
                for v in family_to_split[V]:
                    new_edges.add((u, v))
        elif U in family_to_split:
            split_rec.add(U)
            for u in family_to_split[U]:
                new_edges.add((u, V))
        elif V in family_to_split:
            split_tfs.add(V)
            for v in family_to_split[V]:
                new_edges.add((U, v))
        # otherwise leave the edge as it is
        else:
            new_interactome.append((U, V, w))
            continue

        all_new_edges.update(new_edges)
        for (u, v) in new_edges:
            if (u, v) not in new_edge_weights:
                new_edge_weights[(u, v)] = set()
            new_edge_weights[(u, v)].add(float(w))
            # for now, don't write the evidence to each of the new networks to save on space
            # the evidence is present in the original interactome and the evidence file
            #if (u,v) not in new_edge_ev:
            #    new_edge_ev[(u,v)] = set()
            #new_edge_ev[(u,v)].update(set(ev.split('|')))

    for u, v in all_new_edges:
        w = max(new_edge_weights[(u, v)])
        #ev = '|'.join(new_edge_ev[(u,v)])
        new_interactome.append((u, v, "%0.6f" % w))

    # now write the new interactome
    print("Writing the new interactome with rec/tf family nodes split to %s" %
          (new_interactome_file))
    with open(new_interactome_file, 'w') as out:
        out.write('\n'.join(['\t'.join(line)
                             for line in new_interactome]) + '\n')

    # also write the family nodes that were split
    mapping = getUniprotToGeneMapping(version)
    # also write the mapping from the rec/tf family node to the proteins it came from
    out_file = "inputs/%s/family-split-rec-tfs.txt" % (version)
    print(
        "Writing a mapping of the split family rec/tfs and the protein hits they came from to: %s"
        % (out_file))
    with open(out_file, 'w') as out:
        out.write('\n'.join([
            "%s\t%s\t%s\t%s" %
            (N, '|'.join(family_to_split[N]), mapping[N],
             '|'.join([mapping[n] for n in family_to_split[N]]))
            for N in sorted(family_to_split)
        ]) + '\n')

    print("A total of %d family nodes were split" % (len(family_to_split)))

    # add the zscore penalty to the few family nodes in the ToxCast data
    toxcast_family_nodes = [N[0] for N in map_family_to_prot.values()]
    addRecTFsFamilyNodes(chemicals,
                         version,
                         family_nodes=toxcast_family_nodes,
                         costs=True)
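# Toy illustration of the family-node split above: a family edge expands to
# the cross product of its member proteins, and each resulting edge keeps the
# max weight over all family edges that produced it (node names are made up):
family_to_split = {"P1,P2": {"P1", "P2"}}
new_edge_weights = {}
for U, V, w in [("P1,P2", "T1", 0.8), ("P1,P2", "T1", 0.6)]:
    for u in family_to_split.get(U, {U}):
        new_edge_weights[(u, V)] = max(new_edge_weights.get((u, V), 0.0),
                                       float(w))
print(sorted(new_edge_weights.items()))
# -> [(('P1', 'T1'), 0.8), (('P2', 'T1'), 0.8)]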
# Example 14
def permute_and_run_edgelinker(opts, random_index):
    if opts.write_score_counts:
        rand_scores_k = "%s/rand-networks/rand-%d-med-scores-k.txt" % (
            opts.write_score_counts, random_index)
        # if the final score counts file already exists, then don't do anything
        if os.path.isfile(rand_scores_k) and not opts.forced:
            print("%s already exists. Skipping." % (rand_scores_k))
            return
        chemical_k_scores = "%s/chemical-k-median-scores.txt" % (
            opts.write_score_counts)
        if not os.path.isfile(chemical_k_scores):
            print(
                "Error: %s does not exist. Run compute_stat_sig.py with the --write-counts option to write it. Quitting"
                % (chemical_k_scores))
            return

    t_utils.checkDir("%s/networks" % (opts.out_dir))
    rec_tfs_file_template = "%s/rec-tfs/%%s-rec-tfs.txt" % (opts.inputs_dir)
    chemicals = sorted(
        utils.readItemList("%s/chemicals.txt" % opts.inputs_dir, col=1))
    if opts.single_chem:
        chemicals = opts.single_chem

    if opts.permute_rec_tfs is not None:
        # if specified, "permute" the sets of receptors and tfs for each chemical instead of the interactome
        print("Writing random sets of rec/tfs for each chemical to %s" %
              (opts.out_dir))
        rec_tfs_file_template = "%s/%%s/%d-random-rec-tfs.txt" % (opts.out_dir,
                                                                  random_index)
        all_rec, all_tfs = t_utils.getRecTFs(opts.permute_rec_tfs)
        #chemical_num_rectfs_file = "%s/chemical_num_rectfs.txt" % (opts.inputs_dir)
        #lines = utils.readColumns(chemical_num_rectfs_file, 2, 3, 4)
        #for chem, num_rec, num_tfs in tqdm(lines):
        for chemical in tqdm(chemicals, disable=opts.verbose):
            out_file = rec_tfs_file_template % (chemical)
            if not os.path.isfile(out_file) or opts.forced:
                rec, tfs, costs, zscores = t_utils.getRecTFs(
                    t_settings.REC_TFS_FILE % (opts.inputs_dir, chemical),
                    costs=True)
                rec = list(rec)
                tfs = list(tfs)

                out_dir = "%s/%s" % (opts.out_dir, chemical)
                t_utils.checkDir(out_dir)
                # random.sample requires a sequence (not a set) in python 3.11+
                random_rec = random.sample(list(all_rec), len(rec))
                # apply the costs to the random rec and tfs
                for i in range(len(rec)):
                    costs[random_rec[i]] = costs[rec[i]]
                    zscores[random_rec[i]] = zscores[rec[i]]
                random_tfs = random.sample(list(all_tfs), len(tfs))
                for i in range(len(tfs)):
                    costs[random_tfs[i]] = costs[tfs[i]]
                    zscores[random_tfs[i]] = zscores[tfs[i]]
                t_utils.writeRecTFs(out_file,
                                    random_rec,
                                    random_tfs,
                                    costs=costs,
                                    zscores=zscores)
        # use the original interactome
        permuted_network_out_file = opts.interactome
        print("Using the original interactome %s" %
              (permuted_network_out_file))
    else:
        # default is to permute the interactome
        permuted_network_out_file = '%s/networks/permuted-network%d.txt' % (
            opts.out_dir, random_index)
        if not os.path.isfile(permuted_network_out_file) or opts.forced:
            # don't log transform. The weights will be log transformed by the edgelinker code
            #G = cycLinker.readNetwork(opts.interactome, weight=True, logtransform=False)
            # UPDATE: 2017-12-07: try using the direction of the edges from the fourth column of the interactome instead of splitting based on if the edge is bidirected or not
            G = nx.DiGraph()
            dir_edges = []
            undir_edges = []
            lines = utils.readColumns(opts.interactome, 1, 2, 3, 4)
            if len(lines) == 0:
                print(
                    "ERROR: interactome should have 4 columns: a, b, w, and True/False for directed/undirected. Quitting"
                )
                sys.exit()
            for u, v, w, directed in lines:
                G.add_edge(u, v, weight=float(w))
                if directed.lower() in ["true", "t", "dir", 'directed']:
                    dir_edges.append((u, v))
                elif directed.lower() not in [
                        "false", 'f', 'undir', 'undirected'
                ]:
                    print(
                        "ERROR: Unknown directed edge type '%s'. 4th column should be T/F to indicdate directed/undirected"
                        % (directed.lower()))
                    print("Quitting.")
                    sys.exit()
                elif u < v:
                    undir_edges.append((u, v))

            if opts.undirected:
                # swap all edges as undirected edges
                permG = permute_network.permute_network(
                    G.to_undirected(), num_iterations=opts.num_iterations)
                permG = permG.to_directed()
            elif opts.split_by_weight:
                # split the edges into bins by weight and swap the directed and undirected edges separately
                # if specified by the user
                permG = permute_network.permute_network(
                    G,
                    swap_phys_sig_sep=opts.swap_phys_sig_sep,
                    split_weight=opts.split_by_weight,
                    num_iterations=opts.num_iterations)
            elif opts.swap_phys_sig_sep:
                # swap the directed and undirected edges separately
                permG = permute_network.permute_network(
                    G,
                    swap_phys_sig_sep=opts.swap_phys_sig_sep,
                    num_iterations=opts.num_iterations,
                    edge_lists=(undir_edges, dir_edges))
            else:
                # if none of the options are specified, then swap everything as directed edges
                permG = permute_network.permute_network(
                    G, num_iterations=opts.num_iterations)
            print("Writing %s" % (permuted_network_out_file))
            nx.write_weighted_edgelist(permG,
                                       permuted_network_out_file,
                                       comments='#',
                                       delimiter='\t')
        else:
            print("Using %s" % (permuted_network_out_file))

    # now run edgelinker on each of the chemicals using the permuted network
    # if version is netpath, use the different type of input file
    # TODO fix this
    # PATHLINKERDATAVERSIONS
    #if 'kegg' in opts.inputs_dir or 'netpath' in opts.inputs_dir:
    #    rec_tfs_file_template = "%s/rec-tfs/%%s-nodes.txt" % (opts.inputs_dir)
    in_files = []
    out_files = []
    for chemical in tqdm(chemicals, disable=opts.verbose):
        rec_tfs_file = rec_tfs_file_template % (chemical)
        in_files.append(os.path.abspath(rec_tfs_file))
        out_dir = "%s/%s" % (opts.out_dir, chemical)
        t_utils.checkDir(out_dir)
        out_pref = "%s/%d-random" % (out_dir, random_index)
        out_files.append(os.path.abspath(out_pref))
        # python implementation of edgelinker is taking too long. Switching to java for now.
        #run_write_edgelinker(permG, rec_tfs_file, opts.k, out_pref)
        # run the java implementation of edgelinker below

    # write the in and out files to the networks dir
    edgelinker_in_files = '%s/networks/permuted-network%d-infiles.txt' % (
        opts.out_dir, random_index)
    with open(edgelinker_in_files, 'w') as out:
        out.write('\n'.join(in_files))
    edgelinker_out_files = '%s/networks/permuted-network%d-outfiles.txt' % (
        opts.out_dir, random_index)
    with open(edgelinker_out_files, 'w') as out:
        out.write('\n'.join(out_files))
    print("Running edgelinker on chemical %s: %s" % (chemical, out_pref))
    run_edgelinker.runEdgeLinker(permuted_network_out_file,
                                 cyclinker_in_files,
                                 cyclinker_out_files,
                                 opts.k,
                                 edge_penalty=EDGE_PENALTY,
                                 rec_tfs_penalty=REC_TFS_PENALTY,
                                 multi_run=True)

    if opts.write_score_counts:
        # now that edgelinker has been run on all of the chemical sources/targets,
        # get the path counts for the chemical network's path scores
        # import compute_stat_sig.py and run the code directly. This avoids the issues of re-importing the libraries from baobab
        print(
            "Writing the counts for each of the scores for random index: '%d'"
            % (random_index))
        stat_sig = compute_stat_sig.StatSig(random_paths_dir=opts.out_dir,
                                            k_limit=opts.k,
                                            num_random=(random_index,
                                                        random_index),
                                            out_dir=opts.write_score_counts)
        stat_sig.write_rand_counts(chemicals=chemicals, forced=opts.forced)
#        cmd = "python src/compute_stat_sig.py " + \
#              " --chemicals %s/chemicals.txt " % (opts.inputs_dir) + \
#              " --random-paths-dir %s/ " % (opts.out_dir) + \
#              " -P --k-limit %d " % (opts.k) + \
#              " --num-random %d %d" % (random_index, random_index) + \
#              " --group-by-prob " + \
#              " --write-rand-counts " + \
#              " --out-dir %s " % (opts.write_score_counts)
#        if opts.forced:
#            cmd += " --forced "
#        print(cmd)
#        subprocess.check_call(cmd.split())

#if opts.run_mgsa_random:
#    run_mgsa_random(random_index)

    if opts.cleanup:
        print(
            "Deleting the generated permuted network and the edgelinker output files"
        )
        if permuted_network_out_file != opts.interactome:
            os.remove(permuted_network_out_file)
        os.remove(edgelinker_in_files)
        # remove the individual output files
        for cyc_out_file in out_files:
            # # 2017-02-17 - temporarily don't remove the paths file for running MGSA
            os.remove(cyc_out_file + "-paths.txt")
            os.remove(cyc_out_file + "-ranked-edges.txt")
        os.remove(edgelinker_out_files)
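# permute_network.permute_network is not shown in this snippet; a minimal
# stand-in using networkx's degree-preserving double edge swaps (an assumption
# about the permutation strategy, not the project's actual implementation;
# double_edge_swap only accepts undirected graphs):
import networkx as nx

def permute_sketch(G, num_iterations=100):
    permG = nx.Graph(G)   # work on an undirected copy
    nx.double_edge_swap(permG, nswap=num_iterations,
                        max_tries=num_iterations * 10)
    return permG.to_directed()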