コード例 #1
0
ファイル: conftest.py プロジェクト: lisabang/Camoco
def ZmSAM2(Zm5bFGS):
    """Return the 'ZmSAM2' COB, building it from the raw table if needed.

    When ``cf.test.force.COB`` is truthy the stored 'ZmSAM2' Expr dataset is
    deleted first. If no 'ZmSAM2' Expr dataset is available afterwards, the
    network is built with ``co.COB.from_table``; otherwise the existing one
    is opened with ``co.COB``.

    Parameters
    ----------
    Zm5bFGS
        Forwarded as the fourth positional argument of
        ``co.COB.from_table`` (presumably the reference genome -- confirm
        against that API).
    """
    dataset = "ZmSAM2"
    if cf.test.force.COB:
        tools.del_dataset("Expr", dataset, force=True)

    # Already built: just open it.
    if tools.available_datasets("Expr", dataset):
        return co.COB(dataset)

    raw_table = os.path.join(
        cf.options.testdir,
        "raw",
        "Expr",
        "RNASEQ",
        "TranscriptomeProfiling_B73_Atlas_SAM_FGS_LiLin_20140316.txt.gz",
    )
    build_options = dict(
        rawtype="RNASEQ",
        max_gene_missing_data=0.4,
        min_single_sample_expr=1,
        min_expr=0.01,
        quantile=False,
        dry_run=False,
        max_val=250,
    )
    return co.COB.from_table(
        raw_table,
        dataset,
        "Maize Root Network, but loose",
        Zm5bFGS,
        **build_options
    )
コード例 #2
0
def locality(args):
    """Compute network locality for GWAS terms and write the table as TSV.

    Parameters
    ----------
    args
        Parsed CLI arguments. The attributes read here are ``out`` (an
        output path or ``sys.stdout``), ``cob``, ``gwas``,
        ``sig_edge_zscore`` and ``terms``; ``args`` is also handed to
        ``generate_data`` unchanged.

    Returns
    -------
    None
        Returns early, without computing anything, when the output file
        already exists.

    Notes
    -----
    ``args.out`` is rewritten in place to end in ``_Locality.tsv`` when it
    is a path.
    """
    log = coblog()
    log("\n"
        "-----------------------\n"
        "   Network Locality    \n"
        "-----------------------\n")
    # Path handling only applies to a named file; sys.stdout is handed
    # straight to DataFrame.to_csv further down.
    if args.out != sys.stdout:
        args.out = "{}_Locality.tsv".format(args.out.replace(".tsv", ""))
        out_dir = os.path.dirname(args.out)
        if out_dir != "":
            os.makedirs(out_dir, exist_ok=True)
        # Check the final path itself: re-applying the suffix here would
        # look for "<name>_Locality_Locality.tsv" and never match.
        if os.path.exists(args.out):
            log("{} exists! Skipping!".format(args.out))
            return None
    # Grab the COB object
    cob = co.COB(args.cob)
    gwas = co.GWAS(args.gwas)
    # If there is a different score for 'significant', update the COB object
    if args.sig_edge_zscore is not None:
        cob.set_sig_edge_zscore(args.sig_edge_zscore)
    if "all" in args.terms:
        # Iterate over every term in the GWAS
        terms = gwas.iter_terms()
    else:
        # Otherwise look up only the requested terms
        terms = (gwas[x] for x in args.terms)

    # One row per term, as produced by generate_data
    locality_table = pd.DataFrame([generate_data(cob, x, args) for x in terms])
    locality_table.to_csv(args.out, sep="\t", index=None)
コード例 #3
0
def AtSeed(AtTair10):
    """Return the 'AtSeed' COB, building it from GSE family files if needed.

    When ``cf.test.force.COB`` is truthy the stored 'AtSeed' Expr dataset is
    deleted first. If the dataset is then unavailable, the listed GSE
    family files are loaded, summed into one family, and the series matrix
    (filtered by 'SeedKeep.tsv') is passed to ``co.COB.from_DataFrame``.

    Parameters
    ----------
    AtTair10
        Forwarded as the fourth positional argument of
        ``co.COB.from_DataFrame`` (presumably the reference genome --
        confirm against that API).
    """
    if cf.test.force.COB:
        tools.del_dataset('Expr', 'AtSeed', force=True)
    if tools.available_datasets('Expr', 'AtSeed'):
        return co.COB('AtSeed')

    # 'GSE30223' is disabled in this list.
    accessions = ['GSE12404', 'GSE1051', 'GSE11852', 'GSE5634']
    families = []
    for accession in accessions:
        soft_file = os.path.join(cf.options.testdir, 'raw', 'GSE',
                                 '{}_family.soft.gz'.format(accession))
        families.append(co.Family.from_file(soft_file))
    combined = sum(families)

    keepfile = os.path.join(cf.options.testdir, 'raw', 'GSE', 'SeedKeep.tsv')
    expression = combined.series_matrix(keepfile=keepfile)
    return co.COB.from_DataFrame(
        expression,
        'AtSeed',
        'Arabidopsis Seed',
        AtTair10,
        rawtype='MICROARRAY',
        quantile=True)
コード例 #4
0
def AtGen(AtTair10):
    """Return the 'AtGen' COB, building it from GSE family files if needed.

    When ``cf.test.force.COB`` is truthy the stored 'AtGen' Expr dataset is
    deleted first. If the dataset is then unavailable, the listed GSE
    family files are loaded, summed into one family, and the series matrix
    (filtered by 'GenKeep.tsv') is passed to ``co.COB.from_DataFrame``.

    Parameters
    ----------
    AtTair10
        Forwarded as the fourth positional argument of
        ``co.COB.from_DataFrame`` (presumably the reference genome --
        confirm against that API).
    """
    if cf.test.force.COB:
        tools.del_dataset('Expr', 'AtGen', force=True)
    if tools.available_datasets('Expr', 'AtGen'):
        return co.COB('AtGen')

    accessions = [
        'GSE18975', 'GSE39384', 'GSE19271', 'GSE5632', 'GSE39385',
        'GSE5630', 'GSE15617', 'GSE5617', 'GSE5686', 'GSE2473', 'GSE5633',
        'GSE5620', 'GSE5628', 'GSE5624', 'GSE5626', 'GSE5621', 'GSE5622',
        'GSE5623', 'GSE5625', 'GSE5688'
    ]
    families = []
    for accession in accessions:
        soft_file = os.path.join(cf.options.testdir, 'raw', 'GSE',
                                 '{}_family.soft.gz'.format(accession))
        families.append(co.Family.from_file(soft_file))
    combined = sum(families)

    keepfile = os.path.join(cf.options.testdir, 'raw', 'GSE', 'GenKeep.tsv')
    expression = combined.series_matrix(keepfile=keepfile)
    return co.COB.from_DataFrame(
        expression,
        'AtGen',
        'Arab General',
        AtTair10,
        rawtype='MICROARRAY',
        quantile=True)
コード例 #5
0
def AtRoot(AtTair10):
    """Return the 'AtRoot' COB, building it from GSE family files if needed.

    When ``cf.test.force.COB`` is truthy the stored 'AtRoot' Expr dataset is
    deleted first. If the dataset is then unavailable, the listed GSE
    family files are loaded, summed into one family, and the series matrix
    (filtered by 'RootKeep.tsv') is passed to ``co.COB.from_DataFrame``.

    Parameters
    ----------
    AtTair10
        Forwarded as the fourth positional argument of
        ``co.COB.from_DataFrame`` (presumably the reference genome --
        confirm against that API).
    """
    if cf.test.force.COB:
        tools.del_dataset('Expr', 'AtRoot', force=True)
    if tools.available_datasets('Expr', 'AtRoot'):
        return co.COB('AtRoot')

    accessions = [
        'GSE14578', 'GSE46205', 'GSE7631', 'GSE10576', 'GSE42007',
        'GSE34130', 'GSE21611', 'GSE22966', 'GSE7641', 'GSE5620',
        'GSE8934', 'GSE5628', 'GSE30095', 'GSE30097', 'GSE5624', 'GSE5626',
        'GSE5749', 'GSE5621', 'GSE5622', 'GSE5623', 'GSE5625', 'GSE5688'
    ]
    families = []
    for accession in accessions:
        soft_file = os.path.join(cf.options.testdir, 'raw', 'GSE',
                                 '{}_family.soft.gz'.format(accession))
        families.append(co.Family.from_file(soft_file))
    combined = sum(families)

    keepfile = os.path.join(cf.options.testdir, 'raw', 'GSE', 'RootKeep.tsv')
    expression = combined.series_matrix(keepfile=keepfile)
    return co.COB.from_DataFrame(
        expression,
        'AtRoot',
        'Arab Root',
        AtTair10,
        rawtype='MICROARRAY',
        quantile=True)
コード例 #6
0
def cistrans(args):
    """Compare the score distributions of cis and trans interactions.

    Interactions whose ``distance`` is at most ``args.cis_distance`` are
    treated as cis; interactions with infinite ``distance`` are treated as
    trans. A kernel density estimate of each group is plotted, a
    Mann-Whitney U test is run on the two groups, and the counts and
    p-value are printed to stdout and to ``<out>.summary.txt``. The figure
    is saved as ``<out>.png``.

    Parameters
    ----------
    args
        Parsed CLI arguments. The attributes read here are ``cob``,
        ``out`` (defaults to ``"<cob.name>_cistrans"`` when None) and
        ``cis_distance``.
    """
    cob = co.COB(args.cob)
    if args.out is None:
        args.out = "{}_cistrans".format(cob.name)
    # The context manager guarantees the summary file is flushed and closed
    # even if the density fit or the statistical test raises.
    with open(args.out + ".summary.txt", "w") as out:
        # np.newaxis adds an empty axis in that position of the slice
        # the sklearn module requires the values to be in the rows:
        # http://scikit-learn.org/stable/auto_examples/neighbors/plot_kde_1d.html
        coex = cob._coex_DataFrame(sig_only=False)
        cis = coex.score[coex.distance <= args.cis_distance].values[:, np.newaxis]
        trans = coex.score[np.isinf(coex.distance)].values[:, np.newaxis]
        X_plot = np.linspace(-10, 10, 1000)[:, np.newaxis]
        summary = "Found {:,} cis interactions and {:,} trans interactions".format(
            cis.shape[0], trans.shape[0])
        print(summary)
        print(summary, file=out)
        # Fit the kernel on the cis scores
        kd = KernelDensity(bandwidth=0.2)
        kd.fit(cis)
        cis_kde = np.exp(kd.score_samples(X_plot))
        plt.fill(X_plot, cis_kde, alpha=0.5, label="Cis Interactions")
        # Refit on the trans scores; only the first 50000 rows are used
        kd.fit(trans[0:50000])
        trans_kde = np.exp(kd.score_samples(X_plot))
        plt.fill(X_plot, trans_kde, alpha=0.5, label="Trans Interactions")
        plt.legend()
        plt.title("Cis vs Trans Density: {}".format(cob.name))
        # Calculate the mann whitney U test
        _, pval = sp.stats.mannwhitneyu(cis[:, 0], trans[:, 0])
        print("P-val: {}".format(pval))
        print("P-val: {}".format(pval), file=out)
    # Report the figure only once it has actually been written
    plt.savefig(args.out + ".png")
    print("Figure saved: {}".format(args.out + ".png"))
コード例 #7
0
def ZmRNASeqTissueAtlas(Zm5bFGS):
    """Return the 'ZmRNASeqTissueAtlas' COB, building it if needed.

    When ``cf.test.force.COB`` is truthy a message is printed and both the
    'COB' and 'Expr' datasets of that name are deleted first. If no Expr
    dataset is available afterwards, the network is built with
    ``co.COB.from_table`` (called with ``dry_run=True``); otherwise the
    existing one is opened with ``co.COB``.

    Parameters
    ----------
    Zm5bFGS
        Forwarded as the fourth positional argument of
        ``co.COB.from_table`` (presumably the reference genome -- confirm
        against that API).
    """
    dataset = 'ZmRNASeqTissueAtlas'
    if cf.test.force.COB:
        print('Rebuilding ZmRNASeqTissueAtlas')
        for dtype in ('COB', 'Expr'):
            tools.del_dataset(dtype, dataset, force=True)

    if tools.available_datasets('Expr', dataset):
        return co.COB(dataset)

    # Build it
    raw_table = os.path.join(
        cf.options.testdir, 'raw', 'Expr', 'RNASEQ',
        'MaizeRNASeqTissue.tsv.bz2')
    build_options = dict(
        rawtype='RNASEQ',
        max_gene_missing_data=0.3,
        max_accession_missing_data=0.08,
        min_single_sample_expr=1,
        min_expr=0.001,
        quantile=False,
        max_val=300,
        dry_run=True,
    )
    return co.COB.from_table(
        raw_table,
        dataset,
        'Maize RNASeq Tissue Atlas Network, Sekhon 2013, PLoS ONE',
        Zm5bFGS,
        **build_options)
コード例 #8
0
def main():
    """Correlate each shared gene's co-expression scores between KLS and KSS.

    For every gene present in both networks, the scores against every other
    shared gene are collected from each network (the gene's own slot stays
    0 in both vectors) and the Pearson correlation of the two vectors is
    computed. One tab-separated line ``index, gene, correlation`` is
    printed per gene to both stdout and stderr.
    """
    # Load both co-expression networks
    kls_net = co.COB("KLS")
    kss_net = co.COB("KSS")

    # Genes present in the expression tables of both networks
    shared = np.intersect1d(kls_net._expr.index.values,
                            kss_net._expr.index.values)
    n_shared = len(shared)

    # One correlation coefficient per shared gene
    pcc = np.zeros(n_shared)

    for row, query in enumerate(shared):
        # Fresh score vectors for this gene
        kls_row = np.zeros(n_shared)
        kss_row = np.zeros(n_shared)

        for col, partner in enumerate(shared):
            if row != col:
                kls_row[col] = kls_net.coexpression_base(query,
                                                         partner)["score"]
                kss_row[col] = kss_net.coexpression_base(query,
                                                         partner)["score"]

        # Only the coefficient is kept; the p-value is discarded
        pcc[row], _ = st.pearsonr(kls_row, kss_row)

        line = "%s\t%s\t%s" % (row, query, pcc[row])
        print(line)
        print(line, file=sys.stderr)
コード例 #9
0
ファイル: Overlap.py プロジェクト: ChengYunazhi/Camoco
    def SNP2Gene_breakdown(self,COB=None):
        '''
        Provides a breakdown of SNP to gene mapping parameters for each term in the Overlap.
        Includes the number of initial Loci, the number of collapsed Loci (within a window)
        and the number of candidate genes (within a window and up to a flank limit)

        Parameters
        ----------
        COB : str (default: None)
            If specified, only the rows of self.results whose 'COB' column
            equals this name are summarized. If None, all rows of
            self.results are used.

        Returns
        -------
        pandas.DataFrame
            Indexed by Term plus an added 'Total' row, with columns under a
            (Name, WindowSize, FlankLimit) MultiIndex for 'GWAS SNPs',
            'Collapsed Loci' and 'Candidate Genes'. Values are cast to int.
        '''
        # Get some help
        def bp_to_kb(bp):
            return "{}KB".format(int(bp/1000))
        def get_level(df,level):
            ''' Returns the level values by name '''
            level_index = df.columns.names.index(level)
            return df.columns.levels[level_index]
        # Prepare the data frame results
        if COB is None:
            results = self.results
        else:
            results = self.results.query('COB=="{}"'.format(COB))
        # The ontology and reference genome come from the first Ontology/COB
        # listed in the unfiltered results, regardless of the COB argument
        ont = co.GWAS(self.results.Ontology.unique()[0])
        ref = co.COB(self.results.COB.unique()[0])._parent_refgen
        # Make an aggregate term holding the loci of every term in the ontology
        total = co.Term('total',loci=set(chain.from_iterable(x.loci for x in ont.terms())))
        # Calculate number of SNPs
        snps = pd.DataFrame(pd.pivot_table(results,index="Term",values='TermLoci'))
        snps.columns = pd.MultiIndex.from_product([['GWAS SNPs'],['-'],['-']],names=['Name','WindowSize','FlankLimit'])
        # .loc is used for the label-based 'Total' row assignments below;
        # the former .ix indexer no longer exists in current pandas
        snps.loc['Total'] = len(total.loci)
        # Calculate number of Candidate Loci
        loci = pd.pivot_table(results,index="Term",columns=['WindowSize'],values='TermCollapsedLoci')
        for window_size in loci.columns:
            loci.loc['Total',window_size] = len(total.effective_loci(window_size))
        loci.columns = pd.MultiIndex.from_product([['Collapsed Loci'],list(map(bp_to_kb,loci.columns)),['-']],names=['Name','WindowSize','FlankLimit'])
        # Calculate number of Candidate Genes
        genes = pd.pivot_table(results,index='Term',columns=['WindowSize','FlankLimit'],values='gene',aggfunc=lambda x: len(set(x)))
        for window_size in get_level(genes,'WindowSize'):
            for flank_limit in get_level(genes,'FlankLimit'):
                genes.loc['Total',(window_size,flank_limit)] = len(ref.candidate_genes(total.effective_loci(window_size=window_size),flank_limit=flank_limit))
        genes.columns = pd.MultiIndex.from_product(
            [['Candidate Genes'],
                list(map(bp_to_kb,get_level(genes,"WindowSize"))),
                get_level(genes,'FlankLimit')
            ],
            names=['Name','WindowSize','FlankLimit']
        )
        results = snps.join(loci).join(genes)
        return results.astype(int)
コード例 #10
0
def plot_local_vs_cc(term, filename=None, bootstraps=1):
    """Scatter local degree against clustering coefficient for a term.

    Bootstrapped flanking-gene sets are drawn first (faintly, alpha=0.05),
    then the term's own flanking genes on top. Nodes whose clustering
    coefficient is NaN are left out of each scatter.

    Parameters
    ----------
    term
        Object providing ``flanking_genes``, ``bootstrap_flanking_genes``
        and ``id``.
    filename : str, optional
        Output image path; defaults to ``"<term.id>_cc.png"``.
    bootstraps : int
        Number of bootstrapped gene sets to plot.
    """
    # Open the network once and reuse it for every graph; the empirical
    # plot previously referenced a bare ``COB`` instead of ``co.COB``.
    ROOT = co.COB('ROOT')
    RZM = ROOT.refgen # use root specific for bootstraps

    def scatter_degree_vs_cc(genes, **scatter_kwargs):
        # Plot degree against weighted local transitivity for one gene set
        graph = ROOT.graph(genes)
        degree = np.array(graph.degree())
        cc = np.array(graph.transitivity_local_undirected(weights='weight'))
        nan_mask = np.isnan(cc)
        pylab.scatter(degree[~nan_mask], cc[~nan_mask], **scatter_kwargs)

    pylab.clf()
    for _ in range(bootstraps):
        scatter_degree_vs_cc(term.bootstrap_flanking_genes(RZM), alpha=0.05)
    # plot empirical
    scatter_degree_vs_cc(term.flanking_genes(RZM))
    pylab.xlabel('Local Degree')
    pylab.ylabel('Clustering Coefficient')
    if filename is None:
        filename = "{}_cc.png".format(term.id)
    pylab.savefig(filename)
コード例 #11
0
def main():
    """Print the degree of every gene in the requested network.

    The network name is read from the parsed command line options;
    ``parser.error`` is called when it is missing. One tab-separated line
    ``counter, index label, Degree`` is printed per row of the network's
    degree table.
    """
    options, _args = parser.parse_args()
    if not options.network_name:
        parser.error("Must specify name of network")

    degree_table = co.COB(options.network_name).degree

    for position, (label, record) in enumerate(degree_table.iterrows()):
        print("%s\t%s\t%s" % (position, label, record["Degree"]))
コード例 #12
0
def plot_local_global_degree(term, filename=None, bootstraps=1):
    """Scatter local against global degree for a term's flanking genes.

    Bootstrapped neighborhoods are drawn faintly (alpha=0.05) and the
    term's own neighborhood is drawn in red on top.

    Parameters
    ----------
    term
        Object providing ``flanking_genes``, ``bootstrap_flanking_genes``
        and ``id``.
    filename : str, optional
        Output image path; defaults to ``"<term.id>_locality.png"``.
    bootstraps : int
        Number of bootstrapped neighborhoods to concatenate and plot.
    """
    network = co.COB("ROOT")
    refgen = network.refgen  # use root specific for bootstraps

    observed = network.neighborhood(term.flanking_genes(refgen))
    resampled = []
    for _ in range(bootstraps):
        resampled.append(
            network.neighborhood(term.bootstrap_flanking_genes(refgen)))
    background = pd.concat(resampled)

    pylab.clf()
    pylab.scatter(background['local'], background['global'], alpha=0.05)
    pylab.scatter(observed['local'], observed['global'], c='r')
    pylab.xlabel('Local Degree')
    pylab.ylabel('Global Degree')
    pylab.title('{} Locality'.format(term.id))
    target = "{}_locality.png".format(term.id) if filename is None else filename
    pylab.savefig(target)
コード例 #13
0
def ZmSAM(Zm5bFGS):
    """Return the 'ZmSAM' COB, building it from the raw table if needed.

    When ``cf.test.force.COB`` is truthy the stored 'ZmSAM' Expr dataset is
    deleted first. If no 'ZmSAM' Expr dataset is available afterwards, the
    network is built with ``co.COB.from_table``; otherwise the existing one
    is opened with ``co.COB``.

    Parameters
    ----------
    Zm5bFGS
        Forwarded as the fourth positional argument of
        ``co.COB.from_table`` (presumably the reference genome -- confirm
        against that API).
    """
    dataset = 'ZmSAM'
    if cf.test.force.COB:
        tools.del_dataset('Expr', dataset, force=True)

    if tools.available_datasets('Expr', dataset):
        return co.COB(dataset)

    raw_table = os.path.join(
        cf.options.testdir, 'raw', 'Expr', 'RNASEQ',
        'TranscriptomeProfiling_B73_Atlas_SAM_FGS_LiLin_20140316.txt.gz')
    build_options = dict(
        rawtype='RNASEQ',
        max_gene_missing_data=0.4,
        min_expr=0.1,
        quantile=False,
        dry_run=False,
        max_val=250,
    )
    return co.COB.from_table(raw_table, dataset, 'Maize Root Network',
                             Zm5bFGS, **build_options)
コード例 #14
0
def ZmRoot(Zm5bFGS):
    """Return the 'ZmRoot' COB, building it from the raw table if needed.

    When ``cf.test.force.COB`` is truthy the stored 'ZmRoot' Expr dataset
    is deleted first. If no 'ZmRoot' Expr dataset is available afterwards,
    the network is built with ``co.COB.from_table``; otherwise the existing
    one is opened with ``co.COB``.

    Parameters
    ----------
    Zm5bFGS
        Forwarded as the fourth positional argument of
        ``co.COB.from_table`` (presumably the reference genome -- confirm
        against that API).
    """
    dataset = 'ZmRoot'
    if cf.test.force.COB:
        tools.del_dataset('Expr', dataset, force=True)

    if tools.available_datasets('Expr', dataset):
        return co.COB(dataset)

    raw_table = os.path.join(cf.options.testdir, 'raw', 'Expr', 'RNASEQ',
                             'ROOTFPKM.tsv.gz')
    build_options = dict(
        rawtype='RNASEQ',
        max_gene_missing_data=0.3,
        max_accession_missing_data=0.08,
        min_single_sample_expr=1,
        min_expr=0.001,
        quantile=False,
        max_val=300,
    )
    return co.COB.from_table(raw_table, dataset, 'Maize Root Network',
                             Zm5bFGS, **build_options)
コード例 #15
0
def ZmPAN(Zm5bFGS):
    """Return the 'ZmPAN' COB, building it from the raw table if needed.

    When ``cf.test.force.COB`` is truthy the stored 'ZmPAN' Expr dataset is
    deleted first. If no 'ZmPAN' Expr dataset is available afterwards, the
    network is built with ``co.COB.from_table`` from a comma-separated
    table; otherwise the existing one is opened with ``co.COB``.

    Parameters
    ----------
    Zm5bFGS
        Forwarded as the fourth positional argument of
        ``co.COB.from_table`` (presumably the reference genome -- confirm
        against that API).
    """
    dataset = 'ZmPAN'
    if cf.test.force.COB:
        tools.del_dataset('Expr', dataset, force=True)

    if tools.available_datasets('Expr', dataset):
        return co.COB(dataset)

    raw_table = os.path.join(cf.options.testdir, 'raw', 'Expr', 'RNASEQ',
                             'PANGenomeFPKM.txt.gz')
    build_options = dict(
        rawtype='RNASEQ',
        max_gene_missing_data=0.4,
        min_expr=1,
        quantile=False,
        dry_run=False,
        sep=',',
        max_val=300,
    )
    return co.COB.from_table(raw_table, dataset, 'Maize Root Network',
                             Zm5bFGS, **build_options)
コード例 #16
0
ファイル: conftest.py プロジェクト: lisabang/Camoco
def ZmRoot(Zm5bFGS):
    """Return the "ZmRoot" COB, building it from the raw table if needed.

    When ``cf.test.force.COB`` is truthy the stored "ZmRoot" Expr dataset
    is deleted first. If no "ZmRoot" Expr dataset is available afterwards,
    the network is built with ``co.COB.from_table``; otherwise the existing
    one is opened with ``co.COB``.

    Parameters
    ----------
    Zm5bFGS
        Forwarded as the fourth positional argument of
        ``co.COB.from_table`` (presumably the reference genome -- confirm
        against that API).
    """
    name = "ZmRoot"
    if cf.test.force.COB:
        tools.del_dataset("Expr", name, force=True)

    # Nothing to build when the expression dataset already exists.
    if tools.available_datasets("Expr", name):
        return co.COB(name)

    fpkm_path = os.path.join(
        cf.options.testdir, "raw", "Expr", "RNASEQ", "ROOTFPKM.tsv.gz"
    )
    filters = {
        "rawtype": "RNASEQ",
        "max_gene_missing_data": 0.3,
        "max_accession_missing_data": 0.08,
        "min_single_sample_expr": 1,
        "min_expr": 0.001,
        "quantile": False,
        "max_val": 300,
    }
    return co.COB.from_table(
        fpkm_path, name, "Maize Root Network", Zm5bFGS, **filters
    )
コード例 #17
0
ファイル: conftest.py プロジェクト: lisabang/Camoco
def ZmPAN(Zm5bFGS):
    """Return the "ZmPAN" COB, building it from the raw table if needed.

    When ``cf.test.force.COB`` is truthy the stored "ZmPAN" Expr dataset is
    deleted first. If no "ZmPAN" Expr dataset is available afterwards, the
    network is built with ``co.COB.from_table`` from a comma-separated
    table; otherwise the existing one is opened with ``co.COB``.

    Parameters
    ----------
    Zm5bFGS
        Forwarded as the fourth positional argument of
        ``co.COB.from_table`` (presumably the reference genome -- confirm
        against that API).
    """
    name = "ZmPAN"
    if cf.test.force.COB:
        tools.del_dataset("Expr", name, force=True)

    # Nothing to build when the expression dataset already exists.
    if tools.available_datasets("Expr", name):
        return co.COB(name)

    fpkm_path = os.path.join(
        cf.options.testdir, "raw", "Expr", "RNASEQ", "PANGenomeFPKM.txt.gz"
    )
    filters = {
        "rawtype": "RNASEQ",
        "max_gene_missing_data": 0.4,
        "min_expr": 1,
        "quantile": False,
        "dry_run": False,
        "sep": ",",
        "max_val": 300,
    }
    return co.COB.from_table(
        fpkm_path, name, "Maize Root Network", Zm5bFGS, **filters
    )
コード例 #18
0
def AtLeaf(AtTair10):
    """Return the 'AtLeaf' COB, building it from GSE family files if needed.

    When ``cf.test.force.COB`` is truthy the stored 'AtLeaf' Expr dataset is
    deleted first. If the dataset is then unavailable, the listed GSE
    family files are loaded, summed into one family, and the series matrix
    (filtered by 'LeafKeep.tsv') is passed to ``co.COB.from_DataFrame``.

    Parameters
    ----------
    AtTair10
        Forwarded as the fourth positional argument of
        ``co.COB.from_DataFrame`` (presumably the reference genome --
        confirm against that API).
    """
    if cf.test.force.COB:
        tools.del_dataset('Expr', 'AtLeaf', force=True)
    if tools.available_datasets('Expr', 'AtLeaf'):
        return co.COB('AtLeaf')

    # 'GSE26199' is disabled in this list.
    accessions = [
        'GSE14578', 'GSE5630', 'GSE13739', 'GSE5686', 'GSE5615', 'GSE5620',
        'GSE5628', 'GSE5624', 'GSE5626', 'GSE5621', 'GSE5622', 'GSE5623',
        'GSE5625', 'GSE5688'
    ]
    families = []
    for accession in accessions:
        soft_file = os.path.join(cf.options.testdir, 'raw', 'GSE',
                                 '{}_family.soft.gz'.format(accession))
        families.append(co.Family.from_file(soft_file))
    combined = sum(families)

    keepfile = os.path.join(cf.options.testdir, 'raw', 'GSE', 'LeafKeep.tsv')
    expression = combined.series_matrix(keepfile=keepfile)
    return co.COB.from_DataFrame(
        expression,
        'AtLeaf',
        'Arabidopsis Leaf',
        AtTair10,
        rawtype='MICROARRAY',
        max_gene_missing_data=0.3,
        min_expr=0.01,
        quantile=True,
    )
コード例 #19
0
ファイル: conftest.py プロジェクト: lisabang/Camoco
def ZmRNASeqTissueAtlas(Zm5bFGS):
    """Return the "ZmRNASeqTissueAtlas" COB, building it if needed.

    When ``cf.test.force.COB`` is truthy a message is printed and both the
    "COB" and "Expr" datasets of that name are deleted first. If no Expr
    dataset is available afterwards, the network is built with
    ``co.COB.from_table``; otherwise the existing one is opened with
    ``co.COB``.

    Parameters
    ----------
    Zm5bFGS
        Forwarded as the fourth positional argument of
        ``co.COB.from_table`` (presumably the reference genome -- confirm
        against that API).
    """
    name = "ZmRNASeqTissueAtlas"
    if cf.test.force.COB:
        print("Rebuilding ZmRNASeqTissueAtlas")
        for dtype in ("COB", "Expr"):
            tools.del_dataset(dtype, name, force=True)

    # Nothing to build when the expression dataset already exists.
    if tools.available_datasets("Expr", name):
        return co.COB(name)

    # Build it
    table_path = os.path.join(
        cf.options.testdir, "raw", "Expr", "RNASEQ", "MaizeRNASeqTissue.tsv.bz2"
    )
    filters = {
        "rawtype": "RNASEQ",
        "max_gene_missing_data": 0.3,
        "max_accession_missing_data": 0.08,
        "min_single_sample_expr": 1,
        "min_expr": 0.001,
        "quantile": False,
        "max_val": 300,
        "dry_run": False,
    }
    return co.COB.from_table(
        table_path,
        name,
        "Maize RNASeq Tissue Atlas Network, Sekhon 2013, PLoS ONE",
        Zm5bFGS,
        **filters
    )
コード例 #20
0
    def from_CLI(cls, args):
        """
        Run an overlap analysis driven by parsed command line arguments.

        The input source is chosen in this priority order: ``args.genes``,
        ``args.go``, ``args.gwas``, ``args.ontology``. For every selected
        term the overlap is computed, compared against bootstraps to derive
        z-scores, an FDR and a term p-value, and the per-term tables are
        collected. Unless ``args.dry_run`` is set, the collected table is
        written to ``self.args.out`` as TSV and appended to the "overlap"
        table of an overlap object's SQLite database.

        Parameters
        ----------
        args
            Parsed CLI arguments. Note that ``args.genes``,
            ``args.candidate_window_size`` and ``args.candidate_flank_limit``
            are overwritten for the "genes" source, and the latter two for
            the "go" source.

        Returns
        -------
        None
            Also returns early when ``generate_output_name`` raises
            ValueError, or when no term produced any result.

        Raises
        ------
        ValueError
            If none of the supported input sources was supplied.
        """
        if args.genes != [None]:
            source = "genes"
        elif args.go is not None:
            source = "go"
        elif args.gwas is not None:
            source = "gwas"
        elif args.ontology is not None:
            source = 'ontology'
        else:
            # Fail before `source` is used; otherwise the unbound name
            # raises before the intended error can be reported.
            raise ValueError(
                "Please provide a valid overlap source (--genes, --go or --gwas)"
            )
        self = cls.create(source+'_CLI', description="CLI Overlap")
        self.source = source
        self.args = args
        # Build base camoco objects
        self.cob = co.COB(args.cob)

        # Generate the ontology of terms that we are going to look
        # at the overlap of
        if source == "genes":
            # Gene lists may be separated by commas, semicolons or spaces
            import re

            args.genes = list(chain(*[re.split("[,; ]", x) for x in args.genes]))
            # Placeholder that only carries a name for the output columns
            self.ont = pd.DataFrame()
            self.ont.name = "GeneList"
            args.candidate_window_size = 1
            args.candidate_flank_limit = 0
        elif source == "go":
            self.ont = co.GOnt(args.go)
            args.candidate_window_size = 1
            args.candidate_flank_limit = 0
        elif source == "gwas":
            self.ont = co.GWAS(args.gwas)
        elif source == 'ontology':
            self.ont = co.Ontology(args.ontology)
        try:
            self.generate_output_name()
        except ValueError:
            # NOTE(review): generate_output_name is not visible here; it
            # presumably signals that the analysis should not run -- confirm.
            return

        # Save strongest description arguments if applicable
        if "strongest" in self.args.snp2gene:
            if not (self.ont._global("strongest_attr") == args.strongest_attr):
                self.ont.set_strongest(attr=args.strongest_attr)
            if not (
                bool(int(self.ont._global("strongest_higher")))
                == bool(args.strongest_higher)
            ):
                self.ont.set_strongest(higher=args.strongest_higher)

        # Generate a terms iterable
        if self.source == "genes":
            # make a single term
            loci = self.cob.refgen.from_ids(self.args.genes)
            if len(loci) < len(self.args.genes):
                self.cob.log("Some input genes not in network")
            terms = [Term("CustomTerm", desc="Custom from CLI", loci=loci)]
        else:
            # Generate terms from the ontology
            if "all" in self.args.terms:
                terms = list(self.ont.iter_terms())
            else:
                terms = [self.ont[term] for term in self.args.terms]
        results = []

        num_total_terms = len(terms)
        # Iterate through terms and calculate
        for i, term in enumerate(terms):
            self.cob.log(
                " ---------- Calculating overlap for {} of {} Terms", i, num_total_terms
            )
            if term.id in self.args.skip_terms:
                self.cob.log("Skipping {} since it was in --skip-terms", term.id)
                # Actually skip the term, as the log message states
                continue
            self.cob.log("Generating SNP-to-gene mapping")
            # If appropriate, generate SNP2Gene Loci
            if self.args.candidate_flank_limit > 0:
                loci = self.snp2gene(term, self.ont)
            else:
                loci = list(term.loci)
                for x in loci:
                    x.window = 1

            # Filter out terms with insufficient or too many genes
            if len(loci) < 2 or len(loci) < args.min_term_size:
                self.cob.log("Not enough genes to perform overlap")
                continue
            if args.max_term_size is not None and len(loci) > args.max_term_size:
                self.cob.log("Too many genes to perform overlap")
                continue

            # Send some output to the terminal
            self.cob.log(
                "Calculating Overlap for {} of {} in {} with window:{} and flank:{} ({} Loci)",
                term.id,
                self.ont.name,
                self.cob.name,
                self.args.candidate_window_size,
                self.args.candidate_flank_limit,
                len(loci),
            )
            if args.dry_run:
                continue
            # Compute the overlap; terms that raise DataError are skipped
            try:
                overlap = self.overlap(loci)
            except DataError:
                continue
            self.cob.log("Generating bootstraps")
            bootstraps = self.generate_bootstraps(loci, overlap)
            # Average of the per-iteration mean and standard deviation
            bs_mean = bootstraps.groupby("iter").score.apply(np.mean).mean()
            bs_std = bootstraps.groupby("iter").score.apply(np.std).mean()
            # Calculate z scores for density
            self.cob.log("Calculating Z-Scores")
            if bs_std != 0:
                overlap["zscore"] = (overlap.score - bs_mean) / bs_std
                bootstraps["zscore"] = (bootstraps.score - bs_mean) / bs_std
            else:
                # If there is no variation, make all Z-scores 0
                overlap["zscore"] = bootstraps["zscore"] = 0
            # Calculate FDR
            self.cob.log("Calculating FDR")
            overlap["fdr"] = np.nan
            max_zscore = int(overlap.zscore.max()) + 1
            # Walk the thresholds upward; later (higher) thresholds
            # overwrite the values of the rows that still pass them
            for zscore in np.arange(0, max_zscore, 0.25):
                num_random = (
                    bootstraps.groupby("iter")
                    .apply(lambda df: sum(df.zscore >= zscore))
                    .mean()
                )
                num_real = sum(overlap.zscore >= zscore)
                # Calculate FDR
                if num_real != 0 and num_random != 0:
                    fdr = num_random / num_real
                elif num_real != 0 and num_random == 0:
                    fdr = 0
                else:
                    fdr = 1
                overlap.loc[overlap.zscore >= zscore, "fdr"] = fdr
                overlap.loc[overlap.zscore >= zscore, "num_real"] = num_real
                overlap.loc[overlap.zscore >= zscore, "num_random"] = num_random
                overlap.loc[overlap.zscore >= zscore, "bs_mean"] = bs_mean
                overlap.loc[overlap.zscore >= zscore, "bs_std"] = bs_std
                overlap.sort_values(by=["zscore"], ascending=False, inplace=True)
            # Fraction of bootstrap iterations whose mean score is at least
            # the observed mean score
            overlap_pval = (
                sum(
                    bootstraps.groupby("iter").apply(lambda x: x.score.mean())
                    >= overlap.score.mean()
                )
            ) / len(bootstraps.iter.unique())
            # Annotate the rows; these are collated into self.results below
            overlap["COB"] = self.cob.name
            overlap["Ontology"] = self.ont.name
            overlap["Term"] = term.id
            overlap["WindowSize"] = self.args.candidate_window_size
            overlap["FlankLimit"] = self.args.candidate_flank_limit
            overlap["TermLoci"] = len(term.loci)
            overlap["TermCollapsedLoci"] = len(loci)
            overlap["TermPValue"] = overlap_pval
            overlap["NumBootstraps"] = len(bootstraps.iter.unique())
            overlap["Method"] = self.args.method
            overlap["SNP2Gene"] = self.args.snp2gene
            results.append(overlap.reset_index())
            # Summarize results
            if self.args.method == "density":
                overlap_score = np.nanmean(overlap.score) / (
                    1 / np.sqrt(overlap.num_trans_edges.mean())
                )
            elif self.args.method == "locality":
                overlap_score = np.nanmean(overlap.score)
            self.cob.log(
                "Overlap Score ({}): {} (p<{})".format(
                    self.args.method, overlap_score, overlap_pval
                )
            )
        if not args.dry_run:
            if not results:
                # pd.concat raises on an empty list; nothing to write
                self.cob.log("No overlap results were generated")
                return
            # Consolidate results and output to files
            self.results = pd.concat(results)
            self.results.to_csv(self.args.out, sep="\t", index=None)

            # Make an actual results object if not exists
            overlap_object = cls.create(self.ont)

            # Save the results to the SQLite table
            self.results.to_sql(
                "overlap",
                sqlite3.connect(overlap_object.db.filename),
                if_exists="append",
                index=False,
            )
コード例 #21
0
ファイル: Overlap.py プロジェクト: jonahcullen/Camoco
    def from_CLI(cls, args):
        '''
            Implements an interface for the CLI to perform overlap
            Analysis
        '''
        if args.genes != [None]:
            source = 'genes'
        elif args.go is not None:
            source = 'go'
        elif args.gwas is not None:
            source = 'gwas'
        self = cls.create(source, description='CLI Overlap')
        self.source = source
        self.args = args
        # Build base camoco objects
        self.cob = co.COB(args.cob)

        # Generate the ontology of terms that we are going to look
        # at the overlap of
        if source == 'genes':
            # Be smart about this
            import re
            args.genes = list(
                chain(*[re.split('[,; ]', x) for x in args.genes]))
            self.ont = pd.DataFrame()
            self.ont.name = 'GeneList'
            args.candidate_window_size = 1
            args.candidate_flank_limit = 0
        elif source == 'go':
            self.ont = co.GOnt(args.go)
            args.candidate_window_size = 1
            args.candidate_flank_limit = 0
        elif source == 'gwas':
            self.ont = co.GWAS(args.gwas)
        else:
            raise ValueError(
                'Please provide a valid overlap source (--genes, --go or --gwas)'
            )
        try:
            self.generate_output_name()
        except ValueError as e:
            return

        # Save strongest description arguments if applicable
        if 'strongest' in self.args.snp2gene:
            if not (self.ont._global('strongest_attr') == args.strongest_attr):
                self.ont.set_strongest(attr=args.strongest_attr)
            if not (bool(int(self.ont._global('strongest_higher'))) == bool(
                    args.strongest_higher)):
                self.ont.set_strongest(higher=args.strongest_higher)

        # Generate a terms iterable
        if self.source == 'genes':
            # make a single term
            loci = self.cob.refgen.from_ids(self.args.genes)
            if len(loci) < len(self.args.genes):
                self.cob.log('Some input genes not in network')
            terms = [Term('CustomTerm', desc='Custom from CLI', loci=loci)]
        else:
            # Generate terms from the ontology
            if 'all' in self.args.terms:
                terms = list(self.ont.iter_terms())
            else:
                terms = [self.ont[term] for term in self.args.terms]
        all_results = list()
        results = []

        num_total_terms = len(terms)
        # Iterate through terms and calculate
        for i, term in enumerate(terms):
            self.cob.log(' ---------- Calculating overlap for {} of {} Terms',
                         i, num_total_terms)
            if term.id in self.args.skip_terms:
                self.cob.log('Skipping {} since it was in --skip-terms',
                             term.id)
            self.cob.log('Generating SNP-to-gene mapping')
            # If appropriate, generate SNP2Gene Loci
            if self.args.candidate_flank_limit > 0:
                loci = self.snp2gene(term, self.ont)
            else:
                loci = list(term.loci)
                for x in loci:
                    x.window = 1

            # Filter out terms with insufficient or too many genes
            if len(loci) < 2 or len(loci) < args.min_term_size:
                self.cob.log('Not enough genes to perform overlap')
                continue
            if args.max_term_size != None and len(loci) > args.max_term_size:
                self.cob.log('Too many genes to perform overlap')
                continue

            # Send some output to the terminal
            self.cob.log(
                "Calculating Overlap for {} of {} in {} with window:{} and flank:{} ({} Loci)",
                term.id, self.ont.name, self.cob.name,
                self.args.candidate_window_size,
                self.args.candidate_flank_limit, len(loci))
            if args.dry_run:
                continue
            # Do the dirty
            try:
                overlap = self.overlap(loci)
            except DataError as e:
                continue
            self.cob.log('Generating bootstraps')
            bootstraps = self.generate_bootstraps(loci, overlap)
            bs_mean = bootstraps.groupby('iter').score.apply(np.mean).mean()
            bs_std = bootstraps.groupby('iter').score.apply(np.std).mean()
            # Calculate z scores for density
            self.cob.log('Calculating Z-Scores')
            if bs_std != 0:
                overlap['zscore'] = (overlap.score - bs_mean) / bs_std
                bootstraps['zscore'] = (bootstraps.score - bs_mean) / bs_std
            else:
                # If there is no variation, make all Z-scores 0
                overlap['zscore'] = bootstraps['zscore'] = 0
            # Calculate FDR
            self.cob.log('Calculating FDR')
            overlap['fdr'] = np.nan
            max_zscore = int(overlap.zscore.max()) + 1
            for zscore in np.arange(0, max_zscore, 0.25):
                num_random = bootstraps\
                        .groupby('iter')\
                        .apply(lambda df: sum(df.zscore >= zscore))\
                        .mean()
                num_real = sum(overlap.zscore >= zscore)
                # Calculate FDR
                if num_real != 0 and num_random != 0:
                    fdr = num_random / num_real
                elif num_real != 0 and num_random == 0:
                    fdr = 0
                else:
                    fdr = 1
                overlap.loc[overlap.zscore >= zscore, 'fdr'] = fdr
                overlap.loc[overlap.zscore >= zscore, 'num_real'] = num_real
                overlap.loc[overlap.zscore >= zscore,
                            'num_random'] = num_random
                overlap.loc[overlap.zscore >= zscore, 'bs_mean'] = bs_mean
                overlap.loc[overlap.zscore >= zscore, 'bs_std'] = bs_std
                overlap.sort_values(by=['zscore'],
                                    ascending=False,
                                    inplace=True)
            overlap_pval = (
                (sum(bootstraps.groupby('iter').apply(lambda x: x.score.mean()) >= overlap.score.mean()))\
                / len(bootstraps.iter.unique())
            )
            # This gets collated into all_results below
            overlap['COB'] = self.cob.name
            overlap['Ontology'] = self.ont.name
            overlap['Term'] = term.id
            overlap['WindowSize'] = self.args.candidate_window_size
            overlap['FlankLimit'] = self.args.candidate_flank_limit
            overlap['TermLoci'] = len(term.loci)
            overlap['TermCollapsedLoci'] = len(loci)
            overlap['TermPValue'] = overlap_pval
            overlap['NumBootstraps'] = len(bootstraps.iter.unique())
            overlap['Method'] = self.args.method
            overlap['SNP2Gene'] = self.args.snp2gene
            results.append(overlap.reset_index())
            # Summarize results
            if self.args.method == 'density':
                overlap_score = np.nanmean(overlap.score) / (
                    1 / np.sqrt(overlap.num_trans_edges.mean()))
            elif self.args.method == 'locality':
                overlap_score = np.nanmean(overlap.score)
            self.cob.log('Overlap Score ({}): {} (p<{})'.format(
                self.args.method, overlap_score, overlap_pval))
        if not args.dry_run:
            # Consolidate results and output to files
            self.results = pd.concat(results)
            self.results.to_csv(self.args.out, sep='\t', index=None)

            # Make an actual results object if not exists
            overlap_object = cls.create(self.ont)

            # Save the results to the SQLite table
            self.results.to_sql('overlap',
                                sqlite3.connect(overlap_object.db.filename),
                                if_exists='append',
                                index=False)
コード例 #22
0
    def from_CLI(cls, args):
        '''
            Implements an interface for the CLI to perform overlap
            Analysis

            For each requested term the SNPs are mapped to candidate
            genes, the overlap is scored and compared against
            bootstraps to derive z-scores, FDR values and a term level
            p-value. Unless args.dry_run is set, the collated table is
            written to self.args.out (tab separated) and appended to
            the 'overlap' table of the overlap object's SQLite file.

            Parameters
            ----------
            args : namespace of parsed CLI options
                Attributes read here: gwas, cob, go, terms, skip_terms,
                min_term_size, max_term_size, dry_run, method,
                snp2gene, candidate_window_size, candidate_flank_limit
                and out. When args.go is set the window size and flank
                limit are overwritten with 1 and 0.

            Returns
            -------
            None. Results are stored on self.results and written out.
        '''

        self = cls.create(args.gwas, description='CLI Overlap')
        # Build base camoco objects
        self.args = args
        self.cob = co.COB(args.cob)
        if args.go:
            # NOTE(review): the GO ontology is looked up by args.gwas,
            # not args.go -- confirm this is the intended name.
            self.ont = co.GOnt(args.gwas)
            args.candidate_window_size = 1
            args.candidate_flank_limit = 0
        else:
            self.ont = co.GWAS(args.gwas)
        # presumably this fills in self.args.out -- TODO confirm
        self.generate_output_name()

        # Generate a terms iterable
        if 'all' in self.args.terms:
            terms = self.ont.iter_terms()
        else:
            terms = [self.ont[term] for term in self.args.terms]

        results = []
        # Iterate through terms and calculate
        for term in terms:
            if term.id in self.args.skip_terms:
                self.cob.log('Skipping {} since it was in --skip-terms',
                             term.id)
                # Actually skip the term; previously it was only logged
                continue
            # Generate SNP2Gene Loci
            loci = self.snp2gene(term)
            if len(loci) < 2 or len(loci) < args.min_term_size:
                self.cob.log('Not enough genes to perform overlap')
                continue
            if (args.max_term_size is not None
                    and len(loci) > args.max_term_size):
                self.cob.log('Too many genes to perform overlap')
                continue
            # Send some output to the terminal
            self.cob.log(
                "Calculating Overlap for {} of {} in {} with window:{} and flank:{} ({} Loci)",
                term.id, self.ont.name, self.cob.name,
                self.args.candidate_window_size,
                self.args.candidate_flank_limit, len(loci))
            if args.dry_run:
                continue

            # Do the dirty
            try:
                overlap = self.overlap(loci)
            except DataError as e:
                # Keep going with the other terms, but say why this one
                # is missing from the output
                self.cob.log('Skipping {}: {}', term.id, e)
                continue
            bootstraps = self.generate_bootstraps(loci, overlap)
            per_iter = bootstraps.groupby('iter')
            bs_mean = per_iter.score.apply(np.mean).mean()
            bs_std = per_iter.score.apply(np.std).mean()
            # Calculate z scores for density
            if bs_std != 0:
                overlap['zscore'] = (overlap.score - bs_mean) / bs_std
                bootstraps['zscore'] = (bootstraps.score - bs_mean) / bs_std
            else:
                # If there is no variation, make all Z-scores 0
                overlap['zscore'] = bootstraps['zscore'] = 0
            # Calculate FDR at z-score cutoffs 0, 0.25, 0.5, ...
            overlap['fdr'] = np.nan
            max_zscore = int(overlap.zscore.max()) + 1
            for zscore in np.arange(0, max_zscore, 0.25):
                # Mean number of bootstrap genes passing the cutoff
                num_random = bootstraps\
                        .groupby('iter')\
                        .apply(lambda df: sum(df.zscore >= zscore))\
                        .mean()
                passing = overlap.zscore >= zscore
                num_real = sum(passing)
                # Calculate FDR
                if num_real != 0 and num_random != 0:
                    fdr = num_random / num_real
                elif num_real != 0 and num_random == 0:
                    fdr = 0
                else:
                    fdr = 1
                # Rows passing a higher cutoff are overwritten by later
                # iterations, so each row keeps its strictest cutoff
                overlap.loc[passing, 'fdr'] = fdr
                overlap.loc[passing, 'num_real'] = num_real
                overlap.loc[passing, 'num_random'] = num_random
                overlap.loc[passing, 'bs_mean'] = bs_mean
                overlap.loc[passing, 'bs_std'] = bs_std
                overlap.sort_values(by=['zscore'],
                                    ascending=False,
                                    inplace=True)
            # Fraction of bootstrap iterations whose mean score is at
            # least as high as the observed mean score
            num_bootstraps = len(bootstraps.iter.unique())
            iter_means = bootstraps.groupby('iter').apply(
                lambda x: x.score.mean())
            overlap_pval = (
                sum(iter_means >= overlap.score.mean()) / num_bootstraps)
            # Annotate the rows so they can be collated across terms
            overlap['COB'] = self.cob.name
            overlap['Ontology'] = self.ont.name
            overlap['Term'] = term.id
            overlap['WindowSize'] = self.args.candidate_window_size
            overlap['FlankLimit'] = self.args.candidate_flank_limit
            overlap['TermLoci'] = len(term.loci)
            overlap['TermCollapsedLoci'] = len(loci)
            overlap['TermPValue'] = overlap_pval
            overlap['NumBootstraps'] = num_bootstraps
            overlap['Method'] = self.args.method
            overlap['SNP2Gene'] = self.args.snp2gene
            results.append(overlap.reset_index())
        if not args.dry_run:
            if not results:
                # pd.concat raises on an empty list; report instead
                self.cob.log(
                    'No terms produced overlap results, nothing to save')
                return
            self.results = pd.concat(results)
            self.results.to_csv(self.args.out, sep='\t', index=None)
            overlap_object = cls.create(self.ont)
            overlap_object.results = results
            # Close the connection explicitly once the rows are stored
            connection = sqlite3.connect(overlap_object.db.filename)
            try:
                self.results.to_sql('overlap',
                                    connection,
                                    if_exists='append',
                                    index=False)
            finally:
                connection.close()
コード例 #23
0
ファイル: snp2gene.py プロジェクト: jonahcullen/Camoco
def snp2gene(args):
    '''
        Perform SNP (locus) to candidate gene mapping

        For every term, window size and flank limit combination the
        term's loci are collapsed (``effective`` or ``strongest``) and
        mapped to candidate genes of the reference genome. The combined
        table is optionally left-joined with the gene info files and
        written tab separated to ``args.out``.

        Parameters
        ----------
        args : namespace of parsed CLI options
            Attributes read here: out, force, refgen, gwas, terms,
            candidate_window_size (iterable), candidate_flank_limit
            (iterable), snp2gene, strongest_attr, strongest_higher,
            include_parent_attrs and gene_info (iterable of paths).

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If args.refgen names neither an Expr nor a RefGen dataset,
            or args.snp2gene contains neither 'effective' nor
            'strongest'.
    '''

    if args.out != sys.stdout:
        # Create any non-existant directories
        out_dir = os.path.dirname(args.out)
        if out_dir != '':
            os.makedirs(out_dir, exist_ok=True)
        if os.path.exists(args.out) and not args.force:
            print(
                "Output for {} exists! Skipping!".format(
                    args.out
                ), file=sys.stderr
            )
            return None

    # Name of the COB the refgen was taken from (None if not from a COB)
    cob_name = None
    # Create the refgen (option to create it from a COB)
    if co.Tools.available_datasets('Expr', args.refgen):
        refgen = co.COB(args.refgen).refgen
        cob_name = args.refgen
    elif co.Tools.available_datasets('RefGen', args.refgen):
        refgen = co.RefGen(args.refgen)
    else:
        # Fail early with a clear message instead of a NameError later
        raise ValueError(
            '{} is not an available Expr or RefGen dataset'.format(
                args.refgen))
    # Create the GWAS object
    ont = co.GWAS(args.gwas)

    if 'all' in args.terms:
        terms = ont.iter_terms()
    else:
        terms = [ont[term] for term in args.terms]

    # Collect the per-combination tables and concatenate them once
    frames = []
    for term in terms:
        for window_size in args.candidate_window_size:
            for flank_limit in args.candidate_flank_limit:
                if 'effective' in args.snp2gene:
                    # Map to effective
                    effective_loci = term.effective_loci(
                        window_size=window_size
                    )
                elif 'strongest' in args.snp2gene:
                    # NOTE(review): strongest_higher is passed as
                    # `lowest` -- confirm the polarity is intended.
                    effective_loci = term.strongest_loci(
                        window_size=window_size,
                        attr=args.strongest_attr,
                        lowest=args.strongest_higher
                    )
                else:
                    raise ValueError(
                        '{} not valid snp2gene mapping'.format(
                            args.snp2gene))
                genes = pd.DataFrame([
                    x.as_dict() for x in refgen.candidate_genes(
                        effective_loci,
                        flank_limit=flank_limit,
                        include_parent_locus=True,
                        include_num_siblings=True,
                        include_num_intervening=True,
                        include_rank_intervening=True,
                        include_SNP_distance=True,
                        include_parent_attrs=args.include_parent_attrs,
                        attrs={'Term': term.id},
                    )
                ])
                genes['FlankLimit'] = flank_limit
                genes['WindowSize'] = window_size
                genes['RefGen'] = refgen.name
                if cob_name is not None:
                    genes['COB'] = cob_name
                frames.append(genes)
    data = pd.concat(frames) if frames else pd.DataFrame()

    # Add data from gene info files
    for info_file in args.gene_info:
        log('Adding info for {}', info_file)
        # Assume the file is a table
        info = pd.read_table(info_file, sep='\t')
        if len(info.columns) == 1:
            # A single column suggests the file is comma separated
            info = pd.read_table(info_file, sep=',')
        # try to match as many columns as possible
        matching_columns = set(data.columns).intersection(info.columns)
        log("Joining SNP2Gene mappings with info file on: {}",
            ','.join(matching_columns))
        rows_before = len(data)
        data = pd.merge(data, info, how='left')
        # Compare against the size just before this merge so the
        # warning is only raised for the file that caused duplicates
        if len(data) != rows_before:
            log.warn(
                'There were multiple info rows for some genes. '
                'Beware of potential duplicate candidate gene entries! '
            )

    # Generate the output file
    data.to_csv(args.out, index=None, sep='\t')

    if len(data) == 0:
        # The summary below needs the mapping columns to exist
        log("No candidate genes were mapped")
        return None

    log("Summary stats")
    print('-'*100)
    print("Mapped {} SNPs to {} genes".format(
        len(data.parent_locus.unique()), len(data.ID.unique())))
    print("Number of candidate genes per term:")
    print(data.groupby('Term').apply(lambda df: len(df.ID)))
コード例 #24
0
ファイル: server.py プロジェクト: monprin/cob
                bundle.write(fd.read())
                bundle.write('\n')


# Build the combined JavaScript and CSS bundles
bundle_files(js_files, 'js')
bundle_files(css_files, 'css')

# ----------------------------------------
#    Load things into memory to prepare
# ----------------------------------------
# With no networks configured, fall back to every available Expr dataset
print('Preloading networks into memory...')
if len(conf['networks']) == 0:
    conf['networks'] = list(
        co.Tools.available_datasets('Expr')['Name'].values
    )
networks = dict((x, co.COB(x)) for x in conf['networks'])

# Collect display info for each network, plus the reference links
# configured for its parent reference genome (if any)
network_info = []
refLinks = {}
for name, net in networks.items():
    network_info.append(dict(
        name=net.name,
        refgen=net._global('parent_refgen'),
        desc=net.description,
    ))
    if net._global('parent_refgen') not in conf['refLinks']:
        continue
    refLinks[net.name] = conf['refLinks'][net._global('parent_refgen')]
print('Availible Networks: {}'.format(networks))

# Generate ontology list based on allowed list and load them into memory
print('Preloading GWASes into Memory...')
コード例 #25
0
def geneneighbors(args):
    """Write the strongest co-expression neighbors of every gene to a file.

    The output path is derived from ``args.out`` by replacing its
    extension with ``_neighbors.txt``; nothing is done if that file
    already exists. Each output row holds the gene id, its number of
    neighbors and up to ``args.numneighbors`` tab separated
    ``gene,score,distance`` entries ordered by descending score.

    Args:
        args: parsed CLI options. Attributes read here: out, cob,
            zscore and numneighbors. ``args.out`` is overwritten with
            the derived output path.

    Returns:
        None
    """
    args.out = os.path.splitext(args.out)[0] + "_neighbors.txt"
    out_dir = os.path.dirname(args.out)
    if out_dir != "":
        os.makedirs(out_dir, exist_ok=True)
    if os.path.exists(args.out):
        print("Output for {} exists! Skipping!".format(args.out), file=sys.stderr)
        return

    cob = co.COB(args.cob)
    cob.set_sig_edge_zscore(int(args.zscore))
    genes = cob.refgen.iter_genes()
    print("Generating neighbors for {} ".format(cob.name), file=sys.stderr)
    max_neighbors = int(args.numneighbors)

    # One (gene id, neighbor count, neighbor listing) tuple per gene
    rows = []
    for gene in genes:
        # Collect the ids found on either end of the gene's edges
        edges = cob.neighbors(gene).reset_index()
        neighbor_ids = set(edges.gene_a).union(edges.gene_b)
        # Genes that do not show up in their own edge table are skipped
        if gene.id not in neighbor_ids:
            continue
        # The gene is listed on its own edges, so drop it
        neighbor_ids.remove(gene.id)

        scored = []
        for neighbor_id in neighbor_ids:
            neighbor = cob.refgen.from_ids(neighbor_id)
            coex = cob.coexpression(gene, neighbor)
            # coex[0] is used as the score and coex[2] as the distance;
            # coex[1] (the significance flag) is not written out
            scored.append((neighbor_id, coex[0], coex[2]))
        # Strongest score first, then keep the requested number
        scored.sort(key=lambda entry: entry[1], reverse=True)
        top = scored[:max_neighbors]

        # An empty selection yields an empty listing instead of the
        # unpacking error the zip(*...) round trip used to raise
        listing = "\t".join(
            ",".join((str(other), str(score), str(dist)))
            for other, score, dist in top
        )
        rows.append((gene.id, str(len(neighbor_ids)), listing))

    # NOTE(review): the header advertises a "Significant" field that is
    # not part of the written entries (gene, score, distance); kept
    # as-is so existing consumers of the file are unaffected.
    with open(args.out, "w") as output:
        output.write(
            "Gene"
            + "\t"
            + "Number of Neighbors"
            + "\t"
            + "Gene,ZScore,Significant,distance"
            + "\n"
        )
        for row in rows:
            output.write("\t".join(row))
            output.write("\n")
コード例 #26
0
def cob_health(args):
    '''
        Generate health/QC plots and summary files for a COB network.

        Every output is named from the args.out prefix and is only
        generated when the corresponding file does not exist yet:
        score plots (_CoexPCC_raw.png, _CoexScore_zscore.png),
        expression plots (_Expr_raw.png, _Expr_norm.png,
        _Expr_cluster.png), the network summary (.summary.txt), the
        per chromosome QC table (_qc_gene.txt, needs args.refgen), the
        degree distribution (_DegreeDist.png) and the GO enrichment
        table and plot (_GO.csv, _GO.png, need args.go).

        Parameters
        ----------
        args : namespace of parsed CLI options
            Attributes read here: cob, out, refgen, go, max_terms,
            min_term_size, max_term_size, two_tailed_GO and
            num_bootstraps. args.out defaults to '<cob name>_Health'
            when it is None.

        Returns
        -------
        None
    '''
    log = coblog()
    log('\n'
        '-----------------------\n'
        '   Network Health      \n'
        '-----------------------\n')
    cob = co.COB(args.cob)
    if args.out is None:
        args.out = '{}_Health'.format(cob.name)

    log('Plotting Scores ----------------------------------------------------')
    if not path.exists('{}_CoexPCC_raw.png'.format(args.out)):
        cob.plot_scores('{}_CoexPCC_raw.png'.format(args.out), pcc=True)
    else:
        log('Skipped Raw.')

    if not path.exists('{}_CoexScore_zscore.png'.format(args.out)):
        cob.plot_scores('{}_CoexScore_zscore.png'.format(args.out), pcc=False)
    else:
        log('Skipped Norm.')

    log('Plotting Expression ------------------------------------------------')
    if not path.exists('{}_Expr_raw.png'.format(args.out)):
        cob.plot('{}_Expr_raw.png'.format(args.out),
                 include_accession_labels=True,
                 raw=True,
                 cluster_method=None)
    else:
        log('Skipped raw.')
    if not path.exists('{}_Expr_norm.png'.format(args.out)):
        cob.plot('{}_Expr_norm.png'.format(args.out),
                 include_accession_labels=True,
                 raw=False,
                 cluster_method='leaf',
                 cluster_accessions=True)
    else:
        log('Skipped norm.')
    log('Plotting Cluster Expression-----------------------------------------')
    if not path.exists('{}_Expr_cluster.png'.format(args.out)):
        cob.plot('{}_Expr_cluster.png'.format(args.out),
                 include_accession_labels=True,
                 raw=False,
                 cluster_accessions=True,
                 avg_by_cluster=True)
    else:
        # This used to report 'norm', copied from the plot above
        log('Skipped cluster.')
    log('Printing Summary ---------------------------------------------------')
    if not path.exists('{}.summary.txt'.format(args.out)):
        with open('{}.summary.txt'.format(args.out), 'w') as OUT:
            # Print out the network summary
            cob.summary(file=OUT)
    else:
        log('Skipped summary.')

    log('Printing QC Statistics ---------------------------------------------')
    if args.refgen is not None:
        if not path.exists('{}_qc_gene.txt'.format(args.out)):
            # Print out the breakdown of QC Values
            refgen = co.RefGen(args.refgen)
            gene_qc = cob._bcolz('qc_gene')
            gene_qc = gene_qc[gene_qc.pass_membership]
            gene_qc['chrom'] = [
                'chr' + str(refgen[x].chrom) for x in gene_qc.index
            ]
            gene_qc = gene_qc.groupby('chrom').agg(sum, axis=0)
            # Add totals at the bottom (every column but the first).
            # .iloc and pd.concat replace DataFrame.ix/.append, which
            # no longer exist in current pandas releases.
            totals = gene_qc.iloc[:, 1:].apply(sum)
            totals.name = 'TOTAL'
            total_row = totals.to_frame().T
            # Keep the index label so the CSV header is unchanged
            total_row.index.name = gene_qc.index.name
            gene_qc = pd.concat([gene_qc, total_row])
            gene_qc.to_csv('{}_qc_gene.txt'.format(args.out), sep='\t')
        else:
            log('Skipped QC summary.')

    log('Plotting Degree Distribution ---------------------------------------')
    if not path.exists('{}_DegreeDist.png'.format(args.out)):
        degree = cob.degree['Degree'].values
        #Using powerlaw makes run-time warning the first time you use it.
        #This is still an open issue on the creators github.
        #The creator recommends removing this warning as long as there is a fit.
        np.seterr(divide='ignore', invalid='ignore')
        fit = powerlaw.Fit(degree, discrete=True, xmin=1)
        # get an axis
        ax = plt.subplot()
        # Calculate log ratios (the results are currently not used)
        fit.distribution_compare('truncated_power_law', 'power_law')
        fit.distribution_compare('truncated_power_law', 'exponential')
        fit.distribution_compare('power_law', 'exponential')
        # Plot the empirical CCDF next to the three fitted models
        fit.plot_ccdf(ax=ax,
                      color='r',
                      linewidth=3,
                      label='Empirical Data')
        fit.power_law.plot_ccdf(ax=ax,
                                color='b',
                                linestyle='--',
                                label='Power law')
        fit.truncated_power_law.plot_ccdf(ax=ax,
                                          color='k',
                                          linestyle='--',
                                          label='Truncated Power')
        fit.exponential.plot_ccdf(ax=ax,
                                  color='g',
                                  linestyle='--',
                                  label='Exponential')
        ax.set_ylabel("p(Degree≥x)")
        ax.set_xlabel("Degree Frequency")
        ax.legend(loc='best')
        plt.title('{} Degree Distribution'.format(cob.name))
        # Save Fig
        try:
            plt.savefig('{}_DegreeDist.png'.format(args.out))
        except FutureWarning:
            # This is a matplotlib bug
            pass
    else:
        log('Skipping Degree Dist.')

    log('Plotting GO --------------------------------------------------------')
    if args.go is not None:
        if not path.exists('{}_GO.csv'.format(args.out)):
            go = co.GOnt(args.go)
            term_ids = []
            density_emp = []
            density_pvals = []
            locality_emp = []
            locality_pvals = []
            term_sizes = []
            term_desc = []
            terms_tested = 0
            if args.max_terms is not None:
                log('Limiting to {} GO Terms', args.max_terms)
                terms = go.rand(n=args.max_terms,
                                min_term_size=args.min_term_size,
                                max_term_size=args.max_term_size)
            else:
                terms = go.iter_terms(min_term_size=args.min_term_size,
                                      max_term_size=args.max_term_size)
            for term in terms:
                # Only keep the loci that are part of the network, then
                # re-check the size limits on what is left
                term.loci = list(filter(lambda x: x in cob, term.loci))
                if len(term) < args.min_term_size or len(
                        term) > args.max_term_size:
                    continue
                #set density value for two tailed go so we only test it once
                density = cob.density(term.loci)
                # One tailed test: skip terms without positive density
                if args.two_tailed_GO is False and not density > 0:
                    continue
                density_emp.append(density)
                term_ids.append(term.id)
                term_sizes.append(len(term))
                term_desc.append(str(term.desc))
                # ------ Density
                # Calculate PVals from random gene sets of the same size
                density_bs = np.array([
                    cob.density(cob.refgen.random_genes(n=len(term.loci))) \
                    for x in range(args.num_bootstraps)
                ])
                if density > 0:
                    pval = sum(density_bs >= density) / args.num_bootstraps
                else:
                    pval = sum(density_bs <= density) / args.num_bootstraps
                density_pvals.append(pval)

                # ------- Locality
                locality = cob.locality(term.loci,
                                        include_regression=True).resid.mean()
                locality_emp.append(locality)
                # Calculate PVals
                locality_bs = np.array([
                    cob.locality(
                        cob.refgen.random_genes(n=len(term.loci)),
                        include_regression=True
                    ).resid.mean() \
                    for x in range(args.num_bootstraps)
                ])
                if locality > 0:
                    pval = sum(locality_bs >= locality) / args.num_bootstraps
                else:
                    pval = sum(locality_bs <= locality) / args.num_bootstraps
                locality_pvals.append(pval)
                # -------------
                terms_tested += 1
                if terms_tested % 100 == 0 and terms_tested > 0:
                    log('Processed {} terms'.format(terms_tested))
            go_enrichment = pd.DataFrame({
                'GOTerm': term_ids,
                'desc': term_desc,
                'size': term_sizes,
                'density': density_emp,
                'density_pval': density_pvals,
                'locality': locality_emp,
                'locality_pval': locality_pvals
            })
            go_enrichment\
                .sort_values(by='density_pval',ascending=True)\
                .to_csv('{}_GO.csv'.format(args.out),index=False)
            if terms_tested == 0:
                log.warn('No GO terms met your min/max gene criteria!')
        else:
            go_enrichment = pd.read_table('{}_GO.csv'.format(args.out),
                                          sep=',')

        if path.exists('{}_GO.png'.format(args.out)):
            log('Skipping GO Volcano.')
        elif len(go_enrichment) == 0:
            # The fold enrichment below divides by the number of terms
            log.warn('No GO terms available to plot, skipping GO Volcano.')
        else:
            # Convert pvals to log10
            with np.errstate(divide='ignore'):
                for pval_col in ('density_pval', 'locality_pval'):
                    # When no bootstraps are more extreme than the term,
                    # the minus log pval yields an infinite
                    go_enrichment[pval_col] = -1 * np.log10(
                        go_enrichment[pval_col])
                    # Fix the infinites so they are plotted just above
                    # the largest finite value (0 if there is none)
                    finite = np.isfinite(go_enrichment[pval_col])
                    if finite.any():
                        ceiling = np.max(go_enrichment[pval_col][finite])
                    else:
                        ceiling = 0
                    go_enrichment.loc[np.logical_not(finite),
                                      pval_col] = ceiling + 1
            plt.clf()
            figure, axes = plt.subplots(3, 2, figsize=(12, 12))
            threshold = -1 * np.log10(0.05)
            # One column of panels per metric: (column, data column,
            # x label of the top panel, y label of the bottom panel)
            panels = (
                (0, 'density', 'Empirical Density (Z-Score)', 'Density'),
                (1, 'locality', 'Empirical Locality (Residual)', 'Locality'),
            )
            for col, metric, emp_label, metric_label in panels:
                pval_col = '{}_pval'.format(metric)
                # Top: empirical value vs. bootstrapped p-value
                axes[0, col].scatter(go_enrichment[metric],
                                     go_enrichment[pval_col],
                                     alpha=0.05)
                axes[0, col].set_xlabel(emp_label)
                axes[0, col].set_ylabel('Bootstraped -log10(p-value)')
                # Terms above the cutoff relative to the 5% expected
                fold = sum(np.array(go_enrichment[pval_col]) > 1.3) / (
                    0.05 * len(go_enrichment))
                axes[0, col].axhline(y=threshold, color='red')
                axes[0, col].text(min(axes[0, col].get_xlim()),
                                  threshold,
                                  '{:.3g} Fold Enrichement'.format(fold),
                                  color='red')
                # Middle: term size vs. bootstrapped p-value
                axes[1, col].scatter(go_enrichment['size'],
                                     go_enrichment[pval_col],
                                     alpha=0.05)
                axes[1, col].set_xlabel('Term Size')
                axes[1, col].set_ylabel('Bootstrapped -log10(p-value)')
                axes[1, col].axhline(y=threshold, color='red')
                # Bottom: term size vs. empirical value, with the terms
                # above the cutoff highlighted in red
                axes[2, col].scatter(go_enrichment['size'],
                                     go_enrichment[metric],
                                     alpha=0.05)
                significant = go_enrichment.query(
                    '{}>1.3'.format(pval_col))
                axes[2, col].scatter(significant['size'],
                                     significant[metric],
                                     alpha=0.05,
                                     color='r')
                # The locality panel used to be labelled 'Density'
                axes[2, col].set_ylabel(metric_label)
                axes[2, col].set_xlabel('Term Size')
            # Save Figure
            plt.tight_layout()
            try:
                plt.savefig('{}_GO.png'.format(args.out))
            except FutureWarning:
                pass
コード例 #27
0
ファイル: server.py プロジェクト: lisabang/cob
                bundle.write(fd.read())
                bundle.write("\n")


# Build the combined JavaScript and CSS bundles
bundle_files(js_files, "js")
bundle_files(css_files, "css")

# ----------------------------------------
#    Load things into memory to prepare
# ----------------------------------------
# With no networks configured, fall back to every available Expr dataset
print("Preloading networks into memory...")
if len(conf["networks"]) == 0:
    conf["networks"] = list(
        co.Tools.available_datasets("Expr")["Name"].values
    )
networks = dict((x, co.COB(x)) for x in conf["networks"])

# Collect display info for each network, plus the reference links
# configured for its parent reference genome (if any)
network_info = []
refLinks = {}
for name, net in networks.items():
    network_info.append(dict(
        name=net.name,
        refgen=net._global("parent_refgen"),
        desc=net.description,
    ))
    if net._global("parent_refgen") not in conf["refLinks"]:
        continue
    refLinks[net.name] = conf["refLinks"][net._global("parent_refgen")]
print("Availible Networks: {}".format(networks))
コード例 #28
0
    usage()
    sys.exit(2)

# Map the parsed command line options onto the two network names
for opt, arg in opts:
    if opt == "-h" or opt == "--help":
        usage()
        sys.exit(2)
    elif opt == "-c" or opt == "--cob":
        cobname = arg
    elif opt == "-s" or opt == "--secondnetwork":
        cob2name = arg
    else:
        assert False, "unhandled option"

# Load both network objects
cob = co.COB(cobname)
cob_compare = co.COB(cob2name)

# Use the same significance cutoff for the edges of both networks
cob.set_sig_edge_zscore(2.5)
cob_compare.set_sig_edge_zscore(2.5)

# Wrap the cluster assignments in a pandas DataFrame
Clusters = pd.DataFrame(cob.clusters)

# Ordered mapping filled in below: each key is a cluster
# and each value collects the genes of that cluster
ClustDict = collections.OrderedDict()
for index, row in Clusters.iterrows():
    if row[0] in ClustDict.keys():
        ClustDict[row[0]].append(index)
コード例 #29
0
ファイル: plotGWAS.py プロジェクト: zhaijj/Camoco
def _draw_gene_bars(ax, genes, locus, voffset, hoffset, label, color):
    """Draw one horizontal bar per gene on ``ax``.

    Every bar is drawn at height ``voffset``, is ``len(gene)`` wide and is
    placed at ``hoffset + gene.start - locus.start + locus.window``.
    """
    for gene in genes:
        # The vertical position and the width are passed positionally: the
        # name of the first parameter differs between matplotlib versions
        # (``bottom`` in old releases, ``y`` in current ones).
        ax.barh(
            voffset,
            len(gene),
            height=5,
            zorder=1,
            left=hoffset + gene.start - locus.start + locus.window,
            label=label,
            color=color,
        )


def plot_gwas(args):
    """Plot the loci of GWAS terms, one figure per term.

    For every requested term a figure is created with one subplot per
    chromosome. Each locus is drawn on its chromosome's subplot together with
    the surrounding genes (grey: genes from the GWAS reference genome, green:
    genes from the COB reference genome, red: candidate genes within the
    flank limit), the locus window markers and the locus' sub loci.

    Parameters
    ----------
    args : namespace
        Reads ``cob``, ``gwas``, ``terms``, ``snp2gene``,
        ``candidate_window_size``, ``candidate_flank_limit``,
        ``strongest_attr``, ``strongest_higher`` and ``out``.

    Raises
    ------
    ValueError
        If ``args.snp2gene`` is neither 'effective' nor 'strongest'.

    Each figure is saved to ``args.out`` with '.png' replaced by
    '_<term id>.png'.
    """
    # snag the appropriate COB
    cob = co.COB(args.cob)
    # snag the GWAS object
    gwas = co.GWAS(args.gwas)

    if 'all' in args.terms:
        terms = gwas.iter_terms()
    else:
        terms = [gwas[term] for term in args.terms]

    # Make a plot for each Term
    for term in terms:
        loci = list(term.loci)
        # create a dictionary of Loci which we can refer to using ids
        locus_lookup = {x.id: x for x in loci}
        # Each chromosome gets a plot
        chroms = {x.chrom for x in loci}

        # Create a figure with a subplot for each chromosome.
        # squeeze=False always hands back a 2-D array of Axes; without it a
        # single chromosome yields a bare Axes object that cannot be indexed.
        fig, axes = plt.subplots(
            len(chroms), squeeze=False, figsize=(15, 4 * len(chroms)))
        axes = axes[:, 0]
        plt.title('{} Term'.format(term.id))
        # Pull out the snp to gene mappings
        if args.snp2gene == 'effective':
            loci = sorted(
                term.effective_loci(window_size=args.candidate_window_size))
        elif args.snp2gene == 'strongest':
            # NOTE(review): `lowest` is fed from `strongest_higher`; confirm
            # the flag polarity against strongest_loci.
            loci = term.strongest_loci(window_size=args.candidate_window_size,
                                       attr=args.strongest_attr,
                                       lowest=args.strongest_higher)
        else:
            raise ValueError('{} not valid snp2gene mapping'.format(
                args.snp2gene))

        # iterate over Loci
        seen_chroms = set()
        voffset = 1  # Vertical Offset
        current_axis = 0
        y_labels = []
        y_ticks = []
        for locus in loci:
            hoffset = -1 * locus.window
            # Reset the temp variables if necessary
            if locus.chrom not in seen_chroms:
                seen_chroms.add(locus.chrom)
                current_axis = len(seen_chroms) - 1
                voffset = 1
                if y_labels and current_axis > 0:
                    # Flush the collected labels onto the previous axis
                    axes[current_axis - 1].set_yticks(y_ticks)
                    axes[current_axis - 1].set_yticklabels(y_labels)
                y_labels = []
                y_ticks = []
            # shortcut for current axis
            cax = axes[current_axis]
            # Set up labels the first time an axis is used
            if voffset == 1:
                cax.set_ylabel('Chrom: ' + locus.chrom)
                cax.set_xlabel('Loci')
            # No cax.hold(True) here: Axes.hold no longer exists in current
            # matplotlib and drawing on top of existing artists is the
            # default behaviour.

            # Plot ALL Genes
            _draw_gene_bars(
                cax,
                gwas.refgen.candidate_genes(locus, flank_limit=10e10),
                locus, voffset, hoffset,
                label='RefGen Genes', color='grey')
            # Plot the genes present in the COB reference genome
            _draw_gene_bars(
                cax,
                cob.refgen.candidate_genes(locus, flank_limit=10e10),
                locus, voffset, hoffset,
                label='Gene Passed QC', color='green')
            # Plot the candidate genes
            _draw_gene_bars(
                cax,
                cob.refgen.candidate_genes(
                    locus, flank_limit=args.candidate_flank_limit),
                locus, voffset, hoffset,
                label='Candidate Gene', color='red')

            # Plot the Effective Locus
            cax.scatter(  # Upstream
                hoffset, voffset, marker='>', zorder=2)
            cax.scatter(  # Start
                hoffset + locus.window,
                voffset,
                marker='.',
                color='blue',
                zorder=2)
            cax.scatter(  # Stop
                hoffset + locus.window + len(locus),
                voffset,
                marker='.',
                color='blue',
                zorder=2)
            cax.scatter(  # Downstream
                hoffset + locus.window + len(locus) + locus.window,
                voffset,
                marker='<',
                zorder=2)
            # Plot the Sub Loci
            for sub_id in locus.sub_loci:
                if sub_id in locus_lookup:
                    sub_locus = locus_lookup[sub_id]
                    cax.scatter(hoffset + locus.window +
                                abs(sub_locus.start - locus.start),
                                voffset,
                                zorder=2,
                                marker='.',
                                label='SNP',
                                color='blue')

            # place a block for interlocal distance
            y_labels.append(commify(locus.start))
            y_ticks.append(voffset)
            voffset += 10
        # Have to finish off the ticks on the last chromosome
        axes[current_axis].set_yticks(y_ticks)
        axes[current_axis].set_yticklabels(y_labels)
        # Save Plot
        plt.savefig(args.out.replace('.png', '_{}.png'.format(term.id)))
        plt.close()
コード例 #30
0
    def from_CLI(cls, args):
        """
            Implements an interface to the CLI to perform GWAS simulation

            Builds an instance from the parsed CLI ``args``, loads the
            ontology (``args.GOnt``) and the network (``args.cob``) and, for
            every selected GO term, compares the overlap score of the term's
            (perturbed) effective loci with bootstrapped scores.

            ``args.terms`` is read in one of three ways: the word "all"
            selects every term within ``min_term_size``/``max_term_size``;
            if its first entry is an existing path, one term name is read
            from each line of that file; otherwise the entries themselves
            are used as term names.

            The per-term tables are concatenated into ``self.results`` and
            written to ``args.out`` as a tab separated file.
        """
        self = cls()
        # Build the base objects
        self.args = args
        # Load camoco objects
        self.go = co.GOnt(self.args.GOnt)
        self.cob = co.COB(self.args.cob)
        self.generate_output_name()

        # Generate a list of GO Terms
        if "all" in self.args.terms:
            # Create a list of all terms within the size specification
            terms = list(
                self.go.iter_terms(
                    min_term_size=self.args.min_term_size,
                    max_term_size=self.args.max_term_size,
                ))
        elif os.path.exists(self.args.terms[0]):
            # If the parameter is a filename, read the term names from it.
            # The context manager closes the handle once the names are read.
            with open(self.args.terms[0]) as term_file:
                terms = [self.go[line.strip()] for line in term_file]
        else:
            # Generate terms from a parameter list
            terms = [self.go[x] for x in self.args.terms]
        # Iterate and calculate
        log("Simulating GWAS for {} GO Terms", len(terms))
        term_sizes = [len(x) for x in terms]
        min_term_size = np.min(term_sizes)
        max_term_size = np.max(term_sizes)
        log("All terms are between {} and {} 'SNPs'", min_term_size,
            max_term_size)

        results = []
        for i, term in enumerate(terms):
            log("-" * 75)
            window_size = self.args.candidate_window_size
            flank_limit = self.args.candidate_flank_limit
            # Generate a series of densities for parameters
            num_genes = len([x for x in term.loci if x in self.cob])
            eloci = [
                x for x in term.effective_loci(window_size=window_size)
                if x in self.cob
            ]
            # Perturb the effective loci according to the requested
            # missing/false candidate percentages
            eloci = self.simulate_missing_candidates(eloci,
                                                     self.args.percent_mcr)
            eloci = self.simulate_false_candidates(eloci,
                                                   self.args.percent_fcr)
            log(
                "GWAS Simulation {}: {} ({}/{} genes in {})",
                i,
                term.id,
                len(eloci),
                num_genes,
                self.cob.name,
            )
            # Make sure that the number of genes is adequate
            if num_genes > self.args.max_term_size:
                log("Too many genes... skipping")
                continue
            elif num_genes < self.args.min_term_size:
                log("Too few genes... skipping")
                continue
            elif num_genes == 0:
                continue
            # Generate candidate genes from the effective loci
            candidates = self.cob.refgen.candidate_genes(
                eloci, flank_limit=flank_limit)
            log(
                "SNP to gene mapping finds {} genes at window:{} bp, "
                "flanking:{} genes",
                len(candidates),
                window_size,
                flank_limit,
            )
            overlap = self.overlap(eloci)
            # Dont bother bootstrapping on terms with overlap score below 0
            if overlap.score.mean() < 0:
                continue
            bootstraps = self.generate_bootstraps(eloci, overlap)
            bs_mean = bootstraps.groupby("iter").score.apply(np.mean).mean()
            bs_std = bootstraps.groupby("iter").score.apply(np.std).mean()
            # Calculate z scores for density
            overlap["zscore"] = (overlap.score - bs_mean) / bs_std
            bootstraps["zscore"] = (bootstraps.score - bs_mean) / bs_std
            # The p-value is the fraction of bootstrap iterations whose mean
            # score is at least the mean score of the real overlap
            num_bootstraps = len(bootstraps.iter.unique())
            iter_means = bootstraps.groupby("iter").apply(
                lambda x: x.score.mean())
            overlap_pval = sum(
                iter_means >= overlap.score.mean()) / num_bootstraps
            # Create a results object
            overlap["COB"] = self.cob.name
            overlap["Ontology"] = self.go.name
            overlap["Term"] = term.id
            overlap["WindowSize"] = window_size
            overlap["FlankLimit"] = flank_limit
            overlap["FCR"] = self.args.percent_fcr
            overlap["MCR"] = self.args.percent_mcr
            overlap["NumRealGenes"] = num_genes
            overlap["NumEffective"] = len(eloci)
            overlap["NumCandidates"] = len(candidates)
            overlap["TermSize"] = len(term)
            overlap["TermCollapsedLoci"] = len(eloci)
            overlap["TermPValue"] = overlap_pval
            overlap["NumBootstraps"] = num_bootstraps
            overlap["Method"] = self.args.method
            results.append(overlap.reset_index())

        self.results = pd.concat(results)
        self.results.to_csv(self.args.out, sep="\t", index=False)