Python RefGen Examples, camoco.RefGen Python Examples

Example #1

0

Show file

def Zm5bFGS():
    """Return the 'Zm5bFGS' RefGen dataset, building it from GFF if absent.

    When ``cf.test.force.RefGen`` is truthy the existing dataset is deleted
    before the availability check.
    """
    dataset = 'Zm5bFGS'

    force_rebuild = cf.test.force.RefGen
    if force_rebuild:
        tools.del_dataset('RefGen', dataset, force=True)

    already_built = tools.available_datasets('RefGen', dataset)
    if not already_built:
        # Nothing on disk yet: import the raw GFF shipped with the test data.
        raw_gff = os.path.join(cf.options.testdir, 'raw', 'RefGen',
                               'ZmB73_5b_FGS.gff.gz')
        gff_path = os.path.expanduser(raw_gff)
        co.RefGen.from_gff(
            gff_path,
            dataset,
            'Maize 5b Filtered Gene Set',
            '5b',
            'Zea Mays',
        )

    # The freshly built object is deliberately discarded and the dataset is
    # re-opened by name (the original author notes pytables will not allow
    # more than one open table).
    return co.RefGen(dataset)

Example #2

0

Show file

File: conftest.py Project: lisabang/Camoco

def Zm5bFGS():
    """Provide the "Zm5bFGS" reference genome, creating it on first use.

    A truthy ``cf.test.force.RefGen`` deletes any stored copy first.
    """
    kind, name = "RefGen", "Zm5bFGS"

    if cf.test.force.RefGen:
        tools.del_dataset(kind, name, force=True)

    if tools.available_datasets(kind, name):
        return co.RefGen(name)

    # We have to build it from the raw GFF in the test directory.
    gff = os.path.expanduser(
        os.path.join(
            cf.options.testdir, "raw", "RefGen", "ZmB73_5b_FGS.gff.gz"
        )
    )
    co.RefGen.from_gff(
        gff, name, "Maize 5b Filtered Gene Set", "5b", "Zea Mays"
    )
    # Re-open by name rather than returning the builder's result (the
    # original author notes pytables will not open more than one table).
    return co.RefGen(name)

Example #3

0

Show file

File: Tests.py Project: hawkaa/Camoco

 def ZmRoot(self):
     """Delete and rebuild the 'ZmRoot' expression network.

     The network is built from ROOTFPKM.tsv under the configured test
     directory, against the 'Zm5bFGS' reference genome. Nothing is returned.
     """
     co.del_dataset('Expr', 'ZmRoot', safe=False)
     refgen = co.RefGen('Zm5bFGS')
     test_dir = cf.get('options', 'testdir')
     fpkm_table = os.path.join(test_dir, 'raw', 'Expression', 'ROOTFPKM.tsv')
     # The built COB is not kept; the call is made for its side effect of
     # creating the dataset.
     co.COB.from_table(
         fpkm_table,
         'ZmRoot',
         'Maize Root Network',
         refgen,
         rawtype='RNASEQ',
         max_gene_missing_data=0.4,
         min_expr=0.1,
         dry_run=False,
         max_val=300,
     )

Example #4

0

Show file

def build_gont(args):
    """Build a GOnt dataset from the parsed command-line arguments.

    Reads ``refgen``, ``name``, ``force``, ``obo_filename``, ``filename``,
    ``description``, ``go_col`` and ``id_col`` from ``args``. If a 'GOnt'
    dataset called ``args.name`` is already listed, a warning is printed and
    it is deleted before the build.
    """
    refgen = co.RefGen(args.refgen)

    # Check to see if this dataset is already built
    previously_built = co.available_datasets('GOnt', args.name)
    if previously_built:
        print('Warning! This dataset has already been built.')
        co.del_dataset('GOnt', args.name, force=args.force)

    ontology = co.GOnt.from_obo(
        args.obo_filename,
        args.filename,
        args.name,
        args.description,
        refgen,
        go_col=args.go_col,
        id_col=args.id_col,
    )
    summary = ontology.summary()
    print("Done: {}".format(summary))
    print('Build Successful')

Example #5

0

Show file

def build_cob(args):
    """Build a COB co-expression network from parsed CLI arguments.

    Args:
        args: Namespace providing ``refgen``, ``filename``, ``sep``,
            ``allow_non_membership``, ``skip_quality_control``,
            ``skip_normalization``, ``name``, ``force``, ``description``,
            ``rawtype``, ``quantile``, ``max_gene_missing_data``,
            ``max_accession_missing_data``, ``min_single_sample_expr``,
            ``min_expr``, ``max_val``, ``dry_run`` and ``index_col``.

    Returns:
        None. Returns early (after printing a hint) when the table parses
        into a single column, which usually means ``args.sep`` is wrong.
    """
    # Build the refgen
    refgen = co.RefGen(args.refgen)

    # Parse the table once and reuse it for both the separator sanity check
    # and the non-membership gene ids (it used to be parsed twice).
    table = pd.read_table(args.filename, sep=args.sep)

    # Check that the sep is likely right.
    if len(table.columns) == 1:
        print(
            ("Detected only 1 column in {}, are you sure "
             "columns are separated by '{}'?").format(args.filename, args.sep))
        return None

    if args.allow_non_membership:
        # Work on a temporary copy so the stored refgen is not modified.
        refgen = refgen.copy('{}_tmp'.format(refgen.name), 'temp refgen')
        # Add non membership genes
        for gid in table.index:
            refgen.add_gene(Gene(None, None, id=gid))

    quality_control = not args.skip_quality_control
    normalize = not args.skip_normalization

    # Check to see if this dataset is already built
    if co.available_datasets('Expr', args.name):
        print('Warning! This dataset has already been built.')
        # NOTE(review): other builders in this code base pass
        # ``force=args.force``; confirm that ``safe=`` is the intended
        # keyword (and polarity) for this version of del_dataset.
        co.del_dataset('Expr', args.name, safe=args.force)

    # Basically just pass all the CLI arguments to the COB class method
    cob = co.COB.from_table(
        args.filename,
        args.name,
        args.description,
        refgen,
        # Optional arguments
        sep=args.sep,
        rawtype=args.rawtype,
        # Data Processing
        quality_control=quality_control,
        normalization=normalize,
        quantile=args.quantile,
        # Data processing parameters
        max_gene_missing_data=args.max_gene_missing_data,
        max_accession_missing_data=args.max_accession_missing_data,
        min_single_sample_expr=args.min_single_sample_expr,
        min_expr=args.min_expr,
        max_val=args.max_val,
        dry_run=args.dry_run,
        index_col=args.index_col)
    print("Build successful!")
    print(cob.summary())

Example #6

0

Show file

File: Tests.py Project: hawkaa/Camoco

 def ZmSAM(self):
     """Delete and rebuild the 'ZmSAM' expression network.

     The network is built from the B73 atlas SAM table under the configured
     test directory, against the 'Zm5bFGS' reference genome. Nothing is
     returned.
     """
     co.del_dataset('Expr', 'ZmSAM', safe=False)
     refgen = co.RefGen('Zm5bFGS')
     test_dir = cf.get('options', 'testdir')
     sam_table = os.path.join(
         test_dir, 'raw', 'Expression',
         'TranscriptomeProfiling_B73_Atlas_SAM_FGS_LiLin_20140316.txt',
     )
     # NOTE(review): the description below says "Root" although the dataset
     # is ZmSAM -- possibly copy-pasted; left unchanged, confirm with owner.
     co.COB.from_table(
         sam_table,
         'ZmSAM',
         'Maize Root Network',
         refgen,
         rawtype='RNASEQ',
         max_gene_missing_data=0.4,
         min_expr=0.1,
         dry_run=False,
         max_val=300,
     )

Example #7

0

Show file

def build_GWAS(args):
    """Build a GWAS dataset from a delimited file of trait/locus rows.

    Args:
        args: Namespace providing ``filename``, ``sep``, ``refgen``,
            ``skip_traits``, ``trait_col``, ``name``, ``description``,
            ``chrom_col`` and ``pos_col``.

    Raises:
        ValueError: If the file parses into a single column, which usually
            means ``args.sep`` does not match the file.
    """
    # ``DataFrame.from_csv`` was removed in pandas 1.0. ``read_csv`` with
    # ``index_col=0, parse_dates=True`` reproduces its defaults, so the frame
    # after ``reset_index()`` has the same columns as before.
    df = pd.read_csv(
        args.filename, sep=args.sep, index_col=0, parse_dates=True
    ).reset_index()
    if len(df.columns) == 1:
        raise ValueError("Only 1 column found, check --sep, see --help")
    print('Loading {}'.format(args.refgen))
    refgen = co.RefGen(args.refgen)
    # Filter out traits that are in args.skip_trait
    keep_row = [trait not in args.skip_traits for trait in df[args.trait_col]]
    df = df[keep_row]
    # Build
    gwas = co.GWAS.from_DataFrame(df,
                                  args.name,
                                  args.description,
                                  refgen,
                                  term_col=args.trait_col,
                                  chr_col=args.chrom_col,
                                  pos_col=args.pos_col)
    print("Build Successful:")
    print(gwas.summary())

Example #8

0

Show file

def build_gont(args):
    """Build a GOnt dataset from the parsed command-line arguments.

    Reads ``refgen``, ``name``, ``force``, ``obo_filename``, ``filename``,
    ``description``, ``go_col``, ``id_col`` and ``gene_term_header`` from
    ``args``. An already-listed 'GOnt' dataset of the same name is deleted
    (after a printed warning) before the build starts.
    """
    refgen = co.RefGen(args.refgen)

    # Check to see if this dataset is already built
    previously_built = available_datasets("GOnt", args.name)
    if previously_built:
        print("Warning! This dataset has already been built.")
        co.Tools.del_dataset("GOnt", args.name, force=args.force)

    ontology = co.GOnt.from_obo(
        args.obo_filename,
        args.filename,
        args.name,
        args.description,
        refgen,
        go_col=args.go_col,
        id_col=args.id_col,
        headers=args.gene_term_header,
    )
    summary = ontology.summary()
    print("Done: {}".format(summary))
    print("Build Successful")

Example #9

0

Show file

File: Tests.py Project: hawkaa/Camoco

 def AtSeed(self):
     """Build the 'AtSeed' microarray network from GEO family files.

     The listed GSE family files are loaded, combined with ``sum`` and
     filtered through the SeedKeep.tsv keepfile. Nothing is returned.
     """
     # 'GSE30223' was commented out of this list by the original author.
     accessions = ['GSE12404', 'GSE1051', 'GSE11852', 'GSE5634']
     families = []
     for accession in accessions:
         soft_path = "raw/GSE/{}_family.soft".format(accession)
         families.append(co.Family.from_file(soft_path))
     combined = sum(families)
     # The keepfile is assumed to exist already; the original carried a
     # commented-out ``to_keepfile`` call that would have generated it.
     matrix = combined.series_matrix(keepfile="raw/GSE/SeedKeep.tsv")
     refgen = co.RefGen('Tair10')
     co.COB.from_DataFrame(
         matrix,
         'AtSeed',
         'Arabidopsis Seed',
         refgen,
         rawtype='MICROARRAY',
     )

Example #10

0

Show file

def build_cob(args):
    """Build a COB co-expression network from parsed CLI arguments.

    On any ``Exception`` during the build the 'Expr' dataset named
    ``args.name`` is force-deleted and the exception is re-raised.

    Args:
        args: Namespace providing ``refgen``, ``filename``, ``sep``,
            ``non_interactive``, ``allow_non_membership``,
            ``skip_quality_control``, ``skip_normalization``,
            ``skip_quantile``, ``name``, ``force``, ``description``,
            ``rawtype``, ``max_gene_missing_data``,
            ``max_accession_missing_data``, ``min_single_sample_expr``,
            ``min_expr``, ``max_val``, ``dry_run``, ``zscore_cutoff`` and
            ``index_col``.

    Returns:
        None. Returns early (after printing a hint) when the table parses
        into a single column.

    Raises:
        SystemExit: With status 1 when the user declines to continue with
            fewer than 20 columns. This is not an ``Exception`` subclass, so
            it bypasses the rollback handler.
    """
    try:
        # Build the refgen
        refgen = co.RefGen(args.refgen)

        # Parse the table once; it used to be re-read for every check below.
        table = pd.read_table(args.filename, sep=args.sep)
        n_columns = len(table.columns)

        # Check that the sep is likely right.
        if n_columns == 1:
            print(("Detected only 1 column in {}, are you sure "
                   "columns are separated by '{}'?").format(
                       args.filename, args.sep))
            return None
        elif n_columns < 20 and not args.non_interactive:
            print((
                "Detected fewer than 20 accessions in the expression matrix. "
                "Calculating co-expression with this many datapoints is not advised"
            ))
            answer = input('are you sure you want to continue? [y/n]: ')
            if answer.upper() != 'Y':
                sys.exit(1)

        if args.allow_non_membership:
            # Work on a temporary copy so the stored refgen is not modified.
            refgen = refgen.copy('{}_tmp'.format(refgen.name), 'temp refgen')
            # Add non membership genes
            for gid in table.index:
                refgen.add_gene(Gene(None, None, id=gid))

        quality_control = not args.skip_quality_control
        normalize = not args.skip_normalization
        quantile = not args.skip_quantile

        # Check to see if this dataset is already built
        if available_datasets('Expr', args.name):
            print('Warning! This dataset has already been built.')
            co.Tools.del_dataset('Expr', args.name, force=args.force)

        # Basically just pass all the CLI arguments to the COB class method
        cob = co.COB.from_table(
            args.filename,
            args.name,
            args.description,
            refgen,
            # Optional arguments
            sep=args.sep,
            rawtype=args.rawtype,
            # Data Processing
            quality_control=quality_control,
            normalization=normalize,
            quantile=quantile,
            # Data processing parameters
            max_gene_missing_data=args.max_gene_missing_data,
            max_accession_missing_data=args.max_accession_missing_data,
            min_single_sample_expr=args.min_single_sample_expr,
            min_expr=args.min_expr,
            max_val=args.max_val,
            dry_run=args.dry_run,
            zscore_cutoff=args.zscore_cutoff,
            index_col=args.index_col)
        print(cob.summary())
    except Exception:
        print("Build failed. Rolling back: removing corrupted files...")
        co.Tools.del_dataset('Expr', args.name, force=True)
        # Bare raise keeps the original traceback intact.
        raise

Example #11

0

Show file

File: server.py Project: monprin/cob

def getNodes(genes,
             cob,
             term,
             primary=None,
             render=None,
             gwasData=pd.DataFrame(),
             nodeCutoff=0,
             windowSize=None,
             flankLimit=None,
             fdrCutoff=None,
             hpo=False):
    """Build the graph node records for ``genes``, keyed by gene id.

    Args:
        genes: Re-iterable collection of gene objects exposing ``id``,
            ``chrom``, ``start``, ``end`` and a mutable ``attr`` mapping.
        cob: Network object; ``cob.locality(genes)`` and
            ``cob._global('parent_refgen')`` are used.
        term: Value stored under every node's ``'term'`` key.
        primary: Optional container of query gene ids. When truthy, each
            node's ``'origin'`` becomes ``'query'`` or ``'neighbor'``.
        render: Optional container of gene ids that may be rendered; a falsy
            value allows every gene.
        gwasData: DataFrame with ``'gene'`` and ``'fdr'`` columns. It is only
            read here, never mutated.
        nodeCutoff: Minimum local degree a gene needs to be rendered.
        windowSize: Stringified into each node.
        flankLimit: Stringified into each node.
        fdrCutoff: Optional maximum FDR for a gene to be rendered.
        hpo: When true the ``'fdr'`` field is the literal ``'HPO'``.

    Returns:
        dict mapping gene id to ``{'group': 'nodes', 'data': {...}}``.

    Side effects:
        Genes lacking ``attr['parent_locus']`` get a placeholder written
        into their ``attr`` mapping.
    """
    # Cache the locality
    locality = cob.locality(genes)

    # Containers for the node info
    nodes = {}

    gene_ids = [gene.id for gene in genes]
    parent_refgen = cob._global('parent_refgen')

    # Look for aliases
    aliases = co.RefGen(parent_refgen).aliases(gene_ids)

    # Look for annotations
    if parent_refgen in func_data_db:
        func_data = func_data_db[parent_refgen].get_annotations(gene_ids)
    else:
        func_data = {}

    # Pre cache a list of the contained genes
    gwasDataGenes = set()
    if not gwasData.empty:
        gwasDataGenes = set(gwasData['gene'])

    for gene in genes:
        # Catch for translating the way camoco works to the way we need for
        # COB. ``.loc`` replaces ``.ix``, which was removed in pandas 1.0.
        try:
            degrees = locality.loc[gene.id]
            ldegree = degrees['local']
            gdegree = degrees['global']
            cutoff_degree = ldegree
        except KeyError:
            # Report the degrees as 'nan', but compare a number against the
            # cutoff: comparing the string 'nan' with ``nodeCutoff`` raises
            # TypeError on Python 3. Zero matches the fallback the older
            # getNodes implementation used for genes without locality.
            ldegree = gdegree = 'nan'
            cutoff_degree = 0

        # Catch for bug in camoco
        try:
            numInterv = str(gene.attr['num_intervening'])
            rankIntervening = str(gene.attr['intervening_rank'])
            numSiblings = str(gene.attr['num_siblings'])
        except KeyError:
            numInterv = '-'
            rankIntervening = '-'
            numSiblings = '-'

        # Pull any aliases from our database (each followed by a space)
        alias = ''
        if gene.id in aliases:
            alias = ''.join(a + ' ' for a in aliases[gene.id])

        # Fetch the FDR if we can
        fdr = np.nan
        if gene.id in gwasDataGenes:
            fdr = gwasData[gwasData['gene'] == gene.id]['fdr'].min()

        # Pull any annotations from our databases (each followed by a space)
        anote = ''
        if gene.id in func_data:
            anote = ''.join(a + ' ' for a in func_data[gene.id])

        # Fetch parent locus if we can
        if 'parent_locus' not in gene.attr:
            gene.attr['parent_locus'] = '[Unknown]{}:{}-{}'.format(
                gene.chrom, gene.start, gene.end)

        # Build the data object from our data
        node = {
            'group': 'nodes',
            'data': {
                'id': gene.id,
                'type': 'gene',
                'render': False,
                'term': term,
                # Angle brackets are swapped for square brackets.
                'snp': gene.attr['parent_locus'].replace('<', '[').replace(
                    '>', ']'),
                'alias': alias,
                'origin': 'N/A',
                'chrom': str(gene.chrom),
                'start': str(gene.start),
                'end': str(gene.end),
                'cur_ldegree': str(0),
                'ldegree': str(ldegree),
                'gdegree': str(gdegree),
                'fdr': 'HPO' if hpo else str(fdr),
                'windowSize': str(windowSize),
                'flankLimit': str(flankLimit),
                'numIntervening': numInterv,
                'rankIntervening': rankIntervening,
                'numSiblings': numSiblings,
                'annotations': anote,
            }
        }

        # Denote the query genes
        if primary:
            node['data']['origin'] = (
                'query' if gene.id in primary else 'neighbor')

        # Denote whether or not to render it
        if (cutoff_degree >= nodeCutoff
                and ((not fdrCutoff) or gwasData.empty or fdr <= fdrCutoff)
                and ((not render) or (gene.id in render))):
            node['data']['render'] = True

        # Save the node to the list
        nodes[gene.id] = node

    return nodes

Example #12

0

Show file

File: server.py Project: monprin/cob

    onts_info[net.name] = []
    for n, ont in onts.items():
        if ont.refgen.name == ref:
            onts_info[net.name].append({
                'name': ont.name,
                'refgen': ont.refgen.name,
                'desc': ont.description
            })
print('Availible GWASes: {}'.format(onts_info))

# Collect, per network, every gene id plus all of its known aliases
print('Fetching gene names for networks...')
network_genes = {}
for name, net in networks.items():
    ids = list(net._expr.index.values)
    als = co.RefGen(net._global('parent_refgen')).aliases(ids)
    for k, v in als.items():
        # Append this gene's aliases to the running id list.
        ids.extend(v)
    # De-duplicate; the resulting order is whatever the set yields.
    network_genes[name] = list(set(ids))
print('Found gene names')

# Load every available 'Overlap' dataset, keyed by its name
print('Finding GWAS Data...')
gwas_data_db = {}
for gwas in co.Tools.available_datasets('Overlap')['Name']:
    print("Loading " + "{}".format(gwas))
    gwas_data_db[gwas] = co.Overlap(gwas)

# Window sizes and flank limits per GWAS/COB combination are gathered next
print('Finding GWAS Metadata...')
gwas_meta_db = {}

Example #13

0

Show file

File: create_coex_network.py Project: hawkaa/csci5461_project

import camoco as co

# Read the reference genome
ZMFGS = co.RefGen("Zm5bFGS")

# Create the tissue co-expression network from the KSS split.
# NOTE(review): the description text says "KLS" while the file and dataset
# name say "KSS"; left unchanged, confirm which is intended.
ZmTissueNetwork = co.COB.from_table(
    'data/splits/KSS.txt',
    'KSS_T',  # Dataset name
    'Co-expression network for all KLS annotated samples',  # Short description
    ZMFGS,  # A RefGen instance
    rawtype='RNASEQ',  # Expression datatype, either 'RNASEQ' or 'MICROARRAY'
    # Renamed from ``max_gene_missing``: the other COB.from_table callers
    # spell this option ``max_gene_missing_data``, so the old spelling was
    # presumably not reaching the quality-control step -- verify against the
    # installed camoco version.
    max_gene_missing_data=0.4,  # See Expr._quality_control
    min_expr=0.1,  # See Expr._quality_control
    quantile=False,  # See Expr._quality_control
    dry_run=False,  # See Expr._quality_control
    sep=',',  # Table is comma separated
    max_val=300,  # See Expr._normalize
)

Example #14

0

Show file

def getNodes(genes, cob, term, primary=None, render=None,
    gwas_data=pd.DataFrame(), nodeCutoff=0):
    """Build the list of graph node records for ``genes``.

    Args:
        genes: Re-iterable collection of gene objects exposing ``id``,
            ``chrom``, ``start``, ``end`` and an ``attr`` mapping.
        cob: Network object; ``cob.locality(genes)`` and
            ``cob._global('parent_refgen')`` are used.
        term: Value stored under every node's ``'term'`` key.
        primary: Optional container of query gene ids. When truthy, each
            node's ``'origin'`` becomes ``'query'`` or ``'neighbor'``.
        render: Optional container of gene ids. When truthy every gene is
            returned and ``'render'`` is ``'x'`` or ``' '``; when falsy only
            genes meeting ``nodeCutoff`` are returned.
        gwas_data: DataFrame indexed by gene id with an ``'fdr'`` column. It
            is only read here, never mutated.
        nodeCutoff: Minimum local degree for a gene to be marked rendered.

    Returns:
        list of ``{'group': 'nodes', 'data': {...}}`` dicts.

    Raises:
        KeyError: If a gene lacks ``attr['parent_locus']``,
            ``attr['intervening_rank']`` or ``attr['num_siblings']``; only
            ``num_intervening`` has a fallback.
    """
    # Cache the locality
    locality = cob.locality(genes)

    # Containers for the node info
    nodes = []

    gene_ids = [gene.id for gene in genes]
    parent_refgen = cob._global('parent_refgen')

    # Look for aliases
    aliases = co.RefGen(parent_refgen).aliases(gene_ids)

    # Look for annotations
    if parent_refgen in func_data_db:
        func_data = func_data_db[parent_refgen][gene_ids]
    else:
        func_data = {}

    for gene in genes:
        # Catch for translating the way camoco works to the way we need for
        # COB. ``.loc`` replaces ``.ix``, which was removed in pandas 1.0.
        try:
            degrees = locality.loc[gene.id]
            local_degree = degrees['local']
            global_degree = degrees['global']
        except KeyError:
            local_degree = global_degree = 0

        # Catch for bug in camoco
        try:
            num_interv = str(gene.attr['num_intervening'])
        except KeyError:
            num_interv = 'NAN'

        # Pull any aliases from our database (each followed by a space)
        alias = ''
        if gene.id in aliases:
            alias = ''.join(a + ' ' for a in aliases[gene.id])

        # Fetch the FDR if we can
        fdr = np.nan
        if gene.id in gwas_data.index:
            fdr = gwas_data.loc[gene.id]['fdr']

        # Pull any annotations from our databases (each followed by a space)
        anote = ''
        if gene.id in func_data:
            anote = ''.join(a + ' ' for a in func_data[gene.id])

        # Build the data object from our data
        node = {'group':'nodes', 'data':{
            'id': gene.id,
            'type': 'gene',
            'render': 'x',
            'term': term,
            'snp': gene.attr['parent_locus'],
            'alias': alias,
            'origin': 'N/A',
            'chrom': str(gene.chrom),
            'start': str(gene.start),
            'end': str(gene.end),
            'cur_ldegree': str(0),
            'ldegree': str(local_degree),
            'gdegree': str(global_degree),
            'fdr': str(fdr),
            'num_intervening': num_interv,
            'rank_intervening': str(gene.attr['intervening_rank']),
            'num_siblings': str(gene.attr['num_siblings']),
            'annotations': anote,
        }}

        # Denote the query genes
        if primary:
            node['data']['origin'] = (
                'query' if gene.id in primary else 'neighbor')

        meets_cutoff = local_degree >= nodeCutoff
        if render:
            # With an explicit render list every gene is returned; the flag
            # records whether it should actually be drawn.
            drawn = (gene.id in render) and meets_cutoff
            node['data']['render'] = 'x' if drawn else ' '
            nodes.append(node)
        elif meets_cutoff:
            # Without a render list, genes below the cutoff are dropped.
            node['data']['render'] = 'x'
            nodes.append(node)

    return nodes

Example #15

0

Show file

def cob_health(args):
    """Write health/QC plots and tables for the COB network ``args.cob``.

    Every output file name is prefixed with ``args.out`` (defaulting to
    ``'<cob.name>_Health'``). Each step is skipped when its output file
    already exists, except the GO table, which is read back from the existing
    CSV so the GO plot can still be drawn.

    Fields read from ``args``: ``cob``, ``out``, ``refgen``, ``go``,
    ``max_terms``, ``min_term_size``, ``max_term_size``, ``two_tailed_GO``
    and ``num_bootstraps``.

    Steps, in order: score plots, expression plots, text summary, per-
    chromosome QC table (only when ``args.refgen`` is given), degree
    distribution plot, GO density/locality bootstrap table and plot (only
    when ``args.go`` is given).
    """
    log = coblog()
    log('\n'
        '-----------------------\n'
        '   Network Health      \n'
        '-----------------------\n')
    cob = co.COB(args.cob)
    # Default output prefix is derived from the network name.
    if args.out is None:
        args.out = '{}_Health'.format(cob.name)

    log('Plotting Scores ----------------------------------------------------')
    if not path.exists('{}_CoexPCC_raw.png'.format(args.out)):
        cob.plot_scores('{}_CoexPCC_raw.png'.format(args.out), pcc=True)
    else:
        log('Skipped Raw.')

    if not path.exists('{}_CoexScore_zscore.png'.format(args.out)):
        cob.plot_scores('{}_CoexScore_zscore.png'.format(args.out), pcc=False)
    else:
        log('Skipped Norm.')

    log('Plotting Expression ------------------------------------------------')
    if not path.exists('{}_Expr_raw.png'.format(args.out)):
        cob.plot('{}_Expr_raw.png'.format(args.out),
                 include_accession_labels=True,
                 raw=True,
                 cluster_method=None)
    else:
        log('Skipped raw.')
    if not path.exists('{}_Expr_norm.png'.format(args.out)):
        cob.plot('{}_Expr_norm.png'.format(args.out),
                 include_accession_labels=True,
                 raw=False,
                 cluster_method='leaf',
                 cluster_accessions=True)
    else:
        log('Skipped norm.')
    log('Plotting Cluster Expression-----------------------------------------')
    if not path.exists('{}_Expr_cluster.png'.format(args.out)):
        cob.plot('{}_Expr_cluster.png'.format(args.out),
                 include_accession_labels=True,
                 raw=False,
                 cluster_accessions=True,
                 avg_by_cluster=True)
    else:
        # NOTE(review): this branch skips the cluster plot but logs the same
        # text as the norm plot above -- possibly copy-pasted; confirm.
        log('Skipped norm.')
    log('Printing Summary ---------------------------------------------------')
    if not path.exists('{}.summary.txt'.format(args.out)):
        with open('{}.summary.txt'.format(args.out), 'w') as OUT:
            # Print out the network summary
            cob.summary(file=OUT)
    else:
        log('Skipped summary.')

    log('Printing QC Statistics ---------------------------------------------')
    if args.refgen is not None:
        if not path.exists('{}_qc_gene.txt'.format(args.out)):
            # Print out the breakdown of QC Values
            refgen = co.RefGen(args.refgen)
            gene_qc = cob._bcolz('qc_gene')
            # Keep only rows flagged by the pass_membership column.
            gene_qc = gene_qc[gene_qc.pass_membership]
            gene_qc['chrom'] = [
                'chr' + str(refgen[x].chrom) for x in gene_qc.index
            ]
            # Sum the QC columns per chromosome.
            gene_qc = gene_qc.groupby('chrom').agg(sum, axis=0)
            # Add totals at the bottom
            # NOTE(review): ``.ix`` and ``DataFrame.append`` are gone from
            # current pandas releases; this section presumably needs
            # ``.iloc`` / ``pd.concat`` there -- confirm the pinned version.
            totals = gene_qc.ix[:, slice(1, None)].apply(sum)
            totals.name = 'TOTAL'
            gene_qc = gene_qc.append(totals)
            gene_qc.to_csv('{}_qc_gene.txt'.format(args.out), sep='\t')
        else:
            log('Skipped QC summary.')

    #if not path.exists('{}_CisTrans.png'.format(args.out)):
    # Get trans edges

    log('Plotting Degree Distribution ---------------------------------------')
    if not path.exists('{}_DegreeDist.png'.format(args.out)):
        degree = cob.degree['Degree'].values
        #Using powerlaw makes run-time warning the first time you use it.
        #This is still an open issue on the creators github.
        #The creator recommends removing this warning as long as there is a fit.
        # Note: this changes NumPy's error handling and nothing in this
        # function restores the previous settings.
        np.seterr(divide='ignore', invalid='ignore')
        fit = powerlaw.Fit(degree, discrete=True, xmin=1)
        # get an axis
        ax = plt.subplot()
        # Calculate log ratios
        # (the three comparison results below are not used afterwards)
        t2p = fit.distribution_compare('truncated_power_law', 'power_law')
        t2e = fit.distribution_compare('truncated_power_law', 'exponential')
        p2e = fit.distribution_compare('power_law', 'exponential')
        # Plot!
        emp = fit.plot_ccdf(ax=ax,
                            color='r',
                            linewidth=3,
                            label='Empirical Data')
        pwr = fit.power_law.plot_ccdf(ax=ax,
                                      color='b',
                                      linestyle='--',
                                      label='Power law')
        tpw = fit.truncated_power_law.plot_ccdf(ax=ax,
                                                color='k',
                                                linestyle='--',
                                                label='Truncated Power')
        exp = fit.exponential.plot_ccdf(ax=ax,
                                        color='g',
                                        linestyle='--',
                                        label='Exponential')
        ####
        ax.set_ylabel("p(Degree≥x)")
        ax.set_xlabel("Degree Frequency")
        ax.legend(loc='best')
        plt.title('{} Degree Distribution'.format(cob.name))
        # Save Fig
        try:
            plt.savefig('{}_DegreeDist.png'.format(args.out))
        except FutureWarning as e:
            # This is a matplotlib bug
            # NOTE(review): this handler only fires if FutureWarning is
            # raised as an exception (warnings filter set to 'error').
            pass
    else:
        log('Skipping Degree Dist.')

    log('Plotting GO --------------------------------------------------------')
    if args.go is not None:
        if not path.exists('{}_GO.csv'.format(args.out)):
            go = co.GOnt(args.go)
            # Parallel per-term result columns; each tested term appends one
            # entry to every list.
            term_ids = []
            density_emp = []
            density_pvals = []
            locality_emp = []
            locality_pvals = []
            term_sizes = []
            term_desc = []
            terms_tested = 0
            if args.max_terms is not None:
                # presumably ``log`` formats the message with the extra
                # positional argument -- verify against coblog
                log('Limiting to {} GO Terms', args.max_terms)
                terms = go.rand(n=args.max_terms,
                                min_term_size=args.min_term_size,
                                max_term_size=args.max_term_size)
            else:
                terms = go.iter_terms(min_term_size=args.min_term_size,
                                      max_term_size=args.max_term_size)
            for term in terms:
                # Restrict the term to loci present in the network, then
                # re-check the size limits on the filtered term.
                term.loci = list(filter(lambda x: x in cob, term.loci))
                if len(term) < args.min_term_size or len(
                        term) > args.max_term_size:
                    continue
                #set density value for two tailed go so we only test it once
                density = cob.density(term.loci)
                #one tailed vs two tailed test
                if args.two_tailed_GO is False:
                    #run one tail for only positive values
                    if density > 0:
                        density_emp.append(density)
                    #skip negative density values
                    else:
                        continue
                #if two_tailed_go is not none
                else:
                    density_emp.append(density)
                term_ids.append(term.id)
                term_sizes.append(len(term))
                term_desc.append(str(term.desc))
                # ------ Density
                # Calculate PVals
                # Bootstrap: density of random gene sets of the same size.
                density_bs = np.array([
                    cob.density(cob.refgen.random_genes(n=len(term.loci))) \
                    for x in range(args.num_bootstraps)
                ])
                # p-value = fraction of bootstraps at least as extreme as the
                # observed value, in the direction of its sign.
                if density > 0:
                    pval = sum(density_bs >= density) / args.num_bootstraps
                else:
                    pval = sum(density_bs <= density) / args.num_bootstraps
                density_pvals.append(pval)

                # ------- Locality
                # Observed statistic is the mean of the ``resid`` column.
                locality = cob.locality(term.loci,
                                        include_regression=True).resid.mean()
                locality_emp.append(locality)
                # Calculate PVals
                locality_bs = np.array([
                    cob.locality(
                        cob.refgen.random_genes(n=len(term.loci)),
                        include_regression=True
                    ).resid.mean() \
                    for x in range(args.num_bootstraps)
                ])
                if locality > 0:
                    pval = sum(locality_bs >= locality) / args.num_bootstraps
                else:
                    pval = sum(locality_bs <= locality) / args.num_bootstraps
                locality_pvals.append(pval)
                # -------------
                terms_tested += 1
                if terms_tested % 100 == 0 and terms_tested > 0:
                    log('Processed {} terms'.format(terms_tested))
            go_enrichment = pd.DataFrame({
                'GOTerm': term_ids,
                'desc': term_desc,
                'size': term_sizes,
                'density': density_emp,
                'density_pval': density_pvals,
                'locality': locality_emp,
                'locality_pval': locality_pvals
            })
            # The sorted copy is written to disk; ``go_enrichment`` itself
            # stays in its original order.
            go_enrichment\
                .sort_values(by='density_pval',ascending=True)\
                .to_csv('{}_GO.csv'.format(args.out),index=False)
            if terms_tested == 0:
                # NOTE(review): execution continues into the plotting code
                # below with an empty frame -- confirm that is handled.
                log.warn('No GO terms met your min/max gene criteria!')
        else:
            # Reuse the table from a previous run.
            go_enrichment = pd.read_table('{}_GO.csv'.format(args.out),
                                          sep=',')

        if not path.exists('{}_GO.png'.format(args.out)):
            # Convert pvals to log10
            with np.errstate(divide='ignore'):
                # When no bootstraps are more extreme than the term, the minus log pval yields an infinite
                go_enrichment['density_pval'] = -1 * np.log10(
                    go_enrichment['density_pval'])
                go_enrichment['locality_pval'] = -1 * np.log10(
                    go_enrichment['locality_pval'])
                # Fix the infinites so they are plotted
                # (non-finite values are replaced by the largest finite
                # value plus one)
                max_density = np.max(go_enrichment['density_pval'][np.isfinite(
                    go_enrichment['density_pval'])])
                max_locality = np.max(
                    go_enrichment['locality_pval'][np.isfinite(
                        go_enrichment['locality_pval'])])
                go_enrichment.loc[
                    np.logical_not(np.isfinite(go_enrichment['density_pval'])),
                    'density_pval'] = max_density + 1
                go_enrichment.loc[np.logical_not(
                    np.isfinite(go_enrichment['locality_pval'])),
                                  'locality_pval'] = max_locality + 1
            plt.clf()
            # 3x2 grid: left column is density, right column is locality.
            figure, axes = plt.subplots(3, 2, figsize=(12, 12))
            # -----------
            # Density
            # ----------
            axes[0, 0].scatter(go_enrichment['density'],
                               go_enrichment['density_pval'],
                               alpha=0.05)
            axes[0, 0].set_xlabel('Empirical Density (Z-Score)')
            axes[0, 0].set_ylabel('Bootstraped -log10(p-value)')
            # Ratio of terms above 1.3 on the -log10 scale to 5% of all
            # terms; the red line is drawn at -log10(0.05).
            fold = sum(np.array(go_enrichment['density_pval']) > 1.3) / (
                0.05 * len(go_enrichment))
            axes[0, 0].axhline(y=-1 * np.log10(0.05), color='red')
            axes[0, 0].text(min(axes[0, 0].get_xlim()),
                            -1 * np.log10(0.05),
                            '{:.3g} Fold Enrichement'.format(fold),
                            color='red')
            axes[1, 0].scatter(go_enrichment['size'],
                               go_enrichment['density_pval'],
                               alpha=0.05)
            axes[1, 0].set_ylabel('Bootstrapped -log10(p-value)')
            axes[1, 0].set_xlabel('Term Size')
            axes[1, 0].axhline(y=-1 * np.log10(0.05), color='red')
            axes[2, 0].scatter(go_enrichment['size'],
                               go_enrichment['density'],
                               alpha=0.05)
            # Overlay the terms above the 1.3 threshold in red.
            axes[2,
                 0].scatter(go_enrichment.query('density_pval>1.3')['size'],
                            go_enrichment.query('density_pval>1.3')['density'],
                            alpha=0.05,
                            color='r')
            axes[2, 0].set_ylabel('Density')
            axes[2, 0].set_xlabel('Term Size')
            # ------------
            # Do Locality
            # ------------
            axes[0, 1].scatter(go_enrichment['locality'],
                               go_enrichment['locality_pval'],
                               alpha=0.05)
            axes[0, 1].set_xlabel('Empirical Locality (Residual)')
            axes[0, 1].set_ylabel('Bootstraped -log10(p-value)')
            fold = sum(np.array(go_enrichment['locality_pval']) > 1.3) / (
                0.05 * len(go_enrichment))
            axes[0, 1].axhline(y=-1 * np.log10(0.05), color='red')
            axes[0, 1].text(min(axes[0, 1].get_xlim()),
                            -1 * np.log10(0.05),
                            '{:.3g} Fold Enrichement'.format(fold),
                            color='red')
            axes[1, 1].scatter(go_enrichment['size'],
                               go_enrichment['locality_pval'],
                               alpha=0.05)
            axes[1, 1].set_xlabel('Term Size')
            axes[1, 1].set_ylabel('Bootstrapped -log10(p-value)')
            axes[1, 1].axhline(y=-1 * np.log10(0.05), color='red')
            axes[2, 1].scatter(go_enrichment['size'],
                               go_enrichment['locality'],
                               alpha=0.05)
            axes[2, 1].scatter(
                go_enrichment.query('locality_pval>1.3')['size'],
                go_enrichment.query('locality_pval>1.3')['locality'],
                alpha=0.05,
                color='r')
            # NOTE(review): this panel plots the locality column but is
            # labelled 'Density' -- possibly copy-pasted; confirm.
            axes[2, 1].set_ylabel('Density')
            axes[2, 1].set_xlabel('Term Size')
            # Save Figure
            plt.tight_layout()
            try:
                plt.savefig('{}_GO.png'.format(args.out))
            except FutureWarning as e:
                pass
        else:
            log('Skipping GO Volcano.')

Example #16

0

Show file

File: server.py Project: lisabang/cob

def getNodes(
    genes,
    cob,
    term,
    primary=None,
    render=None,
    gwasData=None,
    nodeCutoff=0,
    windowSize=None,
    flankLimit=None,
    fdrCutoff=None,
    hpo=False,
):
    """Build one node record per gene.

    Args:
        genes: Re-iterable collection of gene objects; each must expose
            ``id``, ``chrom``, ``start``, ``end`` and an ``attr`` mapping.
        cob: Network object. ``cob.locality(genes)`` supplies the
            ``"local"``/``"global"`` degrees and
            ``cob._global("parent_refgen")`` names the reference genome used
            for alias and annotation lookups.
        term: Stored verbatim under each node's ``"term"`` key.
        primary: Optional container of gene ids. When truthy, genes in it
            get origin ``"query"`` and all others ``"neighbor"``; otherwise
            origin stays ``"N/A"``.
        render: Optional container of gene ids. When truthy, only genes in
            it may be flagged for rendering.
        gwasData: Optional DataFrame with ``"gene"`` and ``"fdr"`` columns.
            ``None`` (the default) is treated as an empty frame.
        nodeCutoff: Minimum local degree for a node to be rendered.
        windowSize: Stored (stringified) on every node.
        flankLimit: Stored (stringified) on every node.
        fdrCutoff: When truthy and ``gwasData`` is non-empty, a node is only
            rendered if its FDR is at or below this value.
        hpo: When true, every node's ``"fdr"`` field is the string ``"HPO"``.

    Returns:
        dict mapping ``gene.id`` to a node dict with ``"group"`` and
        ``"data"`` keys.

    Side effects:
        Genes lacking ``attr["parent_locus"]`` get a placeholder value
        written into their ``attr`` mapping.
    """
    # Avoid a shared mutable default: build the empty frame per call
    if gwasData is None:
        gwasData = pd.DataFrame()

    # Cache the locality
    locality = cob.locality(genes)

    # Containers for the node info
    nodes = {}

    gene_ids = [gene.id for gene in genes]
    parent_refgen = cob._global("parent_refgen")

    # Look for aliases
    aliases = co.RefGen(parent_refgen).aliases(gene_ids)

    # Look for annotations
    if parent_refgen in func_data_db:
        func_data = func_data_db[parent_refgen].get_annotations(gene_ids)
    else:
        func_data = {}

    # Pre cache a list of the contained genes
    gwasDataGenes = set()
    if not gwasData.empty:
        gwasDataGenes = set(gwasData["gene"])

    for gene in genes:
        # Genes absent from the locality table get NaN degrees. NaN still
        # stringifies to "nan" for the node payload, and (unlike a "nan"
        # string) can be compared against nodeCutoff without a TypeError;
        # the comparison is simply False, so such nodes are not rendered.
        try:
            ldegree = locality.loc[gene.id]["local"]
            gdegree = locality.loc[gene.id]["global"]
        except KeyError:
            ldegree = gdegree = np.nan

        # Candidate-gene attributes are not always present on the gene
        try:
            numInterv = str(gene.attr["num_intervening"])
            rankIntervening = str(gene.attr["intervening_rank"])
            numSiblings = str(gene.attr["num_siblings"])
        except KeyError:
            numInterv = "-"
            rankIntervening = "-"
            numSiblings = "-"

        # Pull any aliases from our database (each followed by a space)
        alias = ""
        if gene.id in aliases:
            alias = "".join(a + " " for a in aliases[gene.id])

        # Fetch the FDR if we can
        fdr = np.nan
        if gene.id in gwasDataGenes:
            fdr = gwasData[gwasData["gene"] == gene.id]["fdr"].min()

        # Pull any annotations from our databases (each followed by a space)
        anote = ""
        if gene.id in func_data:
            anote = "".join(a + " " for a in func_data[gene.id])

        # Fetch parent locus if we can
        if "parent_locus" not in gene.attr:
            gene.attr["parent_locus"] = "[Unknown]{}:{}-{}".format(
                gene.chrom, gene.start, gene.end
            )

        # Build the data object from our data
        node = {
            "group": "nodes",
            "data": {
                "id": gene.id,
                "type": "gene",
                "render": False,
                "term": term,
                "snp": gene.attr["parent_locus"].replace("<", "[").replace(">", "]"),
                "alias": alias,
                "origin": "N/A",
                "chrom": str(gene.chrom),
                "start": str(gene.start),
                "end": str(gene.end),
                "cur_ldegree": str(0),
                "ldegree": str(ldegree),
                "gdegree": str(gdegree),
                "fdr": "HPO" if hpo else str(fdr),
                "windowSize": str(windowSize),
                "flankLimit": str(flankLimit),
                "numIntervening": numInterv,
                "rankIntervening": rankIntervening,
                "numSiblings": numSiblings,
                "annotations": anote,
            },
        }

        # Denote the query genes
        if primary:
            if gene.id in primary:
                node["data"]["origin"] = "query"
            else:
                node["data"]["origin"] = "neighbor"

        # Denote whether or not to render it
        if ldegree >= nodeCutoff:
            if (not fdrCutoff) or gwasData.empty or fdr <= fdrCutoff:
                if (not render) or (gene.id in render):
                    node["data"]["render"] = True

        # Save the node to the list
        nodes[gene.id] = node

    return nodes

Example #17

0

Show file

File: Tests.py Project: hawkaa/Camoco

 def AtLeaf(self):
     """Build the 'AtLeaf' network from the leaf GSE family files and check its type."""
     # GSE26199 is intentionally left out of this list
     accessions = (
         'GSE14578', 'GSE5630', 'GSE13739',
         'GSE5686', 'GSE5615', 'GSE5620', 'GSE5628', 'GSE5624', 'GSE5626',
         'GSE5621', 'GSE5622', 'GSE5623', 'GSE5625', 'GSE5688',
     )
     families = [
         co.Family.from_file("raw/GSE/{}_family.soft".format(accession))
         for accession in accessions
     ]
     # Families are merged by summing them
     combined = sum(families)
     # The keep file was originally produced with:
     #   combined.to_keepfile("LeafKeep.tsv", keep_hint="lea")
     expression = combined.series_matrix(keepfile="raw/GSE/LeafKeep.tsv")
     network = co.COB.from_DataFrame(
         expression,
         'AtLeaf',
         'Arabidopsis Leaf',
         co.RefGen('Tair10'),
         rawtype='MICROARRAY',
     )
     self.assertIsInstance(network, co.COB)

Example #18

0

Show file

File: Tests.py Project: hawkaa/Camoco

 def BuildIonome(self):
     """Build the 'ZmIonome' ontology from the GWAS SNP table and check its type.

     The CSV is read from ``<testdir>/raw`` (``testdir`` comes from the
     ``options`` section of the config) and mapped onto the 'Zm5bFGS'
     reference genome using the ``el``, ``chr`` and ``pos`` columns.
     """
     csv = os.path.join(
         cf.get('options', 'testdir'),
         'raw',
         'sigGWASsnpsCombinedIterations.longhorn.allLoc.csv',
     )
     ZM = co.RefGen('Zm5bFGS')
     # DataFrame.from_csv was removed from pandas; read_csv with no index
     # column reads the same table.
     df = pd.read_csv(csv, index_col=None)
     IONS = co.Ontology.from_DataFrame(
         df, 'ZmIonome', 'Maize Ionome', ZM,
         term_col='el', chr_col='chr', pos_col='pos',
     )
     self.assertIsInstance(IONS, co.Ontology)

Example #19

0

Show file

File: snp2gene.py Project: jonahcullen/Camoco

def snp2gene(args):
    '''
        Perform SNP (locus) to candidate gene mapping

        Reads the following attributes from ``args``: ``out``, ``force``,
        ``refgen``, ``gwas``, ``terms``, ``candidate_window_size``,
        ``candidate_flank_limit``, ``snp2gene``, ``strongest_attr``,
        ``strongest_higher``, ``include_parent_attrs`` and ``gene_info``.

        For every term / window size / flank limit combination the
        candidate genes are collected into one table, optionally joined
        with the gene info files, and written tab-separated to ``args.out``.
        A short summary is printed to stdout.

        Returns None. Returns early (without mapping) when the output
        already exists and ``args.force`` is not set.

        Raises ValueError when ``args.refgen`` names neither an 'Expr' nor a
        'RefGen' dataset, or when ``args.snp2gene`` selects neither the
        'effective' nor the 'strongest' method.
    '''

    if args.out != sys.stdout:
        # Create any non-existent directories
        out_dir = os.path.dirname(args.out)
        if out_dir != '':
            os.makedirs(out_dir, exist_ok=True)
        if os.path.exists(args.out) and not args.force:
            print(
                "Output for {} exists! Skipping!".format(
                    args.out
                ),file=sys.stderr
            )
            return None

    # Name of the COB the refgen came from (None when it is a plain RefGen)
    from_cob = None
    # Create the refgen (option to create it from a COB)
    if co.Tools.available_datasets('Expr',args.refgen):
        refgen = co.COB(args.refgen).refgen
        from_cob = args.refgen
    elif co.Tools.available_datasets('RefGen',args.refgen):
        refgen = co.RefGen(args.refgen)
    else:
        # Fail here rather than with a NameError deep inside the loop
        raise ValueError(
            "{} is not an available Expr or RefGen dataset".format(args.refgen)
        )

    # Validate the mapping method once, up front
    if 'effective' in args.snp2gene:
        use_effective = True
    elif 'strongest' in args.snp2gene:
        use_effective = False
    else:
        raise ValueError(
            "Unknown snp2gene method: {}".format(args.snp2gene)
        )

    # Create the GWAS object
    ont = co.GWAS(args.gwas)

    if 'all' in args.terms:
        terms = ont.iter_terms()
    else:
        terms = [ont[term] for term in args.terms]

    # Collect one frame per combination and concatenate once at the end
    frames = []
    for term in terms:
        for window_size in args.candidate_window_size:
            for flank_limit in args.candidate_flank_limit:
                if use_effective:
                    # Map to effective
                    effective_loci = term.effective_loci(
                        window_size=window_size
                    )
                else:
                    # NOTE(review): `lowest` is fed from `strongest_higher`;
                    # the naming looks inverted -- confirm against the CLI.
                    effective_loci = term.strongest_loci(
                        window_size=window_size,
                        attr=args.strongest_attr,
                        lowest=args.strongest_higher
                    )
                genes = pd.DataFrame([ x.as_dict() for x in
                    refgen.candidate_genes(
                        effective_loci,
                        flank_limit=flank_limit,
                        include_parent_locus=True,
                        include_num_siblings=True,
                        include_num_intervening=True,
                        include_rank_intervening=True,
                        include_SNP_distance=True,
                        include_parent_attrs=args.include_parent_attrs,
                        attrs={'Term':term.id},
                    )
                ])
                genes['FlankLimit'] = flank_limit
                genes['WindowSize'] = window_size
                genes['RefGen'] = refgen.name
                if from_cob is not None:
                    genes['COB'] = from_cob
                frames.append(genes)
    data = pd.concat(frames) if frames else pd.DataFrame()

    # Add data from gene info files
    original_number_genes = len(data)
    for info_file in args.gene_info:
        log('Adding info for {}',info_file)
        # Assume the file is tab separated; fall back to comma separated
        # when that yields a single column
        info = pd.read_csv(info_file,sep='\t')
        if len(info.columns) == 1:
            info = pd.read_csv(info_file,sep=',')
        # try to match as many columns as possible
        matching_columns = set(data.columns).intersection(info.columns)
        log("Joining SNP2Gene mappings with info file on: {}",','.join(matching_columns))
        data = pd.merge(data,info,how='left')
        if len(data) != original_number_genes:
            log.warn(
                'There were multiple info rows for some genes. '
                'Beware of potential duplicate candidate gene entries! '
            )

    # Generate the output file
    data.to_csv(args.out,index=False,sep='\t')

    log("Summary stats")
    print('-'*100)
    if data.empty:
        # Nothing was mapped: the summary columns below would not exist
        print("Mapped 0 SNPs to 0 genes")
        return None
    print("Mapped {} SNPs to {} genes".format(len(data.parent_locus.unique()),len(data.ID.unique())))
    print("Number of candidate genes per term:")
    print(data.groupby('Term').size())

Example #20

0

Show file

#!/usr/bin/python3

#packages used to build the three networks
import camoco as co
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt


# Brachypodium reference genome: build the 'Bd21' dataset from the GFF
# first, then load it by name (loading before building only works when the
# dataset already exists from a previous run).
co.RefGen.from_gff("BdistachyonBd21-3v1.1.gene.gff3" ,"Bd21", "BdistachyonBd21-3v1.1", "BdistachyonBd21-3v1.1", "Brachipodium Ref")
Bd21 = co.RefGen("Bd21")

# Brachypodium GO ontology, built against the Bd21 reference
co.GOnt.from_obo('go.obo',
    'Brach.gene.GO.txt',
    'Bd21GO',
    'Brachi Gene Ontology',
     Bd21)

#Build Brachy network
co.COB.from_table('Bd21_Meesh_V3.csv',
'Bd21_Treated',
'Bd21_Treated Samples with mock on day two',
Bd21,
rawtype='RNASEQ',
max_gene_missing_data=0.4,
max_accession_missing_data=0.4,
min_single_sample_expr=1,
min_expr=0.001,

Example #21

0

Show file

File: Tests.py Project: hawkaa/Camoco

 def AtGen(self):
     """Build the 'AtGen' network from the general GSE family files."""
     accessions = (
         'GSE18975', 'GSE39384', 'GSE19271', 'GSE5632', 'GSE39385',
         'GSE5630', 'GSE15617', 'GSE5617', 'GSE5686', 'GSE2473',
         'GSE5633', 'GSE5620', 'GSE5628', 'GSE5624', 'GSE5626',
         'GSE5621', 'GSE5622', 'GSE5623', 'GSE5625', 'GSE5688',
     )
     families = [
         co.Family.from_file("raw/GSE/{}_family.soft".format(accession))
         for accession in accessions
     ]
     # Families are merged by summing them
     combined = sum(families)
     # The keep file was originally produced with:
     #   combined.to_keepfile("GenKeep.tsv")
     expression = combined.series_matrix(keepfile="raw/GSE/GenKeep.tsv")
     AtGen = co.COB.from_DataFrame(
         expression,
         'AtGen',
         'Arab General',
         co.RefGen('Tair10'),
         rawtype='MICROARRAY',
     )

Example #22

0

Show file

File: Tests.py Project: hawkaa/Camoco

 def AtRoot(self):
     """Build the 'AtRoot' network from the root GSE family files."""
     accessions = (
         'GSE14578', 'GSE46205', 'GSE7631', 'GSE10576', 'GSE42007',
         'GSE34130', 'GSE21611', 'GSE22966', 'GSE7641', 'GSE5620',
         'GSE8934', 'GSE5628', 'GSE30095', 'GSE30097', 'GSE5624',
         'GSE5626', 'GSE5749', 'GSE5621', 'GSE5622', 'GSE5623',
         'GSE5625', 'GSE5688',
     )
     families = [
         co.Family.from_file("raw/GSE/{}_family.soft".format(accession))
         for accession in accessions
     ]
     # Families are merged by summing them
     combined = sum(families)
     # The keep file was originally produced with:
     #   combined.to_keepfile("RootKeep.tsv", keep_hint='root')
     expression = combined.series_matrix(keepfile="raw/GSE/RootKeep.tsv")
     AtRoot = co.COB.from_DataFrame(
         expression,
         'AtRoot',
         'Arab Root',
         co.RefGen('Tair10'),
         rawtype='MICROARRAY',
     )

Example #23

0

Show file

def _plot_go_metric(
    panels,
    go_enrichment,
    metric,
    xlabel,
    ylabel,
    title,
    alpha,
    point_alpha,
    label_offset,
):
    """Draw the three GO panels for one metric into a column of axes.

    ``panels`` holds three axes, top to bottom: metric vs. -log10(p-value),
    term size vs. -log10(p-value), and term size vs. metric (significant
    terms over-plotted in red). ``go_enrichment[metric + "_pval"]`` must
    already be -log10 transformed and ``go_enrichment`` must be non-empty.
    """
    pval_col = "{}_pval".format(metric)
    # Every threshold in this column is derived from the same alpha
    log_alpha = -1 * np.log10(alpha)
    significant = go_enrichment[go_enrichment[pval_col] > log_alpha]
    # Observed significant terms relative to the number expected by chance
    fold = len(significant) / (alpha * len(go_enrichment))
    volcano, pval_by_size, metric_by_size = panels

    volcano.scatter(
        go_enrichment[metric],
        go_enrichment[pval_col],
        alpha=point_alpha,
        color="blue",
    )
    volcano.set_xlabel(xlabel)
    volcano.set_ylabel("Bootstraped -log10(p-value)")
    volcano.axhline(y=log_alpha, color="red")
    volcano.text(
        min(volcano.get_xlim()),
        log_alpha + label_offset,
        "{:.3g} Fold Enrichement".format(fold),
        color="red",
    )
    volcano.set_title(title)

    # P-value by term size
    pval_by_size.scatter(
        go_enrichment["size"],
        go_enrichment[pval_col],
        alpha=point_alpha,
        color="blue",
    )
    pval_by_size.set_ylabel("Bootstrapped -log10(p-value)")
    pval_by_size.set_xlabel("Term Size")
    pval_by_size.axhline(y=log_alpha, color="red")

    # Raw metric by term size, significant terms highlighted
    metric_by_size.scatter(
        go_enrichment["size"],
        go_enrichment[metric],
        alpha=point_alpha,
        color="blue",
    )
    metric_by_size.scatter(
        significant["size"],
        significant[metric],
        alpha=point_alpha,
        color="r",
    )
    metric_by_size.set_ylabel(ylabel)
    metric_by_size.set_xlabel("Term Size")


def cob_health(args):
    """Write summary, QC and diagnostic plots for a co-expression network.

    Reads these attributes from ``args``: ``cob``, ``out``,
    ``edge_zscore_cutoff``, ``refgen``, ``go``, ``two_tailed``,
    ``max_terms``, ``min_term_size``, ``max_term_size`` and
    ``num_bootstraps``. ``args.out`` is filled in with
    ``"<cob name>_Health"`` when it is None.

    Each report is skipped when its output file already exists. Files
    written (all prefixed with ``args.out``): ``.summary.txt``,
    ``_CoexPCC_raw.png``, ``_CoexScore_zscore.png``, ``_Expr_norm.png``,
    ``_qc_gene.txt`` (only when ``args.refgen`` is given),
    ``_DegreeDist.png``, and ``_GO.csv`` / ``_GO.png`` (only when
    ``args.go`` is given).
    """
    log = coblog()
    log(
        f"\n"
        f"-----------------------------\n"
        f"   Network Health:{args.cob} \n"
        f"-----------------------------\n"
    )
    log(f"\nCreating reports in {os.getcwd()}\n\n")

    cob = co.COB(args.cob)
    if args.out is None:
        args.out = "{}_Health".format(cob.name)
    log(f"Output prefix: {args.out}")

    if args.edge_zscore_cutoff is not None:
        log("Changing Z-Score cutoff to {}", args.edge_zscore_cutoff)
        cob.set_sig_edge_zscore(args.edge_zscore_cutoff)

    log("Printing Summary ---------------------------------------------------")
    if not path.exists("{}.summary.txt".format(args.out)):
        with open("{}.summary.txt".format(args.out), "w") as OUT:
            # Print out the network summary
            cob.summary(file=OUT)
    else:
        log("Skipped summary.")

    log("Plotting Scores ----------------------------------------------------")
    if not path.exists("{}_CoexPCC_raw.png".format(args.out)):
        cob.plot_scores("{}_CoexPCC_raw.png".format(args.out), pcc=True)
    else:
        log("Skipped Raw.")

    if not path.exists("{}_CoexScore_zscore.png".format(args.out)):
        cob.plot_scores("{}_CoexScore_zscore.png".format(args.out), pcc=False)
    else:
        log("Skipped Norm.")

    log("Plotting Expression ------------------------------------------------")
    if not path.exists("{}_Expr_norm.png".format(args.out)):
        cob.plot_heatmap(
            "{}_Expr_norm.png".format(args.out),
            include_accession_labels=True,
            raw=False,
            cluster_method="ward",
            cluster_accessions=True,
        )
    else:
        log("Skipped norm.")

    log("Printing QC Statistics ---------------------------------------------")
    if args.refgen is not None:
        if not path.exists("{}_qc_gene.txt".format(args.out)):
            # Print out the breakdown of QC Values
            refgen = co.RefGen(args.refgen)
            gene_qc = cob._bcolz("qc_gene")
            # Copy so the new column is not written into a filtered view
            gene_qc = gene_qc[gene_qc.pass_membership].copy()
            gene_qc["chrom"] = ["chr" + str(refgen[x].chrom) for x in gene_qc.index]
            gene_qc = gene_qc.groupby("chrom").sum()
            # Add totals at the bottom (every column except the first)
            totals = gene_qc.iloc[:, 1:].apply(sum)
            totals.name = "TOTAL"
            total_row = totals.to_frame().T
            # Keep the index label so the CSV header is unchanged
            total_row.index.name = gene_qc.index.name
            gene_qc = pd.concat([gene_qc, total_row])
            gene_qc.to_csv("{}_qc_gene.txt".format(args.out), sep="\t")
        else:
            log("Skipped QC summary.")

    log("Plotting Degree Distribution ---------------------------------------")
    if not path.exists("{}_DegreeDist.png".format(args.out)):
        degree = cob.degree["Degree"].values
        # Using powerlaw makes run-time warning the first time you use it.
        # This is still an open issue on the creators github.
        # The creator recommends removing this warning as long as there is a fit.
        # The suppression is scoped to the fit so numpy's global error
        # state is restored afterwards.
        with np.errstate(divide="ignore", invalid="ignore"):
            fit = powerlaw.Fit(degree, discrete=True, xmin=1)
            # get an axis
            ax = plt.subplot()
            # Calculate log ratios
            # NOTE(review): these ratios are not used below; kept so the
            # fit is exercised exactly as before -- confirm they can go.
            t2p = fit.distribution_compare("truncated_power_law", "power_law")
            t2e = fit.distribution_compare("truncated_power_law", "exponential")
            p2e = fit.distribution_compare("power_law", "exponential")
            # Plot!
            fit.plot_ccdf(ax=ax, color="r", linewidth=3, label="Empirical Data")
            fit.power_law.plot_ccdf(
                ax=ax, linewidth=2, color="b", linestyle=":", label="Power law"
            )
            fit.truncated_power_law.plot_ccdf(
                ax=ax, linewidth=2, color="k", linestyle="-.", label="Truncated Power"
            )
            fit.exponential.plot_ccdf(
                ax=ax, linewidth=2, color="g", linestyle="--", label="Exponential"
            )
        ####
        ax.set_ylabel("p(Degree≥x)")
        ax.set_xlabel("Degree Frequency")
        ax.legend(loc="best")
        plt.title("{} Degree Distribution".format(cob.name))
        # Save Fig
        try:
            plt.savefig("{}_DegreeDist.png".format(args.out))
        except FutureWarning:
            # This is a matplotlib bug
            pass
    else:
        log("Skipping Degree Dist.")

    if args.go is not None:
        log("Plotting GO --------------------------------------------------------")
        # Set the alpha based on the tails
        if args.two_tailed:
            alpha = 0.05 / 2
        else:
            alpha = 0.05
        log_alpha = -1 * np.log10(alpha)
        # Generate the GO Table
        if not path.exists("{}_GO.csv".format(args.out)):
            go = co.GOnt(args.go)
            term_ids = []
            density_emp = []
            density_pvals = []
            locality_emp = []
            locality_pvals = []
            term_sizes = []
            term_desc = []
            terms_tested = 0
            # max_terms limits the number of GO terms tested (sub-sampling)
            if args.max_terms is not None:
                log("Limiting to {} GO Terms", args.max_terms)
                terms = go.rand(
                    n=args.max_terms,
                    min_term_size=args.min_term_size,
                    max_term_size=args.max_term_size,
                )
            else:
                # Else do the whole set
                terms = go.iter_terms(
                    min_term_size=args.min_term_size, max_term_size=args.max_term_size
                )
            for term in terms:
                # Some terms will lose genes that are not in networks
                term.loci = [locus for locus in term.loci if locus in cob]
                # Skip terms that are not an adequate size
                if len(term) < args.min_term_size or len(term) > args.max_term_size:
                    continue
                # ------ Density
                density = cob.density(term.loci)
                density_emp.append(density)
                term_ids.append(term.id)
                term_sizes.append(len(term))
                term_desc.append(str(term.desc))
                # Bootstrap with random gene sets of the same size
                density_bs = np.array(
                    [
                        cob.density(cob.refgen.random_genes(n=len(term.loci)))
                        for _ in range(args.num_bootstraps)
                    ]
                )
                # Test in the direction of the observed value
                if density > 0:
                    pval = sum(density_bs >= density) / args.num_bootstraps
                else:
                    pval = sum(density_bs <= density) / args.num_bootstraps
                density_pvals.append(pval)

                # ------- Locality
                locality = cob.locality(term.loci, include_regression=True).resid.mean()
                locality_emp.append(locality)
                # Bootstrap with random gene sets of the same size
                locality_bs = np.array(
                    [
                        cob.locality(
                            cob.refgen.random_genes(n=len(term.loci)),
                            include_regression=True,
                        ).resid.mean()
                        for _ in range(args.num_bootstraps)
                    ]
                )
                if locality > 0:
                    pval = sum(locality_bs >= locality) / args.num_bootstraps
                else:
                    pval = sum(locality_bs <= locality) / args.num_bootstraps
                locality_pvals.append(pval)
                # -------------
                terms_tested += 1
                if terms_tested % 100 == 0:
                    log("Processed {} terms".format(terms_tested))
            go_enrichment = pd.DataFrame(
                {
                    "GOTerm": term_ids,
                    "desc": term_desc,
                    "size": term_sizes,
                    "density": density_emp,
                    "density_pval": density_pvals,
                    "locality": locality_emp,
                    "locality_pval": locality_pvals,
                }
            )
            # Calculate significance
            go_enrichment["density_significant"] = go_enrichment.density_pval < alpha
            go_enrichment["locality_significant"] = go_enrichment.locality_pval < alpha
            # Calculate bonferroni; guard the divisor so an empty table
            # reaches the warning below instead of dividing by zero
            bonferroni_alpha = alpha / max(len(go_enrichment), 1)
            go_enrichment["density_bonferroni"] = (
                go_enrichment.density_pval < bonferroni_alpha
            )
            go_enrichment["locality_bonferroni"] = (
                go_enrichment.locality_pval < bonferroni_alpha
            )
            # Store the GO results in a CSV
            go_enrichment.sort_values(by="density_pval", ascending=True).to_csv(
                "{}_GO.csv".format(args.out), index=False
            )
            if terms_tested == 0:
                log.warn("No GO terms met your min/max gene criteria!")
        else:
            go_enrichment = pd.read_csv("{}_GO.csv".format(args.out), sep=",")

        if path.exists("{}_GO.png".format(args.out)):
            log("Skipping GO Volcano.")
        elif go_enrichment.empty:
            # Fold enrichment is undefined without any terms
            log("Skipping GO Volcano: no GO terms to plot.")
        else:
            # Convert pvals to -log10
            with np.errstate(divide="ignore"):
                for pval_col in ("density_pval", "locality_pval"):
                    # When no bootstraps are more extreme than the term,
                    # the minus log pval yields an infinite
                    neg_log = -1 * np.log10(go_enrichment[pval_col])
                    finite = np.isfinite(neg_log)
                    # Fix the infinites so they are plotted: one unit above
                    # the largest finite value (or above the threshold when
                    # there is no finite value at all)
                    if finite.any():
                        cap = neg_log[finite].max() + 1
                    else:
                        cap = log_alpha + 1
                    go_enrichment[pval_col] = neg_log.where(finite, cap)
            plt.clf()
            # Calculate the transparency based on the number of terms
            if len(go_enrichment) > 20:
                transparency_alpha = 0.05
            else:
                transparency_alpha = 1

            # --------------------------------------------------------------------
            # Start Plotting
            figure, axes = plt.subplots(3, 2, figsize=(12, 12))
            _plot_go_metric(
                axes[:, 0],
                go_enrichment,
                "density",
                "Empirical Density (Z-Score)",
                "Density",
                "Density Health",
                alpha,
                transparency_alpha,
                0.1,
            )
            _plot_go_metric(
                axes[:, 1],
                go_enrichment,
                "locality",
                "Empirical Locality (Residual)",
                "Locality",
                "Locality Health",
                alpha,
                transparency_alpha,
                0,
            )
            # Save Figure
            plt.tight_layout()
            try:
                plt.savefig("{}_GO.png".format(args.out))
            except FutureWarning:
                pass
            finally:
                # Release the figure created above
                plt.close(figure)