Example 1
def available_datasets(type=None,*args):
    if((type == 'GWAS') or (type == 'Ontology')):
        return jsonify(gwas_sets)
    elif((type == 'Expr') or (type == 'Network')):
        return jsonify(network_list)
    elif(type == 'All'):
        return str(co.available_datasets())
    else:
        return jsonify({"data" : list(co.available_datasets(type)[
                    ['Name','Description']].itertuples(index=False))})
Example 2
def available_datasets(type=None,*args):
    # Find the datasets
    if(type == None):
        datasets = co.available_datasets()
    else:
        datasets = co.available_datasets(type)
    
    # Return the results in a table friendly format
    return jsonify({"data" : list(datasets[
                ['Name','Description']].itertuples(index=False))})
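A minimal sketch of wiring the view above into a Flask route (the Flask app object, the route path, and the camoco import alias are assumptions based on these snippets, not taken verbatim from the source project):

from flask import Flask, jsonify
import camoco as co  # import alias assumed from the examples above

app = Flask(__name__)  # hypothetical app object

@app.route('/available_datasets/', defaults={'type': None})
@app.route('/available_datasets/<type>')
def available_datasets(type=None, *args):
    # Same logic as Example 2: list every dataset, or only those of one type,
    # and return the Name/Description pairs in a table-friendly JSON form.
    if type is None:
        datasets = co.available_datasets()
    else:
        datasets = co.available_datasets(type)
    return jsonify({"data": list(
        datasets[['Name', 'Description']].itertuples(index=False))})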
Example 3
def AtRoot(AtTair10):
    if cf.test.force.COB:
        co.del_dataset('Expr', 'AtRoot', force=True)
    if not co.available_datasets('Expr', 'AtRoot'):
        Root = [
            'GSE14578', 'GSE46205', 'GSE7631', 'GSE10576', 'GSE42007',
            'GSE34130', 'GSE21611', 'GSE22966', 'GSE7641', 'GSE5620',
            'GSE8934', 'GSE5628', 'GSE30095', 'GSE30097', 'GSE5624', 'GSE5626',
            'GSE5749', 'GSE5621', 'GSE5622', 'GSE5623', 'GSE5625', 'GSE5688'
        ]
        RootFam = sum([
            co.Family.from_file(
                os.path.join(cf.options.testdir, 'raw', 'GSE',
                             '{}_family.soft.gz'.format(x))) for x in Root
        ])
        #RootFam.to_keepfile("RootKeep.tsv", keep_hint='root')
        return co.COB.from_DataFrame(
            RootFam.series_matrix(keepfile=os.path.join(
                cf.options.testdir, 'raw', 'GSE', 'RootKeep.tsv')),
            'AtRoot',
            'Arab Root',
            AtTair10,
            rawtype='MICROARRAY',
            quantile=True)
    else:
        return co.COB('AtRoot')
Example 4
def AtLeafHydroIonome(AtTair10):
    if cf.test.force.Ontology:
        co.del_dataset('GWAS', 'AtLeafHydroIonome', force=True)
    if not co.available_datasets('GWAS', 'AtLeafHydroIonome'):
        # glob glob is god
        csvs = glob.glob(
            os.path.join(cf.options.testdir, 'raw', 'GWAS', 'AtIonome',
                         'AtLeafHydroIonome', '*.csv.gz'))
        # Read in each table individually then concat for GIANT table
        df = pd.concat([pd.read_table(x, sep=' ') for x in csvs])
        df = df.loc[df.pval <= cf.options.alpha, :]
        # Kill groups of SNPs that have identical (beta,pval)s
        df = df.groupby(['beta', 'pval']).filter(lambda x: len(x) < 5)
        # Add 'Chr' to chromosome column
        df.CHR = df.CHR.apply(lambda x: 'Chr' + str(x))
        # Import class from dataframe
        return co.GWAS.from_DataFrame(df,
                                      'AtLeafHydroIonome',
                                      'Arabidopsis second pass 1.6M',
                                      AtTair10,
                                      term_col='Trait',
                                      chr_col='CHR',
                                      pos_col='POS')
    else:
        return co.GWAS('AtLeafHydroIonome')
Example 5
def AtGen(AtTair10):
    if cf.test.force.COB:
        co.del_dataset('Expr', 'AtGen', force=True)
    if not co.available_datasets('Expr', 'AtGen'):
        General = [
            'GSE18975', 'GSE39384', 'GSE19271', 'GSE5632', 'GSE39385',
            'GSE5630', 'GSE15617', 'GSE5617', 'GSE5686', 'GSE2473', 'GSE5633',
            'GSE5620', 'GSE5628', 'GSE5624', 'GSE5626', 'GSE5621', 'GSE5622',
            'GSE5623', 'GSE5625', 'GSE5688'
        ]
        GenFam = sum([
            co.Family.from_file(
                os.path.join(cf.options.testdir, 'raw', 'GSE',
                             '{}_family.soft.gz'.format(x))) for x in General
        ])
        #GenFam.to_keepfile("GenKeep.tsv")
        return co.COB.from_DataFrame(
            GenFam.series_matrix(keepfile=os.path.join(
                cf.options.testdir, 'raw', 'GSE', 'GenKeep.tsv')),
            'AtGen',
            'Arab General',
            AtTair10,
            rawtype='MICROARRAY',
            quantile=True)
    else:
        return co.COB('AtGen')
Example 6
def AtSeed(AtTair10):
    if cf.test.force.COB:
        co.del_dataset('Expr', 'AtSeed', force=True)
    if not co.available_datasets('Expr', 'AtSeed'):
        Seed = [
            'GSE12404',  #'GSE30223',
            'GSE1051',
            'GSE11852',
            'GSE5634'
        ]
        SeedFam = sum([
            co.Family.from_file(
                os.path.join(cf.options.testdir, 'raw', 'GSE',
                             '{}_family.soft.gz'.format(x))) for x in Seed
        ])
        #SeedFam.to_keepfile("SeedKeep.tsv", keep_hint='seed')
        return co.COB.from_DataFrame(
            SeedFam.series_matrix(keepfile=os.path.join(
                cf.options.testdir, 'raw', 'GSE', 'SeedKeep.tsv')),
            'AtSeed',
            'Arabidopsis Seed',
            AtTair10,
            rawtype='MICROARRAY',
            quantile=True)
    else:
        return co.COB('AtSeed')
Example 7
def ZmRNASeqTissueAtlas(Zm5bFGS):
    if cf.test.force.COB:
        print('Rebuilding ZmRNASeqTissueAtlas')
        co.del_dataset('COB', 'ZmRNASeqTissueAtlas', force=True)
        co.del_dataset('Expr', 'ZmRNASeqTissueAtlas', force=True)
    if not co.available_datasets('Expr', 'ZmRNASeqTissueAtlas'):
        # Build it
        return co.COB.from_table(
            os.path.join(
                cf.options.testdir,
                'raw',
                'Expr',
                'RNASEQ',
                'MaizeRNASeqTissue.tsv.bz2',
            ),
            'ZmRNASeqTissueAtlas',
            'Maize RNASeq Tissue Atlas Network, Sekhon 2013, PLoS ONE',
            Zm5bFGS,
            rawtype='RNASEQ',
            max_gene_missing_data=0.3,
            max_accession_missing_data=0.08,
            min_single_sample_expr=1,
            min_expr=0.001,
            quantile=False,
            max_val=300,
            dry_run=True)
    else:
        return co.COB('ZmRNASeqTissueAtlas')
Example 8
def AtSeed(AtTair10):
    if cf.test.force.COB:
        co.del_dataset('Expr', 'AtSeed', safe=False)
    if not co.available_datasets('Expr', 'AtSeed'):
        Seed = ['GSE12404', #'GSE30223',
                'GSE1051', 'GSE11852', 'GSE5634']
        SeedFam = sum(
            [co.Family.from_file(
                os.path.join(
                    cf.options.testdir,
                    'raw', 'GSE', '{}_family.soft.gz'.format(x)
                )
            )
            for x in Seed ]
        )
        #SeedFam.to_keepfile("SeedKeep.tsv", keep_hint='seed')
        return co.COB.from_DataFrame(
            SeedFam.series_matrix(
                keepfile=os.path.join(
                    cf.options.testdir,
                    'raw', 'GSE', 'SeedKeep.tsv'
                )
            ),
            'AtSeed', 'Arabidopsis Seed',
            AtTair10,
            rawtype='MICROARRAY',
            quantile=True
        )
    else:
        return co.COB('AtSeed')
Example 9
def AtGen(AtTair10):
    if cf.test.force.COB:
        co.del_dataset('Expr', 'AtGen', safe=False)
    if not co.available_datasets('Expr', 'AtGen'):
        General = ['GSE18975', 'GSE39384', 'GSE19271', 'GSE5632', 'GSE39385',
                'GSE5630', 'GSE15617', 'GSE5617', 'GSE5686', 'GSE2473',
                'GSE5633', 'GSE5620', 'GSE5628', 'GSE5624',
                'GSE5626', 'GSE5621', 'GSE5622', 'GSE5623', 'GSE5625', 'GSE5688']
        GenFam = sum(
            [co.Family.from_file(
                os.path.join(
                    cf.options.testdir,
                    'raw', 'GSE', '{}_family.soft.gz'.format(x)
                )
            )
            for x in General ]
        )
        #GenFam.to_keepfile("GenKeep.tsv")
        return co.COB.from_DataFrame(
            GenFam.series_matrix(
                keepfile=os.path.join(
                    cf.options.testdir,
                    'raw', 'GSE', 'GenKeep.tsv'
                )
            ),
            'AtGen', 'Arab General',
            AtTair10,
            rawtype='MICROARRAY',
            quantile=True
        )
    else:
        return co.COB('AtGen')
Example 10
def AtLeaf(AtTair10):
    if cf.test.force.COB:
        co.del_dataset('Expr', 'AtLeaf', safe=False)
    if not co.available_datasets('Expr', 'AtLeaf'):
        Leaf = ['GSE14578', 'GSE5630', 'GSE13739', #'GSE26199',
                'GSE5686', 'GSE5615', 'GSE5620', 'GSE5628',
                'GSE5624', 'GSE5626', 'GSE5621', 'GSE5622',
                'GSE5623', 'GSE5625', 'GSE5688']
        LeafFam = sum(
            [co.Family.from_file(
                os.path.join(
                    cf.options.testdir,
                    'raw', 'GSE', '{}_family.soft.gz'.format(x)
                )
            )
            for x in Leaf ]
        )
        #LeafFam.to_keepfile("LeafKeep.tsv", keep_hint="lea")
        return co.COB.from_DataFrame(
            LeafFam.series_matrix(
                keepfile=os.path.join(
                    cf.options.testdir,
                    'raw', 'GSE', 'LeafKeep.tsv'
                )
            ),
            'AtLeaf', 'Arabidopsis Leaf',
            AtTair10,
            rawtype='MICROARRAY',
            max_gene_missing_data=0.3,
            min_expr=0.01,
            quantile=True,
        )
    else:
        return co.COB('AtLeaf')
Example 11
def AtRoot(AtTair10):
    if cf.test.force.COB:
        co.del_dataset('Expr', 'AtRoot', safe=False)
    if not co.available_datasets('Expr', 'AtRoot'):
        Root = ['GSE14578', 'GSE46205', 'GSE7631', 'GSE10576', 'GSE42007',
                'GSE34130', 'GSE21611', 'GSE22966', 'GSE7641', 'GSE5620',
                'GSE8934', 'GSE5628', 'GSE30095', 'GSE30097', 'GSE5624',
                'GSE5626', 'GSE5749', 'GSE5621', 'GSE5622',
                'GSE5623', 'GSE5625', 'GSE5688']
        RootFam = sum(
            [co.Family.from_file(
                os.path.join(
                    cf.options.testdir,
                    'raw', 'GSE', '{}_family.soft.gz'.format(x)
                )
            )
            for x in Root ]
        )
        #RootFam.to_keepfile("RootKeep.tsv", keep_hint='root')
        return co.COB.from_DataFrame(
            RootFam.series_matrix(
                keepfile=os.path.join(
                    cf.options.testdir,
                    'raw', 'GSE', 'RootKeep.tsv')
            ),
            'AtRoot', 'Arab Root',
            AtTair10,
            rawtype='MICROARRAY',
            quantile=True
        )
    else:
        return co.COB('AtRoot')
Example 12
def ZmIonome(Zm5bFGS):
    # Delete the old dataset
    if cf.test.force.Ontology:
        co.del_dataset('GWAS','ZmIonome',safe=False)
    if not co.available_datasets('GWAS','ZmIonome'):
        # Grab the path to the csv
        csv = os.path.join(
            cf.options.testdir,
            'raw','GWAS','Ionome',
            'sigGWASsnpsCombinedIterations.longhorn.allLoc.csv.gz'
        )
        # Read the GWAS results into a DataFrame
        df = pd.read_csv(csv,index_col=None)
        # Import class from dataframe
        IONS  = co.GWAS.from_DataFrame(
            df,'ZmIonome','Maize Ionome',
            Zm5bFGS,
            term_col='el',chr_col='chr',pos_col='pos'
        )
        # Get rid of pesky Cobalt
        IONS.del_term('Co59')
        # I guess we need a test in here too
        return IONS
    else:
        return co.GWAS('ZmIonome')
Example 13
def AtRootHydroIonome(AtTair10):
    if cf.test.force.Ontology:
        co.del_dataset('GWAS','AtRootHydroIonome',safe=False)
    if not co.available_datasets('GWAS', 'AtRootHydroIonome'):
        # glob glob is god
        csvs = glob.glob(os.path.join(
            cf.options.testdir,
            'raw','GWAS','AtIonome',
            'AtRootHydroIonome','*.csv.gz'
        ))
        # Read in each table individually then concat for GIANT table
        df = pd.concat([pd.read_table(x,sep=' ') for x in csvs])
        # Only keep significant pvals
        df = df.loc[df.pval <= cf.options.alpha,:]
        # Kill groups of SNPs that have identical (beta,pval)s
        df = df.groupby(['beta','pval']).filter(lambda x: len(x) < 5)
        # Add 'Chr' to chromosome column
        df.CHR = df.CHR.apply(lambda x: 'Chr'+str(x))
        # Chase dat refgen
        # Import class from dataframe
        return co.GWAS.from_DataFrame(
            df,'AtRootHydroIonome','Arabidopsis second pass 1.6M',
            AtTair10, term_col='Trait', chr_col='CHR', pos_col='POS'
        )
    else:
        return co.GWAS('AtRootHydroIonome')
Example 14
def ZmIonome(Zm5bFGS):
    # Delete the old dataset
    if cf.test.force.Ontology:
        co.del_dataset('GWAS', 'ZmIonome', force=True)
    if not co.available_datasets('GWAS', 'ZmIonome'):
        # Grab the path to the csv
        csv = os.path.join(
            cf.options.testdir, 'raw', 'GWAS', 'Ionome',
            'sigGWASsnpsCombinedIterations.longhorn.allLoc.csv.gz')
        # Read the GWAS results into a DataFrame
        df = pd.read_csv(csv, index_col=None)
        # Import class from dataframe
        IONS = co.GWAS.from_DataFrame(df,
                                      'ZmIonome',
                                      'Maize Ionome',
                                      Zm5bFGS,
                                      term_col='el',
                                      chr_col='chr',
                                      pos_col='pos')
        # Get rid of pesky Cobalt
        IONS.del_term('Co59')
        # I guess we need a test in here too
        return IONS
    else:
        return co.GWAS('ZmIonome')
Example 15
def ZmRNASeqTissueAtlas(Zm5bFGS):
    if cf.test.force.COB:
        co.del_dataset('COB', 'ZmRNASeqTissueAtlas', safe=False)
        co.del_dataset('Expr', 'ZmRNASeqTissueAtlas', safe=False)
    if not co.available_datasets('Expr', 'ZmRNASeqTissueAtlas'):
        # Build it
        return co.COB.from_table(
            os.path.join(cf.options.testdir,
                'raw', 'Expr', 'RNASEQ',
                'MaizeRNASeqTissue.tsv.bz2',
            ),
            'ZmRNASeqTissueAtlas',
            'Maize RNASeq Tissue Atlas Network, Sekhon 2013, PLoS ONE',
            Zm5bFGS,
            rawtype='RNASEQ',
            max_gene_missing_data=0.3,
            max_accession_missing_data=0.08,
            min_single_sample_expr=1,
            min_expr=0.001,
            quantile=False,
            max_val=300,
            dry_run=True
        )
    else:
        return co.COB('ZmRNASeqTissueAtlas')
Example 16
def available_datasets(type=None,*args):
    return jsonify({ 
        "data" : list(
            co.available_datasets(type)[
                ['Name','Description']
            ].itertuples(index=False))
        }
    )
Example 17
def list_command(args):
    if args.type != None and args.name != None:
        if args.terms:
            if args.type == 'GWAS':
                gwas = co.GWAS(args.name)
                print('\n'.join([x.id for x in gwas.iter_terms()]))
            elif args.type =='GOnt':
                gont = co.GOnt(args.name)
                print('\n'.join([x.id for x in gont.iter_terms()]))
        else:
            print(co.available_datasets(args.type,args.name))
    elif args.type != None and args.name == None:
        args.name = '%'
        print(co.available_datasets(args.type,args.name).to_string())
    else:
        args.type = '%'
        args.name = '%'
        print(co.available_datasets(args.type,args.name).to_string())
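A hedged sketch of argparse wiring that could drive list_command; the subcommand and flag names are illustrative assumptions, and only the attribute names (type, name, terms) come from the function above:

import argparse

parser = argparse.ArgumentParser(prog='camoco')
subparsers = parser.add_subparsers()
list_parser = subparsers.add_parser('list', help='list available datasets')
list_parser.add_argument('--type', default=None)   # e.g. 'GWAS', 'GOnt', 'Expr'
list_parser.add_argument('--name', default=None)   # a specific dataset name
list_parser.add_argument('--terms', action='store_true',
                         help='list the terms inside a GWAS/GOnt dataset')
list_parser.set_defaults(func=list_command)

args = parser.parse_args(['list', '--type', 'GWAS', '--name', 'ZmIonome'])
args.func(args)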
Example 18
def ZmGO(Zm5bFGS):
    if cf.test.force.Ontology:
        co.del_dataset('GOnt', 'ZmGO', force=True)
    if not co.available_datasets('GOnt', 'ZmGO'):
        obo = os.path.join(cf.options.testdir, 'raw', 'GOnt', 'go.obo.bz2')
        gene_map_file = os.path.join(cf.options.testdir, 'raw', 'GOnt',
                                     'zm_go.tsv.bz2')
        return co.GOnt.from_obo(obo, gene_map_file, 'ZmGO',
                                'Maize Gene Ontology', Zm5bFGS)
    else:
        return co.GOnt('ZmGO')
Example 19
def TestGO(Zm5bFGS):
    if cf.test.force.Ontology:
        co.del_dataset('GOnt', 'TestGO', force=True)
    if not co.available_datasets('GOnt', 'TestGO'):
        obo = os.path.join(cf.options.testdir, 'raw', 'GOnt', 'go.test.obo')
        gene_map_file = os.path.join(cf.options.testdir, 'raw', 'GOnt',
                                     'go.test.tsv')
        return co.GOnt.from_obo(obo, gene_map_file, 'TestGO', 'Test GO',
                                Zm5bFGS)
    else:
        return co.GOnt('TestGO')
Example 20
def AtTair10():
    if cf.test.force.RefGen:
        co.del_dataset('RefGen', 'AtTair10', force=True)
    if not co.available_datasets('RefGen', 'AtTair10'):
        gff = os.path.expanduser(
            os.path.join(cf.options.testdir, 'raw', 'RefGen',
                         'TAIR10_GFF3_genes.gff.gz'))
        return co.RefGen.from_gff(gff, 'AtTair10', 'Tair 10', '10',
                                  'Arabidopsis')
    else:
        return co.RefGen('AtTair10')
Example 21
def list_command(args):
    if args.type and args.name:
        if args.type == 'GWAS':
            gwas = co.GWAS(args.name)
            print('\n'.join([x.id for x in gwas.iter_terms()]))
        elif args.type =='GOnt':
            gont = co.GOnt(args.name)
            print('\n'.join([x.id for x in gont.iter_terms()]))
    else:
        args.type = '%'
        args.name = '%'
        print(co.available_datasets(args.type,args.name))
Example 22
def Zm5bFGS():
    if cf.test.force.RefGen:
        co.del_dataset('RefGen', 'Zm5bFGS', force=True)
    if not co.available_datasets('RefGen', 'Zm5bFGS'):
        # We have to build it
        gff = os.path.expanduser(
            os.path.join(cf.options.testdir, 'raw', 'RefGen',
                         'ZmB73_5b_FGS.gff.gz'))
        # This is stupid and necessary because pytables won't let me open
        # more than one table
        co.RefGen.from_gff(gff, 'Zm5bFGS', 'Maize 5b Filtered Gene Set', '5b',
                           'Zea Mays')
    return co.RefGen('Zm5bFGS')
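The fixtures above all repeat the same build-or-load idiom: optionally force-delete, rebuild if available_datasets reports nothing, otherwise load by name. A condensed sketch of that pattern with a hypothetical helper:

def get_or_build(dtype, name, build_fn, load_cls, force=False):
    # Discard a stale copy on request, rebuild the dataset if it is missing,
    # otherwise load the cached version by name.
    if force:
        co.del_dataset(dtype, name, force=True)
    if not co.available_datasets(dtype, name):
        return build_fn()
    return load_cls(name)

# e.g. get_or_build('RefGen', 'AtTair10', build_attair10, co.RefGen),
# where build_attair10 is a placeholder for a builder like the ones above.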
Example 23
def AtTair10():
    if cf.test.force.RefGen:
        co.del_dataset('RefGen', 'AtTair10', safe=False)
    if not co.available_datasets('RefGen', 'AtTair10'):
        gff = os.path.expanduser(
            os.path.join(
                cf.options.testdir,
                'raw', 'RefGen', 'TAIR10_GFF3_genes.gff.gz'
            )
        )
        return co.RefGen.from_gff(
            gff, 'AtTair10', 'Tair 10', '10', 'Arabidopsis'
        )
    else:
        return co.RefGen('AtTair10')
Example 24
def build_gont(args):
    refgen = co.RefGen(args.refgen)
    # Check to see if this dataset is already built
    if co.available_datasets('GOnt', args.name):
        print('Warning! This dataset has already been built.')
        co.del_dataset('GOnt', args.name, force=args.force)

    go = co.GOnt.from_obo(args.obo_filename,
                          args.filename,
                          args.name,
                          args.description,
                          refgen,
                          go_col=args.go_col,
                          id_col=args.id_col)
    print("Done: {}".format(go.summary()))
    print('Build Successful')
Example 25
def AtGO(AtTair10):
    if cf.test.force.Ontology:
        co.del_dataset('GOnt', 'AtGO', force=True)
    if not co.available_datasets('GOnt', 'AtGO'):
        obo = os.path.join(cf.options.testdir, 'raw', 'GOnt', 'go.obo.bz2')
        gene_map_file = os.path.join(cf.options.testdir, 'raw', 'GOnt',
                                     'ath_go.tsv.bz2')
        return co.GOnt.from_obo(obo,
                                gene_map_file,
                                'AtGO',
                                'Arabidopsis Gene Ontology',
                                AtTair10,
                                id_col=0,
                                go_col=5)
    else:
        return co.GOnt('AtGO')
Example 26
def build_cob(args):
    # Build the refgen
    refgen = co.RefGen(args.refgen)
    # Check that the sep is likely right.
    if len(pd.read_table(args.filename, sep=args.sep).columns) == 1:
        print(
            ("Detected only 1 column in {}, are you sure "
             "colunms are separated by '{}'?").format(args.filename, args.sep))
        return None
    if args.allow_non_membership:
        refgen = refgen.copy('{}_tmp'.format(refgen.name),
                             'temp refgen')
        # Add non membership genes
        for gid in pd.read_table(args.filename, sep=args.sep).index:
            refgen.add_gene(Gene(None, None, id=gid))

    quality_control = not args.skip_quality_control
    normalize = not args.skip_normalization

    # Check to see if this dataset is already built
    if co.available_datasets('Expr', args.name):
        print('Warning! This dataset has already been built.')
        co.del_dataset('Expr', args.name, safe=args.force)

    # Basically just pass all the CLI arguments to the COB class method
    cob = co.COB.from_table(
        args.filename,
        args.name,
        args.description,
        refgen,
        # Optional arguments
        sep=args.sep,
        rawtype=args.rawtype,
        # Data Processing
        quality_control=quality_control,
        normalization=normalize,
        quantile=args.quantile,
        # Data processing parameters
        max_gene_missing_data=args.max_gene_missing_data,
        max_accession_missing_data=args.max_accession_missing_data,
        min_single_sample_expr=args.min_single_sample_expr,
        min_expr=args.min_expr,
        max_val=args.max_val,
        dry_run=args.dry_run,
        index_col=args.index_col)
    print("Build successful!")
    print(cob.summary())
Example 27
def Zm5bFGS():
    if cf.test.force.RefGen:
        co.del_dataset('RefGen', 'Zm5bFGS', safe=False)
    if not co.available_datasets('RefGen', 'Zm5bFGS'):
        # We have to build it
        gff = os.path.expanduser(
            os.path.join(
                cf.options.testdir,
                'raw', 'RefGen', 'ZmB73_5b_FGS.gff.gz'
            )
        )
        # This is stupid and necessary because pytables won't let me open
        # more than one table
        return co.RefGen.from_gff(
            gff, 'Zm5bFGS', 'Maize 5b Filtered Gene Set', '5b', 'Zea Mays'
        )
    return co.RefGen('Zm5bFGS')
Example 28
def ZmSAM(Zm5bFGS):
    if cf.test.force.COB:
        co.del_dataset('Expr', 'ZmSAM', force=True)
    if not co.available_datasets('Expr', 'ZmSAM'):
        return co.COB.from_table(os.path.join(
            cf.options.testdir, 'raw', 'Expr', 'RNASEQ',
            'TranscriptomeProfiling_B73_Atlas_SAM_FGS_LiLin_20140316.txt.gz'),
                                 'ZmSAM',
                                 'Maize Root Network',
                                 Zm5bFGS,
                                 rawtype='RNASEQ',
                                 max_gene_missing_data=0.4,
                                 min_expr=0.1,
                                 quantile=False,
                                 dry_run=False,
                                 max_val=250)
    else:
        return co.COB('ZmSAM')
Example 29
def ZmGO(Zm5bFGS):
    if cf.test.force.Ontology:
        co.del_dataset('GOnt','ZmGO',safe=False)
    if not co.available_datasets('GOnt','ZmGO'):
        obo = os.path.join(
            cf.options.testdir,
            'raw','GOnt','go.obo.bz2'
        )
        gene_map_file = os.path.join(
            cf.options.testdir,
            'raw','GOnt','zm_go.tsv.bz2'
        )
        return co.GOnt.from_obo(
           obo, gene_map_file, 'ZmGO',
           'Maize Gene Ontology', Zm5bFGS
        )
    else:
        return co.GOnt('ZmGO')
Example 30
def ZmPAN(Zm5bFGS):
    if cf.test.force.COB:
        co.del_dataset('Expr', 'ZmPAN', force=True)
    if not co.available_datasets('Expr', 'ZmPAN'):
        return co.COB.from_table(os.path.join(cf.options.testdir, 'raw',
                                              'Expr', 'RNASEQ',
                                              'PANGenomeFPKM.txt.gz'),
                                 'ZmPAN',
                                 'Maize Root Network',
                                 Zm5bFGS,
                                 rawtype='RNASEQ',
                                 max_gene_missing_data=0.4,
                                 min_expr=1,
                                 quantile=False,
                                 dry_run=False,
                                 sep=',',
                                 max_val=300)
    else:
        return co.COB('ZmPAN')
Example 31
def AtGO(AtTair10):
    if cf.test.force.Ontology:
        co.del_dataset('GOnt','AtGO',safe=False)
    if not co.available_datasets('GOnt','AtGO'):
        obo = os.path.join(
            cf.options.testdir,
            'raw','GOnt','go.obo.bz2'
        )
        gene_map_file = os.path.join(
            cf.options.testdir,
            'raw','GOnt','ath_go.tsv.bz2'
        )
        return co.GOnt.from_obo(
           obo, gene_map_file, 'AtGO',
           'Arabidopsis Gene Ontology', AtTair10,
           id_col=0, go_col=5 
        )
    else:
        return co.GOnt('AtGO')
Example 32
def ZmRoot(Zm5bFGS):
    if cf.test.force.COB:
        co.del_dataset('Expr', 'ZmRoot', force=True)
    if not co.available_datasets('Expr', 'ZmRoot'):
        return co.COB.from_table(os.path.join(cf.options.testdir, 'raw',
                                              'Expr', 'RNASEQ',
                                              'ROOTFPKM.tsv.gz'),
                                 'ZmRoot',
                                 'Maize Root Network',
                                 Zm5bFGS,
                                 rawtype='RNASEQ',
                                 max_gene_missing_data=0.3,
                                 max_accession_missing_data=0.08,
                                 min_single_sample_expr=1,
                                 min_expr=0.001,
                                 quantile=False,
                                 max_val=300)
    else:
        return co.COB('ZmRoot')
Example 33
def ZmWallace(Zm5bFGS):
    if cf.test.force.Ontology:
        co.del_dataset('GWAS', 'ZmWallace', force=True)
    if not co.available_datasets('GWAS', 'ZmWallace'):
        # Grab the path to the csv
        csv = os.path.join(
            cf.options.testdir, 'raw', 'GWAS', 'WallacePLoSGenet',
            'Wallace_etal_2014_PLoSGenet_GWAS_hits-150112.txt.bz2')
        # Read the GWAS results into a DataFrame
        df = pd.read_csv(csv, index_col=None, sep='\t')
        # Import class from dataframe
        gwas = co.GWAS.from_DataFrame(df,
                                      'ZmWallace',
                                      'Wallace PLoS ONE Dataset.',
                                      Zm5bFGS,
                                      term_col='trait',
                                      chr_col='chr',
                                      pos_col='pos')
        return gwas
    else:
        return co.GWAS('ZmWallace')
Example 34
def ZmWallace(Zm5bFGS):
    if cf.test.force.Ontology:
        co.del_dataset('GWAS','ZmWallace',safe=False)
    if not co.available_datasets('GWAS','ZmWallace'):
        # Grab the path to the csv
        csv = os.path.join(
            cf.options.testdir,
            'raw','GWAS','WallacePLoSGenet',
            'Wallace_etal_2014_PLoSGenet_GWAS_hits-150112.txt.bz2'
        )
        # Read the GWAS results into a DataFrame
        df = pd.read_csv(csv,index_col=None,sep='\t')
        # Import class from dataframe
        gwas  = co.GWAS.from_DataFrame(
            df, 'ZmWallace', 'Wallace PLoS ONE Dataset.',
            Zm5bFGS,
            term_col='trait', chr_col='chr', pos_col='pos'
        )
        return gwas
    else:
        return co.GWAS('ZmWallace')
Example 35
def AtLeaf(AtTair10):
    if cf.test.force.COB:
        co.del_dataset('Expr', 'AtLeaf', force=True)
    if not co.available_datasets('Expr', 'AtLeaf'):
        Leaf = [
            'GSE14578',
            'GSE5630',
            'GSE13739',  #'GSE26199',
            'GSE5686',
            'GSE5615',
            'GSE5620',
            'GSE5628',
            'GSE5624',
            'GSE5626',
            'GSE5621',
            'GSE5622',
            'GSE5623',
            'GSE5625',
            'GSE5688'
        ]
        LeafFam = sum([
            co.Family.from_file(
                os.path.join(cf.options.testdir, 'raw', 'GSE',
                             '{}_family.soft.gz'.format(x))) for x in Leaf
        ])
        #LeafFam.to_keepfile("LeafKeep.tsv", keep_hint="lea")
        return co.COB.from_DataFrame(
            LeafFam.series_matrix(keepfile=os.path.join(
                cf.options.testdir, 'raw', 'GSE', 'LeafKeep.tsv')),
            'AtLeaf',
            'Arabidopsis Leaf',
            AtTair10,
            rawtype='MICROARRAY',
            max_gene_missing_data=0.3,
            min_expr=0.01,
            quantile=True,
        )
    else:
        return co.COB('AtLeaf')
Example 36
def ZmSAM(Zm5bFGS):
    if cf.test.force.COB:
        co.del_dataset('Expr','ZmSAM',safe=False)
    if not co.available_datasets('Expr','ZmSAM'):
        return co.COB.from_table(
            os.path.join(
                cf.options.testdir,
                'raw','Expr','RNASEQ',
                'TranscriptomeProfiling_B73_Atlas_SAM_FGS_LiLin_20140316.txt.gz'
            ),
            'ZmSAM',
            'Maize Root Network',
            Zm5bFGS,
            rawtype='RNASEQ',
            max_gene_missing_data=0.4,
            min_expr=0.1,
            quantile=False,
            dry_run=False,
            max_val=250
        )
    else:
        return co.COB('ZmSAM')
Example 37
def ZmRoot(Zm5bFGS):
    if cf.test.force.COB:
        co.del_dataset('Expr','ZmRoot',safe=False)
    if not co.available_datasets('Expr','ZmRoot'):
        return co.COB.from_table(
            os.path.join(
                cf.options.testdir,
                'raw','Expr',
                'RNASEQ','ROOTFPKM.tsv.gz'
            ),
            'ZmRoot',
            'Maize Root Network',
            Zm5bFGS,
            rawtype='RNASEQ',
            max_gene_missing_data=0.3,
            max_accession_missing_data=0.08,
            min_single_sample_expr=1,
            min_expr=0.001,
            quantile=False,
            max_val=300
        )
    else:
        return co.COB('ZmRoot')
Example 38
def ZmPAN(Zm5bFGS):
    if cf.test.force.COB:
        co.del_dataset('Expr','ZmPAN',safe=False)
    if not co.available_datasets('Expr','ZmPAN'):
        return co.COB.from_table(
            os.path.join(
                cf.options.testdir,
                'raw','Expr','RNASEQ',
                'PANGenomeFPKM.txt.gz'
            ),
            'ZmPAN',
            'Maize Root Network',
            Zm5bFGS,
            rawtype='RNASEQ',
            max_gene_missing_data=0.4,
            min_expr=1,
            quantile=False,
            dry_run=False,
            sep=',',
            max_val=300
        )
    else:
        return co.COB('ZmPAN')
Example 39
def snp2gene(args):
    '''
        Perform SNP (locus) to candidate gene mapping
    '''

    if args.out != sys.stdout:
        # Create any non-existent directories
        if os.path.dirname(args.out) != '':
            os.makedirs(os.path.dirname(args.out),exist_ok=True)
        if os.path.exists(args.out) and not args.force:
            print(
                "Output for {} exists! Skipping!".format(
                    args.out
                ),file=sys.stderr
            )
            return None

    # Set a flag saying this is from a COB refgen
    from_cob = False
    # Create the refgen (option to create it from a COB)
    if co.available_datasets('Expr',args.refgen):
        refgen = co.COB(args.refgen).refgen
        from_cob = args.refgen 
    elif co.available_datasets('RefGen',args.refgen):
        refgen = co.RefGen(args.refgen)
    # Create the GWAS object
    ont = co.GWAS(args.gwas)

    if 'all' in args.terms:
        terms = ont.iter_terms()
    else:
        terms = [ont[term] for term in args.terms]

    data = pd.DataFrame()
    results = []
    for term in terms:
        for window_size in args.candidate_window_size:
            for flank_limit in args.candidate_flank_limit:
                if 'effective' in args.snp2gene:
                    # Map to effective
                    effective_loci = term.effective_loci(
                        window_size=window_size
                    )
                elif 'strongest' in args.snp2gene:
                    effective_loci = term.strongest_loci(
                        window_size=window_size,
                        attr=args.strongest_attr,
                        lowest=args.strongest_higher
                    )
                genes = pd.DataFrame([ x.as_dict() for x in 
                    refgen.candidate_genes(
                        effective_loci,
                        flank_limit=flank_limit,
                        include_parent_locus=True,
                        include_num_siblings=True,
                        include_num_intervening=True,
                        include_rank_intervening=True,
                        include_SNP_distance=True,
                        include_parent_attrs=args.include_parent_attrs,
                        attrs={'Term':term.id},
                    )
                ])
                genes['FlankLimit'] = flank_limit
                genes['WindowSize'] = window_size
                genes['RefGen'] = refgen.name
                if from_cob != False:
                    genes['COB'] = from_cob
                data = pd.concat([data,genes])

    # Add data from gene info files
    original_number_genes = len(data)
    for info_file in args.gene_info:
        log('Adding info for {}',info_file)
        # Assume the file is a table
        info = pd.read_table(info_file,sep='\t')
        if len(info.columns) == 1:
            info = pd.read_table(info_file,sep=',')
        # try to match as many columns as possible
        matching_columns = set(data.columns).intersection(info.columns)
        log("Joining SNP2Gene mappings with info file on: {}",','.join(matching_columns))
        data = pd.merge(data,info,how='left')
        if len(data) != original_number_genes:
            log.warn(
                'There were multiple info rows for some genes. '
                'Beware of potential duplicate candidate gene entries! '
            )
    
    # Generate the output file
    data.to_csv(args.out,index=None,sep='\t')

    log("Summary stats")
    print('-'*100)
    #print('With {}kb windows and up to {} flanking genes'.format(int(args.candidate_window_size/1000),args.candidate_flank_limit))
    print("Mapped {} SNPs to {} genes".format(len(data.parent_locus.unique()),len(data.ID.unique())))
    print("Number of candidate genes per term:")
    print(data.groupby('Term').apply(lambda df: len(df.ID)))
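A hedged sketch of calling snp2gene programmatically; the namespace attributes mirror exactly what the function body above reads, and the dataset names reuse Zm5bFGS/ZmIonome from the other examples (the real CLI flag names may differ):

from types import SimpleNamespace

args = SimpleNamespace(
    out='candidates.tsv', force=True,
    refgen='Zm5bFGS', gwas='ZmIonome', terms=['all'],
    candidate_window_size=[50000], candidate_flank_limit=[2],
    snp2gene='effective', strongest_attr='pval', strongest_higher=True,
    include_parent_attrs=[], gene_info=[],
)
snp2gene(args)  # writes the candidate gene table to candidates.tsv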
Example 40
    'default':dflt['snpLevels'],'min':1,'max':10,'int':True},
  'pCutoff':{'title':'Probability Cutoff',
    'default':dflt['pCutoff'],'min':0.0,'max':1.0,'int':False},
  'minTerm':{'title':'Min Genes (GO)',
    'default':dflt['minTerm'],'min':1,'max':99,'int':True},
  'maxTerm':{'title':'Max Genes (GO)',
    'default':dflt['maxTerm'],'min':100,'max':1000,'int':True},
}

# ----------------------------------------
#    Load things to memory to prepare
# ----------------------------------------
# Generate network list based on allowed list
print('Preloading networks into memory...')
if len(conf['networks']) < 1:
    conf['networks'] = list(co.available_datasets('Expr')['Name'].values)
networks = {x:co.COB(x) for x in conf['networks']}
network_info = [[net.name, net._global('parent_refgen'), net.description] for name,net in networks.items()]
print('Available Networks: ' + str(networks))

# Generate ontology list based on allowed list and load them into memory
print('Preloading GWASes into Memory...')
if len(conf['gwas']) < 1:
    conf['gwas'] = list(co.available_datasets('GWAS')['Name'].values)
onts = {x:co.GWAS(x) for x in conf['gwas']}
onts_info = {}
for m,net in networks.items():
    ref = net._global('parent_refgen')
    onts_info[net.name] = []
    for n,ont in onts.items():
        if ont.refgen.name == ref:
Example 41
def snp2gene(args):
    '''
        Perform SNP (locus) to candidate gene mapping
    '''

    if args.out != sys.stdout:
        # Create any non-existent directories
        if os.path.dirname(args.out) != '':
            os.makedirs(os.path.dirname(args.out), exist_ok=True)
        if os.path.exists(args.out) and not args.force:
            print("Output for {} exists! Skipping!".format(args.out),
                  file=sys.stderr)
            return None

    # Set a flag saying this is from a COB refgen
    from_cob = False
    # Create the refgen (option to create it from a COB)
    if co.available_datasets('Expr', args.refgen):
        refgen = co.COB(args.refgen).refgen
        from_cob = args.refgen
    elif co.available_datasets('RefGen', args.refgen):
        refgen = co.RefGen(args.refgen)
    # Create the GWAS object
    ont = co.GWAS(args.gwas)

    if 'all' in args.terms:
        terms = ont.iter_terms()
    else:
        terms = [ont[term] for term in args.terms]

    data = pd.DataFrame()
    results = []
    for term in terms:
        for window_size in args.candidate_window_size:
            for flank_limit in args.candidate_flank_limit:
                if 'effective' in args.snp2gene:
                    # Map to effective
                    effective_loci = term.effective_loci(
                        window_size=window_size)
                elif 'strongest' in args.snp2gene:
                    effective_loci = term.strongest_loci(
                        window_size=window_size,
                        attr=args.strongest_attr,
                        lowest=args.strongest_higher)
                genes = pd.DataFrame([
                    x.as_dict() for x in refgen.candidate_genes(
                        effective_loci,
                        flank_limit=flank_limit,
                        include_parent_locus=True,
                        include_num_siblings=True,
                        include_num_intervening=True,
                        include_rank_intervening=True,
                        include_SNP_distance=True,
                        include_parent_attrs=args.include_parent_attrs,
                        attrs={'Term': term.id},
                    )
                ])
                genes['FlankLimit'] = flank_limit
                genes['WindowSize'] = window_size
                genes['RefGen'] = refgen.name
                if from_cob != False:
                    genes['COB'] = from_cob
                data = pd.concat([data, genes])

    # Add data from gene info files
    original_number_genes = len(data)
    for info_file in args.gene_info:
        log('Adding info for {}', info_file)
        # Assume the file is a table
        info = pd.read_table(info_file, sep='\t')
        if len(info.columns) == 1:
            info = pd.read_table(info_file, sep=',')
        # try to match as many columns as possible
        matching_columns = set(data.columns).intersection(info.columns)
        log("Joining SNP2Gene mappings with info file on: {}",
            ','.join(matching_columns))
        data = pd.merge(data, info, how='left')
        if len(data) != original_number_genes:
            log.warn('There were multiple info rows for some genes. '
                     'Beware of potential duplicate candidate gene entries! ')

    # Generate the output file
    data.to_csv(args.out, index=None, sep='\t')

    log("Summary stats")
    print('-' * 100)
    #print('With {}kb windows and up to {} flanking genes'.format(int(args.candidate_window_size/1000),args.candidate_flank_limit))
    print("Mapped {} SNPs to {} genes".format(len(data.parent_locus.unique()),
                                              len(data.ID.unique())))
    print("Number of candidate genes per term:")
    print(data.groupby('Term').apply(lambda df: len(df.ID)))
Example 42
File: COB.py Project: monprin/cob
# Prefetch the gene names for all the networks
print('Fetching gene names for networks...')
network_genes = {}
for name, net in networks.items():
    ids = list(net._expr.index.values)
    als = co.RefGen(net._global('parent_refgen')).aliases(ids)
    for k,v in als.items():
        ids += v
    network_genes[name] = list(set(ids))
print('Found gene names')

# Find all of the GWAS data we have available
print('Finding GWAS Data...')
gwas_data_db = {}
for gwas in co.available_datasets('GWASData')['Name']:
    gwas_data_db[gwas] = co.GWASData(gwas)

# Find any functional annotations we have 
print('Finding functional annotations...')
func_data_db = {}
for func in co.available_datasets('RefGenFunc')['Name']:
    print('Processing annotations for {}...'.format(func))
    func_data_db[func] = co.RefGenFunc(func)
    func_data_db[func].to_csv(os.path.join(scratch_folder,(func+'.tsv')))
    geneWordBuilder(func,[os.path.join(scratch_folder,(func+'.tsv'))],[1],['2 end'],['tab'],[True])

# Find any GO ontologies we have for the networks we have
print('Finding applicable GO Ontologies...')
GOnt_db = {}
for name in co.available_datasets('GOnt')['Name']:
Example 43
print('Available Networks: ' + str(networks))

# Prefetch the gene names for all the networks
print('Fetching gene names for networks...')
network_genes = {}
for name, net in networks.items():
    ids = list(net._expr.index.values)
    als = co.RefGen(net._global('parent_refgen')).aliases(ids)
    for k,v in als.items():
        ids += v
    network_genes[name] = list(set(ids))
print('Found gene names')

# Generate in-memory list of available GWAS datasets
print('Finding available GWAS datasets...')
gwas_sets = {"data" : list(co.available_datasets('GWAS')[
            ['Name','Description']].itertuples(index=False))}

# Find all of the GWAS data we have available
print('Finding GWAS Data...')
gwas_data_db = {}
for gwas in co.available_datasets('GWASData')['Name']:
    gwas_data_db[gwas] = co.GWASData(gwas) 

# Find any functional annotations we have 
print('Finding functional annotations...')
func_data_db = {}
for func in co.available_datasets('RefGenFunc')['Name']:
    print('Processing annotations for {}...'.format(func))
    func_data_db[func] = co.RefGenFunc(func)
    func_data_db[func].to_csv(os.path.join(scratch_folder,(func+'.tsv')))
    geneWordBuilder(func,[os.path.join(scratch_folder,(func+'.tsv'))],[1],['2 end'],['tab'],[True])
Example 44
def all_available_datasets():
    return str(co.available_datasets())
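The snippets above consistently treat the return value of co.available_datasets() as a pandas DataFrame with at least Name and Description columns; a short sketch of inspecting it under that assumption:

datasets = co.available_datasets('Expr')            # assumed: a pandas DataFrame
print(datasets[['Name', 'Description']].to_string())
print(list(datasets['Name'].values))                # just the dataset names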
Example 45
        'default':dflt['hpo'],
        'state':dflt['hpo'],
        'isBool':True},
    'visEnrich':{
        'default':dflt['visEnrich'],
        'state':dflt['visEnrich'],
        'isBool':True},
    }

# ----------------------------------------
#    Load things to memory to prepare
# ----------------------------------------
# Generate network list based on allowed list
print('Preloading networks into memory...')
if len(conf['networks']) < 1:
    conf['networks'] = list(co.available_datasets('Expr')['Name'].values)
networks = {x:co.COB(x) for x in conf['networks']}

network_info = []
refLinks = {}
for name,net in networks.items():
    network_info.append({
        'name':net.name,
        'refgen':net._global('parent_refgen'),
        'desc':net.description,
    })
    if net._global('parent_refgen') in conf['refLinks']:
        refLinks[net.name] = conf['refLinks'][net._global('parent_refgen')]
print('Available Networks: ' + str(networks))

# Generate ontology list based on allowed list and load them into memory