# Shared imports for the snippets below. The camoco-internal paths for
# cf, log, and Gene, and the geneWordBuilder import, are assumptions
# inferred from how this code uses them, not confirmed from the source.
import glob
import os
import sys

import pandas as pd
from flask import jsonify

import camoco as co
from camoco.Config import cf            # assumed: camoco's config object
from camoco.Tools import log            # assumed: camoco's logger
from camoco.Locus import Gene           # assumed: used by build_cob below
from genewordsearch.DBBuilder import geneWordBuilder  # assumed: used by the server code


def available_datasets(type=None, *args):
    if type in ('GWAS', 'Ontology'):
        return jsonify(gwas_sets)
    elif type in ('Expr', 'Network'):
        return jsonify(network_list)
    elif type == 'All':
        return str(co.available_datasets())
    else:
        return jsonify({
            "data": list(co.available_datasets(type)[
                ['Name', 'Description']].itertuples(index=False))
        })
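# For context: a minimal sketch (an assumption, not taken from the source)
# of how the available_datasets() view above could be registered on a
# Flask app. The gwas_sets and network_list globals it reads are
# module-level state; gwas_sets, for example, is built in the server
# preload code further down.
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/available_datasets', 'available_datasets_all',
                 available_datasets)
app.add_url_rule('/available_datasets/<type>', 'available_datasets',
                 available_datasets)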
def AtRoot(AtTair10):
    if cf.test.force.COB:
        co.del_dataset('Expr', 'AtRoot', force=True)
    if not co.available_datasets('Expr', 'AtRoot'):
        Root = [
            'GSE14578', 'GSE46205', 'GSE7631', 'GSE10576', 'GSE42007',
            'GSE34130', 'GSE21611', 'GSE22966', 'GSE7641', 'GSE5620',
            'GSE8934', 'GSE5628', 'GSE30095', 'GSE30097', 'GSE5624',
            'GSE5626', 'GSE5749', 'GSE5621', 'GSE5622', 'GSE5623',
            'GSE5625', 'GSE5688'
        ]
        RootFam = sum([
            co.Family.from_file(
                os.path.join(cf.options.testdir, 'raw', 'GSE',
                             '{}_family.soft.gz'.format(x)))
            for x in Root
        ])
        #RootFam.to_keepfile("RootKeep.tsv", keep_hint='root')
        return co.COB.from_DataFrame(
            RootFam.series_matrix(keepfile=os.path.join(
                cf.options.testdir, 'raw', 'GSE', 'RootKeep.tsv')),
            'AtRoot', 'Arab Root', AtTair10,
            rawtype='MICROARRAY', quantile=True)
    else:
        return co.COB('AtRoot')

def AtLeafHydroIonome(AtTair10):
    if cf.test.force.Ontology:
        co.del_dataset('GWAS', 'AtLeafHydroIonome', force=True)
    if not co.available_datasets('GWAS', 'AtLeafHydroIonome'):
        # glob glob is god
        csvs = glob.glob(
            os.path.join(cf.options.testdir, 'raw', 'GWAS', 'AtIonome',
                         'AtLeafHydroIonome', '*.csv.gz'))
        # Read in each table individually then concat for GIANT table
        df = pd.concat([pd.read_table(x, sep=' ') for x in csvs])
        # Only keep significant pvals
        df = df.loc[df.pval <= cf.options.alpha, :]
        # Kill groups of SNPs that have identical (beta,pval)s
        df = df.groupby(['beta', 'pval']).filter(lambda x: len(x) < 5)
        # Add 'Chr' to chromosome column
        df.CHR = df.CHR.apply(lambda x: 'Chr' + str(x))
        # Import class from dataframe
        return co.GWAS.from_DataFrame(
            df, 'AtLeafHydroIonome', 'Arabidopsis second pass 1.6M',
            AtTair10, term_col='Trait', chr_col='CHR', pos_col='POS')
    else:
        return co.GWAS('AtLeafHydroIonome')

def AtGen(AtTair10):
    if cf.test.force.COB:
        co.del_dataset('Expr', 'AtGen', force=True)
    if not co.available_datasets('Expr', 'AtGen'):
        General = [
            'GSE18975', 'GSE39384', 'GSE19271', 'GSE5632', 'GSE39385',
            'GSE5630', 'GSE15617', 'GSE5617', 'GSE5686', 'GSE2473',
            'GSE5633', 'GSE5620', 'GSE5628', 'GSE5624', 'GSE5626',
            'GSE5621', 'GSE5622', 'GSE5623', 'GSE5625', 'GSE5688'
        ]
        GenFam = sum([
            co.Family.from_file(
                os.path.join(cf.options.testdir, 'raw', 'GSE',
                             '{}_family.soft.gz'.format(x)))
            for x in General
        ])
        #GenFam.to_keepfile("GenKeep.tsv")
        return co.COB.from_DataFrame(
            GenFam.series_matrix(keepfile=os.path.join(
                cf.options.testdir, 'raw', 'GSE', 'GenKeep.tsv')),
            'AtGen', 'Arab General', AtTair10,
            rawtype='MICROARRAY', quantile=True)
    else:
        return co.COB('AtGen')

def AtSeed(AtTair10):
    if cf.test.force.COB:
        co.del_dataset('Expr', 'AtSeed', force=True)
    if not co.available_datasets('Expr', 'AtSeed'):
        Seed = [
            'GSE12404',
            #'GSE30223',
            'GSE1051', 'GSE11852', 'GSE5634'
        ]
        SeedFam = sum([
            co.Family.from_file(
                os.path.join(cf.options.testdir, 'raw', 'GSE',
                             '{}_family.soft.gz'.format(x)))
            for x in Seed
        ])
        #SeedFam.to_keepfile("SeedKeep.tsv", keep_hint='seed')
        return co.COB.from_DataFrame(
            SeedFam.series_matrix(keepfile=os.path.join(
                cf.options.testdir, 'raw', 'GSE', 'SeedKeep.tsv')),
            'AtSeed', 'Arabidopsis Seed', AtTair10,
            rawtype='MICROARRAY', quantile=True)
    else:
        return co.COB('AtSeed')

def ZmRNASeqTissueAtlas(Zm5bFGS):
    if cf.test.force.COB:
        print('Rebuilding ZmRNASeqTissueAtlas')
        co.del_dataset('COB', 'ZmRNASeqTissueAtlas', force=True)
        co.del_dataset('Expr', 'ZmRNASeqTissueAtlas', force=True)
    if not co.available_datasets('Expr', 'ZmRNASeqTissueAtlas'):
        # Build it
        return co.COB.from_table(
            os.path.join(cf.options.testdir, 'raw', 'Expr', 'RNASEQ',
                         'MaizeRNASeqTissue.tsv.bz2'),
            'ZmRNASeqTissueAtlas',
            'Maize RNASeq Tissue Atlas Network, Sekhon 2013, PLoS ONE',
            Zm5bFGS,
            rawtype='RNASEQ',
            max_gene_missing_data=0.3,
            max_accession_missing_data=0.08,
            min_single_sample_expr=1,
            min_expr=0.001,
            quantile=False,
            max_val=300,
            dry_run=True)
    else:
        return co.COB('ZmRNASeqTissueAtlas')

def AtRootHydroIonome(AtTair10):
    if cf.test.force.Ontology:
        co.del_dataset('GWAS', 'AtRootHydroIonome', safe=False)
    if not co.available_datasets('GWAS', 'AtRootHydroIonome'):
        # glob glob is god
        csvs = glob.glob(os.path.join(
            cf.options.testdir, 'raw', 'GWAS', 'AtIonome',
            'AtRootHydroIonome', '*.csv.gz'))
        # Read in each table individually then concat for GIANT table
        df = pd.concat([pd.read_table(x, sep=' ') for x in csvs])
        # Only keep significant pvals
        df = df.loc[df.pval <= cf.options.alpha, :]
        # Kill groups of SNPs that have identical (beta,pval)s
        df = df.groupby(['beta', 'pval']).filter(lambda x: len(x) < 5)
        # Add 'Chr' to chromosome column
        df.CHR = df.CHR.apply(lambda x: 'Chr' + str(x))
        # Import class from dataframe
        return co.GWAS.from_DataFrame(
            df, 'AtRootHydroIonome', 'Arabidopsis second pass 1.6M',
            AtTair10, term_col='Trait', chr_col='CHR', pos_col='POS')
    else:
        return co.GWAS('AtRootHydroIonome')

def ZmIonome(Zm5bFGS):
    # Delete the old dataset if a rebuild is forced
    if cf.test.force.Ontology:
        co.del_dataset('GWAS', 'ZmIonome', force=True)
    if not co.available_datasets('GWAS', 'ZmIonome'):
        # Grab the path to the csv
        csv = os.path.join(
            cf.options.testdir, 'raw', 'GWAS', 'Ionome',
            'sigGWASsnpsCombinedIterations.longhorn.allLoc.csv.gz')
        # Read the hits table (pd.DataFrame.from_csv is deprecated)
        df = pd.read_csv(csv, index_col=None)
        # Build the GWAS object from the dataframe
        IONS = co.GWAS.from_DataFrame(
            df, 'ZmIonome', 'Maize Ionome', Zm5bFGS,
            term_col='el', chr_col='chr', pos_col='pos')
        # Get rid of pesky Cobalt
        IONS.del_term('Co59')
        return IONS
    else:
        return co.GWAS('ZmIonome')

def list_command(args):
    if args.type is not None and args.name is not None:
        if args.terms:
            if args.type == 'GWAS':
                gwas = co.GWAS(args.name)
                print('\n'.join([x.id for x in gwas.iter_terms()]))
            elif args.type == 'GOnt':
                gont = co.GOnt(args.name)
                print('\n'.join([x.id for x in gont.iter_terms()]))
        else:
            print(co.available_datasets(args.type, args.name))
    elif args.type is not None and args.name is None:
        args.name = '%'
        print(co.available_datasets(args.type, args.name).to_string())
    else:
        args.type = '%'
        args.name = '%'
        print(co.available_datasets(args.type, args.name).to_string())
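# A minimal sketch of driving list_command() without the CLI parser by
# handing it an argparse.Namespace. The dataset name here is illustrative;
# any built GWAS dataset would do.
from argparse import Namespace

# Prints one term id per line for the named GWAS dataset
list_command(Namespace(type='GWAS', name='ZmIonome', terms=True))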
def ZmGO(Zm5bFGS):
    if cf.test.force.Ontology:
        co.del_dataset('GOnt', 'ZmGO', force=True)
    if not co.available_datasets('GOnt', 'ZmGO'):
        obo = os.path.join(cf.options.testdir, 'raw', 'GOnt', 'go.obo.bz2')
        gene_map_file = os.path.join(cf.options.testdir, 'raw', 'GOnt',
                                     'zm_go.tsv.bz2')
        return co.GOnt.from_obo(obo, gene_map_file, 'ZmGO',
                                'Maize Gene Ontology', Zm5bFGS)
    else:
        return co.GOnt('ZmGO')

def TestGO(Zm5bFGS):
    if cf.test.force.Ontology:
        co.del_dataset('GOnt', 'TestGO', force=True)
    if not co.available_datasets('GOnt', 'TestGO'):
        obo = os.path.join(cf.options.testdir, 'raw', 'GOnt', 'go.test.obo')
        gene_map_file = os.path.join(cf.options.testdir, 'raw', 'GOnt',
                                     'go.test.tsv')
        return co.GOnt.from_obo(obo, gene_map_file, 'TestGO', 'Test GO',
                                Zm5bFGS)
    else:
        return co.GOnt('TestGO')

def AtTair10():
    if cf.test.force.RefGen:
        co.del_dataset('RefGen', 'AtTair10', force=True)
    if not co.available_datasets('RefGen', 'AtTair10'):
        gff = os.path.expanduser(
            os.path.join(cf.options.testdir, 'raw', 'RefGen',
                         'TAIR10_GFF3_genes.gff.gz'))
        return co.RefGen.from_gff(gff, 'AtTair10', 'Tair 10', '10',
                                  'Arabidopsis')
    else:
        return co.RefGen('AtTair10')

def Zm5bFGS():
    if cf.test.force.RefGen:
        co.del_dataset('RefGen', 'Zm5bFGS', force=True)
    if not co.available_datasets('RefGen', 'Zm5bFGS'):
        # We have to build it
        gff = os.path.expanduser(
            os.path.join(cf.options.testdir, 'raw', 'RefGen',
                         'ZmB73_5b_FGS.gff.gz'))
        # This is stupid and necessary because pytables wont let me open
        # more than one table
        co.RefGen.from_gff(gff, 'Zm5bFGS', 'Maize 5b Filtered Gene Set',
                           '5b', 'Zea Mays')
    return co.RefGen('Zm5bFGS')

def build_gont(args):
    refgen = co.RefGen(args.refgen)
    # Check to see if this dataset is already built
    if co.available_datasets('GOnt', args.name):
        print('Warning! This dataset has already been built.')
        co.del_dataset('GOnt', args.name, force=args.force)
    go = co.GOnt.from_obo(args.obo_filename, args.filename, args.name,
                          args.description, refgen,
                          go_col=args.go_col, id_col=args.id_col)
    print("Done: {}".format(go.summary()))
    print('Build Successful')

def AtGO(AtTair10):
    if cf.test.force.Ontology:
        co.del_dataset('GOnt', 'AtGO', force=True)
    if not co.available_datasets('GOnt', 'AtGO'):
        obo = os.path.join(cf.options.testdir, 'raw', 'GOnt', 'go.obo.bz2')
        gene_map_file = os.path.join(cf.options.testdir, 'raw', 'GOnt',
                                     'ath_go.tsv.bz2')
        return co.GOnt.from_obo(obo, gene_map_file, 'AtGO',
                                'Arabidopsis Gene Ontology', AtTair10,
                                id_col=0, go_col=5)
    else:
        return co.GOnt('AtGO')

def build_cob(args):
    # Build the refgen
    refgen = co.RefGen(args.refgen)
    # Check that the sep is likely right.
    if len(pd.read_table(args.filename, sep=args.sep).columns) == 1:
        print(("Detected only 1 column in {}, are you sure "
               "columns are separated by '{}'?").format(args.filename,
                                                        args.sep))
        return None
    if args.allow_non_membership:
        refgen = refgen.copy('{}_tmp'.format(refgen.name), 'temp refgen')
        # Add non-membership genes
        for gid in pd.read_table(args.filename, sep=args.sep).index:
            refgen.add_gene(Gene(None, None, id=gid))
    quality_control = not args.skip_quality_control
    normalize = not args.skip_normalization
    # Check to see if this dataset is already built
    if co.available_datasets('Expr', args.name):
        print('Warning! This dataset has already been built.')
        co.del_dataset('Expr', args.name, safe=args.force)
    # Basically just pass all the CLI arguments to the COB class method
    cob = co.COB.from_table(
        args.filename,
        args.name,
        args.description,
        refgen,
        # Optional arguments
        sep=args.sep,
        rawtype=args.rawtype,
        # Data processing
        quality_control=quality_control,
        normalization=normalize,
        quantile=args.quantile,
        # Data processing parameters
        max_gene_missing_data=args.max_gene_missing_data,
        max_accession_missing_data=args.max_accession_missing_data,
        min_single_sample_expr=args.min_single_sample_expr,
        min_expr=args.min_expr,
        max_val=args.max_val,
        dry_run=args.dry_run,
        index_col=args.index_col)
    print("Build successful!")
    print(cob.summary())
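# A minimal sketch of driving build_cob() programmatically with an
# argparse.Namespace instead of the CLI. The field values below are
# illustrative assumptions (the input table name is hypothetical); the
# fields themselves mirror exactly what build_cob() accesses.
from argparse import Namespace

example_args = Namespace(
    filename='expr_table.tsv',      # hypothetical input expression table
    name='MyNet',
    description='Example network',
    refgen='Zm5bFGS',               # assumes this RefGen is already built
    sep='\t',
    rawtype='RNASEQ',
    allow_non_membership=False,
    skip_quality_control=False,
    skip_normalization=False,
    quantile=False,
    max_gene_missing_data=0.3,
    max_accession_missing_data=0.08,
    min_single_sample_expr=1,
    min_expr=0.001,
    max_val=300,
    dry_run=True,                   # dry run: process but do not persist
    index_col=0,
    force=False,
)
# build_cob(example_args)  # uncomment to run against a built RefGen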
def ZmSAM(Zm5bFGS):
    if cf.test.force.COB:
        co.del_dataset('Expr', 'ZmSAM', force=True)
    if not co.available_datasets('Expr', 'ZmSAM'):
        return co.COB.from_table(
            os.path.join(
                cf.options.testdir, 'raw', 'Expr', 'RNASEQ',
                'TranscriptomeProfiling_B73_Atlas_SAM_FGS_LiLin_20140316.txt.gz'),
            'ZmSAM',
            'Maize Root Network',
            Zm5bFGS,
            rawtype='RNASEQ',
            max_gene_missing_data=0.4,
            min_expr=0.1,
            quantile=False,
            dry_run=False,
            max_val=250)
    else:
        return co.COB('ZmSAM')

def ZmPAN(Zm5bFGS):
    if cf.test.force.COB:
        co.del_dataset('Expr', 'ZmPAN', force=True)
    if not co.available_datasets('Expr', 'ZmPAN'):
        return co.COB.from_table(
            os.path.join(cf.options.testdir, 'raw', 'Expr', 'RNASEQ',
                         'PANGenomeFPKM.txt.gz'),
            'ZmPAN',
            'Maize Root Network',
            Zm5bFGS,
            rawtype='RNASEQ',
            max_gene_missing_data=0.4,
            min_expr=1,
            quantile=False,
            dry_run=False,
            sep=',',
            max_val=300)
    else:
        return co.COB('ZmPAN')

def ZmRoot(Zm5bFGS):
    if cf.test.force.COB:
        co.del_dataset('Expr', 'ZmRoot', force=True)
    if not co.available_datasets('Expr', 'ZmRoot'):
        return co.COB.from_table(
            os.path.join(cf.options.testdir, 'raw', 'Expr', 'RNASEQ',
                         'ROOTFPKM.tsv.gz'),
            'ZmRoot',
            'Maize Root Network',
            Zm5bFGS,
            rawtype='RNASEQ',
            max_gene_missing_data=0.3,
            max_accession_missing_data=0.08,
            min_single_sample_expr=1,
            min_expr=0.001,
            quantile=False,
            max_val=300)
    else:
        return co.COB('ZmRoot')
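# The builder functions above are written in pytest-fixture style: each
# takes the RefGen it depends on as an argument and caches the built
# dataset. A minimal sketch of wiring them up, assuming pytest; the
# fixture names and the smoke test are illustrative, not from the source.
import pytest

@pytest.fixture(scope='module')
def Zm5bFGS_refgen():
    # Builds (or reopens) the maize reference genome
    return Zm5bFGS()

@pytest.fixture(scope='module')
def ZmRoot_network(Zm5bFGS_refgen):
    # Builds (or reopens) the root network against that RefGen
    return ZmRoot(Zm5bFGS_refgen)

def test_zmroot_builds(ZmRoot_network):
    # Smoke test: the fixture should hand back a COB instance
    assert ZmRoot_network is not None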
def ZmWallace(Zm5bFGS):
    if cf.test.force.Ontology:
        co.del_dataset('GWAS', 'ZmWallace', force=True)
    if not co.available_datasets('GWAS', 'ZmWallace'):
        # Grab the path to the csv
        csv = os.path.join(
            cf.options.testdir, 'raw', 'GWAS', 'WallacePLoSGenet',
            'Wallace_etal_2014_PLoSGenet_GWAS_hits-150112.txt.bz2')
        # Read the hits table (pd.DataFrame.from_csv is deprecated)
        df = pd.read_csv(csv, index_col=None, sep='\t')
        # Build the GWAS object from the dataframe
        gwas = co.GWAS.from_DataFrame(
            df, 'ZmWallace', 'Wallace PLoS Genet Dataset.', Zm5bFGS,
            term_col='trait', chr_col='chr', pos_col='pos')
        return gwas
    else:
        return co.GWAS('ZmWallace')

def AtLeaf(AtTair10):
    if cf.test.force.COB:
        co.del_dataset('Expr', 'AtLeaf', force=True)
    if not co.available_datasets('Expr', 'AtLeaf'):
        Leaf = [
            'GSE14578', 'GSE5630', 'GSE13739',
            #'GSE26199',
            'GSE5686', 'GSE5615', 'GSE5620', 'GSE5628', 'GSE5624',
            'GSE5626', 'GSE5621', 'GSE5622', 'GSE5623', 'GSE5625',
            'GSE5688'
        ]
        LeafFam = sum([
            co.Family.from_file(
                os.path.join(cf.options.testdir, 'raw', 'GSE',
                             '{}_family.soft.gz'.format(x)))
            for x in Leaf
        ])
        #LeafFam.to_keepfile("LeafKeep.tsv", keep_hint="lea")
        return co.COB.from_DataFrame(
            LeafFam.series_matrix(keepfile=os.path.join(
                cf.options.testdir, 'raw', 'GSE', 'LeafKeep.tsv')),
            'AtLeaf', 'Arabidopsis Leaf', AtTair10,
            rawtype='MICROARRAY',
            max_gene_missing_data=0.3,
            min_expr=0.01,
            quantile=True)
    else:
        return co.COB('AtLeaf')

def snp2gene(args):
    '''
        Perform SNP (locus) to candidate gene mapping
    '''
    if args.out != sys.stdout:
        # Create any non-existent directories
        if os.path.dirname(args.out) != '':
            os.makedirs(os.path.dirname(args.out), exist_ok=True)
        if os.path.exists(args.out) and not args.force:
            print("Output for {} exists! Skipping!".format(args.out),
                  file=sys.stderr)
            return None
    # Set a flag saying this is from a COB refgen
    from_cob = False
    # Create the refgen (option to create it from a COB)
    if co.available_datasets('Expr', args.refgen):
        refgen = co.COB(args.refgen).refgen
        from_cob = args.refgen
    elif co.available_datasets('RefGen', args.refgen):
        refgen = co.RefGen(args.refgen)
    # Create the GWAS object
    ont = co.GWAS(args.gwas)
    if 'all' in args.terms:
        terms = ont.iter_terms()
    else:
        terms = [ont[term] for term in args.terms]
    data = pd.DataFrame()
    for term in terms:
        for window_size in args.candidate_window_size:
            for flank_limit in args.candidate_flank_limit:
                if 'effective' in args.snp2gene:
                    # Map to effective loci
                    effective_loci = term.effective_loci(
                        window_size=window_size)
                elif 'strongest' in args.snp2gene:
                    effective_loci = term.strongest_loci(
                        window_size=window_size,
                        attr=args.strongest_attr,
                        lowest=args.strongest_higher)
                genes = pd.DataFrame([
                    x.as_dict() for x in refgen.candidate_genes(
                        effective_loci,
                        flank_limit=flank_limit,
                        include_parent_locus=True,
                        include_num_siblings=True,
                        include_num_intervening=True,
                        include_rank_intervening=True,
                        include_SNP_distance=True,
                        include_parent_attrs=args.include_parent_attrs,
                        attrs={'Term': term.id},
                    )
                ])
                genes['FlankLimit'] = flank_limit
                genes['WindowSize'] = window_size
                genes['RefGen'] = refgen.name
                if from_cob:
                    genes['COB'] = from_cob
                data = pd.concat([data, genes])
    # Add data from gene info files
    original_number_genes = len(data)
    for info_file in args.gene_info:
        log('Adding info for {}', info_file)
        # Assume the file is tab separated; fall back to commas
        info = pd.read_table(info_file, sep='\t')
        if len(info.columns) == 1:
            info = pd.read_table(info_file, sep=',')
        # Try to match as many columns as possible
        matching_columns = set(data.columns).intersection(info.columns)
        log("Joining SNP2Gene mappings with info file on: {}",
            ','.join(matching_columns))
        data = pd.merge(data, info, how='left')
    if len(data) != original_number_genes:
        log.warn('There were multiple info rows for some genes. '
                 'Beware of potential duplicate candidate gene entries!')
    # Generate the output file
    data.to_csv(args.out, index=None, sep='\t')
    log("Summary stats")
    print('-' * 100)
    #print('With {}kb windows and up to {} flanking genes'.format(int(args.candidate_window_size/1000),args.candidate_flank_limit))
    print("Mapped {} SNPs to {} genes".format(
        len(data.parent_locus.unique()), len(data.ID.unique())))
    print("Number of candidate genes per term:")
    print(data.groupby('Term').apply(lambda df: len(df.ID)))

        'default': dflt['snpLevels'], 'min': 1, 'max': 10, 'int': True},
    'pCutoff': {'title': 'Probability Cutoff',
        'default': dflt['pCutoff'], 'min': 0.0, 'max': 1.0, 'int': False},
    'minTerm': {'title': 'Min Genes (GO)',
        'default': dflt['minTerm'], 'min': 1, 'max': 99, 'int': True},
    'maxTerm': {'title': 'Max Genes (GO)',
        'default': dflt['maxTerm'], 'min': 100, 'max': 1000, 'int': True},
}

# ----------------------------------------
# Load things into memory to prepare
# ----------------------------------------
# Generate network list based on allowed list
print('Preloading networks into memory...')
if len(conf['networks']) < 1:
    conf['networks'] = list(co.available_datasets('Expr')['Name'].values)
networks = {x: co.COB(x) for x in conf['networks']}
network_info = [[net.name, net._global('parent_refgen'), net.description]
                for name, net in networks.items()]
print('Available Networks: ' + str(networks))

# Generate ontology list based on allowed list and load them into memory
print('Preloading GWASes into memory...')
if len(conf['gwas']) < 1:
    conf['gwas'] = list(co.available_datasets('GWAS')['Name'].values)
onts = {x: co.GWAS(x) for x in conf['gwas']}
onts_info = {}
for m, net in networks.items():
    ref = net._global('parent_refgen')
    onts_info[net.name] = []
    for n, ont in onts.items():
        if ont.refgen.name == ref:
            # Body reconstructed (assumed): record the ontologies whose
            # reference genome matches this network
            onts_info[net.name].append([ont.name, ont.description])

# Prefetch the gene names for all the networks
print('Fetching gene names for networks...')
network_genes = {}
for name, net in networks.items():
    ids = list(net._expr.index.values)
    als = co.RefGen(net._global('parent_refgen')).aliases(ids)
    for k, v in als.items():
        ids += v
    network_genes[name] = list(set(ids))
print('Found gene names')

# Find all of the GWAS data we have available
print('Finding GWAS Data...')
gwas_data_db = {}
for gwas in co.available_datasets('GWASData')['Name']:
    gwas_data_db[gwas] = co.GWASData(gwas)

# Find any functional annotations we have
print('Finding functional annotations...')
func_data_db = {}
for func in co.available_datasets('RefGenFunc')['Name']:
    print('Processing annotations for {}...'.format(func))
    func_data_db[func] = co.RefGenFunc(func)
    func_data_db[func].to_csv(os.path.join(scratch_folder, (func + '.tsv')))
    geneWordBuilder(func, [os.path.join(scratch_folder, (func + '.tsv'))],
                    [1], ['2 end'], ['tab'], [True])

# Find any GO ontologies we have for the networks we have
print('Finding applicable GO Ontologies...')
GOnt_db = {}
for name in co.available_datasets('GOnt')['Name']:
    # Loop body reconstructed by analogy with the loops above (assumed)
    GOnt_db[name] = co.GOnt(name)

# Generate an in-memory list of available GWAS datasets, in the
# table-friendly format the available_datasets() endpoint returns
print('Finding available GWAS datasets...')
gwas_sets = {"data": list(co.available_datasets('GWAS')[
    ['Name', 'Description']].itertuples(index=False))}

def all_available_datasets():
    return str(co.available_datasets())

        'default': dflt['hpo'], 'state': dflt['hpo'], 'isBool': True},
    'visEnrich': {
        'default': dflt['visEnrich'], 'state': dflt['visEnrich'],
        'isBool': True},
}

# ----------------------------------------
# Load things into memory to prepare
# ----------------------------------------
# Generate network list based on allowed list
print('Preloading networks into memory...')
if len(conf['networks']) < 1:
    conf['networks'] = list(co.available_datasets('Expr')['Name'].values)
networks = {x: co.COB(x) for x in conf['networks']}
network_info = []
refLinks = {}
for name, net in networks.items():
    network_info.append({
        'name': net.name,
        'refgen': net._global('parent_refgen'),
        'desc': net.description,
    })
    if net._global('parent_refgen') in conf['refLinks']:
        refLinks[net.name] = conf['refLinks'][net._global('parent_refgen')]
print('Available Networks: ' + str(networks))

# Generate ontology list based on allowed list and load them into memory