def transform(self, func=np.log2):
    '''
        Apply a numeric transformation to the VALUE column of the
        samples table, modifying the table in place.

        Parameters
        ----------
        func : callable (default: np.log2)
            Called once with a list holding every current VALUE entry
            converted to float. Whatever it returns becomes the new
            VALUE column.

        Returns
        -------
        True
    '''
    currently_raw = self.is_raw()
    if not currently_raw:
        log.warn("Attempting to perform transormation of apparently non raw data")
    as_floats = [float(entry) for entry in self.tbl.VALUE.values]
    self.tbl.VALUE = func(as_floats)
    # NOTE(review): self-assignment kept as-is; presumably `name` is a
    # property whose setter has a side effect -- confirm.
    self.name = self.name
    # Entries that came out as negative infinity are stored as missing
    negative_infinity = self.tbl.VALUE == float('-Inf')
    self.tbl.loc[negative_infinity, 'VALUE'] = np.nan
    return True
def create(cls, name, description, type='Camoco'):
    '''
        Build the camoco base directory tree, record a dataset in the
        base database and hand back an object constructed from `name`.

        Parameters
        ----------
        name : str
            Dataset name; inserted into the `datasets` table and passed
            to `cls(...)`.
        description : str
            Free text stored next to the name.
        type : str (default: 'Camoco')
            Dataset type; forms the table's primary key together
            with `name`.

        Returns
        -------
        The result of `cls(name)`.

        Raises
        ------
        Any exception raised while creating the directories is logged
        and re-raised unchanged.
    '''
    basedir = os.path.realpath(
        os.path.expanduser(cf.get('options', 'basedir'))
    )
    # The base directory comes first, then each fixed subdirectory
    try:
        os.makedirs(basedir, exist_ok=True)
        for subdir in ("logs", "databases", "analyses", "tmp"):
            os.makedirs(os.path.join(basedir, subdir), exist_ok=True)
    except Exception:
        log(' Could not create files in {}', basedir)
        raise
    # Adjacent literals concatenate to exactly the statement text that
    # was previously written as a single triple-quoted string.
    register_sql = (
        " CREATE TABLE IF NOT EXISTS datasets ("
        " name TEXT NOT NULL,"
        " description TEXT,"
        " type TEXT,"
        " added datetime DEFAULT CURRENT_TIMESTAMP,"
        " PRIMARY KEY(name,type) );"
        " INSERT OR IGNORE INTO datasets (name,description,type)"
        " VALUES ('Camoco','Camoco base','Camoco');"
        " INSERT OR FAIL INTO datasets (name,description,type)"
        " VALUES (?,?,?)"
    )
    try:
        # Create the base camoco database and register this dataset
        db_path = os.path.join(basedir, 'databases', 'Camoco.Camoco.db')
        lite.Connection(db_path).cursor().execute(
            register_sql, (name, description, type)
        )
    except ConstraintError:
        # NOTE(review): presumably raised by the INSERT OR FAIL when the
        # (name, type) pair is already registered -- confirm with driver.
        log.warn('CAUTION! {}.{} Database already exists.', name, type)
    instance = cls(name)
    return instance
def create(cls, name, description, type='Camoco'):
    '''
        Prepare the camoco base directory hierarchy, add an entry for
        the dataset to the base database, then return `cls(name)`.

        Parameters
        ----------
        name : str
            Name of the dataset being registered; also the single
            argument given to `cls`.
        description : str
            Description stored in the `datasets` table.
        type : str (default: 'Camoco')
            Dataset type stored in the `datasets` table.

        Returns
        -------
        The object produced by `cls(name)`.

        Raises
        ------
        Errors from directory creation are logged and then re-raised.
    '''
    configured = cf.get('options', 'basedir')
    basedir = os.path.realpath(os.path.expanduser(configured))
    required_dirs = [basedir] + [
        os.path.join(basedir, leaf)
        for leaf in ("logs", "databases", "analyses", "tmp")
    ]
    # Create the basedir (and its fixed children) if they do not exist
    try:
        for path in required_dirs:
            os.makedirs(path, exist_ok=True)
    except Exception:
        log(' Could not create files in {}', basedir)
        raise
    # The pieces below join into the same statement text the single
    # triple-quoted literal used to hold.
    statements = (
        " CREATE TABLE IF NOT EXISTS datasets ("
        " name TEXT NOT NULL,"
        " description TEXT,"
        " type TEXT,"
        " added datetime DEFAULT CURRENT_TIMESTAMP,"
        " PRIMARY KEY(name,type) );"
        " INSERT OR IGNORE INTO datasets (name,description,type)"
        " VALUES ('Camoco','Camoco base','Camoco');"
        " INSERT OR FAIL INTO datasets (name,description,type)"
        " VALUES (?,?,?)"
    )
    bindings = (name, description, type)
    try:
        # Open the base camoco database and run the statements
        lite.Connection(
            os.path.join(basedir, 'databases', 'Camoco.Camoco.db')
        ).cursor().execute(statements, bindings)
    except ConstraintError:
        # An existing registration is reported but does not abort
        log.warn('CAUTION! {}.{} Database already exists.', name, type)
    return cls(name)
def snp2gene(args):
    '''
        Perform SNP (locus) to candidate gene mapping

        Parameters
        ----------
        args : namespace of command line options
            Attributes read here: out, force, refgen, gwas, terms,
            candidate_window_size, candidate_flank_limit, snp2gene,
            strongest_attr, strongest_higher, include_parent_attrs
            and gene_info.

        Returns
        -------
        None. The candidate gene table is written to `args.out` as tab
        separated text and summary statistics are printed. When the
        output file already exists and `args.force` is not set, a
        message is printed to stderr and nothing else is done.

        Raises
        ------
        ValueError
            If `args.refgen` is neither an available Expr nor RefGen
            dataset, or if `args.snp2gene` contains neither
            'effective' nor 'strongest'. Both cases used to surface
            later as an unbound local variable error.
    '''
    if args.out != sys.stdout:
        # Create any non-existent directories
        out_dir = os.path.dirname(args.out)
        if out_dir != '':
            os.makedirs(out_dir, exist_ok=True)
        if os.path.exists(args.out) and not args.force:
            print(
                "Output for {} exists! Skipping!".format(args.out),
                file=sys.stderr
            )
            return None

    # Stays False unless the refgen is taken from a COB, in which case
    # it holds the COB name
    from_cob = False
    # Create the refgen (option to create it from a COB)
    if co.Tools.available_datasets('Expr', args.refgen):
        refgen = co.COB(args.refgen).refgen
        from_cob = args.refgen
    elif co.Tools.available_datasets('RefGen', args.refgen):
        refgen = co.RefGen(args.refgen)
    else:
        raise ValueError(
            "'{}' is not an available Expr or RefGen dataset".format(
                args.refgen
            )
        )

    # Create the GWAS object
    ont = co.GWAS(args.gwas)
    if 'all' in args.terms:
        terms = ont.iter_terms()
    else:
        terms = [ont[term] for term in args.terms]

    # One frame per (term, window size, flank limit); they are
    # concatenated once after the loops instead of re-copying the
    # growing table on every iteration.
    frames = []
    for term in terms:
        for window_size in args.candidate_window_size:
            for flank_limit in args.candidate_flank_limit:
                if 'effective' in args.snp2gene:
                    # Map to effective
                    effective_loci = term.effective_loci(
                        window_size=window_size
                    )
                elif 'strongest' in args.snp2gene:
                    # NOTE(review): the keyword is `lowest` but it is fed
                    # `strongest_higher` -- confirm the polarity.
                    effective_loci = term.strongest_loci(
                        window_size=window_size,
                        attr=args.strongest_attr,
                        lowest=args.strongest_higher
                    )
                else:
                    raise ValueError(
                        "Unknown SNP to gene mapping method: {}".format(
                            args.snp2gene
                        )
                    )
                genes = pd.DataFrame([
                    x.as_dict() for x in refgen.candidate_genes(
                        effective_loci,
                        flank_limit=flank_limit,
                        include_parent_locus=True,
                        include_num_siblings=True,
                        include_num_intervening=True,
                        include_rank_intervening=True,
                        include_SNP_distance=True,
                        include_parent_attrs=args.include_parent_attrs,
                        attrs={'Term': term.id},
                    )
                ])
                genes['FlankLimit'] = flank_limit
                genes['WindowSize'] = window_size
                genes['RefGen'] = refgen.name
                if from_cob is not False:
                    genes['COB'] = from_cob
                frames.append(genes)
    # pd.concat refuses an empty list, so fall back to an empty frame
    data = pd.concat(frames) if frames else pd.DataFrame()

    # Add data from gene info files
    original_number_genes = len(data)
    for info_file in args.gene_info:
        log('Adding info for {}', info_file)
        # Assume the file is tab separated; a single parsed column
        # means the guess was wrong, so retry as comma separated
        info = pd.read_table(info_file, sep='\t')
        if len(info.columns) == 1:
            info = pd.read_table(info_file, sep=',')
        # The merge below joins on every shared column; they are listed
        # in sorted order so the log line is reproducible
        matching_columns = set(data.columns).intersection(info.columns)
        log(
            "Joining SNP2Gene mappings with info file on: {}",
            ','.join(sorted(matching_columns))
        )
        data = pd.merge(data, info, how='left')
        if len(data) != original_number_genes:
            log.warn(
                'There were multiple info rows for some genes. '
                'Beware of potential duplicate candidate gene entries! '
            )

    # Generate the output file
    data.to_csv(args.out, index=None, sep='\t')

    log("Summary stats")
    print('-'*100)
    print("Mapped {} SNPs to {} genes".format(
        len(data.parent_locus.unique()),
        len(data.ID.unique())
    ))
    print("Number of candidate genes per term:")
    print(data.groupby('Term').apply(lambda df: len(df.ID)))
def __add__(self, other):
    '''
        Combine this object with `other` by delegating to the parent
        class' `__add__`.

        A warning is logged when exactly one of the two operands
        reports raw data through `is_raw()`, whichever side it is on.

        Parameters
        ----------
        other : object providing `is_raw()`
            The right hand operand of `+`.

        Returns
        -------
        Whatever the parent class' `__add__` returns. Previously the
        result was computed and then dropped, so `a + b` was None.
    '''
    # Mixing is a problem in either direction, so compare the two flags
    # instead of only checking "self raw, other not".
    if bool(self.is_raw()) != bool(other.is_raw()):
        # NOTE(review): the two placeholders had no arguments; filling
        # them with `.name` assumes both operands expose it -- confirm.
        log.warn(
            'WARNING! attempting to combine {} and {} which are not both normalized',
            self.name, other.name
        )
    # super() already binds self, so only `other` is passed along
    return super().__add__(other)
def snp2gene(args):
    '''
        Perform SNP (locus) to candidate gene mapping

        Parameters
        ----------
        args : namespace of command line options
            Attributes read here: out, force, refgen, gwas, terms,
            candidate_window_size, candidate_flank_limit, snp2gene,
            strongest_attr, strongest_higher, include_parent_attrs
            and gene_info.

        Returns
        -------
        None. The candidate gene table is written to `args.out` as tab
        separated text and summary statistics are printed. When the
        output file already exists and `args.force` is not set, a
        message is printed to stderr and nothing else is done.

        Raises
        ------
        ValueError
            If `args.refgen` is neither an available Expr nor RefGen
            dataset, or if `args.snp2gene` contains neither
            'effective' nor 'strongest'. Both cases used to surface
            later as an unbound local variable error.
    '''
    if args.out != sys.stdout:
        # Create any non-existent directories
        out_dir = os.path.dirname(args.out)
        if out_dir != '':
            os.makedirs(out_dir, exist_ok=True)
        if os.path.exists(args.out) and not args.force:
            print(
                "Output for {} exists! Skipping!".format(args.out),
                file=sys.stderr
            )
            return None

    # Stays False unless the refgen is taken from a COB, in which case
    # it holds the COB name
    from_cob = False
    # Create the refgen (option to create it from a COB)
    if co.available_datasets('Expr', args.refgen):
        refgen = co.COB(args.refgen).refgen
        from_cob = args.refgen
    elif co.available_datasets('RefGen', args.refgen):
        refgen = co.RefGen(args.refgen)
    else:
        raise ValueError(
            "'{}' is not an available Expr or RefGen dataset".format(
                args.refgen
            )
        )

    # Create the GWAS object
    ont = co.GWAS(args.gwas)
    if 'all' in args.terms:
        terms = ont.iter_terms()
    else:
        terms = [ont[term] for term in args.terms]

    # One frame per (term, window size, flank limit); they are
    # concatenated once after the loops instead of re-copying the
    # growing table on every iteration.
    frames = []
    for term in terms:
        for window_size in args.candidate_window_size:
            for flank_limit in args.candidate_flank_limit:
                if 'effective' in args.snp2gene:
                    # Map to effective
                    effective_loci = term.effective_loci(
                        window_size=window_size
                    )
                elif 'strongest' in args.snp2gene:
                    # NOTE(review): the keyword is `lowest` but it is fed
                    # `strongest_higher` -- confirm the polarity.
                    effective_loci = term.strongest_loci(
                        window_size=window_size,
                        attr=args.strongest_attr,
                        lowest=args.strongest_higher
                    )
                else:
                    raise ValueError(
                        "Unknown SNP to gene mapping method: {}".format(
                            args.snp2gene
                        )
                    )
                genes = pd.DataFrame([
                    x.as_dict() for x in refgen.candidate_genes(
                        effective_loci,
                        flank_limit=flank_limit,
                        include_parent_locus=True,
                        include_num_siblings=True,
                        include_num_intervening=True,
                        include_rank_intervening=True,
                        include_SNP_distance=True,
                        include_parent_attrs=args.include_parent_attrs,
                        attrs={'Term': term.id},
                    )
                ])
                genes['FlankLimit'] = flank_limit
                genes['WindowSize'] = window_size
                genes['RefGen'] = refgen.name
                if from_cob is not False:
                    genes['COB'] = from_cob
                frames.append(genes)
    # pd.concat refuses an empty list, so fall back to an empty frame
    data = pd.concat(frames) if frames else pd.DataFrame()

    # Add data from gene info files
    original_number_genes = len(data)
    for info_file in args.gene_info:
        log('Adding info for {}', info_file)
        # Assume the file is tab separated; a single parsed column
        # means the guess was wrong, so retry as comma separated
        info = pd.read_table(info_file, sep='\t')
        if len(info.columns) == 1:
            info = pd.read_table(info_file, sep=',')
        # The merge below joins on every shared column; they are listed
        # in sorted order so the log line is reproducible
        matching_columns = set(data.columns).intersection(info.columns)
        log(
            "Joining SNP2Gene mappings with info file on: {}",
            ','.join(sorted(matching_columns))
        )
        data = pd.merge(data, info, how='left')
        if len(data) != original_number_genes:
            log.warn(
                'There were multiple info rows for some genes. '
                'Beware of potential duplicate candidate gene entries! '
            )

    # Generate the output file
    data.to_csv(args.out, index=None, sep='\t')

    log("Summary stats")
    print('-'*100)
    print("Mapped {} SNPs to {} genes".format(
        len(data.parent_locus.unique()),
        len(data.ID.unique())
    ))
    print("Number of candidate genes per term:")
    print(data.groupby('Term').apply(lambda df: len(df.ID)))