def from_gff(cls, filename, name, description, build, organism):
    '''
        Import a reference genome from a GFF file.

        A new dataset is created through `cls.create`, tagged with the
        given build and organism, and populated from the file:
        'chromosome' features are added one by one as they are read,
        'gene' features are collected and added in a single
        `add_gene` call at the end.

        Parameters
        ----------
        filename : path of the GFF file to read
        name : name handed to `cls.create`
        description : description handed to `cls.create`
        build : genome build, stored as a global and passed to each Gene
        organism : organism, stored as a global and passed to each Gene

        Returns
        -------
        The newly created reference genome object.

        Raises
        ------
        ValueError : a record does not have the nine GFF columns
        KeyError : a chromosome or gene record has no ID attribute
    '''
    self = cls.create(name, description, type='RefGen')
    self._global('build', build)
    self._global('organism', organism)
    genes = []
    with open(filename, 'r') as IN:
        for line_number, line in enumerate(IN, start=1):
            # Everything after the ##FASTA directive is sequence data,
            # not feature records
            if line.startswith('##FASTA'):
                break
            # Skip comment lines, directives and blank lines
            if line.startswith('#') or not line.strip():
                continue
            # GFF columns are tab separated, the attribute column may
            # itself contain spaces
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) != 9:
                # Tolerate whitespace-delimited files: split at most 8
                # times so the attribute column stays in one piece
                fields = line.strip().split(None, 8)
            if len(fields) != 9:
                raise ValueError(
                    '{}: line {} has {} columns, expected 9'.format(
                        filename, line_number, len(fields)))
            chrom, feature, start, end, strand = (
                fields[0], fields[2], fields[3], fields[4], fields[6])
            attributes = {}
            for field in fields[8].strip().strip(';').split(';'):
                # Partition on the first '=' only: values may contain '='
                key, has_value, value = field.partition('=')
                if has_value and key.strip():
                    attributes[key.strip()] = value.strip()
            if feature == 'chromosome':
                self.log('Found a chromosome: {}', attributes['ID'])
                # NOTE(review): `end` is passed as the raw string read
                # from the file, exactly as before -- confirm whether
                # Chrom expects an int here
                self.add_chromosome(Chrom(attributes['ID'], end))
            elif feature == 'gene':
                genes.append(
                    Gene(chrom, int(start), int(end),
                         attributes['ID'].upper(),
                         strand=strand, build=build, organism=organism))
    self.add_gene(genes)
    return self
def from_ids(self, gene_list, check_shape=False, enumerated=False):
    '''
        Return Gene objects for an iterable of gene id strings.

        Ids are upper-cased before the lookup. Ids that are not in the
        genes table are silently dropped unless `check_shape` is set,
        and the order of the returned genes is whatever the database
        yields, not the order of `gene_list`.

        Parameters
        ----------
        gene_list : iterable of id strings (one-shot iterables are fine)
        check_shape : when True, raise if the number of genes found
            differs from the number of ids passed in
        enumerated : accepted for interface compatibility, not used by
            this method

        Returns
        -------
        list of Gene

        Raises
        ------
        ValueError : `check_shape` is set and the counts differ
    '''
    # Materialise once so the input is only consumed a single time
    ids = list(map(str.upper, gene_list))
    # An IN (...) clause matches each id once however often it is
    # listed, so de-duplicate (order preserved) before chunking
    unique_ids = list(dict.fromkeys(ids))
    # Ids are bound as parameters rather than pasted into the SQL text,
    # so quotes inside an id can neither break nor alter the statement.
    # Chunks stay below SQLite's historical 999 bound-variable limit.
    chunk_size = 900
    cursor = self.db.cursor()
    genes = []
    for offset in range(0, len(unique_ids), chunk_size):
        chunk = unique_ids[offset:offset + chunk_size]
        query = '''
            SELECT chromosome,start,end,id FROM genes WHERE id IN ({})
        '''.format(','.join('?' * len(chunk)))
        genes.extend(
            Gene(*row, build=self.build, organism=self.organism)
            for row in cursor.execute(query, tuple(chunk)))
    if check_shape and len(genes) != len(ids):
        raise ValueError('Some input ids do not have genes in reference')
    return genes
def build_cob(args):
    '''
        Build a COB (co-expression) dataset from parsed CLI arguments.

        Reads `args.filename` with separator `args.sep`, optionally
        extends a temporary copy of the reference genome with every id
        found in the table index (`args.allow_non_membership`), deletes
        a previously built 'Expr' dataset of the same name, then hands
        everything to `co.COB.from_table` and prints the summary.

        Returns None. Returns early (after printing a hint) when the
        table parses to a single column, which usually means the
        separator is wrong.

        Any exception triggers a rollback: the 'Expr' dataset named
        `args.name` is force-deleted and the exception is re-raised.
    '''
    try:
        # Build the refgen
        refgen = co.RefGen(args.refgen)
        # Parse the table once; it is needed both for the separator
        # sanity check and for the non-membership gene ids
        table = pd.read_table(args.filename, sep=args.sep)
        # Check that the sep is likely right.
        if len(table.columns) == 1:
            print(("Detected only 1 column in {}, are you sure "
                   "columns are separated by '{}'?").format(
                       args.filename, args.sep))
            return None
        if args.allow_non_membership:
            refgen = refgen.copy('{}_tmp'.format(refgen.name),
                                 'temp refgen')
            # Add non membership genes
            for gid in table.index:
                refgen.add_gene(Gene(None, None, id=gid))
        # The CLI exposes "skip" switches, from_table wants "do" flags
        quality_control = not args.skip_quality_control
        normalize = not args.skip_normalization
        quantile = not args.skip_quantile
        # Check to see if this dataset is already built
        if available_datasets('Expr', args.name):
            print('Warning! This dataset has already been built.')
            co.Tools.del_dataset('Expr', args.name, force=args.force)
        # Basically just pass all the CLI arguments to the COB class method
        cob = co.COB.from_table(
            args.filename,
            args.name,
            args.description,
            refgen,
            # Optional arguments
            sep=args.sep,
            rawtype=args.rawtype,
            # Data Processing
            quality_control=quality_control,
            normalization=normalize,
            quantile=quantile,
            # Data processing parameters
            max_gene_missing_data=args.max_gene_missing_data,
            max_accession_missing_data=args.max_accession_missing_data,
            min_single_sample_expr=args.min_single_sample_expr,
            min_expr=args.min_expr,
            max_val=args.max_val,
            dry_run=args.dry_run,
            zscore_cutoff=args.zscore_cutoff,
            index_col=args.index_col)
        print(cob.summary())
    except Exception:
        print("Build failed. Rolling back: removing corrupted files...")
        co.Tools.del_dataset('Expr', args.name, force=True)
        # Bare raise keeps the original traceback intact
        raise
def __getitem__(self, item):
    '''
        Look up a gene by id, or a list of genes by an iterable of ids.

        Parameters
        ----------
        item : a single id, or an iterable of ids

        Returns
        -------
        Gene : `item` matched the id of a row in the genes table
        list of Gene : `item` is a non-string iterable; the result of
            `self.from_ids`
        None : `item` is a string with no matching row, or is neither
            a match nor iterable
    '''
    # Only plain scalars can be bound as an SQL parameter; containers
    # go straight to the multi-id path instead of relying on the
    # database driver to raise
    if isinstance(item, (str, bytes, int, float)):
        gene_data = self.db.cursor().execute(
            ''' SELECT chromosome,start,end,id FROM genes WHERE id = ? ''',
            (item, )).fetchone()
        if gene_data is not None:
            return Gene(*gene_data, build=self.build,
                        organism=self.organism)
        if isinstance(item, str):
            # A string that matched nothing is a missing id; iterating
            # it would look up each of its characters as a separate id
            return None
    try:
        return list(self.from_ids(list(item)))
    except TypeError as e:
        self.log('not iterable: {}', e)
    return None
def downstream_genes(self, locus, gene_limit=1000):
    '''
        Collect the genes lying downstream of a locus.

        A gene qualifies when it is on the locus' chromosome and its
        start falls strictly between `locus.start` and
        `locus.downstream`. Results are sorted by ascending start, so
        the closest gene comes first, and capped at `gene_limit`.

        Returns a list of Gene objects (empty when nothing matches).
    '''
    query = (
        ' SELECT chromosome,start,end,id FROM genes'
        ' WHERE chromosome = ? AND start > ? AND start < ?'
        ' ORDER BY start ASC LIMIT ? '
    )
    cursor = self.db.cursor()
    bounds = (locus.chrom, locus.start, locus.downstream, gene_limit)
    found = []
    for record in cursor.execute(query, bounds):
        found.append(
            Gene(*record, build=self.build, organism=self.organism))
    return found
def within_gene(self, locus):
    '''
        Return the gene the locus is within, or None.

        A gene contains the locus when it is on the same chromosome and
        `locus.start` lies strictly between the gene's start and end.
        When several genes qualify, the first row returned by the
        database is used.

        Parameters
        ----------
        locus : object exposing `chrom` and `start`

        Returns
        -------
        Gene, or None when no gene contains the locus.
    '''
    # Fetch only the first row and test for "no match" explicitly, so
    # genuine database or constructor errors are not hidden
    row = self.db.cursor().execute(
        '''
            SELECT chromosome,start,end,id FROM genes
            WHERE chromosome = ?
            AND start < ?
            AND end > ?
        ''', (locus.chrom, locus.start, locus.start)).fetchone()
    if row is None:
        return None
    return Gene(*row, build=self.build, organism=self.organism)
def genes_within(self, loci, chain=True):
    '''
        Return the genes located within a locus, or within each of
        several loci.

        A gene is within a locus when it is on the same chromosome,
        starts after `locus.start` and ends before `locus.end` (both
        comparisons strict).

        Parameters
        ----------
        loci : a single locus (exposing `chrom`, `start`, `end`) or an
            iterable of loci
        chain : only relevant for an iterable of loci. When True the
            per-locus results are flattened into one list, otherwise a
            list holding one result per locus is returned.

        Returns
        -------
        list of Gene (possibly empty), or a list of such lists when an
        iterable is given and `chain` is False.
    '''
    # Only the iterability probe is guarded: a TypeError raised while
    # processing the individual loci must not be mistaken for "this is
    # a single locus"
    try:
        iterator = iter(loci)
    except TypeError:
        return [
            Gene(*x, build=self.build, organism=self.organism)
            for x in self.db.cursor().execute(
                '''
                    SELECT chromosome,start,end,id FROM genes
                    WHERE chromosome = ?
                    AND start > ?
                    AND end < ?
                ''', (loci.chrom, loci.start, loci.end))
        ]
    genes = [self.genes_within(locus, chain=chain) for locus in iterator]
    if chain:
        genes = list(itertools.chain.from_iterable(genes))
    return genes
def build_cob(args):
    """
    Build a COB (co-expression) dataset from parsed CLI arguments.

    Reads `args.filename` with separator `args.sep` and runs two sanity
    checks on the number of columns: a single column aborts with a hint
    about the separator, and fewer than 20 columns asks for interactive
    confirmation unless `args.non_interactive` is set. Optionally
    extends a temporary copy of the reference genome with every id
    found in the table index (`args.allow_non_membership`), deletes a
    previously built "Expr" dataset of the same name, then hands
    everything to `co.COB.from_table` and prints the summary.

    Returns None. Exits the process with status 1 when the user
    declines the confirmation prompt.

    Any exception triggers a rollback: the "Expr" dataset named
    `args.name` is force-deleted and the exception is re-raised.
    """
    try:
        # Build the refgen
        refgen = co.RefGen(args.refgen)
        # Parse the table once; it is needed for both sanity checks and
        # for the non-membership gene ids
        table = pd.read_table(args.filename, sep=args.sep)
        n_columns = len(table.columns)
        # Check that the sep is likely right.
        if n_columns == 1:
            print(("Detected only 1 column in {}, are you sure "
                   "columns are separated by '{}'?").format(
                       args.filename, args.sep))
            return None
        elif n_columns < 20 and not args.non_interactive:
            print((
                "Detected fewer than 20 accessions in the expression matrix. "
                "Calculating co-expression with this many datapoints is not advised"
            ))
            answer = input("are you sure you want to continue? [y/n]: ")
            if answer.upper() != "Y":
                # SystemExit is not an Exception subclass, so this does
                # not trigger the rollback below
                sys.exit(1)
        if args.allow_non_membership:
            refgen = refgen.copy("{}_tmp".format(refgen.name), "temp refgen")
            # Add non membership genes
            for gid in table.index:
                refgen.add_gene(Gene(None, None, id=gid))
        # Quality control and normalization are opt-out on the CLI,
        # quantile normalization is opt-in
        quality_control = not args.skip_quality_control
        normalize = not args.skip_normalization
        quantile = bool(args.quantile)
        # Check to see if this dataset is already built
        if available_datasets("Expr", args.name):
            print("Warning! This dataset has already been built.")
            co.Tools.del_dataset("Expr", args.name, force=args.force)
        # Basically just pass all the CLI arguments to the COB class method
        cob = co.COB.from_table(
            args.filename,
            args.name,
            args.description,
            refgen,
            # Optional arguments
            sep=args.sep,
            rawtype=args.rawtype,
            # Data Processing
            quality_control=quality_control,
            normalization=normalize,
            quantile=quantile,
            # Data processing parameters
            max_gene_missing_data=args.max_gene_missing_data,
            max_accession_missing_data=args.max_accession_missing_data,
            min_single_sample_expr=args.min_single_sample_expr,
            min_expr=args.min_expr,
            max_val=args.max_val,
            dry_run=args.dry_run,
            zscore_cutoff=args.zscore_cutoff,
            index_col=args.index_col,
        )
        print(cob.summary())
    except Exception:
        print(
            f"Build failed for {args.name}. \n"
            "Rolling back: removing corrupted files..."
        )
        co.Tools.del_dataset("Expr", args.name, force=True)
        # Bare raise keeps the original traceback intact
        raise
def iter_genes(self):
    '''
        Lazily walk over the genes stored in the reference genome.

        The query is issued when this method is called; a Gene object
        is only built for a row once the returned generator reaches
        it. The query itself selects every row of the genes table.
    '''
    def as_gene(record):
        # Rows come back as (chromosome, start, end, id)
        return Gene(*record, build=self.build, organism=self.organism)

    records = self.db.cursor().execute(
        ''' SELECT chromosome,start,end,id FROM genes ''')
    return (as_gene(record) for record in records)
def random_gene(self):
    '''
        Return one gene picked at random from the reference genome.

        The pick is made with the `random` module by position in rowid
        order, so it does not depend on rowids being contiguous. While
        rowids do run 1..N without gaps, the same `random` state
        selects the same gene as a direct `rowid = randint(1, N)`
        lookup would.

        Returns
        -------
        Gene, carrying this reference genome's build and organism like
        the genes returned by the other lookup methods.

        Raises
        ------
        ValueError : the reference genome has no genes, or the selected
            position holds no row
    '''
    total = self.num_genes()
    if total < 1:
        raise ValueError(
            'cannot pick a random gene: reference genome has no genes')
    position = random.randrange(total)
    row = self.db.cursor().execute(
        '''
            SELECT chromosome,start,end,id FROM genes
            ORDER BY rowid LIMIT 1 OFFSET ?
        ''', (position, )).fetchone()
    if row is None:
        # num_genes() reported more genes than the table holds
        raise ValueError(
            'cannot pick a random gene: no row at position {}'.format(
                position))
    return Gene(*row, build=self.build, organism=self.organism)