def __init__(self, infasta=None, outfile=None, k=6,
             binary=True, mean=True, std=True,
             leave=True, silent=False, label=False):
    """Store the counting options and load sequences if a fasta is given.

    Parameters
    ----------
    infasta : str (default=None)
        Path handed to Reader; when None no sequences are loaded.
    outfile : str (default=None)
        Destination stored for later saving.
    k : int (default=6)
        Kmer size; 4**k kmers over the alphabet 'AGTC' are enumerated.
    binary : bool (default=True)
        Stored as-is on the instance.
    mean : bool, np.array, str (default=True)
        If a str, it is treated as a path and loaded with np.load.
    std : bool, np.array, str (default=True)
        If a str, it is treated as a path and loaded with np.load.
    leave : bool (default=True)
        Stored as-is on the instance.
    silent : bool (default=False)
        Stored as-is on the instance.
    label : bool (default=False)
        Stored as-is on the instance.
    """
    self.infasta = infasta
    # Always define `reader`: other code tests `self.reader is None`, which
    # would raise AttributeError if the attribute only existed when a fasta
    # file had been supplied.
    self.reader = None
    self.seqs = None
    if infasta is not None:
        self.reader = Reader(infasta)
        self.seqs = self.reader.get_seqs()
    self.outfile = outfile
    self.k = k
    self.binary = binary
    # A string is interpreted as the path of a previously saved array.
    self.mean = np.load(mean) if isinstance(mean, str) else mean
    self.std = np.load(std) if isinstance(std, str) else std
    self.leave = leave
    self.silent = silent
    self.label = label
    self.counts = None
    self.kmers = [''.join(i) for i in product('AGTC', repeat=k)]
    # Column index of each kmer, in enumeration order (0 .. 4**k - 1).
    self.map = {kmer: col for col, kmer in enumerate(self.kmers)}
def save(self, names=None):
    """Saves the counts appropriately based on current settings.

    There are four output methods for the counts:
    1. Binary. This saves just the counts as a binary numpy array.
    2. No labels. Saves in plain text, but without any labels.
    3. Default names. If no names are provided, fasta headers will be used
       as labels.
    4. Custom names. Provide a list of names if you want to label lncRNAs
       with your own names.

    Parameters
    ----------
    names : [str] (default=None)
        Unique names for rows of the Dataframe.

    Raises
    ------
    AssertionError
        If both binary and label are set, or if no outfile was provided.
    ValueError
        If labels are requested without `names` and there is no fasta file
        to read default headers from.
    """
    assert not (self.binary and self.label
                ), 'You cannot label a binary file. Set only one as True.'
    assert self.outfile is not None, 'Please provide an outfile location.'
    if self.binary:
        np.save(self.outfile, self.counts)
    elif self.label:
        if names is None:
            # `reader` may be missing (or None) when the counter was built
            # without a fasta file, so look it up defensively.
            reader = getattr(self, 'reader', None)
            if reader is None:
                if self.infasta is None:
                    raise ValueError('Cannot build default labels without '
                                     'a fasta file. Please provide names '
                                     'or set infasta.')
                reader = self.reader = Reader(self.infasta)
            names = reader.get_headers()
        df = DataFrame(data=self.counts, index=names, columns=self.kmers)
        df.to_csv(self.outfile)
    else:
        np.savetxt(self.outfile, self.counts, delimiter=',', fmt='%1.6f')
def __init__(self, infasta=None, outfile=None, k=6,
             binary=True, mean=True, std=True,
             leave=True, silent=False, label=False, percentage=True):
    """Store the counting options and load sequences if a fasta is given.

    Parameters
    ----------
    infasta : str (default=None)
        Path handed to Reader; when None no sequences are loaded.
    outfile : str (default=None)
        Destination stored for later saving.
    k : int (default=6)
        Kmer size; 4**k kmers over the alphabet 'AGTC' are enumerated.
    binary : bool (default=True)
        Stored as-is on the instance.
    mean : bool, np.array, str (default=True)
        If a str, it is treated as a path and loaded with np.load.
    std : bool, np.array, str (default=True)
        If a str, it is treated as a path and loaded with np.load.
    leave : bool (default=True)
        Stored as-is on the instance.
    silent : bool (default=False)
        Stored as-is on the instance.
    label : bool (default=False)
        Stored as-is on the instance.
    percentage : bool (default=True)
        Stored as `self.perc`.
        NOTE(review): its effect is defined outside this constructor.

    Raises
    ------
    ValueError
        If exactly one sequence was loaded and std is True, since a standard
        deviation cannot be estimated from a single sequence.
    """
    self.infasta = infasta
    self.seqs = None
    if infasta is not None:
        self.seqs = Reader(infasta).get_seqs()
    self.outfile = outfile
    self.k = k
    self.binary = binary
    # A string is interpreted as the path of a previously saved array.
    self.mean = np.load(mean) if isinstance(mean, str) else mean
    self.std = np.load(std) if isinstance(std, str) else std
    self.leave = leave
    self.silent = silent
    self.label = label
    self.perc = percentage
    self.counts = None
    self.kmers = [''.join(i) for i in product('AGTC', repeat=k)]
    # Column index of each kmer, in enumeration order (0 .. 4**k - 1).
    self.map = {kmer: col for col, kmer in enumerate(self.kmers)}
    # `seqs` stays None when no fasta was given; only validate when
    # sequences were actually loaded, otherwise len(None) raises TypeError.
    if self.seqs is not None and len(self.seqs) == 1 and self.std is True:
        err = ('You cannot standardize a single sequence. '
               'Please pass the path to an std. dev. array, '
               'or use raw counts by setting std=False.')
        raise ValueError(err)
def __init__(self, infasta=None, outfile=None, k=6):
    """Store the counting options and load fasta data if a path is given.

    Parameters
    ----------
    infasta : str (default=None)
        Path handed to Reader; when None nothing is loaded.
    outfile : str (default=None)
        Destination stored for later saving.
    k : int (default=6)
        Kmer size; 4**k kmers over the alphabet 'AGTC' are enumerated.
    """
    # Define every data attribute up front so that instances created
    # without a fasta file expose None instead of raising AttributeError.
    self.data = None
    self.names = None
    self.seqs = None
    if infasta is not None:
        self.data, self.names, self.seqs = Reader(infasta).get_data()
    self.outfile = outfile
    self.k = k
    self.counts = None
    self.kmers = [''.join(i) for i in product('AGTC', repeat=k)]
class BasicCounter:
    """Generates overlapping kmer counts for a fasta file

    Parameters
    ----------
    infasta : str (default=None)
        Full path to fasta file to be counted
    outfile : str (default=None)
        Full path to the counts file to be saved
    k : int (default=6)
        Size of kmer to be counted
    binary : bool (default=True)
        Saves as numpy array if True, else saves as csv
    mean : bool, np.array, str (default=True)
        Set the mean to 0 for each kmer/column of the count matrix.
        If str, provide path to a previously calculated mean array.
    std : bool or str (default=True)
        Set the std. dev. to 1 for each kmer/column of the count matrix
        If str, provide path to a previously calculated std array.
    leave : bool (default=True)
        Set to False if get_counts is used within another tqdm loop
    silent : bool (default=False)
        Set to True to turn off tqdm progress bar
    label : bool (default=False)
        Save a csv with row names and kmer column headers.
        Cannot be combined with binary=True.

    Attributes
    ----------
    reader : Reader or None
        Reader built from infasta, or None when no fasta was given
    seqs : None
        Sequences returned by the reader, or None when no fasta was given
    counts : None
        Stores the ndarray of kmer counts
    kmers : list
        str elements of all kmers of size k
    map : dict
        Mapping of kmers to column values
    """

    def __init__(self, infasta=None, outfile=None, k=6,
                 binary=True, mean=True, std=True,
                 leave=True, silent=False, label=False):
        self.infasta = infasta
        # Always define `reader`; save() tests `self.reader is None` and
        # would otherwise fail with AttributeError when infasta is None.
        self.reader = None
        self.seqs = None
        if infasta is not None:
            self.reader = Reader(infasta)
            self.seqs = self.reader.get_seqs()
        self.outfile = outfile
        self.k = k
        self.binary = binary
        # A string is interpreted as the path of a previously saved array.
        self.mean = np.load(mean) if isinstance(mean, str) else mean
        self.std = np.load(std) if isinstance(std, str) else std
        self.leave = leave
        self.silent = silent
        self.label = label
        self.counts = None
        self.kmers = [''.join(i) for i in product('AGTC', repeat=k)]
        # Column index of each kmer, in enumeration order (0 .. 4**k - 1).
        self.map = {kmer: col for col, kmer in enumerate(self.kmers)}

    def occurrences(self, row, seq):
        """Counts kmers on a per kilobase scale

        Parameters
        ----------
        row : ndarray
            Row of the count matrix; updated in place and returned.
        seq : str
            Sequence to count. Kmers not present in self.map are ignored.

        Returns
        -------
        row : ndarray
            The same row, with one entry set per kmer observed in seq.
        """
        length = len(seq)
        if length == 0:
            # Nothing to count; also avoids dividing by zero below.
            return row
        counts = defaultdict(int)
        # Each occurrence contributes 1000 / len(seq).
        increment = 1000 / length
        for c in range(length - self.k + 1):
            kmer = seq[c:c + self.k]
            counts[kmer] += increment
        for kmer, n in counts.items():
            if kmer in self.map:
                row[self.map[kmer]] = n
        return row

    def _progress(self):
        """Determine which iterator to loop over for counting."""
        if self.silent:
            return self.seqs
        if not self.leave:
            tqdm_seqs = my_tqdm()(self.seqs, desc='Kmers', leave=False)
        else:
            tqdm_seqs = my_tqdm()(self.seqs)
        return tqdm_seqs

    def center(self):
        """mean center counts by column"""
        if self.mean is True:
            # Compute the column means once and keep them on the instance.
            self.mean = np.mean(self.counts, axis=0)
        self.counts -= self.mean

    def standardize(self):
        """divide out the standard deviations from columns of the count matrix"""
        if self.std is True:
            # Compute the column std. devs. once and keep them on the instance.
            self.std = np.std(self.counts, axis=0)
        # NOTE(review): a column whose std. dev. is 0 yields NaN/inf here;
        # confirm that downstream code expects this.
        self.counts /= self.std

    def get_counts(self):
        """Generates kmer counts for a fasta file

        Fills self.counts with one row per sequence, then centers and/or
        standardizes the columns unless mean/std were set to False.
        """
        self.counts = np.zeros([len(self.seqs), 4**self.k], dtype=np.float32)
        seqs = self._progress()
        for i, seq in enumerate(seqs):
            self.counts[i] = self.occurrences(self.counts[i], seq)
        if self.mean is not False:
            self.center()
        if self.std is not False:
            self.standardize()

    def save(self, names=None):
        """Saves the counts appropriately based on current settings.

        There are four output methods for the counts:
        1. Binary. This saves just the counts as a binary numpy array.
        2. No labels. Saves in plain text, but without any labels.
        3. Default names. If no names are provided, fasta headers will be
           used as labels.
        4. Custom names. Provide a list of names if you want to label
           lncRNAs with your own names.

        Parameters
        ----------
        names : [str] (default=None)
            Unique names for rows of the Dataframe.

        Raises
        ------
        AssertionError
            If both binary and label are set, or if no outfile was provided.
        ValueError
            If labels are requested without `names` and there is no fasta
            file to read default headers from.
        """
        assert not (self.binary and self.label
                    ), 'You cannot label a binary file. Set only one as True.'
        assert self.outfile is not None, 'Please provide an outfile location.'
        if self.binary:
            np.save(self.outfile, self.counts)
        elif self.label:
            if names is None:
                if self.reader is None:
                    if self.infasta is None:
                        raise ValueError('Cannot build default labels '
                                         'without a fasta file. Please '
                                         'provide names or set infasta.')
                    self.reader = Reader(self.infasta)
                names = self.reader.get_headers()
            df = DataFrame(data=self.counts, index=names, columns=self.kmers)
            df.to_csv(self.outfile)
        else:
            np.savetxt(self.outfile, self.counts, delimiter=',', fmt='%1.6f')

    def make_count_file(self, names=None):
        """Wrapper function for the most common way to generate count files.

        Runs get_counts and, when an outfile was provided, saves the result
        with self.save.

        Parameters
        ----------
        names : [str] (default=None)
            lncRNA names to pass to self.save

        Returns
        -------
        counts : ndarray
            The count matrix stored in self.counts.
        """
        self.get_counts()
        if self.outfile is not None:
            self.save(names)
        return self.counts