def _init_list_results(self): # init IJ content and IJ skew IJ_content_res = np.empty((1, self.__len__())) IJ_content_res[:] = np.NAN IJ_skew_res = np.empty((1, self.__len__())) IJ_skew_res[:] = np.NAN return IJ_content_res, IJ_skew_res
def _init_list_results(self): # init IJ content and IJ skew IJ_content_res = np.empty((1,self.__len__())) IJ_content_res[:] = np.NAN IJ_skew_res = np.empty((1,self.__len__())) IJ_skew_res[:] = np.NAN return IJ_content_res, IJ_skew_res
def _base_content(filename, window_size, letters, circular=False):
    # DOC: see gc_content
    #
    # For every record in *filename*, compute the fraction of bases that
    # belong to *letters* inside a sliding window of *window_size*, using a
    # running count (subtract the base leaving the window, add the one
    # entering it).  Positions without a full window stay NaN unless
    # *circular* is set, in which case the sequence is padded with its own
    # wrap-around ends so every original position gets a value.
    targets = set(letters)
    results = {}
    for entry in FastxFile(filename):
        offset = int(window_size / 2)
        # one slot per original base; un-windowed edges remain NaN
        content = np.full(len(entry.sequence), np.nan)
        if circular:
            entry.sequence = (entry.sequence[-offset:]
                              + entry.sequence
                              + entry.sequence[:offset])
            # padded sequence aligns window i with array slot i directly
            offset = 0
        # count of target letters in the very first window
        first_window = Counter(entry.sequence[0:window_size])
        running = sum(first_window[letter] for letter in letters)
        content[offset] = running
        for pos in range(1, len(entry.sequence) - window_size + 1):
            # booleans act as 0/1 here: drop the leaving base, add the new one
            running -= entry.sequence[pos - 1] in targets
            running += entry.sequence[pos + window_size - 1] in targets
            content[pos + offset] = running
        results[entry.name] = content / window_size
    return results
def _get_info(self):
    """Populates the data structures for plotting. Will be called on request.

    Reads every record of ``self.filename`` once and fills in:

    - ``self.lengths``: one read length per record.
    - ``self.qualities`` / ``self.mean_qualities`` / ``self.sequences``:
      per-read data, kept only for the first ``self.max_sample`` records.
    - ``self.gc_list`` / ``self.gc_content``: per-read GC%% and its mean.
    - ``self.minimum`` / ``self.maximum``: shortest / longest read length.
    - ``self.stats``: base counts (A/C/G/T/N), ``total_bp``,
      ``mean_length`` and ``mean_quality`` (Phred, +33 ASCII offset).
    """
    stats = {"A":0, "C":0, "G":0, "T":0, "N":0}
    # NOTE(review): these list entries are initialised but never filled
    # below; the per-read data is collected in local lists instead.
    stats["qualities"] = []
    stats["mean_qualities"] = []
    stats["mean_length"] = 0
    stats["sequences"] = []
    # NOTE(review): minimum/maximum locals are unused; the attributes are
    # recomputed from self.lengths at the end of this method.
    minimum = 1e6
    maximum = 0
    # FIXME this self.N takes time in the constructor
    # do we need it ?
    # assumes self.N equals the number of records in the file -- TODO confirm
    self.lengths = np.empty(self.N)
    self.gc_list = []
    total_length = 0
    # raw quality character -> occurrence count, over the whole file
    C = defaultdict(int)
    if self.verbose:
        pb = Progress(self.N)
    sequences = []
    mean_qualities = []
    qualities = []
    # could use multiprocessing
    # FastxFile has shown some errors while handling gzip files
    # created with zlib (e.g. from atropos). This is now replaced
    # by the Atropos FastqReader for now.
    #fastq = pysam.FastxFile(self.filename)
    with FastqReader(self.filename) as f:
        for i, record in enumerate(f):
            N = len(record.sequence)
            self.lengths[i] = N
            # we can store all qualities and sequences reads, so
            # just max_sample are stored:
            if i < self.max_sample:
                # Phred scores assuming the Sanger +33 ASCII offset
                quality = [ord(x) -33 for x in record.qualities]
                mean_qualities.append(sum(quality) / N)
                qualities.append(quality)
                sequences.append(record.sequence)
            # store count of all qualities
            for k in record.qualities:
                C[k] += 1
            GG = record.sequence.count('G')
            CC = record.sequence.count('C')
            # per-read GC percentage
            self.gc_list.append((GG+CC)/float(N)*100)
            # not using a counter, or loop speed up the code
            stats["A"] += record.sequence.count("A")
            stats["C"] += CC
            stats["G"] += GG
            stats["T"] += record.sequence.count("T")
            stats["N"] += record.sequence.count("N")
            total_length += len(record.sequence)
            if self.verbose:
                pb.animate(i+1)
    # other data
    self.qualities = qualities
    self.mean_qualities = mean_qualities
    self.minimum = int(self.lengths.min())
    self.maximum = int(self.lengths.max())
    self.sequences = sequences
    self.gc_content = np.mean(self.gc_list)
    stats['mean_length'] = total_length / float(self.N)
    stats['total_bp'] = stats['A'] + stats['C'] + stats['G'] + stats["T"] + stats['N']
    # mean quality over every base in the file (counts held in C),
    # again assuming the +33 ASCII offset
    stats['mean_quality'] = sum([(ord(k) -33)*v for k,v in C.items()]) / stats['total_bp']
    self.stats = stats