Ejemplo n.º 1
0
 def _init_list_results(self):
     # init IJ content and IJ skew
     IJ_content_res = np.empty((1, self.__len__()))
     IJ_content_res[:] = np.NAN
     IJ_skew_res = np.empty((1, self.__len__()))
     IJ_skew_res[:] = np.NAN
     return IJ_content_res, IJ_skew_res
Ejemplo n.º 2
0
 def _init_list_results(self):
     # init IJ content and IJ skew
     IJ_content_res = np.empty((1,self.__len__()))
     IJ_content_res[:] = np.NAN
     IJ_skew_res = np.empty((1,self.__len__()))
     IJ_skew_res[:] = np.NAN
     return IJ_content_res, IJ_skew_res
Ejemplo n.º 3
0
def _base_content(filename, window_size, letters, circular=False):
    # DOC: see gc_content
    fasta = FastxFile(filename)
    checker = set(letters)
    chrom_gc_content = dict()
    for chrom in fasta:
        mid = int(window_size / 2)
        # Create gc_content array
        gc_content = np.empty(len(chrom.sequence))
        gc_content[:] = np.nan
        if circular:
            chrom.sequence = (chrom.sequence[-mid:] + chrom.sequence +
                              chrom.sequence[:mid])
            # Does not shift index of array
            mid = 0
        # Count first window content
        counter = Counter(chrom.sequence[0:window_size])
        gc_count = 0
        for letter in letters:
            gc_count += counter[letter]

        gc_content[mid] = gc_count
        for i in range(1, len(chrom.sequence) - window_size + 1):
            if chrom.sequence[i - 1] in checker:
                gc_count -= 1
            if chrom.sequence[i + window_size - 1] in checker:
                gc_count += 1
            gc_content[i + mid] = gc_count
        chrom_gc_content[chrom.name] = gc_content / window_size
    return chrom_gc_content
Ejemplo n.º 4
0
def _base_content(filename, window_size, letters, circular=False):
    # DOC: see gc_content
    fasta = FastxFile(filename)
    checker = set(letters)
    chrom_gc_content = dict()
    for chrom in fasta:
        mid = int(window_size / 2)
        # Create gc_content array
        gc_content = np.empty(len(chrom.sequence))
        gc_content[:] = np.nan
        if circular:
            chrom.sequence = (chrom.sequence[-mid:] + chrom.sequence +
                    chrom.sequence[:mid])
            # Does not shift index of array
            mid = 0
        # Count first window content
        counter = Counter(chrom.sequence[0:window_size])
        gc_count = 0
        for letter in letters:
            gc_count += counter[letter]

        gc_content[mid] = gc_count
        for i in range(1, len(chrom.sequence) - window_size + 1):
            if chrom.sequence[i - 1] in checker:
                gc_count -= 1
            if chrom.sequence[i + window_size - 1] in checker:
                gc_count += 1
            gc_content[i + mid] = gc_count
        chrom_gc_content[chrom.name] = gc_content / window_size
    return chrom_gc_content
Ejemplo n.º 5
0
    def _get_info(self):
        """Populates the data structures for plotting.

        Will be called on request"""

        stats = {"A":0, "C":0, "G":0, "T":0, "N":0}
        stats["qualities"] = []
        stats["mean_qualities"] = []
        stats["mean_length"] = 0
        stats["sequences"] = []

        minimum = 1e6
        maximum = 0
        # FIXME this self.N takes time in the cosntructor
        # do we need it ?
        self.lengths = np.empty(self.N)
        self.gc_list = []
        total_length = 0
        C = defaultdict(int)
        if self.verbose:
            pb = Progress(self.N)

        sequences = []
        mean_qualities = []
        qualities = []
        # could use multiprocessing
        # FastxFile has shown some errors while handling gzip files
        # created with zlib (e.g. from atropos). This is now replaced
        # by the Atropos FastqReader for now.
        #fastq = pysam.FastxFile(self.filename)

        with FastqReader(self.filename) as f:
            for i, record in enumerate(f):
                N = len(record.sequence)
                self.lengths[i] = N

                # we can store all qualities and sequences reads, so
                # just max_sample are stored:
                if i < self.max_sample:
                    quality = [ord(x) -33 for x in record.qualities]
                    mean_qualities.append(sum(quality) / N)
                    qualities.append(quality)
                    sequences.append(record.sequence)

                # store count of all qualities
                for k in record.qualities:
                    C[k] += 1

                GG = record.sequence.count('G') 
                CC = record.sequence.count('C')
                self.gc_list.append((GG+CC)/float(N)*100)

                # not using a counter, or loop speed up the code
                stats["A"] += record.sequence.count("A")
                stats["C"] += CC
                stats["G"] += GG
                stats["T"] += record.sequence.count("T")
                stats["N"] += record.sequence.count("N")

                total_length += len(record.sequence)

                if self.verbose:
                    pb.animate(i+1)

        # other data
        self.qualities = qualities
        self.mean_qualities = mean_qualities
        self.minimum = int(self.lengths.min())
        self.maximum = int(self.lengths.max())
        self.sequences = sequences
        self.gc_content = np.mean(self.gc_list)
        stats['mean_length'] = total_length / float(self.N)
        stats['total_bp'] = stats['A'] + stats['C'] + stats['G'] + stats["T"] + stats['N']
        stats['mean_quality'] = sum([(ord(k) -33)*v for k,v in C.items()]) / stats['total_bp']

        self.stats = stats