Example no. 1
0
    def __init__(self, reads=False, path_to_baga=False):
        '''
        Initialise with a baga.CollectData.Reads object
        or
        path to a previously saved baga object of this class.

        Exactly one of the two arguments must be supplied (enforced by the
        xor assertion below).
        '''
        assert bool(reads) ^ bool(path_to_baga), 'Instantiate with baga.CollectData.Reads or '\
                'the path to a previously saved baga.PreparedReads.Reads object' # xor

        if reads:
            try:
                # adopt the read file paths collected by CollectData
                self.read_files = reads.read_files
            except AttributeError:
                print('baga.PrepareReads.Reads needs a baga.CollectData.Reads object '\
                        'with a "read_files" attribute. This can be obtained with the '\
                        '"getFromENA()" or "getFromPath()" methods.')
        else:
            try:
                # restore every public attribute from the gzipped pickle
                loaded_baga = _cPickle.load(_gzip.open(path_to_baga, 'rb'))
                for attribute_name in dir(loaded_baga):
                    if attribute_name[0] != '_':
                        setattr(self, attribute_name,
                                getattr(loaded_baga, attribute_name))
            except IOError:
                # bug fix: previously referenced undefined 'file_name',
                # which raised NameError instead of reporting the path
                print('Could not access {}'.format(path_to_baga))
Example no. 2
0
 def saveLocal(self, name):
     '''
     Save downloaded read info to a local compressed pickle file.

     'name' can exclude extension: .baga will be added
     '''
     fileout = 'baga.CollectData.Reads-%s.baga' % name
     print('Saving to %s' % fileout)
     # bug fix: the gzip handle was never closed, so buffered data could be
     # lost and the descriptor leaked; close explicitly after dumping
     fout = _gzip.open(fileout, 'wb')
     try:
         _cPickle.dump(self, fout)
     finally:
         fout.close()
Example no. 3
0
 def saveLocal(self, name):
     '''
     Save downloaded read info to a local compressed pickle file.

     'name' can exclude extension: .baga will be added
     '''
     fileout = 'baga.CollectData.Reads-%s.baga' % name
     print('Saving to %s' % fileout)
     # bug fix: the gzip handle was never closed, so buffered data could be
     # lost and the descriptor leaked; close explicitly after dumping
     fout = _gzip.open(fileout, 'wb')
     try:
         _cPickle.dump(self, fout)
     finally:
         fout.close()
Example no. 4
0
 def __init__(self, reads = False, path_to_baga = False):
     '''
     Initialise with a baga.CollectData.Reads object
     or
     path to a previously saved baga object of this class.

     Exactly one of the two arguments must be supplied (enforced by the
     xor assertion below).
     '''
     assert bool(reads) ^ bool(path_to_baga), 'Instantiate with baga.CollectData.Reads or '\
             'the path to a previously saved baga.PreparedReads.Reads object' # xor
     
     if reads:
         try:
             # adopt the read file paths collected by CollectData
             self.read_files = reads.read_files
         except AttributeError:
             print('baga.PrepareReads.Reads needs a baga.CollectData.Reads object '\
                     'with a "read_files" attribute. This can be obtained with the '\
                     '"getFromENA()" or "getFromPath()" methods.')
     else:
         try:
             # restore every public attribute from the gzipped pickle
             loaded_baga = _cPickle.load(_gzip.open(path_to_baga,'rb'))
             for attribute_name in dir(loaded_baga):
                 if attribute_name[0] != '_':
                     setattr(self, attribute_name, getattr(loaded_baga, attribute_name))
         except IOError:
             # bug fix: previously referenced undefined 'file_name',
             # which raised NameError instead of reporting the path
             print('Could not access {}'.format(path_to_baga))
Example no. 5
0
    def subsample(self, genome_size = 6601757, 
                        read_cov_depth = 80, 
                        pc_loss = 0.2, 
                        force = False, 
                        cov_closeness = 5):
        '''
        Downsample paired fastq files to a target average read coverage depth.

        Given the size in basepairs of a genome sequence, downsample fastq files to a
        desired average read coverage depth predicted after read alignment. Read lengths
        are taken from the file. By default, 20% are assumed to be lost at downstream
        quality control stages (e.g. quality score based trimming). The percent loss is
        used in coverage depth estimation. cov_closeness, which defaults to 5, will prevent
        subsampling if within 5x coverage: avoids time consuming subsampling that will only
        make a small difference.

        genome_size -- genome length in bp used for coverage estimation
        read_cov_depth -- desired post-QC average coverage depth (x)
        pc_loss -- proportion of read data assumed lost at downstream QC
        force -- recreate subsampled files even if they already exist
        cov_closeness -- skip subsampling when estimated coverage is within
                         this many x of the requested depth

        NOTE(review): Python 2 only — relies on .next(), xrange and Python 2
        division semantics; will not run unmodified under Python 3.
        '''

        subsampled_read_files = {}
        start_time = _time.time()
        # self.read_files maps pair name -> {1: path_to_R1, 2: path_to_R2}
        for cnum,(pairname,files) in enumerate(self.read_files.items()):
            
            processed_path_1 = insert_suffix(files[1], '_subsmp')
            processed_path_2 = insert_suffix(files[2], '_subsmp')
            
            # only (re)create the subsampled files when missing or forced
            if not all([_os.path.exists(processed_path_1), 
                        _os.path.exists(processed_path_2)]) \
                    or force:
                
                # gzip-compressed input detected by filename suffix
                if files[1][-2:] == 'gz':
                    fh1 = _gzip.open(files[1])
                else:
                    fh1 = open(files[1])
                
                # probe the first record for the read length (Python 2 .next())
                aread = _SeqIO.parse(fh1, 'fastq').next()
                read_len = len(aread.seq)
                
                print('Counting reads in %s' % files[1])
                fh1.seek(0)
                lines = 0
                # report per half million reads
                interval = 2000000
                nextreport = interval
                for line in fh1:
                    lines += 1
                    if lines == nextreport:
                        print('{:,} reads'.format(lines/4))
                        nextreport += interval
                
                # fastq: four lines per read
                totalreads = lines / 4.0
                print('Found %s reads' % totalreads)
                # paired reads (hence * 2), discounted by expected QC loss
                full_depth_coverage = read_len * 2 * totalreads * (1 - pc_loss) / genome_size
                print('These paired read files would provide approximately {:.1f}x coverage depth'.format(full_depth_coverage))
                numreads2keep = int( round(genome_size * read_cov_depth / (read_len * 2) /  (1 - pc_loss), 0) )
                
                if numreads2keep >= totalreads:
                    # not enough reads to reach the requested depth: keep originals
                    print('This pair of read files is estimated to provide only {:.1f}x coverage, but {}x requested.'.format(full_depth_coverage, read_cov_depth))
                    print('No sampling performed. Original files will be used')
                    # pass original files over with subsampled
                    subsampled_read_files[pairname] = {}
                    subsampled_read_files[pairname][1] = files[1]
                    subsampled_read_files[pairname][2] = files[2]
                    fh1.close()
                    if len(self.read_files) > 1:
                        # report durations, time left etc
                        _report_time(start_time, cnum, len(self.read_files))
                    
                    continue
                elif full_depth_coverage < read_cov_depth + cov_closeness:
                    # close enough to the target already: keep originals
                    print('This pair of read files is estimated to provide {:.1f}x coverage which is within {}x of {}x requested.'.format(full_depth_coverage, cov_closeness, read_cov_depth))
                    print('No sampling performed. Original files will be used')
                    # pass original files over with subsampled
                    subsampled_read_files[pairname] = {}
                    subsampled_read_files[pairname][1] = files[1]
                    subsampled_read_files[pairname][2] = files[2]
                    fh1.close()
                    if len(self.read_files) > 1:
                        # report durations, time left etc
                        _report_time(start_time, cnum, len(self.read_files))
                    
                    continue
                else:
                    print('For approximately {}x read coverage, will retain {} of {} {}bp read pairs'.format(
                                    read_cov_depth, numreads2keep, totalreads, read_len))
                    
                    fh1.seek(0)
                    if files[2][-2:] == 'gz':
                        fh2 = _gzip.open(files[2])
                    else:
                        fh2 = open(files[2])
                    
                    fout1 = _gzip.open(processed_path_1, 'wb')
                    fout2 = _gzip.open(processed_path_2, 'wb')
                    
                    # sample the same random indices from each batch of reads
                    # in both files so that pairing is preserved
                    batch_size = 200000
                    keep_per_pop = int(numreads2keep / float(totalreads) * batch_size) + 1
                    nextwrite = batch_size
                    written = 0
                    n1 = 0
                    n2 = 0
                    these_lines1 = []
                    these_lines2 = []
                    reportfreq = 10
                    thisreport = 0
                    print('Subsampling . . .')
                    for line in fh1:
                        these_lines1 += [line]
                        if len(these_lines1) % 4 == 0:
                            n1 += 1
                            
                        if n1 == nextwrite:
                            # batch complete in file 1: pick reads to retain
                            keep_indices = sorted(_sample(xrange(batch_size), keep_per_pop))
                            keep_these = []
                            for i in keep_indices:
                                i1 = i * 4
                                i2 = i * 4 + 4
                                keep_these += these_lines1[i1:i2]
                            
                            # try parsing a read for QC
                            assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')
                            fout1.write(''.join(keep_these))
                            these_lines1 = []
                            written += keep_per_pop
                            thisreport += 1
                            if thisreport == reportfreq or written == keep_per_pop:
                                # report first time and at intevals
                                print('Written {:,} reads ({:.1%}) to {}'.format(written,
                                                                                 written/float(numreads2keep),
                                                                                 processed_path_1))
                            
                            # advance file 2 to the same batch boundary and
                            # write the matching mates (same keep_indices)
                            for line2 in fh2:
                                these_lines2 += [line2]
                                if len(these_lines2) % 4 == 0:
                                    n2 += 1
                                
                                if n2 == nextwrite:
                                    keep_these = []
                                    for i in keep_indices:
                                        i1 = i * 4
                                        i2 = i * 4 + 4
                                        keep_these += these_lines2[i1:i2]
                                    
                                    assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')
                                    fout2.write(''.join(keep_these))
                                    these_lines2 = []
                                    if thisreport == reportfreq or written == keep_per_pop:
                                        thisreport = 0
                                        print('Written {:,} reads ({:.1%}) to {}'.format(written,
                                                                                         written/float(numreads2keep),
                                                                                         processed_path_2))
                                    nextwrite += batch_size
                                    # resume the outer fh1 loop for next batch
                                    break
                    
                    # write remainder: the final partial batch left in
                    # these_lines1 (fewer than batch_size reads)
                    remainder = nextwrite - n1
                    keep_in_remainder = int(keep_per_pop * (remainder / float(batch_size))) + 1
                    keep_indices = sorted(_sample(xrange(remainder), keep_in_remainder))
                    keep_these = []
                    for i in keep_indices:
                        i1 = i * 4
                        i2 = i * 4 + 4
                        keep_these += these_lines1[i1:i2]
                    
                    # try parsing a read for QC
                    assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')
                    fout1.write(''.join(keep_these))
                    written += keep_in_remainder
                    print('Written {:,} reads ({:.1%}) to {}'.format(written,
                                                                             written/float(numreads2keep),
                                                                             processed_path_1))
                    
                    # get remainder
                    for line2 in fh2:
                        these_lines2 += [line2]
                    
                    # write remainder
                    keep_these = []
                    for i in keep_indices:
                        i1 = i * 4
                        i2 = i * 4 + 4
                        keep_these += these_lines2[i1:i2]
                    
                    # NOTE(review): keep_these has been observed empty here —
                    # investigate (original comment: "check why keep_these was empty")
                    assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')
                    fout2.write(''.join(keep_these))
                    print('Written {:,} reads ({:.1%}) to {}'.format(written,
                                                                             written/float(numreads2keep),
                                                                             processed_path_2))
                    
                    # (removed a large commented-out alternative implementation
                    # of the batching loop that was kept here "in case it was
                    # quicker/slower"; see version control history if needed)
                    
                    fout1.close()
                    fout2.close()
                    fh1.close()
                    fh2.close()
                
            else:
                print('Found:')
                print(processed_path_1)
                print(processed_path_2)
                print('use "force = True" to overwrite')
            
            if len(self.read_files) > 1:
                # report durations, time left etc
                _report_time(start_time, cnum, len(self.read_files))
            
            subsampled_read_files[pairname] = {}
            subsampled_read_files[pairname][1] = processed_path_1
            subsampled_read_files[pairname][2] = processed_path_2

        # replace here as this step is optional
        # NOTE(review): list(self.read_files) keeps only the dict KEYS (pair
        # names), not the file paths — confirm this is intended
        self.fullsized_read_files = list(self.read_files)
        self.read_files = subsampled_read_files
Example no. 6
0
    def subsample(self,
                  genome_size=6601757,
                  read_cov_depth=80,
                  pc_loss=0.2,
                  force=False,
                  cov_closeness=5):
        '''
        Downsample paired fastq files to a target average read coverage depth.

        Given the size in basepairs of a genome sequence, downsample fastq files to a
        desired average read coverage depth predicted after read alignment. Read lengths
        are taken from the file. By default, 20% are assumed to be lost at downstream
        quality control stages (e.g. quality score based trimming). The percent loss is
        used in coverage depth estimation. cov_closeness, which defaults to 5, will prevent
        subsampling if within 5x coverage: avoids time consuming subsampling that will only
        make a small difference.

        genome_size -- genome length in bp used for coverage estimation
        read_cov_depth -- desired post-QC average coverage depth (x)
        pc_loss -- proportion of read data assumed lost at downstream QC
        force -- recreate subsampled files even if they already exist
        cov_closeness -- skip subsampling when estimated coverage is within
                         this many x of the requested depth

        NOTE(review): Python 2 only — relies on .next(), xrange and Python 2
        division semantics; will not run unmodified under Python 3.
        '''

        subsampled_read_files = {}
        start_time = _time.time()
        # self.read_files maps pair name -> {1: path_to_R1, 2: path_to_R2}
        for cnum, (pairname, files) in enumerate(self.read_files.items()):

            processed_path_1 = insert_suffix(files[1], '_subsmp')
            processed_path_2 = insert_suffix(files[2], '_subsmp')

            # only (re)create the subsampled files when missing or forced
            if not all([_os.path.exists(processed_path_1),
                        _os.path.exists(processed_path_2)]) \
                    or force:

                # gzip-compressed input detected by filename suffix
                if files[1][-2:] == 'gz':
                    fh1 = _gzip.open(files[1])
                else:
                    fh1 = open(files[1])

                # probe the first record for the read length (Python 2 .next())
                aread = _SeqIO.parse(fh1, 'fastq').next()
                read_len = len(aread.seq)

                print('Counting reads in %s' % files[1])
                fh1.seek(0)
                lines = 0
                # report per half million reads
                interval = 2000000
                nextreport = interval
                for line in fh1:
                    lines += 1
                    if lines == nextreport:
                        print('{:,} reads'.format(lines / 4))
                        nextreport += interval

                # fastq: four lines per read
                totalreads = lines / 4.0
                print('Found %s reads' % totalreads)
                # paired reads (hence * 2), discounted by expected QC loss
                full_depth_coverage = read_len * 2 * totalreads * (
                    1 - pc_loss) / genome_size
                print(
                    'These paired read files would provide approximately {:.1f}x coverage depth'
                    .format(full_depth_coverage))
                numreads2keep = int(
                    round(
                        genome_size * read_cov_depth / (read_len * 2) /
                        (1 - pc_loss), 0))

                if numreads2keep >= totalreads:
                    # not enough reads to reach the requested depth: keep originals
                    print(
                        'This pair of read files is estimated to provide only {:.1f}x coverage, but {}x requested.'
                        .format(full_depth_coverage, read_cov_depth))
                    print('No sampling performed. Original files will be used')
                    # pass original files over with subsampled
                    subsampled_read_files[pairname] = {}
                    subsampled_read_files[pairname][1] = files[1]
                    subsampled_read_files[pairname][2] = files[2]
                    fh1.close()
                    if len(self.read_files) > 1:
                        # report durations, time left etc
                        _report_time(start_time, cnum, len(self.read_files))

                    continue
                elif full_depth_coverage < read_cov_depth + cov_closeness:
                    # close enough to the target already: keep originals
                    print(
                        'This pair of read files is estimated to provide {:.1f}x coverage which is within {}x of {}x requested.'
                        .format(full_depth_coverage, cov_closeness,
                                read_cov_depth))
                    print('No sampling performed. Original files will be used')
                    # pass original files over with subsampled
                    subsampled_read_files[pairname] = {}
                    subsampled_read_files[pairname][1] = files[1]
                    subsampled_read_files[pairname][2] = files[2]
                    fh1.close()
                    if len(self.read_files) > 1:
                        # report durations, time left etc
                        _report_time(start_time, cnum, len(self.read_files))

                    continue
                else:
                    print(
                        'For approximately {}x read coverage, will retain {} of {} {}bp read pairs'
                        .format(read_cov_depth, numreads2keep, totalreads,
                                read_len))

                    fh1.seek(0)
                    if files[2][-2:] == 'gz':
                        fh2 = _gzip.open(files[2])
                    else:
                        fh2 = open(files[2])

                    fout1 = _gzip.open(processed_path_1, 'wb')
                    fout2 = _gzip.open(processed_path_2, 'wb')

                    # sample the same random indices from each batch of reads
                    # in both files so that pairing is preserved
                    batch_size = 200000
                    keep_per_pop = int(
                        numreads2keep / float(totalreads) * batch_size) + 1
                    nextwrite = batch_size
                    written = 0
                    n1 = 0
                    n2 = 0
                    these_lines1 = []
                    these_lines2 = []
                    reportfreq = 10
                    thisreport = 0
                    print('Subsampling . . .')
                    for line in fh1:
                        these_lines1 += [line]
                        if len(these_lines1) % 4 == 0:
                            n1 += 1

                        if n1 == nextwrite:
                            # batch complete in file 1: pick reads to retain
                            keep_indices = sorted(
                                _sample(xrange(batch_size), keep_per_pop))
                            keep_these = []
                            for i in keep_indices:
                                i1 = i * 4
                                i2 = i * 4 + 4
                                keep_these += these_lines1[i1:i2]

                            # try parsing a read for QC
                            assert _SeqIO.read(
                                _StringIO(''.join(keep_these[:4])), 'fastq')
                            fout1.write(''.join(keep_these))
                            these_lines1 = []
                            written += keep_per_pop
                            thisreport += 1
                            if thisreport == reportfreq or written == keep_per_pop:
                                # report first time and at intevals
                                print(
                                    'Written {:,} reads ({:.1%}) to {}'.format(
                                        written,
                                        written / float(numreads2keep),
                                        processed_path_1))

                            # advance file 2 to the same batch boundary and
                            # write the matching mates (same keep_indices)
                            for line2 in fh2:
                                these_lines2 += [line2]
                                if len(these_lines2) % 4 == 0:
                                    n2 += 1

                                if n2 == nextwrite:
                                    keep_these = []
                                    for i in keep_indices:
                                        i1 = i * 4
                                        i2 = i * 4 + 4
                                        keep_these += these_lines2[i1:i2]

                                    assert _SeqIO.read(
                                        _StringIO(''.join(keep_these[:4])),
                                        'fastq')
                                    fout2.write(''.join(keep_these))
                                    these_lines2 = []
                                    if thisreport == reportfreq or written == keep_per_pop:
                                        thisreport = 0
                                        print(
                                            'Written {:,} reads ({:.1%}) to {}'
                                            .format(
                                                written,
                                                written / float(numreads2keep),
                                                processed_path_2))
                                    nextwrite += batch_size
                                    # resume the outer fh1 loop for next batch
                                    break

                    # write remainder: the final partial batch left in
                    # these_lines1 (fewer than batch_size reads)
                    remainder = nextwrite - n1
                    keep_in_remainder = int(
                        keep_per_pop * (remainder / float(batch_size))) + 1
                    keep_indices = sorted(
                        _sample(xrange(remainder), keep_in_remainder))
                    keep_these = []
                    for i in keep_indices:
                        i1 = i * 4
                        i2 = i * 4 + 4
                        keep_these += these_lines1[i1:i2]

                    # try parsing a read for QC
                    assert _SeqIO.read(_StringIO(''.join(keep_these[:4])),
                                       'fastq')
                    fout1.write(''.join(keep_these))
                    written += keep_in_remainder
                    print('Written {:,} reads ({:.1%}) to {}'.format(
                        written, written / float(numreads2keep),
                        processed_path_1))

                    # get remainder
                    for line2 in fh2:
                        these_lines2 += [line2]

                    # write remainder
                    keep_these = []
                    for i in keep_indices:
                        i1 = i * 4
                        i2 = i * 4 + 4
                        keep_these += these_lines2[i1:i2]

                    # NOTE(review): keep_these has been observed empty here —
                    # investigate (original comment: "check why keep_these was empty")
                    assert _SeqIO.read(
                        _StringIO(''.join(keep_these[:4])),
                        'fastq')
                    fout2.write(''.join(keep_these))
                    print('Written {:,} reads ({:.1%}) to {}'.format(
                        written, written / float(numreads2keep),
                        processed_path_2))

                    # (removed a large commented-out alternative implementation
                    # of the batching loop that was kept here "in case it was
                    # quicker/slower"; see version control history if needed)

                    fout1.close()
                    fout2.close()
                    fh1.close()
                    fh2.close()

            else:
                print('Found:')
                print(processed_path_1)
                print(processed_path_2)
                print('use "force = True" to overwrite')

            if len(self.read_files) > 1:
                # report durations, time left etc
                _report_time(start_time, cnum, len(self.read_files))

            subsampled_read_files[pairname] = {}
            subsampled_read_files[pairname][1] = processed_path_1
            subsampled_read_files[pairname][2] = processed_path_2

        # replace here as this step is optional
        # NOTE(review): list(self.read_files) keeps only the dict KEYS (pair
        # names), not the file paths — confirm this is intended
        self.fullsized_read_files = list(self.read_files)
        self.read_files = subsampled_read_files