Example #1
    def getFromENA(self, run_acc_list, 
                         ftp_server_url = 'ftp.sra.ebi.ac.uk', 
                         local_reads_path = ['reads']):
        '''
        Given a list of 'run' accession numbers for paired end short read analyses, 
        download the read files from the European Nucleotide Archive.

        If using a mirror server, supply an alternative for 'ftp_server_url'.

        'local_reads_path' can be a path string or a list of folder names.
        '''
        if isinstance(local_reads_path, list):
            local_reads_path = _os.path.sep.join(local_reads_path)

        if not _os.path.exists(local_reads_path):
            _os.makedirs(local_reads_path)

        print('Logging in to %s' % ftp_server_url)
        ftp = _FTP(ftp_server_url)
        # anonymous login
        print(ftp.login())

        def check_connection(ftp):
            try:
                print('FTP: %s' % ftp.voidcmd("NOOP"))
                # http://docs.python.org/2/library/ftplib.html
                return(True)
            except IOError as e:
                print('Seems to be a problem with the connection to FTP server:')
                print('I/O error({0}): {1}'.format(e.errno, e.strerror) )
                return(False)

        def calc_checksum(filepath):
            hasher = _md5()
            # read in chunks so large fastq files are not held in memory
            with open(filepath, 'rb') as handle:
                buff = handle.read(65536)
                while len(buff) > 0:
                    hasher.update(buff)
                    buff = handle.read(65536)
            
            return(hasher.hexdigest())

        downloaded_read_files = {}

        start_time = _time.time()
        failed = []
        for cnum,run_acc in enumerate(run_acc_list):
            
            query_url_base = 'http://www.ebi.ac.uk/ena/data/warehouse/search?query='
            success = False
            tries = 0
            max_tries = 5
            while not success:
                rest_req = '"run_accession=%s"&result=read_run&fields=fastq_ftp,fastq_md5&display=report' % run_acc
                print('Sending query to ENA:\n%s' % rest_req)
                result = _urllib2.urlopen(query_url_base + rest_req).read()
                print('ENA accession numbers query result:\n%s' % result)
                if result.count('ERR') == 7:
                    success = True
                else:
                    print('Query result from ENA was unexpected on attempt %s of %s' % (tries + 1, max_tries))
                    _time.sleep(0.5)
                    tries += 1
                    if tries == max_tries:
                        print('Attempt %s failed. Try again later and if problem persists, report bug.' % tries)
                        failed += [run_acc]
                        break
                        #_sys.exit(1)
            
            if not success:
                continue
            
            md5s = result.split('\n')[-2].split('\t')[-1][:-1].split(';')
            
            ENA_paths = result.split('\n')[-2].split('\t')[-2][:-1].split(';')
            
            ENA_reads_pair_paths = {}
            ENA_reads_pair_paths[1] = ENA_paths[0].replace(ftp_server_url, '')
            ENA_reads_pair_paths[2] = ENA_paths[1].replace(ftp_server_url, '')
            
            local_reads_pair_paths = {}
            local_reads_pair_paths[1] = local_reads_path + \
                                        _os.path.sep + \
                                        ENA_reads_pair_paths[1].split('/')[-1]
            local_reads_pair_paths[2] = local_reads_path + \
                                        _os.path.sep + \
                                        ENA_reads_pair_paths[2].split('/')[-1]
            
            downloaded_read_files[run_acc] = {}
            
            for f in (1,2):
                # ensure connection is still open
                while not check_connection(ftp):
                    _sleep(0.5)
                    print('Attempting to re-establish connection . . .')
                    ftp = _FTP(ftp_server_url)
                    # anonymous login
                    print(ftp.login())
                
                expected_checksum = md5s[f - 1]
                
                exists = _os.path.exists(local_reads_pair_paths[f])
                if exists:
                    print('File %s for %s exists locally: %s' % (f, run_acc, local_reads_pair_paths[f]))
                    actual_checksum = calc_checksum(local_reads_pair_paths[f])
                    if actual_checksum == expected_checksum:
                        print('File checksum matches: %s. Skipping download' % (expected_checksum))
                        downloaded_read_files[run_acc][f] = local_reads_pair_paths[f]
                        continue
                    else:
                        print('Checksum mismatch')
                
                print('Downloading via %s: %s' % (ftp_server_url, ENA_reads_pair_paths[f]))
                # close the local file before computing its checksum
                with open(local_reads_pair_paths[f], 'wb') as fout:
                    res = ftp.retrbinary('RETR %s' % ENA_reads_pair_paths[f],
                                         fout.write)
                print('FTP: %s' % res)
                
                print('Calculating checksum . . .')
                actual_checksum = calc_checksum(local_reads_pair_paths[f])
                
                if actual_checksum == expected_checksum:
                    print('File checksum matches: %s.' % (expected_checksum))
                    downloaded_read_files[run_acc][f] = local_reads_pair_paths[f]
                else:
                    print('Checksum mismatch for: %s' % local_reads_pair_paths[f])
            
            if len(run_acc_list) > 1:
                # report durations, time left etc
                _report_time(start_time, cnum, len(run_acc_list))

        if len(failed) > 0:
            print('WARNING: some accession numbers did not return a result from ENA')
            print('Try searching http://www.ebi.ac.uk/ena in a web-browser for:')
            print(', '.join(failed))

        self.read_files = downloaded_read_files
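
The download is verified against the 'fastq_md5' values reported by ENA using a
chunked MD5 digest, so multi-gigabyte fastq files are hashed without being held
in memory. A minimal standalone sketch of the same idea using only the standard
hashlib module (the file name in the usage comment is illustrative):

    import hashlib

    def md5_of_file(filepath, chunk_size=65536):
        '''Return the hex MD5 digest of a file, read in chunks.'''
        hasher = hashlib.md5()
        with open(filepath, 'rb') as handle:
            for chunk in iter(lambda: handle.read(chunk_size), b''):
                hasher.update(chunk)
        return hasher.hexdigest()

    # e.g. compare with the checksum ENA reports for a run:
    # md5_of_file('reads/some_run_1.fastq.gz') == expected_checksum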
Example #2
    def getFromENA(self,
                   run_acc_list,
                   ftp_server_url='ftp.sra.ebi.ac.uk',
                   local_reads_path=['reads']):
        '''
        Given a list of 'run' accession numbers for paired end short read analyses, 
        download the read files from the European Nucleotide Archive.

        If using a mirror server, supply an alternative for 'ftp_server_url'.

        'local_reads_path' can be a path string or a list of folder names.
        '''
        if isinstance(local_reads_path, list):
            local_reads_path = _os.path.sep.join(local_reads_path)

        if not _os.path.exists(local_reads_path):
            _os.makedirs(local_reads_path)

        print('Logging in to %s' % ftp_server_url)
        ftp = _FTP(ftp_server_url)
        # anonymous login
        print(ftp.login())

        def check_connection(ftp):
            try:
                print('FTP: %s' % ftp.voidcmd("NOOP"))
                # http://docs.python.org/2/library/ftplib.html
                return (True)
            except IOError as e:
                print(
                    'Seems to be a problem with the connection to FTP server:')
                print('I/O error({0}): {1}'.format(e.errno, e.strerror))
                return (False)

        def calc_checksum(filepath):
            hasher = _md5()
            # read in chunks so large fastq files are not held in memory
            with open(filepath, 'rb') as handle:
                buff = handle.read(65536)
                while len(buff) > 0:
                    hasher.update(buff)
                    buff = handle.read(65536)

            return (hasher.hexdigest())

        downloaded_read_files = {}

        start_time = _time.time()
        failed = []
        for cnum, run_acc in enumerate(run_acc_list):

            query_url_base = 'http://www.ebi.ac.uk/ena/data/warehouse/search?query='
            success = False
            tries = 0
            max_tries = 5
            while not success:
                rest_req = '"run_accession=%s"&result=read_run&fields=fastq_ftp,fastq_md5&display=report' % run_acc
                print('Sending query to ENA:\n%s' % rest_req)
                result = _urllib2.urlopen(query_url_base + rest_req).read()
                print('ENA accession numbers query result:\n%s' % result)
                if result.count('ERR') == 7:
                    success = True
                else:
                    print(
                        'Query result from ENA was unexpected on attempt %s of %s'
                        % (tries + 1, max_tries))
                    _time.sleep(0.5)
                    tries += 1
                    if tries == max_tries:
                        print(
                            'Attempt %s failed. Try again later and if problem persists, report bug.'
                            % tries)
                        failed += [run_acc]
                        break
                        #_sys.exit(1)

            if not success:
                continue

            md5s = result.split('\n')[-2].split('\t')[-1][:-1].split(';')

            ENA_paths = result.split('\n')[-2].split('\t')[-2][:-1].split(';')

            ENA_reads_pair_paths = {}
            ENA_reads_pair_paths[1] = ENA_paths[0].replace(ftp_server_url, '')
            ENA_reads_pair_paths[2] = ENA_paths[1].replace(ftp_server_url, '')

            local_reads_pair_paths = {}
            local_reads_pair_paths[1] = local_reads_path + \
                                        _os.path.sep + \
                                        ENA_reads_pair_paths[1].split('/')[-1]
            local_reads_pair_paths[2] = local_reads_path + \
                                        _os.path.sep + \
                                        ENA_reads_pair_paths[2].split('/')[-1]

            downloaded_read_files[run_acc] = {}

            for f in (1, 2):
                # ensure connection is still open
                while not check_connection(ftp):
                    _sleep(0.5)
                    print('Attempting to re-establish connection . . .')
                    ftp = _FTP(ftp_server_url)
                    # anonymous login
                    print(ftp.login())

                expected_checksum = md5s[f - 1]

                exists = _os.path.exists(local_reads_pair_paths[f])
                if exists:
                    print('File %s for %s exists locally: %s' %
                          (f, run_acc, local_reads_pair_paths[f]))
                    actual_checksum = calc_checksum(local_reads_pair_paths[f])
                    if actual_checksum == expected_checksum:
                        print('File checksum matches: %s. Skipping download' %
                              (expected_checksum))
                        downloaded_read_files[run_acc][
                            f] = local_reads_pair_paths[f]
                        continue
                    else:
                        print('Checksum mismatch')

                print('Downloading via %s: %s' %
                      (ftp_server_url, ENA_reads_pair_paths[f]))
                # close the local file before computing its checksum
                with open(local_reads_pair_paths[f], 'wb') as fout:
                    res = ftp.retrbinary(
                        'RETR %s' % ENA_reads_pair_paths[f], fout.write)
                print('FTP: %s' % res)

                print('Calculating checksum . . .')
                actual_checksum = calc_checksum(local_reads_pair_paths[f])

                if actual_checksum == expected_checksum:
                    print('File checksum matches: %s.' % (expected_checksum))
                    downloaded_read_files[run_acc][f] = local_reads_pair_paths[
                        f]
                else:
                    print('Checksum mismatch for: %s' % local_reads_pair_paths[f])

            if len(run_acc_list) > 1:
                # report durations, time left etc
                _report_time(start_time, cnum, len(run_acc_list))

        if len(failed) > 0:
            print(
                'WARNING: some accession numbers did not return a result from ENA'
            )
            print(
                'Try searching http://www.ebi.ac.uk/ena in a web-browser for:')
            print(', '.join(failed))

        self.read_files = downloaded_read_files
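
Both versions of getFromENA keep the FTP session alive with a NOOP probe and log
in again anonymously when the probe fails. A sketch of that keep-alive pattern
in isolation (ensure_connected is a hypothetical helper, not part of the class):

    import time
    from ftplib import FTP, all_errors

    def ensure_connected(ftp, server='ftp.sra.ebi.ac.uk', delay=0.5):
        '''Return a live FTP connection, reconnecting anonymously
        if the cheap NOOP probe fails.'''
        try:
            ftp.voidcmd('NOOP')  # no-op command probes the connection
            return ftp
        except all_errors:
            time.sleep(delay)
            fresh = FTP(server)
            fresh.login()  # anonymous login, as in getFromENA
            return fresh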
Example #3
    def SPAdes(self,
               exe=[],
               output_folder=['assemblies', 'SPAdes'],
               mem_num_gigs=8,
               max_cpus=-1,
               single_assembly=False,
               careful=True,
               only_assembler=False):
        '''
        de novo assembly of short reads using SPAdes

        By default, each set of paired read fastq files in the dictionary
        self.read_files is assembled separately; if single_assembly is set
        to True, all sets are combined into a single assembly.

        http://spades.bioinf.spbau.ru/release3.6.1/manual.html
        relevant inputs:
        -o <output_dir> Specify the output directory. Required option.
        --sc required for MDA (single-cell) data.
        --only-error-correction
        --only-assembler
        --careful reduce the number of mismatches and short indels. Run MismatchCorrector – a post processing tool. Recommended.
        --continue from the specified output folder starting from the last available check-point
        --restart-from <check_point>
            ec start from error correction
            as restart assembly module from the first iteration
            k<int> restart from the iteration with specified k values, e.g. k55
            mc restart mismatch correction
        --pe1-12 <file_name> interlaced forward and reverse paired-end reads.
        --pe1-1 <file_name> File with forward reads.
        --pe1-2 <file_name> File with reverse reads.
        --pe1-s <file_name> File with unpaired reads . . use --pe2-... for next library
        --threads <int>
        --memory <int> max memory in Gb
        -k <int,int,...>  Comma-separated list of odd ascending k-mers
        If --sc is set the default values are 21,33,55; for multicell data sets it is auto
        --cov-cutoff <float> positive float value, or 'auto', or 'off'. Default value is 'off'
        '''

        assert isinstance(
            output_folder,
            list), 'Provide output folder as list of folders forming path'

        base_output_path = _os.path.sep.join(output_folder)

        if not _os.path.exists(base_output_path):
            _os.makedirs(base_output_path)

        # max threads is slightly different to cpus
        # . . can probably use more
        max_processes = _decide_max_processes(max_cpus)

        # if an exe is not provided, use that stored in Dependencies
        if len(exe):
            use_exe = _os.path.sep.join(exe)
        else:
            from baga import Dependencies
            use_exe = _get_exe_path('spades')

        def run_SPAdes(cmd):
            proc = _subprocess.Popen(cmd,
                                     stdout=_subprocess.PIPE,
                                     stderr=_subprocess.PIPE)
            # allow for failed SPAdes runs (possibly caused by small fastq files) <== but also check they were actually built properly
            try:
                stdout_value, stderr_value = proc.communicate()
                # communicate() does not raise on a failed run, so raise
                # explicitly on a non-zero exit to reach the except clause
                if proc.returncode != 0:
                    raise _subprocess.CalledProcessError(proc.returncode, cmd)
                checkthese = []
                getline = False
                for line in stdout_value.split('\n'):
                    if 'Warnings saved to' in line:
                        getline = False
                    if getline:
                        l = line.rstrip()
                        if len(l):
                            checkthese += [l]
                    if 'SPAdes pipeline finished WITH WARNINGS!' in line:
                        getline = True

                if len(checkthese):
                    print('SPAdes completed with warnings:\n{}\n'.format(
                        '\n'.join(checkthese)))
                else:
                    print('SPAdes completed without warnings')

                # with open('___SPAdes_{}_good_{}.log'.format(cnum, thetime), 'w') as fout:
                # fout.write(stdout_value)
                path2contigs = _os.path.sep.join(
                    [this_output_path, 'contigs.fasta'])
            except _subprocess.CalledProcessError as e:
                print('SPAdes probably did not complete: error returned ({})'.
                      format(proc.returncode))
                print('Error: {}'.format(e))
                print(
                    'Writing some info relevant to SPAdes crash to ___SPAdes_{}_bad_{}.log'
                    .format(cnum, thetime))
                with open('___SPAdes_{}_bad_{}.log'.format(cnum, thetime),
                          'w') as fout:
                    fout.write(str(dir(proc)))
                    fout.write('\n' + str(e.returncode) + '\n')
                    fout.write(
                        _os.path.sep.join([this_output_path, 'contigs.fasta']))

                path2contigs = None

            return (path2contigs)

        if isinstance(use_exe, list):
            # allow for use of prepended executable with script to run
            cmd = list(use_exe)
        else:
            # or just executable
            cmd = [use_exe]

        contigs = {}
        if single_assembly:
            print(
                'Combining reads aligned at multiple regions into single assembly'
            )
            if isinstance(use_exe, list):
                # allow for use of prepended executable with script to run
                cmd = list(use_exe)
            else:
                # or just executable
                cmd = [use_exe]
            for cnum, (pairname, files) in enumerate(self.read_files.items()):
                # allow use of tuples or dicts by converting dicts to lists
                if isinstance(files, dict):
                    use_files = []
                    for k, v in sorted(files.items()):
                        use_files += [v]
                else:
                    use_files = files

                cmd += ['--pe{}-1'.format(cnum + 1), use_files[0]]
                cmd += ['--pe{}-2'.format(cnum + 1), use_files[1]]
                try:
                    # use unpaired reads if available
                    cmd += ['--pe{}-s'.format(cnum + 1), use_files[2]]
                except IndexError:
                    pass
            try:
                # add a second library if provided
                if isinstance(self.read_files2[pairname], dict):
                    # if a dict supplied, make it a list
                    use_files2 = []
                    for k, v in sorted(self.read_files2[pairname].items()):
                        use_files2 += [v]
                else:
                    use_files2 = self.read_files2[pairname]
                cmd += ['--pe{}-1'.format(cnum + 2), use_files2[0]]
                cmd += ['--pe{}-2'.format(cnum + 2), use_files2[1]]
                try:
                    cmd += ['--pe{}-s'.format(cnum + 2), use_files2[2]]
                except IndexError:
                    pass
            except AttributeError:
                pass

            ## this isn't very flexible:
            # retain <sample>__<genome> from pairname:
            # pairname == <sample>__<genome>_<start>-<end>+<padding>
            # and replace with multiregion
            folder = '{}__{}_{}'.format(
                pairname.split('__')[0],
                pairname.split('__')[1].split('_')[0], 'multi_region')
            this_output_path = _os.path.sep.join(output_folder + [folder])
            if not _os.path.exists(this_output_path):
                _os.makedirs(this_output_path)

            cmd += ['-o', this_output_path]
            cmd += ['--threads', str(max_processes)]
            cmd += ['--memory', str(mem_num_gigs)]
            if only_assembler:
                cmd += ['--only-assembler']
            if careful:
                cmd += ['--careful']
            thetime = _time.asctime(_time.localtime(_time.time()))
            print('about to launch SPAdes . . . at {}'.format(thetime))
            print(' '.join(cmd))
            contigs['multi_region'] = run_SPAdes(cmd)
        else:
            start_time = _time.time()
            # prepare commandline and launch each SPAdes assembly
            contigs = {}
            for cnum, (pairname,
                       files) in enumerate(sorted(self.read_files.items())):
                if isinstance(use_exe, list):
                    # allow for use of prepended executable with script to run
                    cmd = list(use_exe)
                else:
                    # or just executable
                    cmd = [use_exe]
                # allow use of tuples or dicts by converting dicts to lists
                if isinstance(files, dict):
                    use_files = []
                    for k, v in sorted(files.items()):
                        use_files += [v]
                else:
                    use_files = files

                cmd += ['--pe1-1', use_files[0]]
                cmd += ['--pe1-2', use_files[1]]
                try:
                    # use unpaired reads if available
                    cmd += ['--pe1-s', use_files[2]]
                except IndexError:
                    pass
                try:
                    # add a second library if provided
                    if isinstance(self.read_files2[pairname], dict):
                        # if a dict supplied, make it a list
                        use_files2 = []
                        for k, v in sorted(self.read_files2[pairname].items()):
                            use_files2 += [v]
                    else:
                        use_files2 = self.read_files2[pairname]
                    cmd += ['--pe2-1', use_files2[0]]
                    cmd += ['--pe2-2', use_files2[1]]
                    try:
                        cmd += ['--pe2-s', use_files2[2]]
                    except IndexError:
                        pass
                except AttributeError:
                    pass

                this_output_path = _os.path.sep.join(output_folder +
                                                     [pairname])
                if not _os.path.exists(this_output_path):
                    _os.makedirs(this_output_path)

                cmd += ['-o', this_output_path]
                cmd += ['--threads', str(max_processes)]
                cmd += ['--memory', str(mem_num_gigs)]
                if only_assembler:
                    cmd += ['--only-assembler']
                if careful:
                    cmd += ['--careful']
                thetime = _time.asctime(_time.localtime(_time.time()))
                print('about to launch SPAdes . . . at {}'.format(thetime))
                print(' '.join(cmd))
                contigs[pairname] = run_SPAdes(cmd)
                if len(self.read_files) > 1:
                    # report durations, time left etc
                    _report_time(start_time, cnum, len(self.read_files))

        self.paths_to_contigs = contigs
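
run_SPAdes launches the assembler with subprocess.Popen and scans stdout for the
SPAdes warning banner. Note that Popen.communicate() never raises
CalledProcessError itself (only the subprocess.check_* helpers do), which is why
the version above raises it explicitly on a non-zero exit status. A reduced
sketch of that launch-and-check pattern ('spades.py' stands in for whichever
executable path is resolved):

    import subprocess

    def run_checked(cmd):
        '''Run an external command, returning (stdout, stderr);
        raise CalledProcessError on a non-zero exit status.'''
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        stdout, stderr = proc.communicate()
        if proc.returncode != 0:
            raise subprocess.CalledProcessError(proc.returncode, cmd)
        return stdout, stderr

    # e.g. out, err = run_checked(['spades.py', '--version'])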
Example #4
    def subsample(self, genome_size = 6601757, 
                        read_cov_depth = 80, 
                        pc_loss = 0.2, 
                        force = False, 
                        cov_closeness = 5):
        '''
        Given the size in basepairs of a genome sequence, downsample fastq files to a 
        desired average read coverage depth predicted after read alignment. Read lengths
        are taken from the file. By default, 20% are assumed to be lost at downstream 
        quality control stages (e.g. quality score based trimming). The percent loss is 
        used in coverage depth estimation. cov_closeness, which defaults to 5, prevents
        subsampling when the estimated coverage is within that many fold of the requested
        depth: this avoids time-consuming subsampling that would make little difference.
        '''

        subsampled_read_files = {}
        start_time = _time.time()
        for cnum,(pairname,files) in enumerate(self.read_files.items()):
            
            processed_path_1 = insert_suffix(files[1], '_subsmp')
            processed_path_2 = insert_suffix(files[2], '_subsmp')
            
            if not all([_os.path.exists(processed_path_1), 
                        _os.path.exists(processed_path_2)]) \
                    or force:
                
                if files[1][-2:] == 'gz':
                    fh1 = _gzip.open(files[1])
                else:
                    fh1 = open(files[1])
                
                aread = _SeqIO.parse(fh1, 'fastq').next()
                read_len = len(aread.seq)
                
                print('Counting reads in %s' % files[1])
                fh1.seek(0)
                lines = 0
                # report per half million reads
                interval = 2000000
                nextreport = interval
                for line in fh1:
                    lines += 1
                    if lines == nextreport:
                        print('{:,} reads'.format(lines/4))
                        nextreport += interval
                
                totalreads = lines / 4.0
                print('Found %s reads' % totalreads)
                full_depth_coverage = read_len * 2 * totalreads * (1 - pc_loss) / genome_size
                print('These paired read files would provide approximately {:.1f}x coverage depth'.format(full_depth_coverage))
                numreads2keep = int( round(genome_size * read_cov_depth / (read_len * 2) /  (1 - pc_loss), 0) )
                
                if numreads2keep >= totalreads:
                    print('This pair of read files is estimated to provide only {:.1f}x coverage, but {}x requested.'.format(full_depth_coverage, read_cov_depth))
                    print('No sampling performed. Original files will be used')
                    # pass original files over with subsampled
                    subsampled_read_files[pairname] = {}
                    subsampled_read_files[pairname][1] = files[1]
                    subsampled_read_files[pairname][2] = files[2]
                    fh1.close()
                    if len(self.read_files) > 1:
                        # report durations, time left etc
                        _report_time(start_time, cnum, len(self.read_files))
                    
                    continue
                elif full_depth_coverage < read_cov_depth + cov_closeness:
                    print('This pair of read files is estimated to provide {:.1f}x coverage which is within {}x of {}x requested.'.format(full_depth_coverage, cov_closeness, read_cov_depth))
                    print('No sampling performed. Original files will be used')
                    # pass original files over with subsampled
                    subsampled_read_files[pairname] = {}
                    subsampled_read_files[pairname][1] = files[1]
                    subsampled_read_files[pairname][2] = files[2]
                    fh1.close()
                    if len(self.read_files) > 1:
                        # report durations, time left etc
                        _report_time(start_time, cnum, len(self.read_files))
                    
                    continue
                else:
                    print('For approximately {}x read coverage, will retain {} of {} {}bp read pairs'.format(
                                    read_cov_depth, numreads2keep, totalreads, read_len))
                    
                    fh1.seek(0)
                    if files[2][-2:] == 'gz':
                        fh2 = _gzip.open(files[2])
                    else:
                        fh2 = open(files[2])
                    
                    fout1 = _gzip.open(processed_path_1, 'wb')
                    fout2 = _gzip.open(processed_path_2, 'wb')
                    
                    batch_size = 200000
                    keep_per_pop = int(numreads2keep / float(totalreads) * batch_size) + 1
                    nextwrite = batch_size
                    written = 0
                    n1 = 0
                    n2 = 0
                    these_lines1 = []
                    these_lines2 = []
                    reportfreq = 10
                    thisreport = 0
                    print('Subsampling . . .')
                    for line in fh1:
                        these_lines1 += [line]
                        if len(these_lines1) % 4 == 0:
                            n1 += 1
                            
                        if n1 == nextwrite:
                            keep_indices = sorted(_sample(xrange(batch_size), keep_per_pop))
                            keep_these = []
                            for i in keep_indices:
                                i1 = i * 4
                                i2 = i * 4 + 4
                                keep_these += these_lines1[i1:i2]
                            
                            # try parsing a read for QC
                            assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')
                            fout1.write(''.join(keep_these))
                            these_lines1 = []
                            written += keep_per_pop
                            thisreport += 1
                            if thisreport == reportfreq or written == keep_per_pop:
                                # report first time and at intervals
                                print('Written {:,} reads ({:.1%}) to {}'.format(written,
                                                                                 written/float(numreads2keep),
                                                                                 processed_path_1))
                            
                            for line2 in fh2:
                                these_lines2 += [line2]
                                if len(these_lines2) % 4 == 0:
                                    n2 += 1
                                
                                if n2 == nextwrite:
                                    keep_these = []
                                    for i in keep_indices:
                                        i1 = i * 4
                                        i2 = i * 4 + 4
                                        keep_these += these_lines2[i1:i2]
                                    
                                    assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')
                                    fout2.write(''.join(keep_these))
                                    these_lines2 = []
                                    if thisreport == reportfreq or written == keep_per_pop:
                                        thisreport = 0
                                        print('Written {:,} reads ({:.1%}) to {}'.format(written,
                                                                                         written/float(numreads2keep),
                                                                                         processed_path_2))
                                    nextwrite += batch_size
                                    break
                    
                    # write remainder: records still buffered since the last
                    # flushed batch of batch_size records
                    remainder = n1 - (nextwrite - batch_size)
                    keep_in_remainder = int(keep_per_pop * (remainder / float(batch_size))) + 1
                    keep_indices = sorted(_sample(xrange(remainder), keep_in_remainder))
                    keep_these = []
                    for i in keep_indices:
                        i1 = i * 4
                        i2 = i * 4 + 4
                        keep_these += these_lines1[i1:i2]
                    
                    # try parsing a read for QC
                    assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')
                    fout1.write(''.join(keep_these))
                    written += keep_in_remainder
                    print('Written {:,} reads ({:.1%}) to {}'.format(written,
                                                                             written/float(numreads2keep),
                                                                             processed_path_1))
                    
                    # get remainder
                    for line2 in fh2:
                        these_lines2 += [line2]
                    
                    # write remainder
                    keep_these = []
                    for i in keep_indices:
                        i1 = i * 4
                        i2 = i * 4 + 4
                        keep_these += these_lines2[i1:i2]
                    
                    # parse a read for QC; an empty keep_these here was caused
                    # by the remainder over-estimate fixed above
                    assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')
                    fout2.write(''.join(keep_these))
                    print('Written {:,} reads ({:.1%}) to {}'.format(written,
                                                                             written/float(numreads2keep),
                                                                             processed_path_2))
                    
                    # not sure if this is quicker/slower (more calls to .join())
                    # this_read = []
                    # for line in fh1:
                        # this_read += [line]
                        # if len(this_read) == 4:
                            # these_reads1 += [''.join(this_read)]
                            # #these_reads1 += this_read
                            # this_read = []
                            # n1 += 1
                            
                        # if n1 == nextwrite:
                            # keep_indices = sorted(_sample(xrange(batch_size), keep_per_pop))
                            # # try parsing a read for QC
                            # assert _SeqIO.read(_StringIO(these_reads1[0]), 'fastq')
                            # fout1.write(''.join([these_reads1[i] for i in keep_indices]))
                            # these_reads1 = []
                            # written += keep_per_pop
                            # print('Written {:,} reads ({:.2%}) to {}'.format(written,
                                                                             # written/float(numreads2keep),
                                                                             # processed_path_1))
                            # for line2 in fh2:
                                # this_read += [line2]
                                # if len(this_read) == 4:
                                    # these_reads2 += [''.join(this_read)]
                                    # this_read = []
                                    # n2 += 1
                                
                                # if n2 == nextwrite:
                                    # assert _SeqIO.read(_StringIO(these_reads2[0]), 'fastq')
                                    # fout2.write(''.join([these_reads2[i] for i in keep_indices]))
                                    # these_reads2 = []
                                    # print('Written {:,} reads ({:.2%}) to {}'.format(written,
                                                                                     # written/float(numreads2keep),
                                                                                     # processed_path_2))
                                    # nextwrite += batch_size
                                    # break
                    
                    fout1.close()
                    fout2.close()
                    fh1.close()
                    fh2.close()
                
            else:
                print('Found:')
                print(processed_path_1)
                print(processed_path_2)
                print('use "force = True" to overwrite')
            
            if len(self.read_files) > 1:
                # report durations, time left etc
                _report_time(start_time, cnum, len(self.read_files))
            
            subsampled_read_files[pairname] = {}
            subsampled_read_files[pairname][1] = processed_path_1
            subsampled_read_files[pairname][2] = processed_path_2

        # replace here as this step is optional
        self.fullsized_read_files = dict(self.read_files)
        self.read_files = subsampled_read_files
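
The number of read pairs retained by subsample follows directly from the target
depth: each pair contributes read_len * 2 bases, of which a fraction
(1 - pc_loss) is expected to survive quality trimming. The same calculation in
isolation (reads_to_keep is a hypothetical name for the formula used above):

    def reads_to_keep(genome_size, read_cov_depth, read_len, pc_loss=0.2):
        '''Read pairs needed for roughly read_cov_depth x coverage,
        allowing for an expected fractional loss during trimming.'''
        return int(round(genome_size * read_cov_depth /
                         (read_len * 2) / (1 - pc_loss), 0))

    # e.g. a ~6.6 Mbp genome at 80x with 100 bp reads:
    # reads_to_keep(6601757, 80, 100)  # -> roughly 3.3 million pairs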
Example #5
    def SPAdes(self, 
            exe = [], 
            output_folder = ['assemblies','SPAdes'],
            mem_num_gigs = 8, 
            max_cpus = -1,
            single_assembly = False,
            careful = True,
            only_assembler = False):
        '''
        de novo assembly of short reads using SPAdes

        By default, each set of paired read fastq files in the dictionary
        self.read_files is assembled separately; if single_assembly is set
        to True, all sets are combined into a single assembly.

        http://spades.bioinf.spbau.ru/release3.6.1/manual.html
        relevant inputs:
        -o <output_dir> Specify the output directory. Required option.
        --sc required for MDA (single-cell) data.
        --only-error-correction
        --only-assembler
        --careful reduce the number of mismatches and short indels. Run MismatchCorrector – a post processing tool. Recommended.
        --continue from the specified output folder starting from the last available check-point
        --restart-from <check_point>
            ec start from error correction
            as restart assembly module from the first iteration
            k<int> restart from the iteration with specified k values, e.g. k55
            mc restart mismatch correction
        --pe1-12 <file_name> interlaced forward and reverse paired-end reads.
        --pe1-1 <file_name> File with forward reads.
        --pe1-2 <file_name> File with reverse reads.
        --pe1-s <file_name> File with unpaired reads . . use --pe2-... for next library
        --threads <int>
        --memory <int> max memory in Gb
        -k <int,int,...>  Comma-separated list of odd ascending k-mers
        If --sc is set the default values are 21,33,55; for multicell data sets it is auto
        --cov-cutoff <float> positive float value, or 'auto', or 'off'. Default value is 'off'
        '''

        assert isinstance(output_folder, list), 'Provide output folder as list of folders forming path'

        base_output_path = _os.path.sep.join(output_folder)

        if not _os.path.exists(base_output_path):
            _os.makedirs(base_output_path)

        # max threads is slightly different to cpus
        # . . can probably use more
        max_processes = _decide_max_processes( max_cpus )

        # if an exe is not provided, use that stored in Dependencies
        if len(exe):
            use_exe = _os.path.sep.join(exe)
        else:
            from baga import Dependencies
            use_exe = _get_exe_path('spades')

        def run_SPAdes(cmd):
            proc = _subprocess.Popen(cmd, stdout=_subprocess.PIPE, stderr=_subprocess.PIPE)
            # allow for failed SPAdes runs (possibly caused by small fastq files) <== but also check they were actually built properly
            try:
                stdout_value, stderr_value = proc.communicate()
                # communicate() does not raise on a failed run, so raise
                # explicitly on a non-zero exit to reach the except clause
                if proc.returncode != 0:
                    raise _subprocess.CalledProcessError(proc.returncode, cmd)
                checkthese = []
                getline = False
                for line in stdout_value.split('\n'):
                    if 'Warnings saved to' in line:
                        getline = False
                    if getline:
                        l = line.rstrip()
                        if len(l):
                            checkthese += [l]
                    if 'SPAdes pipeline finished WITH WARNINGS!' in line:
                        getline = True
                
                if len(checkthese):
                    print('SPAdes completed with warnings:\n{}\n'.format('\n'.join(checkthese)))
                else:
                    print('SPAdes completed without warnings')
                
                # with open('___SPAdes_{}_good_{}.log'.format(cnum, thetime), 'w') as fout:
                    # fout.write(stdout_value)
                path2contigs = _os.path.sep.join([this_output_path,'contigs.fasta'])
            except _subprocess.CalledProcessError as e:
                print('SPAdes probably did not complete: error returned ({})'.format(proc.returncode))
                print('Error: {}'.format(e))
                print('Writing some info relevant to SPAdes crash to ___SPAdes_{}_bad_{}.log'.format(cnum, thetime))
                with open('___SPAdes_{}_bad_{}.log'.format(cnum, thetime), 'w') as fout:
                    fout.write(str(dir(proc)))
                    fout.write('\n' + str(e.returncode) + '\n')
                    fout.write(_os.path.sep.join([this_output_path,'contigs.fasta']))
                
                path2contigs = None
            
            return(path2contigs)

        if isinstance(use_exe, list):
            # allow for use of prepended executable with script to run
            cmd = list(use_exe)
        else:
            # or just executable
            cmd = [use_exe]

        contigs = {}
        if single_assembly:
            print('Combining reads aligned at multiple regions into single assembly')
            if isinstance(use_exe, list):
                # allow for use of prepended executable with script to run
                cmd = list(use_exe)
            else:
                # or just executable
                cmd = [use_exe]
            for cnum, (pairname, files) in enumerate(self.read_files.items()):
                # allow use of tuples or dicts by converting dicts to lists
                if isinstance(files, dict):
                    use_files = []
                    for k,v in sorted(files.items()):
                        use_files += [v]
                else:
                    use_files = files
                
                cmd += ['--pe{}-1'.format(cnum+1), use_files[0]]
                cmd += ['--pe{}-2'.format(cnum+1), use_files[1]]
                try:
                    # use unpaired reads if available
                    cmd += ['--pe{}-s'.format(cnum+1), use_files[2]]
                except IndexError:
                    pass
            try:
                # add a second library if provided
                if isinstance(self.read_files2[pairname], dict):
                    # if a dict supplied, make it a list
                    use_files2 = []
                    for k,v in sorted(self.read_files2[pairname].items()):
                        use_files2 += [v]
                else:
                    use_files2 = self.read_files2[pairname]
                cmd += ['--pe{}-1'.format(cnum+2), use_files2[0]]
                cmd += ['--pe{}-2'.format(cnum+2), use_files2[1]]
                try:
                    cmd += ['--pe{}-s'.format(cnum+2), use_files2[2]]
                except IndexError:
                    pass
            except AttributeError:
                pass
            
            ## this isn't very flexible:
            # retain <sample>__<genome> from pairname:
            # pairname == <sample>__<genome>_<start>-<end>+<padding>
            # and replace with multiregion
            folder = '{}__{}_{}'.format(pairname.split('__')[0],
                                        pairname.split('__')[1].split('_')[0],
                                        'multi_region')
            this_output_path = _os.path.sep.join(output_folder + [folder])
            if not _os.path.exists(this_output_path):
                _os.makedirs(this_output_path)
            
            cmd += ['-o', this_output_path]
            cmd += ['--threads', str(max_processes)]
            cmd += ['--memory', str(mem_num_gigs)]
            if only_assembler:
                cmd += ['--only-assembler']
            if careful:
                cmd += ['--careful']
            thetime = _time.asctime( _time.localtime(_time.time()) )
            print('about to launch SPAdes . . . at {}'.format(thetime))
            print(' '.join(cmd))
            contigs['multi_region'] = run_SPAdes(cmd)
        else:
            start_time = _time.time()
            # prepare commandline and launch each SPAdes assembly
            contigs = {}
            for cnum, (pairname, files) in enumerate(sorted(self.read_files.items())):
                if isinstance(use_exe, list):
                    # allow for use of prepended executable with script to run
                    cmd = list(use_exe)
                else:
                    # or just executable
                    cmd = [use_exe]
                # allow use of tuples or dicts by converting dicts to lists
                if isinstance(files, dict):
                    use_files = []
                    for k,v in sorted(files.items()):
                        use_files += [v]
                else:
                    use_files = files
                
                cmd += ['--pe1-1', use_files[0]]
                cmd += ['--pe1-2', use_files[1]]
                try:
                    # use unpaired reads if available
                    cmd += ['--pe1-s', use_files[2]]
                except IndexError:
                    pass
                try:
                    # add a second library if provided
                    if isinstance(self.read_files2[pairname], dict):
                        # if a dict supplied, make it a list
                        use_files2 = []
                        for k,v in sorted(self.read_files2[pairname].items()):
                            use_files2 += [v]
                    else:
                        use_files2 = self.read_files2[pairname]
                    cmd += ['--pe2-1', use_files2[0]]
                    cmd += ['--pe2-2', use_files2[1]]
                    try:
                        cmd += ['--pe2-s', use_files2[2]]
                    except IndexError:
                        pass
                except AttributeError:
                    pass
                
                this_output_path = _os.path.sep.join(output_folder + [pairname])
                if not _os.path.exists(this_output_path):
                    _os.makedirs(this_output_path)
                
                cmd += ['-o', this_output_path]
                cmd += ['--threads', str(max_processes)]
                cmd += ['--memory', str(mem_num_gigs)]
                if only_assembler:
                    cmd += ['--only-assembler']
                if careful:
                    cmd += ['--careful']
                thetime = _time.asctime( _time.localtime(_time.time()) )
                print('about to launch SPAdes . . . at {}'.format(thetime))
                print(' '.join(cmd))
                contigs[pairname] = run_SPAdes(cmd)
                if len(self.read_files) > 1:
                    # report durations, time left etc
                    _report_time(start_time, cnum, len(self.read_files))

        self.paths_to_contigs = contigs
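
SPAdes takes one --peN-1/--peN-2 pair (plus an optional --peN-s) per paired-end
library, and both SPAdes examples assemble the command as a flat argument list
for subprocess. The flag-building step on its own (paired_library_args is a
hypothetical helper):

    def paired_library_args(lib_num, forward, reverse, unpaired=None):
        '''Arguments for one paired-end library in SPAdes --peN-* form.'''
        args = ['--pe{}-1'.format(lib_num), forward,
                '--pe{}-2'.format(lib_num), reverse]
        if unpaired is not None:
            # unpaired reads belonging to the same library, when available
            args += ['--pe{}-s'.format(lib_num), unpaired]
        return args

    # e.g. cmd = ['spades.py'] + paired_library_args(1, 'r1.fq.gz', 'r2.fq.gz')
    #      cmd += ['-o', 'assemblies/SPAdes/sample1', '--careful']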
Example #6
    def subsample(self,
                  genome_size=6601757,
                  read_cov_depth=80,
                  pc_loss=0.2,
                  force=False,
                  cov_closeness=5):
        '''
        Given the size in basepairs of a genome sequence, downsample fastq files to a 
        desired average read coverage depth predicted after read alignment. Read lengths
        are taken from the file. By default, 20% are assumed to be lost at downstream 
        quality control stages (e.g. quality score based trimming). The percent loss is 
        used in coverage depth estimation. cov_closeness, which defaults to 5, prevents
        subsampling when the estimated coverage is within that many fold of the requested
        depth: this avoids time-consuming subsampling that would make little difference.
        '''

        subsampled_read_files = {}
        start_time = _time.time()
        for cnum, (pairname, files) in enumerate(self.read_files.items()):

            processed_path_1 = insert_suffix(files[1], '_subsmp')
            processed_path_2 = insert_suffix(files[2], '_subsmp')

            if not all([_os.path.exists(processed_path_1),
                        _os.path.exists(processed_path_2)]) \
                    or force:

                if files[1][-2:] == 'gz':
                    fh1 = _gzip.open(files[1])
                else:
                    fh1 = open(files[1])

                aread = _SeqIO.parse(fh1, 'fastq').next()
                read_len = len(aread.seq)

                print('Counting reads in %s' % files[1])
                fh1.seek(0)
                lines = 0
                # report per half million reads
                interval = 2000000
                nextreport = interval
                for line in fh1:
                    lines += 1
                    if lines == nextreport:
                        print('{:,} reads'.format(lines / 4))
                        nextreport += interval

                totalreads = lines / 4.0
                print('Found %s reads' % totalreads)
                full_depth_coverage = read_len * 2 * totalreads * (
                    1 - pc_loss) / genome_size
                print(
                    'These paired read files would provide approximately {:.1f}x coverage depth'
                    .format(full_depth_coverage))
                numreads2keep = int(
                    round(
                        genome_size * read_cov_depth / (read_len * 2) /
                        (1 - pc_loss), 0))

                if numreads2keep >= totalreads:
                    print(
                        'This pair of read files is estimated to provide only {:.1f}x coverage, but {}x requested.'
                        .format(full_depth_coverage, read_cov_depth))
                    print('No sampling performed. Original files will be used')
                    # pass original files over with subsampled
                    subsampled_read_files[pairname] = {}
                    subsampled_read_files[pairname][1] = files[1]
                    subsampled_read_files[pairname][2] = files[2]
                    fh1.close()
                    if len(self.read_files) > 1:
                        # report durations, time left etc
                        _report_time(start_time, cnum, len(self.read_files))

                    continue
                elif full_depth_coverage < read_cov_depth + cov_closeness:
                    print(
                        'This pair of read files is estimated to provide {:.1f}x coverage which is within {}x of {}x requested.'
                        .format(full_depth_coverage, cov_closeness,
                                read_cov_depth))
                    print('No sampling performed. Original files will be used')
                    # pass original files over with subsampled
                    subsampled_read_files[pairname] = {}
                    subsampled_read_files[pairname][1] = files[1]
                    subsampled_read_files[pairname][2] = files[2]
                    fh1.close()
                    if len(self.read_files) > 1:
                        # report durations, time left etc
                        _report_time(start_time, cnum, len(self.read_files))

                    continue
                else:
                    print(
                        'For approximately {}x read coverage, will retain {} of {} {}bp read pairs'
                        .format(read_cov_depth, numreads2keep, totalreads,
                                read_len))

                    fh1.seek(0)
                    if files[2][-2:] == 'gz':
                        fh2 = _gzip.open(files[2])
                    else:
                        fh2 = open(files[2])

                    fout1 = _gzip.open(processed_path_1, 'wb')
                    fout2 = _gzip.open(processed_path_2, 'wb')

                    batch_size = 200000
                    keep_per_pop = int(
                        numreads2keep / float(totalreads) * batch_size) + 1
                    nextwrite = batch_size
                    written = 0
                    n1 = 0
                    n2 = 0
                    these_lines1 = []
                    these_lines2 = []
                    reportfreq = 10
                    thisreport = 0
                    print('Subsampling . . .')
                    for line in fh1:
                        these_lines1 += [line]
                        if len(these_lines1) % 4 == 0:
                            n1 += 1

                        if n1 == nextwrite:
                            keep_indices = sorted(
                                _sample(xrange(batch_size), keep_per_pop))
                            keep_these = []
                            for i in keep_indices:
                                i1 = i * 4
                                i2 = i * 4 + 4
                                keep_these += these_lines1[i1:i2]

                            # try parsing a read for QC
                            assert _SeqIO.read(
                                _StringIO(''.join(keep_these[:4])), 'fastq')
                            fout1.write(''.join(keep_these))
                            these_lines1 = []
                            written += keep_per_pop
                            thisreport += 1
                            if thisreport == reportfreq or written == keep_per_pop:
                                # report first time and at intervals
                                print(
                                    'Written {:,} reads ({:.1%}) to {}'.format(
                                        written,
                                        written / float(numreads2keep),
                                        processed_path_1))

                            for line2 in fh2:
                                these_lines2 += [line2]
                                if len(these_lines2) % 4 == 0:
                                    n2 += 1

                                if n2 == nextwrite:
                                    keep_these = []
                                    for i in keep_indices:
                                        i1 = i * 4
                                        i2 = i * 4 + 4
                                        keep_these += these_lines2[i1:i2]

                                    assert _SeqIO.read(
                                        _StringIO(''.join(keep_these[:4])),
                                        'fastq')
                                    fout2.write(''.join(keep_these))
                                    these_lines2 = []
                                    if thisreport == reportfreq or written == keep_per_pop:
                                        thisreport = 0
                                        print(
                                            'Written {:,} reads ({:.1%}) to {}'
                                            .format(
                                                written,
                                                written / float(numreads2keep),
                                                processed_path_2))
                                    nextwrite += batch_size
                                    break

                    # write remainder: records still buffered since the last
                    # flushed batch of batch_size records
                    remainder = n1 - (nextwrite - batch_size)
                    keep_in_remainder = int(
                        keep_per_pop * (remainder / float(batch_size))) + 1
                    keep_indices = sorted(
                        _sample(xrange(remainder), keep_in_remainder))
                    keep_these = []
                    for i in keep_indices:
                        i1 = i * 4
                        i2 = i * 4 + 4
                        keep_these += these_lines1[i1:i2]

                    # try parsing a read for QC
                    assert _SeqIO.read(_StringIO(''.join(keep_these[:4])),
                                       'fastq')
                    fout1.write(''.join(keep_these))
                    written += keep_in_remainder
                    print('Written {:,} reads ({:.1%}) to {}'.format(
                        written, written / float(numreads2keep),
                        processed_path_1))

                    # get remainder
                    for line2 in fh2:
                        these_lines2 += [line2]

                    # write remainder
                    keep_these = []
                    for i in keep_indices:
                        i1 = i * 4
                        i2 = i * 4 + 4
                        keep_these += these_lines2[i1:i2]

                    # parse a read for QC; an empty keep_these here was caused
                    # by the remainder over-estimate fixed above
                    assert _SeqIO.read(
                        _StringIO(''.join(keep_these[:4])), 'fastq')
                    fout2.write(''.join(keep_these))
                    print('Written {:,} reads ({:.1%}) to {}'.format(
                        written, written / float(numreads2keep),
                        processed_path_2))

                    # not sure if this is quicker/slower (more calls to .join())
                    # this_read = []
                    # for line in fh1:
                        # this_read += [line]
                        # if len(this_read) == 4:
                            # these_reads1 += [''.join(this_read)]
                            # #these_reads1 += this_read
                            # this_read = []
                            # n1 += 1

                        # if n1 == nextwrite:
                            # keep_indices = sorted(_sample(xrange(batch_size), keep_per_pop))
                            # # try parsing a read for QC
                            # assert _SeqIO.read(_StringIO(these_reads1[0]), 'fastq')
                            # fout1.write(''.join([these_reads1[i] for i in keep_indices]))
                            # these_reads1 = []
                            # written += keep_per_pop
                            # print('Written {:,} reads ({:.2%}) to {}'.format(written,
                                                                             # written/float(numreads2keep),
                                                                             # processed_path_1))
                            # for line2 in fh2:
                                # this_read += [line2]
                                # if len(this_read) == 4:
                                    # these_reads2 += [''.join(this_read)]
                                    # this_read = []
                                    # n2 += 1

                                # if n2 == nextwrite:
                                    # assert _SeqIO.read(_StringIO(these_reads2[0]), 'fastq')
                                    # fout2.write(''.join([these_reads2[i] for i in keep_indices]))
                                    # these_reads2 = []
                                    # print('Written {:,} reads ({:.2%}) to {}'.format(written,
                                                                                     # written/float(numreads2keep),
                                                                                     # processed_path_2))
                                    # nextwrite += batch_size
                                    # break

                    fout1.close()
                    fout2.close()
                    fh1.close()
                    fh2.close()

            else:
                print('Found:')
                print(processed_path_1)
                print(processed_path_2)
                print('use "force = True" to overwrite')

            if len(self.read_files) > 1:
                # report durations, time left etc
                _report_time(start_time, cnum, len(self.read_files))

            subsampled_read_files[pairname] = {}
            subsampled_read_files[pairname][1] = processed_path_1
            subsampled_read_files[pairname][2] = processed_path_2

        # replace here as this step is optional
        self.fullsized_read_files = dict(self.read_files)
        self.read_files = subsampled_read_files
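
The subsampling itself works on fixed-size batches: every batch_size records a
sorted random sample of record indices is drawn from the forward file, and the
same indices are applied to the reverse file so that mates stay synchronised.
The per-batch step in isolation (sample_fastq_batch is a hypothetical helper;
it assumes lines holds whole 4-line fastq records):

    from random import sample

    def sample_fastq_batch(lines, n_keep):
        '''Randomly keep n_keep 4-line fastq records from a batch of
        lines, returning the kept lines in their original order.'''
        n_records = len(lines) // 4
        kept = []
        for i in sorted(sample(range(n_records), n_keep)):
            kept += lines[i * 4:i * 4 + 4]  # one fastq record == 4 lines
        return kept

    # applying the same sampled indices to the mate file keeps pairs together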