Example #1
0
    def sortIndexBAMs(self, path_to_exe=False, force=False, max_cpus=-1):
        '''
        Sort and index the de-duplicated BAM files with samtools.

        For each path in self.paths_to_BAMs_dd, "samtools sort" is called to
        write "<path minus last 4 characters>_si.bam" and, if sorting
        succeeded, "samtools index" is called on that output. Jobs are run
        concurrently. The output paths (whether newly made or found on disk)
        are stored in self.paths_to_BAMs_dd_si.

        path_to_exe : path to the samtools executable; obtained from
                      _get_exe_path('samtools') if not provided.
        force       : if True, regenerate outputs that already exist.
        max_cpus    : passed to _decide_max_processes() to set how many
                      jobs may run at once.
        '''
        # imported locally so this method does not depend on the module's
        # import block; shlex.quote is Python 3, pipes.quote is Python 2
        try:
            from shlex import quote as _quote
        except ImportError:
            from pipes import quote as _quote

        if not path_to_exe:
            path_to_exe = _get_exe_path('samtools')

        processes = set()
        max_processes = _decide_max_processes(max_cpus)

        paths_to_BAMs_dd_si = []
        for BAM_in in self.paths_to_BAMs_dd:
            out_prefix = BAM_in[:-4] + '_si'
            BAM_out = out_prefix + '.bam'
            if not _os.path.exists(BAM_out) or force:
                # The two steps are chained in a shell, so every path is
                # quoted (spaces or shell metacharacters would otherwise
                # break the command) and '&&' stops indexing from running
                # on a missing or stale file after a failed sort.
                cmd = '{0} sort {1} {2} && {0} index {3}'.format(
                    _quote(path_to_exe), _quote(BAM_in), _quote(out_prefix),
                    _quote(BAM_out))
                print('Called: %s' % cmd)
                processes.add(_subprocess.Popen(cmd, shell=True))
                if len(processes) >= max_processes:
                    # block until any child exits, then drop finished jobs
                    _os.wait()
                    processes.difference_update(
                        [p for p in processes if p.poll() is not None])
            else:
                print('Found:')
                print(BAM_out)
                print('use "force = True" to overwrite')

            paths_to_BAMs_dd_si += [BAM_out]

        # Check if all the child processes were closed
        for p in processes:
            if p.poll() is None:
                p.wait()

        self.paths_to_BAMs_dd_si = paths_to_BAMs_dd_si
Example #2
0
    def removeDuplicates(self,
                         path_to_jar=False,
                         force=False,
                         mem_num_gigs=2,
                         max_cpus=-1):
        '''
        Call Picard MarkDuplicates on every BAM in self.paths_to_BAMs.

        Each input "<name>.bam" gives "<name>_dd.bam" plus a metrics file
        "<name>_dd.log". Outputs already on disk are kept unless force is
        True. Jobs are launched concurrently and the list of output paths
        is stored in self.paths_to_BAMs_dd.

        path_to_jar  : path to the Picard jar; obtained from
                       _get_jar_path('picard') if not provided.
        force        : if True, regenerate outputs that already exist.
        mem_num_gigs : value used for the java "-Xmx<N>g" option.
        max_cpus     : passed to _decide_max_processes() to set how many
                       jobs may run at once.
        '''
        jar = path_to_jar if path_to_jar else _get_jar_path('picard')

        running = set()
        job_limit = _decide_max_processes(max_cpus)

        print('Print: will use %s cpus for picard' % job_limit)

        deduplicated = []
        for bam_in in self.paths_to_BAMs:
            stem = bam_in[:-4]
            bam_out = stem + '_dd.bam'
            # the output path is recorded whether it is made now or reused
            deduplicated.append(bam_out)

            if _os.path.exists(bam_out) and not force:
                print('Found:')
                print(bam_out)
                print('use "force = True" to overwrite')
                continue

            java_part = ['java', '-Xmx%sg' % mem_num_gigs, '-jar', jar]
            picard_part = [
                'MarkDuplicates', 'I=', bam_in, 'O=', bam_out, 'M=',
                stem + '_dd.log'
            ]
            cmd = java_part + picard_part
            running.add(_subprocess.Popen(cmd, shell=False))
            print('Called: %s' % ' '.join(str(token) for token in cmd))
            print('processes: %s, max_processes: %s' %
                  (len(running), job_limit))

            if len(running) < job_limit:
                continue

            # at the limit: block until any child exits, then forget every
            # job that has finished
            _os.wait()
            running = set(job for job in running if job.poll() is None)

        # make sure nothing is still running before publishing the result
        for job in running:
            if job.poll() is not None:
                continue
            job.wait()

        self.paths_to_BAMs_dd = deduplicated
Example #3
0
        def run_SPAdes(cmd):
            '''
            Run one SPAdes command and return the path to its contigs.

            cmd : argument list passed to subprocess.Popen.

            Standard output is scanned for the lines SPAdes prints between
            'SPAdes pipeline finished WITH WARNINGS!' and 'Warnings saved to'
            and any found are printed.

            Failed SPAdes runs are tolerated (possibly caused by small fastq
            files): if the process exits with a non-zero status, details are
            written to a '___SPAdes_<cnum>_bad_<thetime>.log' file and None
            is returned. Otherwise the path of 'contigs.fasta' inside
            this_output_path is returned. this_output_path, cnum and thetime
            are taken from the enclosing scope.
            '''
            proc = _subprocess.Popen(cmd,
                                     stdout=_subprocess.PIPE,
                                     stderr=_subprocess.PIPE)
            stdout_value, stderr_value = proc.communicate()
            # Python 3 returns bytes from the pipes; work with text throughout
            if not isinstance(stdout_value, str):
                stdout_value = stdout_value.decode('utf-8', 'replace')
            if not isinstance(stderr_value, str):
                stderr_value = stderr_value.decode('utf-8', 'replace')

            path2contigs = _os.path.sep.join(
                [this_output_path, 'contigs.fasta'])

            # communicate() never raises CalledProcessError, so a failed run
            # has to be detected from the exit status
            if proc.returncode != 0:
                print('SPAdes probably did not complete: error returned ({})'.
                      format(proc.returncode))
                print(
                    'Writing some info relevent to SPAdes crash to ___SPAdes_{}_bad_{}.log'
                    .format(cnum, thetime))
                with open('___SPAdes_{}_bad_{}.log'.format(cnum, thetime),
                          'w') as fout:
                    fout.write(' '.join(map(str, cmd)) + '\n')
                    fout.write(str(proc.returncode) + '\n')
                    fout.write(path2contigs + '\n')
                    fout.write(stderr_value)

                return None

            # collect the warning lines SPAdes reports at the end of its run
            checkthese = []
            getline = False
            for line in stdout_value.split('\n'):
                if 'Warnings saved to' in line:
                    getline = False
                if getline:
                    stripped = line.rstrip()
                    if stripped:
                        checkthese.append(stripped)
                if 'SPAdes pipeline finished WITH WARNINGS!' in line:
                    getline = True

            if checkthese:
                print('SPAdes completed with warnings:\n{}\n'.format(
                    '\n'.join(checkthese)))
            else:
                print('SPAdes completed without warnings')

            return path2contigs
Example #4
0
def check_no_error(path='',
                   system=False,
                   java_commands=False,
                   extra=False,
                   extra_pre=False,
                   success_returncode=1):
    '''
    Check a binary executable can be called.

    Returns True if the program could be started and either exited with
    status zero or with status success_returncode (some programs exit
    non-zero when called without commands: usually 1, occasionally 2, e.g.
    cutadapt). Returns False if the program could not be started (OSError)
    or exited with any other non-zero status.

    path               : list of path components to the executable. If the
                         first component is empty or starts with the path
                         separator the joined path is used as given
                         (absolute path); otherwise it is taken relative to
                         'external_programs'.
    system             : if True only the last component of path is used,
                         i.e. the program is looked up on the system path.
    java_commands      : if provided, 'java' is called with this list of
                         arguments instead and the check fails if 'Error:'
                         appears in its output.
    extra              : list of arguments appended to the command.
    extra_pre          : list of arguments prepended to the command, e.g.
                         specifying 'python2' for when python3 defaults but
                         python >3.3 is installed but not supported.
    success_returncode : non-zero exit status still counted as success.
    '''
    if java_commands:
        try:
            p = _subprocess.Popen(['java'] + list(java_commands),
                                  stdout=_subprocess.PIPE,
                                  stderr=_subprocess.STDOUT)
        except OSError as e:
            # java itself is not available
            print('{}: {}'.format('java', e))
            return False

        # communicate() drains the pipe and reaps the child process
        output = p.communicate()[0]
        if not isinstance(output, str):
            # Python 3 returns bytes
            output = output.decode('utf-8', 'replace')
        print(output)
        return 'Error:' not in output

    if system:
        cmd = list(path[-1:])
    elif path[0] == '' or path[0][0] == _os.path.sep:
        # absolute path to elsewhere provided; the empty-string test must
        # come first because path[0][0] fails on an empty first component
        cmd = [_os.path.sep.join(path)]
    else:
        cmd = [_os.path.sep.join(['external_programs'] + list(path))]

    if extra:
        cmd = cmd + list(extra)

    if extra_pre:
        cmd = list(extra_pre) + cmd

    try:
        o = _subprocess.check_output(cmd)
    except _subprocess.CalledProcessError as e:
        # some programs that exit return non-zero code if commands not given
        # usually this is 1, but occasionally it is 2 (e.g. cutadapt)
        # but a fail on 2 is e.g. python calling a non-existent python program
        # should be set in the dependencies dictionary if not 1 (default)
        # . . . but they are installed and available
        return e.returncode == success_returncode
    except OSError as e:
        print('{}: {}'.format(cmd[0], e))
        return False

    if not isinstance(o, str):
        # Python 3 returns bytes
        o = o.decode('utf-8', 'replace')
    o = '\n'.join(['external> ' + l for l in o.split('\n')]) + '\n'
    print(o)  # eventually put in the debugging log <== too much noise
    # some programs output a help screen if commands not given
    # . . . they are installed and available
    return True
Example #5
0
    def trim(self, path_to_exe=False, force=False, max_cpus=-1):
        '''
        Quality-trim the adaptor-cut read pairs with sickle.

        For every pair in self.adaptorcut_read_files, "sickle pe" is called
        to write "_qual" versions of both read files and a
        "_singletons_qual" file. Pairs whose three outputs all exist are
        skipped unless force is True. Jobs run concurrently, each with its
        standard output sent to "<pairname>_sickle.log". The resulting
        {pairname: {1: path, 2: path}} dictionary is stored in
        self.trimmed_read_files.

        path_to_exe : sequence of path components to the sickle executable
                      (joined with the path separator); obtained from
                      _get_exe_path('sickle') if not provided.
        force       : if True, regenerate outputs that already exist.
        max_cpus    : passed to _decide_max_processes() to set how many
                      jobs may run at once.

        Raises AssertionError if cutAdaptors() output is missing or if any
        expected sickle output cannot be found afterwards.
        '''

        if not path_to_exe:
            exe_sickle = _get_exe_path('sickle')
        else:
            exe_sickle = _os.path.sep.join(path_to_exe)

        e1 = 'Could not find "adaptorcut_read_files" attribute. \
        Before quality score trimming, reads must be cleaned of \
        library preparation sequences. Please run cutAdaptors() \
        method on this Reads instance.'

        assert hasattr(self, 'adaptorcut_read_files'), e1

        e2 = 'Could not find %s. Either run cutAdaptors() again \
        or ensure file exists'

        # both members of each pair must be present
        for pairname, files in self.adaptorcut_read_files.items():
            assert _os.path.exists(files[1]), e2 % files[1]
            assert _os.path.exists(files[2]), e2 % files[2]

        trimmed_read_files = {}

        print(sorted(self.adaptorcut_read_files))

        cmds = []
        processed_paths_to_do = []
        for pairname, files in self.adaptorcut_read_files.items():
            processed_path_1 = insert_suffix(files[1], '_qual')
            processed_path_2 = insert_suffix(files[2], '_qual')
            processed_path_s = insert_suffix(files[2], '_singletons_qual')
            # Illumina quality using CASAVA >= 1.8 is Sanger encoded
            QSscore_scale = 'sanger'
            cmd = [
                exe_sickle,
                'pe',
                '-f',
                files[1],
                '-r',
                files[2],
                '-t',
                QSscore_scale,
                '-o',
                processed_path_1,
                '-p',
                processed_path_2,
                '-s',
                processed_path_s,
                # quality 25, length 50 (of 150)
                '-q',
                '25',
                '-l',
                '50',
            ]
            outputs = (processed_path_1, processed_path_2, processed_path_s)
            if force or not all([_os.path.exists(p) for p in outputs]):
                # collect expected outputs
                processed_paths_to_do += [outputs]
                # collect all the commands to be issued
                cmds += [(pairname, cmd)]
            else:
                print('Found:')
                print(processed_path_1)
                print(processed_path_2)
                print(processed_path_s)
                print('use "force = True" to overwrite')

                trimmed_read_files[pairname] = {
                    1: processed_path_1,
                    2: processed_path_2,
                }

        if cmds:
            max_processes = _decide_max_processes(max_cpus)

            # process is key, open file being piped to is value
            processes = {}

            for pairname, cmd in cmds:

                print('Called: "%s"' % ' '.join(cmd))
                # baga CollectReads currently includes path in pairname
                this_stdout_file = open(pairname + '_sickle.log', "w")
                try:
                    thisprocess = _subprocess.Popen(cmd,
                                                    shell=False,
                                                    stdout=this_stdout_file)
                except Exception:
                    # do not leak the log file if the launch itself failed
                    this_stdout_file.close()
                    raise
                processes[thisprocess] = this_stdout_file

                if len(processes) >= max_processes:
                    # block until any child exits
                    _os.wait()
                    finished = [p for p in processes if p.poll() is not None]
                    # close files for finished processes and update active
                    # processes
                    for process in finished:
                        processes.pop(process).close()

            # Wait for whatever is still running and close the remaining
            # log files, which would otherwise be left open
            for process, stdout_file in processes.items():
                if process.poll() is None:
                    process.wait()
                stdout_file.close()

        fails = []
        for (pairname, cmd), (processed_path_1, processed_path_2,
                              processed_path_s) in zip(cmds,
                                                       processed_paths_to_do):
            if _os.path.exists(processed_path_1) and _os.path.exists(
                    processed_path_2):
                print('Found:')
                print(processed_path_1)
                print(processed_path_2)
                trimmed_read_files[pairname] = {
                    1: processed_path_1,
                    2: processed_path_2,
                }
            else:
                print('Processing of the following pair seems to have failed')
                print(processed_path_1)
                print(processed_path_2)
                fails += [(processed_path_1, processed_path_2)]

        assert len(
            fails
        ) == 0, 'There was a problem finding all of the output from sickle. Try repeating this or an earlier step with the --force option to overwite previous, possibly incomplete, files'

        self.trimmed_read_files = trimmed_read_files
Example #6
0
    def cutAdaptors(self, path_to_exe=False, force=False, max_cpus=-1):
        '''
        Remove library preparation (adaptor) sequences with cutadapt.

        For every pair in self.read_files, cutadapt is called in paired-end
        mode to write "_adpt" versions of both read files. Pairs whose two
        outputs both exist are skipped unless force is True. Jobs run
        concurrently, each with its standard output sent to
        "<pairname>_cutadapt.log". The resulting
        {pairname: {1: path, 2: path}} dictionary is stored in
        self.adaptorcut_read_files.

        path_to_exe : path to the cutadapt executable; obtained from
                      _get_exe_path('cutadapt') if not provided.
        force       : if True, regenerate outputs that already exist.
        max_cpus    : passed to _decide_max_processes() to set how many
                      jobs may run at once.

        Raises AssertionError if any expected cutadapt output cannot be
        found afterwards.
        '''

        if not path_to_exe:
            path_to_exe = _get_exe_path('cutadapt')

        adaptorcut_read_files = {}
        adaptor_seqs = [
            'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC',
            'AGATCGGAAGAGCACACGTCT',
            'AGATCGGAAGAGC',
            'GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG',
            'ACACTCTTTCCCTACACGACGCTCTTCCGATCT',
        ]

        # every adaptor is given once with '-a' and once with '-A'; these
        # arguments are the same for all pairs so are built only once
        adaptor_args = []
        for flag in ('-a', '-A'):
            for seq in adaptor_seqs:
                adaptor_args += [flag, seq]

        cmds = []
        processed_paths_to_do = []
        for pairname, files in self.read_files.items():

            processed_path_1 = insert_suffix(files[1], '_adpt')
            processed_path_2 = insert_suffix(files[2], '_adpt')

            # paired end
            cmd = [path_to_exe] + adaptor_args + \
                  ['-o', processed_path_1, '-p', processed_path_2] + \
                  [files[1], files[2]]

            if force or not all([_os.path.exists(processed_path_1),
                                 _os.path.exists(processed_path_2)]):

                # collect expected outputs
                processed_paths_to_do += [(processed_path_1, processed_path_2)]
                # collect all the commands to be issued
                cmds += [(pairname, cmd)]

            else:
                print('Found:')
                print(processed_path_1)
                print(processed_path_2)
                print('use "force = True" to overwrite')

                adaptorcut_read_files[pairname] = {
                    1: processed_path_1,
                    2: processed_path_2,
                }

        if cmds:
            max_processes = _decide_max_processes(max_cpus)

            # process is key, open file being piped to is value
            processes = {}

            for pairname, cmd in cmds:

                print('Called: "%s"' % ' '.join(cmd))
                # baga CollectReads currently includes path in pairname
                this_stdout_file = open(pairname + '_cutadapt.log', "w")
                try:
                    thisprocess = _subprocess.Popen(cmd,
                                                    shell=False,
                                                    stdout=this_stdout_file)
                except Exception:
                    # do not leak the log file if the launch itself failed
                    this_stdout_file.close()
                    raise
                processes[thisprocess] = this_stdout_file

                if len(processes) >= max_processes:
                    # block until any child exits
                    _os.wait()
                    finished = [p for p in processes if p.poll() is not None]
                    # close files for finished processes and update active
                    # processes
                    for process in finished:
                        processes.pop(process).close()

            # Wait for whatever is still running and close the remaining
            # log files, which would otherwise be left open
            for process, stdout_file in processes.items():
                if process.poll() is None:
                    process.wait()
                stdout_file.close()

        fails = []
        for (pairname, cmd), (processed_path_1,
                              processed_path_2) in zip(cmds,
                                                       processed_paths_to_do):
            if _os.path.exists(processed_path_1) and _os.path.exists(
                    processed_path_2):
                print('Found:')
                print(processed_path_1)
                print(processed_path_2)
                adaptorcut_read_files[pairname] = {
                    1: processed_path_1,
                    2: processed_path_2,
                }
            else:
                print('Processing of the following pair seems to have failed')
                print(processed_path_1)
                print(processed_path_2)
                fails += [(processed_path_1, processed_path_2)]

        assert len(
            fails
        ) == 0, 'There was a problem finding all of the output from cutadapt. Try repeating this or an eralier step with the --force option to overwite previous, possibly incomplete, files'

        self.adaptorcut_read_files = adaptorcut_read_files
Example #7
0
    def IndelRealignGATK(self,
                         jar=[
                             'external_programs', 'GenomeAnalysisTK',
                             'GenomeAnalysisTK.jar'
                         ],
                         picard_jar=False,
                         samtools_exe=False,
                         use_java='java',
                         force=False,
                         mem_num_gigs=2,
                         max_cpus=-1):
        '''
        Realign reads around indels with GATK.

        Steps performed on the BAMs in self.paths_to_BAMs_dd_si:
          1. (re)create the Picard sequence dictionary and samtools faidx
             index for 'genome_sequences/<genome_id>.fna';
          2. run GATK RealignerTargetCreator on each BAM to write
             "<name>.intervals";
          3. run GATK IndelRealigner on each BAM with those intervals to
             write "<name>_realn.bam".
        Steps 2 and 3 run their jobs concurrently and skip outputs that
        already exist unless force is True. The list of realigned BAM paths
        is stored, wrapped in a list, in self.ready_BAMs.

        jar          : list of path components to the GATK jar (never
                       modified here).
        picard_jar   : path to the Picard jar; obtained from
                       _get_jar_path('picard') if not provided.
        samtools_exe : path to samtools; obtained from
                       _get_exe_path('samtools') if not provided.
        use_java     : java executable to call.
        force        : if True, regenerate outputs that already exist.
        mem_num_gigs : value for the java "-Xmx<N>g" option of
                       RealignerTargetCreator.
        max_cpus     : passed to _decide_max_processes() to set how many
                       jobs may run at once.

        Raises AssertionError if self.paths_to_BAMs_dd_si is missing or any
        of the files it lists does not exist.
        '''

        # GATK is manually downloaded by user and placed in folder of their choice
        jar = _os.path.sep.join(jar)

        if not picard_jar:
            picard_jar = _get_jar_path('picard')

        if not samtools_exe:
            samtools_exe = _get_exe_path('samtools')

        genome_fna = 'genome_sequences/{}.fna'.format(self.genome_id)

        e1 = 'Could not find "paths_to_BAMs_dd_si" attribute. Before starting GATK analysis, read alignments must have duplicates removed. Please run: .toBAMS(), .removeDuplicates(), .sortIndexBAMs() methods on this SAMs instance, or --deduplicate if using baga_cli.py.'

        assert hasattr(self, 'paths_to_BAMs_dd_si'), e1

        e2 = 'Could not find %s. Please ensure file exists'

        for BAM in self.paths_to_BAMs_dd_si:
            assert _os.path.exists(BAM), e2 % BAM

        # always (re)generate dict in case of upstream changes in data
        print('Creating sequence dictionary for %s' % genome_fna)
        _subprocess.call([
            use_java, '-jar', picard_jar, 'CreateSequenceDictionary', 'R=',
            genome_fna, 'O=', genome_fna[:-4] + '.dict'
        ])

        # always (re)index in case of upstream changes in data
        print('Writing index files for %s' % genome_fna)
        _subprocess.call([samtools_exe, 'faidx', genome_fna])

        max_processes = _decide_max_processes(max_cpus)

        # stage 1: find the intervals that need realigning
        processes = set()
        for BAM in self.paths_to_BAMs_dd_si:
            intervals = BAM[:-4] + '.intervals'
            if not _os.path.exists(intervals) or force:
                cmd = [
                    use_java,
                    '-Xmx%sg' % mem_num_gigs, '-jar', jar, '-T',
                    'RealignerTargetCreator', '-R', genome_fna, '-I', BAM,
                    '-o', intervals
                ]  #, '--validation_strictness', 'LENIENT']
                print(' '.join(map(str, cmd)))
                processes.add(_subprocess.Popen(cmd, shell=False))
                if len(processes) >= max_processes:
                    # block until any child exits, then drop finished jobs
                    _os.wait()
                    processes.difference_update(
                        [p for p in processes if p.poll() is not None])
            else:
                print('Found:')
                print(intervals)
                print('use "force = True" to overwrite')

        # Check if all the child processes were closed
        for p in processes:
            if p.poll() is None:
                p.wait()

        # stage 2: realign. Start from an empty set so that finished stage 1
        # jobs are not counted against max_processes, which would needlessly
        # throttle the first realignment jobs.
        processes = set()
        paths_to_BAMs_dd_si_ra = []
        for BAM in self.paths_to_BAMs_dd_si:
            intervals = BAM[:-4] + '.intervals'
            bam_out = BAM[:-4] + '_realn.bam'
            if not _os.path.exists(bam_out) or force:
                # NOTE(review): memory is fixed at 4g here and does not
                # follow mem_num_gigs -- confirm whether that is intended
                cmd = [
                    use_java, '-Xmx4g', '-jar', jar, '-T', 'IndelRealigner',
                    '-R', genome_fna, '-I', BAM, '-targetIntervals', intervals,
                    '-o', bam_out, '--filter_bases_not_stored'
                ]
                print(' '.join(map(str, cmd)))
                processes.add(_subprocess.Popen(cmd, shell=False))
                if len(processes) >= max_processes:
                    _os.wait()
                    processes.difference_update(
                        [p for p in processes if p.poll() is not None])
            else:
                print('Found:')
                print(bam_out)
                print('use "force = True" to overwrite')

            paths_to_BAMs_dd_si_ra += [bam_out]

        for p in processes:
            if p.poll() is None:
                p.wait()

        # the last list of BAMs in ready_BAMs is input for CallgVCFsGATK
        # both IndelRealignGATK and recalibBaseScoresGATK put here
        self.ready_BAMs = [paths_to_BAMs_dd_si_ra]
Example #8
0
    def generateReads(self,
                      path_to_exe=False,
                      paths_to_genomes=False,
                      readcov=60,
                      readlen=100,
                      fraglen=350,
                      sterrfraglen=20,
                      model=4,
                      max_cpus=-1):
        '''
        Call GemSIM to generate reads

        Need to have written genome sequences to generate from, possibly with 
        generated SNPs, small indels and large deletions.

        One GemSIM job is started per genome (run concurrently) unless both
        of its output fastq files already exist. When all jobs have
        finished, the "_fir"/"_sec" fastq files are moved to
        'simulated_reads/<genome id>/' with "_R1"/"_R2" in their names and
        compressed with gzip.

        path_to_exe      : path to the GemSIM read generator; obtained from
                           _get_exe_path('gemsim') if not provided. The
                           error models are expected in a 'models' folder
                           next to it.
        paths_to_genomes : genome sequence files to generate reads from;
                           self.written_genomes is used if not provided.
        readcov          : read coverage depth used to calculate the number
                           of read pairs from the length of
                           self.genome.sequence.
        readlen          : read length used in that calculation.
        fraglen          : value passed to GemSIM as '-u'.
        sterrfraglen     : value passed to GemSIM as '-s'.
        model            : 4 or 5, selecting 'ill100v4_p.gzip' or
                           'ill100v5_p.gzip'.
        max_cpus         : passed to _decide_max_processes() to set how many
                           jobs may run at once.

        Raises ValueError if no genomes are available or model is not
        supported, and AssertionError if expected GemSIM output is missing.
        '''
        import time

        if paths_to_genomes:
            use_genomes = sorted(paths_to_genomes)
        elif hasattr(self, 'written_genomes'):
            use_genomes = sorted(self.written_genomes)
        else:
            raise ValueError(
                'provide either paths_to_genomes or generate some then .writeSequences()'
            )

        if not path_to_exe:
            path_to_exe = _get_exe_path('gemsim')

        # to generate reads manually, put GemSIM v1.6 into subfolder
        # GemSIM_v1.6 and issue a command like:
        # GemSIM_v1.6/GemReads.py -r LESB58_for_GemSim_01.fasta -n 1980527 -l d -u 350 -s 20 -m GemSIM_v1.6/models/ill100v4_p.gzip -c -q 33 -p -o GemSimLESB58_01

        # whole number of pairs on both Python 2 and 3
        num_pairs = int(
            len(self.genome.sequence) * readcov // (readlen * 2))

        model_files = {4: 'ill100v4_p.gzip', 5: 'ill100v5_p.gzip'}
        if model not in model_files:
            raise ValueError(
                'model must be one of {}, not {!r}'.format(
                    sorted(model_files), model))

        path_to_model = _os.path.sep.join(
            path_to_exe.split(_os.path.sep)[:-1] +
            ['models', model_files[model]])

        print('Using error model: {}'.format(path_to_model))
        print(
            'Generating {:,} {}bp read pairs for {}x coverage depth of a {}bp genome ({})'
            .format(num_pairs, readlen, readcov, len(self.genome.sequence),
                    self.genome.id))

        processes = set()
        max_processes = _decide_max_processes(max_cpus)

        start = time.time()
        # (GemSIM output name, final name) for every expected fastq file
        outputs = []
        for i, genome_in in enumerate(use_genomes):
            # could use per genome length . . less consistent than using reference
            # genome_len = len(_SeqIO.read(genome_in,'fasta').seq)
            # num_pairs = genome_len * readcov / (readlen*2)
            outprefix = 'GemSim_{}_{:02d}'.format(self.genome.id, i + 1)
            first = outprefix + '_fir.fastq'
            second = outprefix + '_sec.fastq'
            # final names are built from the prefix rather than by text
            # replacement so a genome id containing 'fir' or 'sec' is not
            # altered
            outputs += [(first, outprefix + '_R1.fastq'),
                        (second, outprefix + '_R2.fastq')]
            # this would be better to rename and compress all in one
            # maybe as a shell script? Then resuming (--force) would be easier.
            if _os.path.exists(first) and _os.path.exists(second):
                print('Found output for {}_fir.fastq (and sec), not regenerating, '\
                'delete these to start from scratch'.format(outprefix))
                continue

            # a real list: an iterator from map() would be used up by the
            # join below and leave nothing for Popen on Python 3
            cmd = [
                str(arg) for arg in [
                    path_to_exe, '-r', genome_in, '-n', num_pairs, '-l', 'd',
                    '-u', fraglen, '-s', sterrfraglen, '-m', path_to_model,
                    '-c', '-q', 33, '-p', '-o', outprefix
                ]
            ]
            print(' '.join(cmd))
            processes.add(_subprocess.Popen(cmd, shell=False))
            if len(processes) >= max_processes:
                # block until any child exits, then drop finished jobs
                _os.wait()
                processes.difference_update(
                    [p for p in processes if p.poll() is not None])

        # Check if all the child processes were closed
        for p in processes:
            if p.poll() is None:
                p.wait()

        missing = [raw for raw, _ in outputs if not _os.path.exists(raw)]

        assert len(missing) == 0, 'Could not find:\n{}'.format(
            '\n'.join(missing))
        print('all finished after {} minutes'.format(
            int(round((time.time() - start) / 60.0))))

        outdir = _os.path.sep.join(['simulated_reads', self.genome.id])
        try:
            _os.makedirs(outdir)
        except OSError:
            # an existing folder is fine; anything else is a real problem
            if not _os.path.isdir(outdir):
                raise

        for raw, renamed in outputs:
            new = _os.path.sep.join([outdir, renamed])
            print('{} ==> {}'.format(raw, new))
            _os.rename(raw, new)
            cmd = ['gzip', new]
            print(' '.join(cmd))
            _subprocess.call(cmd)