Example #1
0
def discosnp_install():
    '''
    Compile discoSnp++ and patch its VCF helper script to call python2

    Expects the current working directory to contain compile_discoSnp++.sh
    and run_VCF_creator.sh.
    '''
    print(_os.listdir('.'))
    _subprocess.call(['bash', './compile_discoSnp++.sh'])
    # ensure VCF generation works if system default python is version 3
    # VCF code is python 2 only
    # context managers guarantee the script is closed (and the rewrite is
    # flushed to disk) without relying on garbage collection of the handles
    with open('run_VCF_creator.sh') as script:
        fixed = script.read().replace('python ', 'python2 ')
    with open('run_VCF_creator.sh', 'w') as script:
        script.write(fixed)
Example #2
0
def discosnp_install():
    '''
    Build discoSnp++ then make its VCF creator script use python2

    Must be called from the folder holding compile_discoSnp++.sh and
    run_VCF_creator.sh.
    '''
    print(_os.listdir('.'))
    _subprocess.call(['bash', './compile_discoSnp++.sh'])
    # ensure VCF generation works if system default python is version 3
    # VCF code is python 2 only
    vcf_script = 'run_VCF_creator.sh'
    # read and write through context managers so neither handle is left open
    with open(vcf_script) as handle:
        fixed = handle.read().replace('python ', 'python2 ')
    with open(vcf_script, 'w') as handle:
        handle.write(fixed)
Example #3
0
def pysam_install():
    '''
    Build pysam in place and move the finished package up one folder

    Runs "setup.py build" with the current interpreter, copies every file
    ending in .so from the build folder into the local pysam folder, then
    replaces any existing ../pysam with that folder.
    '''
    build_dir = 'build/lib.linux-x86_64-2.7/pysam'
    _subprocess.call([_sys.executable, 'setup.py', 'build'])

    shared_objects = [name for name in _os.listdir(build_dir)
                      if name.endswith('.so')]
    for name in shared_objects:
        _shutil.copy('{}/{}'.format(build_dir, name), 'pysam/{}'.format(name))

    # discard any earlier install; nothing to do if there is none
    try:
        _shutil.rmtree('../pysam')
    except OSError:
        pass

    _os.rename('pysam', '../pysam')
Example #4
0
def pysam_install():
    '''
    Compile pysam and relocate the package folder to the parent folder

    After "setup.py build" the compiled .so files are copied from the build
    tree into ./pysam, and ./pysam then takes the place of ../pysam.
    '''
    compiled_from = 'build/lib.linux-x86_64-2.7/pysam/{}'
    package_to = 'pysam/{}'

    _subprocess.call([_sys.executable, 'setup.py', 'build'])
    for built_file in _os.listdir(compiled_from.format('')[:-1]):
        if not built_file.endswith('.so'):
            continue
        _shutil.copy(compiled_from.format(built_file),
                     package_to.format(built_file))

    # an existing ../pysam is removed first; its absence is not an error
    try:
        _shutil.rmtree('../pysam')
    except OSError:
        pass

    _os.rename('pysam', '../pysam')
Example #5
0
def prep_simple_make(path=False, configure=False, alt_command=False):
    '''
    Build a dependency with "make" (or an alternative command)

    path: optional list of folder names which are joined to give the folder
        to build in
    configure: if true, call ./configure before building
    alt_command: if given, a single executable to call instead of make

    If path was given, the working directory in effect when called is
    restored before returning, including when a command cannot be started.
    '''
    start_dir = _os.getcwd()
    try:
        if path:
            _os.chdir(_os.path.sep.join(path))

        if configure:
            _subprocess.call(['./configure'])

        if alt_command:
            _subprocess.call([alt_command])
        else:
            _subprocess.call(['make'])
    finally:
        # go back to the recorded start folder instead of climbing one parent
        # per list element: that ends up in the wrong place when an element
        # spans several folders, is absolute or is a symlink
        if path:
            _os.chdir(start_dir)
Example #6
0
def prep_simple_make(path = False, configure = False, alt_command = False):
    '''
    Run an optional ./configure then make (or alt_command) in a folder

    path: optional list of folder names, joined with the path separator, to
        change into before building
    configure: call ./configure first if true
    alt_command: single executable to call in place of make if given

    When path is given the caller's working directory is put back afterwards,
    even if starting one of the commands raises.
    '''
    came_from = _os.getcwd()
    try:
        if path:
            _os.chdir(_os.path.sep.join(path))

        if configure:
            _subprocess.call(['./configure'])

        build_command = [alt_command] if alt_command else ['make']
        _subprocess.call(build_command)
    finally:
        # restoring the saved folder is reliable where counting one ".." per
        # path element is not (elements containing separators, absolute
        # paths, symlinked folders)
        if path:
            _os.chdir(came_from)
Example #7
0
def get_git(name, description, source, url, commit, checksum, destination,
            preparation, checker):
    '''
    Fetch a dependency by cloning its git repository at a fixed commit

    The repository at url is cloned inside destination (created if it cannot
    be entered), replacing any earlier clone. commit is checked out,
    submodules are initialised, then every step in preparation is called from
    within the clone. Finishes two folders above the clone. description,
    source, checksum and checker are accepted but not used here.
    '''

    here = _os.path.realpath(_os.path.curdir)
    if here != destination:
        try:
            _os.chdir(destination)
        except OSError:
            _os.makedirs(destination)
            _os.chdir(destination)

    # folder name used for the clone: last part of the url without ".git"
    repo_folder = url.split('/')[-1].replace('.git', '')

    # clear any previous versions
    try:
        _shutil.rmtree(repo_folder)
    except OSError:
        pass

    without_scheme = url.replace('https://', '').replace('http://', '')
    git_server = without_scheme.split('/')[0]
    print('Downloading {} via git from {} . . .'.format(name, git_server))
    _subprocess.call(['git', 'clone', url])
    _os.chdir(repo_folder)
    _subprocess.call(['git', 'checkout', commit])
    # if repo uses git submodules, those will be set to the correct revisions
    # for this commit, else will do nothing
    _subprocess.call(['git', 'submodule', 'update', '--init'])

    working_dir = _os.path.sep.join([destination, repo_folder])

    steps = [] if preparation is None else preparation
    for step in steps:
        step_args = step['arguments']
        if isinstance(step_args, dict):
            step['function'](**step_args)
        else:
            step['function'](*step_args)

        # restore position in path if a prepare changed it
        if _os.path.realpath(_os.path.curdir) != working_dir:
            _os.chdir(working_dir)

    # climb out of the clone and then out of destination
    for _ in range(2):
        _os.chdir(_os.path.pardir)
Example #8
0
def get_git(name, description, source, url, commit, checksum, destination, preparation, checker):
    '''
    Get a dependency from git

    Clones url inside destination (made if it cannot be entered) after
    removing any earlier clone, checks out commit, initialises submodules and
    runs each preparation step from the clone folder. Ends two folders above
    the clone. description, source, checksum and checker are not used here.
    '''

    if _os.path.realpath(_os.path.curdir) != destination:
        try:
            _os.chdir(destination)
        except OSError:
            _os.makedirs(destination)
            _os.chdir(destination)

    # name of the folder the clone lives in
    clone_name = url.split('/')[-1].replace('.git', '')

    try:
        # clear any previous versions
        _shutil.rmtree(clone_name)
    except OSError:
        pass

    host_and_path = url.replace('https://', '').replace('http://', '')
    git_server = host_and_path.split('/')[0]
    print('Downloading {} via git from {} . . .'.format(name, git_server))
    _subprocess.call(['git', 'clone', url])
    _os.chdir(clone_name)
    _subprocess.call(['git', 'checkout', commit])
    # if repo uses git submodules, those will be set to the correct revisions for this commit
    # else will do nothing
    _subprocess.call(['git', 'submodule', 'update', '--init'])

    working_dir = _os.path.sep.join([destination, clone_name])

    if preparation is not None:
        for task in preparation:
            task_args = task['arguments']
            if isinstance(task_args, dict):
                task['function'](**task_args)
            else:
                task['function'](*task_args)

            # restore position in path if a prepare changed it
            now_in = _os.path.realpath(_os.path.curdir)
            if now_in != working_dir:
                _os.chdir(working_dir)

    # leave the clone, then leave destination
    _os.chdir(_os.path.pardir)
    _os.chdir(_os.path.pardir)
Example #9
0
def prep_python_install(extras):
    '''
    Install the package in the current folder with "setup.py install"

    extras: list of additional command line arguments passed to setup.py
    '''
    print('Installing via setup.py . . .')
    # could try here?
    install_cmd = [_sys.executable, 'setup.py', 'install']
    _subprocess.call(install_cmd + extras)
Example #10
0
    def IndelRealignGATK(self, 
            jar = ['external_programs', 'GenomeAnalysisTK', 'GenomeAnalysisTK.jar'], 
            picard_jar = False, 
            samtools_exe = False,
            use_java = 'java',
            force = False,
            mem_num_gigs = 2,
            max_cpus = -1):
        '''
        Realign reads around indels with GATK for each BAM in paths_to_BAMs_dd_si

        Runs in two passes over self.paths_to_BAMs_dd_si: first
        RealignerTargetCreator writes a ".intervals" file per BAM, then
        IndelRealigner writes a "_realn.bam" per BAM. Existing outputs are
        kept unless force is true. A sequence dictionary (via picard) and a
        .fai index (via samtools) are created for the genome fasta if missing.

        jar: list of path components to the GATK jar, joined with the path separator
        picard_jar: path to the picard jar; _get_jar_path('picard') is used if not given
        samtools_exe: path to samtools; _get_exe_path('samtools') is used if not given
        use_java: java executable to call
        force: if true, regenerate outputs that already exist
        mem_num_gigs: java heap size in gigabytes for RealignerTargetCreator
        max_cpus: passed to _decide_max_processes() to limit concurrent processes

        Sets self.ready_BAMs to a list containing the list of realigned BAM paths.
        Raises AssertionError if paths_to_BAMs_dd_si is missing or a BAM does not exist.
        '''
        
        # GATK is manually downloaded by user and placed in folder of their choice
        jar = _os.path.sep.join(jar)

        if not picard_jar:
            picard_jar = _get_jar_path('picard')

        if not samtools_exe:
            samtools_exe = _get_exe_path('samtools')

        genome_fna = 'genome_sequences/{}.fna'.format(self.genome_id)

        e1 = 'Could not find "paths_to_BAMs_dd_si" attribute. Before starting GATK analysis, read alignments must have duplicates removed. Please run: .toBAMS(), .removeDuplicates(), .sortIndexBAMs() methods on this SAMs instance, or --deduplicate if using baga_cli.py.'

        assert hasattr(self, 'paths_to_BAMs_dd_si'), e1

        e2 = 'Could not find %s. Please ensure file exists'

        for BAM in self.paths_to_BAMs_dd_si:
            assert _os.path.exists(BAM), e2 % BAM

        # the dictionary sits beside the fasta with ".fna" swapped for ".dict"
        if not _os.path.exists(genome_fna[:-4] + '.dict'):
            print('Creating sequence dictionary for %s' % genome_fna)
            _subprocess.call([use_java, '-jar', picard_jar, 'CreateSequenceDictionary', 'R=', genome_fna, 'O=', genome_fna[:-4] + '.dict'])

        #have_index_files = [_os.path.exists(genome_fna + '.' + a) for a in ('ann','pac','amb','bwt','sa','fai')]
        # only the samtools .fai index is checked for here
        have_index_files = [_os.path.exists(genome_fna + '.' + a) for a in ('fai',)]

        if not all(have_index_files):
            print('Writing index files for %s' % genome_fna)
            _subprocess.call([samtools_exe, 'faidx', genome_fna])

        processes = set()
        max_processes = _decide_max_processes( max_cpus )

        # pass 1: one RealignerTargetCreator process per BAM
        for BAM in self.paths_to_BAMs_dd_si:
            # BAM[:-4] drops the last four characters (assumes a ".bam" suffix)
            intervals = BAM[:-4] + '.intervals'
            if not _os.path.exists(intervals) or force:
                cmd = [use_java, '-Xmx%sg' % mem_num_gigs, '-jar', jar, 
                        '-T', 'RealignerTargetCreator', 
                        '-R', genome_fna, 
                        '-I', BAM, 
                        '-o', intervals] #, '--validation_strictness', 'LENIENT']
                print(' '.join(map(str, cmd)))
                processes.add( _subprocess.Popen(cmd, shell=False) )
                # once the limit is reached, block until some child exits
                # then forget every process that has finished
                if len(processes) >= max_processes:
                    (pid, exit_status) = _os.wait()
                    processes.difference_update(
                        [p for p in processes if p.poll() is not None])
            else:
                print('Found:')
                print(intervals)
                print('use "force = True" to overwrite')

        # Check if all the child processes were closed
        for p in processes:
            if p.poll() is None:
                p.wait()


        # pass 2: one IndelRealigner process per BAM using the intervals above
        paths_to_BAMs_dd_si_ra = []
        for BAM in self.paths_to_BAMs_dd_si:
            intervals = BAM[:-4] + '.intervals'
            bam_out = BAM[:-4] + '_realn.bam'
            if not _os.path.exists(bam_out) or force:
                # NOTE(review): heap size is fixed at 4g here and does not use
                # mem_num_gigs - confirm this is intended
                cmd = [use_java, '-Xmx4g', '-jar', jar, 
                        '-T', 'IndelRealigner', 
                        '-R', genome_fna, 
                        '-I', BAM, 
                        '-targetIntervals', intervals, 
                        '-o', bam_out,
                        '--filter_bases_not_stored']
                print(' '.join(map(str, cmd)))
                processes.add( _subprocess.Popen(cmd, shell=False) )
                if len(processes) >= max_processes:
                    _os.wait()
                    processes.difference_update(
                        [p for p in processes if p.poll() is not None])
            else:
                print('Found:')
                print(bam_out)
                print('use "force = True" to overwrite')
            
            # recorded whether newly made or found on disk
            paths_to_BAMs_dd_si_ra += [bam_out]

        for p in processes:
            if p.poll() is None:
                p.wait()

        # the last list of BAMs in ready_BAMs is input for CallgVCFsGATK
        # both IndelRealignGATK and recalibBaseScoresGATK put here
        self.ready_BAMs = [paths_to_BAMs_dd_si_ra]
Example #11
0
    def align(self, insert_size = False, 
                    path_to_exe = False, 
                    local_alns_path = ['alignments'], 
                    force = False, 
                    max_cpus = -1):
        '''
        Align trimmed paired reads to the genome sequence with BWA MEM

        The genome is written to genome_sequences/<genome_id>.fna and indexed
        with "bwa index" if any index file is missing. Each read pair in
        self.read_files is aligned and the output written to
        <local_alns_path>/<genome_id>/<pairname>__<genome_id>.sam. An existing
        SAM file is kept unless force is true.

        insert_size: optional value for the BWA MEM -I option (a number or a
            string); if not given BWA estimates it
        path_to_exe: path to bwa; _get_exe_path('bwa') is used if not given
        local_alns_path: list of path components of the folder for alignments
        force: if true, overwrite SAM files that already exist
        max_cpus: passed to _decide_max_processes() to set the BWA thread count

        Sets self.aligned_read_files to a dict of pair name to SAM path.
        Raises AssertionError if read_files is missing or a read file does not exist.
        '''

        if not path_to_exe:
            path_to_exe = _get_exe_path('bwa')

        # write genome sequence to a fasta file
        try:
            _os.makedirs('genome_sequences')
        except OSError:
            pass

        genome_fna = 'genome_sequences/%s.fna' % self.genome_id

        _SeqIO.write(_SeqRecord(_Seq(self.genome_sequence.tostring()), id = self.genome_id), 
                    genome_fna, 
                    'fasta')

        # make folder for alignments (BAMs)
        local_alns_path = _os.path.sep.join(local_alns_path)
        if not _os.path.exists(local_alns_path):
            _os.makedirs(local_alns_path)

        # make a subdir for this genome
        local_alns_path_genome = _os.path.sep.join([
                                local_alns_path, 
                                self.genome_id])
        if not _os.path.exists(local_alns_path_genome):
            _os.makedirs(local_alns_path_genome)

        max_processes = _decide_max_processes( max_cpus )

        e1 = 'Could not find "read_files" attribute. Before aligning to genome, reads must be quality score trimmed. Please run trim() method on this Reads instance.'

        assert hasattr(self, 'read_files'), e1

        e2 = 'Could not find %s. Either run trim() again or ensure file exists'

        for pairname, files in self.read_files.items():
            assert _os.path.exists(files[1]), e2 % files[1]
            assert _os.path.exists(files[2]), e2 % files[2]

        have_index_files = [_os.path.exists(genome_fna + '.' + a) for a in ('ann','pac','amb','bwt','sa')]

        if not all(have_index_files):
            print('Writing BWA index files for %s' % genome_fna)
            _subprocess.call([path_to_exe, 'index', genome_fna])

        aligned_read_files = {}
        for pairname, files in self.read_files.items():
            RGinfo = r"@RG\tID:%s\tSM:%s\tPL:ILLUMINA" % (pairname, pairname)
            cmd = [path_to_exe, 'mem', '-t', str(max_processes), '-M', '-a']
            if insert_size:
                # every argument must be a string for both ' '.join() and
                # subprocess, so a numeric insert size is converted here
                cmd += ['-I', str(insert_size)]
            # without -I, BWA can estimate on-the-fly
            cmd += ['-R', RGinfo, genome_fna, files[1], files[2]]

            out_sam = _os.path.sep.join([local_alns_path_genome, '%s__%s.sam' % (pairname, self.genome_id)])

            if not _os.path.exists(out_sam) or force:
                print('Called: "%s"' % ' '.join(cmd))
                # BWA writes the alignment to stdout
                with open(out_sam, "wb") as out:
                    _subprocess.call(cmd, stdout = out)

            else:
                print('Found:')
                print(out_sam)
                print('use "force = True" to overwrite')

            print(' '.join(cmd))

            aligned_read_files[pairname] = out_sam

        self.aligned_read_files = aligned_read_files
Example #12
0
    def IndelRealignGATK(self,
                         jar=[
                             'external_programs', 'GenomeAnalysisTK',
                             'GenomeAnalysisTK.jar'
                         ],
                         picard_jar=False,
                         samtools_exe=False,
                         use_java='java',
                         force=False,
                         mem_num_gigs=2,
                         max_cpus=-1):
        '''
        Realign reads around indels with GATK for each BAM in paths_to_BAMs_dd_si

        The genome fasta's sequence dictionary (picard) and index (samtools
        faidx) are always regenerated. Two passes are then made over
        self.paths_to_BAMs_dd_si: RealignerTargetCreator writes a ".intervals"
        file per BAM, then IndelRealigner writes a "_realn.bam" per BAM.
        Existing intervals and realigned BAMs are kept unless force is true.

        jar: list of path components to the GATK jar, joined with the path separator
        picard_jar: path to the picard jar; _get_jar_path('picard') is used if not given
        samtools_exe: path to samtools; _get_exe_path('samtools') is used if not given
        use_java: java executable to call
        force: if true, regenerate intervals and realigned BAMs that already exist
        mem_num_gigs: java heap size in gigabytes for RealignerTargetCreator
        max_cpus: passed to _decide_max_processes() to limit concurrent processes

        Sets self.ready_BAMs to a list containing the list of realigned BAM paths.
        Raises AssertionError if paths_to_BAMs_dd_si is missing or a BAM does not exist.
        '''

        # GATK is manually downloaded by user and placed in folder of their choice
        jar = _os.path.sep.join(jar)

        if not picard_jar:
            picard_jar = _get_jar_path('picard')

        if not samtools_exe:
            samtools_exe = _get_exe_path('samtools')

        genome_fna = 'genome_sequences/{}.fna'.format(self.genome_id)

        e1 = 'Could not find "paths_to_BAMs_dd_si" attribute. Before starting GATK analysis, read alignments must have duplicates removed. Please run: .toBAMS(), .removeDuplicates(), .sortIndexBAMs() methods on this SAMs instance, or --deduplicate if using baga_cli.py.'

        assert hasattr(self, 'paths_to_BAMs_dd_si'), e1

        e2 = 'Could not find %s. Please ensure file exists'

        for BAM in self.paths_to_BAMs_dd_si:
            assert _os.path.exists(BAM), e2 % BAM

        # always (re)generate dict in case of upstream changes in data
        # the dictionary sits beside the fasta with ".fna" swapped for ".dict"
        print('Creating sequence dictionary for %s' % genome_fna)
        _subprocess.call([
            use_java, '-jar', picard_jar, 'CreateSequenceDictionary', 'R=',
            genome_fna, 'O=', genome_fna[:-4] + '.dict'
        ])

        # always (re)index in case of upstream changes in data
        print('Writing index files for %s' % genome_fna)
        _subprocess.call([samtools_exe, 'faidx', genome_fna])

        processes = set()
        max_processes = _decide_max_processes(max_cpus)

        # pass 1: one RealignerTargetCreator process per BAM
        for BAM in self.paths_to_BAMs_dd_si:
            # BAM[:-4] drops the last four characters (assumes a ".bam" suffix)
            intervals = BAM[:-4] + '.intervals'
            if not _os.path.exists(intervals) or force:
                cmd = [
                    use_java,
                    '-Xmx%sg' % mem_num_gigs, '-jar', jar, '-T',
                    'RealignerTargetCreator', '-R', genome_fna, '-I', BAM,
                    '-o', intervals
                ]  #, '--validation_strictness', 'LENIENT']
                print(' '.join(map(str, cmd)))
                processes.add(_subprocess.Popen(cmd, shell=False))
                # once the limit is reached, block until some child exits
                # then forget every process that has finished
                if len(processes) >= max_processes:
                    (pid, exit_status) = _os.wait()
                    processes.difference_update(
                        [p for p in processes if p.poll() is not None])
            else:
                print('Found:')
                print(intervals)
                print('use "force = True" to overwrite')

        # Check if all the child processes were closed
        for p in processes:
            if p.poll() is None:
                p.wait()

        # pass 2: one IndelRealigner process per BAM using the intervals above
        paths_to_BAMs_dd_si_ra = []
        for BAM in self.paths_to_BAMs_dd_si:
            intervals = BAM[:-4] + '.intervals'
            bam_out = BAM[:-4] + '_realn.bam'
            if not _os.path.exists(bam_out) or force:
                # NOTE(review): heap size is fixed at 4g here and does not use
                # mem_num_gigs - confirm this is intended
                cmd = [
                    use_java, '-Xmx4g', '-jar', jar, '-T', 'IndelRealigner',
                    '-R', genome_fna, '-I', BAM, '-targetIntervals', intervals,
                    '-o', bam_out, '--filter_bases_not_stored'
                ]
                print(' '.join(map(str, cmd)))
                processes.add(_subprocess.Popen(cmd, shell=False))
                if len(processes) >= max_processes:
                    _os.wait()
                    processes.difference_update(
                        [p for p in processes if p.poll() is not None])
            else:
                print('Found:')
                print(bam_out)
                print('use "force = True" to overwrite')

            # recorded whether newly made or found on disk
            paths_to_BAMs_dd_si_ra += [bam_out]

        for p in processes:
            if p.poll() is None:
                p.wait()

        # the last list of BAMs in ready_BAMs is input for CallgVCFsGATK
        # both IndelRealignGATK and recalibBaseScoresGATK put here
        self.ready_BAMs = [paths_to_BAMs_dd_si_ra]
Example #13
0
    def align(self,
              insert_size=False,
              path_to_exe=False,
              local_alns_path=['alignments'],
              force=False,
              max_cpus=-1):
        '''
        Align trimmed paired reads to the genome sequence with BWA MEM

        The genome is written to genome_sequences/<genome_id>.fna and always
        re-indexed with "bwa index". Each read pair in self.read_files is
        aligned and the output written to
        <local_alns_path>/<genome_id>/<pairname>__<genome_id>.sam. An existing
        SAM file is kept unless force is true.

        insert_size: optional value for the BWA MEM -I option (a number or a
            string); if not given BWA estimates it
        path_to_exe: path to bwa; _get_exe_path('bwa') is used if not given
        local_alns_path: list of path components of the folder for alignments
        force: if true, overwrite SAM files that already exist
        max_cpus: passed to _decide_max_processes() to set the BWA thread count

        Sets self.aligned_read_files to a dict of pair name to SAM path.
        Raises AssertionError if read_files is missing or a read file does not exist.
        '''

        if not path_to_exe:
            path_to_exe = _get_exe_path('bwa')

        # write genome sequence to a fasta file
        try:
            _os.makedirs('genome_sequences')
        except OSError:
            pass

        genome_fna = 'genome_sequences/%s.fna' % self.genome_id

        _SeqIO.write(
            _SeqRecord(_Seq(self.genome_sequence.tostring()),
                       id=self.genome_id), genome_fna, 'fasta')

        # make folder for alignments (BAMs)
        local_alns_path = _os.path.sep.join(local_alns_path)
        if not _os.path.exists(local_alns_path):
            _os.makedirs(local_alns_path)

        # make a subdir for this genome
        local_alns_path_genome = _os.path.sep.join(
            [local_alns_path, self.genome_id])
        if not _os.path.exists(local_alns_path_genome):
            _os.makedirs(local_alns_path_genome)

        max_processes = _decide_max_processes(max_cpus)

        e1 = 'Could not find "read_files" attribute. Before aligning to genome, reads must be quality score trimmed. Please run trim() method on this Reads instance.'

        assert hasattr(self, 'read_files'), e1

        e2 = 'Could not find %s. Either run trim() again or ensure file exists'

        for pairname, files in self.read_files.items():
            assert _os.path.exists(files[1]), e2 % files[1]
            assert _os.path.exists(files[2]), e2 % files[2]

        # always (re)index in case of upstream changes in data
        print('Writing BWA index files for %s' % genome_fna)
        _subprocess.call([path_to_exe, 'index', genome_fna])

        aligned_read_files = {}
        for pairname, files in self.read_files.items():
            RGinfo = r"@RG\tID:%s\tSM:%s\tPL:ILLUMINA" % (pairname, pairname)
            cmd = [path_to_exe, 'mem', '-t', str(max_processes), '-M', '-a']
            if insert_size:
                # every argument must be a string for both ' '.join() and
                # subprocess, so a numeric insert size is converted here
                cmd += ['-I', str(insert_size)]
            # without -I, BWA can estimate on-the-fly
            cmd += ['-R', RGinfo, genome_fna, files[1], files[2]]

            out_sam = _os.path.sep.join([
                local_alns_path_genome,
                '%s__%s.sam' % (pairname, self.genome_id)
            ])

            if not _os.path.exists(out_sam) or force:
                print('Called: "%s"' % ' '.join(cmd))
                # BWA writes the alignment to stdout
                with open(out_sam, "wb") as out:
                    _subprocess.call(cmd, stdout=out)

            else:
                print('Found:')
                print(out_sam)
                print('use "force = True" to overwrite')

            print(' '.join(cmd))

            aligned_read_files[pairname] = out_sam

        self.aligned_read_files = aligned_read_files
Example #14
0
    def generateReads(self, path_to_exe = False, 
                            paths_to_genomes = False,
                            readcov = 60,
                            readlen = 100,
                            fraglen = 350,
                            sterrfraglen = 20,
                            model = 4,
                            max_cpus = -1):
        '''
        Call GemSIM to generate reads

        Need to have written genome sequences to generate from, possibly with 
        generated SNPs, small indels and large deletions.

        path_to_exe: path to the GemSIM read generator; _get_exe_path('gemsim')
            is used if not given
        paths_to_genomes: fasta files to generate reads from; defaults to
            self.written_genomes
        readcov: target coverage depth
        readlen: read length in bp
        fraglen: value for the GemSIM -u option
        sterrfraglen: value for the GemSIM -s option
        model: 4 or 5, selecting ill100v4_p.gzip or ill100v5_p.gzip from the
            "models" folder beside the executable
        max_cpus: passed to _decide_max_processes() to limit concurrent processes

        Output fastq files are moved to simulated_reads/<genome id>/ with
        "fir"/"sec" renamed to "R1"/"R2" and are gzipped.

        Raises ValueError if there are no genomes to use or model is not 4 or 5,
        AssertionError if an expected output file was not produced.
        '''

        #max_cpus etc

        if paths_to_genomes:
            use_genomes = sorted(paths_to_genomes)
        elif hasattr(self, 'written_genomes'):
            use_genomes = sorted(self.written_genomes)
        else:
            raise ValueError('provide either paths_to_genomes or generate some then .writeSequences()')

        if not path_to_exe:
            path_to_exe = _get_exe_path('gemsim')

        # to generate reads manually put GemSIM v1.6 into subfolder GemSIM_v1.6
        # and issue commands like:
        # GemSIM_v1.6/GemReads.py -r LESB58_for_GemSim_01.fasta -n 1980527 -l d -u 350 -s 20 -m GemSIM_v1.6/models/ill100v4_p.gzip -c -q 33 -p -o GemSimLESB58_01

        # floor division then int() keeps this a whole number under Python 3
        # true division and when readcov is a float
        num_pairs = int(len(self.genome.sequence) * readcov // (readlen*2))

        if model == 4:
            model_file = 'ill100v4_p.gzip'
        elif model == 5:
            model_file = 'ill100v5_p.gzip'
        else:
            # previously fell through and failed later with an unbound name
            raise ValueError('model must be 4 or 5, got {!r}'.format(model))

        path_to_model = _os.path.sep.join(path_to_exe.split(_os.path.sep)[:-1] + ['models', model_file])

        print('Using error model: {}'.format(path_to_model))
        print('Generating {:,} {}bp read pairs for {}x coverage depth of a {}bp genome ({})'.format(
                num_pairs, readlen, readcov, len(self.genome.sequence), self.genome.id))

        processes = set()
        max_processes = _decide_max_processes( max_cpus )

        import time
        start = time.time()
        out_raw = []
        for i,genome_in in enumerate(use_genomes):
            # could use per genome length . . less consistent than using reference
            # genome_len = len(_SeqIO.read(genome_in,'fasta').seq)
            # num_pairs = genome_len * readcov / (readlen*2)
            outprefix = 'GemSim_{}_{:02d}'.format(self.genome.id, i+1)
            cmd = [path_to_exe, 
                        '-r', genome_in,
                        '-n', num_pairs, 
                        '-l', 'd', '-u', fraglen, '-s', sterrfraglen, 
                        '-m', path_to_model, 
                        '-c', 
                        '-q', 33, '-p',
                        '-o', outprefix]
            out_raw += [outprefix+'_fir.fastq', outprefix+'_sec.fastq']
            # this would be better to rename and compress all in one
            # maybe as a shell script? Then resuming (--force) would be easier.
            if _os.path.exists(outprefix+'_fir.fastq') and \
                    _os.path.exists(outprefix+'_sec.fastq'):
                print('Found output for {}_fir.fastq (and sec), not regenerating, '\
                'delete these to start from scratch'.format(outprefix))
            else:
                # a real list: a Python 3 map() iterator would be used up by
                # the join below and leave nothing for Popen
                cmd = [str(part) for part in cmd]
                print(' '.join(cmd))
                processes.add( _subprocess.Popen(cmd, shell=False) )
            # once the limit is reached, block until some child exits then
            # forget every process that has finished
            if len(processes) >= max_processes:
                (pid, exit_status) = _os.wait()
                processes.difference_update(
                    [p for p in processes if p.poll() is not None])

        # Check if all the child processes were closed
        for p in processes:
            if p.poll() is None:
                p.wait()

        missing = [o for o in out_raw if not _os.path.exists(o)]

        assert len(missing) == 0, 'Could not find:\n{}'.format('\n'.join(missing))
        print('all finished after {} minutes'.format(int(round((time.time() - start)/60.0))))

        outdir = _os.path.sep.join(['simulated_reads',self.genome.id])
        try:
            _os.makedirs(outdir)
        except OSError:
            pass

        for o in out_raw:
            new = _os.path.sep.join([outdir, o.replace('fir','R1').replace('sec','R2')])
            print('{} ==> {}'.format(o, new))
            _os.rename(o, new)
            cmd = ['gzip', new]
            print(' '.join(cmd))
            _subprocess.call(cmd)
Example #15
0
def prep_python_install(extras):
    '''
    Run "setup.py install" in the current folder using this interpreter

    extras: list of further arguments appended to the install command
    '''
    print('Installing via setup.py . . .')
    # could try here?
    base_command = [_sys.executable, 'setup.py', 'install']
    full_command = base_command + extras
    _subprocess.call(full_command)
Example #16
0
    def generateReads(self,
                      path_to_exe=False,
                      paths_to_genomes=False,
                      readcov=60,
                      readlen=100,
                      fraglen=350,
                      sterrfraglen=20,
                      model=4,
                      max_cpus=-1):
        '''
        Call GemSIM to generate reads

        Need to have written genome sequences to generate from, possibly with 
        generated SNPs, small indels and large deletions.

        path_to_exe: path to the GemSIM read generator; _get_exe_path('gemsim')
            is used if not given
        paths_to_genomes: fasta files to generate reads from; defaults to
            self.written_genomes
        readcov: target coverage depth
        readlen: read length in bp
        fraglen: value for the GemSIM -u option
        sterrfraglen: value for the GemSIM -s option
        model: 4 or 5, selecting ill100v4_p.gzip or ill100v5_p.gzip from the
            "models" folder beside the executable
        max_cpus: passed to _decide_max_processes() to limit concurrent processes

        Output fastq files are moved to simulated_reads/<genome id>/ with
        "fir"/"sec" renamed to "R1"/"R2" and are gzipped.

        Raises ValueError if there are no genomes to use or model is not 4 or 5,
        AssertionError if an expected output file was not produced.
        '''

        #max_cpus etc

        if paths_to_genomes:
            use_genomes = sorted(paths_to_genomes)
        elif hasattr(self, 'written_genomes'):
            use_genomes = sorted(self.written_genomes)
        else:
            raise ValueError(
                'provide either paths_to_genomes or generate some then .writeSequences()'
            )

        if not path_to_exe:
            path_to_exe = _get_exe_path('gemsim')

        # to generate reads manually put GemSIM v1.6 into subfolder GemSIM_v1.6
        # and issue commands like:
        # GemSIM_v1.6/GemReads.py -r LESB58_for_GemSim_01.fasta -n 1980527 -l d -u 350 -s 20 -m GemSIM_v1.6/models/ill100v4_p.gzip -c -q 33 -p -o GemSimLESB58_01

        # floor division then int() keeps this a whole number under Python 3
        # true division and when readcov is a float
        num_pairs = int(len(self.genome.sequence) * readcov // (readlen * 2))

        if model == 4:
            model_file = 'ill100v4_p.gzip'
        elif model == 5:
            model_file = 'ill100v5_p.gzip'
        else:
            # previously fell through and failed later with an unbound name
            raise ValueError('model must be 4 or 5, got {!r}'.format(model))

        path_to_model = _os.path.sep.join(
            path_to_exe.split(_os.path.sep)[:-1] + ['models', model_file])

        print('Using error model: {}'.format(path_to_model))
        print(
            'Generating {:,} {}bp read pairs for {}x coverage depth of a {}bp genome ({})'
            .format(num_pairs, readlen, readcov, len(self.genome.sequence),
                    self.genome.id))

        processes = set()
        max_processes = _decide_max_processes(max_cpus)

        import time
        start = time.time()
        out_raw = []
        for i, genome_in in enumerate(use_genomes):
            # could use per genome length . . less consistent than using reference
            # genome_len = len(_SeqIO.read(genome_in,'fasta').seq)
            # num_pairs = genome_len * readcov / (readlen*2)
            outprefix = 'GemSim_{}_{:02d}'.format(self.genome.id, i + 1)
            cmd = [
                path_to_exe, '-r', genome_in, '-n', num_pairs, '-l', 'd', '-u',
                fraglen, '-s', sterrfraglen, '-m', path_to_model, '-c', '-q',
                33, '-p', '-o', outprefix
            ]
            out_raw += [outprefix + '_fir.fastq', outprefix + '_sec.fastq']
            # this would be better to rename and compress all in one
            # maybe as a shell script? Then resuming (--force) would be easier.
            if _os.path.exists(outprefix+'_fir.fastq') and \
                    _os.path.exists(outprefix+'_sec.fastq'):
                print('Found output for {}_fir.fastq (and sec), not regenerating, '\
                'delete these to start from scratch'.format(outprefix))
            else:
                # a real list: a Python 3 map() iterator would be used up by
                # the join below and leave nothing for Popen
                cmd = [str(part) for part in cmd]
                print(' '.join(cmd))
                processes.add(_subprocess.Popen(cmd, shell=False))
            # once the limit is reached, block until some child exits then
            # forget every process that has finished
            if len(processes) >= max_processes:
                (pid, exit_status) = _os.wait()
                processes.difference_update(
                    [p for p in processes if p.poll() is not None])

        # Check if all the child processes were closed
        for p in processes:
            if p.poll() is None:
                p.wait()

        missing = [o for o in out_raw if not _os.path.exists(o)]

        assert len(missing) == 0, 'Could not find:\n{}'.format(
            '\n'.join(missing))
        print('all finished after {} minutes'.format(
            int(round((time.time() - start) / 60.0))))

        outdir = _os.path.sep.join(['simulated_reads', self.genome.id])
        try:
            _os.makedirs(outdir)
        except OSError:
            pass

        for o in out_raw:
            new = _os.path.sep.join(
                [outdir, o.replace('fir', 'R1').replace('sec', 'R2')])
            print('{} ==> {}'.format(o, new))
            _os.rename(o, new)
            cmd = ['gzip', new]
            print(' '.join(cmd))
            _subprocess.call(cmd)