Example #1
0
def pysam_install():
    _subprocess.call([_sys.executable, 'setup.py', 'build'])
    for f in _os.listdir('build/lib.linux-x86_64-2.7/pysam'):
        if f[-3:] == '.so':
            #print(f)
            _shutil.copy('build/lib.linux-x86_64-2.7/pysam/{}'.format(f), 'pysam/{}'.format(f))
    
    try:
        _shutil.rmtree('../pysam')
    except OSError:
        pass
    
    _os.rename('pysam', '../pysam')
Example #2
0
def pysam_install():
    _subprocess.call([_sys.executable, 'setup.py', 'build'])
    for f in _os.listdir('build/lib.linux-x86_64-2.7/pysam'):
        if f[-3:] == '.so':
            #print(f)
            _shutil.copy('build/lib.linux-x86_64-2.7/pysam/{}'.format(f),
                         'pysam/{}'.format(f))

    try:
        _shutil.rmtree('../pysam')
    except OSError:
        pass

    _os.rename('pysam', '../pysam')
Example #3
0
    def generateReads(self, path_to_exe = False, 
                            paths_to_genomes = False,
                            readcov = 60,
                            readlen = 100,
                            fraglen = 350,
                            sterrfraglen = 20,
                            model = 4,
                            max_cpus = -1):
        '''
        Call GemSIM to generate reads

        Need to have written genome sequences to generate from, possibly with 
        generated SNPs, small indels and large deletions.
        '''

        #max_cpus etc

        if paths_to_genomes:
            use_genomes = sorted(paths_to_genomes)
        elif hasattr(self, 'written_genomes'):
            use_genomes = sorted(self.written_genomes)
        else:
            raise ValueError('provide either paths_to_genomes or generate some then .writeSequences()')

        if not path_to_exe:
            path_to_exe = _get_exe_path('gemsim')

        comment2 = '''
        to generate reads put GemSIM v1.6 into subfolder GemSIM_v1.6 and issue these commands:
        GemSIM_v1.6/GemReads.py -r LESB58_for_GemSim_01.fasta -n 1980527 -l d -u 350 -s 20 -m GemSIM_v1.6/models/ill100v4_p.gzip -c -q 33 -p -o GemSimLESB58_01
        '''

        num_pairs = len(self.genome.sequence) * readcov / (readlen*2)

        if model == 4:
            path_to_model = _os.path.sep.join(path_to_exe.split(_os.path.sep)[:-1] + ['models','ill100v4_p.gzip'])
        elif model == 5:
            path_to_model = _os.path.sep.join(path_to_exe.split(_os.path.sep)[:-1] + ['models','ill100v5_p.gzip'])

        print('Using error model: {}'.format(path_to_model))
        print('Generating {:,} {}bp read pairs for {}x coverage depth of a {}bp genome ({})'.format(
                num_pairs, readlen, readcov, len(self.genome.sequence), self.genome.id))

        processes = set()
        max_processes = _decide_max_processes( max_cpus )

        import time
        start = time.time()
        out_raw = []
        for i,genome_in in enumerate(use_genomes):
            # could use per genome length . . less consistent than using reference
            # genome_len = len(_SeqIO.read(genome_in,'fasta').seq)
            # num_pairs = genome_len * readcov / (readlen*2)
            outprefix = 'GemSim_{}_{:02d}'.format(self.genome.id, i+1)
            cmd = [path_to_exe, 
                        '-r', genome_in,
                        '-n', num_pairs, 
                        '-l', 'd', '-u', fraglen, '-s', sterrfraglen, 
                        '-m', path_to_model, 
                        '-c', 
                        '-q', 33, '-p',
                        '-o', outprefix]
            out_raw += [outprefix+'_fir.fastq', outprefix+'_sec.fastq']
            # this would be better to rename and compress all in one
            # maybe as a shell script? Then resuming (--force) would be easier.
            if _os.path.exists(outprefix+'_fir.fastq') and \
                    _os.path.exists(outprefix+'_sec.fastq'):
                print('Found output for {}_fir.fastq (and sec), not regenerating, '\
                'delete these to start from scratch'.format(outprefix))
            else:
                cmd = map(str,cmd)
                print(' '.join(cmd))
                processes.add( _subprocess.Popen(cmd, shell=False) )
            if len(processes) >= max_processes:
                (pid, exit_status) = _os.wait()
                processes.difference_update(
                    [p for p in processes if p.poll() is not None])
            

        # Check if all the child processes were closed
        for p in processes:
            if p.poll() is None:
                p.wait()

        missing = []
        for o in out_raw:
            if not _os.path.exists(o):
                missing += [o]

        assert len(missing) == 0, 'Could not find:\n{}'.format('\n'.join(missing))
        print('all finished after {} minutes'.format(int(round((time.time() - start)/60.0))))

        outdir = _os.path.sep.join(['simulated_reads',self.genome.id])
        try:
            _os.makedirs(outdir)
        except OSError:
            pass

        for o in out_raw:
            new = _os.path.sep.join([outdir, o.replace('fir','R1').replace('sec','R2')])
            print('{} ==> {}'.format(o, new))
            _os.rename(o, new)
            cmd = ['gzip', new]
            print(' '.join(cmd))
            _subprocess.call(cmd)
Example #4
0
    def generateReads(self,
                      path_to_exe=False,
                      paths_to_genomes=False,
                      readcov=60,
                      readlen=100,
                      fraglen=350,
                      sterrfraglen=20,
                      model=4,
                      max_cpus=-1):
        '''
        Call GemSIM to generate reads

        Need to have written genome sequences to generate from, possibly with 
        generated SNPs, small indels and large deletions.
        '''

        #max_cpus etc

        if paths_to_genomes:
            use_genomes = sorted(paths_to_genomes)
        elif hasattr(self, 'written_genomes'):
            use_genomes = sorted(self.written_genomes)
        else:
            raise ValueError(
                'provide either paths_to_genomes or generate some then .writeSequences()'
            )

        if not path_to_exe:
            path_to_exe = _get_exe_path('gemsim')

        comment2 = '''
        to generate reads put GemSIM v1.6 into subfolder GemSIM_v1.6 and issue these commands:
        GemSIM_v1.6/GemReads.py -r LESB58_for_GemSim_01.fasta -n 1980527 -l d -u 350 -s 20 -m GemSIM_v1.6/models/ill100v4_p.gzip -c -q 33 -p -o GemSimLESB58_01
        '''

        num_pairs = len(self.genome.sequence) * readcov / (readlen * 2)

        if model == 4:
            path_to_model = _os.path.sep.join(
                path_to_exe.split(_os.path.sep)[:-1] +
                ['models', 'ill100v4_p.gzip'])
        elif model == 5:
            path_to_model = _os.path.sep.join(
                path_to_exe.split(_os.path.sep)[:-1] +
                ['models', 'ill100v5_p.gzip'])

        print('Using error model: {}'.format(path_to_model))
        print(
            'Generating {:,} {}bp read pairs for {}x coverage depth of a {}bp genome ({})'
            .format(num_pairs, readlen, readcov, len(self.genome.sequence),
                    self.genome.id))

        processes = set()
        max_processes = _decide_max_processes(max_cpus)

        import time
        start = time.time()
        out_raw = []
        for i, genome_in in enumerate(use_genomes):
            # could use per genome length . . less consistent than using reference
            # genome_len = len(_SeqIO.read(genome_in,'fasta').seq)
            # num_pairs = genome_len * readcov / (readlen*2)
            outprefix = 'GemSim_{}_{:02d}'.format(self.genome.id, i + 1)
            cmd = [
                path_to_exe, '-r', genome_in, '-n', num_pairs, '-l', 'd', '-u',
                fraglen, '-s', sterrfraglen, '-m', path_to_model, '-c', '-q',
                33, '-p', '-o', outprefix
            ]
            out_raw += [outprefix + '_fir.fastq', outprefix + '_sec.fastq']
            # this would be better to rename and compress all in one
            # maybe as a shell script? Then resuming (--force) would be easier.
            if _os.path.exists(outprefix+'_fir.fastq') and \
                    _os.path.exists(outprefix+'_sec.fastq'):
                print('Found output for {}_fir.fastq (and sec), not regenerating, '\
                'delete these to start from scratch'.format(outprefix))
            else:
                cmd = map(str, cmd)
                print(' '.join(cmd))
                processes.add(_subprocess.Popen(cmd, shell=False))
            if len(processes) >= max_processes:
                (pid, exit_status) = _os.wait()
                processes.difference_update(
                    [p for p in processes if p.poll() is not None])

        # Check if all the child processes were closed
        for p in processes:
            if p.poll() is None:
                p.wait()

        missing = []
        for o in out_raw:
            if not _os.path.exists(o):
                missing += [o]

        assert len(missing) == 0, 'Could not find:\n{}'.format(
            '\n'.join(missing))
        print('all finished after {} minutes'.format(
            int(round((time.time() - start) / 60.0))))

        outdir = _os.path.sep.join(['simulated_reads', self.genome.id])
        try:
            _os.makedirs(outdir)
        except OSError:
            pass

        for o in out_raw:
            new = _os.path.sep.join(
                [outdir, o.replace('fir', 'R1').replace('sec', 'R2')])
            print('{} ==> {}'.format(o, new))
            _os.rename(o, new)
            cmd = ['gzip', new]
            print(' '.join(cmd))
            _subprocess.call(cmd)