def __getitem__(self, sl):
    '''
    Return a shallow copy of this record restricted to slice sl.

    The sequence itself is sliced directly. Feature slicing is delegated to
    _SeqRecord.__getitem__ on a same-length placeholder record, which keeps
    only the features that fall inside the slice.
    '''
    sliced = _copy.copy(self)
    sliced.seq = sliced.seq[sl]
    # answer.seq.alphabet = self.seq.alphabet
    # dummy record of equal length, used purely to reuse _SeqRecord's
    # feature-slicing logic
    placeholder = _SeqRecord("n" * len(self))
    placeholder.features = self.features
    sliced.features = placeholder[sl].features
    return sliced
def align(self, insert_size = False, path_to_exe = False,
        local_alns_path = ['alignments'], force = False, max_cpus = -1):
    '''
    Align quality-trimmed reads to this genome with BWA-MEM, producing one
    SAM file per read pair.

    insert_size: mean insert size passed to BWA via -I; if False, BWA
        estimates it on-the-fly.
    path_to_exe: path to the bwa executable; located automatically if False.
    local_alns_path: list of path components for the alignments folder.
    force: if True, redo alignments even when the output SAM already exists.
    max_cpus: cap on BWA threads; -1 delegates to _decide_max_processes.

    Requires trim() to have been run first (provides self.read_files).
    Sets self.aligned_read_files mapping pair name -> SAM path.
    '''
    if not path_to_exe:
        path_to_exe = _get_exe_path('bwa')

    # write genome sequence to a fasta file
    try:
        _os.makedirs('genome_sequences')
    except OSError:
        # folder already exists
        pass

    genome_fna = 'genome_sequences/%s.fna' % self.genome_id
    _SeqIO.write(_SeqRecord(_Seq(self.genome_sequence.tostring()),
            id = self.genome_id), genome_fna, 'fasta')

    # make folder for alignments (BAMs); the default list is only joined,
    # never mutated, so the mutable default is harmless here
    local_alns_path = _os.path.sep.join(local_alns_path)
    if not _os.path.exists(local_alns_path):
        _os.makedirs(local_alns_path)

    # make a subdir for this genome
    local_alns_path_genome = _os.path.sep.join([
            local_alns_path, self.genome_id])
    if not _os.path.exists(local_alns_path_genome):
        _os.makedirs(local_alns_path_genome)

    max_processes = _decide_max_processes( max_cpus )

    e1 = 'Could not find "read_files" attribute. Before aligning to genome, reads must be quality score trimmed. Please run trim() method on this Reads instance.'
    assert hasattr(self, 'read_files'), e1
    e2 = 'Could not find %s. Either run trim() again or ensure file exists'
    for pairname, files in self.read_files.items():
        assert _os.path.exists(files[1]), e2 % files[1]
        assert _os.path.exists(files[2]), e2 % files[2]

    # only (re)build the BWA index when any of its files is missing
    have_index_files = [_os.path.exists(genome_fna + '.' + a)
            for a in ('ann', 'pac', 'amb', 'bwt', 'sa')]
    if not all(have_index_files):
        print('Writing BWA index files for %s' % genome_fna)
        _subprocess.call([path_to_exe, 'index', genome_fna])

    aligned_read_files = {}
    for pairname, files in self.read_files.items():
        RGinfo = r"@RG\tID:%s\tSM:%s\tPL:ILLUMINA" % (pairname, pairname)
        if insert_size:
            # BUGFIX: subprocess argv entries must be strings; insert_size
            # is typically supplied as a number, so convert explicitly
            cmd = [path_to_exe, 'mem', '-t', str(max_processes), '-M', '-a',
                    '-I', str(insert_size), '-R', RGinfo, genome_fna,
                    files[1], files[2]]
        else:
            # BWA can estimate insert size on-the-fly
            cmd = [path_to_exe, 'mem', '-t', str(max_processes), '-M', '-a',
                    '-R', RGinfo, genome_fna, files[1], files[2]]

        out_sam = _os.path.sep.join([local_alns_path_genome,
                '%s__%s.sam' % (pairname, self.genome_id)])

        if not _os.path.exists(out_sam) or force:
            print('Called: "%s"' % ' '.join(cmd))
            with open(out_sam, "wb") as out:
                _subprocess.call(cmd, stdout = out)
        else:
            print('Found:')
            print(out_sam)
            print('use "force = True" to overwrite')
            print(' '.join(cmd))

        aligned_read_files[pairname] = out_sam

    self.aligned_read_files = aligned_read_files
def generateSequences(self):
    '''
    Create full length sequences with generated variants applied to the
    reference sequence.

    Generated variants are saved to a pickle file
    ("baga.GemSIM_known_variants.p"). If large deletions are present,
    variant positions are appropriately corrected when applied.
    Sets self.genotypes to a list of _SeqRecord genotype sequences.
    '''
    save_these = {}
    save_these['SNPs'] = self.SNPs_per_genome
    save_these['InDels'] = self.indel_dict_by_pos_pergenome
    if len(self.large_deletions):
        # generate a version of reference genome with large deletions:
        # keep everything before the first deletion . . .
        ranges = sorted(self.large_deletions.values())
        genome_large_deletions = self.genome.sequence[:ranges[0][0]]
        # . . . the sequence between consecutive deletions . . .
        for (s, e), (next_s, next_e) in zip(ranges[:-1], ranges[1:]):
            genome_large_deletions.extend(self.genome.sequence[e:next_s])
        # . . . and everything after the final deletion.
        # BUGFIX: the original indexed ranges[n+1] after the loop, which
        # raised NameError when exactly one large deletion was present
        # (the loop over ranges[:-1] never ran, so n was unbound).
        genome_large_deletions.extend(self.genome.sequence[ranges[-1][1]:])
        # adjust generated variant positions for genome with deletions:
        # each position shifts left by the total deleted length before it
        def adjust(pos0):
            offset = 0
            for s, e in ranges:
                if pos0 > e:
                    offset += (e - s)
            return(pos0 - offset)
        # adjust the second half of the generated variants (the first
        # num_individuals genomes use the unmodified reference)
        SNPs_per_genome_adjusted = self.SNPs_per_genome[:self.num_individuals]
        for SNPs in self.SNPs_per_genome[self.num_individuals:]:
            adjusted = []
            for pos0, variant in SNPs:
                adjusted += [(adjust(pos0), variant)]
            SNPs_per_genome_adjusted += [adjusted]
        indel_dict_by_pos_pergenome_adjusted = \
                self.indel_dict_by_pos_pergenome[:self.num_individuals]
        for indels in self.indel_dict_by_pos_pergenome[self.num_individuals:]:
            adjusted = {}
            for pos0, indel in indels.items():
                adjusted[adjust(pos0)] = indel
            indel_dict_by_pos_pergenome_adjusted += [adjusted]
        save_these['SNPs_adjusted'] = SNPs_per_genome_adjusted
        save_these['InDels_adjusted'] = indel_dict_by_pos_pergenome_adjusted
        SNPs_per_genome = SNPs_per_genome_adjusted
        indel_dict_by_pos_pergenome = indel_dict_by_pos_pergenome_adjusted
    else:
        SNPs_per_genome = self.SNPs_per_genome
        indel_dict_by_pos_pergenome = self.indel_dict_by_pos_pergenome
    # adjusted are needed to apply the variants
    # unadjusted are needed to check the calling
    # BUGFIX: open pickle output in binary mode and close it deterministically
    with open('baga.GemSIM_known_variants.p', 'wb') as pickle_out:
        _cPickle.dump(save_these, pickle_out)
    # save_these = cPickle.load(open('baga.GemSIM_known_variants.p','rb'))
    ### generate genotypes (apply variants) ###
    genotypes = []
    for gn, SNPs in enumerate(SNPs_per_genome):
        if len(self.large_deletions) and gn >= self.num_individuals:
            # use genome with large deletions for second batch
            orig_genome = genome_large_deletions
            genome = _array('c', genome_large_deletions)
        else:
            # else use original
            orig_genome = self.genome.sequence
            genome = _array('c', self.genome.sequence)
        # first SNPs (each generated SNP must differ from the reference base)
        for pos0, SNP in SNPs:
            assert genome[pos0] != SNP
            genome[pos0] = SNP
        # check it worked: the changed positions must be exactly the SNP
        # positions (avoid shadowing pos0 from the loop above)
        changed = [p for p, (new, old) in
                enumerate(zip(genome, list(orig_genome))) if new != old]
        assert changed == [p for p, var in SNPs], \
                'SNPs were not correctly applied . . .'
        # then indels: rebuild the genome, copying the unchanged stretches
        # and inserting (str) or skipping (int length) at each position
        newgenome = _array('c')
        last_pos0 = 0
        for pos0, indel in sorted(indel_dict_by_pos_pergenome[gn].items()):
            if isinstance(indel, str):
                # insertion
                newgenome.extend(genome[last_pos0:pos0])
                newgenome.extend(indel)
                last_pos0 = pos0
            else:
                # deletion
                newgenome.extend(genome[last_pos0:pos0])
                last_pos0 = pos0 + indel
        newgenome.extend(genome[last_pos0:])
        genome_seqrecord = _SeqRecord(_Seq(newgenome.tostring()),
                id = self.genome.id + '_sim{:02d}'.format(gn + 1),
                name = '', description = '')
        genotypes += [genome_seqrecord]
        print(len(self.genome.sequence), len(genotypes[-1]), genotypes[-1].id)
    self.genotypes = genotypes
def align(self, insert_size=False, path_to_exe=False,
          local_alns_path=['alignments'], force=False, max_cpus=-1):
    '''
    Align quality-trimmed reads to this genome with BWA-MEM, producing one
    SAM file per read pair.

    insert_size: mean insert size passed to BWA via -I; if False, BWA
        estimates it on-the-fly.
    path_to_exe: path to the bwa executable; located automatically if False.
    local_alns_path: list of path components for the alignments folder.
    force: if True, redo alignments even when the output SAM already exists.
    max_cpus: cap on BWA threads; -1 delegates to _decide_max_processes.

    Requires trim() to have been run first (provides self.read_files).
    Sets self.aligned_read_files mapping pair name -> SAM path.
    '''
    if not path_to_exe:
        path_to_exe = _get_exe_path('bwa')

    # write genome sequence to a fasta file
    try:
        _os.makedirs('genome_sequences')
    except OSError:
        # folder already exists
        pass

    genome_fna = 'genome_sequences/%s.fna' % self.genome_id
    _SeqIO.write(
        _SeqRecord(_Seq(self.genome_sequence.tostring()), id=self.genome_id),
        genome_fna, 'fasta')

    # make folder for alignments (BAMs); the default list is only joined,
    # never mutated, so the mutable default is harmless here
    local_alns_path = _os.path.sep.join(local_alns_path)
    if not _os.path.exists(local_alns_path):
        _os.makedirs(local_alns_path)

    # make a subdir for this genome
    local_alns_path_genome = _os.path.sep.join(
        [local_alns_path, self.genome_id])
    if not _os.path.exists(local_alns_path_genome):
        _os.makedirs(local_alns_path_genome)

    max_processes = _decide_max_processes(max_cpus)

    e1 = 'Could not find "read_files" attribute. Before aligning to genome, reads must be quality score trimmed. Please run trim() method on this Reads instance.'
    assert hasattr(self, 'read_files'), e1
    e2 = 'Could not find %s. Either run trim() again or ensure file exists'
    for pairname, files in self.read_files.items():
        assert _os.path.exists(files[1]), e2 % files[1]
        assert _os.path.exists(files[2]), e2 % files[2]

    # always (re)index in case of upstream changes in data
    print('Writing BWA index files for %s' % genome_fna)
    _subprocess.call([path_to_exe, 'index', genome_fna])

    aligned_read_files = {}
    for pairname, files in self.read_files.items():
        RGinfo = r"@RG\tID:%s\tSM:%s\tPL:ILLUMINA" % (pairname, pairname)
        if insert_size:
            # BUGFIX: subprocess argv entries must be strings; insert_size
            # is typically supplied as a number, so convert explicitly
            cmd = [
                path_to_exe, 'mem', '-t', str(max_processes), '-M', '-a',
                '-I', str(insert_size), '-R', RGinfo, genome_fna,
                files[1], files[2]
            ]
        else:
            # BWA can estimate insert size on-the-fly
            cmd = [
                path_to_exe, 'mem', '-t', str(max_processes), '-M', '-a',
                '-R', RGinfo, genome_fna, files[1], files[2]
            ]

        out_sam = _os.path.sep.join([
            local_alns_path_genome,
            '%s__%s.sam' % (pairname, self.genome_id)
        ])

        if not _os.path.exists(out_sam) or force:
            print('Called: "%s"' % ' '.join(cmd))
            with open(out_sam, "wb") as out:
                _subprocess.call(cmd, stdout=out)
        else:
            print('Found:')
            print(out_sam)
            print('use "force = True" to overwrite')
            print(' '.join(cmd))

        aligned_read_files[pairname] = out_sam

    self.aligned_read_files = aligned_read_files
def generateSequences(self):
    '''
    Create full length sequences with generated variants applied to the
    reference sequence.

    Generated variants are saved to a pickle file
    ("baga.GemSIM_known_variants.p"). If large deletions are present,
    variant positions are appropriately corrected when applied.
    Sets self.genotypes to a list of _SeqRecord genotype sequences.
    '''
    save_these = {}
    save_these['SNPs'] = self.SNPs_per_genome
    save_these['InDels'] = self.indel_dict_by_pos_pergenome
    if len(self.large_deletions):
        # generate a version of reference genome with large deletions:
        # keep everything before the first deletion . . .
        ranges = sorted(self.large_deletions.values())
        genome_large_deletions = self.genome.sequence[:ranges[0][0]]
        # . . . the sequence between consecutive deletions . . .
        for (s, e), (next_s, next_e) in zip(ranges[:-1], ranges[1:]):
            genome_large_deletions.extend(self.genome.sequence[e:next_s])
        # . . . and everything after the final deletion.
        # BUGFIX: the original indexed ranges[n+1] after the loop, which
        # raised NameError when exactly one large deletion was present
        # (the loop over ranges[:-1] never ran, so n was unbound).
        genome_large_deletions.extend(self.genome.sequence[ranges[-1][1]:])

        # adjust generated variant positions for genome with deletions:
        # each position shifts left by the total deleted length before it
        def adjust(pos0):
            offset = 0
            for s, e in ranges:
                if pos0 > e:
                    offset += (e - s)
            return (pos0 - offset)

        # adjust the second half of the generated variants (the first
        # num_individuals genomes use the unmodified reference)
        SNPs_per_genome_adjusted = self.SNPs_per_genome[:self.num_individuals]
        for SNPs in self.SNPs_per_genome[self.num_individuals:]:
            adjusted = []
            for pos0, variant in SNPs:
                adjusted += [(adjust(pos0), variant)]
            SNPs_per_genome_adjusted += [adjusted]

        indel_dict_by_pos_pergenome_adjusted = \
            self.indel_dict_by_pos_pergenome[:self.num_individuals]
        for indels in self.indel_dict_by_pos_pergenome[self.num_individuals:]:
            adjusted = {}
            for pos0, indel in indels.items():
                adjusted[adjust(pos0)] = indel
            indel_dict_by_pos_pergenome_adjusted += [adjusted]

        save_these['SNPs_adjusted'] = SNPs_per_genome_adjusted
        save_these['InDels_adjusted'] = indel_dict_by_pos_pergenome_adjusted
        SNPs_per_genome = SNPs_per_genome_adjusted
        indel_dict_by_pos_pergenome = indel_dict_by_pos_pergenome_adjusted
    else:
        SNPs_per_genome = self.SNPs_per_genome
        indel_dict_by_pos_pergenome = self.indel_dict_by_pos_pergenome

    # adjusted are needed to apply the variants
    # unadjusted are needed to check the calling
    # BUGFIX: open pickle output in binary mode and close it deterministically
    with open('baga.GemSIM_known_variants.p', 'wb') as pickle_out:
        _cPickle.dump(save_these, pickle_out)
    # save_these = cPickle.load(open('baga.GemSIM_known_variants.p','rb'))

    ### generate genotypes (apply variants) ###
    genotypes = []
    for gn, SNPs in enumerate(SNPs_per_genome):
        if len(self.large_deletions) and gn >= self.num_individuals:
            # use genome with large deletions for second batch
            orig_genome = genome_large_deletions
            genome = _array('c', genome_large_deletions)
        else:
            # else use original
            orig_genome = self.genome.sequence
            genome = _array('c', self.genome.sequence)
        # first SNPs (each generated SNP must differ from the reference base)
        for pos0, SNP in SNPs:
            assert genome[pos0] != SNP
            genome[pos0] = SNP
        # check it worked: the changed positions must be exactly the SNP
        # positions (avoid shadowing pos0 from the loop above)
        changed = [
            p for p, (new, old) in
            enumerate(zip(genome, list(orig_genome))) if new != old
        ]
        assert changed == [p for p, var in SNPs], \
            'SNPs were not correctly applied . . .'
        # then indels: rebuild the genome, copying the unchanged stretches
        # and inserting (str) or skipping (int length) at each position
        newgenome = _array('c')
        last_pos0 = 0
        for pos0, indel in sorted(indel_dict_by_pos_pergenome[gn].items()):
            if isinstance(indel, str):
                # insertion
                newgenome.extend(genome[last_pos0:pos0])
                newgenome.extend(indel)
                last_pos0 = pos0
            else:
                # deletion
                newgenome.extend(genome[last_pos0:pos0])
                last_pos0 = pos0 + indel
        newgenome.extend(genome[last_pos0:])
        genome_seqrecord = _SeqRecord(_Seq(newgenome.tostring()),
                                      id=self.genome.id +
                                      '_sim{:02d}'.format(gn + 1),
                                      name='',
                                      description='')
        genotypes += [genome_seqrecord]
        print(len(self.genome.sequence), len(genotypes[-1]), genotypes[-1].id)
    self.genotypes = genotypes