def saveLocal(self, name):
    '''
    Save this object to a local gzip-compressed pickle file.

    The file is written relative to the current working directory and is
    named 'baga.CollectData.Reads-<name>.baga': the prefix and the '.baga'
    extension are always added, so 'name' should be given without them.
    '''
    fileout = 'baga.CollectData.Reads-%s.baga' % name
    print('Saving to %s' % fileout)
    # Use a context manager so the gzip stream is flushed and closed
    # deterministically, even if pickling raises part way through.
    with _gzip.open(fileout, 'wb') as fout:
        _cPickle.dump(self, fout)
def generateSequences(self):
    '''
    Create full length sequences with generated variants applied to the
    reference sequence.

    Reads self.SNPs_per_genome (one list of (pos0, base) pairs per genome),
    self.indel_dict_by_pos_pergenome (one dict per genome mapping pos0 to
    either an inserted string or a deletion length), self.large_deletions
    (mapping to (start, end) ranges), self.num_individuals and self.genome.

    Generated variants are saved to the pickle file
    'baga.GemSIM_known_variants.p' in the current working directory. If
    large deletions are present, the genomes from index
    self.num_individuals onwards are built from a copy of the reference
    with those ranges removed, and their variant positions are corrected
    accordingly before being applied (the corrected positions are saved
    too).

    The resulting sequence records are stored in self.genotypes.

    NOTE(review): this file contains another definition of
    generateSequences further down; if both live in the same scope the
    later one replaces this one.
    '''
    save_these = {
        'SNPs': self.SNPs_per_genome,
        'InDels': self.indel_dict_by_pos_pergenome,
    }

    has_large_deletions = len(self.large_deletions) > 0

    if has_large_deletions:
        # generate a version of reference genome with large deletions:
        # keep everything before the first range, between consecutive
        # ranges, and after the last range
        ranges = sorted(self.large_deletions.values())
        genome_large_deletions = self.genome.sequence[:ranges[0][0]]
        for (_, this_end), (next_start, _) in zip(ranges[:-1], ranges[1:]):
            genome_large_deletions.extend(
                self.genome.sequence[this_end:next_start])
        # index the last range directly: relying on the loop variable
        # raised a NameError when there was only a single large deletion
        genome_large_deletions.extend(self.genome.sequence[ranges[-1][1]:])

        # adjust generated variant positions for genome with deletions
        def adjust(pos0):
            offset = 0
            for s, e in ranges:
                # ranges are removed as half-open slices [s:e), so the
                # base at position e itself has also shifted left
                if pos0 >= e:
                    offset += e - s
            return pos0 - offset

        # only the second batch of genomes (index >= num_individuals)
        # carries the large deletions, so only those are adjusted
        first_adjusted = self.num_individuals
        SNPs_per_genome = self.SNPs_per_genome[:first_adjusted] + [
            [(adjust(pos0), variant) for pos0, variant in SNPs]
            for SNPs in self.SNPs_per_genome[first_adjusted:]
        ]
        indel_dict_by_pos_pergenome = \
            self.indel_dict_by_pos_pergenome[:first_adjusted] + [
                {adjust(pos0): indel for pos0, indel in indels.items()}
                for indels in self.indel_dict_by_pos_pergenome[first_adjusted:]
            ]

        save_these['SNPs_adjusted'] = SNPs_per_genome
        save_these['InDels_adjusted'] = indel_dict_by_pos_pergenome
    else:
        SNPs_per_genome = self.SNPs_per_genome
        indel_dict_by_pos_pergenome = self.indel_dict_by_pos_pergenome

    # adjusted are needed to apply the variants
    # unadjusted are needed to check the calling
    # Binary mode plus a context manager: the pickle stream is written
    # untranslated and the file is closed deterministically. Reload with
    # _cPickle.load() on the same path opened in 'rb' mode.
    with open('baga.GemSIM_known_variants.p', 'wb') as fout:
        _cPickle.dump(save_these, fout)

    ### generate genotypes (apply variants) ###
    genotypes = []
    for gn, SNPs in enumerate(SNPs_per_genome):
        if has_large_deletions and gn >= self.num_individuals:
            # use genome with large deletions for second batch
            orig_genome = genome_large_deletions
        else:
            # else use original
            orig_genome = self.genome.sequence
        # work on a mutable copy so the template sequence is untouched
        genome = _array('c', orig_genome)

        # first SNPs
        for pos0, SNP in SNPs:
            assert genome[pos0] != SNP
            genome[pos0] = SNP

        # check it worked: the positions that now differ from the template
        # must be exactly the SNP positions, in the same order
        changed = [pos0
                   for pos0, (new, old) in enumerate(zip(genome, orig_genome))
                   if new != old]
        assert changed == [pos0 for pos0, var in SNPs], \
            'SNPs were not correctly applied . . .'

        # then indels, applied left to right while copying unchanged
        # stretches of the SNP-containing genome
        newgenome = _array('c')
        last_pos0 = 0
        for pos0, indel in sorted(indel_dict_by_pos_pergenome[gn].items()):
            newgenome.extend(genome[last_pos0:pos0])
            if isinstance(indel, str):
                # insertion: add the new characters, resume at same position
                newgenome.extend(indel)
                last_pos0 = pos0
            else:
                # deletion: skip 'indel' characters of the template
                last_pos0 = pos0 + indel

        newgenome.extend(genome[last_pos0:])

        genome_seqrecord = _SeqRecord(
            _Seq(newgenome.tostring()),
            id=self.genome.id + '_sim{:02d}'.format(gn + 1),
            name='',
            description='')
        genotypes += [genome_seqrecord]
        print(len(self.genome.sequence), len(genotypes[-1]), genotypes[-1].id)

    self.genotypes = genotypes
def generateSequences(self):
    '''
    Create full length sequences with generated variants applied to the
    reference sequence.

    Reads self.SNPs_per_genome (one list of (pos0, base) pairs per genome),
    self.indel_dict_by_pos_pergenome (one dict per genome mapping pos0 to
    either an inserted string or a deletion length), self.large_deletions
    (mapping to (start, end) ranges), self.num_individuals and self.genome.

    Generated variants are saved to the pickle file
    'baga.GemSIM_known_variants.p' in the current working directory. If
    large deletions are present, the genomes from index
    self.num_individuals onwards are built from a copy of the reference
    with those ranges removed, and their variant positions are corrected
    accordingly before being applied (the corrected positions are saved
    too).

    The resulting sequence records are stored in self.genotypes.

    NOTE(review): an earlier definition of generateSequences appears
    above in this file; if both live in the same scope this one is the
    definition that takes effect.
    '''
    save_these = {
        'SNPs': self.SNPs_per_genome,
        'InDels': self.indel_dict_by_pos_pergenome,
    }

    has_large_deletions = len(self.large_deletions) > 0

    if has_large_deletions:
        # generate a version of reference genome with large deletions:
        # keep everything before the first range, between consecutive
        # ranges, and after the last range
        ranges = sorted(self.large_deletions.values())
        genome_large_deletions = self.genome.sequence[:ranges[0][0]]
        for (_, this_end), (next_start, _) in zip(ranges[:-1], ranges[1:]):
            genome_large_deletions.extend(
                self.genome.sequence[this_end:next_start])
        # index the last range directly: relying on the loop variable
        # raised a NameError when there was only a single large deletion
        genome_large_deletions.extend(self.genome.sequence[ranges[-1][1]:])

        # adjust generated variant positions for genome with deletions
        def adjust(pos0):
            offset = 0
            for s, e in ranges:
                # ranges are removed as half-open slices [s:e), so the
                # base at position e itself has also shifted left
                if pos0 >= e:
                    offset += e - s
            return pos0 - offset

        # only the second batch of genomes (index >= num_individuals)
        # carries the large deletions, so only those are adjusted
        first_adjusted = self.num_individuals
        SNPs_per_genome = self.SNPs_per_genome[:first_adjusted] + [
            [(adjust(pos0), variant) for pos0, variant in SNPs]
            for SNPs in self.SNPs_per_genome[first_adjusted:]
        ]
        indel_dict_by_pos_pergenome = \
            self.indel_dict_by_pos_pergenome[:first_adjusted] + [
                {adjust(pos0): indel for pos0, indel in indels.items()}
                for indels in self.indel_dict_by_pos_pergenome[first_adjusted:]
            ]

        save_these['SNPs_adjusted'] = SNPs_per_genome
        save_these['InDels_adjusted'] = indel_dict_by_pos_pergenome
    else:
        SNPs_per_genome = self.SNPs_per_genome
        indel_dict_by_pos_pergenome = self.indel_dict_by_pos_pergenome

    # adjusted are needed to apply the variants
    # unadjusted are needed to check the calling
    # Binary mode plus a context manager: the pickle stream is written
    # untranslated and the file is closed deterministically. Reload with
    # _cPickle.load() on the same path opened in 'rb' mode.
    with open('baga.GemSIM_known_variants.p', 'wb') as fout:
        _cPickle.dump(save_these, fout)

    ### generate genotypes (apply variants) ###
    genotypes = []
    for gn, SNPs in enumerate(SNPs_per_genome):
        if has_large_deletions and gn >= self.num_individuals:
            # use genome with large deletions for second batch
            orig_genome = genome_large_deletions
        else:
            # else use original
            orig_genome = self.genome.sequence
        # work on a mutable copy so the template sequence is untouched
        genome = _array('c', orig_genome)

        # first SNPs
        for pos0, SNP in SNPs:
            assert genome[pos0] != SNP
            genome[pos0] = SNP

        # check it worked: the positions that now differ from the template
        # must be exactly the SNP positions, in the same order
        changed = [pos0
                   for pos0, (new, old) in enumerate(zip(genome, orig_genome))
                   if new != old]
        assert changed == [pos0 for pos0, var in SNPs], \
            'SNPs were not correctly applied . . .'

        # then indels, applied left to right while copying unchanged
        # stretches of the SNP-containing genome
        newgenome = _array('c')
        last_pos0 = 0
        for pos0, indel in sorted(indel_dict_by_pos_pergenome[gn].items()):
            newgenome.extend(genome[last_pos0:pos0])
            if isinstance(indel, str):
                # insertion: add the new characters, resume at same position
                newgenome.extend(indel)
                last_pos0 = pos0
            else:
                # deletion: skip 'indel' characters of the template
                last_pos0 = pos0 + indel

        newgenome.extend(genome[last_pos0:])

        genome_seqrecord = _SeqRecord(
            _Seq(newgenome.tostring()),
            id=self.genome.id + '_sim{:02d}'.format(gn + 1),
            name='',
            description='')
        genotypes += [genome_seqrecord]
        print(len(self.genome.sequence), len(genotypes[-1]), genotypes[-1].id)

    self.genotypes = genotypes