def compare_kmers(self):
    """Select the sample-only k-mers, write them to disk and start assembly.

    The method performs the following steps:

    1. Runs jellyfish on every file listed in ``self.files['target_ref_fn']``
       and accumulates the resulting k-mers in ``self.kmers['ref']``.
    2. Runs jellyfish on ``self.files['sv_cleaned_fq']`` and
       ``self.files['sv_sc_unmapped_fa']`` to fill ``self.kmers['case']``
       and ``self.kmers['case_sc']``.
    3. Keeps the k-mers present in both sample sets and absent from the
       reference set. When ``'normal_bam_file'`` is among
       ``self.params.opts``, k-mers found in
       ``self.files['norm_cleaned_fq']`` are removed as well.
    4. Writes one ``<kmer>\\t<value>`` line per retained k-mer to
       ``<self.paths['kmers']>/<self.name>_sample_kmers.out``, where the
       value is the entry stored for that k-mer in ``self.kmers['case']``
       (presumably its count -- confirm against ``utils.load_kmers``).
    5. Passes the retained k-mers to ``assembler.init_assembly``, stores the
       result in ``self.contigs`` and calls ``self.finalize_contigs()``.

    The large intermediate dictionaries and ``self.cleaned_read_recs`` are
    reset once they are no longer needed.
    """
    self.kmers['ref'] = {}
    jellyfish = self.params.get_param('jellyfish')
    kmer_size = int(self.params.get_param('kmer_size'))

    # Accumulate the k-mers of every reference file into a single dict.
    # NOTE: alternate reference files ('target_altref_fn') are not indexed.
    for ref_fn in self.files['target_ref_fn']:
        utils.log(self.logging_name, 'info',
                  'Indexing kmers for reference sequence %s' % ref_fn)
        self.kmers['ref'] = utils.load_kmers(
            utils.run_jellyfish(ref_fn, jellyfish, kmer_size),
            self.kmers['ref'])

    utils.log(self.logging_name, 'info',
              'Indexing kmers for sample sequence %s' % self.files['sv_cleaned_fq'])
    self.kmers['case'] = utils.load_kmers(
        utils.run_jellyfish(self.files['sv_cleaned_fq'], jellyfish, kmer_size),
        {})
    self.kmers['case_sc'] = utils.load_kmers(
        utils.run_jellyfish(self.files['sv_sc_unmapped_fa'], jellyfish, kmer_size),
        {})

    # K-mers seen in both sample sets, minus everything seen in the reference.
    sc_mers = set(self.kmers['case']) & set(self.kmers['case_sc'])
    sample_only_mers = list(sc_mers.difference(set(self.kmers['ref'])))

    # Additionally drop k-mers found in the normal sample, when one is given.
    if 'normal_bam_file' in self.params.opts:
        norm_kmers = utils.load_kmers(
            utils.run_jellyfish(self.files['norm_cleaned_fq'], jellyfish, kmer_size),
            {})
        sample_only_mers = list(
            set(sample_only_mers).difference(set(norm_kmers)))

    # Write case only kmers out to file.
    self.files['sample_kmers'] = os.path.join(
        self.paths['kmers'], self.name + "_sample_kmers.out")
    case_kmers = self.kmers['case']
    self.kmers['case_only'] = {}
    # The context manager guarantees the handle is closed even when a write
    # fails part-way through.
    with open(self.files['sample_kmers'], 'w') as sample_kmer_fout:
        for mer in sample_only_mers:
            value = case_kmers[mer]
            sample_kmer_fout.write("\t".join((str(mer), str(value))) + "\n")
            self.kmers['case_only'][mer] = value

    # Release the large dictionaries that are no longer needed.
    self.kmers['ref'] = {}
    self.kmers['case'] = {}
    self.kmers['case_sc'] = {}

    utils.log(self.logging_name, 'info',
              'Writing %d sample-only kmers to file %s'
              % (len(self.kmers['case_only']), self.files['sample_kmers']))
    self.files['kmer_clusters'] = os.path.join(
        self.paths['kmers'], self.name + "_sample_kmers_merged.out")
    utils.log(self.logging_name, 'info',
              'Writing kmer clusters to file %s' % self.files['kmer_clusters'])

    self.contigs = assembler.init_assembly(
        self.kmers['case_only'],
        self.cleaned_read_recs['sv'],
        kmer_size,
        int(self.params.get_param('trl_sr_thresh')),
        self.params.get_param('read_len'))

    # The reads and the sample-only k-mers have been handed to the assembler.
    self.cleaned_read_recs = None
    self.kmers['case_only'] = {}
    self.finalize_contigs()
def compare_kmers(self, kmerPath, name, readLen, targetRefFns):
    """Select the sample-only k-mers, write them to disk and start assembly.

    Args:
        kmerPath: Directory in which the k-mer output files are placed.
        name: Prefix used for the output file names.
        readLen: Read length, forwarded unchanged to
            ``assembly.init_assembly``.
        targetRefFns: Reference sequence file names, forwarded unchanged to
            ``self.set_reference_kmers``.

    Side effects:
        * Writes one ``<kmer>\\t<value>`` line per retained k-mer to
          ``<kmerPath>/<name>_sample_kmers.out``; the value is the entry
          stored for that k-mer in ``self.kmers['case']`` (presumably its
          count -- confirm against the k-mer loader).
        * Records both output paths in ``self.files['sample_kmers']`` and
          ``self.files['kmer_clusters']``.
        * Stores the result of ``assembly.init_assembly`` in
          ``self.kmers['clusters']``.
        * Empties ``self.kmers['ref']``, ``['case']``, ``['case_sc']`` and
          ``['case_only']`` and calls ``self.clear_cleaned_reads()``.
    """
    # Set the reference sequence kmers.
    self.set_reference_kmers(targetRefFns)
    # Set sample kmers.
    self.set_sample_kmers()

    # K-mers present in both the cleaned sample sequences and the
    # unmapped/soft-clipped sequences.
    scKmers = set(self.kmers['case']) & set(self.kmers['case_sc'])
    # Take the difference from the reference kmers.
    sampleOnlyKmers = list(scKmers.difference(set(self.kmers['ref'])))

    # Remove normal sample kmers if a normal sample is available.
    if self.params.get_param('normal_bam_file'):
        normKmers = {}
        # get_kmers is expected to fill normKmers in place; its return value
        # is not used.
        self.get_kmers(self.files['norm_cleaned_fq'], normKmers)
        sampleOnlyKmers = list(
            set(sampleOnlyKmers).difference(set(normKmers)))

    # Write case only kmers out to file.
    self.files['sample_kmers'] = os.path.join(
        kmerPath, name + "_sample_kmers.out")
    caseKmers = self.kmers['case']
    self.kmers['case_only'] = {}
    # The context manager guarantees the handle is closed even when a write
    # fails part-way through.
    with open(self.files['sample_kmers'], 'w') as sample_kmer_fout:
        for mer in sampleOnlyKmers:
            value = caseKmers[mer]
            sample_kmer_fout.write("\t".join((str(mer), str(value))) + "\n")
            self.kmers['case_only'][mer] = value

    # Clean out data structures.
    self.kmers['ref'] = {}
    self.kmers['case'] = {}
    self.kmers['case_sc'] = {}

    utils.log(self.loggingName, 'info',
              'Writing %d sample-only kmers to file %s'
              % (len(self.kmers['case_only']), self.files['sample_kmers']))
    self.files['kmer_clusters'] = os.path.join(
        kmerPath, name + "_sample_kmers_merged.out")
    utils.log(self.loggingName, 'info',
              'Writing kmer clusters to file %s' % self.files['kmer_clusters'])

    self.kmers['clusters'] = assembly.init_assembly(
        self.kmers['case_only'],
        self.cleaned_read_recs['sv'],
        self.params.get_kmer_size(),
        self.params.get_sr_thresh('min'),
        readLen)

    # The reads and the sample-only k-mers have been handed to the assembler.
    self.clear_cleaned_reads()
    self.kmers['case_only'] = {}
def compare_kmers(self, kmerPath, name, readLen, targetRefFns):
    """Find k-mers unique to the sample, save them and seed the assembly.

    K-mers are retained when they occur in both ``self.kmers['case']`` and
    ``self.kmers['case_sc']`` but not in ``self.kmers['ref']``. When the
    ``normal_bam_file`` parameter is set, k-mers loaded from
    ``self.files['norm_cleaned_fq']`` are excluded too.

    Args:
        kmerPath: Directory in which the k-mer output files are placed.
        name: Prefix used for the output file names.
        readLen: Read length, passed through to ``assembly.init_assembly``.
        targetRefFns: Reference sequence file names, passed through to
            ``self.set_reference_kmers``.

    Side effects:
        * ``<kmerPath>/<name>_sample_kmers.out`` is written with one
          tab-separated ``<kmer>``/``<value>`` pair per line, the value
          being that k-mer's entry in ``self.kmers['case']`` (presumably a
          count -- confirm against the k-mer loader).
        * ``self.files['sample_kmers']`` and ``self.files['kmer_clusters']``
          are set to the output paths.
        * ``self.kmers['clusters']`` receives the return value of
          ``assembly.init_assembly``.
        * The ``ref``, ``case``, ``case_sc`` and ``case_only`` k-mer
          dictionaries are emptied and ``self.clear_cleaned_reads()`` is
          called.
    """
    # Set the reference sequence kmers.
    self.set_reference_kmers(targetRefFns)
    # Set sample kmers.
    self.set_sample_kmers()

    # Intersect the kmers from the cleaned sample sequences with those from
    # the unmapped and softclipped sequences.
    scKmers = set(self.kmers['case']) & set(self.kmers['case_sc'])
    # Take the difference from the reference kmers.
    sampleOnlyKmers = list(scKmers.difference(set(self.kmers['ref'])))

    # Subtract normal sample kmers if available.
    if self.params.get_param('normal_bam_file'):
        normKmers = {}
        # The return value of get_kmers is ignored; normKmers is expected to
        # be populated in place.
        self.get_kmers(self.files['norm_cleaned_fq'], normKmers)
        sampleOnlyKmers = list(
            set(sampleOnlyKmers).difference(set(normKmers)))

    # Write case only kmers out to file.
    self.files['sample_kmers'] = os.path.join(
        kmerPath, name + "_sample_kmers.out")
    caseKmers = self.kmers['case']
    self.kmers['case_only'] = {}
    # Using "with" closes the file even if a write raises midway.
    with open(self.files['sample_kmers'], 'w') as sample_kmer_fout:
        for mer in sampleOnlyKmers:
            value = caseKmers[mer]
            sample_kmer_fout.write("\t".join((str(mer), str(value))) + "\n")
            self.kmers['case_only'][mer] = value

    # Clean out data structures.
    self.kmers['ref'] = {}
    self.kmers['case'] = {}
    self.kmers['case_sc'] = {}

    utils.log(self.loggingName, 'info',
              'Writing %d sample-only kmers to file %s'
              % (len(self.kmers['case_only']), self.files['sample_kmers']))
    self.files['kmer_clusters'] = os.path.join(
        kmerPath, name + "_sample_kmers_merged.out")
    utils.log(self.loggingName, 'info',
              'Writing kmer clusters to file %s' % self.files['kmer_clusters'])

    self.kmers['clusters'] = assembly.init_assembly(
        self.kmers['case_only'],
        self.cleaned_read_recs['sv'],
        self.params.get_kmer_size(),
        self.params.get_sr_thresh('min'),
        readLen)

    # Reads and sample-only k-mers are no longer needed after this point.
    self.clear_cleaned_reads()
    self.kmers['case_only'] = {}