def compare_kmers(self):
    """Find kmers specific to the sample and assemble them into contigs.

    Steps, in order:

    1. Index kmers (via jellyfish) for every file listed in
       self.files['target_ref_fn'] into self.kmers['ref'].
    2. Index kmers for the sample reads (self.files['sv_cleaned_fq']) and
       for self.files['sv_sc_unmapped_fa'].
    3. Keep the kmers found in both sample sets and absent from the
       reference set; when 'normal_bam_file' is in self.params.opts, also
       drop the kmers found in self.files['norm_cleaned_fq'].
    4. Write each remaining kmer and its count from the sample reads, tab
       separated, one per line, to
       <self.paths['kmers']>/<self.name>_sample_kmers.out.
    5. Pass the remaining kmers to assembler.init_assembly, store the
       result in self.contigs and call self.finalize_contigs().

    The intermediate kmer dictionaries and self.cleaned_read_recs are reset
    once they are no longer used by this method.
    """
    jellyfish = self.params.get_param('jellyfish')
    kmer_size = int(self.params.get_param('kmer_size'))

    # Accumulate the kmers of every reference sequence in one dictionary.
    self.kmers['ref'] = {}
    for ref_fn in self.files['target_ref_fn']:
        utils.log(self.logging_name, 'info', 'Indexing kmers for reference sequence %s' % ref_fn)
        self.kmers['ref'] = utils.load_kmers(utils.run_jellyfish(ref_fn, jellyfish, kmer_size), self.kmers['ref'])
    # NOTE: the files under self.files['target_altref_fn'] are not indexed
    # here; the code that did so was disabled.

    utils.log(self.logging_name, 'info', 'Indexing kmers for sample sequence %s' % self.files['sv_cleaned_fq'])
    self.kmers['case'] = {}
    self.kmers['case'] = utils.load_kmers(utils.run_jellyfish(self.files['sv_cleaned_fq'], jellyfish, kmer_size), self.kmers['case'])
    self.kmers['case_sc'] = {}
    self.kmers['case_sc'] = utils.load_kmers(utils.run_jellyfish(self.files['sv_sc_unmapped_fa'], jellyfish, kmer_size), self.kmers['case_sc'])

    # Kmers seen in both sample sets, minus anything seen in the reference.
    sample_only_mers = set(self.kmers['case']) & set(self.kmers['case_sc'])
    sample_only_mers -= set(self.kmers['ref'])
    if 'normal_bam_file' in self.params.opts:
        # A normal sample was supplied: its kmers are removed as well.
        norm_kmers = utils.load_kmers(utils.run_jellyfish(self.files['norm_cleaned_fq'], jellyfish, kmer_size), {})
        sample_only_mers -= set(norm_kmers)

    # Write case only kmers out to file.
    self.files['sample_kmers'] = os.path.join(self.paths['kmers'], self.name + "_sample_kmers.out")
    case_kmers = self.kmers['case']
    case_only = {}
    self.kmers['case_only'] = case_only
    # The context manager closes the file even if a lookup or write fails.
    with open(self.files['sample_kmers'], 'w') as sample_kmer_fout:
        for mer in sample_only_mers:
            count = case_kmers[mer]
            sample_kmer_fout.write("\t".join((str(mer), str(count))) + "\n")
            case_only[mer] = count

    # The full kmer sets are no longer needed; drop them to free memory.
    self.kmers['ref'] = {}
    self.kmers['case'] = {}
    self.kmers['case_sc'] = {}
    utils.log(self.logging_name, 'info', 'Writing %d sample-only kmers to file %s' % (len(self.kmers['case_only']), self.files['sample_kmers']))

    # Only the path is recorded and logged here; nothing in this method
    # writes to it. NOTE(review): presumably written downstream - confirm.
    self.files['kmer_clusters'] = os.path.join(self.paths['kmers'], self.name + "_sample_kmers_merged.out")
    utils.log(self.logging_name, 'info', 'Writing kmer clusters to file %s' % self.files['kmer_clusters'])

    self.contigs = assembler.init_assembly(
        self.kmers['case_only'],
        self.cleaned_read_recs['sv'],
        kmer_size,
        int(self.params.get_param('trl_sr_thresh')),
        self.params.get_param('read_len'),
    )
    # Release the read records and the sample-only kmers after assembly.
    self.cleaned_read_recs = None
    self.kmers['case_only'] = {}
    self.finalize_contigs()
def get_kmers(self, seqFn, kmerDict):
    """Generic function to run jellyfish on a set of sequences.

    Args:
        seqFn: Sequence file handed to utils.run_jellyfish.
        kmerDict: Dictionary handed to utils.load_kmers together with the
            jellyfish output.

    The value returned by utils.load_kmers is discarded, so callers rely on
    kmerDict itself being updated.
    NOTE(review): presumably load_kmers fills kmerDict in place - confirm
    against utils.load_kmers.
    """
    jellyfish = self.params.get_param('jellyfish')
    kmer_size = self.params.get_kmer_size()
    # Load the kmers into the kmer dictionary. The loader is referenced
    # through utils, like run_jellyfish, so the call does not depend on a
    # separate bare import of load_kmers.
    utils.load_kmers(utils.run_jellyfish(seqFn, jellyfish, kmer_size), kmerDict)