def select_illumina_reads(self, alignment_file, unmapped_filtered_reads, update=False, hmm=None):
    """Select reads for this VNTR from unmapped reads and from the alignment file.

    Unmapped reads are handed to ``self.process_unmapped_read`` in separate
    processes; mapped reads overlapping the VNTR region are scored with the
    HMM in both orientations and appended as ``SelectedRead`` entries.

    Args:
        alignment_file: Path given to ``pysam.AlignmentFile``. Opened in mode
            ``'r'`` when the name ends with ``'sam'``, otherwise ``'rb'``.
        unmapped_filtered_reads: Iterable of records exposing a ``seq``
            attribute.
        update: When true, the selection is passed through
            ``self.iteratively_update_model`` before being returned.
        hmm: Optional model exposing ``viterbi``. When falsy, one is built
            with ``self.get_vntr_matcher_hmm`` from the first read length seen.

    Returns:
        The shared ``Manager().list()`` holding the selected reads, or the
        result of ``self.iteratively_update_model`` when ``update`` is true.
    """
    recruitment_score = None
    sema = Semaphore(settings.CORES)
    manager = Manager()
    selected_reads = manager.list()
    vntr_bp_in_unmapped_reads = Value('d', 0.0)

    number_of_reads = 0
    read_length = 150  # default; replaced by the length of the first read seen

    process_list = []
    for read_segment in unmapped_filtered_reads:
        if number_of_reads == 0:
            read_length = len(str(read_segment.seq))
        number_of_reads += 1
        if not hmm:
            hmm = self.get_vntr_matcher_hmm(read_length=read_length)
        if not recruitment_score:
            recruitment_score = self.get_min_score_to_select_a_read(read_length)

        if len(read_segment.seq) < read_length:
            continue

        # Acquired once per spawned worker; the semaphore is passed to the
        # worker, which presumably releases it when done -- confirm against
        # process_unmapped_read.
        sema.acquire()
        p = Process(
            target=self.process_unmapped_read,
            args=(sema, str(read_segment.seq), hmm, recruitment_score,
                  vntr_bp_in_unmapped_reads, selected_reads))
        process_list.append(p)
        p.start()
    for p in process_list:
        p.join()

    logging.debug('vntr base pairs in unmapped reads: %s', vntr_bp_in_unmapped_reads.value)

    vntr_bp_in_mapped_reads = 0
    vntr_start = self.reference_vntr.start_point
    vntr_end = self.reference_vntr.start_point + self.reference_vntr.get_length()
    read_mode = 'r' if alignment_file.endswith('sam') else 'rb'
    samfile = pysam.AlignmentFile(alignment_file, read_mode)
    try:
        reference = get_reference_genome_of_alignment_file(samfile)
        # For anything other than HG19 the first three characters of the
        # chromosome name are dropped (presumably the 'chr' prefix).
        if reference == 'HG19':
            chromosome = self.reference_vntr.chromosome
        else:
            chromosome = self.reference_vntr.chromosome[3:]

        for read in samfile.fetch(chromosome, vntr_start, vntr_end):
            # Only reached without a score when no unmapped read set one.
            if not recruitment_score:
                read_length = len(read.seq)
                recruitment_score = self.get_min_score_to_select_a_read(read_length)
            if not hmm:
                hmm = self.get_vntr_matcher_hmm(read_length=read_length)

            if read.is_unmapped:
                continue
            if len(read.seq) < int(read_length * 0.9):
                logging.debug('Rejecting read for short length: %s', read.seq)
                continue

            read_end = read.reference_end if read.reference_end else read.reference_start + len(read.seq)
            starts_near_vntr = vntr_start - read_length < read.reference_start < vntr_end
            ends_in_vntr = vntr_start < read_end < vntr_end
            if not (starts_near_vntr or ends_in_vntr):
                continue

            if 'N' not in read.seq:
                sequence = str(read.seq).upper()
                # Compute the reverse complement once and reuse it below.
                reverse_sequence = str(Seq(read.seq).reverse_complement()).upper()
                logp, vpath = hmm.viterbi(sequence)
                rev_logp, rev_vpath = hmm.viterbi(reverse_sequence)
                if logp < rev_logp:
                    # Keep whichever orientation the HMM scores higher.
                    sequence = reverse_sequence
                    logp = rev_logp
                    vpath = rev_vpath
                length = len(sequence)
                if is_low_quality_read(read) and not self.recruit_read(logp, vpath, recruitment_score, length):
                    logging.debug('Rejected Read: %s', sequence)
                    continue
                selected_reads.append(
                    SelectedRead(sequence, logp, vpath, read.mapq, read.reference_start))

            # Count the part of the read that lies inside the VNTR interval.
            end = min(read_end, vntr_end)
            start = max(read.reference_start, vntr_start)
            vntr_bp_in_mapped_reads += end - start
    finally:
        # The handle is not needed past this point; release it even on error.
        samfile.close()

    logging.debug('vntr base pairs in mapped reads: %s', vntr_bp_in_mapped_reads)

    if update:
        selected_reads = self.iteratively_update_model(
            alignment_file, unmapped_filtered_reads, selected_reads, hmm)

    return selected_reads
def select_illumina_reads(self, alignment_file, unmapped_filtered_reads, update=False, hmm=None):
    """Select reads for this VNTR from unmapped reads and from the alignment file.

    Unmapped reads are handed to ``self.process_unmapped_read`` or, when a
    DNN model file exists for this VNTR id under ``settings.DNN_MODELS_DIR``,
    to ``self.process_unmapped_read_with_dnn``. Mapped reads overlapping the
    VNTR region are scored with the HMM in both orientations and appended as
    ``SelectedRead`` entries unless rejected.

    Args:
        alignment_file: Path given to ``pysam.AlignmentFile``; the open mode
            comes from ``self.get_alignment_file_read_mode``.
        unmapped_filtered_reads: Iterable of records exposing a ``seq``
            attribute.
        update: When true, the selection is passed through
            ``self.iteratively_update_model`` before being returned.
        hmm: Optional model exposing ``viterbi``. When falsy, one is built
            with ``self.get_vntr_matcher_hmm`` from the first read length seen.

    Returns:
        The list of selected reads, or the result of
        ``self.iteratively_update_model`` when ``update`` is true.
    """
    recruitment_score = None
    dnn_model = None
    dnn_model_checked = False
    selected_reads = []
    vntr_bp_in_unmapped_reads = Value('d', 0.0)

    number_of_reads = 0
    read_length = 150  # default; replaced by the length of the first read seen

    for read_segment in unmapped_filtered_reads:
        if number_of_reads == 0:
            read_length = len(str(read_segment.seq))
        number_of_reads += 1
        if not hmm:
            hmm = self.get_vntr_matcher_hmm(read_length=read_length)
        if not recruitment_score:
            recruitment_score = self.get_min_score_to_select_a_read(read_length)
        if not dnn_model_checked:
            # Look for the model file at most once instead of per read.
            dnn_model_checked = True
            model_file = settings.DNN_MODELS_DIR + '/%s.hd5' % self.reference_vntr.id
            if os.path.exists(model_file):
                dnn_model = load_model(model_file)

        if len(read_segment.seq) < read_length:
            continue

        if dnn_model is None:
            self.process_unmapped_read(
                None, str(read_segment.seq), hmm, recruitment_score,
                vntr_bp_in_unmapped_reads, selected_reads)
        else:
            self.process_unmapped_read_with_dnn(
                str(read_segment.seq), hmm, recruitment_score,
                vntr_bp_in_unmapped_reads, selected_reads, True, dnn_model)

    logging.debug('vntr base pairs in unmapped reads: %s', vntr_bp_in_unmapped_reads.value)

    vntr_bp_in_mapped_reads = 0
    vntr_start = self.reference_vntr.start_point
    vntr_end = self.reference_vntr.start_point + self.reference_vntr.get_length()
    read_mode = self.get_alignment_file_read_mode(alignment_file)
    samfile = pysam.AlignmentFile(
        alignment_file, read_mode, reference_filename=self.reference_filename)
    try:
        reference = get_reference_genome_of_alignment_file(samfile)
        # For anything other than HG19 the first three characters of the
        # chromosome name are dropped (presumably the 'chr' prefix).
        if reference == 'HG19':
            chromosome = self.reference_vntr.chromosome
        else:
            chromosome = self.reference_vntr.chromosome[3:]

        for read in samfile.fetch(chromosome, vntr_start, vntr_end):
            # Only reached without a score when no unmapped read set one.
            if not recruitment_score:
                read_length = len(read.seq)
                recruitment_score = self.get_min_score_to_select_a_read(read_length)
            if not hmm:
                hmm = self.get_vntr_matcher_hmm(read_length=read_length)

            if read.is_unmapped:
                continue
            if len(read.seq) < int(read_length * 0.9):
                logging.debug('Rejecting read for short length: %s', read.seq)
                continue

            read_end = read.reference_end if read.reference_end else read.reference_start + len(read.seq)
            starts_near_vntr = vntr_start - read_length < read.reference_start < vntr_end
            ends_in_vntr = vntr_start < read_end < vntr_end
            if not (starts_near_vntr or ends_in_vntr):
                continue

            if 'N' not in read.seq:
                sequence = str(read.seq).upper()
                # Compute the reverse complement once and reuse it below.
                reverse_sequence = str(Seq(read.seq).reverse_complement()).upper()
                logp, vpath = hmm.viterbi(sequence)
                rev_logp, rev_vpath = hmm.viterbi(reverse_sequence)
                if logp < rev_logp:
                    # Keep whichever orientation the HMM scores higher.
                    sequence = reverse_sequence
                    logp = rev_logp
                    vpath = rev_vpath
                if is_low_quality_read(read) or not self.recruit_read(logp, vpath, recruitment_score, sequence):
                    logging.debug('Rejected Aligned Read: %s', sequence)
                    continue
                selected_reads.append(
                    SelectedRead(sequence, logp, vpath, read.mapq, read.reference_start))

            # Count the part of the read that lies inside the VNTR interval.
            end = min(read_end, vntr_end)
            start = max(read.reference_start, vntr_start)
            vntr_bp_in_mapped_reads += end - start
    finally:
        # The handle is not needed past this point; release it even on error.
        samfile.close()

    logging.debug('vntr base pairs in mapped reads: %s', vntr_bp_in_mapped_reads)

    if update:
        selected_reads = self.iteratively_update_model(
            alignment_file, unmapped_filtered_reads, selected_reads, hmm)

    return selected_reads