Exemple #1
0
    def select_illumina_reads(self,
                              alignment_file,
                              unmapped_filtered_reads,
                              update=False,
                              hmm=None):
        """Select reads from unmapped and mapped input for the reference VNTR.

        Unmapped reads are each handed to ``self.process_unmapped_read`` in a
        separate process; mapped reads are fetched from ``alignment_file`` over
        the VNTR region and scored against ``hmm`` with Viterbi on both
        strands.

        Args:
            alignment_file: Path of the alignment file. It is opened in text
                mode when the path ends with ``'sam'`` and in binary mode
                otherwise.
            unmapped_filtered_reads: Iterable of records exposing ``.seq``.
                The length of the first record is used as the read length.
            update: When true, the selection is passed through
                ``self.iteratively_update_model`` before being returned.
            hmm: Optional matcher model. When falsy, one is built with
                ``self.get_vntr_matcher_hmm`` for the detected read length.

        Returns:
            The manager-backed list of selected reads, or whatever
            ``self.iteratively_update_model`` returns when ``update`` is true.
        """
        recruitment_score = None
        # At most settings.CORES workers hold the semaphore at once; it is
        # passed to the worker, which presumably releases it when it finishes
        # (confirm in process_unmapped_read).
        sema = Semaphore(settings.CORES)
        manager = Manager()
        selected_reads = manager.list()
        vntr_bp_in_unmapped_reads = Value('d', 0.0)

        # Fallback read length, replaced by the length of the first read seen.
        read_length = 150

        process_list = []

        for read_index, read_segment in enumerate(unmapped_filtered_reads):
            if read_index == 0:
                read_length = len(str(read_segment.seq))
            if not hmm:
                hmm = self.get_vntr_matcher_hmm(read_length=read_length)
            if not recruitment_score:
                recruitment_score = self.get_min_score_to_select_a_read(
                    read_length)

            # Reads shorter than the first read are not scored.
            if len(read_segment.seq) < read_length:
                continue

            sema.acquire()
            p = Process(target=self.process_unmapped_read,
                        args=(sema, str(read_segment.seq), hmm,
                              recruitment_score, vntr_bp_in_unmapped_reads,
                              selected_reads))
            process_list.append(p)
            p.start()
        for p in process_list:
            p.join()

        logging.debug('vntr base pairs in unmapped reads: %s',
                      vntr_bp_in_unmapped_reads.value)

        vntr_bp_in_mapped_reads = 0
        vntr_start = self.reference_vntr.start_point
        vntr_end = vntr_start + self.reference_vntr.get_length()
        read_mode = 'r' if alignment_file.endswith('sam') else 'rb'
        samfile = pysam.AlignmentFile(alignment_file, read_mode)
        # try/finally guarantees the alignment file handle is released even
        # when scoring a read raises.
        try:
            reference = get_reference_genome_of_alignment_file(samfile)
            # For references other than HG19 the first three characters of the
            # chromosome name are dropped (presumably a 'chr' prefix).
            if reference == 'HG19':
                chromosome = self.reference_vntr.chromosome
            else:
                chromosome = self.reference_vntr.chromosome[3:]
            for read in samfile.fetch(chromosome, vntr_start, vntr_end):
                # Only reached when no unmapped read initialised these.
                if not recruitment_score:
                    read_length = len(read.seq)
                    recruitment_score = self.get_min_score_to_select_a_read(
                        read_length)
                if not hmm:
                    hmm = self.get_vntr_matcher_hmm(read_length=read_length)

                if read.is_unmapped:
                    continue
                if len(read.seq) < int(read_length * 0.9):
                    logging.debug('Rejecting read for short length: %s',
                                  read.seq)
                    continue
                if read.reference_end:
                    read_end = read.reference_end
                else:
                    read_end = read.reference_start + len(read.seq)
                starts_near_vntr = (vntr_start - read_length <
                                    read.reference_start < vntr_end)
                ends_in_vntr = vntr_start < read_end < vntr_end
                if not (starts_near_vntr or ends_in_vntr):
                    continue

                if 'N' not in read.seq:
                    sequence = str(read.seq).upper()
                    logp, vpath = hmm.viterbi(sequence)
                    # Build the reverse complement once and reuse it for both
                    # scoring and selection.
                    rev_sequence = str(
                        Seq(read.seq).reverse_complement()).upper()
                    rev_logp, rev_vpath = hmm.viterbi(rev_sequence)
                    if logp < rev_logp:
                        sequence = rev_sequence
                        logp = rev_logp
                        vpath = rev_vpath
                    length = len(sequence)
                    # NOTE(review): a read is rejected only when it is BOTH
                    # low quality and fails recruitment; confirm that `and`
                    # (rather than `or`) is the intended condition.
                    if is_low_quality_read(read) and not self.recruit_read(
                            logp, vpath, recruitment_score, length):
                        logging.debug('Rejected Read: %s', sequence)
                        continue
                    selected_reads.append(
                        SelectedRead(sequence, logp, vpath, read.mapq,
                                     read.reference_start))
                # Reads containing 'N' are not selected but still contribute
                # to the covered base-pair count.
                end = min(read_end, vntr_end)
                start = max(read.reference_start, vntr_start)
                vntr_bp_in_mapped_reads += end - start
        finally:
            samfile.close()
        logging.debug('vntr base pairs in mapped reads: %s',
                      vntr_bp_in_mapped_reads)

        if update:
            selected_reads = self.iteratively_update_model(
                alignment_file, unmapped_filtered_reads, selected_reads, hmm)

        return selected_reads
Exemple #2
0
    def select_illumina_reads(self,
                              alignment_file,
                              unmapped_filtered_reads,
                              update=False,
                              hmm=None):
        """Select reads from unmapped and mapped input for the reference VNTR.

        Unmapped reads are processed in-process, either with
        ``self.process_unmapped_read`` or, when a model file for this VNTR
        exists under ``settings.DNN_MODELS_DIR``, with
        ``self.process_unmapped_read_with_dnn``. Mapped reads are fetched from
        ``alignment_file`` over the VNTR region and scored against ``hmm``
        with Viterbi on both strands.

        Args:
            alignment_file: Path of the alignment file; the open mode comes
                from ``self.get_alignment_file_read_mode``.
            unmapped_filtered_reads: Iterable of records exposing ``.seq``.
                The length of the first record is used as the read length.
            update: When true, the selection is passed through
                ``self.iteratively_update_model`` before being returned.
            hmm: Optional matcher model. When falsy, one is built with
                ``self.get_vntr_matcher_hmm`` for the detected read length.

        Returns:
            The list of selected reads, or whatever
            ``self.iteratively_update_model`` returns when ``update`` is true.
        """
        recruitment_score = None
        dnn_model = None
        selected_reads = []
        vntr_bp_in_unmapped_reads = Value('d', 0.0)

        # Fallback read length, replaced by the length of the first read seen.
        read_length = 150

        for read_index, read_segment in enumerate(unmapped_filtered_reads):
            if read_index == 0:
                read_length = len(str(read_segment.seq))
            if not hmm:
                hmm = self.get_vntr_matcher_hmm(read_length=read_length)
            if not recruitment_score:
                recruitment_score = self.get_min_score_to_select_a_read(
                    read_length)
                # The DNN model is looked up in the same step that sets the
                # recruitment score.
                model_file = settings.DNN_MODELS_DIR + '/%s.hd5' % self.reference_vntr.id
                if os.path.exists(model_file):
                    dnn_model = load_model(model_file)

            # Reads shorter than the first read are not scored.
            if len(read_segment.seq) < read_length:
                continue

            if dnn_model is None:
                # No semaphore is needed because the call runs in-process.
                self.process_unmapped_read(None, str(read_segment.seq), hmm,
                                           recruitment_score,
                                           vntr_bp_in_unmapped_reads,
                                           selected_reads)
            else:
                self.process_unmapped_read_with_dnn(str(read_segment.seq), hmm,
                                                    recruitment_score,
                                                    vntr_bp_in_unmapped_reads,
                                                    selected_reads, True,
                                                    dnn_model)

        logging.debug('vntr base pairs in unmapped reads: %s',
                      vntr_bp_in_unmapped_reads.value)

        vntr_bp_in_mapped_reads = 0
        vntr_start = self.reference_vntr.start_point
        vntr_end = vntr_start + self.reference_vntr.get_length()
        read_mode = self.get_alignment_file_read_mode(alignment_file)
        samfile = pysam.AlignmentFile(
            alignment_file,
            read_mode,
            reference_filename=self.reference_filename)
        # try/finally guarantees the alignment file handle is released even
        # when scoring a read raises.
        try:
            reference = get_reference_genome_of_alignment_file(samfile)
            # For references other than HG19 the first three characters of the
            # chromosome name are dropped (presumably a 'chr' prefix).
            if reference == 'HG19':
                chromosome = self.reference_vntr.chromosome
            else:
                chromosome = self.reference_vntr.chromosome[3:]
            for read in samfile.fetch(chromosome, vntr_start, vntr_end):
                # Only reached when no unmapped read initialised these.
                if not recruitment_score:
                    read_length = len(read.seq)
                    recruitment_score = self.get_min_score_to_select_a_read(
                        read_length)
                if not hmm:
                    hmm = self.get_vntr_matcher_hmm(read_length=read_length)

                if read.is_unmapped:
                    continue
                if len(read.seq) < int(read_length * 0.9):
                    logging.debug('Rejecting read for short length: %s',
                                  read.seq)
                    continue
                if read.reference_end:
                    read_end = read.reference_end
                else:
                    read_end = read.reference_start + len(read.seq)
                starts_near_vntr = (vntr_start - read_length <
                                    read.reference_start < vntr_end)
                ends_in_vntr = vntr_start < read_end < vntr_end
                if not (starts_near_vntr or ends_in_vntr):
                    continue

                if 'N' not in read.seq:
                    sequence = str(read.seq).upper()
                    logp, vpath = hmm.viterbi(sequence)
                    # Build the reverse complement once and reuse it for both
                    # scoring and selection.
                    rev_sequence = str(
                        Seq(read.seq).reverse_complement()).upper()
                    rev_logp, rev_vpath = hmm.viterbi(rev_sequence)
                    if logp < rev_logp:
                        sequence = rev_sequence
                        logp = rev_logp
                        vpath = rev_vpath
                    # A read is dropped when it is low quality or when it
                    # fails the recruitment check.
                    if is_low_quality_read(read) or not self.recruit_read(
                            logp, vpath, recruitment_score, sequence):
                        logging.debug('Rejected Aligned Read: %s', sequence)
                        continue
                    selected_reads.append(
                        SelectedRead(sequence, logp, vpath, read.mapq,
                                     read.reference_start))
                # Reads containing 'N' are not selected but still contribute
                # to the covered base-pair count.
                end = min(read_end, vntr_end)
                start = max(read.reference_start, vntr_start)
                vntr_bp_in_mapped_reads += end - start
        finally:
            samfile.close()
        logging.debug('vntr base pairs in mapped reads: %s',
                      vntr_bp_in_mapped_reads)

        if update:
            selected_reads = self.iteratively_update_model(
                alignment_file, unmapped_filtered_reads, selected_reads, hmm)

        return selected_reads