Exemple #1
0
 def load_from_fast5(filename, basecall_group):
     """Build a Read from the contents of a fast5 (HDF5) file.

     The raw signal is taken from the first entry under ``Raw/Reads``; the
     events table and the FASTQ record are taken from
     ``<basecall_group>/BaseCalled_template``.

     Args:
         filename: path of the fast5 file; it is opened read-only.
         basecall_group: HDF5 path of the group that contains
             ``BaseCalled_template``.

     Returns:
         A Read with ``raw_signal``, ``sequence_to_signal_mapping``,
         ``fastq`` and ``sequence`` filled in.
     """
     template_path = '{}/BaseCalled_template'.format(basecall_group)
     read = Read()
     with h5py.File(filename, 'r') as file:
         read_group = list(file['Raw/Reads'].values())[0]
         # Dataset.value was deprecated and later removed from h5py (3.0);
         # indexing with an empty tuple reads the whole dataset instead.
         read.raw_signal = numpy.array(read_group['Signal'][()])
         events = file[template_path + '/Events']
         read.sequence_to_signal_mapping = Read._extract_sequence_to_signal_mapping(events)
         # The FASTQ record is decoded from bytes, as the original code did.
         read.fastq = file[template_path + '/Fastq'][()].decode('ascii')
         read.sequence = Genome.create_from_fastq_string(read.fastq)[0].bases
     return read
Exemple #2
0
def align_signal(reference_filename,
                 reads,
                 reference=None,
                 config=nadavca.defaults.CONFIG_FILE,
                 kmer_model=nadavca.defaults.KMER_MODEL_FILE,
                 bwa_executable=nadavca.defaults.BWA_EXECUTABLE,
                 group_name=nadavca.defaults.GROUP_NAME):
    """Compute a refined signal-to-reference alignment for each read.

    Args:
        reference_filename: path of the reference FASTA file. It is passed
            to the approximate aligner and, when ``reference`` is None, the
            bases of its first record are loaded from it.
        reads: either a directory path (every non-directory entry whose
            name ends in ``.fast5`` is used) or a sequence whose items are
            fast5 paths and/or already loaded reads.
        reference: reference bases, or None to load them from
            ``reference_filename``.
        config: configuration object, or the path of a YAML file holding it.
        kmer_model: k-mer model object, or the path of an HDF5 file to load
            it from.
        bwa_executable: forwarded to ``ApproximateAligner``.
        group_name: basecall group forwarded to ``Read.load_from_fast5``.

    Returns:
        A list holding the result of ``get_refined_alignment`` for every
        read, in input order. Returns None, after writing a message to
        stderr, when the config file, the k-mer model file or the reference
        file does not exist.
    """
    if isinstance(config, str):
        try:
            with open(config, 'r') as file:
                # safe_load builds plain Python objects only; yaml.load
                # without an explicit Loader can construct arbitrary
                # objects and is rejected by PyYAML 6 and later.
                config = yaml.safe_load(file)
        except FileNotFoundError:
            sys.stderr.write(
                'failed to load config: {} not found\n'.format(config))
            return None

    if isinstance(kmer_model, str):
        try:
            kmer_model = KmerModel.load_from_hdf5(kmer_model)
        except FileNotFoundError:
            sys.stderr.write(
                'failed to load k-mer model: {} not found\n'.format(
                    kmer_model))
            return None

    if reference is None:
        try:
            reference = Genome.load_from_fasta(reference_filename)[0].bases
        except FileNotFoundError:
            sys.stderr.write(
                "failed to process: reference {} doesn't exist\n".format(
                    reference_filename))
            return None

    approximate_aligner = ApproximateAligner(bwa_executable, reference,
                                             reference_filename)
    estimator = ProbabilityEstimator(kmer_model, approximate_aligner, config)

    if isinstance(reads, str):
        # A string is treated as a directory of fast5 files.
        read_basedir = reads
        reads = []
        for entry in os.listdir(read_basedir):
            path = os.path.join(read_basedir, entry)
            if not os.path.isdir(path) and path.endswith(".fast5"):
                reads.append(path)

    # Build a new list so the caller's sequence is never modified; paths are
    # replaced by loaded reads, other items are kept as they are.
    reads = [
        Read.load_from_fast5(read, group_name)
        if isinstance(read, str) else read for read in reads
    ]

    Read.normalize_reads(reads)

    return [estimator.get_refined_alignment(reference, read) for read in reads]
Exemple #3
0
    def get_refined_alignment(self, reference, read):
        """Refine the approximate signal alignment of a read.

        Args:
            reference: reference sequence; it is sliced with the reference
                range reported by the approximate alignment.
            read: read whose ``normalized_signal`` is aligned.

        Returns:
            None when the aligner produces no approximate alignment.
            Otherwise an integer array with one row per entry of the refined
            alignment: column 0 is the signal position shifted by the start
            of the aligned signal window, column 1 is a reference
            coordinate.
        """
        approx = self.aligner.get_signal_alignment(read, self.bandwidth)
        if approx is None:
            return None

        ref_start, ref_end = approx.reference_range
        ref_slice = reference[ref_start:ref_end]
        if approx.reverse_complement:
            ref_slice = Genome.reverse_complement(ref_slice)
        numeric_reference = Genome.to_numerical(ref_slice)

        sig_start, sig_end = approx.signal_range
        window = read.normalized_signal[sig_start:sig_end]
        context_before, context_after = self._get_read_context(
            read, approx.read_sequence_range)

        refined = nadavca.dtw.refine_alignment(
            signal=window,
            reference=numeric_reference,
            context_before=context_before,
            context_after=context_after,
            approximate_alignment=approx.alignment,
            bandwidth=self.bandwidth,
            min_event_length=self.min_event_length,
            kmer_model=self.kmer_model)

        table = numpy.zeros((len(refined), 2), dtype=int)
        for index, signal_position in enumerate(refined):
            # Signal positions are relative to the window; shift them back
            # to positions in the whole normalized signal.
            table[index, 0] = signal_position + sig_start
            if approx.reverse_complement:
                # NOTE(review): row 0 maps to ref_end, which is the
                # exclusive bound of the slice taken above -- confirm
                # whether "- 1" is intended here.
                table[index, 1] = ref_end - index
            else:
                table[index, 1] = ref_start + index
        return table
Exemple #4
0
def align_signal_command(args):
    """Align every fast5 read of a directory and write the alignments.

    Reads ``args.reference``, ``args.read_basedir``, ``args.configuration``,
    ``args.kmer_model``, ``args.bwa_executable``, ``args.group_name`` and
    ``args.output``. When ``args.output`` is set, one ``<read name>.txt``
    file per read is written into that directory (created if missing);
    otherwise everything goes to stdout. Each table starts with a
    ``signal_position<TAB>reference_position`` header line.

    Problems are reported on stderr and the function returns None in every
    case.
    """
    try:
        reference = Genome.load_from_fasta(args.reference)[0].bases
    except FileNotFoundError:
        sys.stderr.write(
            "failed to process: reference {} doesn't exist\n".format(
                args.reference))
        return

    read_filenames = []
    for entry in os.listdir(args.read_basedir):
        path = os.path.join(args.read_basedir, entry)
        if not os.path.isdir(path) and path.endswith(".fast5"):
            read_filenames.append(path)

    alignments = align_signal(reference_filename=args.reference,
                              reads=read_filenames,
                              reference=reference,
                              config=args.configuration,
                              kmer_model=args.kmer_model,
                              bwa_executable=args.bwa_executable,
                              group_name=args.group_name)

    # align_signal returns None after reporting a missing input file;
    # zipping None below would raise TypeError.
    if alignments is None:
        return

    if args.output:
        if not os.path.exists(args.output):
            os.makedirs(args.output)
        if not os.path.isdir(args.output):
            sys.stderr.write('Failed to create directory {}'
                             '(maybe a file with that name exists?)\n'.format(
                                 args.output))
            return

    for alignment, read_filename in zip(alignments, read_filenames):
        # A read without an alignment is reported and skipped instead of
        # crashing while iterating over None.
        if alignment is None:
            sys.stderr.write(
                'failed to align read {}\n'.format(read_filename))
            continue

        if args.output:
            basename = os.path.splitext(os.path.basename(read_filename))[0]
            output_file = open(os.path.join(args.output, basename + '.txt'),
                               'w')
        else:
            output_file = sys.stdout
        try:
            output_file.write("signal_position\treference_position\n")
            for line in alignment:
                output_file.write("{}\t{}\n".format(line[0], line[1]))
        finally:
            # Close only files opened here, never stdout, and do so even
            # when a write fails.
            if args.output:
                output_file.close()
Exemple #5
0
    def get_signal_alignment(self, read, bandwidth):
        """Turn the base-level alignment of a read into a signal-level one.

        Args:
            read: read to align; its ``normalized_signal`` length bounds the
                signal window.
            bandwidth: number of signal positions by which the window is
                widened on each side, clamped to the signal.

        Returns:
            None when there is no base alignment or the converted mapping is
            empty; otherwise an ApproximateSignalAlignment whose mapping is
            rebased so that both columns are relative to the returned
            signal window and reference range.
        """
        base_alignment = self._get_base_alignment(read)
        if base_alignment is None:
            return None
        base_mapping, is_reverse_complement, contig_name = base_alignment

        signal_mapping = self.convert_mapping(base_mapping, read)
        if len(signal_mapping) == 0:
            return None

        # Reference span of the mapping (end exclusive); column 1 is then
        # rebased to start at zero.
        ref_first = signal_mapping[0][1]
        ref_stop = signal_mapping[-1][1] + 1
        signal_mapping[:, 1] -= ref_first

        contig = self._get_reference_contig(contig_name)
        if is_reverse_complement:
            # Mirror the span around the contig length.
            contig_length = len(contig)
            ref_first, ref_stop = (contig_length - ref_stop,
                                   contig_length - ref_first)

        # Signal window: the mapped span widened by the bandwidth and
        # clamped to the signal; column 0 is rebased to the window start.
        sig_first = signal_mapping[0][0]
        sig_stop = signal_mapping[-1][0] + 1
        window_start = max(0, sig_first - bandwidth)
        window_stop = min(len(read.normalized_signal), sig_stop + bandwidth)
        signal_mapping[:, 0] -= window_start

        read_span = (base_mapping[0][0], base_mapping[-1][0] + 1)

        reference_part = contig[ref_first:ref_stop]
        if is_reverse_complement:
            reference_part = Genome.reverse_complement(reference_part)

        return ApproximateSignalAlignment(
            alignment=signal_mapping,
            signal_range=(window_start, window_stop),
            reference_range=(ref_first, ref_stop),
            read_sequence_range=read_span,
            reverse_complement=is_reverse_complement,
            reference_part=reference_part,
            contig_name=contig_name)
Exemple #6
0
    def get_refined_alignment(self, read):
        """Refine the approximate signal alignment of a read.

        Returns:
            None when the aligner gives no approximate alignment or the
            refinement yields no entries. Otherwise a tuple
            ``(approximate_alignment, table)`` where ``table`` is an integer
            array with one row per refined entry: reference coordinate,
            event start and event end, the last two shifted by the start of
            the aligned signal window.
        """
        approx = self.aligner.get_signal_alignment(read, self.bandwidth)
        if approx is None:
            return None

        ref_start, ref_end = approx.reference_range
        numeric_reference = Genome.to_numerical(approx.reference_part)

        sig_start, sig_end = approx.signal_range
        window = read.normalized_signal[sig_start:sig_end]
        context_before, context_after = self._get_read_context(
            read, approx.read_sequence_range)

        events = nadavca.dtw.refine_alignment(
            signal=window,
            reference=numeric_reference,
            context_before=context_before,
            context_after=context_after,
            approximate_alignment=approx.alignment,
            bandwidth=self.bandwidth,
            min_event_length=self.min_event_length,
            kmer_model=self.kmer_model,
            model_transitions=self.model_transitions)

        # There was no valid path in our band (bad alignment)
        if len(events) == 0:
            return None

        table = numpy.zeros((len(events), 3), dtype=int)
        for index, (event_start, event_end) in enumerate(events):
            if approx.reverse_complement:
                # Walk the reference range backwards from its last position.
                table[index, 0] = ref_end - index - 1
            else:
                table[index, 0] = ref_start + index
            table[index, 1] = event_start + sig_start
            table[index, 2] = event_end + sig_start
        return approx, table
Exemple #7
0
    def _get_base_alignment(self, read):
        if self.bwapy_aligner:
            alignments = self.bwapy_aligner.align_seq(''.join(read.sequence))
            if len(alignments) == 0:
                return None
            alignment = alignments[0]
            cigar = alignment.cigar
            is_reverse_complement = alignment.orient == '-'
            mapped_position = alignment.pos

        else:
            read_fastq_filename = None
            with tempfile.NamedTemporaryFile(mode='w',
                                             delete=False,
                                             prefix='nadavca_tmp',
                                             suffix='.fastq') as file:
                read_fastq_filename = file.name
                file.write(read.fastq)

            bwa_output_filename = None
            with tempfile.NamedTemporaryFile(delete=True,
                                             prefix='nadavca_tmp',
                                             suffix='.sam') as file:
                bwa_output_filename = file.name

            subprocess.run([
                self.bwa_executable, 'mem', self.reference_filename,
                read_fastq_filename, '-o', bwa_output_filename
            ],
                           stderr=subprocess.PIPE,
                           check=True)
            with simplesam.Reader(open(bwa_output_filename, 'r')) as reader:
                sam = reader.next()
                if not sam.mapped:
                    return None
                cigar = sam.cigar
                is_reverse_complement = sam.reverse
                mapped_position = sam.pos - 1

            os.remove(read_fastq_filename)
            os.remove(bwa_output_filename)

        oriented_read = Genome.reverse_complement(
            read.sequence) if is_reverse_complement else read.sequence

        index_in_read = 0
        index_in_reference = mapped_position
        base_mapping = []
        parsed_cigar = self._parse_cigar(cigar)

        for num, operation in parsed_cigar:
            if operation == 'S':
                index_in_read += num
            elif operation == 'M':
                for i in range(num):
                    if self.reference[index_in_reference] == oriented_read[
                            index_in_read]:
                        base_mapping.append(
                            (index_in_read, index_in_reference))
                    index_in_read += 1
                    index_in_reference += 1
            elif operation == 'D':
                index_in_reference += num
            elif operation == 'I':
                index_in_read += num
            else:
                raise ValueError(
                    'Unknown cigar operation: {}'.format(operation))

        if is_reverse_complement:
            for i, val in enumerate(base_mapping):
                base_mapping[i] = (len(read.sequence) - 1 - val[0],
                                   len(self.reference) - 1 - val[1])
            base_mapping.reverse()

        return numpy.array(base_mapping,
                           dtype=numpy.int), is_reverse_complement
Exemple #8
0
def estimate_snps_command(args):
    """Run SNP estimation over a directory of fast5 reads and print it.

    Reads ``args.reference``, ``args.read_basedir``, ``args.configuration``,
    ``args.kmer_model``, ``args.bwa_executable``, ``args.independent``,
    ``args.group_name`` and ``args.output``.

    With ``args.independent`` set, each chunk is paired with a read file and
    written to ``<args.output>/<read name>.txt`` (the directory is created
    if missing), or to stdout when no output is given. Otherwise all chunks
    are written to the single file ``args.output``, or to stdout.

    Problems are reported on stderr and the function returns None in every
    case.
    """
    try:
        reference = Genome.load_from_fasta(args.reference)[0].bases
    except FileNotFoundError:
        sys.stderr.write(
            "failed to process: reference {} doesn't exist\n".format(
                args.reference))
        return

    read_filenames = []
    for entry in os.listdir(args.read_basedir):
        path = os.path.join(args.read_basedir, entry)
        if not os.path.isdir(path) and path.endswith(".fast5"):
            read_filenames.append(path)

    chunks = estimate_snps(reference_filename=args.reference,
                           reads=read_filenames,
                           reference=reference,
                           config=args.configuration,
                           kmer_model=args.kmer_model,
                           bwa_executable=args.bwa_executable,
                           independent=args.independent,
                           group_name=args.group_name)

    if chunks is None:
        return

    if args.independent:
        if args.output:
            if not os.path.exists(args.output):
                os.makedirs(args.output)
            if not os.path.isdir(args.output):
                sys.stderr.write(
                    'Failed to create directory {}'
                    '(maybe a file with that name exists?)\n'.format(
                        args.output))
                return
        for chunk, read_filename in zip(chunks, read_filenames):
            if args.output:
                basename = os.path.splitext(os.path.basename(read_filename))[0]
                output_file = open(
                    os.path.join(args.output, basename + '.txt'), 'w')
            else:
                output_file = sys.stdout
            try:
                Chunk.print_head(output_file)
                chunk.print(output_file, reference)
            finally:
                # Close only files opened here, never stdout, and do so
                # even when printing fails.
                if args.output:
                    output_file.close()

    else:
        output_file = open(args.output, 'w') if args.output else sys.stdout
        try:
            Chunk.print_head(output_file)
            for chunk in chunks:
                chunk.print(output_file, reference)
        finally:
            if args.output:
                output_file.close()
Exemple #9
0
    def _estimate_log_likelihoods(self, reference, read):
        approximate_alignment = self.aligner.get_signal_alignment(
            read, self.bandwidth)
        if approximate_alignment is None:
            return None

        start_in_reference, end_in_reference = approximate_alignment.reference_range
        reference_part = reference[start_in_reference:end_in_reference]
        if approximate_alignment.reverse_complement:
            reference_part = Genome.reverse_complement(reference_part)
        reference_part = Genome.to_numerical(reference_part)

        start_in_signal, end_in_signal = approximate_alignment.signal_range
        signal = read.normalized_signal[start_in_signal:end_in_signal]
        context_before, context_after = \
            self._get_read_context(read,
                                   approximate_alignment.read_sequence_range)

        if self.tweak_signal_normalization:
            refined_alignment = nadavca.dtw.refine_alignment(
                signal=signal,
                reference=reference_part,
                context_before=context_before,
                context_after=context_after,
                approximate_alignment=approximate_alignment.alignment,
                bandwidth=self.bandwidth,
                min_event_length=self.min_event_length,
                kmer_model=self.kmer_model)

            refined_alignment = numpy.array(
                refined_alignment) + start_in_signal

            expected_signal = self.kmer_model.get_expected_signal(
                reference_part, context_before, context_after)
            read.tweak_signal_normalization(refined_alignment, expected_signal)
            signal = read.tweaked_normalized_signal[
                start_in_signal:end_in_signal]

        log_likelihoods = nadavca.dtw.estimate_log_likelihoods(
            signal=signal,
            reference=reference_part,
            context_before=context_before,
            context_after=context_after,
            approximate_alignment=approximate_alignment.alignment,
            bandwidth=self.bandwidth,
            min_event_length=self.min_event_length,
            kmer_model=self.kmer_model,
            model_wobbling=self.model_wobbling)

        log_likelihoods = numpy.array(log_likelihoods)
        log_likelihoods = self._normalize_log_likelihoods(
            log_likelihoods, reference_part)

        if approximate_alignment.reverse_complement:
            complement_likelihoods = numpy.zeros(log_likelihoods.shape,
                                                 dtype=numpy.float)
            for i, line in enumerate(log_likelihoods):
                for j in range(len(alphabet)):
                    complement_likelihoods[i][j] = line[
                        numerical_complement[j]]
            log_likelihoods = numpy.flipud(complement_likelihoods)

        return Chunk(start_in_reference, end_in_reference, log_likelihoods)
import os
import numpy as np
from nadavca.genome import Genome
from nadavca.read import Read

# Exactly five positional arguments are expected after the script name;
# otherwise print the usage line and stop.
if len(sys.argv) != 6:
    print("usage: {} reference repeats_file reads alignments output".format(
        sys.argv[0]))
    sys.exit(0)
(reference_filename, repeats_filename, reads_dir, alignments_dir,
 output_dir) = sys.argv[1:6]

# Length of every reference record, keyed by its description without the
# first character.
references = Genome.load_from_fasta(reference_filename)
ref_lengths = {
    record.description[1:]: len(record.bases)
    for record in references
}

# One all-False boolean mask per contig, as long as the contig.
reference_maps = {
    name: np.zeros(size, dtype=bool)
    for name, size in ref_lengths.items()
}

# Mark every interval listed in the repeats file on the per-contig masks.
# Each line is split on whitespace: token 0 is the contig name, tokens 1 and
# 2 are the integer start and end of the interval.
with open(repeats_filename, 'r') as file:
    for line in file:
        tokens = line.split()
        contig = tokens[0]
        start = int(tokens[1])
        # NOTE(review): a shift of start by one was disabled here; confirm
        # the coordinate convention of the repeats file.
        #start -= 1
        end = int(tokens[2])
        # Slice assignment: start is inclusive, end is exclusive.
        reference_maps[contig][start:end] = True