def load_from_fast5(filename, basecall_group):
    """Load a single read from a fast5 (HDF5) file.

    The raw signal is taken from the first entry under ``Raw/Reads``; the
    event table and the FASTQ record are taken from
    ``<basecall_group>/BaseCalled_template``.

    Args:
        filename: path of the fast5 file to open (read-only).
        basecall_group: HDF5 path prefix under which the
            ``BaseCalled_template`` datasets are stored.

    Returns:
        A ``Read`` with ``raw_signal``, ``sequence_to_signal_mapping``,
        ``fastq`` and ``sequence`` populated.
    """
    read = Read()
    template_path = '{}/BaseCalled_template'.format(basecall_group)
    with h5py.File(filename, 'r') as file:
        read_group = list(file['Raw/Reads'].values())[0]
        # ``Dataset.value`` was removed in h5py 3.0; ``dataset[()]`` reads the
        # whole dataset and works on both old and new h5py releases.
        read.raw_signal = numpy.array(read_group['Signal'][()])
        events = file['{}/Events'.format(template_path)]
        read.sequence_to_signal_mapping = \
            Read._extract_sequence_to_signal_mapping(events)
        # The FASTQ record is stored as bytes, hence the explicit decode.
        read.fastq = file['{}/Fastq'.format(template_path)][()].decode(
            'ascii')
    read.sequence = Genome.create_from_fastq_string(read.fastq)[0].bases
    return read
def align_signal(reference_filename,
                 reads,
                 reference=None,
                 config=nadavca.defaults.CONFIG_FILE,
                 kmer_model=nadavca.defaults.KMER_MODEL_FILE,
                 bwa_executable=nadavca.defaults.BWA_EXECUTABLE,
                 group_name=nadavca.defaults.GROUP_NAME):
    """Compute a refined signal-to-reference alignment for every read.

    Args:
        reference_filename: path of the reference FASTA; handed to the
            approximate aligner and used to load ``reference`` when that
            argument is None.
        reads: either a directory path (every non-directory entry ending in
            ``.fast5`` is loaded) or a sequence whose items are fast5 paths
            and/or already loaded reads.
        reference: reference bases; loaded from ``reference_filename`` (first
            record) when None.
        config: configuration object, or the path of a YAML file to load it
            from.
        kmer_model: k-mer model object, or the path of an HDF5 file to load
            it from.
        bwa_executable: passed through to ``ApproximateAligner``.
        group_name: basecall group passed to ``Read.load_from_fast5``.

    Returns:
        A list with one ``estimator.get_refined_alignment`` result per read,
        or None when the config, k-mer model or reference file is missing
        (a message is written to stderr in that case).
    """
    if isinstance(config, str):
        try:
            with open(config, 'r') as file:
                # yaml.load() without an explicit Loader is deprecated since
                # PyYAML 5.1 and raises TypeError since 6.0; safe_load also
                # avoids constructing arbitrary Python objects from the file.
                config = yaml.safe_load(file)
        except FileNotFoundError:
            sys.stderr.write(
                'failed to load config: {} not found\n'.format(config))
            return None

    if isinstance(kmer_model, str):
        try:
            kmer_model = KmerModel.load_from_hdf5(kmer_model)
        except FileNotFoundError:
            # kmer_model still holds the path here: the assignment above did
            # not happen because the load raised.
            sys.stderr.write(
                'failed to load k-mer model: {} not found\n'.format(
                    kmer_model))
            return None

    if reference is None:
        try:
            reference = Genome.load_from_fasta(reference_filename)[0].bases
        except FileNotFoundError:
            sys.stderr.write(
                "failed to process: reference {} doesn't exist\n".format(
                    reference_filename))
            return None

    approximate_aligner = ApproximateAligner(bwa_executable, reference,
                                             reference_filename)
    estimator = ProbabilityEstimator(kmer_model, approximate_aligner, config)

    if isinstance(reads, str):
        read_basedir = reads
        reads = []
        for file in os.listdir(read_basedir):
            path = os.path.join(read_basedir, file)
            if not os.path.isdir(path) and path.endswith(".fast5"):
                reads.append(path)

    # Work on a private list so the caller's sequence is never mutated;
    # list() (rather than reads[:]) also accepts tuples and other iterables.
    reads = list(reads)
    for i, read in enumerate(reads):
        if isinstance(read, str):
            reads[i] = Read.load_from_fast5(read, group_name)

    Read.normalize_reads(reads)

    return [estimator.get_refined_alignment(reference, read) for read in reads]
def get_refined_alignment(self, reference, read):
    """Refine the approximate alignment of ``read`` against ``reference``.

    Args:
        reference: reference sequence; sliced with the range reported by the
            approximate aligner.
        read: read whose ``normalized_signal`` is aligned.

    Returns:
        None when the aligner produces no approximate alignment. Otherwise an
        integer array of shape ``(len(refined_alignment), 2)``: column 0 is
        the position in the read's full normalized signal, column 1 is the
        position in ``reference``.
    """
    approximate_alignment = self.aligner.get_signal_alignment(
        read, self.bandwidth)
    if approximate_alignment is None:
        return None

    start_in_reference, end_in_reference = \
        approximate_alignment.reference_range
    reference_part = reference[start_in_reference:end_in_reference]
    if approximate_alignment.reverse_complement:
        reference_part = Genome.reverse_complement(reference_part)
    reference_part = Genome.to_numerical(reference_part)

    start_in_signal, end_in_signal = approximate_alignment.signal_range
    signal = read.normalized_signal[start_in_signal:end_in_signal]

    context_before, context_after = \
        self._get_read_context(read,
                               approximate_alignment.read_sequence_range)

    refined_alignment = nadavca.dtw.refine_alignment(
        signal=signal,
        reference=reference_part,
        context_before=context_before,
        context_after=context_after,
        approximate_alignment=approximate_alignment.alignment,
        bandwidth=self.bandwidth,
        min_event_length=self.min_event_length,
        kmer_model=self.kmer_model)

    result = numpy.zeros((len(refined_alignment), 2), dtype=int)
    for reference_position, signal_position in enumerate(refined_alignment):
        # Signal positions are relative to the extracted window; shift them
        # back into coordinates of the whole signal.
        result[reference_position][0] = signal_position + start_in_signal
        if approximate_alignment.reverse_complement:
            # reference_part was reverse-complemented, so its index i maps to
            # forward index end_in_reference - 1 - i (end_in_reference is an
            # exclusive slice bound). The previous "end_in_reference - i" was
            # off by one and could point one past the aligned range.
            result[reference_position][1] = \
                end_in_reference - reference_position - 1
        else:
            result[reference_position][1] = \
                start_in_reference + reference_position
    return result
def _write_alignment(output_file, alignment):
    """Write one alignment as a tab-separated table with a header line."""
    output_file.write("signal_position\treference_position\n")
    for line in alignment:
        output_file.write("{}\t{}\n".format(line[0], line[1]))


def align_signal_command(args):
    """Command-line entry point: align every fast5 read in a directory.

    Reads ``args.reference``, ``args.read_basedir``, ``args.configuration``,
    ``args.kmer_model``, ``args.bwa_executable``, ``args.group_name`` and
    ``args.output``. When ``args.output`` is set it is used as a directory
    (created if missing) receiving one ``<read basename>.txt`` per read;
    otherwise all tables are written to stdout.

    Returns None in every case; failures are reported on stderr.
    """
    try:
        reference = Genome.load_from_fasta(args.reference)[0].bases
    except FileNotFoundError:
        sys.stderr.write(
            "failed to process: reference {} doesn't exist\n".format(
                args.reference))
        return

    read_filenames = []
    for file in os.listdir(args.read_basedir):
        path = os.path.join(args.read_basedir, file)
        if not os.path.isdir(path) and path.endswith(".fast5"):
            read_filenames.append(path)

    alignments = align_signal(reference_filename=args.reference,
                              reads=read_filenames,
                              reference=reference,
                              config=args.configuration,
                              kmer_model=args.kmer_model,
                              bwa_executable=args.bwa_executable,
                              group_name=args.group_name)
    # align_signal returns None when it could not load one of its inputs (it
    # has already reported the reason); zip(None, ...) would raise TypeError.
    if alignments is None:
        return

    if args.output:
        if not os.path.exists(args.output):
            os.makedirs(args.output)
        if not os.path.isdir(args.output):
            sys.stderr.write('Failed to create directory {}'
                             '(maybe a file with that name exists?)\n'.format(
                                 args.output))
            return

    for alignment, read_filename in zip(alignments, read_filenames):
        # A read without an alignment yields None; skip it instead of
        # crashing while iterating over it.
        if alignment is None:
            sys.stderr.write(
                'no alignment found for {}\n'.format(read_filename))
            continue
        if args.output:
            basename = os.path.splitext(os.path.basename(read_filename))[0]
            output_path = os.path.join(args.output, basename + '.txt')
            # "with" closes the file even if writing raises.
            with open(output_path, 'w') as output_file:
                _write_alignment(output_file, alignment)
        else:
            _write_alignment(sys.stdout, alignment)
def get_signal_alignment(self, read, bandwidth):
    """Build an approximate signal-to-reference alignment for ``read``.

    The base-level alignment is converted to signal coordinates, both mapping
    columns are shifted so they are relative to the returned ranges, and the
    signal range is widened by ``bandwidth`` samples on each side (clamped to
    the bounds of ``read.normalized_signal``).

    Returns:
        None when there is no base alignment or the converted mapping is
        empty; otherwise an ``ApproximateSignalAlignment``.
    """
    aligned = self._get_base_alignment(read)
    if aligned is None:
        return None
    base_mapping, is_reverse_complement, contig_name = aligned

    signal_mapping = self.convert_mapping(base_mapping, read)
    if not len(signal_mapping):
        return None

    # Reference span covered by the mapping (end exclusive); column 1 is then
    # made relative to the start of that span.
    ref_start = signal_mapping[0, 1]
    ref_end = signal_mapping[-1, 1] + 1
    signal_mapping[:, 1] -= ref_start

    reference = self._get_reference_contig(contig_name)
    if is_reverse_complement:
        # Mirror the span around the contig length.
        contig_length = len(reference)
        ref_start, ref_end = contig_length - ref_end, contig_length - ref_start

    # Signal span, padded by the band width and clamped to the signal bounds;
    # column 0 is made relative to the padded start.
    padded_signal_start = max(0, signal_mapping[0, 0] - bandwidth)
    padded_signal_end = min(len(read.normalized_signal),
                            signal_mapping[-1, 0] + 1 + bandwidth)
    signal_mapping[:, 0] -= padded_signal_start

    reference_part = reference[ref_start:ref_end]
    if is_reverse_complement:
        reference_part = Genome.reverse_complement(reference_part)

    return ApproximateSignalAlignment(
        alignment=signal_mapping,
        signal_range=(padded_signal_start, padded_signal_end),
        reference_range=(ref_start, ref_end),
        read_sequence_range=(base_mapping[0][0], base_mapping[-1][0] + 1),
        reverse_complement=is_reverse_complement,
        reference_part=reference_part,
        contig_name=contig_name)
def get_refined_alignment(self, read):
    """Refine the approximate alignment of ``read`` into per-base events.

    Returns:
        None when no approximate alignment exists or the refinement yields an
        empty path. Otherwise a tuple ``(approximate_alignment, table)``
        where ``table`` is an integer array with one row per reference
        position: column 0 is the reference position, columns 1 and 2 are the
        event start and end in the read's full normalized signal.
    """
    approximate_alignment = self.aligner.get_signal_alignment(
        read, self.bandwidth)
    if approximate_alignment is None:
        return None

    ref_start, ref_end = approximate_alignment.reference_range
    signal_start, signal_end = approximate_alignment.signal_range
    numerical_reference = Genome.to_numerical(
        approximate_alignment.reference_part)
    signal_window = read.normalized_signal[signal_start:signal_end]
    context_before, context_after = self._get_read_context(
        read, approximate_alignment.read_sequence_range)

    refined_alignment = nadavca.dtw.refine_alignment(
        signal=signal_window,
        reference=numerical_reference,
        context_before=context_before,
        context_after=context_after,
        approximate_alignment=approximate_alignment.alignment,
        bandwidth=self.bandwidth,
        min_event_length=self.min_event_length,
        kmer_model=self.kmer_model,
        model_transitions=self.model_transitions)

    # There was no valid path in our band (bad alignment)
    if len(refined_alignment) == 0:
        return None

    table = numpy.zeros((len(refined_alignment), 3), dtype=int)
    for index, (event_start, event_end) in enumerate(refined_alignment):
        if approximate_alignment.reverse_complement:
            # Index 0 of the reversed part is the last forward position.
            table[index, 0] = ref_end - index - 1
        else:
            table[index, 0] = ref_start + index
        # Event bounds are relative to the window; shift to full-signal
        # coordinates.
        table[index, 1] = event_start + signal_start
        table[index, 2] = event_end + signal_start
    return approximate_alignment, table
def _get_base_alignment(self, read):
    """Align the basecalled read to the reference and map matching bases.

    Uses ``self.bwapy_aligner`` when it is set; otherwise runs
    ``self.bwa_executable mem`` on a temporary FASTQ file and parses the
    first record of the resulting SAM file.

    Returns:
        None when the read is unmapped. Otherwise a tuple
        ``(base_mapping, is_reverse_complement)`` where ``base_mapping`` is
        an integer array of ``(index_in_read, index_in_reference)`` pairs for
        the positions inside ``M`` operations at which the read base equals
        the reference base.

    Raises:
        ValueError: on a CIGAR operation other than S, M, D or I.
        subprocess.CalledProcessError: when the bwa process fails.
    """
    if self.bwapy_aligner:
        alignments = self.bwapy_aligner.align_seq(''.join(read.sequence))
        if len(alignments) == 0:
            return None
        alignment = alignments[0]
        cigar = alignment.cigar
        is_reverse_complement = alignment.orient == '-'
        mapped_position = alignment.pos
    else:
        temporary_files = []
        try:
            with tempfile.NamedTemporaryFile(mode='w',
                                             delete=False,
                                             prefix='nadavca_tmp',
                                             suffix='.fastq') as file:
                read_fastq_filename = file.name
                temporary_files.append(read_fastq_filename)
                file.write(read.fastq)

            # Only a unique file name is needed here; bwa writes the content.
            with tempfile.NamedTemporaryFile(delete=False,
                                             prefix='nadavca_tmp',
                                             suffix='.sam') as file:
                bwa_output_filename = file.name
                temporary_files.append(bwa_output_filename)

            subprocess.run([
                self.bwa_executable, 'mem', self.reference_filename,
                read_fastq_filename, '-o', bwa_output_filename
            ],
                           stderr=subprocess.PIPE,
                           check=True)

            with simplesam.Reader(open(bwa_output_filename, 'r')) as reader:
                sam = reader.next()
                if not sam.mapped:
                    return None
                cigar = sam.cigar
                is_reverse_complement = sam.reverse
                # The SAM position is decremented to get a 0-based index.
                mapped_position = sam.pos - 1
        finally:
            # Clean up on every exit path: previously the unmapped early
            # return and a failing bwa call left the temporary files behind.
            for path in temporary_files:
                if os.path.exists(path):
                    os.remove(path)

    oriented_read = Genome.reverse_complement(
        read.sequence) if is_reverse_complement else read.sequence

    index_in_read = 0
    index_in_reference = mapped_position
    base_mapping = []
    for num, operation in self._parse_cigar(cigar):
        if operation == 'S':
            index_in_read += num
        elif operation == 'M':
            # 'M' covers matches and mismatches; keep only equal bases.
            for _ in range(num):
                if self.reference[index_in_reference] == oriented_read[
                        index_in_read]:
                    base_mapping.append((index_in_read, index_in_reference))
                index_in_read += 1
                index_in_reference += 1
        elif operation == 'D':
            index_in_reference += num
        elif operation == 'I':
            index_in_read += num
        else:
            raise ValueError(
                'Unknown cigar operation: {}'.format(operation))

    if is_reverse_complement:
        # Mirror both coordinates and reverse the pair order.
        read_length = len(read.sequence)
        reference_length = len(self.reference)
        base_mapping = [(read_length - 1 - in_read,
                         reference_length - 1 - in_reference)
                        for in_read, in_reference in reversed(base_mapping)]

    # numpy.int was only an alias of the builtin int and was removed in
    # NumPy 1.24; dtype=int is the exact equivalent.
    return numpy.array(base_mapping, dtype=int), is_reverse_complement
def estimate_snps_command(args):
    """Command-line entry point: estimate SNPs for the reads in a directory.

    In independent mode (``args.independent``) each chunk is printed to its
    own ``<read basename>.txt`` inside the ``args.output`` directory, or to
    stdout when no output is given. Otherwise all chunks go to the single
    file ``args.output``, or to stdout when no output is given.

    Returns None in every case; failures are reported on stderr.
    """
    try:
        reference = Genome.load_from_fasta(args.reference)[0].bases
    except FileNotFoundError:
        message = "failed to process: reference {} doesn't exist\n"
        sys.stderr.write(message.format(args.reference))
        return

    # Keep every non-directory entry that ends in ".fast5".
    candidate_paths = (os.path.join(args.read_basedir, entry)
                       for entry in os.listdir(args.read_basedir))
    read_filenames = [
        candidate for candidate in candidate_paths
        if not os.path.isdir(candidate) and candidate.endswith(".fast5")
    ]

    chunks = estimate_snps(reference_filename=args.reference,
                           reads=read_filenames,
                           reference=reference,
                           config=args.configuration,
                           kmer_model=args.kmer_model,
                           bwa_executable=args.bwa_executable,
                           independent=args.independent,
                           group_name=args.group_name)
    if chunks is None:
        return

    if not args.independent:
        # Joint mode: a single header followed by every chunk.
        output_file = open(args.output, 'w') if args.output else sys.stdout
        Chunk.print_head(output_file)
        for chunk in chunks:
            chunk.print(output_file, reference)
        if args.output:
            output_file.close()
        return

    # Independent mode: one output per read.
    if args.output:
        if not os.path.exists(args.output):
            os.makedirs(args.output)
        if not os.path.isdir(args.output):
            sys.stderr.write(
                'Failed to create directory {}'
                '(maybe a file with that name exists?)\n'.format(
                    args.output))
            return

    for chunk, read_filename in zip(chunks, read_filenames):
        if args.output:
            stem, _ = os.path.splitext(os.path.basename(read_filename))
            output_file = open(os.path.join(args.output, stem + '.txt'), 'w')
        else:
            output_file = sys.stdout
        Chunk.print_head(output_file)
        chunk.print(output_file, reference)
        if args.output:
            output_file.close()
def _estimate_log_likelihoods(self, reference, read):
    """Estimate per-position log-likelihoods for the aligned part of a read.

    Args:
        reference: reference sequence; sliced with the range reported by the
            approximate aligner.
        read: read whose normalized signal is scored.

    Returns:
        None when the aligner produces no approximate alignment; otherwise a
        ``Chunk`` built from the reference range and the normalized
        log-likelihoods. For reverse-complement alignments the columns are
        permuted through ``numerical_complement`` and the rows are flipped.
    """
    approximate_alignment = self.aligner.get_signal_alignment(
        read, self.bandwidth)
    if approximate_alignment is None:
        return None

    start_in_reference, end_in_reference = \
        approximate_alignment.reference_range
    reference_part = reference[start_in_reference:end_in_reference]
    if approximate_alignment.reverse_complement:
        reference_part = Genome.reverse_complement(reference_part)
    reference_part = Genome.to_numerical(reference_part)

    start_in_signal, end_in_signal = approximate_alignment.signal_range
    signal = read.normalized_signal[start_in_signal:end_in_signal]

    context_before, context_after = \
        self._get_read_context(read,
                               approximate_alignment.read_sequence_range)

    if self.tweak_signal_normalization:
        # Refine the alignment first, re-normalize the signal against the
        # expected signal, then score the tweaked signal instead.
        refined_alignment = nadavca.dtw.refine_alignment(
            signal=signal,
            reference=reference_part,
            context_before=context_before,
            context_after=context_after,
            approximate_alignment=approximate_alignment.alignment,
            bandwidth=self.bandwidth,
            min_event_length=self.min_event_length,
            kmer_model=self.kmer_model)
        # Shift window-relative positions to full-signal coordinates.
        refined_alignment = numpy.array(refined_alignment) + start_in_signal
        expected_signal = self.kmer_model.get_expected_signal(
            reference_part, context_before, context_after)
        read.tweak_signal_normalization(refined_alignment, expected_signal)
        signal = read.tweaked_normalized_signal[
            start_in_signal:end_in_signal]

    log_likelihoods = nadavca.dtw.estimate_log_likelihoods(
        signal=signal,
        reference=reference_part,
        context_before=context_before,
        context_after=context_after,
        approximate_alignment=approximate_alignment.alignment,
        bandwidth=self.bandwidth,
        min_event_length=self.min_event_length,
        kmer_model=self.kmer_model,
        model_wobbling=self.model_wobbling)
    log_likelihoods = numpy.array(log_likelihoods)
    log_likelihoods = self._normalize_log_likelihoods(
        log_likelihoods, reference_part)

    if approximate_alignment.reverse_complement:
        # numpy.float was only an alias of the builtin float and was removed
        # in NumPy 1.24; dtype=float is the exact equivalent.
        complement_likelihoods = numpy.zeros(log_likelihoods.shape,
                                             dtype=float)
        for i, line in enumerate(log_likelihoods):
            for j in range(len(alphabet)):
                complement_likelihoods[i][j] = line[numerical_complement[j]]
        # Flip the row order of the complemented table.
        log_likelihoods = numpy.flipud(complement_likelihoods)

    return Chunk(start_in_reference, end_in_reference, log_likelihoods)
import os
import sys  # used below for argv/exit; it was missing from the import list

import numpy as np

from nadavca.genome import Genome
from nadavca.read import Read

# Expect exactly five positional arguments after the script name.
if len(sys.argv) != 6:
    print("usage: {} reference repeats_file reads alignments output".format(
        sys.argv[0]),
          file=sys.stderr)
    # A usage error must not report success to the calling shell.
    sys.exit(1)

reference_filename = sys.argv[1]
repeats_filename = sys.argv[2]
reads_dir = sys.argv[3]
alignments_dir = sys.argv[4]
output_dir = sys.argv[5]

references = Genome.load_from_fasta(reference_filename)
# Contig names are the record descriptions without their first character.
ref_lengths = {r.description[1:]: len(r.bases) for r in references}

# One boolean mask per contig; True marks positions inside a listed repeat.
reference_maps = {
    contig_name: np.zeros(length, dtype=bool)
    for contig_name, length in ref_lengths.items()
}

with open(repeats_filename, 'r') as file:
    for line in file:
        tokens = line.split()
        # Skip blank lines (e.g. a trailing newline at the end of the file).
        if not tokens:
            continue
        contig = tokens[0]
        # start/end are used directly as a slice, i.e. treated as 0-based
        # with an exclusive end.
        start = int(tokens[1])
        end = int(tokens[2])
        reference_maps[contig][start:end] = True