def filter_synthetic_sequences(self, reads):
    ''' Yield the reads that are not synthetic contaminants, tallying the
    lengths of the synthetic reads that get filtered out and recording the
    tallies via write_file.
    '''
    if self.synthetic_fasta:
        known_synthetics = [r.seq for r in fasta.reads(self.synthetic_fasta)]
    else:
        known_synthetics = []

    removed_lengths = np.zeros(self.max_read_length + 1)

    for read in reads:
        if not contaminants.is_synthetic(read, known_synthetics):
            yield read
        else:
            removed_lengths[len(read.seq)] += 1

    self.write_file('lengths', {'synthetic': removed_lengths})
def produce_sw_alignments(reads, genome_dirs, extra_targets):
    ''' For each read, yield (sanitized name, alignments), where alignments
    are the local and edge alignments of the read against every target
    sequence gathered from genome_dirs plus extra_targets.
    '''
    targets = []
    for genome_dir in genome_dirs:
        for fasta_fn in genomes.get_all_fasta_file_names(genome_dir):
            targets.extend(fasta.reads(fasta_fn))
    targets.extend(extra_targets)

    for read in reads:
        alignments = get_local_alignments(read, targets) + \
                     get_edge_alignments(read, targets)
        # bowtie2 only retains up to the first space in a qname, so do the same
        # here to allow qnames to be compared
        yield up_to_first_space(read.name), alignments
def get_oligo_hit_lengths(bam_fn, oligos_fasta_fn, oligos_sam_fn, max_read_length):
    ''' Count, for each oligo, the lengths of primary alignments that fall
    inside the regions the oligo maps to.

    NOTE(review): this definition appears twice in the file (once with
    double-quoted and once with single-quoted 'rb'); the copies are redundant.

    Returns a (num_oligos, max_read_length + 1) int array in which
    entry [i][l] is the number of primary alignments of inferred length l
    overlapping a mapping location of the i-th oligo.
    '''
    oligo_mappings = load_oligo_mappings(oligos_sam_fn)
    oligo_names = [read.name for read in fasta.reads(oligos_fasta_fn)]
    lengths = np.zeros((len(oligo_names), max_read_length + 1), int)

    bam_file = pysam.Samfile(bam_fn, "rb")
    try:
        for oligo_number, oligo_name in enumerate(oligo_names):
            for rname, start, end in oligo_mappings[oligo_name]:
                for aligned_read in bam_file.fetch(rname, start, end):
                    if not aligned_read.is_secondary:
                        # Can't use qlen here because the bam files omit
                        # the seq and qual of secondary mappings
                        lengths[oligo_number][aligned_read.inferred_length] += 1
    finally:
        # The original leaked the open BAM handle; always close it.
        bam_file.close()

    return lengths
def get_oligo_hit_lengths(bam_fn, oligos_fasta_fn, oligos_sam_fn, max_read_length):
    ''' Count, for each oligo, the lengths of primary alignments that fall
    inside the regions the oligo maps to.

    NOTE(review): this definition appears twice in the file (once with
    double-quoted and once with single-quoted 'rb'); the copies are redundant.

    Returns a (num_oligos, max_read_length + 1) int array in which
    entry [i][l] is the number of primary alignments of inferred length l
    overlapping a mapping location of the i-th oligo.
    '''
    oligo_mappings = load_oligo_mappings(oligos_sam_fn)
    oligo_names = [read.name for read in fasta.reads(oligos_fasta_fn)]
    lengths = np.zeros((len(oligo_names), max_read_length + 1), int)

    bam_file = pysam.Samfile(bam_fn, 'rb')
    try:
        for oligo_number, oligo_name in enumerate(oligo_names):
            for rname, start, end in oligo_mappings[oligo_name]:
                for aligned_read in bam_file.fetch(rname, start, end):
                    if not aligned_read.is_secondary:
                        # Can't use qlen here because the bam files omit
                        # the seq and qual of secondary mappings
                        lengths[oligo_number][aligned_read.inferred_length] += 1
    finally:
        # The original leaked the open BAM handle; always close it.
        bam_file.close()

    return lengths
def produce_sw_alignments(reads, genome_dirs, extra_targets, max_to_report=5):
    ''' For each read, yield (sanitized name, alignments) where alignments
    are the max_to_report highest-scoring local and edge alignments of the
    read against the target sequences gathered from genome_dirs plus
    extra_targets.
    '''
    targets = set()
    for genome_dir in genome_dirs:
        for fasta_fn in genomes.get_all_fasta_file_names(genome_dir):
            targets.update(fasta.reads(fasta_fn))
    targets.update(extra_targets)

    for read in reads:
        candidates = get_local_alignments(read, targets) + \
                     get_edge_alignments(read, targets)
        best_first = sorted(candidates, key=lambda a: a['score'], reverse=True)

        # bowtie2 only retains up to the first space in a qname, so do the same
        # here to allow qnames to be compared
        yield up_to_first_space(read.name), best_first[:max_to_report]
def test_new_synth():
    ''' Ad-hoc interactive debugging harness (Python 2): compares
    is_synthetic against is_synthetic_new on locally-trimmed reads from one
    hard-coded MiSeq fastq file, printing and pausing (raw_input) on each
    read that the old check flags as synthetic but the new one does not.
    '''
    import trim
    from Sequencing import fasta
    sfn = '/home/jah/projects/ribosomes/data/stephanie_markers/stephanie_markers.fa'
    synthetics = [read.seq for read in fasta.reads(sfn)]
    reads = fastq.reads('/home/jah/projects/ribosomes/experiments/belgium_2014_08_07/WT_1_FP/data/WT_1_FP.140731.MiSeq.FCA.lane1.R1.fastq')
    for read in reads:
        trim_at = trim.trim_by_local_alignment(read.seq)
        trimmed_seq = read.seq[:trim_at]
        trimmed_read = fasta.Read(read.name, trimmed_seq)
        old = is_synthetic(trimmed_read, synthetics)
        new = is_synthetic_new(trimmed_read, synthetics)
        # Only the old-positive/new-negative disagreements on nonempty
        # trimmed sequences are interesting here.
        if old and not new and trimmed_seq != '':
            print 'old is', old
            print 'new is', new
            print trimmed_seq
            # Wait for Enter before showing the next discrepancy.
            raw_input()
def test_new_synth():
    ''' Ad-hoc interactive debugging harness (Python 2): compares
    is_synthetic against is_synthetic_new on locally-trimmed reads from one
    hard-coded MiSeq fastq file, printing and pausing (raw_input) on each
    read that the old check flags as synthetic but the new one does not.

    NOTE(review): the file contains two copies of this function differing
    only in quote style; they are redundant.
    '''
    import trim
    from Sequencing import fasta
    sfn = "/home/jah/projects/ribosomes/data/stephanie_markers/stephanie_markers.fa"
    synthetics = [read.seq for read in fasta.reads(sfn)]
    reads = fastq.reads("/home/jah/projects/ribosomes/experiments/belgium_2014_08_07/WT_1_FP/data/WT_1_FP.140731.MiSeq.FCA.lane1.R1.fastq")
    for read in reads:
        trim_at = trim.trim_by_local_alignment(read.seq)
        trimmed_seq = read.seq[:trim_at]
        trimmed_read = fasta.Read(read.name, trimmed_seq)
        old = is_synthetic(trimmed_read, synthetics)
        new = is_synthetic_new(trimmed_read, synthetics)
        # Only the old-positive/new-negative disagreements on nonempty
        # trimmed sequences are interesting here.
        if old and not new and trimmed_seq != "":
            print "old is", old
            print "new is", new
            print trimmed_seq
            # Wait for Enter before showing the next discrepancy.
            raw_input()
def plot_oligo_hit_lengths(oligos_fasta_fn, lengths, fig_fn):
    ''' Plot, for each oligo, the per-oligo-normalized distribution of
    fragment lengths overlapping it, and save the figure to fig_fn.

    lengths - array of per-oligo length counts, one row per oligo.
    '''
    oligo_names = [read.name for read in fasta.reads(oligos_fasta_fn)]
    if len(oligo_names) == 0:
        # If no oligos have been defined, there is no picture to make.
        return None

    fig, ax = plt.subplots(figsize=(18, 12))

    for oligo_name, oligo_lengths, color in zip(oligo_names, lengths, colors):
        # Guard against dividing by zero when an oligo has no hits.
        denominator = np.maximum(oligo_lengths.sum(), 1)
        normalized_lengths = np.true_divide(oligo_lengths, denominator)
        ax.plot(normalized_lengths, 'o-', color=color, label=oligo_name)

    ax.legend(loc='upper right', framealpha=0.5)
    ax.set_xlim(0, lengths.shape[1] - 1)
    ax.set_xlabel('Length of original RNA fragment')
    # BUG FIX: the counts are normalized per oligo above, so the y axis shows
    # fractions, not raw counts; the label used to say 'Number of fragments'.
    ax.set_ylabel('Fraction of fragments')
    ax.set_title('Distribution of fragment lengths overlapping each oligo')

    fig.savefig(fig_fn)
    plt.close(fig)
def plot_oligo_hit_lengths(oligos_fasta_fn, lengths, fig_fn):
    ''' Plot, for each oligo, the per-oligo-normalized distribution of
    fragment lengths overlapping it, and save the figure to fig_fn.

    lengths - array of per-oligo length counts, one row per oligo.

    NOTE(review): this definition appears twice in the file, differing only
    in quote style; the copies are redundant.
    '''
    oligo_names = [read.name for read in fasta.reads(oligos_fasta_fn)]
    if len(oligo_names) == 0:
        # If no oligos have been defined, there is no picture to make.
        return None

    fig, ax = plt.subplots(figsize=(18, 12))

    for oligo_name, oligo_lengths, color in zip(oligo_names, lengths, colors):
        # Guard against dividing by zero when an oligo has no hits.
        denominator = np.maximum(oligo_lengths.sum(), 1)
        normalized_lengths = np.true_divide(oligo_lengths, denominator)
        ax.plot(normalized_lengths, "o-", color=color, label=oligo_name)

    ax.legend(loc="upper right", framealpha=0.5)
    ax.set_xlim(0, lengths.shape[1] - 1)
    ax.set_xlabel("Length of original RNA fragment")
    # BUG FIX: the counts are normalized per oligo above, so the y axis shows
    # fractions, not raw counts; the label used to say "Number of fragments".
    ax.set_ylabel("Fraction of fragments")
    ax.set_title("Distribution of fragment lengths overlapping each oligo")

    fig.savefig(fig_fn)
    plt.close(fig)
def visualize_unmapped(self):
    ''' Generate structure visualizations for the most common unmapped
    (non-long-polyA) sequences, writing the result to the
    'unmapped_structures' file.
    '''
    bowtie2_targets = [
        (self.file_names['genome'],
         self.file_names['bowtie2_index_prefix'],
         'C,20,0',
        ),
    ]
    sw_genome_dirs = [
        '/home/jah/genomes/truseq',
        '/home/jah/projects/crac/data/organisms/saccharomyces_cerevisiae/EF4/contaminant/fasta/',
    ]

    extra_targets = [fasta.Read('smRNA_linker', trim.smRNA_linker)]
    if self.synthetic_fasta:
        extra_targets.extend(fasta.reads(self.synthetic_fasta))

    def get_reads():
        # Turn each (sequence, count) pair into a fake maximum-quality fastq
        # read whose name records its rank and count.
        common = self.read_file('common_unmapped')['non_long_polyA']
        for rank, (seq, count) in enumerate(common.most_common()):
            name = '{0}_{1}'.format(rank, count)
            yield fastq.Read(name, seq, fastq.encode_sanger([40] * len(seq)))

    visualize_structure.visualize_unpaired_alignments(
        get_reads,
        sw_genome_dirs,
        extra_targets,
        bowtie2_targets,
        self.file_names['unmapped_structures'],
    )
def align_reads(
    target_fasta_fn,
    reads,
    bam_fn,
    min_path_length=15,
    error_fn='/dev/null',
    alignment_type='overlap',
):
    ''' Aligns reads to targets in target_fasta_fn by Smith-Waterman, storing
    alignments in bam_fn and yielding unaligned reads.

    Both the read and its reverse complement are aligned against every
    target; an alignment is kept when its path is at least min_path_length
    long and scores above 80% of the maximum possible score. For each read,
    only the group of equally highest-scoring alignments is written; the
    first member is primary, the rest secondary. Input/aligned/unaligned
    counts are written to error_fn.
    '''
    targets = {r.name: r.seq for r in fasta.reads(target_fasta_fn)}
    target_names = sorted(targets)
    target_lengths = [len(targets[n]) for n in target_names]
    alignment_sorter = sam.AlignmentSorter(
        target_names,
        target_lengths,
        bam_fn,
    )
    statistics = Counter()

    with alignment_sorter:
        for original_read in reads:
            statistics['input'] += 1
            alignments = []

            rc_read = fastq.Read(
                original_read.name,
                utilities.reverse_complement(original_read.seq),
                original_read.qual[::-1],
            )

            for read, is_reverse in ([original_read, False], [rc_read, True]):
                qual = fastq.decode_sanger(read.qual)
                for target_name, target_seq in targets.iteritems():
                    alignment = generate_alignments(read.seq, target_seq, alignment_type)[0]
                    path = alignment['path']
                    # Keep only alignments that are long enough and score at
                    # least 80% of the maximum possible (2 points/position).
                    if len(path) >= min_path_length and alignment['score'] / (2. * len(path)) > 0.8:
                        aligned_segment = pysam.AlignedSegment()
                        aligned_segment.seq = read.seq
                        aligned_segment.query_qualities = qual
                        aligned_segment.is_reverse = is_reverse

                        char_pairs = make_char_pairs(path, read.seq, target_seq)

                        # Build the CIGAR, soft-clipping any unaligned bases
                        # at either end of the read.
                        cigar = sam.aligned_pairs_to_cigar(char_pairs)
                        clip_from_start = first_query_index(path)
                        if clip_from_start > 0:
                            cigar = [(sam.BAM_CSOFT_CLIP, clip_from_start)] + cigar
                        clip_from_end = len(read.seq) - 1 - last_query_index(path)
                        if clip_from_end > 0:
                            cigar = cigar + [(sam.BAM_CSOFT_CLIP, clip_from_end)]
                        aligned_segment.cigar = cigar

                        read_aligned, ref_aligned = zip(*char_pairs)
                        md = sam.alignment_to_MD_string(ref_aligned, read_aligned)
                        aligned_segment.set_tag('MD', md)

                        aligned_segment.set_tag('AS', alignment['score'])
                        aligned_segment.tid = alignment_sorter.get_tid(target_name)
                        aligned_segment.query_name = read.name
                        aligned_segment.next_reference_id = -1
                        aligned_segment.reference_start = first_target_index(path)

                        alignments.append(aligned_segment)

            if alignments:
                statistics['aligned'] += 1

                sorted_alignments = sorted(alignments,
                                           key=lambda m: m.get_tag('AS'),
                                           reverse=True,
                                          )
                grouped = utilities.group_by(sorted_alignments,
                                             key=lambda m: m.get_tag('AS'),
                                            )
                # Only the equally highest-scoring alignments are written.
                _, highest_group = grouped.next()
                primary_already_assigned = False
                for alignment in highest_group:
                    if len(highest_group) == 1:
                        alignment.mapping_quality = 2
                    else:
                        alignment.mapping_quality = 1

                    if not primary_already_assigned:
                        primary_already_assigned = True
                    else:
                        alignment.is_secondary = True

                    alignment_sorter.write(alignment)
            else:
                statistics['unaligned'] += 1
                # BUG FIX: the original did `yield read`, but at this point
                # `read` is the leftover inner-loop variable, i.e. the
                # reverse-complement copy. The docstring promises the
                # unaligned reads themselves, so yield the original.
                yield original_read

    with open(error_fn, 'w') as error_fh:
        for key in ['input', 'aligned', 'unaligned']:
            error_fh.write('{0}: {1:,}\n'.format(key, statistics[key]))