def sort_subreads(subread_file, reference_file):
    """
    Map every aligned subread to the reference it hit.

    The alignment is cached at the relative path 'temp2.m1': when
    valid_file() accepts that file it is re-used untouched, otherwise
    align_best_reference() is run to generate it first.

    Returns a dict of {hit.qname: hit.tname} built from every record that
    BlasrReader yields for the cached file.  When a qname occurs more than
    once, the last record read wins.
    """
    log.info("Aligning subreads to the two best references")
    cached_alignment = 'temp2.m1'
    # Only pay for the alignment when no usable cached result exists
    if not valid_file(cached_alignment):
        align_best_reference(subread_file, reference_file, cached_alignment)
    assignments = {}
    for hit in BlasrReader(cached_alignment):
        assignments[hit.qname] = hit.tname
    return assignments
def parse_trims( blasr_file, window ):
    """
    Compute a padded (start, end) query interval for each Blasr record.

    For every record the aligned query span [qstart, qend] is widened by
    `window` on both sides and then clamped to [0, qlength].

    Returns a dict of {record.qname: (start, end)}; a later record with the
    same qname overwrites an earlier one.
    """
    trims = {}
    for hit in BlasrReader( blasr_file ):
        padded_start = int(hit.qstart) - window
        padded_end = int(hit.qend) + window
        # Never let the padding run off either end of the query
        clamped = (max(padded_start, 0), min(padded_end, int(hit.qlength)))
        trims[hit.qname] = clamped
    return trims
def create_chimeras(input_file, output=None, reference_file=None, alignment_file=None):
    """
    Build chimeric sequences from a Fasta file and write them to a Fasta.

    The input sequences are grouped by locus from their Blasr alignments
    (_group_by_locus, then _filter_groups), and _create_chimeras() is run on
    the filtered groups plus the input records; its results are written with
    write_fasta().

    Args:
        input_file: path of the Fasta file holding the source sequences.
        output: path of the Fasta to write; defaults to the input path with
            its extension replaced by '.chimeras.fasta'.
        reference_file: reference to align against when no alignment is given.
        alignment_file: existing Blasr alignment of input_file; takes
            precedence over reference_file when both are supplied.

    Returns:
        The path of the file that was written.

    Raises:
        IOError: if neither an alignment nor a reference was supplied.
    """
    import os

    # Check the input files, and align the input file if needed
    if alignment_file is None:
        if not reference_file:
            msg = "create_chimeras requires either an Alignment or a Reference!"
            log.error(msg)
            raise IOError(msg)
        alignment_file = align_best_reference(input_file, reference_file)

    # Set the output file if not specified.  splitext only strips a real
    # extension, so extension-less names and dotted directory names are safe.
    if output is None:
        basename = os.path.splitext(input_file)[0]
        output = '%s.chimeras.fasta' % basename

    # Parse the alignment data and extract the target sequences
    alignments = list(BlasrReader(alignment_file))
    groups = _group_by_locus(alignments)
    groups = _filter_groups(groups)
    sequences = list(FastaReader(input_file))
    chimeras = list(_create_chimeras(groups, sequences))
    write_fasta(chimeras, output)
    return output
def extract_alleles(input_file, output_file=None, reference_file=None, alignment_file=None, method=METHOD, sort=SORT, loci=LOCI):
    """
    Pick the top Amplicon Analysis consensus sequences per group from a file.

    The sequences are grouped from their Blasr alignments according to
    `method`, ranked inside each group by the `sort` metric, passed through
    _select_sequences() (per the original description, the top 2 per group)
    and the chosen records are written to `output_file`.

    Args:
        input_file: sequence file to select from.
        output_file: destination; defaults to _get_output_file(input_file).
        reference_file: reference handed to get_alignment_file().
        alignment_file: existing alignment handed to get_alignment_file().
        method: grouping policy - 'locus', 'barcode', 'both' or 'all'.
            Falsy values fall back to METHOD.
        sort: ranking metric - 'num_reads' or 'accuracy'.  Falsy values fall
            back to SORT.
        loci: loci passed to the locus-aware groupers.  Falsy values fall
            back to LOCI.

    Returns:
        The path of the output file.

    Raises:
        ValueError: on an unknown `method` or `sort`, or when 'accuracy'
            sorting is requested for an input that is not a Fastq.
    """
    # Treat None/empty arguments as "use the module default"
    method = method or METHOD
    sort = sort or SORT
    loci = loci or LOCI

    # Set the output file if not specified
    output_file = output_file or _get_output_file(input_file)
    output_type = get_file_type(output_file)

    # Align to the reference (if needed) for breaking ties
    alignment_file = get_alignment_file(input_file, reference_file, alignment_file)
    alignments = list(BlasrReader(alignment_file))

    # Run the appropriate grouping
    if method == 'locus':
        groups = _group_by_locus(alignments, loci)
    elif method == 'barcode':
        groups = _group_by_barcode(alignments)
    elif method == 'both':
        groups = _group_by_both(alignments, loci)
    elif method == 'all':
        groups = {a.qname: [a] for a in alignments}
    else:
        msg = "Invalid Selection Metric: %s" % method
        log.error(msg)
        raise ValueError(msg)

    # Read the input sequences and use them to generate our sorting data
    sequences = read_sequences(input_file)
    if sort == 'num_reads':
        sorting_data = {s.name: consensus_size(s) for s in sequences}
    elif sort == 'accuracy':
        # Explicit check instead of an assert, which "python -O" would strip
        input_type = get_file_type(input_file)
        if input_type != 'fastq':
            msg = 'Sorting by "accuracy" requires Fastq input, not "%s"' % input_type
            log.error(msg)
            raise ValueError(msg)
        sorting_data = {s.name: record_accuracy(s) for s in sequences}
    else:
        msg = "Invalid Sorting Metric: %s" % sort
        log.error(msg)
        raise ValueError(msg)

    log.info('Sorting sequences for selection according to "%s"' % sort)
    ordered = _sort_groups(groups, sorting_data)

    log.info('Selecting top sequences from %s according to the "%s" policy' % (input_file, method))
    selected = list(_select_sequences(ordered))
    log.info('Selected %s sequences from %s total for further analysis' % (len(selected), len(sequences)))

    log.info('Writing the selected sequences out to %s' % output_file)
    subset = list(_subset_sequences(sequences, selected))
    _write_output(subset, output_file, output_type)
    return output_file
def hits_by_reference(alignment):
    """
    Group the hits of a Blasr alignment file by the reference they hit.

    Returns a plain dict of {hit.tname: [hit, ...]}, with the hits of each
    reference kept in the order BlasrReader yielded them.
    """
    by_reference = {}
    for hit in BlasrReader(alignment):
        # setdefault replaces the former bare "except:", which would also
        # have swallowed unrelated errors such as KeyboardInterrupt
        by_reference.setdefault(hit.tname, []).append(hit)
    return by_reference
def _identify_reversed_sequences(blasr_file):
    """
    Identify hits where the query and reference have different orientations.

    Returns the set of qnames of all records whose qstrand differs from
    their tstrand.
    """
    return {hit.qname
            for hit in BlasrReader(blasr_file)
            if hit.qstrand != hit.tstrand}
def _parse_exon_location(alignment_file):
    """
    Parse the most likely Exon location from an Exon-Fasta alignment.

    The hits are ranked by descending float(pctsimilarity); ties are broken
    by ascending int(score), then by file order.  Returns the
    (tstart, tend) of the top-ranked hit as a pair of ints.

    Raises IndexError if the alignment file yields no records.
    """
    def _score(hit):
        return int(hit.score)

    def _similarity(hit):
        return float(hit.pctsimilarity)

    hits = list(BlasrReader(alignment_file))
    # Two stable sorts: the secondary key goes first, the primary key last
    hits.sort(key=_score)
    hits.sort(key=_similarity, reverse=True)
    best = hits[0]
    return int(best.tstart), int(best.tend)
def order_references(subread_file, reference_file):
    """
    Rank the reference sequences by how many subread hits each received.

    The alignment is cached at the relative path 'temp.m1' and only
    generated with align_best_reference() when valid_file() rejects the
    existing file.

    Returns the list of every reference name (hit.tname) seen in the
    alignment, ordered from most to fewest hits.
    """
    log.info("Selecting the best references sequences to use")
    cached_alignment = 'temp.m1'
    if not valid_file(cached_alignment):
        align_best_reference(subread_file, reference_file, cached_alignment)
    hit_counts = Counter()
    for hit in BlasrReader(cached_alignment):
        hit_counts[hit.tname] += 1
    return [name for name, _ in hit_counts.most_common()]
def _parse_blasr_alignment(blasr_file):
    """
    Summarize a Blasr file as {base query name: [reference name, identity]}.

    For BlasrM1 records the identity is the record's pctsimilarity field,
    stored unchanged.  For BlasrM5 records it is computed as
    100 * nmat / (nmat + nmis + nins + ndel).  Records of any other type are
    skipped.  A later record for the same base name replaces an earlier one.
    """
    results = {}
    for hit in BlasrReader(blasr_file):
        name = get_base_sequence_name(hit.qname)
        if isinstance(hit, BlasrM1):
            identity = hit.pctsimilarity
        elif isinstance(hit, BlasrM5):
            diffs = int(hit.nmis) + int(hit.nins) + int(hit.ndel)
            matches = int(hit.nmat)
            # float() keeps this a true division under Python 2
            identity = 100 * matches / float(matches + diffs)
        else:
            continue
        results[name] = [hit.tname, identity]
    return results
def _parse_alignment(alignment):
    """
    Parse the genomic typings from the gDNA alignment.

    Returns a plain dict of {record.qname: [record, ...]} holding every
    record BlasrReader yields, grouped by query name in file order.
    """
    hits = {}
    for record in BlasrReader(alignment):
        # setdefault replaces the former bare "except:", which would also
        # have swallowed unrelated errors such as KeyboardInterrupt
        hits.setdefault(record.qname, []).append(record)
    return hits
def parse_alignment_positions(alignment_file):
    """
    Convert each Blasr hit into a (left, right) pair of position dicts.

    `left` describes the query: its name, a fixed start of 1 and an end of
    int(qstart).  `right` describes the reference: its name, a start of
    int(tstart) and an end of int(tlength).

    Returns the list of pairs in file order.
    """
    return [
        (
            {'name': hit.qname, 'start': 1, 'end': int(hit.qstart)},
            {'name': hit.tname,
             'start': int(hit.tstart),
             'end': int(hit.tlength)},
        )
        for hit in BlasrReader(alignment_file)
    ]
def _align_sequences(query, reference):
    """
    Align one fasta file of sequences to another.

    The alignment is written to a temporary '.m1' file that is always
    removed again, including when the aligner or the parser raises.

    Returns the list of hits read by BlasrReader, or None when valid_file()
    rejects the alignment output.
    """
    temp = NamedTemporaryFile(suffix='.m1', delete=False)
    # Only the reserved path is needed; release our own handle right away
    # so it is not leaked and cannot interfere with the aligner's writes
    temp.close()
    try:
        align_best_reference(query, reference, output=temp.name)
        if not valid_file(temp.name):
            return None
        return list(BlasrReader(temp.name))
    finally:
        # Guarded so a missing file cannot mask an in-flight exception
        if os.path.exists(temp.name):
            os.unlink(temp.name)
def _parse_alignment(alignment):
    """
    Parse the location of each hit in the alignment file.

    Returns a dict of {entry.qname: (start, end, entry.tname)}.  For entries
    whose tstrand is the string '1' the coordinates are mirrored against
    tlength (start = tlength - tend, end = tlength - tstart); otherwise
    tstart and tend are used as they are.  A later entry with the same qname
    overwrites an earlier one.
    """
    locations = {}
    for hit in BlasrReader(alignment):
        start, end = int(hit.tstart), int(hit.tend)
        if hit.tstrand == '1':
            # Flip the interval onto the opposite strand's coordinates
            length = int(hit.tlength)
            start, end = length - end, length - start
        locations[hit.qname] = (start, end, hit.tname)
    return locations
def parse_alignment_positions(alignment_file):
    """
    Convert each Blasr hit into a dict of its names, coordinates and strings.

    Every dict holds the query name ('name'), the reference name ('ref'),
    the integer query and target start/end positions, and the record's
    qstring and tstring fields.

    Returns the list of dicts in file order.
    """
    return [
        {
            'name': hit.qname,
            'ref': hit.tname,
            'qstart': int(hit.qstart),
            'qend': int(hit.qend),
            'tstart': int(hit.tstart),
            'tend': int(hit.tend),
            'qstring': hit.qstring,
            'tstring': hit.tstring,
        }
        for hit in BlasrReader(alignment_file)
    ]
def _parse_orientation(filename):
    """
    Parse the orientations of a list of sequences from a Blasr alignment file.

    Returns a dict of {record.qname: 'forward' | 'reverse'}, where 'forward'
    means the record's qstrand equals its tstrand.

    Raises ValueError (after logging it) if a qname appears more than once.
    """
    orientations = {}
    for hit in BlasrReader(filename):
        name = hit.qname
        if name in orientations:
            msg = 'Duplicate record name! (%s)' % name
            log.error(msg)
            raise ValueError(msg)
        same_strand = hit.qstrand == hit.tstrand
        orientations[name] = 'forward' if same_strand else 'reverse'
    return orientations
def create_m1_reference(m1_file, reference=None):
    """
    Map each base query name in a Blasr M1 file to its reference.

    Args:
        m1_file: path of the Blasr M1 alignment file.
        reference: optional mapping indexed by base reference name; when it
            is truthy the stored value is reference[tname], otherwise the
            base reference name itself is stored.

    Returns:
        A dict of {base query name: reference value}.

    Raises:
        KeyError: if two records share a base query name, or if a base
            reference name is missing from `reference`.
    """
    log.info('Parsing Blasr M1 results from "{0}"'.format(m1_file))
    results = {}
    for record in BlasrReader(m1_file):
        qname = get_base_sequence_name(record.qname)
        tname = get_base_sequence_name(record.tname)
        if qname in results:
            msg = 'Duplicate sequence ids found! "{0}"'.format(qname)
            # Fatal condition: report it at error level, not info
            log.error(msg)
            raise KeyError(msg)
        if reference:
            results[qname] = reference[tname]
        else:
            results[qname] = tname
    log.info('Finished reading Blasr results')
    return results
def create_m5_reference(m5_file):
    """
    Map each base query name in a Blasr M5 file to its closest reference.

    "Closest" is the hit with the fewest differences, counted as
    nmis + nins + ndel; on a tie the hit read first is kept.

    Returns a dict of {base query name: base reference name}.
    """
    log.info('Parsing Blasr M5 results from "{0}"'.format(m5_file))
    results = {}
    fewest_diffs = {}
    for hit in BlasrReader(m5_file):
        query = get_base_sequence_name(hit.qname)
        target = get_base_sequence_name(hit.tname)
        diff_count = int(hit.nmis) + int(hit.nins) + int(hit.ndel)
        # Keep the hit if it is the first for this query or strictly better
        if query not in fewest_diffs or fewest_diffs[query] > diff_count:
            results[query] = target
            fewest_diffs[query] = diff_count
    log.info('Finished reading Blasr results')
    return results
def _parse_alignment(alignment):
    """
    Parse the location of each hit in the alignment file.

    Query names are cut down to their first three '/'-separated fields
    (names without a '/' are kept whole).  For entries whose tstrand is the
    string '1' the coordinates are mirrored against tlength; otherwise
    tstart and tend are used as they are.

    Returns a dict of {shortened qname: (start, end)}; a later entry with
    the same shortened name overwrites an earlier one.
    """
    log.info("Parsing subread locations from alignment data")
    locations = {}
    for hit in BlasrReader(alignment):
        # A name without '/' splits into a single field and survives intact
        name = '/'.join(hit.qname.split('/')[:3])
        start, end = int(hit.tstart), int(hit.tend)
        if hit.tstrand == '1':
            length = int(hit.tlength)
            start, end = length - end, length - start
        locations[name] = (start, end)
    return locations
def _align_fasta(query, reference, format):
    """
    Align a single query sequence to all valid references.

    Blasr is run with 'bestn' and 'nCandidates' both set to the number of
    sequences in `reference` (as reported by fasta_size), writing output
    format `format` to a temporary '.m<format>' file.  The temporary file is
    always removed again, including when Blasr or the parser raises.

    Returns the list of alignments read by BlasrReader.
    """
    suffix = '.m%s' % format
    temp_align = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
    # Only the reserved path is needed; release our own handle right away
    # so it is not leaked and cannot interfere with Blasr's writes
    temp_align.close()
    try:
        reference_count = fasta_size(reference)
        blasr_args = {
            'nproc': NPROC,
            'out': temp_align.name,
            'bestn': reference_count,
            'nCandidates': reference_count,
            'm': format,
            'noSplitSubreads': True,
        }
        run_blasr(query, reference, blasr_args)
        # Parse the output for return before the file is deleted
        return list(BlasrReader(temp_align.name))
    finally:
        # Guarded so a missing file cannot mask an in-flight exception
        if os.path.exists(temp_align.name):
            os.unlink(temp_align.name)
def filter_m5_file(m5_file, filtered_file):
    """
    Filter an M5 alignment file to contain only the alignments with the
    fewest diffs.

    For each query name the record with the lowest nmis + nins + ndel is
    kept (the first one read wins a tie), and the kept records are written
    to `filtered_file`, one record_to_string() line each.

    Args:
        m5_file: path of the Blasr M5 file to read.
        filtered_file: path of the file to write the kept records to.
    """
    log.info('Filtering Blasr M5 results from "{0}"'.format(m5_file))
    selected = {}
    diffs = {}
    count = 0
    for record in BlasrReader(m5_file):
        count += 1
        diff_count = int(record.nmis) + int(record.nins) + int(record.ndel)
        # Keep the record if it is the first for this query or strictly better
        if record.qname not in diffs or diffs[record.qname] > diff_count:
            selected[record.qname] = record
            diffs[record.qname] = diff_count
    # Argument order fixed: selected records first, total alignments second
    log.info('Selected %s records from %s alignments' % (len(selected), count))
    with open(filtered_file, 'w') as output:
        # values() rather than itervalues() so this runs on Python 2 and 3
        for record in selected.values():
            output.write('%s\n' % record_to_string(record))
    log.info('Finished filtering Blasr results')
def _parse_loci( blasr_file ):
    """
    Parse the likely locus of sequences from a Blasr file.

    The locus is derived from the part of the reference name before the
    first '*': its last character when that part starts with 'HLA-',
    otherwise its second '_'-separated field.  Entries whose tname is the
    literal 'tname' are skipped.

    Returns a dict of {entry.qname: locus}.

    Raises ValueError (after logging it) if a qname appears more than once.
    """
    locus_calls = {}
    for hit in BlasrReader( blasr_file ):
        if hit.tname == 'tname':
            continue
        # Parse the locus from either Tokai or IMGT references
        prefix = hit.tname.split('*')[0]
        locus = prefix[-1] if prefix.startswith('HLA-') else prefix.split('_')[1]
        # Save the Locus/Sequence pair unless duplicate
        if hit.qname in locus_calls:
            msg = 'Duplicate sequence name found "%s"!' % hit.qname
            log.error( msg )
            raise ValueError( msg )
        locus_calls[hit.qname] = locus
    return locus_calls
def count_hits(filename):
    """ Return the number of records BlasrReader yields for the given file """
    return sum(1 for _ in BlasrReader(filename))
def format_blasr_file(input_file, output_file):
    """
    Copy a Blasr file through BlasrReader and BlasrWriter.

    The writer first emits a header for the reader's filetype, then every
    record from the input is written in order.  The writer is opened before
    the reader, and both are closed by their context managers.
    """
    with BlasrWriter(output_file) as writer, BlasrReader(input_file) as reader:
        writer.write_header(reader.filetype)
        for record in reader:
            writer.write(record)