def align_by_identity(query, reference_fasta, output=None, format='1'):
    """
    Type sequences in a fasta file by finding the closest reference.

    Every record read from ``query`` is written to its own temporary fasta,
    aligned against ``reference_fasta``, and the first alignment remaining
    after sorting and filtering is written to ``output``. Records without
    any alignment are logged and skipped.

    Args:
        query: path of the sequence file whose records are typed.
        reference_fasta: path of the fasta holding the references.
        output: path of the alignment file to write. When None it is
            derived from ``query`` by replacing its extension with
            ``.m<format>``.
        format: alignment format code, either '1' or '5'.

    Returns:
        The path of the alignment file that was written.

    Raises:
        AssertionError: if ``format`` is neither '1' nor '5'.
    """
    assert format in ['1', '5']

    # If output isn't specified, base it on the query
    if output is None:
        # splitext only considers a dot in the final path component, so a
        # query without an extension (or one living in a dotted directory)
        # still produces a sensible basename.
        basename = os.path.splitext(query)[0]
        output = '%s.m%s' % (basename, format)

    ref_count = fasta_size(reference_fasta)

    # Iterate over each Fasta, aligning individually.
    with BlasrWriter(output) as handle:
        # NOTE(review): the header is always 'm1', even when format is '5'
        # -- confirm this is intended.
        handle.write_header('m1')
        for record in read_sequences(query):
            log.info('Aligning %s by identity to %s references' %
                     (record.name, ref_count))
            temp = write_temp_fasta(record)
            try:
                alignments = _align_fasta(temp.name, reference_fasta, format)
                if not alignments:
                    log.info("No hits found for %s" % record.name)
                    continue
                alignments = _sort_alignments(alignments)
                alignments = _filter_alignments(alignments)
                log.info('Found %s alignments sharing maximum identity with the query' %
                         len(alignments))
                handle.write(alignments[0])
            finally:
                # Remove the per-record temp fasta on every path, including
                # the "no hits" skip and any error raised while aligning.
                os.unlink(temp.name)

    check_output_file(output)
    return output
def align_by_identity(query, reference_fasta, output=None, format='1'):
    """
    Type sequences in a fasta file by finding the closest reference.

    Each record of ``query`` is dumped to a temporary fasta and aligned on
    its own against ``reference_fasta``; after the alignments are sorted
    and filtered, the first one is written to ``output``. A record with no
    alignments is logged and skipped.

    Args:
        query: path of the sequence file to type.
        reference_fasta: path of the reference fasta.
        output: destination alignment file. If None, the name is built from
            ``query`` with its extension swapped for ``.m<format>``.
        format: alignment format code; must be '1' or '5'.

    Returns:
        The path of the written alignment file.

    Raises:
        AssertionError: if ``format`` is not one of '1' or '5'.
    """
    assert format in ['1', '5']

    # If output isn't specified, base it on the query
    if output is None:
        # os.path.splitext strips only the extension of the last path
        # component, which keeps extension-less queries and dotted
        # directory names intact.
        basename = os.path.splitext(query)[0]
        output = '%s.m%s' % (basename, format)

    ref_count = fasta_size(reference_fasta)

    # Iterate over each Fasta, aligning individually.
    with BlasrWriter(output) as handle:
        # NOTE(review): 'm1' is written as the header regardless of
        # ``format`` -- confirm against BlasrWriter.
        handle.write_header('m1')
        for record in read_sequences(query):
            log.info('Aligning %s by identity to %s references' %
                     (record.name, ref_count))
            temp = write_temp_fasta(record)
            try:
                alignments = _align_fasta(temp.name, reference_fasta, format)
                if not alignments:
                    log.info("No hits found for %s" % record.name)
                    continue
                alignments = _sort_alignments(alignments)
                alignments = _filter_alignments(alignments)
                log.info('Found %s alignments sharing maximum identity with the query' %
                         len(alignments))
                handle.write(alignments[0])
            finally:
                # Always clean up the temp fasta; previously it was left
                # behind whenever a record produced no hits.
                os.unlink(temp.name)

    check_output_file(output)
    return output
def extract_alleles(input_file, output_file=None, reference_file=None,
                    alignment_file=None, method=METHOD, sort=SORT, loci=LOCI):
    """
    Select the top consensus sequences per group from a sequence file.

    The alignments for ``input_file`` are grouped according to ``method``
    ('locus', 'barcode', 'both' or 'all'), each group is ordered using the
    ``sort`` metric ('num_reads' or 'accuracy'), and the selected sequences
    are written to ``output_file``.

    Returns the path of the output file. Raises ValueError for an unknown
    ``method`` or ``sort``; the 'accuracy' metric asserts that the input is
    a fastq file.
    """
    # Falsy method/loci fall back to the module defaults.
    # NOTE(review): ``sort`` gets no such fallback -- a falsy value ends up
    # in the "Invalid Sorting Metric" branch below.
    if not method:
        method = METHOD
    if not loci:
        loci = LOCI

    # Derive the output path when the caller did not give one
    if not output_file:
        output_file = _get_output_file(input_file)
    output_type = get_file_type(output_file)

    # Load the alignments used for grouping
    alignment_file = get_alignment_file(input_file, reference_file,
                                        alignment_file)
    hits = list(BlasrReader(alignment_file))

    # Reject unknown grouping policies up front, then dispatch
    if method not in ('locus', 'barcode', 'both', 'all'):
        msg = "Invalid Selection Metric: %s" % method
        log.error(msg)
        raise ValueError(msg)
    if method == 'locus':
        groups = _group_by_locus(hits, loci)
    elif method == 'barcode':
        groups = _group_by_barcode(hits)
    elif method == 'both':
        groups = _group_by_both(hits, loci)
    else:
        groups = dict((hit.qname, [hit]) for hit in hits)

    # Pick the per-record scoring function, then score every input record
    sequences = read_sequences(input_file)
    if sort == 'num_reads':
        scorer = consensus_size
    elif sort == 'accuracy':
        assert get_file_type(input_file) == 'fastq'
        scorer = record_accuracy
    else:
        msg = "Invalid Sorting Metric: %s" % sort
        log.error(msg)
        raise ValueError(msg)
    sorting_data = dict((seq.name, scorer(seq)) for seq in sequences)

    log.info('Sorting sequences for selection according to "%s"' % sort)
    ordered = _sort_groups(groups, sorting_data)

    log.info('Selecting top sequences from %s according to the "%s" policy' %
             (input_file, method))
    selected = list(_select_sequences(ordered))
    log.info('Selected %s sequences from %s total for further analysis' %
             (len(selected), len(sequences)))

    log.info('Writing the selected sequences out to %s' % output_file)
    subset = list(_subset_sequences(sequences, selected))
    _write_output(subset, output_file, output_type)
    return output_file
def rename_sequences(sequence_file):
    """
    Rename the records of a sequence file if any name ends in '|quiver'.

    When at least one record name (ignoring surrounding whitespace) ends
    with '|quiver', every record is passed through ``rename_record`` and
    the result is written to a new file whose path is returned. Otherwise
    nothing is written and ``sequence_file`` itself is returned.
    """
    records = read_sequences(sequence_file)

    # Evaluate the suffix check for every record before deciding
    has_suffix = [rec.name.strip().endswith('|quiver') for rec in records]
    if not any(has_suffix):
        return sequence_file

    renamed = [rename_record(rec) for rec in records]
    output_file = get_output_file(sequence_file)
    write_sequences(renamed, output_file)
    return output_file
def extract_alleles(input_file, output_file=None, reference_file=None,
                    alignment_file=None, method=METHOD, sort=SORT, loci=LOCI):
    """
    Pick the top consensus sequences per group from a sequence file.

    Alignments of ``input_file`` are grouped by ``method`` ('locus',
    'barcode', 'both' or 'all'); the groups are ordered by the ``sort``
    metric ('num_reads' or 'accuracy') and the chosen sequences are written
    to ``output_file``, whose path is returned.

    A ValueError is raised for an unrecognised ``method`` or ``sort``, and
    sorting by 'accuracy' asserts that ``input_file`` is a fastq.
    """
    method = METHOD if not method else method
    loci = LOCI if not loci else loci

    # Set the output file if not specified
    output_file = output_file if output_file else _get_output_file(input_file)
    output_type = get_file_type(output_file)

    # Read every alignment record for the input
    alignment_file = get_alignment_file(input_file, reference_file,
                                        alignment_file)
    records = list(BlasrReader(alignment_file))

    # Run the appropriate grouping
    if method == 'all':
        # One single-alignment group per query name
        groups = {}
        for rec in records:
            groups[rec.qname] = [rec]
    elif method == 'both':
        groups = _group_by_both(records, loci)
    elif method == 'barcode':
        groups = _group_by_barcode(records)
    elif method == 'locus':
        groups = _group_by_locus(records, loci)
    else:
        msg = "Invalid Selection Metric: %s" % method
        log.error(msg)
        raise ValueError(msg)

    # Read the input sequences and build the name -> score mapping
    sequences = read_sequences(input_file)
    if sort not in ('num_reads', 'accuracy'):
        msg = "Invalid Sorting Metric: %s" % sort
        log.error(msg)
        raise ValueError(msg)
    sorting_data = {}
    if sort == 'num_reads':
        for seq in sequences:
            sorting_data[seq.name] = consensus_size(seq)
    else:
        assert get_file_type(input_file) == 'fastq'
        for seq in sequences:
            sorting_data[seq.name] = record_accuracy(seq)

    log.info('Sorting sequences for selection according to "%s"' % sort)
    ordered = _sort_groups(groups, sorting_data)

    log.info('Selecting top sequences from %s according to the "%s" policy' %
             (input_file, method))
    selected = list(_select_sequences(ordered))
    total = len(sequences)
    log.info('Selected %s sequences from %s total for further analysis' %
             (len(selected), total))

    log.info('Writing the selected sequences out to %s' % output_file)
    subset = list(_subset_sequences(sequences, selected))
    _write_output(subset, output_file, output_type)
    return output_file
def trim_alleles(input_file, output_file=None, trim=0):
    """
    Trim the sequences of a file and write the result out.

    ``trim`` is handed to ``_trim_sequences``; per the log message it is
    the number of bp removed from each end. If ``trim`` is 0 and no
    ``output_file`` is given there is nothing to do and ``input_file`` is
    returned untouched. Otherwise the path of the written file is returned.
    """
    # Nothing to trim and nowhere specific to write: skip this step
    nothing_to_do = (trim == 0 and output_file is None)
    if nothing_to_do:
        log.info('No trimming necessary for "%s", skipping...' % input_file)
        return input_file

    # Derive the output path when none was supplied
    if not output_file:
        output_file = _get_output_file(input_file)
    output_type = get_file_type(output_file)

    # Load, trim and write the records
    records = read_sequences(input_file)
    log.info("Trimming sequences by %s bp from each end" % trim)
    trimmed = _trim_sequences(records, trim)

    log.info("Writing the trimmed sequences out to %s" % output_file)
    _write_output(trimmed, output_file, output_type)
    return output_file
def trim_alleles(input_file, output_file=None, trim=0):
    """
    Write a trimmed copy of the sequences in ``input_file``.

    Returns ``input_file`` unchanged when ``trim`` is 0 and ``output_file``
    is None; otherwise the sequences are run through ``_trim_sequences``
    with ``trim`` (logged as bp removed from each end), written to the
    output file, and that file's path is returned.
    """
    if trim == 0:
        if output_file is None:
            # No trim and no explicit destination, so this step is a no-op
            log.info('No trimming necessary for "%s", skipping...' % input_file)
            return input_file

    # Fall back to a derived output path
    destination = output_file or _get_output_file(input_file)
    destination_type = get_file_type(destination)

    # Read the input sequences and trim each record
    originals = read_sequences(input_file)
    log.info('Trimming sequences by %s bp from each end' % trim)
    shortened = _trim_sequences(originals, trim)

    log.info('Writing the trimmed sequences out to %s' % destination)
    _write_output(shortened, destination, destination_type)
    return destination