def orient_sequences(input_file, reference_file=None, alignment_file=None, output_file=None):
    """
    Reorient a sequence file so all records point the same way as their reference.

    Args:
        input_file: path of the sequence file to reorient.
        reference_file: forwarded to get_alignment_file.
        alignment_file: forwarded to get_alignment_file; per the original
            comment, the input is aligned only "if needed".
        output_file: destination path; when falsy it is derived from
            input_file via _get_output_file.

    Returns:
        The output file path. If valid_file() already accepts that path the
        function returns early without aligning or writing anything.
    """
    log.info("Reorienting all sequences in %s to the direction of their reference" % input_file)

    # Resolve the destination (and its type) before doing any real work
    if not output_file:
        output_file = _get_output_file(input_file)
    output_type = _get_output_type(output_file)

    # A usable output from an earlier run short-circuits the whole step
    if valid_file(output_file):
        log.info("Found existing output file %s, skipping orientation step" % output_file)
        return output_file

    # Work out which sequences the alignment reports as reversed
    aligned = get_alignment_file(input_file, reference_file, alignment_file)
    to_flip = _identify_reversed_sequences(aligned)
    log.info("Identified %s sequences needing Reverse Complementation" % len(to_flip))

    # Re-read the input and hand it to _reverse_records with the flagged set
    oriented = _reverse_records(_parse_input_records(input_file), to_flip)

    log.info("Writing out sequences to %s" % output_file)
    _write_output(oriented, output_file, output_type)
    return output_file
def extract_alleles(input_file, output_file=None, reference_file=None,
                    alignment_file=None, method=METHOD, sort=SORT, loci=LOCI):
    """Pick the top Amplicon Analysis consensus seqs per group from a sequence file.

    The original summary says the top 2 per group; the actual count is decided
    by _select_sequences, which is not part of this function.

    Args:
        input_file: path of the consensus sequences to choose from.
        output_file: destination path; when falsy it is derived from
            input_file via _get_output_file.
        reference_file: forwarded to get_alignment_file.
        alignment_file: forwarded to get_alignment_file.
        method: grouping policy, one of 'locus', 'barcode', 'both' or 'all'
            ('all' puts every alignment in its own group keyed by qname).
            Falsy values fall back to METHOD.
        sort: ranking metric, 'num_reads' (via consensus_size) or 'accuracy'
            (via record_accuracy; requires Fastq input). Falsy values fall
            back to SORT, mirroring how method and loci are handled.
        loci: forwarded to the locus-aware grouping helpers. Falsy values
            fall back to LOCI.

    Returns:
        The path of the file the selected sequences were written to.

    Raises:
        ValueError: if method or sort is not one of the supported values, or
            if sort is 'accuracy' and the input file is not of type 'fastq'.
    """
    method = method or METHOD
    sort = sort or SORT
    loci = loci or LOCI

    # Set the output file if not specified
    output_file = output_file or _get_output_file(input_file)
    output_type = get_file_type(output_file)

    # The alignments drive the grouping below
    alignment_file = get_alignment_file(input_file, reference_file, alignment_file)
    alignments = list(BlasrReader(alignment_file))

    # Run the appropriate grouping
    if method == 'locus':
        groups = _group_by_locus(alignments, loci)
    elif method == 'barcode':
        groups = _group_by_barcode(alignments)
    elif method == 'both':
        groups = _group_by_both(alignments, loci)
    elif method == 'all':
        groups = {a.qname: [a] for a in alignments}
    else:
        msg = "Invalid Selection Metric: %s" % method
        log.error(msg)
        raise ValueError(msg)

    # Read the input sequences and use them to generate our sorting data
    sequences = read_sequences(input_file)
    if sort == 'num_reads':
        sorting_data = {s.name: consensus_size(s) for s in sequences}
    elif sort == 'accuracy':
        # Raise explicitly rather than assert: asserts vanish under
        # "python -O", and the other bad-argument paths raise ValueError.
        if get_file_type(input_file) != 'fastq':
            msg = 'Sorting by "accuracy" requires a fastq input file: %s' % input_file
            log.error(msg)
            raise ValueError(msg)
        sorting_data = {s.name: record_accuracy(s) for s in sequences}
    else:
        msg = "Invalid Sorting Metric: %s" % sort
        log.error(msg)
        raise ValueError(msg)

    log.info('Sorting sequences for selection according to "%s"' % sort)
    ordered = _sort_groups(groups, sorting_data)

    log.info('Selecting top sequences from %s according to the "%s" policy' % (input_file, method))
    selected = list(_select_sequences(ordered))
    log.info('Selected %s sequences from %s total for further analysis' % (len(selected), len(sequences)))

    log.info('Writing the selected sequences out to %s' % output_file)
    subset = list(_subset_sequences(sequences, selected))
    _write_output(subset, output_file, output_type)
    return output_file
def extract_alleles(input_file, output_file=None, reference_file=None,
                    alignment_file=None, method=METHOD, sort=SORT, loci=LOCI):
    """Pick the top Amplicon Analysis consensus seqs per group from a sequence file.

    The original summary says the top 2 per group; the actual count is decided
    by _select_sequences, which is not part of this function.

    Args:
        input_file: path of the consensus sequences to choose from.
        output_file: destination path; when falsy it is derived from
            input_file via _get_output_file.
        reference_file: forwarded to get_alignment_file.
        alignment_file: forwarded to get_alignment_file.
        method: grouping policy, one of 'locus', 'barcode', 'both' or 'all'
            ('all' puts every alignment in its own group keyed by qname).
            Falsy values fall back to METHOD.
        sort: ranking metric, 'num_reads' (via consensus_size) or 'accuracy'
            (via record_accuracy; requires Fastq input). Falsy values fall
            back to SORT, mirroring how method and loci are handled.
        loci: forwarded to the locus-aware grouping helpers. Falsy values
            fall back to LOCI.

    Returns:
        The path of the file the selected sequences were written to.

    Raises:
        ValueError: if method or sort is not one of the supported values, or
            if sort is 'accuracy' and the input file is not of type 'fastq'.
    """
    method = method or METHOD
    sort = sort or SORT
    loci = loci or LOCI

    # Set the output file if not specified
    output_file = output_file or _get_output_file(input_file)
    output_type = get_file_type(output_file)

    # The alignments drive the grouping below
    alignment_file = get_alignment_file(input_file, reference_file, alignment_file)
    alignments = list(BlasrReader(alignment_file))

    # Run the appropriate grouping
    if method == 'locus':
        groups = _group_by_locus(alignments, loci)
    elif method == 'barcode':
        groups = _group_by_barcode(alignments)
    elif method == 'both':
        groups = _group_by_both(alignments, loci)
    elif method == 'all':
        groups = {a.qname: [a] for a in alignments}
    else:
        msg = "Invalid Selection Metric: %s" % method
        log.error(msg)
        raise ValueError(msg)

    # Read the input sequences and use them to generate our sorting data
    sequences = read_sequences(input_file)
    if sort == 'num_reads':
        sorting_data = {s.name: consensus_size(s) for s in sequences}
    elif sort == 'accuracy':
        # Raise explicitly rather than assert: asserts vanish under
        # "python -O", and the other bad-argument paths raise ValueError.
        if get_file_type(input_file) != 'fastq':
            msg = 'Sorting by "accuracy" requires a fastq input file: %s' % input_file
            log.error(msg)
            raise ValueError(msg)
        sorting_data = {s.name: record_accuracy(s) for s in sequences}
    else:
        msg = "Invalid Sorting Metric: %s" % sort
        log.error(msg)
        raise ValueError(msg)

    log.info('Sorting sequences for selection according to "%s"' % sort)
    ordered = _sort_groups(groups, sorting_data)

    log.info('Selecting top sequences from %s according to the "%s" policy' % (input_file, method))
    selected = list(_select_sequences(ordered))
    log.info('Selected %s sequences from %s total for further analysis' % (len(selected), len(sequences)))

    log.info('Writing the selected sequences out to %s' % output_file)
    subset = list(_subset_sequences(sequences, selected))
    _write_output(subset, output_file, output_type)
    return output_file
def cdna_from_file(input_file, hmm_fofn, output=None, reference=None, alignment=None):
    """
    Extract the cDNA sequences from a mixed Fasta or Fastq file.

    Args:
        input_file: path of the sequence file to process.
        hmm_fofn: handed to parse_locus_dict to obtain the HMM lookup.
        output: destination path; when falsy it is derived with
            get_output_file(input_file, 'cDNA').
        reference: forwarded to get_alignment_file.
        alignment: forwarded to get_alignment_file; per the original comment,
            the input is aligned only "if needed".

    Returns:
        The path the cDNA records were written to.
    """
    # Resolve the alignment first, then settle on where the result goes
    aligned = get_alignment_file(input_file, reference, alignment)
    destination = output if output else get_output_file(input_file, 'cDNA')

    # Collect the three inputs cdna_from_records consumes
    records = _parse_input_records(input_file)
    hmms = parse_locus_dict(hmm_fofn)
    loci = _parse_loci(aligned)

    # Materialise the records fully before writing them out
    cdna_records = list(cdna_from_records(records, loci, hmms))
    write_records(cdna_records, destination)
    return destination
def orient_sequences(input_file, reference_file=None, alignment_file=None, output_file=None):
    """
    Reorient a sequence file so all records point the same way as their reference.

    Args:
        input_file: path of the sequence file to reorient.
        reference_file: forwarded to get_alignment_file.
        alignment_file: forwarded to get_alignment_file; per the original
            comment, the input is aligned only "if needed".
        output_file: destination path; when falsy it is derived from
            input_file via _get_output_file.

    Returns:
        The output file path. If valid_file() already accepts that path the
        function returns early without aligning or writing anything.
    """
    start_msg = "Reorienting all sequences in %s to the direction of their reference" % input_file
    log.info(start_msg)

    # Decide on the output path and type up front
    output_file = output_file if output_file else _get_output_file(input_file)
    output_type = _get_output_type(output_file)

    # Nothing to do when a previous run already produced a valid file
    if valid_file(output_file):
        log.info("Found existing output file %s, skipping orientation step" % output_file)
        return output_file

    # Use the alignment to find the sequences flagged as reversed
    alignment_path = get_alignment_file(input_file, reference_file, alignment_file)
    flagged = _identify_reversed_sequences(alignment_path)
    log.info("Identified %s sequences needing Reverse Complementation" % len(flagged))

    # Parse the input and pass it through _reverse_records
    parsed = _parse_input_records(input_file)
    corrected = _reverse_records(parsed, flagged)

    log.info("Writing out sequences to %s" % output_file)
    _write_output(corrected, output_file, output_type)
    return output_file
def cdna_from_file(input_file, hmm_fofn, output=None, reference=None, alignment=None):
    """
    Extract the cDNA sequences from a mixed Fasta or Fastq file.

    Args:
        input_file: path of the sequence file to process.
        hmm_fofn: handed to parse_locus_dict to obtain the HMM lookup.
        output: destination path; when falsy it is derived with
            get_output_file(input_file, 'cDNA').
        reference: forwarded to get_alignment_file.
        alignment: forwarded to get_alignment_file; per the original comment,
            the input is aligned only "if needed".

    Returns:
        The path the cDNA records were written to.
    """
    # Alignment comes first; the output path is only resolved afterwards
    alignment_path = get_alignment_file(input_file, reference, alignment)
    if output:
        target = output
    else:
        target = get_output_file(input_file, 'cDNA')

    # Inputs for cdna_from_records: parsed records, HMM lookup, per-alignment loci
    parsed = _parse_input_records(input_file)
    hmm_lookup = parse_locus_dict(hmm_fofn)
    locus_lookup = _parse_loci(alignment_path)

    # Build the full list of cDNA records, then write it in one go
    extracted = list(cdna_from_records(parsed, locus_lookup, hmm_lookup))
    write_records(extracted, target)
    return target