def write_sequences(filetype, merged, output):
    """
    Write the merged sequence records to the output file in the given format.

    filetype -- either 'fasta' or 'fastq'; selects the writer to use
    merged   -- the records, passed unchanged to the selected writer
    output   -- the destination, passed unchanged to the selected writer

    Raises ValueError, naming the offending value, for any other filetype.
    """
    if filetype == 'fasta':
        write_fasta(merged, output)
    elif filetype == 'fastq':
        write_fastq(merged, output)
    else:
        # Include the rejected value so the failure can be diagnosed
        raise ValueError("Unsupported filetype %r, expected 'fasta' or 'fastq'"
                         % (filetype,))
def create_chimeras(input_file, output=None, reference_file=None, alignment_file=None):
    """
    Build chimeric sequences from a Fasta file and write them out as Fasta.

    The alignments are read with BlasrReader, grouped with _group_by_locus
    and filtered with _filter_groups; the groups and the input sequences are
    then handed to _create_chimeras and the result is written by write_fasta.

    input_file     -- Fasta file of the sequences to build chimeras from
    output         -- destination Fasta; defaults to the input path with its
                      extension replaced by '.chimeras.fasta'
    reference_file -- reference used to create an alignment (through
                      align_best_reference) when no alignment_file is given
    alignment_file -- existing alignment file; takes precedence over
                      reference_file

    Returns the path of the output file.
    Raises IOError if neither an alignment nor a reference is supplied.
    """
    import os

    # Check the input files, and align the input file if needed
    if alignment_file is None:
        # "not reference_file" also rejects an empty string, which previously
        # slipped past both checks and reached BlasrReader(None)
        if not reference_file:
            msg = "create_chimeras requires either an Alignment or a Reference!"
            log.error(msg)
            raise IOError(msg)
        alignment_file = align_best_reference(input_file, reference_file)

    # Set the output file if not specified; splitext only strips a real
    # extension, so extension-less names and dotted directories are safe
    if output is None:
        basename = os.path.splitext(input_file)[0]
        output = '%s.chimeras.fasta' % basename

    # Parse the alignment data and extract the target sequences
    alignments = list(BlasrReader(alignment_file))
    groups = _group_by_locus(alignments)
    groups = _filter_groups(groups)
    sequences = list(FastaReader(input_file))
    chimeras = list(_create_chimeras(groups, sequences))
    write_fasta(chimeras, output)
    return output
def write_sequences( filetype, merged, output ):
    """
    Write the merged sequence records to the output file in the given format.

    filetype -- either 'fasta' or 'fastq'; selects the writer to use
    merged   -- the records, passed unchanged to the selected writer
    output   -- the destination, passed unchanged to the selected writer

    Raises ValueError, naming the offending value, for any other filetype.
    """
    if filetype == 'fasta':
        write_fasta( merged, output )
    elif filetype == 'fastq':
        write_fastq( merged, output )
    else:
        # Include the rejected value so the failure can be diagnosed
        raise ValueError( "Unsupported filetype %r, expected 'fasta' or 'fastq'"
                          % (filetype,) )
def _write_output( records, output_file, output_type ):
    """
    Write the records to output_file, then check the written file.

    An output_type of 'fasta' goes through write_fasta; any other value is
    written record-by-record with a FastqWriter.
    """
    if output_type != 'fasta':
        with FastqWriter( output_file ) as fastq_out:
            for entry in records:
                fastq_out.writeRecord( entry )
    else:
        write_fasta( records, output_file )
    # Runs for both formats
    check_output_file( output_file )
def _write_output(records, output_file, output_type):
    """
    Write the records to output_file, then check the written file.

    An output_type of 'fasta' goes through write_fasta; any other value is
    written record-by-record with a FastqWriter.
    """
    if output_type != 'fasta':
        with FastqWriter(output_file) as fastq_out:
            for entry in records:
                fastq_out.writeRecord(entry)
    else:
        write_fasta(records, output_file)
    # Runs for both formats
    check_output_file(output_file)
def pair_exon_files( fofn_file, overlap_exon ):
    """
    Pair the 5' and 3' amplicons of a gene based on 1 overlapping exon

    NOTE(review): as written, the body reads the records of a single Fasta
    file, sorts them, combines them into one record and writes that record
    out. The `overlap_exon` argument is never used, and no pairing step is
    visible here -- confirm whether this function is unfinished.
    """
    # Entries produced by _parse_fofn; not used by anything below
    exon_files = list( _parse_fofn( fofn_file ))
    # NOTE(review): `exon_fasta` is never assigned inside this function.
    # Unless it is a module-level name, this line raises NameError --
    # presumably it was meant to come from `exon_files`; verify intent.
    output_file = _get_output_file( exon_fasta )
    fasta_records = list( FastaReader( exon_fasta ))
    sorted_records = _sort_fasta_records( fasta_records )
    # The sorted records are merged into a single record
    cDNA_record = _combine_records( sorted_records )
    write_fasta( [cDNA_record], output_file )
def from_assembly( contig_file, reference_fofn, output_file='output.fasta' ):
    """
    Extract genes from assembled contigs and write them to a Fasta file.

    For every (locus, reference) pair returned by read_reference_fofn, the
    reference is aligned to the contigs, the Blasr hits are read and picked,
    and the genes extracted for those hits are collected. All collected
    genes are written in a single write_fasta call.

    contig_file    -- Fasta file of contigs
    reference_fofn -- FOFN of references, read with read_reference_fofn
    output_file    -- destination Fasta; defaults to 'output.fasta', the
                      path that was previously hard-coded
    """
    contigs = read_fasta_dict( contig_file )
    references = read_reference_fofn( reference_fofn )
    all_genes = []
    # items() works on both Python 2 and 3, unlike iteritems()
    for locus, reference in references.items():
        alignment = align_reference_to_contigs( locus, reference, contig_file )
        hits = read_blasr_hits( alignment )
        hits = pick_blasr_hits( hits )
        all_genes.extend( extract_genes( contigs, hits ) )
    write_fasta( all_genes, output_file )
def parse_reference(self):
    """
    Parse the configured HLA reference FOFN and save its sequences as Fasta.

    Returns a tuple of (path to the written Fasta, metadata, loci), where
    metadata and loci are passed through from parse_reference_fofn.
    """
    log.info("Parsing the supplied FOFN of HLA reference data")
    reference_fasta = self.get_filepath( "references", "HLA_references.fasta" )
    parsed = parse_reference_fofn( self.hla_reference )
    sequences, metadata, loci = parsed

    log.info("Writing collected HLA reference sequences to file")
    write_fasta( sequences, reference_fasta )
    check_output_file( reference_fasta )

    log.info("Finished parsing the HLA reference data\n")
    return (reference_fasta, metadata, loci)
def trim_fasta( fasta_file, blasr_file, output_file, locus_dict, window=WINDOW, loci=LOCI ):
    """
    Trim the sequences of a Fasta file and write the result to output_file.

    Trims are parsed from blasr_file with the given window, filtered with
    locus_dict and loci, and applied to the records read from fasta_file.
    Returns None.
    """
    log.info('Trimming sequences in "%s"' % fasta_file)
    log.debug("\tWindow Size:\t%s" % window)

    fasta_entries = list( FastaReader( fasta_file ) )
    selected_trims = filter_trims_on_loci( parse_trims( blasr_file, window ),
                                           locus_dict, loci )
    write_fasta( apply_trims( fasta_entries, selected_trims ), output_file )

    log.info('Finished trimming the supplied sequencs\n')
def parse_reference(self):
    """
    Parse the configured HLA reference FOFN and save its sequences as Fasta.

    Returns a tuple of (path to the written Fasta, metadata, loci), where
    metadata and loci are passed through from parse_reference_fofn.
    """
    log.info("Parsing the supplied FOFN of HLA reference data")
    reference_fasta = self.get_filepath("references", "HLA_references.fasta")
    parsed = parse_reference_fofn(self.hla_reference)
    sequences, metadata, loci = parsed

    log.info("Writing collected HLA reference sequences to file")
    write_fasta(sequences, reference_fasta)
    check_output_file(reference_fasta)

    log.info("Finished parsing the HLA reference data\n")
    return (reference_fasta, metadata, loci)
def _write_temp_fasta( record ):
    """
    Write a sequence record out to a temporary Fasta file

    A FastaRecord is written as-is; a FastqRecord is first converted to a
    FastaRecord built from its name and sequence.

    Returns the path of the temporary file. The file is created with
    delete=False, so removing it is left to the caller.
    Raises TypeError for any other record type.
    """
    # Validate before touching the filesystem so that a rejected record
    # does not leave an empty temporary file behind
    if isinstance( record, FastaRecord ):
        fasta_record = record
    elif isinstance( record, FastqRecord ):
        fasta_record = FastaRecord( record.name, record.sequence )
    else:
        msg = 'Record must be either FastaRecord or FastqRecord'
        log.error( msg )
        raise TypeError( msg )

    temp = tempfile.NamedTemporaryFile( suffix='.fasta', delete=False )
    # Only the reserved name is needed; close the handle right away so the
    # descriptor is not leaked and the path can be reopened by write_fasta
    temp.close()
    write_fasta( [fasta_record], temp.name )
    return temp.name
def _write_temp_fasta(record):
    """
    Write a sequence record out to a temporary Fasta file

    A FastaRecord is written as-is; a FastqRecord is first converted to a
    FastaRecord built from its name and sequence.

    Returns the path of the temporary file. The file is created with
    delete=False, so removing it is left to the caller.
    Raises TypeError for any other record type.
    """
    # Validate before touching the filesystem so that a rejected record
    # does not leave an empty temporary file behind
    if isinstance(record, FastaRecord):
        fasta_record = record
    elif isinstance(record, FastqRecord):
        fasta_record = FastaRecord(record.name, record.sequence)
    else:
        msg = 'Record must be either FastaRecord or FastqRecord'
        log.error(msg)
        raise TypeError(msg)

    temp = tempfile.NamedTemporaryFile(suffix='.fasta', delete=False)
    # Only the reserved name is needed; close the handle right away so the
    # descriptor is not leaked and the path can be reopened by write_fasta
    temp.close()
    write_fasta([fasta_record], temp.name)
    return temp.name
def create_chimeras( input_file, output=None, reference_file=None, alignment_file=None ):
    """
    Build chimeric sequences from a Fasta file and write them out as Fasta.

    The alignments are read with BlasrReader, grouped with _group_by_locus
    and filtered with _filter_groups; the groups and the input sequences are
    then handed to _create_chimeras and the result is written by write_fasta.

    input_file     -- Fasta file of the sequences to build chimeras from
    output         -- destination Fasta; defaults to the input path with its
                      extension replaced by '.chimeras.fasta'
    reference_file -- reference used to create an alignment (through
                      align_best_reference) when no alignment_file is given
    alignment_file -- existing alignment file; takes precedence over
                      reference_file

    Returns the path of the output file.
    Raises IOError if neither an alignment nor a reference is supplied.
    """
    import os

    # Check the input files, and align the input file if needed
    if alignment_file is None:
        # "not reference_file" also rejects an empty string, which previously
        # slipped past both checks and reached BlasrReader(None)
        if not reference_file:
            msg = "create_chimeras requires either an Alignment or a Reference!"
            log.error( msg )
            raise IOError( msg )
        alignment_file = align_best_reference( input_file, reference_file )

    # Set the output file if not specified; splitext only strips a real
    # extension, so extension-less names and dotted directories are safe
    if output is None:
        basename = os.path.splitext( input_file )[0]
        output = '%s.chimeras.fasta' % basename

    # Parse the alignment data and extract the target sequences
    alignments = list( BlasrReader( alignment_file ))
    groups = _group_by_locus( alignments )
    groups = _filter_groups( groups )
    sequences = list( FastaReader( input_file ))
    chimeras = list( _create_chimeras( groups, sequences ))
    write_fasta( chimeras, output )
    return output
def extract_best_reads(input_file, output_file=None, min_length=MIN_LENGTH, min_score=MIN_SCORE):
    """
    Extract, filter and subset subreads from Bas/Bax/Fofn Files

    input_file  -- input path, expanded into individual files by
                   _iterate_input_files
    output_file -- destination Fasta; defaults to the input path with its
                   extension replaced by '.best.fasta'
    min_length  -- minimum length, passed to the extraction helper
    min_score   -- minimum score, passed to the extraction helper

    Returns the path of the output file.
    """
    if output_file is None:
        # splitext only strips a real extension, so extension-less names and
        # dotted directory names no longer produce a mangled output path
        basename = os.path.splitext(input_file)[0]
        output_file = '%s.best.fasta' % basename

    log.info('Extracting subreads from %s' % os.path.basename(input_file))
    log.debug('\tMinimum Length:\t%s' % min_length)
    log.debug('\tMinimum Score:\t%s' % min_score)

    reads = []
    # Pre-set the counter so the summary below still works when no input
    # files are iterated (the loop variable would otherwise be unbound)
    file_count = 0
    for file_count, filename in enumerate(_iterate_input_files(input_file), 1):
        reads.extend(_extract_from_bash5(filename, min_length, min_score))
    log.info("Extracted %s subreads from %s files" % (len(reads), file_count))

    write_fasta(reads, output_file)
    check_output_file(output_file)
    log.info("Finished extracting subreads")
    return output_file
def subset_sequences( fasta_file, summary_file, output_file ):
    """
    Write a subset of the records in fasta_file to output_file.

    The sequence ids are identified from summary_file and used to select
    the records that get written.
    """
    wanted_ids = identify_sequences( summary_file )
    write_fasta( subset_sequence_records( fasta_file, wanted_ids ), output_file )