def create_renaming_key(self, raw_subreads, renamed_subreads):
    """
    Create a key for translating HBAR subread names to canonical PacBio names
    """
    log.info("Looking for Raw<--->HBAR subread renaming key")
    renaming_key = self.get_filepath('subreads', 'renaming_key.txt')
    if valid_file(renaming_key):
        log.info('Using existing subread renaming key\n')
        return renaming_key
    log.info("No subread renaming key found, creating one...")
    # Compare the two files to make sure they're equivalent
    raw_count = fasta_size(raw_subreads)
    new_count = fasta_size(renamed_subreads)
    if raw_count != new_count:
        msg = 'The number of raw subreads (%s) does not ' % raw_count + \
              'match the number of renamed reads (%s)' % new_count
        log.error(msg)
        raise ValueError(msg)
    # Write out the pairs of names to file
    with open(renaming_key, 'w') as handle:
        for raw, renamed in zip(FastaReader(raw_subreads),
                                FastaReader(renamed_subreads)):
            raw_name = raw.name.split()[0]
            new_name = renamed.name.split()[0]
            handle.write('%s\t%s\n' % (new_name, raw_name))
    check_output_file(renaming_key)
    log.info("Finished creating subread renaming key\n")
    return renaming_key
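
# A minimal sketch of how the renaming key written above might be consumed;
# it assumes only the two-column, tab-separated format that
# create_renaming_key produces. `load_renaming_key` is a hypothetical helper,
# not part of the original module.
def load_renaming_key(renaming_key):
    """Read a renaming key file into a {HBAR name: PacBio name} dictionary"""
    key = {}
    with open(renaming_key) as handle:
        for line in handle:
            new_name, raw_name = line.strip().split('\t')
            key[new_name] = raw_name
    return key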
def align_by_identity(query, reference_fasta, output=None, format='1'):
    """
    Type sequences in a fasta file by finding the closest reference
    """
    assert format in ['1', '5']
    # If output isn't specified, base it on the query
    if output is None:
        basename = '.'.join(query.split('.')[:-1])
        output = '%s.m%s' % (basename, format)
    ref_count = fasta_size(reference_fasta)
    # Iterate over each Fasta record, aligning each individually
    with BlasrWriter(output) as handle:
        handle.write_header('m%s' % format)
        for record in read_sequences(query):
            log.info('Aligning %s by identity to %s references' % (record.name, ref_count))
            temp = write_temp_fasta(record)
            alignments = _align_fasta(temp.name, reference_fasta, format)
            os.unlink(temp.name)
            if not alignments:
                log.info("No hits found for %s" % record.name)
                continue
            alignments = _sort_alignments(alignments)
            alignments = _filter_alignments(alignments)
            log.info('Found %s alignments sharing maximum identity with the query' % len(alignments))
            handle.write(alignments[0])
    check_output_file(output)
    return output
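
# _sort_alignments and _filter_alignments are used above but not defined in
# this section. A minimal sketch of plausible implementations, assuming each
# BLASR m1 record exposes a `pctsimilarity` attribute -- an assumption about
# BlasrReader's record type, not confirmed by this module:
def _sort_alignments_sketch(alignments):
    # Order hits from highest to lowest percent identity
    return sorted(alignments, key=lambda a: float(a.pctsimilarity), reverse=True)

def _filter_alignments_sketch(alignments):
    # Keep only the hits tied for the maximum percent identity
    max_identity = float(alignments[0].pctsimilarity)
    return [a for a in alignments if float(a.pctsimilarity) == max_identity]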
def align_amplicons(filetype, sequence_5p, sequence_3p):
    """
    Align the 5'-end amplicon sequences against the 3'-end sequences
    """
    blasr_args = {'bestn': 1,
                  'out': 'test.m5',
                  'm': 5,
                  'noSplitSubreads': True}
    if filetype == 'fastq':
        temp_5p = write_temp_fasta(sequence_5p)
        temp_3p = write_temp_fasta(sequence_3p)
        align_left = run_blasr(temp_5p.name, temp_3p.name, blasr_args, verbose=True)
    elif filetype == 'fasta':
        assert fasta_size(sequence_5p) == 2
        assert fasta_size(sequence_3p) == 2
        align_left = run_blasr(sequence_5p, sequence_3p, blasr_args)
    else:
        raise ValueError("filetype must be 'fasta' or 'fastq', not '%s'" % filetype)
    return align_left
def _align_subreads(subread_fasta, reference_fasta, locus):
    """
    Align all locus-specific subreads against the appropriate references
    """
    location = os.path.dirname(subread_fasta)
    alignment_file = os.path.join(location, 'temp.m1')
    subread_count = fasta_size(subread_fasta)
    reference_count = fasta_size(reference_fasta)
    blasr_args = {'nproc': 8,
                  'out': alignment_file,
                  'bestn': 1,
                  'nCandidates': reference_count,
                  'noSplitSubreads': True}
    log.info("Aligning %s reads against %s references for %s" % (subread_count, reference_count, locus))
    run_blasr(subread_fasta, reference_fasta, blasr_args)
    check_output_file(alignment_file)
    return alignment_file
def _parse_subread_counts(subread_fofn):
    """
    Count the number of subreads associated with each consensus
    """
    sizes = {}
    with open(subread_fofn) as handle:
        for filepath in handle:
            filepath = filepath.strip()
            filename = os.path.basename(filepath)
            contig_name = filename.split('.')[0]
            # Strip any 'Allele_' and 'Resequenced_' prefixes, in that order
            if contig_name.startswith('Allele_'):
                contig_name = '_'.join(contig_name.split('_')[1:])
            if contig_name.startswith('Resequenced_'):
                contig_name = '_'.join(contig_name.split('_')[1:])
            sizes[contig_name] = fasta_size(filepath)
    return sizes
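
# Worked example of the prefix-stripping above (filenames are illustrative):
#   'Allele_Contig_1.fasta'              -> 'Contig_1'
#   'Resequenced_Contig_2.fasta'         -> 'Contig_2'
#   'Allele_Resequenced_Contig_3.fasta'  -> 'Contig_3' (both prefixes removed,
#                                           since the two checks run in sequence)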
def align_subreads(self, white_list, reference_file):
    """
    Align the subreads in a Whitelist to the created reference
    """
    basename = '.'.join(reference_file.split('.')[:-1])
    alignment_file = '%s.m1' % basename
    reference_count = fasta_size(reference_file)
    blasr_args = {'nproc': self._nproc,
                  'out': alignment_file,
                  'bestn': 1,
                  'nCandidates': reference_count,
                  'noSplitSubreads': True}
    run_blasr(white_list, reference_file, blasr_args)
    check_output_file(alignment_file)
    return alignment_file
def separate_alleles(self, white_list):
    """
    Recursively cluster subreads and split the results until each
    cluster represents a single allele
    """
    # Run the current pass, clustering only on the first iteration
    log.info("Beginning iteration #%s" % self._count)
    curr_output = os.path.join(self._output, 'Iteration_%s' % self._count)
    output_file = amp_assem_output_exists(curr_output)
    if output_file:
        log.info('Existing output detected, skipping...')
    else:
        log.info('No existing output detected, proceeding...')
        if self._count == 0:
            # For the first pass we enable clustering
            output_file = self.run_analysis(curr_output, white_list, cluster=True)
        else:
            # For all other iterations, we disable clustering
            output_file = self.run_analysis(curr_output, white_list, cluster=False)
    check_output_file(output_file)
    log.info("Finished iteration #%s" % self._count)
    self._count += 1
    # Outputs with a single Fasta record are recorded as-is
    fasta_count = fasta_size(output_file)
    if fasta_count == 1:
        log.info('AmpliconAnalysis generated 1 cluster, exiting...')
        self._output_filelist.append(output_file)
        return
    log.info('AmpliconAnalysis generated %s clusters, continuing splitting' % fasta_count)
    # Otherwise, partition the reads and recurse on each partition
    alignment = self.align_subreads(white_list, output_file)
    groups = group_subreads(alignment)
    output_dir = os.path.dirname(output_file)
    sub_lists = []
    for reference, group in groups.iteritems():
        # Skip groups too small to be worth splitting further
        if len(group) < MIN_SIZE:
            log.info('Cluster %s has fewer than %s subreads, skipping' % (reference, MIN_SIZE))
            continue
        group_file = '%s.ids' % reference
        group_path = os.path.join(output_dir, group_file)
        write_whitelist(group, group_path)
        white_list_seqs = self.extract_whitelist_reads(group_path)
        sub_lists.append(white_list_seqs)
    for sub_list in sub_lists:
        self.separate_alleles(sub_list)
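
# group_subreads is used above but not defined in this section. A minimal
# sketch of one plausible implementation, assuming each m1 record from
# BlasrReader exposes `qname` (subread name) and `tname` (reference name)
# attributes -- both assumptions, not confirmed here:
def group_subreads_sketch(alignment_file):
    """Group subread names by the reference each aligned best to"""
    groups = {}
    for record in BlasrReader(alignment_file):
        groups.setdefault(record.tname, []).append(record.qname)
    return groups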
def _align_fasta(query, reference, format):
    """
    Align a single query sequence to all valid references
    """
    suffix = '.m%s' % format
    temp_align = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
    reference_count = fasta_size(reference)
    blasr_args = {'nproc': NPROC,
                  'out': temp_align.name,
                  'bestn': reference_count,
                  'nCandidates': reference_count,
                  'm': format,
                  'noSplitSubreads': True}
    run_blasr(query, reference, blasr_args)
    # Parse the output for return and delete the file
    alignments = list(BlasrReader(temp_align.name))
    os.unlink(temp_align.name)
    return alignments
def align_best_reference(query, reference, output=None):
    """
    Align the output of AmpliconAnalysis to the references and return
    the path to the resulting m1 alignment, or None if none was produced
    """
    output = _get_output_file(query, output, 'm1')
    # Run Blasr
    ref_count = fasta_size(reference)
    log.info("Aligning %s sequences to %s references" % (query, ref_count))
    blasr_args = {'nproc': nproc,
                  'out': output,
                  'bestn': 1,
                  'nCandidates': ref_count,
                  'noSplitSubreads': True}
    # Use a pre-built suffix array for the reference if one exists
    if reference_has_index(reference):
        blasr_args['sa'] = reference + '.sa'
    run_blasr(query, reference, blasr_args)
    # Check the output file
    if valid_file(output):
        return output
    return None
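
# Since align_best_reference returns None when BLASR produced no valid
# output, callers should check before parsing. A thin illustrative wrapper
# (hypothetical, not part of the module):
def _align_or_warn(query, reference):
    alignment = align_best_reference(query, reference)
    if alignment is None:
        log.warn('No valid alignment produced for %s' % query)
        return []
    return list(BlasrReader(alignment))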
def split_results(amp_analysis):
    """Split the output of an Amplicon Analysis job by Barcode"""
    assert os.path.isdir(amp_analysis)
    sequence_path = os.path.join(amp_analysis, "amplicon_analysis.fasta")
    check_output_file(sequence_path)
    print "Analyzing %s output sequences" % fasta_size(sequence_path)
    barcode_path = os.path.join(amp_analysis, "by_barcode")
    create_directory(barcode_path)
    # Group the consensus records by their barcode of origin
    barcodes = {}
    for record in FastaReader(sequence_path):
        barcodes.setdefault(get_barcode(record), []).append(record)
    # Write each barcode's records to a separate Fasta file
    barcode_files = {}
    for barcode, barcode_records in barcodes.iteritems():
        barcode_file = barcode + ".fasta"
        sample_path = os.path.join(barcode_path, barcode_file)
        with FastaWriter(sample_path) as handle:
            for record in barcode_records:
                handle.writeRecord(record)
        barcode_files[barcode] = sample_path
    return barcode_files
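
# get_barcode is used above but not defined in this section. A minimal sketch
# under the assumption that AmpliconAnalysis prefixes each consensus name with
# its barcode label (e.g. 'Barcode0--0_Cluster0_...'); the exact naming scheme
# is an assumption, not confirmed by this module:
def get_barcode_sketch(record):
    """Extract the barcode label from a consensus record's name"""
    return record.name.strip().split('_')[0]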
def full_align_best_reference(query, reference, output=None):
    """
    Align the output of AmpliconAnalysis to the references and return
    the path to the resulting m5 alignment
    """
    # Figure out the output and remove it if it exists
    output = _get_output_file(query, output, 'm5')
    # Run Blasr
    ref_count = fasta_size(reference)
    log.info("Aligning %s sequences to %s references" % (query, ref_count))
    blasr_args = {'nproc': nproc,
                  'out': output,
                  'm': 5,
                  'bestn': 1,
                  'nCandidates': ref_count,
                  'noSplitSubreads': True}
    # Use a pre-built suffix array for the reference if one exists
    if reference_has_index(reference):
        blasr_args['sa'] = reference + '.sa'
    run_blasr(query, reference, blasr_args)
    # Check the output file
    check_output_file(output)
    return output