def test_align():
    """Ensure that a sequence that ends with a '-' will not cause an error.

    Aligns the records of the SHORT.FASTA fixture against its first record,
    writes the result to a BAM file next to the fixture, and sorts that file.
    The test passes as long as no step raises.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    fasta_path = os.path.join(here, "./rsrc/SHORT.FASTA")
    bam_path = os.path.join(here, "./rsrc/SHORT.FASTA.test.bam")

    # The first record of the fixture is used as the reference sequence.
    fasta_records = SeqIO.parse(fasta_path, 'fasta')
    reference = gapless(next(fasta_records))

    def with_reference(aligned):
        # Emit the reference aligned against itself first, then every
        # record handed over by the aligner (echoed to stdout on the way).
        yield compute_cigar(reference, reference)
        for rec in aligned:
            print(rec)
            yield rec

    def write_bam(aligned):
        BamIO.write(with_reference(aligned), bam_path, reference)

    _align_par(
        reference,
        fasta_records,
        BLOSUM62.load(),
        True,
        False,
        None,
        None,
        write_bam,
        False,
    )

    # Read the output file back by sorting it.
    BamIO.sort(bam_path)
def run_group_alignment (sequence_group):
    """Align a group of sequences against the longest member of the group.

    Each sequence string is cleaned up before use: every 'NNN' and '---'
    run is removed, any remaining '-' becomes 'N', and trailing characters
    are trimmed so the length is a multiple of 3. The cleaned string is
    wrapped in a SeqRecord and passed through gapless().

    Parameters:
        sequence_group: mapping of sequence id -> sequence string.

    Returns:
        dict with three keys:
          'ref'       -- FASTA text of the reference record,
          'alignment' -- FASTA text of the alignment (identical to 'ref'
                         when the group holds a single sequence),
          'seqs'      -- list of the records that were aligned; the
                         reference is excluded unless it is the only record.

    Raises:
        ValueError: if sequence_group is empty.
        Exception: if align_to_refseq reports any discarded sequence.
    """
    print ("%d sequences with matching JUNCTION regions" % (len (sequence_group) - 1))

    seqrecords = []
    for seq_id in sequence_group:
        massaged_string = (
            sequence_group[seq_id]
            .replace ('NNN', '')
            .replace ('---', '')
            .replace ('-', 'N')
        )
        # Drop the trailing 1-2 characters that do not form a full triplet.
        overhang = len (massaged_string) % 3
        if overhang:
            massaged_string = massaged_string[:len (massaged_string) - overhang]
        seqrecords.append (gapless (Bio.SeqRecord.SeqRecord (
            Bio.Seq.Seq (massaged_string),
            id = seq_id,
            name = seq_id,
            description = '',
        )))

    if not seqrecords:
        # Previously surfaced as an opaque "max() arg is an empty sequence".
        raise ValueError ("sequence_group is empty")

    if len (seqrecords) == 1:
        refseq = seqrecords[0].format ('fasta')
        return {'ref': refseq, 'alignment': refseq, 'seqs': seqrecords}

    # The longest sequence becomes the reference (first one wins on ties);
    # it is removed from the list of sequences to align.
    seq_lengths = [len (record.seq) for record in seqrecords]
    refseq = seqrecords.pop (seq_lengths.index (max (seq_lengths)))

    def _print_inputs ():
        # Echo the reference and the remaining sequences in FASTA-like form.
        print (">ref\n%s" % str (refseq.seq))
        print ('\n'.join ([">%s\n%s" % (str (k.id), str (k.seq)) for k in seqrecords]))

    _print_inputs ()

    msa, discarded = align_to_refseq (
        refseq,
        seqrecords,
        score_matrix=BLOSUM62.load (),
        do_codon=True,
        reverse_complement=False,
        keep_insertions=False,
    )

    if len (discarded):
        # Dump the inputs again next to the discarded records for debugging.
        _print_inputs ()
        print (discarded)
        raise Exception ("Non-empty discarded")

    with io.StringIO () as string_buffer:
        Bio.SeqIO.write (msa, string_buffer, "fasta")
        all_lines = string_buffer.getvalue ()

    return {'ref': refseq.format ('fasta'), 'alignment': all_lines, 'seqs': seqrecords}
def __call__(
        self,
        ref,
        query,
        open_insertion=None,
        extend_insertion=None,
        open_deletion=None,
        extend_deletion=None,
        miscall_cost=None,
        do_local=None,
        do_affine=None
        ):
    """Align ``query`` against ``ref`` and return the score and both aligned sequences.

    Parameters:
        ref, query: a SeqRecord, a Seq, or a plain string. Both are passed
            through gapless() before anything else happens.
        open_insertion, extend_insertion, open_deletion, extend_deletion,
        miscall_cost, do_local, do_affine: per-call overrides; when left
            as None the value stored on the instance is used.

    Returns:
        (score, ref_aligned, query_aligned). The aligned sequences have the
        same kind (SeqRecord / Seq / str) as the corresponding input. The
        score is divided by the query length (by a third of it in codon
        mode). For an empty query the score is -inf and the aligned query
        consists of one '-' per reference character.

    Raises:
        ValueError: in codon mode, if len(ref) is not a multiple of 3.
    """
    # populate defaults from initialization
    if open_insertion is None:
        open_insertion = self.__open_insertion
    if extend_insertion is None:
        extend_insertion = self.__extend_insertion
    if open_deletion is None:
        open_deletion = self.__open_deletion
    if extend_deletion is None:
        extend_deletion = self.__extend_deletion
    if miscall_cost is None:
        miscall_cost = self.__miscall_cost
    if do_local is None:
        do_local = self.__do_local
    if do_affine is None:
        do_affine = self.__do_affine

    ref = gapless(ref)
    query = gapless(query)

    # if the reference and query are the same, we can return early
    if len(ref) and ref == query:
        if self.__do_codon:
            score = sum(self.__score_matrix[char, char] for char in _translate(ref))
        else:
            score = sum(self.__score_matrix[char, char] for char in ref)
        # NOTE(review): in codon mode this divides a per-codon sum by the
        # nucleotide length, unlike the per-codon normalization at the end
        # of this method -- confirm this is intended.
        return score / len(ref), ref, query

    # reduce both inputs to plain strings for the aligner
    if isinstance(ref, SeqRecord):
        ref_ = str(ref.seq)
    elif isinstance(ref, Seq):
        ref_ = str(ref)
    else:
        ref_ = ref

    if isinstance(query, SeqRecord):
        query_ = str(query.seq)
    elif isinstance(query, Seq):
        query_ = str(query)
    else:
        query_ = query

    # convert to uppercase, because _align assumes it
    ref_ = ref_.upper()
    query_ = query_.upper()

    if self.__do_codon and len(ref_) % 3 != 0:
        raise ValueError('when do_codon = True, len(ref) must be a multiple of 3')

    # for shared memory safety, recreate matrices if the PID changed
    current_pid = getpid()
    if self.__cached_pid != current_pid:
        self.__cached_pid = current_pid
        self.__cached_score_matrix = np.empty((1,), dtype=float)
        self.__cached_deletion_matrix = np.empty((1,), dtype=float)
        self.__cached_insertion_matrix = np.empty((1,), dtype=float)

    if self.__do_codon:
        cache_size = (len(ref_) // 3 + 1) * (len(query_) + 1)
    else:
        cache_size = (len(ref_) + 1) * (len(query_) + 1)

    # grow (never shrink) the scratch buffers handed to _align
    if self.__cached_score_matrix.shape[0] < cache_size:
        self.__cached_score_matrix.resize((cache_size,))

    if do_affine:
        if self.__cached_deletion_matrix.shape[0] < cache_size:
            self.__cached_deletion_matrix.resize((cache_size,))
        if self.__cached_insertion_matrix.shape[0] < cache_size:
            self.__cached_insertion_matrix.resize((cache_size,))

    if len(query) == 0:
        # nothing to align: the query becomes all gaps
        score, ref_aligned, query_aligned = float('-Inf'), ref_, '-' * len(ref_)
    else:
        score, ref_aligned, query_aligned = _align(
            ref_.encode('utf-8'),
            query_.encode('utf-8'),
            self.__nchars,
            self.__char_map,
            self.__score_matrix_,
            self.__score_matrix_.shape[0],
            open_insertion,
            extend_insertion,
            open_deletion,
            extend_deletion,
            miscall_cost,
            do_local,
            do_affine,
            self.__do_codon,
            self.__codon3x5,
            self.__codon3x4,
            self.__codon3x2,
            self.__codon3x1,
            self.__cached_score_matrix,
            self.__cached_deletion_matrix,
            self.__cached_insertion_matrix
            )
        # only the _align results are decoded; the empty-query branch
        # above already produced text
        if sys.version_info >= (3, 0):
            ref_aligned = ref_aligned.decode('utf-8')
            query_aligned = query_aligned.decode('utf-8')

    # hand the result back in the same kind of object that was passed in
    if isinstance(ref, SeqRecord):
        ref_aligned_ = SeqRecord(
            Seq(ref_aligned, ref.seq.alphabet),
            id=ref.id,
            name=ref.name,
            description=ref.description,
            dbxrefs=ref.dbxrefs,
            annotations=ref.annotations
            )
    elif isinstance(ref, Seq):
        # a Seq carries its alphabet directly; it has no `.seq` attribute
        ref_aligned_ = Seq(ref_aligned, ref.alphabet)
    else:
        ref_aligned_ = ref_aligned

    if isinstance(query, SeqRecord):
        query_aligned_ = SeqRecord(
            Seq(query_aligned, query.seq.alphabet),
            id=query.id,
            name=query.name,
            description=query.description,
            dbxrefs=query.dbxrefs,
            annotations=query.annotations
            )
    elif isinstance(query, Seq):
        # a Seq carries its alphabet directly; it has no `.seq` attribute
        query_aligned_ = Seq(query_aligned, query.alphabet)
    else:
        query_aligned_ = query_aligned

    # normalize score to per-position
    if len(query_):
        score /= (len(query_) / 3) if self.__do_codon else len(query_)

    return score, ref_aligned_, query_aligned_
def __call__(self,
             ref,
             query,
             open_insertion=None,
             extend_insertion=None,
             open_deletion=None,
             extend_deletion=None,
             miscall_cost=None,
             do_local=None,
             do_affine=None):
    """Align ``query`` against ``ref`` and return the score and both aligned sequences.

    Parameters:
        ref, query: a SeqRecord, a Seq, or a plain string. Both are passed
            through gapless() before anything else happens.
        open_insertion, extend_insertion, open_deletion, extend_deletion,
        miscall_cost, do_local, do_affine: per-call overrides; when left
            as None the value stored on the instance is used.

    Returns:
        (score, ref_aligned, query_aligned). The aligned sequences have the
        same kind (SeqRecord / Seq / str) as the corresponding input. The
        score is divided by the query length (by a third of it in codon
        mode). For an empty query the score is -inf and the aligned query
        consists of one '-' per reference character.

    Raises:
        ValueError: in codon mode, if len(ref) is not a multiple of 3.
    """
    # populate defaults from initialization
    if open_insertion is None:
        open_insertion = self.__open_insertion
    if extend_insertion is None:
        extend_insertion = self.__extend_insertion
    if open_deletion is None:
        open_deletion = self.__open_deletion
    if extend_deletion is None:
        extend_deletion = self.__extend_deletion
    if miscall_cost is None:
        miscall_cost = self.__miscall_cost
    if do_local is None:
        do_local = self.__do_local
    if do_affine is None:
        do_affine = self.__do_affine

    ref = gapless(ref)
    query = gapless(query)

    # if the reference and query are the same, we can return early
    if len(ref) and ref == query:
        if self.__do_codon:
            score = sum(self.__score_matrix[char, char]
                        for char in _translate(ref))
        else:
            score = sum(self.__score_matrix[char, char] for char in ref)
        # NOTE(review): in codon mode this divides a per-codon sum by the
        # nucleotide length, unlike the per-codon normalization at the end
        # of this method -- confirm this is intended.
        return score / len(ref), ref, query

    # reduce both inputs to plain strings for the aligner
    if isinstance(ref, SeqRecord):
        ref_ = str(ref.seq)
    elif isinstance(ref, Seq):
        ref_ = str(ref)
    else:
        ref_ = ref

    if isinstance(query, SeqRecord):
        query_ = str(query.seq)
    elif isinstance(query, Seq):
        query_ = str(query)
    else:
        query_ = query

    # convert to uppercase, because _align assumes it
    ref_ = ref_.upper()
    query_ = query_.upper()

    if self.__do_codon and len(ref_) % 3 != 0:
        raise ValueError(
            'when do_codon = True, len(ref) must be a multiple of 3')

    # for shared memory safety, recreate matrices if the PID changed
    current_pid = getpid()
    if self.__cached_pid != current_pid:
        self.__cached_pid = current_pid
        self.__cached_score_matrix = np.empty((1, ), dtype=float)
        self.__cached_deletion_matrix = np.empty((1, ), dtype=float)
        self.__cached_insertion_matrix = np.empty((1, ), dtype=float)

    if self.__do_codon:
        cache_size = (len(ref_) // 3 + 1) * (len(query_) + 1)
    else:
        cache_size = (len(ref_) + 1) * (len(query_) + 1)

    # grow (never shrink) the scratch buffers handed to _align
    if self.__cached_score_matrix.shape[0] < cache_size:
        self.__cached_score_matrix.resize((cache_size, ))

    if do_affine:
        if self.__cached_deletion_matrix.shape[0] < cache_size:
            self.__cached_deletion_matrix.resize((cache_size, ))
        if self.__cached_insertion_matrix.shape[0] < cache_size:
            self.__cached_insertion_matrix.resize((cache_size, ))

    if len(query) == 0:
        # nothing to align: the query becomes all gaps
        score, ref_aligned, query_aligned = float(
            '-Inf'), ref_, '-' * len(ref_)
    else:
        score, ref_aligned, query_aligned = _align(
            ref_.encode('utf-8'), query_.encode('utf-8'), self.__nchars,
            self.__char_map, self.__score_matrix_,
            self.__score_matrix_.shape[0], open_insertion,
            extend_insertion, open_deletion, extend_deletion, miscall_cost,
            do_local, do_affine, self.__do_codon, self.__codon3x5,
            self.__codon3x4, self.__codon3x2, self.__codon3x1,
            self.__cached_score_matrix, self.__cached_deletion_matrix,
            self.__cached_insertion_matrix)
        # only the _align results are decoded; the empty-query branch
        # above already produced text
        if sys.version_info >= (3, 0):
            ref_aligned = ref_aligned.decode('utf-8')
            query_aligned = query_aligned.decode('utf-8')

    # hand the result back in the same kind of object that was passed in
    if isinstance(ref, SeqRecord):
        ref_aligned_ = SeqRecord(Seq(ref_aligned, ref.seq.alphabet),
                                 id=ref.id,
                                 name=ref.name,
                                 description=ref.description,
                                 dbxrefs=ref.dbxrefs,
                                 annotations=ref.annotations)
    elif isinstance(ref, Seq):
        # a Seq carries its alphabet directly; it has no `.seq` attribute
        ref_aligned_ = Seq(ref_aligned, ref.alphabet)
    else:
        ref_aligned_ = ref_aligned

    if isinstance(query, SeqRecord):
        query_aligned_ = SeqRecord(Seq(query_aligned, query.seq.alphabet),
                                   id=query.id,
                                   name=query.name,
                                   description=query.description,
                                   dbxrefs=query.dbxrefs,
                                   annotations=query.annotations)
    elif isinstance(query, Seq):
        # a Seq carries its alphabet directly; it has no `.seq` attribute
        query_aligned_ = Seq(query_aligned, query.alphabet)
    else:
        query_aligned_ = query_aligned

    # normalize score to per-position
    if len(query_):
        score /= (len(query_) / 3) if self.__do_codon else len(query_)

    return score, ref_aligned_, query_aligned_
def output(records):
    """Append every record to ``alignment``.

    Each record is first passed through gapless() and then through
    gapful(..., insertions=False) before being appended.
    """
    for rec in records:
        ungapped = gapless(rec)
        alignment.append(gapful(ungapped, insertions=False))
def output(records):
    """Append every record to ``alignment``.

    Each record is passed through gapless(), then
    gapful(..., insertions=False), then suffix_pad() before being appended.
    """
    for rec in records:
        regapped = gapful(gapless(rec), insertions=False)
        alignment.append(suffix_pad(regapped))
def discard(record):
    """Write a single record to ``discard_handle`` in FASTA format.

    The record is upper-cased and passed through gapless() first.
    """
    cleaned = gapless(record.upper())
    SeqIO.write([cleaned], discard_handle, 'fasta')
def main(
        input_handle,
        output_handle,
        reference,
        expected_identity,
        alphabet,
        reverse_complement,
        score_matrix,
        discard_handle,
        do_sort,
        quiet,
        globalStartingPoint,
        extendGapPenalty
        ):
    """Align the FASTA records from ``input_handle`` and write them as BAM.

    Parameters:
        input_handle: FASTA source, read with SeqIO.parse.
        output_handle: destination passed to BamIO.write (and BamIO.sort).
        reference: reference sequence; when None, the first input record
            (passed through gapless()) is used and its self-alignment is
            written ahead of the other records.
        expected_identity, reverse_complement, quiet, globalStartingPoint,
        extendGapPenalty: forwarded unchanged to _align_par.
        alphabet: 'codon' switches on codon mode; 'dna' takes part in the
            score-matrix check below.
        score_matrix: object whose load() returns the matrix handed to
            _align_par.
        discard_handle: when truthy, discarded records are written to it
            as FASTA; otherwise no discard callback is installed.
        do_sort: when truthy, the output is sorted after writing.

    Returns:
        0 on success, -1 if a FrequenciesError was raised during alignment.

    Raises:
        RuntimeError: if score_matrix.load() fails.
        ValueError: if the alphabet / score-matrix check fails.
    """
    try:
        score_matrix_ = score_matrix.load()
    except Exception:
        # Deliberately not a bare `except:` so that KeyboardInterrupt and
        # SystemExit are not disguised as a loading failure.
        raise RuntimeError('could not load the score matrix')

    # NOTE(review): as written this only fires when alphabet == 'dna' and
    # score_matrix is neither a DNAScoreMatrix nor a ProteinScoreMatrix,
    # which is weaker than the error message suggests; it also inspects
    # score_matrix rather than the loaded score_matrix_ -- confirm intent.
    if ((alphabet == 'dna' and not isinstance(score_matrix, DNAScoreMatrix)) and
            not isinstance(score_matrix, ProteinScoreMatrix)):
        raise ValueError(
            'DNA alphabet requires a DNA score matrix, '
            'while amino and codon alphabets require a protein score matrix'
            )

    do_codon = alphabet == 'codon'

    records = SeqIO.parse(input_handle, 'fasta')

    # grab the first, make it gapless once and for all
    if reference is None:
        reference = gapless(next(records))

        def allseqs(records):
            # the reference came from the input, so emit it (aligned to
            # itself) ahead of everything else
            yield compute_cigar(reference, reference)
            for record in records:
                yield record
    else:
        def allseqs(records):
            for record in records:
                yield record

    if discard_handle:
        def discard(record):
            SeqIO.write([gapless(record.upper())], discard_handle, 'fasta')
    else:
        discard = None

    def output(records):
        BamIO.write(
            allseqs(records),
            output_handle,
            reference
            )

    retcode = -1
    try:
        _align_par(
            reference,
            records,
            score_matrix_,
            do_codon,
            reverse_complement,
            expected_identity,
            discard,
            output,
            globalStartingPoint,
            extendGapPenalty,
            quiet
            )
        if do_sort:
            BamIO.sort(output_handle)
        retcode = 0
    except FrequenciesError:
        print(
            'supplied score-matrix does not imply a frequency distribution:',
            'please choose another matrix if you must filter on expected identity',
            file=sys.stderr
            )

    return retcode