def align_to_refseq(
    reference,
    records,
    score_matrix=None,
    do_codon=True,
    reverse_complement=True,
    expected_identity=None,
    keep_insertions=False,
    **kwargs
):
    """Align ``records`` to ``reference`` and collect the results.

    The work is delegated to ``_align_par``; this wrapper supplies two
    callbacks, one accumulating the records handed to the discard callback
    and one appending every output record to a ``MultipleSeqAlignment``.

    Args:
        reference: reference record; its ``len()`` is the target row length.
        records: iterable of records to align.
        score_matrix: scoring matrix; ``BLOSUM62.load()`` when ``None``.
        do_codon: forwarded to ``_align_par``.
        reverse_complement: forwarded to ``_align_par``.
        expected_identity: forwarded to ``_align_par``.
        keep_insertions: must be falsy; insertions are not supported.
        **kwargs: the legacy hy454 spellings ``codon`` and ``revcomp``
            override ``do_codon`` and ``reverse_complement``; any other
            keyword is ignored.

    Returns:
        ``(alignment, discards)``: the ``MultipleSeqAlignment`` and the list
        of records passed to the discard callback.

    Raises:
        ValueError: if ``keep_insertions`` is truthy.
    """
    if keep_insertions:
        raise ValueError('keeping insertions is unsupported at this time')

    if score_matrix is None:
        from BioExt.scorematrices import BLOSUM62
        score_matrix = BLOSUM62.load()

    # Legacy hy454 keyword names take precedence when supplied.
    if 'codon' in kwargs:
        do_codon = kwargs['codon']
    if 'revcomp' in kwargs:
        reverse_complement = kwargs['revcomp']

    discards = []
    alignment = MultipleSeqAlignment([])
    alignment_length = len(reference)

    def _remember_discard(rejected):
        discards.append(rejected)

    def _pad_to_reference(row):
        # Rows shorter than the reference are right-padded with gap characters.
        missing = alignment_length - len(row)
        if missing <= 0:
            return row
        return SeqRecord(
            Seq(str(row.seq) + '-' * missing),
            id=row.id,
            name=row.name,
            dbxrefs=copy(row.dbxrefs),
            description=row.description,
            annotations=copy(row.annotations),
        )

    def _collect(batch):
        for row in batch:
            regapped = gapful(gapless(row), insertions=False)
            alignment.append(_pad_to_reference(regapped))

    _align_par(
        reference,
        records,
        score_matrix,
        do_codon,
        reverse_complement,
        expected_identity,
        _remember_discard,
        _collect,
    )

    return alignment, discards
def test_align():
    """Align SHORT.FASTA against its first record, write a BAM file, sort it.

    The first FASTA record (passed through ``gapless``) is the reference;
    the remaining records are handed to ``_align_par``.  The output callback
    writes the reference's self-alignment followed by every aligned record
    to ``SHORT.FASTA.test.bam``, which is then passed to ``BamIO.sort``.
    The test makes no assertions; it only checks that nothing raises.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    fasta_path = os.path.join(here, "./rsrc/SHORT.FASTA")
    bam_path = os.path.join(here, "./rsrc/SHORT.FASTA.test.bam")

    fasta_records = SeqIO.parse(fasta_path, 'fasta')
    # The first record is consumed here; the iterator keeps the rest.
    reference = gapless(next(fasta_records))

    def reference_then(aligned):
        # Lead with the reference aligned to itself, then stream the rest.
        yield compute_cigar(reference, reference)
        for entry in aligned:
            print(entry)
            yield entry

    def write_bam(aligned):
        BamIO.write(reference_then(aligned), bam_path, reference)

    _align_par(
        reference,
        fasta_records,
        BLOSUM62.load(),
        True,
        False,
        None,
        None,
        write_bam,
        False,
    )

    BamIO.sort(bam_path)
def test_align_to_refseq_suffix_pad():
    """Check that ``align_to_refseq`` leaves the fourth TEST.FASTA record intact.

    Loads the HXB2 PR/RT reference and the records in ``rsrc/TEST.FASTA``,
    aligns them with BLOSUM62, and asserts that row 3 of the resulting
    alignment has the same sequence as input record 3.
    """
    refseq = hxb2.prrt.load()

    here = os.path.dirname(os.path.realpath(__file__))
    fasta_path = os.path.join(here, "./rsrc/TEST.FASTA")

    with open(fasta_path) as handle:
        seqrecords = list(SeqIO.parse(handle, "fasta"))

    # NOTE(review): with a single record the test exits early by returning a
    # dict and asserts nothing -- confirm this branch is intentional.
    if len(seqrecords) == 1:
        refseq = seqrecords[0].format('fasta')
        return {'ref': refseq, 'alignment': refseq, 'seqs': seqrecords}

    sm = BLOSUM62.load()

    # NOTE(review): the result of this length comparison is discarded, so it
    # checks nothing; presumably an ``assert`` was intended -- confirm.
    all([len(seqrecord) == len(seqrecords[0]) for seqrecord in seqrecords])

    msa, discarded = align_to_refseq(
        refseq,
        seqrecords,
        score_matrix=sm,
        codon=True,
        expected_identity=0.6,
        keep_insertions=False,
    )

    assert msa[3].seq == seqrecords[3].seq
def align_to_refseq(
    reference,
    records,
    score_matrix=None,
    do_codon=True,
    reverse_complement=True,
    expected_identity=None,
    keep_insertions=False,
    **kwargs
):
    """Align ``records`` to ``reference`` and collect the results.

    The work is delegated to ``_align_par``; this wrapper supplies two
    callbacks, one accumulating the records handed to the discard callback
    and one appending every output record to a ``MultipleSeqAlignment``.
    Output rows shorter than the reference are right-padded with ``-``.

    Args:
        reference: reference record; its ``len()`` is the target row length.
        records: iterable of records to align.
        score_matrix: scoring matrix; ``BLOSUM62.load()`` when ``None``.
        do_codon: forwarded to ``_align_par``.
        reverse_complement: forwarded to ``_align_par``.
        expected_identity: forwarded to ``_align_par``.
        keep_insertions: must be falsy; insertions are not supported.
        **kwargs: the legacy hy454 spellings ``codon`` and ``revcomp``
            override ``do_codon`` and ``reverse_complement``.

    Returns:
        ``(alignment, discards)``: the ``MultipleSeqAlignment`` and the list
        of records passed to the discard callback.

    Raises:
        ValueError: if ``keep_insertions`` is truthy.
    """
    if keep_insertions:
        raise ValueError('keeping insertions is unsupported at this time')

    if score_matrix is None:
        from BioExt.scorematrices import BLOSUM62
        score_matrix = BLOSUM62.load()

    # drop-in compatibility with hy454
    do_codon = kwargs.get('codon', do_codon)
    reverse_complement = kwargs.get('revcomp', reverse_complement)

    discards = []

    def discard(record):
        discards.append(record)

    alignment = MultipleSeqAlignment([])
    alignment_length = len(reference)

    def suffix_pad(record):
        deficit = alignment_length - len(record)
        if deficit <= 0:
            return record
        padded = str(record.seq) + '-' * deficit
        # Biopython >= 1.78 removed Bio.Alphabet: Seq objects no longer have
        # an ``alphabet`` attribute and Seq() rejects a second argument.
        # Forward the alphabet only where it still exists.
        alphabet = getattr(record.seq, 'alphabet', None)
        padded_seq = Seq(padded) if alphabet is None else Seq(padded, alphabet)
        return SeqRecord(
            padded_seq,
            id=record.id,
            name=record.name,
            dbxrefs=copy(record.dbxrefs),
            description=record.description,
            annotations=copy(record.annotations),
        )

    def output(records):
        for record in records:
            alignment.append(
                suffix_pad(gapful(gapless(record), insertions=False))
            )

    _align_par(
        reference,
        records,
        score_matrix,
        do_codon,
        reverse_complement,
        expected_identity,
        discard,
        output,
    )

    return alignment, discards
def align(self, refseq, seqs, score_matrix=None, revcomp=False, expected_identity=0., keep_insertions=False, quiet=True):
    """Align every sequence in ``seqs`` to ``refseq`` across the worker nodes.

    The sequences are split into at most ``self.nodes`` contiguous slices,
    one job per slice is submitted through ``self.map``, and the
    JSON-encoded results are merged.

    Args:
        refseq: reference sequence string (upper-cased before use).
        seqs: sized, sliceable collection of sequence strings.
        score_matrix: scoring matrix.  When ``None``, ``BLOSUM62.load()`` is
            used for codon alignment and a ``DNAExpIdScoreMatrix`` otherwise.
        revcomp: sent to the jobs as ``'Yes'``/``'No'``.
        expected_identity: sent to the jobs unchanged; ``0.`` selects 0.8
            when the default DNA matrix has to be built.
        keep_insertions: sent to the jobs as ``'Yes'``/``'No'``.
        quiet: forwarded to ``self.map``.

    Returns:
        Five parallel lists: new reference strings, new sequence strings,
        scores, overlaps and identities.  All five are empty when ``seqs``
        is empty.

    Raises:
        ValueError: if ``score_matrix`` does not match the alignment mode.
    """
    # If we have no sequences, abort early to prevent later errors.  The
    # result must keep the same five-list shape as the normal return so
    # callers that unpack five values keep working on empty input.
    if not len(seqs):
        return [], [], [], [], []

    if score_matrix is None:
        if self.codon:
            score_matrix = BLOSUM62.load()
        else:
            score_matrix = DNAExpIdScoreMatrix(
                0.8 if expected_identity == 0. else expected_identity,
                {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}
            )

    if self.codon and not isinstance(score_matrix, ProteinScoreMatrix):
        raise ValueError('score_matrix incompatible with codon alignment')
    elif not self.codon and not isinstance(score_matrix, DNAScoreMatrix):
        raise ValueError('score_matrix incompatible with dna alignment')

    smdef = {
        ('_%s_letters' % self.__smvar, '_%s_scorematrix' % self.__smvar): score_matrix
    }

    # uppercase the refseq to deal with bugs in HyPhy's aligner
    refseq = refseq.upper()

    numseqs = len(seqs)

    # if the # nodes exceeds the number of seqs, we just need numseqs jobs
    numnodes = min(numseqs, self.nodes)
    seqs_per_node = max(1, numseqs // numnodes)
    remainder = numseqs % numnodes

    arg1 = 'Yes' if revcomp else 'No'
    arg2 = 'Yes' if keep_insertions else 'No'

    argslist = []
    upr = 0
    for i in range(numnodes):
        # The slice cursors carry over between iterations.  The first
        # ``remainder`` jobs take one extra sequence so that every sequence
        # is assigned to exactly one job.
        lwr = upr
        extra = 1 if i < remainder else 0
        upr = min(numseqs, lwr + seqs_per_node + extra)
        node_seqs = [s.upper() for s in seqs[lwr:upr]]
        argslist.append(
            [arg1, arg2, refseq, expected_identity, len(node_seqs)] + node_seqs
        )

    retstrs = self.map(argslist, globalvars=smdef, quiet=quiet)

    seqscores = []
    for retstr in retstrs:
        seqscores.extend(json.loads(retstr))

    newrefstrs, newseqstrs, scores, overlaps, identities = zip(*seqscores)

    return list(newrefstrs), list(newseqstrs), list(scores), list(overlaps), list(identities)
def run_group_alignment(sequence_group):
    """Codon-align a group of sequences against the longest one in the group.

    Every sequence string is cleaned up (``'NNN'`` and ``'---'`` removed,
    remaining ``'-'`` replaced by ``'N'``, trailing bases trimmed so the
    length is a multiple of three), wrapped in a ``SeqRecord`` and passed
    through ``gapless``.  The longest record becomes the reference and the
    rest are aligned to it with ``align_to_refseq`` using BLOSUM62.

    Args:
        sequence_group: mapping of sequence id to sequence string.

    Returns:
        A dict with keys ``'ref'`` (reference in FASTA format),
        ``'alignment'`` (alignment in FASTA format) and ``'seqs'`` (the
        non-reference records).  When the group holds a single sequence,
        ``'ref'`` and ``'alignment'`` are both that sequence in FASTA format
        and ``'seqs'`` is the one-element record list.

    Raises:
        ValueError: if ``sequence_group`` is empty.
        Exception: if ``align_to_refseq`` discards any record.
    """
    print("%d sequences with matching JUNCTION regions" % (len(sequence_group) - 1))

    # An empty group previously failed further down with an opaque
    # "max() arg is an empty sequence" ValueError; fail here with a clear one.
    if not len(sequence_group):
        raise ValueError('sequence_group must contain at least one sequence')

    seqrecords = []
    for seq_id in sequence_group:
        massaged_string = (
            sequence_group[seq_id]
            .replace('NNN', '')
            .replace('---', '')
            .replace('-', 'N')
        )
        # Drop the trailing partial codon, if any.
        overhang = len(massaged_string) % 3
        if overhang:
            massaged_string = massaged_string[:-overhang]
        seqrecords.append(
            gapless(
                Bio.SeqRecord.SeqRecord(
                    Bio.Seq.Seq(massaged_string),
                    id=seq_id,
                    name=seq_id,
                    description='',
                )
            )
        )

    if len(seqrecords) == 1:
        refseq = seqrecords[0].format('fasta')
        return {'ref': refseq, 'alignment': refseq, 'seqs': seqrecords}

    # find the longest sequence
    seq_lengths = [len(record.seq) for record in seqrecords]
    refseq = seqrecords.pop(seq_lengths.index(max(seq_lengths)))

    def dump_inputs():
        # Debug dump of the reference and the remaining records as FASTA text.
        print(">ref\n%s" % str(refseq.seq))
        print('\n'.join([">%s\n%s" % (str(k.id), str(k.seq)) for k in seqrecords]))

    # NOTE(review): the original indentation was lost; this dump is assumed to
    # belong to the "reference length is not a multiple of three" branch --
    # confirm against the upstream file.
    if len(refseq.seq) % 3:
        dump_inputs()

    sm = BLOSUM62.load()

    msa, discarded = align_to_refseq(
        refseq,
        seqrecords,
        score_matrix=sm,
        do_codon=True,
        reverse_complement=False,
        keep_insertions=False,
    )

    if len(discarded):
        dump_inputs()
        print(discarded)
        raise Exception("Non-empty discarded")

    with io.StringIO() as string_buffer:
        Bio.SeqIO.write(msa, string_buffer, "fasta")
        all_lines = string_buffer.getvalue()

    return {'ref': refseq.format('fasta'), 'alignment': all_lines, 'seqs': seqrecords}
def align_to_refseq(
    reference,
    records,
    score_matrix=None,
    do_codon=True,
    reverse_complement=True,
    expected_identity=None,
    keep_insertions=False,
    **kwargs
):
    """Align ``records`` to ``reference`` and collect the results.

    The work is delegated to ``_align_par``; this wrapper supplies two
    callbacks, one accumulating the records handed to the discard callback
    and one appending every output record to a ``MultipleSeqAlignment``.

    Args:
        reference: reference record, forwarded to ``_align_par``.
        records: iterable of records to align.
        score_matrix: scoring matrix; ``BLOSUM62.load()`` when ``None``.
        do_codon: forwarded to ``_align_par``.
        reverse_complement: forwarded to ``_align_par``.
        expected_identity: forwarded to ``_align_par``.
        keep_insertions: must be falsy; insertions are not supported.
        **kwargs: the legacy hy454 spellings ``codon`` and ``revcomp``
            override ``do_codon`` and ``reverse_complement``; any other
            keyword is ignored.

    Returns:
        ``(alignment, discards)``: the ``MultipleSeqAlignment`` and the list
        of records passed to the discard callback.

    Raises:
        ValueError: if ``keep_insertions`` is truthy.
    """
    if keep_insertions:
        raise ValueError('keeping insertions is unsupported at this time')

    if score_matrix is None:
        from BioExt.scorematrices import BLOSUM62
        score_matrix = BLOSUM62.load()

    # Legacy hy454 keyword names take precedence when supplied.
    if 'codon' in kwargs:
        do_codon = kwargs['codon']
    if 'revcomp' in kwargs:
        reverse_complement = kwargs['revcomp']

    discards = []
    alignment = MultipleSeqAlignment([])

    def _remember_discard(rejected):
        discards.append(rejected)

    def _collect(batch):
        for row in batch:
            regapped = gapful(gapless(row), insertions=False)
            alignment.append(regapped)

    _align_par(
        reference,
        records,
        score_matrix,
        do_codon,
        reverse_complement,
        expected_identity,
        _remember_discard,
        _collect,
    )

    return alignment, discards
def align_to_refseq(refseq, seqrecords, score_matrix=None, codon=True, revcomp=True, expected_identity=0., keep_insertions=False, quiet=False):
    """Align ``seqrecords`` to ``refseq`` with ``Aligner`` and rebuild records.

    Args:
        refseq: reference record; ``str(refseq.seq)`` is aligned against.
        seqrecords: indexable collection of records to align.
        score_matrix: scoring matrix; ``BLOSUM62.load()`` when ``None``.
        codon: passed to the ``Aligner`` constructor.
        revcomp: forwarded to the aligner call.
        expected_identity: forwarded to the aligner call; when greater than
            zero, records whose reported identity is negative are discarded.
        keep_insertions: forwarded to the aligner call; also selects the
            container type of the first return value.
        quiet: forwarded to the aligner call.

    Returns:
        ``(aligned, discarded)``.  ``aligned`` is a ``MultipleSeqAlignment``
        when ``keep_insertions`` is falsy and a plain list of ``SeqRecord``
        otherwise.  ``discarded`` is the list of original records that were
        rejected.
    """
    if score_matrix is None:
        score_matrix = BLOSUM62.load()

    aligner = Aligner(codon=codon)
    reference_string = str(refseq.seq)
    query_strings = [str(rec.seq) for rec in seqrecords]
    _, aligned, scores, overlaps, identities = aligner(
        reference_string,
        query_strings,
        score_matrix,
        revcomp,
        expected_identity,
        keep_insertions,
        quiet
    )

    kept = []
    rejected = []
    for idx, aligned_string in enumerate(aligned):
        source = seqrecords[idx]

        if expected_identity > 0. and identities[idx] < 0:
            rejected.append(source)
            continue

        # Build a fresh record from deep copies so the input record is left
        # untouched.  letter_annotations are deliberately not carried over:
        # they won't match the aligned sequence anymore.
        meta = deepcopy(source.annotations)
        meta['_nbpidentical'] = overlaps[idx]
        meta['_pbpscore'] = scores[idx]
        kept.append(
            SeqRecord(
                Seq(aligned_string, generic_nucleotide),
                id=source.id,
                name=source.name,
                description=source.description,
                dbxrefs=deepcopy(source.dbxrefs),
                features=deepcopy(source.features),
                annotations=meta,
            )
        )

    if keep_insertions:
        return kept, rejected
    return MultipleSeqAlignment(kept), rejected
#!/usr/bin/env python3 import nose from Bio.SeqRecord import SeqRecord from Bio.Seq import Seq from BioExt.align import Aligner from BioExt.scorematrices import BLOSUM62 aln = Aligner(BLOSUM62.load()) class test_Aligner(): def test_align_self(self): """Check alignment of reference against itself""" assert aln('GCTAGA', 'GCTAGA') == (4.5, 'GCTAGA', 'GCTAGA') def test_align_self_case(self): """Check case is irrelevant for self-alignment""" assert aln('GCTAGA', 'GCTAGA') == aln('GCTAGA', 'GcTaGa') def test_align_self_seq_to_str(self): """Check alignment of Seq instance against seq-identical str""" ref = 'GCTAGA' record = Seq(ref) assert aln(ref, record) == (4.5, ref, record) def test_align_self_seqrecord_to_str(self): """Check alignment of SeqRecord instance against seq-identical str""" ref = 'GCTAGA'
def validate(
    refseq,
    seqs,
    dna_score_matrix=None,
    protein_score_matrix=None,
    dna_mismatch=0,
    protein_mismatch=0,
    codon=True,
    revcomp=True,
    expected_identity=0.,
    keep_insertions=True,
    quiet=False
):
    """Align ``seqs`` to ``refseq`` and score each resulting pair.

    Args:
        refseq: reference as a ``SeqRecord``, ``Seq`` or ``str``.
        seqs: iterable of queries, each a ``SeqRecord``, ``Seq`` or ``str``.
        dna_score_matrix: callable ``(ref, query, mismatch)`` used for the
            DNA scores; ``DNA80`` when ``None``.
        protein_score_matrix: callable ``(ref, query, mismatch)`` used for
            the protein scores; ``BLOSUM62.load()`` when ``None``.
        dna_mismatch: third argument passed to ``dna_score_matrix``.
        protein_mismatch: third argument passed to ``protein_score_matrix``.
        codon: align in codon mode with the protein matrix when true,
            otherwise in DNA mode with the DNA matrix.
        revcomp: forwarded to the aligner call.
        expected_identity: forwarded to the aligner call; when greater than
            zero, pairs whose identity falls below it get ``None`` scores.
        keep_insertions: forwarded to the aligner call.
        quiet: forwarded to the aligner call.

    Returns:
        ``(lengths, dna_scores, protein_scores)``: three parallel lists with
        one entry per aligned pair.  Protein scores are ``None`` when
        ``codon`` is false.

    Raises:
        ValueError: if ``refseq`` or any query has an unsupported type.
        AssertionError: if an aligned pair differs in length.
    """
    msg = "cannot validate sequences that are not SeqRecord, Seq, or str objects"

    def as_string(obj):
        # Reduce any supported sequence container to a plain string.
        if isinstance(obj, SeqRecord):
            return str(obj.seq)
        if isinstance(obj, Seq):
            return str(obj)
        if isinstance(obj, str):
            return obj
        raise ValueError(msg)

    ref_string = as_string(refseq)
    query_strings = [as_string(query) for query in seqs]

    if dna_score_matrix is None:
        dna_score_matrix = DNA80

    # The default used to be stored in ``score_matrix`` and then overwritten,
    # which left ``protein_score_matrix`` as None: the codon path handed None
    # to the aligner and later tried to call None.
    if protein_score_matrix is None:
        protein_score_matrix = BLOSUM62.load()

    score_matrix = protein_score_matrix if codon else dna_score_matrix

    aligner = Aligner(codon=codon)

    refs, queries, _, _, identities = aligner(
        ref_string,
        query_strings,
        score_matrix,
        revcomp,
        expected_identity,
        keep_insertions,
        quiet
    )

    lengths = []
    dna_scores = []
    protein_scores = []

    for aligned_ref, aligned_query, identity in zip(refs, queries, identities):
        assert len(aligned_ref) == len(aligned_query), 'sequences unaligned for some reason'
        lengths.append(len(aligned_ref))

        if expected_identity > 0. and identity < expected_identity:
            dna_scores.append(None)
            protein_scores.append(None)
            continue

        dna_scores.append(dna_score_matrix(aligned_ref, aligned_query, dna_mismatch))

        # we can translate codon-aligned sequences,
        # but not DNA-aligned sequences
        if codon:
            protein_scores.append(
                protein_score_matrix(
                    translate(aligned_ref),
                    translate(aligned_query),
                    protein_mismatch
                )
            )
        else:
            protein_scores.append(None)

    return lengths, dna_scores, protein_scores