def _concatenate(self, alignments):
    """Return single alignment from list of alignments for multiple genes.

    Every record ID found in any gene alignment gets one row in the result.
    For a gene that lacks the ID, a run of "-" as long as that gene's
    alignment is inserted instead. Rows appear in first-seen order (genes in
    the order given, records in row order), so the output is reproducible.

    A single-element list is returned as its only alignment, unchanged.
    """
    if len(alignments) == 1:
        return alignments[0]
    all_ids = []
    seen = set()
    row_lookups = []
    for gene in alignments:
        rows = {}
        for position, rec in enumerate(gene):
            # Keep the first row for a repeated ID, as list.index() would.
            if rec.id not in rows:
                rows[rec.id] = position
            if rec.id not in seen:
                seen.add(rec.id)
                all_ids.append(rec.id)
        row_lookups.append(rows)
    alignment = MultipleSeqAlignment([])
    for txid in all_ids:
        # Start from a plain string; adding a Seq promotes the result.
        sequence = ""
        for gene, rows in zip(alignments, row_lookups):
            if txid in rows:
                sequence += gene[rows[txid]].seq
            else:
                sequence += "-" * gene.get_alignment_length()
        alignment.append(
            SeqRecord(sequence, id=txid, description="multigene sequence"))
    return alignment
def to_generic(self, alphabet):
    """Build a MultipleSeqAlignment from the parsed alignment tuples.

    self.alignment is iterated as (name, start, seq, end) tuples. A row
    named 'QUERY' marks the start of each alignment block. Rows of the
    first block create the output records; rows of every later block are
    appended, by position within the block, to the matching record.

    alphabet is passed to both the alignment and every Seq
    (for example IUPAC.IUPACProtein).

    Thanks to James Casbon for the code.
    """
    # TODO - Switch to new Bio.Align.MultipleSeqAlignment class?
    rows = []  # [name, accumulated sequence] pairs, in first-block order
    block_count = 0
    row = 0
    for name, _start, seq, _end in self.alignment:
        if name == 'QUERY':
            block_count, row = block_count + 1, 0
        if block_count != 1:
            rows[row][1] += seq
            row += 1
            continue
        rows.append([name, seq])
    generic = MultipleSeqAlignment([], alphabet)
    for name, seq in rows:
        generic.append(SeqRecord(Seq(seq, alphabet), name))
    return generic
def stage_one_trimming(alignment, window_size, proportion, threshold, min_len):
    """
    ---------------------------------------------------------------------
    MODIFIED FUNCTION FROM PHYLUCE: generic_align.py
    ---------------------------------------------------------------------
    First stage alignment trimming: cut every row to the (start, end)
    window returned by running_average for the whole alignment block.

    Each row's alphabet is set to IUPAC ambiguous DNA as a side effect.

    Returns the trimmed MultipleSeqAlignment, or None as soon as one row
    cannot be kept: the window is unusable (start < 0 or a falsy end), the
    trimmed row consists only of '-' or only of '?', or it is shorter than
    min_len. An empty input yields an empty alignment.
    """
    start, end = running_average(alignment, window_size, proportion,
                                 threshold)
    s1_trimmed = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna, "-?"))
    for sequence in alignment:
        sequence.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
        if not (start >= 0 and end):
            return None
        trim = sequence[start:end]
        residues = set(trim)
        # The original compared against the list ['?'], which a set never
        # equals, so all-'?' rows slipped through; compare set to set.
        if residues == {'-'} or residues == {'?'} or len(trim) < min_len:
            return None
        s1_trimmed.append(trim)
    return s1_trimmed
def test_basic_alignment(self):
    """Exercise construction, indexing and slicing of a three-row alignment."""
    letters = "AbcDefGhiJklMnoPqrStuVwxYz"
    rows = (
        ("mixed", letters),
        ("lower", letters.lower()),
        ("upper", letters.upper()),
    )
    alignment = MultipleSeqAlignment([])
    for name, text in rows:
        alignment.append(SeqRecord(Seq(text), id=name))
    self.assertEqual(alignment.get_alignment_length(), 26)
    self.assertEqual(len(alignment), 3)
    for index, (_name, text) in enumerate(rows):
        self.assertEqual(str(alignment[index].seq), text)
    for index, (name, _text) in enumerate(rows):
        self.assertEqual(alignment[index].id, name)
    # Column extraction returns the letters of all rows as one string.
    for col, letter in enumerate(letters):
        self.assertEqual(alignment[:, col],
                         letter + letter.lower() + letter.upper())
    # Row extraction by positive and negative index.
    self.assertEqual(alignment[0].id, "mixed")
    self.assertEqual(alignment[-1].id, "upper")
    # Row slicing yields a new alignment, here in reversed order.
    flipped = alignment[::-1]
    self.assertIsInstance(flipped, MultipleSeqAlignment)
    self.assertEqual(flipped[0].id, "upper")
    self.assertEqual(flipped[2].id, "mixed")
def trim_seqs_to_ref(self):
    """
    Trim the requested sequences to the reference length in the alignment.

    For every record whose id is in self.trim_seqs, positions before
    self.boundary[0] and from self.boundary[1] onwards are overwritten with
    self.gap_char. A record that then contains nothing but gap characters
    is reported on stderr and dropped. All other records are kept as they
    are. self.alignment is replaced by the resulting alignment.
    """
    temp_aln = MultipleSeqAlignment([])
    for seq in self.alignment:
        # Test truthiness first so an unset (None) or empty trim_seqs means
        # "trim nothing" rather than a TypeError from the membership test.
        if not (self.trim_seqs and seq.id in self.trim_seqs):
            temp_aln.append(seq)
            continue
        sequence = MutableSeq(str(seq.seq))
        left, right = self.boundary[0], self.boundary[1]
        if left > 0:
            sequence[0:left] = self.gap_char * left
        if right < len(sequence):
            sequence[right:] = self.gap_char * (len(sequence) - right)
        seq.seq = sequence
        if set(seq.seq) == {self.gap_char}:
            print(
                f"{seq.id} contains only gaps after trimming. "
                f"Removing {seq.id} from alignment.",
                file=sys.stderr)
        else:
            temp_aln.append(seq)
    self.alignment = temp_aln
def removecolumnfrommask(seqfile, filetype, mask):
    """Write a copy of an alignment without the columns listed in a mask file.

    seqfile is read with AlignIO using filetype. mask is a text file with
    one 0-based column index per line. The result is written as FASTA to
    "<text of seqfile before its first '.'>_masked.fas". Records whose id
    contains 'SWARM' are written under the part of the id before the first
    '_'. Progress counts are printed to stdout.
    """
    out_path = seqfile.split('.')[0] + '_masked.fas'
    # Context managers close both files even if parsing fails part-way.
    with open(out_path, 'w+') as outFile:
        alignment = AlignIO.read(seqfile, filetype)
        trimAlign = MultipleSeqAlignment([])
        numCol = alignment.get_alignment_length()
        with open(mask, 'r') as mask_handle:
            coltoremove = [int(line.split('\n')[0]) for line in mask_handle]
        print(len(coltoremove))
        # Set membership keeps the column filter linear in alignment width.
        removed = set(coltoremove)
        colToKeep = [i for i in range(numCol) if i not in removed]
        print(len(colToKeep))
        print('if okay remove+keep (',
              int(len(coltoremove) + len(colToKeep)), ') match ',
              int(numCol))
        for record in alignment:
            newseq = "".join(record[j] for j in colToKeep)
            trimAlign.append(SeqRecord(Seq(newseq), id=record.id))
            if 'SWARM' in record.id:
                header = record.id.split('_')[0]
            else:
                header = record.id
            outFile.write('>' + header + '\n' + newseq + '\n')
    print("Total number of columns remaining: %i" %
          trimAlign.get_alignment_length())
def getOptimalQuartets(self, quartets):
    """Pick the lowest-cost topology for every quartet.

    For each quartet three taxon orderings are scored: the given order and
    the two obtained by swapping element 0, then element 1, with element 2.
    Each ordering is turned into a tree with self.treeFromQuartet and scored
    with SmallParsimony on the records of self._alignment whose id is in
    the quartet. The first ordering with the strictly lowest cost wins.

    Returns a dict mapping self.getQuartetID(quartet) to
    {"topology": <tuple of taxa>, "topology_id": <self.getTopologyID(...)>}.
    Raises AssertionError when two quartets share an id. The input quartets
    are not modified.
    """
    optimal_quartets = dict()
    for quartet in quartets:
        quartet_id = self.getQuartetID(quartet)
        assert quartet_id not in optimal_quartets
        # Permute a private copy so the caller's list keeps its order.
        ordering = list(quartet)
        candidates = [(tuple(ordering), self.treeFromQuartet(list(ordering)))]
        for i in range(0, 2):
            ordering[i], ordering[2] = ordering[2], ordering[i]
            candidates.append(
                (tuple(ordering), self.treeFromQuartet(list(ordering))))
        # Membership does not depend on the ordering, so filter once.
        records = [rec for rec in self._alignment if rec.id in ordering]
        min_cost = float("inf")
        for quartet_key, tree in candidates:
            alignment = MultipleSeqAlignment([])
            for record in records:
                alignment.append(record)
            small_parsimony = SmallParsimony(tree, alignment)
            if small_parsimony.cost < min_cost:
                min_cost = small_parsimony.cost
                optimal_quartets[quartet_id] = {
                    "topology": quartet_key,
                    "topology_id": self.getTopologyID(quartet_key)
                }
    return optimal_quartets
def stage_one_trimming(self, alignment, window_size, threshold, proportion):
    """
    First stage (of 3) alignment trimming: cut every row to the
    (start, end) window returned by self.running_average for the whole
    alignment block.

    Returns the trimmed MultipleSeqAlignment, or None as soon as one row
    cannot be kept: the window is unusable (start < 0 or a falsy end) or
    the trimmed row consists only of '-' or only of '?'. An empty input
    yields an empty alignment.
    """
    # trim positions that delimit the "good" part of the alignment
    start, end = self.running_average(alignment, window_size, threshold,
                                      proportion)
    s1_trimmed = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna, "-"))
    for sequence in alignment:
        if not (start >= 0 and end):
            return None
        trim = sequence[start:end]
        residues = set(trim)
        # Don't keep a taxon with only gaps/missing data. The original
        # compared against the list ['?'], which a set never equals.
        if residues == {'-'} or residues == {'?'}:
            return None
        s1_trimmed.append(trim)
    return s1_trimmed
def load_weighted_msa(work_msa):
    """
    Load the Clustal alignment at work_msa.output_aln and split off the
    reference.

    In every record, leading and trailing runs of "-" are rewritten as "."
    so that "." marks terminal deletions and "-" marks internal ones. The
    record whose id equals work_msa.shared.target_id is kept out of the
    returned alignment and attached to it as msa.ref, with its ungapped
    sequence as msa.ref_ungapped. weight_alignments is then applied to the
    alignment.

    The reference must be present: without it msa.ref stays None and the
    ref.seq access raises AttributeError.
    """
    terminal_gaps = re.compile('^-+|-+$')

    def as_dots(match):
        # Replace a terminal gap run by the same number of dots.
        return '.' * (match.end() - match.start())

    msa_with_ref = AlignIO.read(work_msa.output_aln, 'clustal')
    msa = MultipleSeqAlignment([])
    ref = None
    for record in msa_with_ref:
        dotted = terminal_gaps.sub(as_dots, str(record.seq))
        record.seq = Seq(dotted, record.seq.alphabet)
        if record.id != work_msa.shared.target_id:
            msa.append(record)
        else:
            ref = record
    msa.ref = ref
    msa.ref_ungapped = remove_gaps(ref.seq)
    weight_alignments(msa)
    return msa
def maskResiduesNOMAP(refMSA_file, numseq, alnlen, scores, x, formatout, final_file, seqType):
    '''Mask poorly aligned residues whose score is < x. Will NOT mask gaps.

    refMSA_file is read as FASTA. For the first numseq rows and alnlen
    columns, every non-gap residue whose scores[row][position] is below x
    (both compared as floats) is replaced by '?'. Gap characters ('-') are
    copied unchanged. The masked rows are written to final_file in format
    formatout, with ids "1", "2", ... in row order.

    seqType must be 'protein' or 'dna'; anything else raises ValueError
    (the original failed with an unbound local variable instead).
    '''
    mask_char = '?'
    cutoff = float(x)
    parsed = AlignIO.read(refMSA_file, 'fasta')
    newseqs = []
    for row in range(numseq):
        residues = parsed[row].seq
        masked = []
        for position in range(alnlen):
            residue = str(residues[position])
            # Scores are only consulted for real residues, never for gaps.
            if residue != '-' and float(scores[row][position]) < cutoff:
                masked.append(mask_char)
            else:
                masked.append(residue)
        newseqs.append(''.join(masked))
    maskedMSA = MultipleSeqAlignment([])
    kind = str(seqType)
    for i, masked_seq in enumerate(newseqs):
        if kind == 'protein':
            alphabet = generic_protein
        elif kind == 'dna':
            alphabet = generic_dna
        else:
            raise ValueError(
                "seqType must be 'protein' or 'dna', got %r" % (seqType,))
        maskedMSA.append(
            SeqRecord(Seq(masked_seq, alphabet), id=str(i + 1),
                      description=''))
    # The context manager closes the file even if formatting fails.
    with open(final_file, 'w') as outhandle:
        outhandle.write(maskedMSA.format(str(formatout)))
def pad_nucleotide_sequences(aln_aa, seq_nuc):
    '''
    Introduce gaps of 3 (---) into nucleotide sequences so that they follow
    an amino acid alignment.

    Parameters:
    - aln_aa: amino acid alignment
    - seq_nuc: unaligned nucleotide sequences, looked up by record id

    Returns:
    - aligned nucleotide sequences with all gaps of length 3

    For every '-' in an amino acid row, '---' is emitted; for every other
    character the next three nucleotides are consumed. Rows whose id is
    missing from seq_nuc are reported on stdout and skipped.
    '''
    from Bio.Align import MultipleSeqAlignment
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    aln_nuc = MultipleSeqAlignment([])
    for aa_seq in aln_aa:
        try:
            tmp_nuc_seq = str(seq_nuc[aa_seq.id].seq)
        except KeyError:
            # print() with a single argument behaves the same on
            # Python 2 and Python 3.
            print(aa_seq.id)
            print('Key not found, continue with next sequence')
            continue
        codons = []
        nuc_pos = 0
        for aa in aa_seq:
            if aa == '-':
                codons.append('---')
            else:
                codons.append(tmp_nuc_seq[nuc_pos:nuc_pos + 3])
                nuc_pos += 3
        aln_nuc.append(SeqRecord(seq=Seq(''.join(codons)), id=aa_seq.id))
    return aln_nuc
def main():
    """Strip the locus (file) name prefix from taxon names in nexus files.

    For every input file, each record of each nexus alignment is renamed
    by removing a leading "<file stem>" plus any following underscores,
    then collected into one alignment that is written under the same file
    name in args.output. After each file the number of distinct taxon
    names seen so far must equal args.taxa. The file index is printed
    after each file and all taxon names at the end.
    """
    args = get_args()
    files = get_files(args.input)
    all_taxa = set()
    for count, f in enumerate(files):
        new_align = MultipleSeqAlignment([], generic_dna)
        fname = os.path.splitext(os.path.basename(f))[0]
        # Escape the stem: it is literal text, not a pattern (file names
        # often contain '.' or other regex metacharacters).
        prefix = re.compile("^{}_*".format(re.escape(fname)))
        for align in AlignIO.parse(f, 'nexus'):
            for seq in list(align):
                new_seq_name = prefix.sub("", seq.name)
                all_taxa.add(new_seq_name)
                seq.id = new_seq_name
                seq.name = new_seq_name
                new_align.append(seq)
        assert len(all_taxa) == args.taxa, "Taxon names are not identical"
        outf = os.path.join(args.output, os.path.split(f)[1])
        try:
            # Close the output file deterministically instead of leaking it.
            with open(outf, 'w') as handle:
                AlignIO.write(new_align, handle, 'nexus')
        except ValueError:
            # NOTE(review): debugger trap kept from the original; it blocks
            # unattended runs. Consider logging and re-raising instead.
            pdb.set_trace()
        print(count)
    print("Taxon names in alignments: {0}".format(','.join(list(all_taxa))))
def testLimit(self, list_seqs, start):
    """
    Check, column by column, whether the window is conserved enough.

    list_seqs holds the row indices (within the alignment, not Bio.Seq
    ids) of the sequences to look at. start is the index of the first
    position of the window; start % 3 selects which of self.t_align0,
    self.t_align1 or self.t_align2 is used.

    Returns one boolean per amino acid column of the window: True when the
    count of the most common character divided by len(list_seqs) is at
    least self.min_aa_ratio.
    """
    frame = start % 3
    aa_window_length = int(self.window_length / 3)
    begin = int((start - frame) / 3)
    end = int(begin + aa_window_length)
    # Frames 0 and 1 have their own alignment; anything else uses t_align2.
    attribute = {0: "t_align0", 1: "t_align1"}.get(frame, "t_align2")
    t_align = getattr(self, attribute)
    sub_align = MulAlign([], Gapped(IUPAC.ExtendedIUPACProtein(), "N"))
    for idx in list_seqs:
        sub_align.append(t_align[idx][begin:end])
    result = []
    for column in range(aa_window_length):
        counts = Counter(sub_align[:, column])
        top_count = counts.most_common(1)[0][1]
        result.append(bool(top_count / len(list_seqs) >= self.min_aa_ratio))
    return result
def clean_seqs(gene):
    '''Keep only the records in which "N" and "-" together make up less
    than 10% of the sequence; return them as a new alignment.'''

    def is_clean(genome):
        residues = genome.seq
        return residues.count("N") + residues.count("-") < 0.1 * (
            len(residues))

    clean_gene = MultipleSeqAlignment([])
    for genome in gene:
        if not is_clean(genome):
            continue
        clean_gene.append(genome)
    return clean_gene
def stage_two_trimming(self, s1_trimmed, window_size=5):
    """
    Alignment row-by-row trimming. After stage one trimming, iterate over
    rows of the alignment and compare each row with the alignment
    consensus. Positions before the first (5' end) and after the last
    (3' end) block of 5 contiguous highly conserved positions are replaced
    by '-'.

    window_size is the width of the running average over the
    row-vs-consensus match vector. The block length of 5 is fixed and
    independent of window_size.

    Returns the trimmed MultipleSeqAlignment, or None as soon as one row
    ends up consisting only of '-' or only of '?'.
    """
    # number of consecutive conserved positions that anchors a good block
    block_len = 5
    s2_trimmed = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna, "-?"))
    # consensus of the alignment in array form
    consensus_array = numpy.array(
        list(self._alignment_consensus(s1_trimmed)))
    weight = numpy.repeat(1.0, window_size) / window_size
    for sequence in s1_trimmed:
        start, end = self._get_ends(sequence)
        orig_seq_array = numpy.array(list(sequence))
        # leave out edge gaps so they do not exert undue influence on the
        # running average
        seq_array = orig_seq_array[start:end]
        compare = (seq_array == consensus_array[start:end])
        running_average = numpy.convolve(compare, weight, 'same')
        # positions whose whole window matches the consensus
        gm = (running_average > 0.99)
        # Defaults for rows without any conserved block: trim nothing
        # beyond the edge gaps. Previously these names were unbound (first
        # row) or silently reused from the previous row.
        bad_start = 0
        bad_end = len(sequence)
        for i in range(gm.size):
            if numpy.all(gm[i:i + block_len]):
                bad_start = i
                break
        reversed_gm = gm[::-1]
        for i in range(reversed_gm.size):
            if numpy.all(reversed_gm[i:i + block_len]):
                bad_end = reversed_gm.size - i
                break
        orig_seq_array[:start + bad_start] = '-'
        orig_seq_array[start + bad_end:] = '-'
        trim = ''.join(orig_seq_array)
        # Replacing trimmed edges with "?" (self._replace_ends) is
        # currently disabled here.
        residues = set(trim)
        # The original compared against the list ['?'], which a set never
        # equals; compare set to set.
        if residues == {'-'} or residues == {'?'}:
            return None
        s2_trimmed.append(self._record_formatter(trim, sequence.id))
    return s2_trimmed
def gap_span(reads, bases):
    '''
    Returns a MSA with rows=reads and columns=bases, composed of gaps only.

    reads supplies the record ids; every row is bases '-' characters long.
    '''
    # The first argument of MultipleSeqAlignment is the record list; the
    # alphabet goes second. Passing only the alphabet was the calling
    # convention of the older Alignment class.
    spal = MultipleSeqAlignment([], alphabet)
    span = '-' * bases
    for r in reads:
        spal.append(Bio.SeqRecord.SeqRecord(Bio.Seq.Seq(span, alphabet), id=r))
    return spal
def remove_seed_duplicates(msa, seed_index):
    """Return a copy of msa without the rows that repeat the seed sequence.

    The row at position seed_index is always kept; any other row whose
    sequence string equals the seed's is left out.
    """
    seed_sequence = str(msa[seed_index].seq)
    return_msa = MultipleSeqAlignment([])
    for position, record in enumerate(msa):
        if position == seed_index or str(record.seq) != seed_sequence:
            return_msa.append(record)
    return return_msa
def bam2Alignment(sam_name, chrom=None, start=None, stop=None, minlen=1):
    """
    Read alignment from samfile and return Alignment object.

    sam_name is an open alignment file object: its fetch(chrom, start,
    stop) method supplies the reads. A read is kept when
    read.rlen - start + read.pos + 1 > minlen and
    stop - read.pos + 1 >= minlen, and is converted with getSeqRecord.

    NOTE(review): start and stop default to None, but the filter does
    arithmetic on them, so a TypeError is raised as soon as a read is
    fetched without both being given. The '>' / '>=' asymmetry between
    the two length tests is kept from the original; confirm it is wanted.
    """
    it = sam_name.fetch(chrom, start, stop)
    # Records first, alphabet second: passing only the alphabet was the
    # calling convention of the older Alignment class.
    aln = MultipleSeqAlignment([], alphabet)
    for read in it:
        if (read.rlen - start + read.pos + 1 > minlen
                and stop - read.pos + 1 >= minlen):
            aln.append(getSeqRecord(read, start=start, stop=stop))
    return aln
def replace_gaps(aln):
    """Return a new generic-DNA alignment in which every sequence has been
    passed through replace_gaps_at_start_and_ends; id, name and description
    of each record are carried over."""
    new_aln = MultipleSeqAlignment([], generic_dna)
    for taxon in aln:
        fixed_seq = replace_gaps_at_start_and_ends(taxon.seq)
        record = SeqRecord(
            fixed_seq,
            id=taxon.id,
            name=taxon.name,
            description=taxon.description,
        )
        new_aln.append(record)
    return new_aln
def json_to_Bio_alignment(seq_json):
    """Build a MultipleSeqAlignment from an iterable of mappings.

    Each entry must provide 'strain' (used as both record id and name)
    and 'seq' (the sequence string).
    """
    from Bio.Align import MultipleSeqAlignment
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    aln = MultipleSeqAlignment([])
    for entry in seq_json:
        strain = entry['strain']
        record = SeqRecord(Seq(entry['seq']), id=strain, name=strain)
        aln.append(record)
    return aln
def refactor_title_allmsa(msa):
    """
    Rewrite record ids into the format needed for histoneDB seeds.

    An id of the form "<a>|<digits>|<c>" becomes "<c>|<a>|<digits>". The
    records are modified in place, collected into a new alignment, and
    each description is printed on the way. An id that does not match
    raises AttributeError.
    """
    id_pattern = re.compile(r"(\S+)\|(\d+)\|(\S+)")
    msa_r = MultipleSeqAlignment([])
    for record in msa:
        print(record.description)
        first, number, last = id_pattern.search(record.id).groups()
        record.id = "|".join((last, first, number))
        msa_r.append(record)
    return msa_r
def aln_undup(alignment):
    """Return a new alignment without duplicate sequences.

    Records are compared by the SEGUID checksum of their sequence. The
    first record with a given checksum is kept; later ones are skipped
    and reported on stdout.
    """
    aln = MultipleSeqAlignment([])
    checksums = set()
    for record in alignment:
        checksum = seguid(record.seq)
        if checksum in checksums:
            # print() with a single argument behaves the same on
            # Python 2 and Python 3.
            print("Ignoring %s" % record.id)
            continue
        checksums.add(checksum)
        aln.append(record)
    return aln
def maskalignment(arg, percent, percentmissing, filetype):
    """Drop alignment columns that contain too many gaps.

    arg is the alignment file, read with AlignIO using filetype. With
    x = percent * rows / 100, a column is removed when its number of '-'
    characters exceeds rows - x. percentmissing is only used in the output
    file names. Three files are written, all named after the text of arg
    before its first '.':

    - <base>_mask_<percentmissing>.txt: indices of the removed columns
    - <base>_masked_<percentmissing>.fas: the remaining columns as FASTA
    - <base>_missingcharacter.txt: per-column gap and character counts

    Progress is printed to stdout.
    """
    base = arg.split('.')[0]
    mask_path = base + '_mask_' + str(percentmissing) + '.txt'
    fasta_path = base + '_masked_' + str(percentmissing) + '.fas'
    report_path = base + '_missingcharacter.txt'
    # The original never closed these handles; the context manager flushes
    # and closes all three even if reading or parsing fails.
    with open(mask_path, 'w+') as maskedcolumn, \
            open(fasta_path, 'w+') as outFile, \
            open(report_path, 'w+') as checkgap:
        alignment = AlignIO.read(arg, filetype)
        trimAlign = MultipleSeqAlignment([])
        numRows = len(alignment)
        x = float(percent) * float(numRows) / 100.0
        numGap = numRows - float(x)
        numCol = alignment.get_alignment_length()
        print("Total number of rows: %i" % numRows)
        print("Number of gapped sequences allowed at a given site: %i" %
              numGap)
        print("Total number of columns: %i" % numCol)
        checkgap.write(
            "Total number of rows: \t" + str(numRows) +
            '\nNumber of gapped sequences allowed at a given site: \t' +
            str(numGap) + '\n Total number of columns: \t' + str(numCol) +
            '\n\n cutoff : \t' + str(x) + '\n\n\n')
        checkgap.write("Position \t Missing Characters \t Characters \n")
        colToKeep = []
        for i in range(numCol):
            gaps = alignment[:, i].count('-')
            chapre = int(numRows) - int(gaps)
            checkgap.write(
                str(i) + '\t' + str(gaps) + '\t' + str(chapre) + '\n')
            if gaps > numGap:
                print("get rid of column %i" % i)
                maskedcolumn.write(str(i) + '\n')
            else:
                colToKeep.append(i)
        for record in alignment:
            newseq = "".join(record[i] for i in colToKeep)
            trimAlign.append(SeqRecord(Seq(newseq), id=record.id))
            outFile.write('>' + record.id + '\n' + newseq + '\n')
    print("Total number of columns remaining: %i" %
          trimAlign.get_alignment_length())
def mult_align(sum_dict, align_dict):
    """Return multiple alignment instance (MultipleSeqAlignment).

    One row is started for every key of the first position's
    pos_align_dict. Positions 1..len(align_dict) each add one .aa value
    per key. Rows are emitted in sorted key order, with
    sum_dict[key].pdb2 + sum_dict[key].chain2 as the record id.
    """
    rows = dict.fromkeys(align_dict.abs(1).pos_align_dict, "")
    for position in range(1, len(align_dict) + 1):
        column = align_dict.abs(position).pos_align_dict
        for key in column:
            rows[key] += column[key].aa
    fssp_align = MultipleSeqAlignment([])
    for key in sorted(rows):
        row_seq = Seq(rows[key])
        summary = sum_dict[key]
        fssp_align.append(SeqRecord(row_seq, summary.pdb2 + summary.chain2))
    return fssp_align
def refactor_title(msa, variant):
    """
    Rewrite record titles into the format needed for histoneDB seeds.

    The gi number is taken from "gi|<digits>|" in the record id, and the
    genus from the first word inside "[...]" in the description. When the
    description has no such bracketed name, get_genus_by_gi(gi) is used.
    The id becomes "<genus>|<gi>|<variant>" and the description
    "<genus>_<variant>_<gi>". Records are modified in place and collected
    into a new alignment. An id without a gi number raises AttributeError.
    """
    msa_r = MultipleSeqAlignment([])
    for i in msa:
        gi = re.search(r"gi\|(\d+)\|", i.id).group(1)
        try:
            genus = re.search(r"\[(\S+)\s+.+\S+\]", i.description).group(1)
        except (AttributeError, TypeError):
            # AttributeError: no match (search returned None).
            # TypeError: description is not a string. A bare "except:"
            # would also swallow KeyboardInterrupt and SystemExit.
            genus = get_genus_by_gi(gi)
        i.id = genus + "|" + gi + "|" + variant
        i.description = genus + "_" + variant + "_" + gi
        msa_r.append(i)
    return msa_r
def translate(self, align, offset):
    """
    Translate every row of align in the reading frame given by offset.

    Each sequence is upper-cased and its "-" characters are replaced by
    "N". The slice from offset up to the last complete codon is then
    translated. Name and id of each record are kept; the description is
    emptied.
    """
    whole_codons = (align.get_alignment_length() - offset) // 3
    end = whole_codons * 3 + offset
    t_align = MulAlign([], Gapped(IUPAC.ExtendedIUPACProtein(), "N"))
    for rec in align:
        nucleotides = str(rec.seq).upper().replace("-", "N").replace("n", "N")
        in_frame = Seq(nucleotides, IUPAC.IUPACAmbiguousDNA())[offset:end]
        protein = in_frame.translate()
        t_align.append(
            SeqRecord(protein, name=rec.name, id=rec.id, description=""))
    return t_align
def mult_align(sum_dict, align_dict):
    """Returns a biopython multiple alignment instance (MultipleSeqAlignment)

    Rows are keyed by the entries of the first position's pos_align_dict
    and grow by one .aa value per position. Records use a gapped extended
    protein alphabet, are added in sorted key order, and are named
    sum_dict[key].pdb2 + sum_dict[key].chain2.
    """
    per_key = {}
    for key in align_dict.abs(1).pos_align_dict:
        per_key[key] = ''
    n_positions = len(align_dict)
    for position in range(1, n_positions + 1):
        entries = align_dict.abs(position).pos_align_dict
        for key in entries:
            per_key[key] = per_key[key] + entries[key].aa
    alpha = Alphabet.Gapped(Alphabet.IUPAC.extended_protein)
    fssp_align = MultipleSeqAlignment([], alphabet=alpha)
    for key in sorted(per_key):
        row_seq = Seq(per_key[key], alpha)
        label = sum_dict[key].pdb2 + sum_dict[key].chain2
        fssp_align.append(SeqRecord(row_seq, label))
    return fssp_align
def stage_two_trimming(s1_trimmed, window_size, max_divergence, min_len):
    """
    ---------------------------------------------------------------------
    MODIFIED FUNCTION FROM PHYLUCE: generic_align.py
    ---------------------------------------------------------------------
    Alignment row-by-row trimming. After stage one trimming, iterate over
    rows of the alignment and compare each (upper-cased) row with the
    alignment consensus. From each end, positions are replaced by '-'
    until a window of `window_size` positions differs from the consensus
    in a fraction smaller than `max_divergence`.

    Returns the trimmed MultipleSeqAlignment, or None as soon as one row
    consists only of '-' or only of '?', or is shorter than `min_len`.
    """
    s2_trimmed = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna, "-?"))
    consensus_array = numpy.array(list(alignment_consensus(s1_trimmed)))
    for sequence in s1_trimmed:
        sequence = sequence.upper()
        start, end = get_ends(sequence)
        orig_seq_array = numpy.array(list(sequence))
        seq_array = orig_seq_array[start:end]
        # defaults used when the compared region is empty
        bad_start = 0
        bad_end = len(sequence)
        compare = (seq_array != consensus_array[start:end])
        # 5' end: first window that is similar enough to the consensus
        for bad_start in range(compare.size):
            window = compare[bad_start:bad_start + window_size]
            divergence = float(window.sum()) / window.size
            if divergence < max_divergence:
                break
        # 3' end: same scan on the reversed comparison.
        # NOTE(review): if no window qualifies, bad_end keeps the last
        # reversed index without being converted back; kept as in the
        # original.
        reversed_compare = compare[::-1]
        for bad_end in range(reversed_compare.size):
            window = reversed_compare[bad_end:bad_end + window_size]
            divergence = float(window.sum()) / window.size
            if divergence < max_divergence:
                bad_end = reversed_compare.size - bad_end
                break
        orig_seq_array[:start + bad_start] = '-'
        orig_seq_array[start + bad_end:] = '-'
        trim = ''.join(orig_seq_array)
        residues = set(trim)
        # The original compared against the list ['?'], which a set never
        # equals; compare set to set.
        if residues == {'-'} or residues == {'?'} or len(trim) < min_len:
            return None
        s2_trimmed.append(record_formatter(trim, sequence.id))
    return s2_trimmed
def concatenate_msa(out_dir):
    """Concatenate all msa-*.fasta files in out_dir into one supermatrix.

    The taxa, and their row order, come from msa-0.fasta. The sequences of
    every msa-*.fasta file are appended per taxon, files being processed
    in sorted path order. The result is written in PHYLIP format to
    supermatrix-msa.phy in out_dir. A taxon missing from any file raises
    KeyError.
    """
    with open(os.path.join(out_dir, 'supermatrix-msa.phy'), 'w') as fh:
        # Keep the taxon order of the first MSA file explicitly: a plain
        # dict does not guarantee it on every Python version.
        taxon_order = []
        taxa = {}
        first_msa = os.path.join(out_dir, 'msa-0.fasta')
        for record in SeqIO.parse(first_msa, 'fasta'):
            if record.id not in taxa:
                taxon_order.append(record.id)
            taxa[record.id] = Seq('', generic_dna)
        # glob returns paths in arbitrary order; sort so the column layout
        # of the supermatrix is reproducible.
        msa_files = sorted(glob.glob(os.path.join(out_dir, 'msa-*.fasta')))
        for msa_file in msa_files:
            alignment = SeqIO.to_dict(SeqIO.parse(msa_file, 'fasta'))
            for taxon in taxon_order:
                taxa[taxon] += alignment[taxon].seq
        msa = MultipleSeqAlignment([], alphabet=generic_dna)
        for taxon in taxon_order:
            msa.append(SeqRecord(taxa[taxon], id=taxon))
        AlignIO.write(msa, fh, 'phylip')
def stage_one_trimming(self, alignment, window_size, proportion, threshold, min_len, replace_ends=False):
    """
    First stage (of 3) alignment trimming: cut every row to the
    (start, end) window returned by self.running_average for the whole
    alignment block.

    Each row's alphabet is set to IUPAC ambiguous DNA as a side effect, to
    avoid an alphabet conflict when a consensus is generated later. With
    replace_ends=True the trimmed row is passed through self._replace_ends
    and rebuilt with self._record_formatter before being added.

    Returns the trimmed MultipleSeqAlignment, or None as soon as one row
    cannot be kept: the window is unusable (start < 0 or a falsy end), the
    trimmed row consists only of '-' or only of '?', or it is shorter than
    min_len. An empty input yields an empty alignment.
    """
    # trim positions that delimit the "good" part of the alignment
    start, end = self.running_average(alignment, window_size, proportion,
                                      threshold)
    s1_trimmed = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna, "-?"))
    for sequence in alignment:
        sequence.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
        if not (start >= 0 and end):
            return None
        trim = sequence[start:end]
        residues = set(trim)
        # Don't keep a taxon with only gaps/missing data, and require
        # min_len. The original compared against the list ['?'], which a
        # set never equals; compare set to set.
        if residues == {'-'} or residues == {'?'} or len(trim) < min_len:
            return None
        if replace_ends:
            # replace end gaps with the missing data character "?"
            repl = self._replace_ends(str(trim.seq))
            s1_trimmed.append(self._record_formatter(repl, sequence.id))
        else:
            s1_trimmed.append(trim)
    return s1_trimmed