def check_consensus_length(data_folder, adaID, fragment, VERBOSE=0,
                           max_len_diff=200):
    '''Check consensus length, and if too short, too long or absent complain.

    The function reads the FASTA alignment stored in the file returned by
    get_reference_consensus_ali_filename(data_folder, adaID, fragment) and
    compares the gap-free length of its first row (treated as the reference)
    with the gap-free length of its second row (treated as the consensus).

    Parameters:
       data_folder: forwarded to get_reference_consensus_ali_filename
       adaID: forwarded to get_reference_consensus_ali_filename and echoed
              in the diagnostic messages
       fragment: forwarded to get_reference_consensus_ali_filename and
                 echoed in the diagnostic messages
       VERBOSE: diagnostic messages are printed only if VERBOSE >= 2
       max_len_diff: largest tolerated absolute difference between the
                     gap-free consensus and reference lengths (default 200,
                     the value that used to be hard-coded)

    Returns:
       True if the consensus length is within max_len_diff of the reference
       length; False if the alignment file does not exist, holds fewer than
       two sequences, or the lengths differ by more than max_len_diff.

    Raises:
       Whatever Bio.AlignIO.read raises for a file it cannot parse as a
       single FASTA alignment; such errors are not caught here.
    '''
    from Bio import AlignIO

    def complain(message):
        '''Print a diagnostic message if the verbosity level allows it.'''
        # One pre-formatted argument in parentheses gives the same output
        # with the Python 2 print statement and the print function.
        if VERBOSE >= 2:
            print(message)

    def ungapped_length(record):
        '''Length of the record sequence without alignment gaps.'''
        # Work on the plain string instead of Seq.ungap, so the result does
        # not depend on which Biopython version is installed.
        return len(str(record.seq).replace('-', ''))

    ali_fn = get_reference_consensus_ali_filename(data_folder, adaID, fragment)
    if not os.path.isfile(ali_fn):
        complain('Consensus alignment to reference not found %s %s'
                 % (adaID, fragment))
        return False

    ali = AlignIO.read(ali_fn, 'fasta')

    # Both a reference row and a consensus row are needed; without this guard
    # ali[1] below would fail with a bare IndexError.
    if len(ali) < 2:
        complain('Consensus alignment to reference has fewer than two '
                 'sequences %s %s' % (adaID, fragment))
        return False

    len_ref = ungapped_length(ali[0])
    len_cons = ungapped_length(ali[1])
    lengths = (len_ref, len_cons)

    if len_cons < len_ref - max_len_diff:
        complain('Consensus alignment to reference too short: '
                 'ref %s cons: %s' % lengths)
        return False

    if len_cons > len_ref + max_len_diff:
        complain('Consensus alignment to reference too long: '
                 'ref %s cons: %s' % lengths)
        return False

    complain('Consensus checked, has approximately the right length: '
             'ref %s cons: %s' % lengths)
    return True
if VERBOSE >= 2: print ali[:, :30] print ali[:, -30:] print 'Lenghts: ref', len(refseq), 'consensus', len( consensusseq) len_ali = ali.get_alignment_length() n_diff = sum(ali[0, i] != ali[1, i] for i in xrange(len_ali)) print 'Differences from ref:', n_diff, '(' + '{:3.1f}'.format( 100.0 * n_diff / len_ali) + '%)' # Ungap consensus consensusseq = SeqRecord(ali[1].seq, id=name, name=name) if '-' in consensusseq: consensusseq.seq = consensusseq.seq.ungap('-') # Write output outfile = get_consensus_filename(data_folder, adaID, frag_out, trim_primers=True) SeqIO.write(consensusseq, outfile, 'fasta') AlignIO.write( ali, get_reference_consensus_ali_filename(data_folder, adaID, fragment), 'fasta') if store_allele_counts: allele_counts.dump( get_allele_counts_filename(data_folder, adaID, frag_out))
ali = align_muscle(refseq, consensusseq, sort=True) if ali[0][-1] == '-': start_nongap = len(ali[0]) - len(ali[0].seq.lstrip('-')) end_nongap = len(ali[0].seq.rstrip('-')) ali = ali[:, start_nongap: end_nongap] if VERBOSE >= 2: print ali[:, :30] print ali[:, -30:] print 'Lenghts: ref', len(refseq), 'consensus', len(consensusseq) len_ali = ali.get_alignment_length() n_diff = sum(ali[0, i] != ali[1, i] for i in xrange(len_ali)) print 'Differences from ref:', n_diff, '('+'{:3.1f}'.format(100.0 * n_diff / len_ali)+'%)' # Ungap consensus consensusseq = SeqRecord(ali[1].seq, id=name, name=name) if '-' in consensusseq: consensusseq.seq = consensusseq.seq.ungap('-') # Write output outfile = get_consensus_filename(data_folder, adaID, frag_out, trim_primers=True) SeqIO.write(consensusseq, outfile, 'fasta') AlignIO.write(ali, get_reference_consensus_ali_filename(data_folder, adaID, fragment), 'fasta') if store_allele_counts: allele_counts.dump(get_allele_counts_filename(data_folder, adaID, frag_out))