# Look up one taxonomy record per entry of taxID. `records` must stay
# index-aligned with the alignment sequences read below, so exactly one item
# is appended per ID: the Entrez record on success, '' on any failure.
records = list()
for k in taxID:
    try:
        handle = Entrez.efetch(db="taxonomy", id=k, retmode="xml")
        try:
            temp_rec = Entrez.read(handle)
        finally:
            # Release the handle even when parsing fails.
            handle.close()
        rec = temp_rec[0]
        # Pull both fields up front so a record lacking either one is
        # treated as a failed lookup *before* anything is appended.
        lineage = rec['Lineage']
        sci_name = rec['ScientificName']
    except Exception:
        # Best-effort lookup: a failed ID becomes a placeholder.
        # (Exception, not a bare except, so Ctrl-C still interrupts.)
        records.append('')
    else:
        records.append(rec)
        print("%s" % (lineage))
        print("%s" % (sci_name))
# NOTE(review): time.clock() was removed in Python 3.8. The matching `start`
# timestamp is taken outside this block, so both calls must be changed together.
end = time.clock()
print("Look up for taxonomy information complete. Time: %f" % (end - start))

# Write to the output fasta file.
s_records = list()
[hd, seqs] = sca.readAlg(options.Input_MSA)
# The context manager closes the file even if a write raises.
with open(options.output, 'w') as f:
    for i, k in enumerate(seqs):
        try:
            hdnew = hd[i] + '|' + records[i]['ScientificName'] + '|' + ','.join(
                records[i]['Lineage'].split(';'))
        except Exception:
            # records[i] is '' (failed lookup) or missing entirely.
            hdnew = hd[i] + '| unknown '
            print("Unable to add taxonomy information for seq: %s" % hd[i])
        f.write('>%s\n' % hdnew)
        f.write('%s\n' % k)
if options.refseq is not None and options.refpos is None: print_("Using reference sequence but no position list provided! Just numbering positions 1 to length(sequence)") if options.pdbid is not None: print_("And...ignoring the PDB file...") options.pdbid = None options.refpos = range(len(options.refseq)) + 1 if options.refseq is not None and options.refpos is not None: print_("Using the reference sequence and position list...") if options.pdbid is not None: print_("And...ignoring the PDB file...") options.pdbid = None else: i_ref = options.i_ref # Read in initial alignment headers_full, sequences_full = sca.readAlg(options.alignment) print_('Loaded alignment of {:d} sequences, {:d} positions.'.format(len(headers_full), len(sequences_full[0]))) # Check the alignment and remove sequences containing non-standard amino acids print_("Checking alignment for non-standard amino acids") alg_out = list() hd_out = list() for i, k in enumerate(sequences_full): flag = 0 for aa in k: if aa not in 'ACDEFGHIKLMNPQRSTVWY-': flag = 1 if (flag == 0): alg_out.append(k) hd_out.append(headers_full[i]) headers_full = hd_out
) if options.pdbid is not None: print("And...ignoring the PDB file...") options.pdbid = None options.refpos = range(len(options.refseq)) + 1 if options.refseq is not None and options.refpos is not None: print("Using the reference sequence and position list...") if options.pdbid is not None: print("And...ignoring the PDB file...") options.pdbid = None else: i_ref = options.i_ref # Read in initial alignment headers_full, sequences_full = sca.readAlg(options.alignment) print('Loaded alignment of %i sequences, %i positions.' % (len(headers_full), len(sequences_full[0]))) # Check the alignment and remove sequences containing non-standard amino acids print("Checking alignment for non-standard amino acids") alg_out = list() hd_out = list() for i, k in enumerate(sequences_full): flag = 0 for aa in k: if aa not in 'ACDEFGHIKLMNPQRSTVWY-': flag = 1 if (flag == 0): alg_out.append(k) hd_out.append(headers_full[i])
"-t", "--tolerance", dest="tol", type=int, default=50, help= "allowable sequence length variation in number of amino acids (alignment will be trimmed to mean +/- tolerance, default = 50)" ) parser.add_argument("--output", dest="outputfile", default='FilteredAln.fa', help="specify an outputfile name") options = parser.parse_args() headers, seqs = sca.readAlg(options.alignment) seqLen = np.zeros((len(seqs), 1)).astype(int) for i, k in enumerate(seqs): seqLen[i] = len(k) avgLen = seqLen.mean() print("Average sequence length: %i" % avgLen) print("Min: %i, Max %i" % (seqLen.min(), seqLen.max())) minsz = avgLen - options.tol maxsz = avgLen + options.tol print("Keeping sequences in the range: %i - %i" % (minsz, maxsz)) keepSeqs = list() keepHeaders = list() for i, k in enumerate(seqLen): if (k > minsz) & (k < maxsz): keepSeqs.append(seqs[i])
""" import scaTools as sca import numpy as np import argparse if __name__ =='__main__': #parse inputs parser = argparse.ArgumentParser() parser.add_argument("alignment", help='Input Sequence Alignment') parser.add_argument("-t","--tolerance", dest = "tol", type = int, default = 50, help="allowable sequence length variation in number of amino acids (alignment will be trimmed to mean +/- tolerance, default = 50)") parser.add_argument("--output", dest="outputfile", default = 'FilteredAln.fa', help="specify an outputfile name") options = parser.parse_args() headers, seqs = sca.readAlg(options.alignment) seqLen = np.zeros((len(seqs),1)).astype(int) for i,k in enumerate(seqs): seqLen[i] = len(k) avgLen = seqLen.mean() print ("Average sequence length: %i" % avgLen) print ("Min: %i, Max %i" % (seqLen.min(), seqLen.max())) minsz = avgLen - options.tol; maxsz = avgLen + options.tol; print ("Keeping sequences in the range: %i - %i" % (minsz, maxsz)) keepSeqs = list() keepHeaders = list() for i,k in enumerate(seqLen): if (k > minsz) & (k < maxsz): keepSeqs.append(seqs[i])
# Collect records with lineage information
print_("Collecting taxonomy information...")
# time.clock() was removed in Python 3.8; use perf_counter where it exists and
# fall back to clock only on interpreters that lack it.
timer = getattr(time, "perf_counter", None) or time.clock
start = timer()
# `records` must stay index-aligned with the alignment sequences read below,
# so exactly one item is appended per ID: the Entrez record on success, ''
# on any failure.
records = list()
for k in taxID:
    try:
        handle = Entrez.efetch(db="taxonomy", id=k, retmode="xml")
        try:
            temp_rec = Entrez.read(handle)
        finally:
            # Release the handle even when parsing fails.
            handle.close()
        rec = temp_rec[0]
        # Pull both fields up front so a record lacking either one is
        # treated as a failed lookup *before* anything is appended.
        lineage = rec['Lineage']
        sci_name = rec['ScientificName']
    except Exception:
        # Best-effort lookup: a failed ID becomes a placeholder.
        # (Exception, not a bare except, so Ctrl-C still interrupts.)
        records.append('')
    else:
        records.append(rec)
        print_("{}".format(lineage))
        print_("{}".format(sci_name))
end = timer()
print_("Look up for taxonomy information complete. Time: {}".format(end - start))

# Write to the output fasta file.
s_records = list()
[hd, seqs] = sca.readAlg(options.Input_MSA)
with open(options.output, 'w') as f:
    for i, k in enumerate(seqs):
        try:
            hdnew = hd[i] + '|' + records[i]['ScientificName'] + '|' + ','.join(
                records[i]['Lineage'].split(';'))
        except Exception:
            # records[i] is '' (failed lookup) or missing entirely.
            hdnew = hd[i] + '| unknown '
            print_("Unable to add taxonomy information for seq: {}".format(hd[i]))
        # file= belongs to print_, not to str.format; passing it to format
        # silently sent these lines to stdout and left the output file empty.
        print_('>{}'.format(hdnew), file=f)
        print_('{}'.format(k), file=f)
import argparse
import sys

import scaTools as sca

if __name__ == '__main__':
    # Combine the headers of one alignment with the sequences of another,
    # pairing entries purely by position, and write the result as FASTA.

    # parse inputs
    parser = argparse.ArgumentParser()
    parser.add_argument("alignment_for_headers",
                        help='Alignment that is providing the headers')
    parser.add_argument("alignment_for_seqs",
                        help='ALignment that is providing the sequences')
    parser.add_argument("--output", dest="outputfile", default='FixedHeaders.fa',
                        help="specify an outputfile name")
    options = parser.parse_args()

    print("WARNING: This script assumes that the headers of the two input fasta files are in IDENTICAL order. If this is NOT true, the script will give incorrect results")

    headers1, seqs1 = sca.readAlg(options.alignment_for_headers)
    headers2, seqs2 = sca.readAlg(options.alignment_for_seqs)

    if len(seqs2) != len(headers1):
        print("ERROR: The length of the two alignments does not match.")
        # The original bare `exit` was a no-op expression, so the script
        # carried on with mismatched inputs. Actually stop, with a non-zero
        # status so callers can detect the failure.
        sys.exit(1)

    # Lengths are equal at this point, so zip pairs every header with a sequence.
    # The context manager closes the file even if a write raises.
    with open(options.outputfile, 'w') as f:
        for header, seq in zip(headers1, seqs2):
            f.write('>%s\n' % header)
            f.write('%s\n' % seq)
from scipy.io import savemat

if __name__ == '__main__':
    # parse inputs
    # (The parser was previously constructed twice; once is enough.)
    parser = argparse.ArgumentParser()
    parser.add_argument("alignment", help='Input Sequence Alignment')
    parser.add_argument("-o", "--refpos", dest="refpos",
                        help="reference positions, supplied as a text file with one position specified per line")
    parser.add_argument("-i", "--refindex", dest="i_ref", type=int,
                        help="reference sequence number in the alignment, COUNTING FROM 0")
    parser.add_argument("--output", dest="outputfile", default=None,
                        help="specify an outputfile name")
    options = parser.parse_args()

    # Read in initial alignment
    headers_full, sequences_full = sca.readAlg(options.alignment)
    print('Loaded alignment of %i sequences, %i positions.' % (len(headers_full), len(sequences_full[0])))

    # Create the ATS
    i_ref = options.i_ref
    # Use print() calls throughout: the original mixed Python 2 print
    # statements with print() calls, which is a SyntaxError on Python 3.
    # Single-argument print(...) behaves the same on Python 2.
    print("Reference sequence %i:" % (i_ref))
    print(headers_full[i_ref])
    s_tmp = sequences_full[i_ref]
    try:
        # One reference position per line; only the trailing newline is stripped.
        with open(options.refpos, 'r') as f:
            ats_tmp = [line.rstrip('\n') for line in f]
        print(ats_tmp)
    except Exception:
        # Covers a missing --refpos (open(None)) as well as unreadable files.
        # Exception rather than a bare except so Ctrl-C is not reported as a
        # read error.
        sys.exit("Error!! Unable to read reference positions!")