records = list()
        # Fetch one taxonomy record per ID. Exactly one entry is appended to
        # `records` per ID ('' on failure) so that positions in `records`
        # never drift relative to positions in `taxID`.
        for k in taxID:
            try:
                handle = Entrez.efetch(db="taxonomy", id=k, retmode="xml")
                try:
                    temp_rec = Entrez.read(handle)
                finally:
                    # Release the handle even when parsing fails.
                    handle.close()
                rec = temp_rec[0]
                print("%s" % (rec['Lineage']))
                print("%s" % (rec['ScientificName']))
            except Exception:
                # Best-effort lookup: any failure yields an empty placeholder.
                # Catching Exception (rather than a bare except) still lets
                # KeyboardInterrupt / SystemExit stop this long-running loop.
                rec = ''
            records.append(rec)
        # NOTE(review): time.clock() was removed in Python 3.8. `start` is
        # assigned outside this section, so both calls must be migrated
        # together (e.g. to time.time()) -- TODO confirm and update.
        end = time.clock()
        print("Look up for taxonomy information complete. Time: %f" %
              (end - start))

        # Write to the output fasta file.
        # Each header gains '|<ScientificName>|<comma-separated lineage>', or
        # '| unknown ' when the taxonomy fields cannot be read for that index.
        s_records = list()
        [hd, seqs] = sca.readAlg(options.Input_MSA)
        # The context manager closes the file even if a write fails part-way.
        with open(options.output, 'w') as f:
            for i, k in enumerate(seqs):
                try:
                    hdnew = hd[i] + '|' + records[i][
                        'ScientificName'] + '|' + ','.join(
                            records[i]['Lineage'].split(';'))
                except Exception:
                    # records[i] may be the '' placeholder or lack a key;
                    # fall back to a generic header instead of aborting.
                    hdnew = hd[i] + '| unknown '
                    print("Unable to add taxonomy information for seq: %s" % hd[i])
                f.write('>%s\n' % hdnew)
                f.write('%s\n' % k)
Example #2
0
        # Case 1: a reference sequence without a position list -> number the
        # positions 1..N and drop any PDB id.
        if options.refseq is not None and options.refpos is None:
            print_("Using reference sequence but no position list provided! Just numbering positions 1 to length(sequence)")
            if options.pdbid is not None:
                print_("And...ignoring the PDB file...")
                options.pdbid = None
            # The previous `range(len(...)) + 1` raised TypeError (a range or
            # list cannot be added to an int); build the 1-based numbering
            # explicitly instead.
            # NOTE(review): N is len() of whatever options.refseq holds --
            # confirm it is the sequence itself and not a file name.
            options.refpos = list(range(1, len(options.refseq) + 1))
        # Case 2: both reference sequence and position list are available.
        # This also runs right after case 1, since refpos has just been set.
        if options.refseq is not None and options.refpos is not None:
            print_("Using the reference sequence and position list...")
            if options.pdbid is not None:
                print_("And...ignoring the PDB file...")
                options.pdbid = None
    else:
        i_ref = options.i_ref

    # Load the starting alignment and report its dimensions.
    headers_full, sequences_full = sca.readAlg(options.alignment)
    print_('Loaded alignment of {:d} sequences, {:d} positions.'.format(len(headers_full), len(sequences_full[0])))

    # Keep only sequences made up entirely of the 20 standard amino acid
    # letters and the gap character; anything else disqualifies the sequence.
    print_("Checking alignment for non-standard amino acids")
    alg_out = list()
    hd_out = list()
    for i, k in enumerate(sequences_full):
        if all(residue in 'ACDEFGHIKLMNPQRSTVWY-' for residue in k):
            alg_out.append(k)
            hd_out.append(headers_full[i])
    headers_full = hd_out
Example #3
0
            )
            if options.pdbid is not None:
                print("And...ignoring the PDB file...")
                options.pdbid = None
            # The previous `range(len(...)) + 1` raised TypeError (a range or
            # list cannot be added to an int); build 1..N explicitly.
            # NOTE(review): N is len() of whatever options.refseq holds --
            # confirm it is the sequence itself and not a file name.
            options.refpos = list(range(1, len(options.refseq) + 1))

        # When both a reference sequence and a position list are set, any
        # PDB id is discarded (with a notice) in favour of them.
        if not (options.refseq is None or options.refpos is None):
            print("Using the reference sequence and position list...")
            if options.pdbid is not None:
                print("And...ignoring the PDB file...")
                options.pdbid = None
    else:
        i_ref = options.i_ref

    # Load the starting alignment and report its dimensions.
    headers_full, sequences_full = sca.readAlg(options.alignment)
    n_sequences = len(headers_full)
    n_positions = len(sequences_full[0])
    print('Loaded alignment of %i sequences, %i positions.' %
          (n_sequences, n_positions))

    # Keep only sequences made up entirely of the 20 standard amino acid
    # letters and the gap character; anything else disqualifies the sequence.
    print("Checking alignment for non-standard amino acids")
    alg_out = list()
    hd_out = list()
    for i, k in enumerate(sequences_full):
        if all(residue in 'ACDEFGHIKLMNPQRSTVWY-' for residue in k):
            alg_out.append(k)
            hd_out.append(headers_full[i])
Example #4
0
        "-t",
        "--tolerance",
        dest="tol",
        type=int,
        default=50,
        help=
        "allowable sequence length variation in number of amino acids (alignment will be trimmed to mean +/- tolerance, default = 50)"
    )
    parser.add_argument("--output",
                        dest="outputfile",
                        default='FilteredAln.fa',
                        help="specify an outputfile name")

    options = parser.parse_args()

    # Load the alignment and summarise its sequence lengths.
    headers, seqs = sca.readAlg(options.alignment)
    # Integer column vector: one row per sequence holding its length.
    seqLen = np.array([len(s) for s in seqs], dtype=int).reshape(-1, 1)
    avgLen = seqLen.mean()
    print("Average sequence length: %i" % avgLen)
    print("Min: %i, Max %i" % (seqLen.min(), seqLen.max()))
    # Acceptance window: mean length minus / plus the --tolerance value.
    minsz, maxsz = avgLen - options.tol, avgLen + options.tol
    print("Keeping sequences in the range: %i - %i" % (minsz, maxsz))

    keepSeqs = list()
    keepHeaders = list()
    for i, k in enumerate(seqLen):
        if (k > minsz) & (k < maxsz):
            keepSeqs.append(seqs[i])
Example #5
0
"""

import scaTools as sca
import numpy as np
import argparse

if __name__ =='__main__':
        # Command-line interface: one positional alignment plus two options.
        parser = argparse.ArgumentParser()
        parser.add_argument("alignment", help='Input Sequence Alignment')
        parser.add_argument(
            "-t", "--tolerance",
            dest="tol",
            type=int,
            default=50,
            help="allowable sequence length variation in number of amino acids (alignment will be trimmed to mean +/- tolerance, default = 50)"
        )
        parser.add_argument(
            "--output",
            dest="outputfile",
            default='FilteredAln.fa',
            help="specify an outputfile name"
        )

        options = parser.parse_args()

        # Load the alignment and summarise its sequence lengths.
        headers, seqs = sca.readAlg(options.alignment)
        # Integer column vector: one row per sequence holding its length.
        seqLen = np.array([len(s) for s in seqs], dtype=int).reshape(-1, 1)
        avgLen = seqLen.mean()
        print("Average sequence length: %i" % avgLen)
        print("Min: %i, Max %i" % (seqLen.min(), seqLen.max()))
        # Acceptance window: mean length minus / plus the --tolerance value.
        minsz, maxsz = avgLen - options.tol, avgLen + options.tol
        print("Keeping sequences in the range: %i - %i" % (minsz, maxsz))

        keepSeqs = list()
        keepHeaders = list()
        for i,k in enumerate(seqLen):
            if (k > minsz) & (k < maxsz):
                keepSeqs.append(seqs[i])
Example #6
0
        # Collect records with lineage information
        print_("Collecting taxonomy information...")
        # time.clock() was removed in Python 3.8; time.time() is available on
        # every Python version and reports elapsed wall-clock seconds.
        start = time.time()
        records = list()
        # Exactly one entry is appended to `records` per taxonomy ID ('' on
        # failure) so that its positions never drift relative to `taxID`.
        for k in taxID:
            try:
                handle = Entrez.efetch(db="taxonomy", id=k, retmode="xml")
                try:
                    temp_rec = Entrez.read(handle)
                finally:
                    # Release the handle even when parsing fails.
                    handle.close()
                rec = temp_rec[0]
                print_("{}".format(rec['Lineage']))
                print_("{}".format(rec['ScientificName']))
            except Exception:
                # Best-effort lookup: any failure yields an empty placeholder.
                # Catching Exception (rather than a bare except) still lets
                # KeyboardInterrupt / SystemExit stop this long-running loop.
                rec = ''
            records.append(rec)
        end = time.time()
        print_("Look up for taxonomy information complete. Time: {}".format(end - start))

        # Write to the output fasta file.
        # Each header gains '|<ScientificName>|<comma-separated lineage>', or
        # '| unknown ' when the taxonomy fields cannot be read for that index.
        s_records = list()
        [hd, seqs] = sca.readAlg(options.Input_MSA)
        with open(options.output, 'w') as f:
            for i, k in enumerate(seqs):
                try:
                    hdnew = hd[i] + '|' + records[i]['ScientificName'] + '|' + ','.join(records[i]['Lineage'].split(';'))
                except Exception:
                    hdnew = hd[i] + '| unknown '
                    # Was `.formathd[i]`, which raised AttributeError here.
                    print_("Unable to add taxonomy information for seq: {}".format(hd[i]))
                # Previously `file=f` was handed to str.format(), so the
                # records went to stdout and the output file stayed empty.
                f.write('>{}\n'.format(hdnew))
                f.write('{}\n'.format(k))
Example #7
0
import scaTools as sca
import argparse 


if __name__ =='__main__':
        # Combine the headers of one alignment with the sequences of another
        # and write the result as a FASTA file.

        # parse inputs
        parser = argparse.ArgumentParser()
        parser.add_argument("alignment_for_headers", help='Alignment that is providing the headers')
        parser.add_argument("alignment_for_seqs", help ='ALignment that is providing the sequences')
        parser.add_argument("--output", dest="outputfile", default = 'FixedHeaders.fa', help="specify an outputfile name")
        options = parser.parse_args()

        print("WARNING:  This script assumes that the headers of the two input fasta files are in IDENTICAL order. If this is NOT true, the script will give incorrect results")

        headers1, seqs1 = sca.readAlg(options.alignment_for_headers)
        headers2, seqs2 = sca.readAlg(options.alignment_for_seqs)

        if len(seqs2) != len(headers1):
            print("ERROR: The length of the two alignments does not match.")
            # The former bare `exit` was an expression that did nothing, so
            # execution continued into the write loop; stop with a non-zero
            # exit status instead.
            raise SystemExit(1)

        # Entry i of the output pairs header i of the first alignment with
        # sequence i of the second; the file is closed even if a write fails.
        with open(options.outputfile, 'w') as f:
            for i, k in enumerate(headers1):
                f.write('>%s\n' % k)
                f.write('%s\n' % seqs2[i])
    
        
Example #8
0
from scipy.io import savemat

if __name__ =='__main__':
        # parse inputs (the parser is now built once; it used to be
        # constructed twice, discarding the first instance)
        parser = argparse.ArgumentParser()
        parser.add_argument("alignment", help='Input Sequence Alignment')
        parser.add_argument("-o","--refpos", dest = "refpos", help="reference positions, supplied as a text file with one position specified per line")
        parser.add_argument("-i","--refindex", dest = "i_ref", type = int, help="reference sequence number in the alignment, COUNTING FROM 0")
        parser.add_argument("--output", dest="outputfile", default = None, help="specify an outputfile name")
        options = parser.parse_args()

        # Read in initial alignment
        headers_full, sequences_full = sca.readAlg(options.alignment)
        print('Loaded alignment of %i sequences, %i positions.' % (len(headers_full), len(sequences_full[0])))

        # Create the ATS
        i_ref = options.i_ref
        # Single-argument print(...) calls behave the same under Python 2 and
        # Python 3; the former print statements were a Python 3 syntax error.
        print("Reference sequence %i:" % (i_ref))
        print(headers_full[i_ref])
        s_tmp = sequences_full[i_ref]

        # Read the reference positions: one entry per line of the file, with
        # the trailing newline stripped.
        try:
                with open(options.refpos,'r') as f:
                        ats_tmp = [line.rstrip('\n') for line in f]
                print(ats_tmp)
        except Exception:
                # Covers a missing/unreadable file as well as refpos being
                # None; Exception (not a bare except) lets Ctrl-C through.
                sys.exit("Error!! Unable to read reference positions!")