Esempio n. 1
0
    def write_as_fasta(self, fh, n=None):
        """-----------------------------------------------------------------------------------------
        Write ORFs from self.orf to a file in fasta format. If n is given, only the ORF at index n
        is written; an n at or beyond the end of the list writes nothing.

        :param fh: open filehandle for writing
        :param n: integer, index of ORF to write, write all if not specified
        :return: integer, number of ORFs written
        -----------------------------------------------------------------------------------------"""
        fasta = Fasta()

        # choose the ORFs to write up front so that a single code path formats and writes them
        if n is None:
            selected = self.orf
        elif n < len(self.orf):
            selected = [self.orf[n]]
        else:
            selected = []

        nwritten = 0
        for orf in selected:
            fasta.id = orf['id']
            fasta.doc = 'len={} strand={} frame={} begin={} end={}'.format(
                orf['length'], orf['direction'], orf['frame'], orf['begin'], orf['end'])
            fasta.seq = orf['sequence']
            fh.write(fasta.format(linelen=60))
            fh.write('\n')
            nwritten += 1

        return nwritten
Esempio n. 2
0
"""---------------------------------------------------------------------------------------------------------------------
Remove the Trinity path information from the id line
usage
    fasta_reformat.py *.fasta
---------------------------------------------------------------------------------------------------------------------"""
import glob
import sys
import re
from sequence.fasta import Fasta

# number of sequence characters per output line, passed to Fasta.format()
linelen = 60

# default target file name (a glob pattern); the first command line argument overrides it
target = '*.fasta'
if len(sys.argv) > 1:
    target = sys.argv[1]
print('  target file:', target)

for fastafile in glob.glob(target):
    # output file: written next to the input with '.reformatted' appended to its name
    outfile = fastafile + '.reformatted'

    # the context manager flushes and closes each output file before the next input is
    # processed, instead of leaving every handle open until the interpreter exits
    with open(outfile, 'w') as out:
        print('  input file:', fastafile, '    output file:', outfile)

        fasta = Fasta()
        fasta.open(fastafile)
        while fasta.next():
            # drop the ' path=[...]' annotation from the documentation string
            fasta.doc = re.sub(r' path=\[[^]]+\]', '', fasta.doc)
            out.write(fasta.format(linelen=linelen))
Esempio n. 3
0
    # Demonstration of Windowmatch: compare a sequence read from the file named in
    # sys.argv[1] with a second sequence built from its own first 50 characters.
    from sequence.fasta import Fasta
    from plotter import Plotter

    match = Windowmatch()
    print('done {}'.format(type(match)))
    print(match.alphabet)

    # match.readNCBI('table/NUC4.4.matrix')
    print(match.format())

    # first sequence comes from the file given on the command line
    fasta1 = Fasta(filename=sys.argv[1])
    fasta1.read()

    # second sequence is the first 50 characters of the first one
    fasta2 = Fasta()
    fasta2.id = 'seq2'
    fasta2.doc = ' bases 1:50'
    fasta2.seq = fasta1.seq[:50]

    # truncate the first sequence to its first 200 characters
    fasta1.seq = fasta1.seq[:200]

    match.s1 = fasta1
    match.s2 = fasta2
    # seqToInt() returns two values, printed here for inspection
    # NOTE(review): presumably the integer encodings of s1 and s2 - confirm in Windowmatch
    l1, l2 = match.seqToInt()
    print(l1, l2)

    # set the window and threshold attributes, then run the window scoring and report its result
    match.window = 10
    match.threshold = 5
    nmatch = match.windowScore()
    print('window: {}     threshold: {}     nmatch: {}'. \
          format(match.window, match.threshold, nmatch))
    plot = Plotter()
Esempio n. 4
0
                filtered.append([pos, runlen])

            diagonal[d] = filtered

        return nmatch


# --------------------------------------------------------------------------------------------------
# Testing
# --------------------------------------------------------------------------------------------------
if __name__ == '__main__':
    # Manual tests: each one prints a banner with the expected number of matches, so the
    # reported count can be checked by eye.

    # test 0: a five letter sequence is compared with itself (s1 and s2 are the same object)
    print('\ntest 0: identity matching')
    print('\texpect 7 matches\n')
    fasta = Fasta()
    fasta.id = 'test0'
    fasta.doc = '5 letter DNA test'
    fasta.seq = 'ACAGT'
    print('{}\n'.format(fasta.format()))

    match = Match()
    match.s1 = fasta
    match.s2 = fasta
    nmatch = match.identityPos()
    print('matches: {}'.format(nmatch))

    # test 1: a fresh Match object is used for two sequences of different lengths
    print('\ntest 1: identity matching, unequal length sequences')
    print('\texpect 11 matches\n')
    match = Match()

    fasta1 = Fasta()
    fasta1.id = 'test1.1'
Esempio n. 5
0
            flist[info['ID']] = info
            nfeature += 1
        elif info['Parent'] in flist:
            for k in info:
                if k not in flist[info['Parent']]:
                    flist[info['Parent']][k] = info[k]
        else:
            # flist[info['ID']] = info
            sys.stderr.write('unknown feature {}\n'.format(info['feature']))

    # write one fasta record to stdout for every feature collected in flist
    for feature_id in flist:
        feature = flist[feature_id]
        record = Fasta()
        record.id = feature['ID']

        # documentation is ' key:value' for each requested key the feature actually has
        record.doc = ''.join(
            ' {}:{}'.format(key, feature[key]) for key in save if key in feature)

        contig = seq[feature['seqname']]
        begin = feature['begin']
        end = feature['end']
        if end - begin > 100000:
            # coordinates cross origin
            record.seq = contig[end - 1:] + contig[:begin]
        else:
            # begin - 1 is the slice start, end is the exclusive slice stop
            record.seq = contig[begin - 1:end]

        if feature['strand'] == '-':
            record.seq = complement(record.seq)

        sys.stdout.write(record.format(linelen=100))

# sys.exit() is always available; the bare exit() helper is only injected by the site module
sys.exit(0)
Esempio n. 6
0
     "phams":["56154"],
     "Start":15822,
     "Stop":16230,
     "Length":408,
     "Name":"24",
     "translation":"MTNVFTLDAMREETRKKYQPVKIGLSEDVTVELKPLLKLGKKAREAVADAVKEIEALPDEIDEDDEDSDELMDEVAEKICESIAKVFKLIATSPRKLLAELDTEEEPQIRAELYGAVLRTWMRET QLGEAAPSPN",
     "Orientation":"F",
     "Notes":"b'tail assembly chaperone'"} ...

Michael Gribskov     10 April 2021
================================================================================================="""
import sys
import json
from sequence.fasta import Fasta

# --------------------------------------------------------------------------------------------------
# main program
# --------------------------------------------------------------------------------------------------
if __name__ == '__main__':
    # Read the JSON file named by the first command line argument and print each gene in its
    # 'results' list as a fasta record on stdout.
    # The context manager closes the input file as soon as the JSON has been parsed.
    with open(sys.argv[1], 'r') as fp:
        phage = json.load(fp)

    for gene in phage['results']:
        f = Fasta()
        f.id = gene['GeneID']
        f.seq = gene['translation']
        # Notes is stored as the text of a bytes literal, e.g. "b'tail assembly chaperone'";
        # the slice drops the leading b' and the trailing '
        f.doc = gene['Notes'][2:-1]
        print(f.format(linelen=100))

    # sys.exit() rather than the site-injected exit(), which is meant for interactive use
    sys.exit(0)
Esempio n. 7
0
        base = base.replace('.seq', '')
        sys.stdout.write('\n\tExpanded file: {}\n\tbasename: {}\n'.format(
            infilename, base))

        # the output file is named after the input basename
        outfilename = base + '.fasta'
        try:
            outfile = open(outfilename, 'w')
        except (IOError, OSError):
            # catch only I/O failures: a bare except would also swallow KeyboardInterrupt
            # and SystemExit and report them as an unopenable file
            sys.stderr.write(
                'Unable to open output file ({})\n'.format(outfilename))
            sys.exit(2)

        # process all sequences in the file: each item read from infile becomes one fasta
        # record whose id is the basename plus a zero-based counter
        n = 0
        try:
            for seq in infile:
                fasta = Fasta()
                fasta.id = base + '_{}'.format(n)
                fasta.seq = seq.rstrip().upper()
                fasta.doc = 'length={}'.format(fasta.length())
                outfile.write(fasta.format(linelen=100))
                n += 1
        finally:
            # close both files even if a record fails part way through
            infile.close()
            outfile.close()

        sys.stdout.write('\t{} sequences written to {}\n'.format(
            n, outfilename))

    # end of loop over files

# sys.exit() is always available; the bare exit() helper is only injected by the site module
sys.exit(0)