コード例 #1
0
    def write_as_fasta(self, fh, n=None):
        """-----------------------------------------------------------------------------------------
        Write ORFs from self.orf to an open filehandle in FASTA format. Each record is formatted
        with linelen=60 and followed by a newline; its documentation field reports the ORF
        length, strand, frame, begin and end.

        :param fh: open filehandle for writing
        :param n: integer, index into self.orf of the single ORF to write; every ORF is written
                  when n is None, nothing is written when n >= len(self.orf)
        :return: integer, number of ORFs written
        -----------------------------------------------------------------------------------------"""
        fasta = Fasta()

        # choose the ORFs to emit first; writing is identical for one ORF or for all of them
        if n is None:
            selected = self.orf
        elif n < len(self.orf):
            selected = [self.orf[n]]
        else:
            selected = []

        nwritten = 0
        for orf in selected:
            fasta.id = orf['id']
            fasta.doc = 'len={} strand={} frame={} begin={} end={}'.format(
                orf['length'], orf['direction'], orf['frame'], orf['begin'], orf['end'])
            fasta.seq = orf['sequence']
            fh.write(fasta.format(linelen=60))
            fh.write('\n')
            nwritten += 1

        return nwritten
コード例 #2
0
ファイル: diagon.py プロジェクト: gribskov/biocomputing
def getSequence():
    """Store an uploaded FASTA sequence in the module-level ``state`` and render the dashboard.

    On a POST request the upload named 'file1' is stored as sequence 0 (and sequence 1 is
    marked 'next'); otherwise the upload named 'file2' is stored as sequence 1. The upload is
    read with ``Fasta``, echoed to stdout, saved under ``state['seq'][i]['fasta']`` and the
    slot is marked 'loaded'. A POST carrying neither upload leaves the sequence slots untouched.

    After a POST, ``state['params']['seqtype']`` is 'DNA' only when both slots are loaded and
    both sequences pass ``isACGT()``; otherwise it is 'protein'.

    :return: the rendered 'dashboard.html' template, given the current state
    """
    if request.method == 'POST':
        upload = None
        sequence = None
        if 'file1' in request.files:
            upload = request.files['file1']
            sequence = 0
            state['seq'][1]['status'] = 'next'

        elif 'file2' in request.files:
            upload = request.files['file2']
            sequence = 1

        # only load when a recognised upload is present; without this guard a POST with no
        # file failed on an unbound variable
        if upload is not None:
            fasta = Fasta(fh=upload)
            fasta.read()
            print(fasta.format())
            seq = state['seq'][sequence]
            seq['fasta'] = fasta
            seq['status'] = 'loaded'

        # if both sequences have been selected, check whether the sequences are DNA or protein
        state['params']['seqtype'] = 'protein'
        # compare status strings by value (==); identity (is) only works by interning accident
        if state['seq'][0]['status'] == 'loaded' and state['seq'][1]['status'] == 'loaded':
            if state['seq'][0]['fasta'].isACGT() and state['seq'][1]['fasta'].isACGT():
                state['params']['seqtype'] = 'DNA'

    return render_template('dashboard.html', state=state)
コード例 #3
0
ファイル: fasta_select.py プロジェクト: gribskov/biocomputing
        # start this file's "not selected" tally and count the file as read
        n_notmatch[fastafile] = 0
        n_file += 1

        # fasta.next() is the loop condition; presumably it loads the next record and is
        # falsy at end of input -- TODO confirm against the Fasta class
        while fasta.next():
            # every record read is counted, whether or not it ends up being written
            n_sequence[fastafile] += 1
            n_total += 1

            # an empty idlist means "no filter": every sequence is treated as selected
            if fasta.id in idlist or not idlist:
                # desired selected sequences
                fasta.trimDocByRegex(trim)
                seqlen = len(fasta.seq)
                if args.minlen and seqlen < args.minlen:
                    # skip sequences shorter than minimum length, if specified
                    # note: a skipped sequence is counted in neither n_match nor n_notmatch
                    continue

                out.write('{}\n'.format(fasta.format(linelen=args.linelen)))
                n_written += 1
                n_match[fastafile] += 1
                # n_found counts how many times each ID was written, across all files
                if fasta.id in n_found:
                    n_found[fasta.id] += 1
                else:
                    n_found[fasta.id] = 1

            else:
                # not selected sequence
                n_notmatch[fastafile] += 1

    # summary of the run, written to stderr
    sys.stderr.write('files read: {}\n'.format(n_file))
    sys.stderr.write('total sequences read: {}\n'.format(n_total))
    sys.stderr.write('total sequences written: {}\n'.format(n_written))
コード例 #4
0
"""---------------------------------------------------------------------------------------------------------------------
Remove the Trinity path information from the id line
usage
    fasta_reformat.py *.fasta
---------------------------------------------------------------------------------------------------------------------"""
import glob
import sys
import re
from sequence.fasta import Fasta

# line length passed to Fasta.format() for the reformatted output
linelen = 60

# default target file name
target = '*.fasta'
if len(sys.argv) > 1:
    target = sys.argv[1]
print('  target file:', target)

# the annotation to remove looks like ' path=[...]'; compile once and reuse for every record
path_pattern = re.compile(r' path=\[[^]]+\]')

for fastafile in glob.glob(target):
    # output file: input name plus '.reformatted'; the with-block closes (and flushes) it
    # when the input file has been processed, even if an error interrupts the loop
    outfile = fastafile + '.reformatted'
    with open(outfile, 'w') as out:
        print('  input file:', fastafile, '    output file:', outfile)

        fasta = Fasta()
        fasta.open(fastafile)
        while fasta.next():
            # strip the path annotation from the documentation field, leave the rest as is
            fasta.doc = path_pattern.sub('', fasta.doc)
            out.write(fasta.format(linelen=linelen))
コード例 #5
0
n_uniqueperfile = 0
n_total = 0
n_unique_total = 0
# NOTE(review): this header names three columns but each row below writes six fields
sys.stderr.write('{}\t{}\t\t{}\n'.format('file', 'per file', 'total'))
for fastafile in glob.glob(target):
    fasta = Fasta()
    fasta.open(fastafile)
    n_file += 1
    n_perfile = 0
    # restart the unique count for every file; without the reset the "per file" column was
    # cumulative and n_unique_total added the same sequences again for each later file
    n_uniqueperfile = 0
    while fasta.next():
        n_perfile += 1
        if fasta.id not in unique_seq:
            # first time this ID is seen: keep its formatted record; later duplicates of
            # the same ID are counted as read but otherwise ignored
            n_uniqueperfile += 1
            unique_seq[fasta.id] = fasta.format(linelen=100)

    n_total += n_perfile
    n_unique_total += n_uniqueperfile
    sys.stderr.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(n_file, fastafile,
                                                       n_perfile,
                                                       n_uniqueperfile,
                                                       n_total,
                                                       n_unique_total))

# write out sequences
for seq in unique_seq:
    sys.stdout.write(unique_seq[seq])

# sys.exit rather than the site-provided exit(), which is intended for interactive use
sys.exit(0)
コード例 #6
0
ファイル: wordmatch.py プロジェクト: gribskov/biocomputing
            diagonal[d] = filtered

        return nmatch


# --------------------------------------------------------------------------------------------------
# Testing
# --------------------------------------------------------------------------------------------------
# Manual test driver: runs only when the module is executed as a script.
if __name__ == '__main__':
    # test 0: compare a 5-letter sequence against itself
    print('\ntest 0: identity matching')
    print('\texpect 7 matches\n')
    fasta = Fasta()
    fasta.id = 'test0'
    fasta.doc = '5 letter DNA test'
    fasta.seq = 'ACAGT'
    print('{}\n'.format(fasta.format()))

    # the same Fasta object is used as both sequences
    # presumably identityPos() counts identical position pairs: in ACAGT vs ACAGT the two
    # A's give 4 pairs and C, G, T one each, i.e. the 7 announced above -- TODO confirm
    match = Match()
    match.s1 = fasta
    match.s2 = fasta
    nmatch = match.identityPos()
    print('matches: {}'.format(nmatch))

    # test 1: a fresh Match object and a first sequence; the rest of the setup continues
    # beyond this point
    print('\ntest 1: identity matching, unequal length sequences')
    print('\texpect 11 matches\n')
    match = Match()

    fasta1 = Fasta()
    fasta1.id = 'test1.1'
    fasta1.doc = '5 letter DNA test'
    fasta1.seq = 'ACAGT'
コード例 #7
0
            # remainder of the statistics line for the output file that was just completed
            print('in', n_current, 'sequences', end=' ')
            print('written to', outfilename)

        except NameError:
            # NOTE(review): presumably tolerates the first pass, when no output file or
            # counters exist yet -- the try header is outside this view, confirm
            pass

        # start a new output file: advance the file number and reset the per-file counters
        n_out += 1
        n_current = 0
        base_current = 0

        # output files are named <outbase>.<file number>.<outsuffix>
        outfilename = '{0}.{1}.{2}'.format(outbase, n_out, outsuffix)
        outfile = open(outfilename, 'w')

    # overall totals across all output files
    n_seq += 1
    base_total += fasta.length()

    # totals for the current output file, then write the record to it
    n_current += 1
    base_current += fasta.length()
    outfile.write(fasta.format())

# report statistics for last file
outfile.close()
print('   ', base_current, 'bases/amino acids', end=' ')
print('in', n_current, 'sequences', end=' ')
print('written to', outfilename)

# report overall statistics
print('\n')
print(base_total, 'characters from', n_seq, 'sequences written to', n_out,
      'files\n')
コード例 #8
0
            nfeature += 1
        elif info['Parent'] in flist:
            # feature whose Parent is already stored: copy over any attributes the parent
            # entry does not have yet, never overwriting existing ones
            for k in info:
                if k not in flist[info['Parent']]:
                    flist[info['Parent']][k] = info[k]
        else:
            # flist[info['ID']] = info
            # no branch matched: report the feature type on stderr and move on
            sys.stderr.write('unknown feature {}\n'.format(info['feature']))

    # write out sequences
    for gene in flist:
        thisgene = flist[gene]
        f = Fasta()
        f.id = thisgene['ID']
        # documentation is built as ' key:value' for each key in save that the feature has
        f.doc = ''
        for k in save:
            if k in thisgene:
                f.doc += ' {}:{}'.format(k, thisgene[k])
        # begin - 1 : end -- presumably converts 1-based inclusive coordinates to a Python
        # slice; TODO confirm the coordinate convention of the input
        f.seq = seq[thisgene['seqname']][thisgene['begin'] - 1:thisgene['end']]
        if (thisgene['end'] - thisgene['begin'] > 100000):
            # coordinates cross origin
            # a span of more than 100000 is taken to mean the feature wraps around the
            # origin: the sequence is rebuilt from 'end' to the end of the sequence plus
            # the start of the sequence up to 'begin'
            f.seq = seq[thisgene['seqname']][thisgene['end'] - 1:] + seq[
                thisgene['seqname']][:thisgene['begin']]

        if thisgene['strand'] == '-':
            # NOTE(review): whether complement() also reverses the sequence is not visible
            # here -- confirm that minus-strand features come out in the intended direction
            f.seq = complement(f.seq)

        sys.stdout.write(f.format(linelen=100))

exit(0)
コード例 #9
0
     "phams":["56154"],
     "Start":15822,
     "Stop":16230,
     "Length":408,
     "Name":"24",
     "translation":"MTNVFTLDAMREETRKKYQPVKIGLSEDVTVELKPLLKLGKKAREAVADAVKEIEALPDEIDEDDEDSDELMDEVAEKICESIAKVFKLIATSPRKLLAELDTEEEPQIRAELYGAVLRTWMRET QLGEAAPSPN",
     "Orientation":"F",
     "Notes":"b'tail assembly chaperone'"} ...

Michael Gribskov     10 April 2021
================================================================================================="""
import sys
import json
from sequence.fasta import Fasta

# --------------------------------------------------------------------------------------------------
# main program
# --------------------------------------------------------------------------------------------------
if __name__ == '__main__':
    # the first command line argument is the JSON file to convert; load it completely
    # inside a with-block so the file handle is closed as soon as parsing is done
    with open(sys.argv[1], 'r') as fp:
        phage = json.load(fp)

    # one FASTA record per entry in the 'results' list, printed to stdout
    for gene in phage['results']:
        f = Fasta()
        f.id = gene['GeneID']
        f.seq = gene['translation']
        # Notes holds the text of a bytes literal, e.g. "b'tail assembly chaperone'";
        # dropping the first two characters and the last one removes the b'...' wrapper
        f.doc = gene['Notes'][2:-1]
        print(f.format(linelen=100))

    # sys.exit rather than the site-provided exit(), which is intended for interactive use
    sys.exit(0)
コード例 #10
0
ファイル: raw2fasta.py プロジェクト: gribskov/biocomputing
        # the output name is the input base name without '.seq', plus '.fasta'
        base = base.replace('.seq', '')
        sys.stdout.write('\n\tExpanded file: {}\n\tbasename: {}\n'.format(
            infilename, base))
        outfilename = base + '.fasta'
        outfile = None
        try:
            outfile = open(outfilename, 'w')
        except OSError:
            # catch only file-system failures; a bare except would also intercept
            # KeyboardInterrupt and SystemExit and report them as an open failure
            sys.stderr.write(
                'Unable to open output file ({})\n'.format(outfilename))
            exit(2)

        # process all sequences in the file
        # each input line is one raw sequence; records are named <base>_<n>, counting from 0
        n = 0
        for seq in infile:
            fasta = Fasta()
            fasta.id = base + '_{}'.format(n)
            # strip trailing whitespace (including the newline) and store as uppercase
            fasta.seq = seq.rstrip().upper()
            fasta.doc = 'length={}'.format(fasta.length())
            outfile.write(fasta.format(linelen=100))
            n += 1

        infile.close()
        outfile.close()
        sys.stdout.write('\t{} sequences written to {}\n'.format(
            n, outfilename))

    # end of loop over files

exit(0)