Example #1
0
def getSequence():
    """Load an uploaded sequence into the shared state and render the dashboard.

    On a POST request the upload is looked up under 'file1' (sequence slot 0)
    or 'file2' (sequence slot 1).  The file is read through Fasta, stored in
    state['seq'][slot]['fasta'] and the slot is marked 'loaded'.  When 'file1'
    is uploaded, slot 1 is additionally marked 'next'.

    state['params']['seqtype'] is set to 'protein' after every upload and is
    switched to 'DNA' only when both slots are loaded and both sequences pass
    Fasta.isACGT().

    A POST that carries neither file field loads nothing and simply renders
    the dashboard.

    :return: the rendered 'dashboard.html' template
    """
    if request.method == 'POST':
        f = None
        sequence = None
        if 'file1' in request.files:
            f = request.files['file1']
            sequence = 0
            state['seq'][1]['status'] = 'next'
        elif 'file2' in request.files:
            f = request.files['file2']
            sequence = 1

        # only process the upload when one of the expected fields is present;
        # previously a POST without either field failed on an unbound variable
        if f is not None:
            fasta = Fasta(fh=f)
            fasta.read()
            print(fasta.format())
            seq = state['seq'][sequence]
            seq['fasta'] = fasta
            seq['status'] = 'loaded'

            # if both sequences have been selected, check whether the sequences
            # are DNA or protein
            state['params']['seqtype'] = 'protein'
            first, second = state['seq'][0], state['seq'][1]
            # compare by value: identity tests on strings are not reliable
            if first['status'] == 'loaded' and second['status'] == 'loaded':
                if first['fasta'].isACGT() and second['fasta'].isACGT():
                    state['params']['seqtype'] = 'DNA'

    return render_template('dashboard.html', state=state)
Example #2
0
    def write_as_fasta(self, fh, n=None):
        """-----------------------------------------------------------------------------------------
        Write ORFs to a file in fasta format. If n is defined, write only the specified ORF in
        the list; an n that is not smaller than the number of ORFs writes nothing.

        :param fh: open filehandle for writing
        :param n: integer, index of ORF to write, write all if not specified
        :return: int, number of ORFs written
        -----------------------------------------------------------------------------------------"""
        if n is None:
            # write all ORFs
            selected = self.orf
        elif n < len(self.orf):
            # write the selected ORF only
            selected = [self.orf[n]]
        else:
            selected = []

        # a single Fasta object is reused as the formatter for every record
        fasta = Fasta()
        nwritten = 0
        for orf in selected:
            fasta.id = orf['id']
            fasta.doc = 'len={} strand={} frame={} begin={} end={}'.format(
                orf['length'], orf['direction'], orf['frame'], orf['begin'], orf['end'])
            fasta.seq = orf['sequence']
            fh.write(fasta.format(linelen=60))
            fh.write('\n')
            nwritten += 1

        return nwritten
Example #3
0
 def __init__(self):
     """Create the object with two new Fasta sequences, empty integer lists, a new Score,
     unset matrices and zero gap penalties."""
     # sequences and the lists that accompany them
     self.s1, self.s2 = Fasta(), Fasta()
     self.i1, self.i2 = [], []

     # scoring object and the two matrices (not allocated here)
     self.score = Score()
     self.smat = self.pmat = None

     # gi: length independent gap penalty, gd: length dependent gap penalty
     self.gi = self.gd = 0
Example #4
0
    def __init__(self):
        """-----------------------------------------------------------------------------------------
        Diagonal class constructor. Runs the Score initializer, then sets every attribute to its
        starting value.

        diagonal: one diagonal of scores
        yinc: direction of y axis, 1 or -1 means forward or reverse respectively
        window: length of window for calculation
        threshold: minimum value for window to be plotted
        frame / function: data frames and the functions that populate them
        s1, s2: Fasta objects, s1 is horizontal and s2 is vertical
        i1, i2: integer array representation of the sequences
        -----------------------------------------------------------------------------------------"""
        Score.__init__(self)

        # score calculation state
        self.diagonal = []
        self.single = False
        self.yinc = 1
        self.threshold = self.window = 0
        self.nscore = self.nrun = 0

        # data frames and the functions for populating them (two separate dicts)
        self.frame = {}
        self.function = {}

        # plotting variables; sizes of panels are defined in setupBokeh()
        self.title = ''
        self.figure = {}
        self.grid = self.palette = self.cmap = None
        self.alpha = 0.5
        self.mindotsize, self.maxdotsize = 2, 10

        # sequences, s1 is horizontal, s2 is vertical
        self.s1, self.s2 = Fasta(), Fasta()
        self.i1 = self.i2 = None  # integer array representation of sequences
        self.l1 = self.l2 = 0
        self.seqreverse = False  # only applies to s2
Example #5
0
    def find(self, direction='+', frame=0, minlen=0, includeseq=False):
        """-----------------------------------------------------------------------------------------
        Find the open reading frames in a specific frame and direction. For the reverse
        complement, the coordinates are in terms of the reversed sequence.

        The sequence is scanned codon by codon starting at offset frame. Every stop codon closes
        the current reading frame; whatever remains after the last stop codon is treated as a
        final reading frame. Each saved entry is a dict with keys direction, frame, begin, end
        (end is exclusive) and, when includeseq is true, seq.

        :param direction: string, '+' or '-'
        :param frame: int, 0 - 2
        :param minlen: int, only save if the orf is at least minlen long
        :param includeseq: boolean, if true, include the sequence in the identified reading frames
        :return: int, number of rfs added to self.rflist
        -----------------------------------------------------------------------------------------"""
        seq = self.sequence
        if direction == '-':
            seq = Fasta.reverseComplement(self.sequence)

        def _save(begin, end):
            """Append the reading frame [begin, end) to self.rflist; return 1 if saved, else 0."""
            if end - begin < minlen:
                return 0
            rf = {'direction': direction,
                  'frame':     frame,
                  'begin':     begin,
                  'end':       end}
            if includeseq:
                rf['seq'] = seq[begin:end]
            self.rflist.append(rf)
            return 1

        nrf = 0
        pos = frame
        begin = pos

        while pos < len(seq) - 2:
            if seq[pos:pos + 3] in Orf.stop:
                # end of an ORF
                # NOTE(review): an internal frame must be longer than one codon (> 3) while the
                # trailing frame below only needs one full codon (> 2) - confirm this is intended
                if pos - begin > 3:
                    # count only frames that were actually stored so the return value can be
                    # used to index self.rflist
                    nrf += _save(begin, pos)
                begin = pos + 3

            pos += 3

        # reading frame that runs to the end of the sequence without a stop codon
        if pos - begin > 2:
            nrf += _save(begin, pos)

        return nrf
Example #6
0
    def findall(self, minlen=0, includeseq=False):
        """-----------------------------------------------------------------------------------------
        Find orfs in all six reading frames by calling find() for each of the three frames in
        both directions ('+' first, then '-'). find() handles the reverse complement itself, so
        no sequence manipulation is needed here.

        :param minlen: int, passed to find() for every frame
        :param includeseq: boolean, if true, include the sequence in the identified reading frames
        :return: int, sum of the values returned by find()
        -----------------------------------------------------------------------------------------"""
        nrf = 0
        for direction in ('+', '-'):
            for frame in range(3):
                nrf += self.find(direction=direction, frame=frame, minlen=minlen,
                                 includeseq=includeseq)

        return nrf
Example #7
0
        scoremin = score[0]
        scoremax = score[0]
        for s in score:
            scoremin = min(scoremin, s)
            scoremax = max(scoremax, s)

        return scoremin, scoremax


# --------------------------------------------------------------------------------------------------
# Testing
# --------------------------------------------------------------------------------------------------
if __name__ == '__main__':
    match = Diagonal()

    fasta = Fasta(filename=sys.argv[1])
    fasta.read()
    fasta1 = fasta.copy()

    fasta2 = fasta.copy()
    fasta2.id = 'seq 2'
    fasta2.doc = 'Sequence 2'

    fasta1.seq = fasta1.seq[:200]
    fasta2.seq = fasta2.seq[:400]

    dataframes = [{
        'data': 'dots',
        'fn': match.windowThreshold,
        'var': ['x', 'y', 'score']
    }, {
Example #8
0
    snplist, maxpos = read_snps_tabular(snp_file)

    # this version reads gff format
    # snp_file = 'C:/Users/michael/Desktop/apple/GDDH13_1-1_SNPs.gff3'
    # snplist, maxpos = read_snps_gff(snp_file)

    for chr in snplist:
        print('{} {} snps max: {}'.format(chr, len(snplist[chr]), maxpos[chr]))
    print()

    # output file from command line
    out = open(sys.argv[1], 'w')

    # read genome and match, one sequence at a time
    fastafile = 'C:/Users/michael/Desktop/apple/GDDH13_1-1_formatted.fasta'
    fasta = Fasta()
    fasta.open(fastafile)

    pad = 25
    window = 2 * pad + 1

    bases = 0
    seqlen = {}
    sequence = ''
    seqbegin = 0
    seqend = 0
    snpcount = 0
    for line in fasta.fh:
        line = line.rstrip()
        if line.startswith('>'):
            try:
Example #9
0
# --------------------------------------------------------------------------------------------------
# Testing
# --------------------------------------------------------------------------------------------------
if __name__ == '__main__':
    # Manual test driver: expects the path of a fasta file as the first command line argument
    import sys
    from sequence.fasta import Fasta
    from plotter import Plotter

    # create the matcher and show its type, alphabet and formatted contents
    match = Windowmatch()
    print('done {}'.format(type(match)))
    print(match.alphabet)

    # match.readNCBI('table/NUC4.4.matrix')
    print(match.format())

    # first sequence is read from the file named on the command line
    fasta1 = Fasta(filename=sys.argv[1])
    fasta1.read()

    # second sequence is the first 50 letters of the first sequence
    fasta2 = Fasta()
    fasta2.id = 'seq2'
    fasta2.doc = ' bases 1:50'
    fasta2.seq = fasta1.seq[:50]

    # trim the first sequence to 200 letters (after fasta2 has been cut from it)
    fasta1.seq = fasta1.seq[:200]

    # attach both sequences and print the two values returned by seqToInt()
    match.s1 = fasta1
    match.s2 = fasta2
    l1, l2 = match.seqToInt()
    print(l1, l2)

    match.window = 10
Example #10
0
reformat the output from the apc.pl de-circularization program.
Input is two lines, idline and sequence
Output is 100 letters/line

usage
    fasta_reformat.py *.fasta
---------------------------------------------------------------------------------------------------------------------"""
import glob
import sys
from sequence.fasta import Fasta

# number of sequence letters per output line
linelen = 100

# default target file name; a glob pattern given as the first argument overrides it
target = '*.fasta'
if len(sys.argv) > 1:
    target = sys.argv[1]
print('  target file:', target)

for fastafile in glob.glob(target):
    # output file is written next to the input with a '.reformatted' suffix
    outfile = fastafile + '.reformatted'

    # the context manager flushes and closes each output file before the next input is
    # processed, instead of leaving every handle open until the interpreter exits
    with open(outfile, 'w') as out:
        print('  input file:', fastafile, '    output file:', outfile)

        fasta = Fasta()
        fasta.open(fastafile)
        while fasta.next():
            # replace the documentation with the sequence length
            fasta.doc = ' len={}'.format(fasta.length())
            out.write(fasta.format(linelen=linelen))
"""---------------------------------------------------------------------------------------------------------------------
Remove the Trinity path information from the id line
usage
    fasta_reformat.py *.fasta
---------------------------------------------------------------------------------------------------------------------"""
import glob
import sys
import re
from sequence.fasta import Fasta

# number of sequence letters per output line
linelen = 60

# default target file name; a glob pattern given as the first argument overrides it
target = '*.fasta'
if len(sys.argv) > 1:
    target = sys.argv[1]
print('  target file:', target)

for fastafile in glob.glob(target):
    # output file is written next to the input with a '.reformatted' suffix
    outfile = fastafile + '.reformatted'

    # the context manager flushes and closes each output file before the next input is
    # processed, instead of leaving every handle open until the interpreter exits
    with open(outfile, 'w') as out:
        print('  input file:', fastafile, '    output file:', outfile)

        fasta = Fasta()
        fasta.open(fastafile)
        while fasta.next():
            # drop the ' path=[...]' field from the documentation line
            fasta.doc = re.sub(r' path=\[[^]]+\]', '', fasta.doc)
            out.write(fasta.format(linelen=linelen))
Example #12
0
                nrf += self.find(direction=direction, frame=frame, minlen=minlen,
                                 includeseq=includeseq)


        return nrf


# --------------------------------------------------------------------------------------------------
# Testing
# --------------------------------------------------------------------------------------------------
if __name__ == '__main__':
    # Manual test: find the reading frames of a short hard-coded sequence in all six frames
    # and print one line per reading frame.

    orf = Orf()
    orf.sequence = 'TAAATGATGTGACCCTCACCGTGA'
    print(orf.sequence)

    nrf = orf.findall(includeseq=True)
    print(f'{nrf} reading frames found')

    # the slice keeps the loop within the stored list even if nrf exceeds its length
    for rf in orf.rflist[:nrf]:
        begin = rf["begin"]
        end = rf["end"]
        # the sequence was stored by findall(includeseq=True), no need to recompute it
        print(f'f:{rf["frame"]}{rf["direction"]}\tbegin:{begin:4d}\tend:{end:4d}\t{rf["seq"]}')

    exit(0)
Example #13
0
'''

'''
from sequence.fasta import Fasta

# Fasta object created with no arguments; nothing is opened or read in the visible code
trinity = Fasta()
Example #14
0
# --------------------------------------------------------------------------------------------------
# main
# --------------------------------------------------------------------------------------------------
if __name__ == '__main__':
    # usage: <script> <gtf file> <fasta file>

    # open files
    gtffile = sys.argv[1]
    try:
        gtf = open(gtffile, 'r')
    except OSError:
        # catch only file-system errors so that programming errors and Ctrl-C are not
        # reported as "unable to open"
        sys.stderr.write('Unable to open GTF file ({})\n'.format(gtffile))
        sys.exit(1)

    # read every sequence of the fasta file into memory, indexed by its id
    seq = {}
    fasta = Fasta()
    fasta.open(sys.argv[2])
    sys.stderr.write('Reading Fasta {}...\n'.format(sys.argv[2]))
    nseq = 0
    while fasta.next():
        seq[fasta.id] = fasta.seq
        nseq += 1

    # report what was read, one line per sequence
    sys.stderr.write('\n{} Sequences read from {}\n'.format(nseq, sys.argv[2]))
    for s in seq:
        sys.stderr.write('\t{} len={}\n'.format(s, len(seq[s])))

    sys.stderr.write('\ngtf2fasta\n')
    sys.stderr.write('\tGTF: {}\n'.format(gtffile))
    sys.stderr.write('\tFasta: {}\n'.format(sys.argv[2]))
Example #15
0
        return True


# --------------------------------------------------------------------------------------------------
# Testing
# --------------------------------------------------------------------------------------------------
if __name__ == '__main__':
    # Manual test driver using two short hard-coded DNA sequences
    from wordmatch import Match  # for testing only
    from sequence.fasta import Fasta

    print('\ntest 1: identity matching, unequal length sequences')
    print('\texpect 11 matches\n')
    match = Match()

    # first sequence, 5 letters
    fasta1 = Fasta()
    fasta1.id = 'test0.1'
    fasta1.doc = '5 letter DNA test'
    fasta1.seq = 'ACAGT'
    match.s1 = fasta1

    # second sequence, 7 letters; it begins with the same 5 letters as the first
    fasta2 = Fasta()
    fasta2.id = 'test0.2'
    fasta2.doc = '7 letter DNA test'
    fasta2.seq = 'ACAGTAA'
    match.s2 = fasta2

    nmatch = match.identity()

    # NOTE(review): Plotter is not imported in this block - presumably imported at the top of
    # the file; confirm
    plot = Plotter()
    plot.match = match
Example #16
0
     "phams":["56154"],
     "Start":15822,
     "Stop":16230,
     "Length":408,
     "Name":"24",
     "translation":"MTNVFTLDAMREETRKKYQPVKIGLSEDVTVELKPLLKLGKKAREAVADAVKEIEALPDEIDEDDEDSDELMDEVAEKICESIAKVFKLIATSPRKLLAELDTEEEPQIRAELYGAVLRTWMRET QLGEAAPSPN",
     "Orientation":"F",
     "Notes":"b'tail assembly chaperone'"} ...

Michael Gribskov     10 April 2021
================================================================================================="""
import sys
import json
from sequence.fasta import Fasta

# --------------------------------------------------------------------------------------------------
# main program
# --------------------------------------------------------------------------------------------------
if __name__ == '__main__':
    # usage: <script> <json file>
    # the context manager closes the input as soon as the JSON has been parsed
    with open(sys.argv[1], 'r') as fp:
        phage = json.load(fp)

    # print one fasta record per gene in the 'results' list
    for gene in phage['results']:
        f = Fasta()
        f.id = gene['GeneID']
        f.seq = gene['translation']
        # Notes looks like "b'tail assembly chaperone'"; drop the leading b' and trailing '
        f.doc = gene['Notes'][2:-1]
        print(f.format(linelen=100))

    sys.exit(0)
Example #17
0
        return the row and col corresponding to cell n

        :param l1: int, length of sequence 1 (col)
        :param n: int, cell n
        :return: int, int; row, col
        -----------------------------------------------------------------------------------------"""
        return (n - 1) // l1, (n - 1) % l1


# --------------------------------------------------------------------------------------------------
# testing
# --------------------------------------------------------------------------------------------------
if __name__ == '__main__':
    # Manual test setup: an Alignment with two empty Fasta objects attached
    align = Alignment()

    align.s1 = Fasta()
    align.s2 = Fasta()

    # optional scoring setup, currently disabled
    # align.alphabet = 'ACGT'
    # align.identity(pos=3, neg=-3)

    # alternative test sequences for s1, all currently disabled
    # align.s1.seq = 'ACTTATCTTAT'
    # align.s1.seq = 'TATTCTATTCA'
    # align.s1.seq = 'TGGTATACTAT'
    # align.s1.seq = 'GATACTATCTA'

    # alternative test sequences for s2, all currently disabled
    # align.s2.seq = 'AGTATCATATT'
    # align.s2.seq = 'TTATACTATGG'
    # align.s2.seq = 'TACTATTTAGAT'
    # align.s2.seq = 'TTATACTATGA'
Example #18
0
class Diagonal(Score, Fasta):
    """=============================================================================================

    ============================================================================================="""
    def __init__(self):
        """-----------------------------------------------------------------------------------------
        Diagonal class constructor. Runs the Score initializer, then sets every attribute to its
        starting value.

        diagonal: one diagonal of scores
        yinc: direction of y axis, 1 or -1 means forward or reverse respectively
        window: length of window for calculation
        threshold: minimum value for window to be plotted
        frame / function: data frames and the functions that populate them
        s1, s2: Fasta objects, s1 is horizontal and s2 is vertical
        i1, i2: integer array representation of the sequences
        -----------------------------------------------------------------------------------------"""
        Score.__init__(self)

        # score calculation state
        self.diagonal = []
        self.single = False
        self.yinc = 1
        self.threshold = self.window = 0
        self.nscore = self.nrun = 0

        # data frames and the functions for populating them (two separate dicts)
        self.frame = {}
        self.function = {}

        # plotting variables; sizes of panels are defined in setupBokeh()
        self.title = ''
        self.figure = {}
        self.grid = self.palette = self.cmap = None
        self.alpha = 0.5
        self.mindotsize, self.maxdotsize = 2, 10

        # sequences, s1 is horizontal, s2 is vertical
        self.s1, self.s2 = Fasta(), Fasta()
        self.i1 = self.i2 = None  # integer array representation of sequences
        self.l1 = self.l2 = 0
        self.seqreverse = False  # only applies to s2

    def setupCalculation(self,
                         seq1,
                         seq2,
                         window=5,
                         threshold=3,
                         resetstat=True):
        """-----------------------------------------------------------------------------------------
        Load the sequences and do some basic setup for score calculations. Sequences
        are passed as Fasta object to make it easier to use multi fasta files.

        The longer sequence always ends up in s1. When self.seqreverse is set, the sequence of
        s2 is replaced in place by its reverse complement and yinc becomes -1; otherwise yinc
        is set to 1.

        :param seq1: Fasta object
        :param seq2: Fasta object
        :param window: int, length of window for calculation
        :param threshold: float, minimum score in window to plot
        :param resetstat: boolean, if True, reset score and run counts to zero and empty all
                          data frames
        :return: True
        -----------------------------------------------------------------------------------------"""
        # sequence setup
        self.s1 = seq1
        self.s2 = seq2
        self.l1 = len(seq1.seq)
        self.l2 = len(seq2.seq)

        # move shorter sequence to s2 if necessary
        if self.l1 < self.l2:
            # shorter sequence is always s2
            self.s1, self.s2 = self.s2, self.s1
            self.l1, self.l2 = self.l2, self.l1

        # reverse sequence 2 if necessary; yinc is assigned on every call so that a forward
        # calculation after a reversed one does not keep the stale -1
        self.yinc = 1
        if self.seqreverse:
            self.s2.seq = self.s2.reverseComplement()
            self.yinc = -1

        # setup integer array version of sequence
        self.seqToInt()

        # one slot per position of the longest possible diagonal
        self.diagonal = [0 for _ in range(min(self.l1, self.l2))]
        self.window = window
        self.threshold = threshold

        # stat() histograms.  nrun is always positive

        if resetstat:
            self.nscore = 0
            self.nrun = 0
            for frame in self.frame:
                self.resetFrame(frame)

        return True

    def setupBokeh(self, cbase=None, clevels=None, creverse=None):
        """-----------------------------------------------------------------------------------------
        Set up four plots in a 2 x 2 grid, but with differing sizes. The figures are stored in
        self.figure and the grid in self.grid
            main is the dotplot itself, upper left
            legend is the panel for the colorbar legend, upper right
            scoredist is the panel for the window score distribution, lower left
            rundist is the panel for the run length distribution (log y axis), lower right

        The palette is chosen with setupPalette() and stored in self.palette.

        :param cbase: string, e.g. Greys, Blues, Reds, Viridis, etc
        :param clevels: int, usually 0-9 or 256
        :param creverse: boolean, if True highest color is dark
        :return: True
        -----------------------------------------------------------------------------------------"""

        # turn off MISSING_RENDERERS warning caused by plotting colorbars in empty plot
        silence(MISSING_RENDERERS, True)

        self.palette = self.setupPalette(cbase=cbase,
                                         clevels=clevels,
                                         creverse=creverse)

        # use the user supplied title if there is one, otherwise build one from the sequence
        # ids and today's date
        if self.title:
            titlestr = self.title
        else:
            now = date.today()
            titlestr = 'Dotplot of {} and {} - {}'.format(
                self.s1.id, self.s2.id, now)

        # axis labels: id then doc for x, doc then id for y
        xlabel = '\n'.join([self.s1.id, self.s1.doc])
        ylabel = '\n'.join([self.s2.doc, self.s2.id])

        # account for sequence length difference, ylen scaling affects main and legend panels
        # NOTE(review): divides by self.l1, so a sequence must be loaded before this is called
        xlen = 800
        ylen = xlen * self.l2 / self.l1

        # define each panel as a figure
        label = '({}, {}, score)'.format(self.s1.id, self.s2.id)
        TIPS = [(label, '($x{0}, $y{0}, @score)')]
        self.figure['main'] = figure(title=titlestr,
                                     x_axis_label=xlabel,
                                     y_axis_label=ylabel,
                                     height=int(ylen),
                                     width=int(xlen),
                                     align='center',
                                     tooltips=TIPS)

        # legend panel has the same height as the main panel
        self.figure['legend'] = figure(height=int(ylen), width=200)

        TIPS = [('score, number', '$x{0}, $y{0.00}')]
        self.figure['scoredist'] = figure(height=300, width=500, tooltips=TIPS)

        TIPS = [('length,count', '$x{0}, $y{0}')]
        self.figure['rundist'] = figure(height=300,
                                        width=500,
                                        y_axis_type='log',
                                        tooltips=TIPS)

        # grid layout: first row main and legend, second row the two distributions
        self.grid = layout([[self.figure['main'], self.figure['legend']],
                            [self.figure['scoredist'],
                             self.figure['rundist']]])

        return True

    def setupFrame(self, defs):
        """-----------------------------------------------------------------------------------------
        Create an empty data frame for every definition in defs.  Each definition is a dict with
            data - name of the data frame
            fn - callback function used to construct the data from a diagonal of scores
            var - names of the variables (columns) that will be populated

        The frame is stored as self.frame[name] = {var1: [], var2: [], ...} and the callback as
        self.function[name].

        :param defs: list of definitions, see above
        :return: int, number of frames defined
        -----------------------------------------------------------------------------------------"""
        count = 0
        for count, spec in enumerate(defs, start=1):
            name = spec['data']
            columns = {}
            self.frame[name] = columns
            self.function[name] = spec['fn']

            # every variable starts with its own empty list
            for variable in spec['var']:
                columns[variable] = []

        return count

    def resetFrame(self, framename):
        """-----------------------------------------------------------------------------------------
        Replace every column of one data frame with a new empty list.  Needed for reverse plots

        :param framename: string, name of a data frame in self.frame
        :return: True
        -----------------------------------------------------------------------------------------"""
        columns = self.frame[framename]
        # keys are copied first; each column gets its own new list
        for name in tuple(columns):
            columns[name] = []

        return True

    def setupPalette(self, cbase, clevels, creverse):
        """-----------------------------------------------------------------------------------------
        Look up a Bokeh palette by base name and number of levels.  If the lookup fails, a
        message is written to stderr and the default palette (Greys, 256 levels, reversed) is
        used instead.  The palette is returned in reverse order when creverse is true.

        :param cbase: string, e.g. Greys, Blues, Reds, Viridis, etc
        :param clevels: int, usually 0-9 or 256
        :param creverse: boolean, if True the order of the colors is reversed
        :return: the selected palette
        -----------------------------------------------------------------------------------------"""
        from bokeh.palettes import all_palettes

        # the fallbacks live here instead of in the signature so that they never change
        fallback_base = 'Greys'
        fallback_levels = 256
        fallback_reverse = True

        try:
            colors = all_palettes[cbase][clevels]
        except (KeyError, IndexError) as error:
            # unknown base or level count: fall back to the default palette and direction
            colors = all_palettes[fallback_base][fallback_levels]
            creverse = fallback_reverse
            warning = 'Diagonal::setupPalettes - {}, color {} levels {} is undefined.\n'
            sys.stderr.write(warning.format(error, cbase, clevels))
            sys.stderr.write('\tUsing default {}{}\n'.format(
                fallback_base, fallback_levels))

        # reverse the order of colors when requested
        return colors[::-1] if creverse else colors

    def seqToInt(self):
        """-----------------------------------------------------------------------------------------
        Translate the sequence strings of s1 and s2 through the a2i lookup and store the
        resulting lists in self.i1 and self.i2.  An integer array is more convenient for direct
        lookups in the scoring table than a string

        :return: int, int length of sequence lists
        -----------------------------------------------------------------------------------------"""
        lookup = self.a2i

        def encode(text):
            """Return the list of lookup values for the letters of text."""
            return [lookup[letter] for letter in text]

        self.i1 = encode(self.s1.seq)
        self.i2 = encode(self.s2.seq)

        return len(self.i1), len(self.i2)

    def rle2coord(self):
        """-----------------------------------------------------------------------------------------
        Convert the (offset, length) runs stored per diagonal in self.diagonal to beginning and
        ending positions in both sequences.

        :return: list with one [s1begin, s1end, s2begin, s2end] entry per run
        -----------------------------------------------------------------------------------------"""
        seq2len = self.l2
        runs = []

        for diag, segments in enumerate(self.diagonal):
            # position in each sequence where this diagonal begins
            base1 = max(diag - seq2len + 1, 0)
            base2 = max(seq2len - diag - 1, 0)

            for offset, length in segments:
                last1 = base1 + offset
                last2 = base2 + offset
                runs.append([last1 - length + 1, last1, last2 - length + 1, last2])

        return runs

    def diagLenBegin(self, diag):
        """-----------------------------------------------------------------------------------------
        Calculates the length of diagonal diag and the beginning position of the diagonal in
        each sequence

        :param diag: int, diagonal number
        :return: int (diagonal length), int (seq1 begin), int (seq2 begin)
        -----------------------------------------------------------------------------------------"""
        # positive shift: the diagonal begins inside sequence 1
        # negative shift: the diagonal begins inside sequence 2
        shift = diag - self.l2 + 1
        begin1 = shift if shift > 0 else 0
        begin2 = -shift if shift < 0 else 0

        # the diagonal ends when either sequence runs out
        length = min(self.l1 - begin1, self.l2 - begin2)

        return length, begin1, begin2

    def diagonalScore(self, d):
        """-----------------------------------------------------------------------------------------
        Calculate the moving window sum of comparison score along one diagonal and store in the
        object.  self.diagonal is zeroed and refilled in place; entry k holds the sum of the
        window that starts k positions along the diagonal.

        :param d: int, diagonal number
        :return: list, self.diagonal with the window scores, or [] if the diagonal is shorter
                 than the window
        -----------------------------------------------------------------------------------------"""
        diaglen, start1, start2 = self.diagLenBegin(d)

        seq1 = self.i1
        seq2 = self.i2
        width = self.window
        table = self.table
        scores = self.diagonal

        # skip diagonals shorter than window length
        if diaglen < width:
            return []

        # clear the values left over from the previous diagonal, keeping the same list object
        scores[:] = [0] * len(scores)

        # first window
        total = sum(table[seq1[start1 + k]][seq2[start2 + k]] for k in range(width))
        scores[0] = total

        # rest of diagonal: drop the pair leaving the window, add the pair entering it
        for step in range(1, diaglen - width + 1):
            leaving = step - 1
            entering = leaving + width
            total -= table[seq1[start1 + leaving]][seq2[start2 + leaving]]
            total += table[seq1[start1 + entering]][seq2[start2 + entering]]
            scores[step] = total

        return scores

    def random(self, n=10000):
        """-----------------------------------------------------------------------------------------
        Calculate a random score distribution using the current scoring table and window.
        Letters are drawn at random from i1 and i2, scored with the table, and summed over a
        moving window.  The result replaces self.diagonal.  With n = 0, n is set to l1 * l2.

        :param n: int, number of windows to calculate
        :return: list of n - window window sums
        -----------------------------------------------------------------------------------------"""
        width = self.window
        table = self.table
        seq1 = self.i1
        seq2 = self.i2

        if n == 0:
            n = self.l1 * self.l2

        nwindow = n - width
        self.diagonal = [0] * nwindow
        sums = self.diagonal

        # scores of the letter pairs currently inside the window, used as a ring buffer
        ring = []
        for _ in range(width):
            a = choice(seq1)
            b = choice(seq2)
            ring.append(table[a][b])
        total = sum(ring)

        slot = 0
        for index in range(nwindow):
            sums[index] = total

            # replace the oldest pair with a new random pair
            total -= ring[slot]
            a = choice(seq1)
            b = choice(seq2)
            fresh = table[a][b]
            total += fresh
            ring[slot] = fresh

            slot = (slot + 1) % width

        return sums

    def allDiagonals(self, select):
        """-----------------------------------------------------------------------------------------
        Iterate over all l1 + l2 - 1 diagonals and apply the selected callbacks to each one.
        For every diagonal, diagonalScore() is called first; diagonals for which it returns an
        empty result are skipped.  Each selected name is looked up in self.function and the
        callback is called as fxn(name, diagonal_number).

        :param select: list, names of dataframes to calculate from each diagonal
        :return: True
        -----------------------------------------------------------------------------------------"""
        function = self.function

        for d in range(self.l1 + self.l2 - 1):
            # the callbacks read the scores from self.diagonal, the return value is only used
            # to detect diagonals that were skipped
            if not self.diagonalScore(d):
                continue

            for data in select:
                # apply each selected function to this diagonal of scores to populate the
                # dataframes
                function[data](data, d)

        return True

    def windowThreshold(self, framename, d):
        """-----------------------------------------------------------------------------------------
        Callback function for allDiagonals.  Saves windows with score >= threshold in dataframe
        framename.  Works on the internally stored diagonal of scores calculated by
        diagonalScore().  Coordinates are those of the window center; when yinc is negative the
        y coordinate is counted down from the end of sequence 2.  self.nscore is incremented
        once per window examined.

        :param framename: string, name of a dataframe in self.frame
        :param d: int, diagonal number
        :return: True, or False if the diagonal is shorter than the window
        -----------------------------------------------------------------------------------------"""
        columns = self.frame[framename]
        scores = self.diagonal
        width = self.window
        half = (width - 1) / 2.0
        cutoff = self.threshold
        ystep = self.yinc

        diaglen, x, y = self.diagLenBegin(d)
        if diaglen < width:
            return False

        # shift from the window start to the window center
        x += half
        y = self.l2 - y - half - 1 if self.yinc < 0 else y + half

        for index in range(diaglen - width + 1):
            value = scores[index]
            if value >= cutoff:
                columns['x'].append(x)
                columns['y'].append(y)
                columns['score'].append(value)

            x += 1
            y += ystep
            self.nscore += 1

        return True

    def scaleColumn(self, framename, column_source, column_dest, value, scale):
        """-----------------------------------------------------------------------------------------
        Linearly map the values of one column onto a new range and store the result in another
        column of the same dataframe.  An input equal to value[0] maps to scale[0] and an input
        equal to value[1] maps to scale[1]; inputs outside that interval are extrapolated, not
        clipped.  The destination column is replaced by a new list.

        :param framename: string, a data frame in self.frame
        :param column_source: string, the column in frame to be scaled
        :param column_dest: string, name for the scaled column (in frame)
        :param value: tuple, low and high value for the input data
        :param scale: tuple, low and high value for the scaled data
        :return: None
        -----------------------------------------------------------------------------------------"""
        frame = self.frame[framename]
        source = frame[column_source]
        # the destination starts out as a fresh, empty column
        scaled = frame[column_dest] = []

        in_low = value[0]
        out_low = scale[0]
        slope = (scale[1] - out_low) / (value[1] - in_low)

        scaled.extend(out_low + (v - in_low) * slope for v in source)

        return

    def histogramScore(self, scoreframe, d):
        """-----------------------------------------------------------------------------------------
        Callback function for allDiagonals. Accumulates the score distribution in a dataframe.
        Works on the internally stored diagonal of scores (self.diagonal).

        In single mode (self.single is true) every value in self.diagonal is counted.  Otherwise
        only the first diaglen - window + 1 values are counted, where diaglen comes from
        diagLenBegin(d); a diagonal shorter than the window contributes nothing.

        :param scoreframe: string, name of dataframe in self.frame (columns 'score' and 'count')
        :param d: int, diagonal number
        :return: int, number of values in columns of dataframe
        -----------------------------------------------------------------------------------------"""
        scoreframe = self.frame[scoreframe]
        diagonal = self.diagonal

        if self.single:
            nwindow = len(diagonal)
        else:
            diaglen, _xpos, _ypos = self.diagLenBegin(d)
            # clamp at zero: a negative count would turn the slice below into "everything
            # except the last k values" and count scores that do not belong to this diagonal
            nwindow = max(0, diaglen - (self.window - 1))

        # tally the scores found on this diagonal
        tally = {}
        for s in diagonal[:nwindow]:
            tally[s] = tally.get(s, 0) + 1

        # merge into the data frame; rows stay in order of first appearance (not sorted)
        scores = scoreframe['score']
        counts = scoreframe['count']
        row = {}
        for i, s in enumerate(scores):
            # keep the first row for a score, like list.index() would
            row.setdefault(s, i)

        for s, n in tally.items():
            if s in row:
                counts[row[s]] += n
            else:
                row[s] = len(scores)
                scores.append(s)
                counts.append(n)

        return len(scores)

    def histogramRun(self, runframe, d):
        """-----------------------------------------------------------------------------------------
        Callback function for allDiagonals. Accumulates the run length distribution in a
        dataframe.  A run is a series of consecutive positions on the stored diagonal
        (self.diagonal) whose score is >= self.threshold.  Every position below the threshold
        closes the current run, so a below-threshold position that does not follow a run is
        tallied as a run of length zero.  A run still open at the end of the diagonal is
        tallied as well.

        :param runframe: string, name of dataframe in self.frame (columns 'len' and 'count')
        :param d: int, diagonal number
        :return: int, number of values in columns of dataframe
        -----------------------------------------------------------------------------------------"""
        table = self.frame[runframe]
        scores = self.diagonal
        window = self.window
        cutoff = self.threshold

        if self.single:
            npos = len(scores)
        else:
            diaglen, _xpos, _ypos = self.diagLenBegin(d)
            npos = diaglen - (window - 1)

        tally = {}
        current = 0
        for offset in range(npos):
            if scores[offset] >= cutoff:
                current += 1
                continue

            # below threshold: close the current run (possibly of length zero)
            tally[current] = tally.get(current, 0) + 1
            current = 0

        if current:
            # run that extends to the end of the diagonal
            tally[current] = tally.get(current, 0) + 1

        # merge into the data frame; rows are in order of first appearance, not sorted
        lengths = table['len']
        counts = table['count']
        for length, n in tally.items():
            if length in lengths:
                counts[lengths.index(length)] += n
            else:
                lengths.append(length)
                counts.append(n)

        return len(lengths)

    def sortFrame(self, frame, keyvar):
        """-----------------------------------------------------------------------------------------
        Reorder every column of a dataframe so that the rows are in ascending order of the key
        column.  The dataframe stored in self.frame is replaced by a new dictionary of new
        lists; the original lists are not modified.
        TODO should this also return the min and max values?

        :param frame: string, name of a dataframe in self.frame
        :param keyvar: string, name of the column to sort by
        :return: True
        -----------------------------------------------------------------------------------------"""
        table = self.frame[frame]
        keys = table[keyvar]

        # row order that sorts the key column; the same order is applied to every column
        order = sorted(range(len(keys)), key=keys.__getitem__)

        self.frame[frame] = {
            name: [values[i] for i in order]
            for name, values in table.items()
        }
        return True

    def bdot(self,
             dataname,
             figurename,
             width=1,
             color=1,
             mode='dot',
             set_colormap=True):
        """-----------------------------------------------------------------------------------------
        Bokeh plot of dots in the main panel, and colorbar in the legend panel

        The dataframe is modified: a 'size' column is always (re)written, and when color is not
        1 the 'score' column is overwritten with the maximum score so that every marker maps to
        the same color.

        :param dataname: string, name of a dataframe in self.frame; needs columns x, y and score
        (plus x1 and y1 when mode is not 'dot')
        :param figurename: string, a figure defined in setupBokeh and stored in self.figure
        :param width: 1 to scale the marker size by the score, any other value to draw all
        markers with self.mindotsize
        :param color: 1 to scale the color of the markers by the score, any other value for a
        single color; any true value adds a color bar to the legend figure
        :param mode: string, if dot use the circle renderer, otherwise segment renderer
        :param set_colormap: boolean, set the colormap based on score range, turn off for second
        plot to use the same scale (reuses self.cmap)
        :return: True
        -----------------------------------------------------------------------------------------"""
        data = self.frame[dataname]
        figure = self.figure[figurename]
        legend = self.figure['legend']
        threshold = self.threshold
        alpha = self.alpha

        scoremin, scoremax = self.valueMinMax(data['score'])

        if width == 1:
            # scale the frame that is actually being plotted; this used to be hard-coded to
            # the 'dots' frame, which left any other frame without (or with stale) sizes
            self.scaleColumn(dataname, 'score', 'size',
                             (threshold - 1, scoremax),
                             (self.mindotsize, self.maxdotsize))
        else:
            data['size'] = [self.mindotsize] * len(data['score'])

        if color != 1:
            # single color: give every marker the same (maximum) score
            data['score'] = [scoremax] * len(data['score'])

        if set_colormap:
            if color == 1:
                cmap = LinearColorMapper(self.palette,
                                         low=max(threshold - 1.0,
                                                 scoremin - 1),
                                         high=scoremax)
            else:
                # narrow range just below the threshold so that all scores land at the top of
                # the palette
                cmap = LinearColorMapper(self.palette,
                                         low=threshold - 0.1,
                                         high=threshold)
            # remember the mapper so a later plot can share the scale
            self.cmap = cmap
        else:
            cmap = self.cmap

        source = ColumnDataSource(data)
        if mode == 'dot':
            figure.circle(source=source,
                          x='x',
                          y='y',
                          size='size',
                          line_color=transform('score', cmap),
                          line_alpha=alpha,
                          fill_color=transform('score', cmap),
                          fill_alpha=alpha)

        else:
            # line mode
            figure.segment(source=source,
                           x0='x',
                           x1='x1',
                           y0='y',
                           y1='y1',
                           line_width='size',
                           line_color=transform('score', cmap),
                           alpha=alpha)

        # color bar is in a separate window, self.legend, so it doesn't disturb the
        # aspect ratio
        if color:
            color_bar = ColorBar(color_mapper=cmap,
                                 label_standoff=3,
                                 bar_line_color='black',
                                 scale_alpha=alpha,
                                 width=20,
                                 margin=0,
                                 location=(0, 0),
                                 major_tick_in=20,
                                 major_tick_out=5,
                                 major_tick_line_color='black')

            legend.add_layout(color_bar, 'left')

        return True

    def bscoreDist(self, figurename, dataname, color):
        """-----------------------------------------------------------------------------------------
        Bokeh bar plot of the score distribution: one bar per row of the dataframe, positioned
        by the 'score' column with height given by the 'count' column.  The y range of the
        figure is set to run from zero to 10% above the largest count.

        :param figurename: string, name of figure (stored in self.figure)
        :param dataname: string, name of data frame (stored in self.frame)
        :param color: string, any valid Bokeh color, used to fill bars
        :return: True
        -----------------------------------------------------------------------------------------"""
        data = self.frame[dataname]
        figure = self.figure[figurename]

        _, peak = self.valueMinMax(data['count'])

        # observed score density
        bars = dict(x='score',
                    top='count',
                    width=0.8,
                    color=color,
                    line_color='black',
                    alpha=self.alpha,
                    bottom=0.0)
        figure.vbar(source=ColumnDataSource(data), **bars)

        # leave some headroom above the tallest bar
        figure.y_range = Range1d(0.0, peak * 1.1)

        return True

    def brunDist(self, figurename, dataname, color):
        """-----------------------------------------------------------------------------------------
        Bokeh bar plot of the run length distribution: one bar per row of the dataframe,
        positioned by the 'len' column with height given by the 'count' column.

        :param figurename: string, name of figure (stored in self.figure)
        :param dataname: string, name of data frame (stored in self.frame)
        :param color: string, any valid Bokeh color, used to fill bars
        :return: True
        -----------------------------------------------------------------------------------------"""
        figure = self.figure[figurename]
        source = ColumnDataSource(self.frame[dataname])

        # bars start at 0.1 instead of zero, presumably because the figure uses a log y axis
        # where zero cannot be drawn -- confirm against the figure setup
        bars = dict(x='len',
                    top='count',
                    width=0.8,
                    color=color,
                    line_color='black',
                    alpha=self.alpha,
                    line_width=0.5,
                    bottom=0.1)
        figure.vbar(source=source, **bars)

        return True

    def bscoreCumulative(self, figurename, dataname):
        """-----------------------------------------------------------------------------------------
        Add the cumulative distribution to a figure as a line plotted against a second y axis
        (range 0.0 to 1.0) placed on the right hand side.  The dataframe must have 'score' and
        'cumulative' columns.  The band between 0.95 and 1.0 on the cumulative axis is shaded.

        :param figurename: string, name of figure (stored in self.figure)
        :param dataname: string, name of data frame (stored in self.frame)
        :return: True
        -----------------------------------------------------------------------------------------"""
        figure = self.figure[figurename]
        source = ColumnDataSource(self.frame[dataname])
        axisname = 'cumulative'

        # second y axis, on the right, for the cumulative fraction
        figure.extra_y_ranges = {axisname: Range1d(start=0.0, end=1.0)}
        right_axis = LinearAxis(y_range_name=axisname)
        right_axis.ticker.num_minor_ticks = 10
        figure.add_layout(right_axis, 'right')

        figure.line(source=source,
                    x='score',
                    y='cumulative',
                    y_range_name=axisname,
                    line_width=2,
                    color='#1122cc')

        # shaded box showing 95% level
        shaded = BoxAnnotation(bottom=0.95,
                               top=1.0,
                               y_range_name=axisname,
                               fill_color='#FFBBBB',
                               line_width=3,
                               line_dash='dashed')
        figure.add_layout(shaded)

        return True

    def writeFrame(self, framename, key='x', out=sys.stdout):
        """-----------------------------------------------------------------------------------------
        Write a dataframe as a tab-delimited table.  The output starts with a blank line and a
        title line, followed by a header line with the column names and one line per row.  The
        key column comes first, the other columns follow in dataframe order, and every field is
        preceded by a tab.  The number of rows is the length of the key column.

        TODO figure out how to format values more nicely

        :param framename: string, name of a dataframe in self.frame
        :param key: string, name of column to use as key (first column in table)
        :param out: open output file, must already be open for writing
        :return: True
        -----------------------------------------------------------------------------------------"""
        frame = self.frame[framename]

        # key column first, then the remaining columns in their stored order
        columns = [key] + [name for name in frame if name != key]

        out.write('\n{} dataframe\n'.format(framename))
        out.write(''.join('\t{}'.format(name) for name in columns) + '\n')

        for row in range(len(frame[key])):
            fields = ('\t{}'.format(frame[name][row]) for name in columns)
            out.write(''.join(fields) + '\n')

        return True

    def show(self, *args, **kwargs):
        """-----------------------------------------------------------------------------------------
        Display the plot by passing self.grid, followed by any extra arguments, to the
        module-level show() function.  Makes syntax a little easier in the application since
        the object is used instead of the plotting library.

        NOTE(review): show() is presumably Bokeh's show, since the plotting methods of this
        class use Bokeh renderers -- confirm against the imports at the top of the file.

        :param args: positional arguments to pass on to show()
        :param kwargs: keyword arguments to pass on to show()
        :return: True
        -----------------------------------------------------------------------------------------"""
        # inside the method body the bare name show resolves to the module-level function,
        # not to this method
        show(self.grid, *args, **kwargs)

        return True

    @staticmethod
    def cumulative(score, total):
        """-----------------------------------------------------------------------------------------
        Return cumulative score probability distribution as a list.

        :param score: list
        :param total: int, number of observations
        :return: list
        -----------------------------------------------------------------------------------------"""
        cumulative = []
        wsum = 0
        for i in range(len(score)):
            wsum += score[i] / total
            cumulative.append(wsum)

        return cumulative

    def addCumulative(self, data, sourcecol, destcol):
        """-----------------------------------------------------------------------------------------
        Add a cumulative distribution column to a dataframe.  The new column holds the running
        sum of the source column divided by the sum of the whole source column, so for positive
        values the last entry is 1.0.

        :param data: string, name of a dataframe in self.frame
        :param sourcecol: string, column name in self.frame[data]
        :param destcol: string, new column name for cumulative distribution
        :return: True
        -----------------------------------------------------------------------------------------"""
        table = self.frame[data]

        # running totals first, the grand total is only known after the last value
        running = 0
        partial = []
        for v in table[sourcecol]:
            running += v
            partial.append(running)

        table[destcol] = [p / running for p in partial]
        return True

    def addSegment(self, framename, xcol='x', ycol='y', xnew='x1', ynew='y1'):
        """-----------------------------------------------------------------------------------------
        Convert x, y dot positions to line segments; the segment renderer requires beginning and
        ending points for each segment.  Each dot becomes a segment centered on the dot that
        extends half a unit in each direction: the existing x and y columns are modified in
        place to hold the beginning of the segment, and new columns (xnew and ynew) are added
        for the end points.  The y direction follows self.yinc.

        :param framename: string, name of a dataframe in self.frame
        :param xcol: string, name of x column in data frame
        :param ycol: string, name of y column in data frame
        :param xnew: string, name of new x column in data frame (end of segment)
        :param ynew: string, name of new y column in data frame (end of segment)
        :return: True
        -----------------------------------------------------------------------------------------"""
        frame = self.frame[framename]
        x_end = frame[xnew] = []
        y_end = frame[ynew] = []

        # correct the direction when sequence 2 is reversed
        half = 0.5
        y_begin_shift = -half * self.yinc
        y_end_shift = half * self.yinc

        x_begin = frame[xcol]
        y_begin = frame[ycol]
        for pos in range(len(x_begin)):
            x = x_begin[pos]
            y = y_begin[pos]
            x_end.append(x + half)
            y_end.append(y + y_end_shift)
            x_begin[pos] = x - half
            y_begin[pos] = y + y_begin_shift

        return True

    @staticmethod
    def density(score, total):
        """-----------------------------------------------------------------------------------------
        Convert a list representing the score distribution to a density by dividing by total
        :param score: list of int or float
        :param total: total number of scores (sum(score))
        :return:
        -----------------------------------------------------------------------------------------"""
        maxp = 0.0
        for i in range(len(score)):
            score[i] /= total
            maxp = max(maxp, score[i])

        return maxp

    @staticmethod
    def scoreMinMax(score):
        """-----------------------------------------------------------------------------------------
        Returns the first and last non-zero positions in a list of scores.  Use to get ranges for
        score histograms

        :param score: list
        :return: int, int
        -----------------------------------------------------------------------------------------"""
        scoremin = None
        scoremax = None
        for i in range(len(score)):
            if score[i] > 0:
                if scoremin is None:
                    scoremin = i
                scoremax = i

        return scoremin, scoremax

    @staticmethod
    def valueMinMax(score):
        """-----------------------------------------------------------------------------------------
        Returns the minimum and maximum value in a list of values.

        :param score: list
        :return: float, float
        -----------------------------------------------------------------------------------------"""
        scoremin = score[0]
        scoremax = score[0]
        for s in score:
            scoremin = min(scoremin, s)
            scoremax = max(scoremax, s)

        return scoremin, scoremax
Example #19
0
    peptide_maxlen = 20000
    args = arguments_get()
    sys.stderr.write('\nfasta_getorfs - Get ORFs from transcript sequences\n')
    sys.stderr.write('\tinput transcript file: {}\n'.format(
        args.transcript.name))
    sys.stderr.write('\toutput ORF Fasta file: {}\n'.format(args.fasta.name))
    if args.tabular:
        sys.stderr.write('\toutput ORF tabular file: {}\n'.format(
            args.tabular.name))
    sys.stderr.write('\tORF histogram file: {}\n'.format(args.histogram.name))
    sys.stderr.write('\tminimum ORF length: {}\n'.format(args.minlen))
    if args.longest_only:
        sys.stderr.write('\tOnly longest ORFs will be reported\n')
    sys.stderr.write('\n')

    fasta = Fasta(fh=args.transcript)
    nsequence = 0
    npeptide = 0
    npeptide_total = 0

    # initialize lists for histograms
    # lenhist is for all ORFs
    # longhist is for the longest ORF in each transcript

    lenhist = [0 for _ in range(peptide_maxlen)]
    lentotal = 0

    longhist = [0 for _ in range(peptide_maxlen)]
    longtotal = 0

    n = None
# default target file name
target = '*.fasta'
if len(sys.argv) > 1:
    target = sys.argv[1]
sys.stderr.write('  target file: {}\n\n'.format(target))

# read all the sequences and store in dictionary
unique_seq = {}
n_file = 0
n_perfile = 0
n_uniqueperfile = 0
n_total = 0
n_unique_total = 0
sys.stderr.write('{}\t{}\t\t{}\n'.format('file', 'per file', 'total'))
for fastafile in glob.glob(target):
    fasta = Fasta()
    fasta.open(fastafile)
    n_file += 1
    n_perfile = 0
    while fasta.next():
        n_perfile += 1
        if fasta.id in unique_seq:
            continue
        else:
            n_uniqueperfile += 1
            unique_seq[fasta.id] = fasta.format(linelen=100)

    n_total += n_perfile
    n_unique_total += n_uniqueperfile
    sys.stderr.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(n_file, fastafile,
                                                       n_perfile,
Example #21
0
        #     l1 = len(i1)
        #     l2 = len(i2)
        #
        #     score = bestscore
        #     ipos, jpos = bestpos
        #     while score > 0:
        target


# --------------------------------------------------------------------------------------------------
# main
# --------------------------------------------------------------------------------------------------
if __name__ == '__main__':
    align = Alignment()

    align.s1 = Fasta(filename=sys.argv[1])
    align.s2 = Fasta(filename=sys.argv[2])
    align.readNCBI('../../dotplot/table/BLOSUM62.matrix')

    # testing
    # align.s1 = Fasta()
    # align.s1.seq = 'ACTGCC'
    # align.s2 = Fasta()
    # align.s2.seq = 'ATGCC'
    # align.readNCBI('../../dotplot/table/NUC4.4.matrix')

    align.seqToInt()
    # random.shuffle(align.i1)          # uncomment to test scores for random alignments
    original_score, bestpos = align.localScore(-10, -1)
    print('original score: {} at {}'.format(original_score, bestpos))
    beginscore, beginpos = align.localReverse(-10, -1, original_score, bestpos)
Example #22
0
        base = base.replace('.seq', '')
        sys.stdout.write('\n\tExpanded file: {}\n\tbasename: {}\n'.format(
            infilename, base))
        outfilename = base + '.fasta'
        outfile = None
        try:
            outfile = open(outfilename, 'w')
        except:
            sys.stderr.write(
                'Unable to open output file ({})\n'.format(outfilename))
            exit(2)

        # process all sequences in the file
        n = 0
        for seq in infile:
            fasta = Fasta()
            fasta.id = base + '_{}'.format(n)
            fasta.seq = seq.rstrip().upper()
            fasta.doc = 'length={}'.format(fasta.length())
            outfile.write(fasta.format(linelen=100))
            n += 1

        infile.close()
        outfile.close()
        sys.stdout.write('\t{} sequences written to {}\n'.format(
            n, outfilename))

    # end of loop over files

exit(0)
Example #23
0
    for job in delete_list:
        del joblist[job]

    return text


# ==================================================================================================
# Main
# ==================================================================================================

args = arguments_get()
args.logfile.write('\ninterpro_batch - interproscan of ORF sequences\n')
args.logfile.write('\tinput ORF file: {}\n'.format(args.fasta_in.name))
args.logfile.write('\tminimum ORF length: {}\n\n'.format(args.minlen))

fasta = Fasta(fh=args.fasta_in)

# The job list keeps track of the ips object that have been created and their current status
# the joblist is a dictionary where the ips object is the key and the value is a status string
joblist = {}

# create a template for the jobs.  The template is an interpro object with the metadata added
template = Interpro(loglevel=1)
template.log_fh = args.logfile
template.email = '*****@*****.**'
template.application_select(['Pfam', 'Panther', 'SignalP'])
template.output_select = 'json'
template.poll_time = 60
template.poll_max = 100

sequence_limit = 20
Example #24
0
cl = commandline.parse_args()
maxbases = cl.maxbases
outbase = cl.prefix
outsuffix = cl.suffix
outsuffix = outsuffix.lstrip('.')  # remove leading . if present
trim = cl.trim

print('\nsplit.py - split fasta file into chunks')
print("    fasta file:", cl.fasta_file.name)
print("    maximum characters:", maxbases)
print("    output prefix:", outbase)
print("    output suffix:", outsuffix)
print("    doc trimmer:", trim)
print('')

fasta = Fasta()
fasta.fh = cl.fasta_file

trimre = re.compile(trim)

# initialize counters
base_total = 0
base_current = 0
n_out = 0
n_seq = 0
n_current = 0

while fasta.next():

    if trimre: fasta.trimDocByRegex(trimre)
    if not n_seq or base_current + fasta.length() > maxbases:
Example #25
0
            counts, bases = feature.total()
        -----------------------------------------------------------------------------------------"""
        total_count = 0
        total_len = 0
        for lspace in self.space:
            total_count += self.space[lspace]
            total_len += self.space[lspace] * int(lspace)

        return total_count, total_len


# ==================================================================================================
# main/test
# ==================================================================================================
if __name__ == '__main__':
    # simple test driver: count spacings of the feature "A.C" in the sequences of the fasta
    # file named on the command line, then report the totals
    fasta = Fasta(file=sys.argv[1])
    feature = Feature()
    feature.feature = "A.C"

    c = 0
    while fasta.next():
        count = feature.count_space(fasta)
        print(count)
        c += 1
        # stop once more than 100 sequences have been processed
        if c > 100:
            break

    fcount, flen = feature.total()
    # avoid ZeroDivisionError when nothing was counted (e.g. empty input file)
    avg = flen / fcount if fcount else 0.0
    print('count:{}  len:{}  avg:{}'.format(fcount, flen, avg))

exit(0)
Example #26
0
                # end of a run at end of diagonal (do not subtract because the end position is
                # the true end)
                filtered.append([pos, runlen])

            diagonal[d] = filtered

        return nmatch


# --------------------------------------------------------------------------------------------------
# Testing
# --------------------------------------------------------------------------------------------------
if __name__ == '__main__':
    print('\ntest 0: identity matching')
    print('\texpect 7 matches\n')
    fasta = Fasta()
    fasta.id = 'test0'
    fasta.doc = '5 letter DNA test'
    fasta.seq = 'ACAGT'
    print('{}\n'.format(fasta.format()))

    match = Match()
    match.s1 = fasta
    match.s2 = fasta
    nmatch = match.identityPos()
    print('matches: {}'.format(nmatch))

    print('\ntest 1: identity matching, unequal length sequences')
    print('\texpect 11 matches\n')
    match = Match()
Example #27
0
    # idlist will be an empty list if none is provided
    idlist = get_id_list(args)

    # read the sequences and store all that match the IDs
    # duplicates in sequence files will be stored twice
    n_match = {}  # per file number of sequences in list
    n_notmatch = {}  # per file number of sequences not in list
    n_sequence = {}  # per file number of sequences
    n_found = {}  # per ID, number of times found in all files
    n_file = 0
    n_total = 0
    n_written = 0
    out = sys.stdout
    for fastafile in glob.glob(args.input_filename):
        fasta = Fasta()
        fasta.open(fastafile)
        if args.outsuffix:
            outfile = os.path.basename(fastafile) + f'{args.outsuffix}'
            out = opensafe(outfile, 'w')
            if not out:
                # if file can't be opened use stdout
                out = sys.stdout

        n_sequence[fastafile] = 0
        n_match[fastafile] = 0
        n_notmatch[fastafile] = 0
        n_file += 1

        while fasta.next():
            n_sequence[fastafile] += 1
Example #28
0
Michael Gribskov     20 April 2021
================================================================================================="""
import time
import pymongo
from sequence.fasta import Fasta

# --------------------------------------------------------------------------------------------------
# main program
# --------------------------------------------------------------------------------------------------

mongo = pymongo.MongoClient("mongodb://localhost:27017/")
biocomputing = mongo['biocomputing']
biocomputing.drop_collection('phage')
phage = biocomputing['phage']

fasta = Fasta('C:/Users/michael/Desktop/phage.fa')

fasta_start_time = time.perf_counter()
nseq = 0
all = []
while fasta.next():
    nseq += 1
    # if not nseq % 10001:
    #     break
    all.append({
        '_id': fasta.id,
        'documentation': fasta.doc,
        'sequence': fasta.seq
    })

fasta_end_time = time.perf_counter()