Ejemplo n.º 1
0
 def test_reduce_msa(self):
     msa = MSA(read_msa(test_data('test_reduce.msa')))
     reduced_alignment = reduce_alignment(msa.alignment)
     for i, line in enumerate(reduced_alignment):
         assert len(line) == 4 and \
                 ''.join(line) == ''.join(
                         msa.alignment[i])[:msa.alignment[i].index('(')]
Ejemplo n.º 2
0
def test_reduce_msa(test_data):
    msa = MSA(read_msa(str(test_data / 'test_reduce.msa')))
    reduced_alignment = reduce_alignment(msa.alignment)
    for i, line in enumerate(reduced_alignment):
        assert len(line) == 4 and \
                ''.join(line) == ''.join(
                        msa.alignment[i])[:msa.alignment[i].index('(')]
Ejemplo n.º 3
0
    def test_msa2str(self):
        aranger = '{body}{meta}'

        # read msa traditionally into an object
        msa_a = MSA(test_data('harry.msa'))

        # read msa from dictionary
        msa_b = qlc.read_msa(test_data('harry.msa'))

        # read msa with IDs
        msa_c = qlc.read_msa(test_data('harry_with_ids.msa'),
                             ids=True,
                             header=False)

        # we adjust the dataset and the seq_id since otherwise we won't have
        # similar output
        msa_c['seq_id'] = 'test'
        msa_c['dataset'] = 'file'

        # when converting these different objects to string with the same body
        # and the like, they should be identical, so we check this here
        str_a = msa2str(msa_a, _arange=aranger)
        str_b = msa2str(msa_b, _arange=aranger)
        str_c = msa2str(msa_c, _arange=aranger, wordlist=False)

        assert str_a == str_b == str_c

        # we next test for converting with the merging attribute
        str_d = msa2str(msa_c, _arange=aranger, wordlist=True, merge=True)
        str_e = msa2str(msa_c, _arange=aranger, wordlist=True, merge=False)

        # remove tabstops for checking similar strings
        str_d_st = str_d.replace('\t', '')
        str_e_st = str_e.replace('\t', '')

        # get index until 'COLUMN'
        idx = str_d_st.index('COLUMNID')
        assert str_d != str_e and str_d_st[:idx] == str_e_st[:idx]

        # add a consensus string to all msa objects
        consensus_a = get_consensus(MSA(msa_b), gaps=True)
        consensus_b = get_consensus(MSA(msa_c), gaps=True)

        msa_b['consensus'] = consensus_a
        msa_c['consensus'] = consensus_b

        assert msa2str(msa_b) == msa2str(msa_c, wordlist=False)
Ejemplo n.º 4
0
def msa2tex(infile, template='', filename='', **keywords):
    """
    Convert an MSA to a tabular representation which can easily be used in
    LaTeX documents.
    """
    util.setdefaults(keywords, pid_mode=1)

    # while alm-format can be read from the text-file without problems,
    # msa-format should be loaded first (once this is already provided), the
    # loss in speed won't matter much, since output of data is not a daily task
    # load msa
    msa = read_msa(infile)

    ## load templates
    tex = util.read_text_file(template or template_path('msa.tex'))

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    dataset = msa['dataset']
    infile = msa['infile']
    seq_id = msa['seq_id']

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    height = len(msa['alignment'])
    width = len(msa['alignment'][0])

    start = r'\tabular{l' + width * 'c' + '}\n'
    start += r'\bf\ttfamily Taxon & \multicolumn{' + str(
        width) + r'}{l}{\bf\ttfamily Alignment}\\' + '\n'

    # check for swaps in the alignment
    if 'swaps' in msa:
        swaps = []
        for s in msa['swaps']:
            swaps.extend(s)
    else:
        swaps = []

    body = start
    for i, taxon in enumerate(msa['taxa']):
        body += r'\ttfamily ' + taxon.replace('_', r'\_')
        for j, char in enumerate(msa['alignment'][i]):
            if char != '-':
                cls = token2class(char, rcParams['dolgo'])
            elif char == '-':
                cls = 'X'
            if char == '_':
                char = r'\#'
            if cls == '_':
                cls = '2'
            if j not in swaps:
                body += r'&\cellcolor{col' + cls + r'}' + char
            else:
                if char != '-':
                    body += r'&\cellcolor{col' + cls + r'}\color{white}\bf ' + char
                else:
                    body += r'&\cellcolor{col' + cls + r'}\bf ' + char
        body += r'\\' + '\n'

    body += r'&' + '&'.join([r'\color{white}XXX'
                             for i in range(width)]) + r'\\' + '\n'
    body += r'\endtabular' + '\n'

    # create the parameters etc.
    w = 1.5 * width + taxl * 0.25
    h = 0.5 * height + 1.0

    tex = tex.replace('<+WIDTH+>', '{0:2f}'.format(w))
    tex = tex.replace('<+HEIGHT+>', '{0:2f}'.format(h))

    # create the rput stuff
    tex = tex.replace('<+NEWX+>', '{0:.2f}'.format(w / 2.0))
    tex = tex.replace('<+NEWY+>', '{0:.2f}'.format((h - 0.5) / 2.0))

    # insert the rest
    tex = tex.replace('<+CONTENT+>', body)

    # write to file
    if not filename:
        filename = 'lingpy-{0}'

    util.write_text_file(filename + '.tex', tex)
Ejemplo n.º 5
0
def msa2html(msa, shorttitle='', filename='', template='', **keywords):
    """
    Convert files in ``msa``-format into colored ``html``-format.

    Parameters
    ----------
    msa : dict
        A dictionary object that contains all the information of an MSA object.

    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``SCA`` will be used.

    filename : str (default="")
        Define the name of the output file. If no name is defined, the name of
        the input file will be taken as a default.

    template : str (default="")
        The path to the template file. If no name is defined, the basic
        template will be used. The basic template currently used can be found
        under ``lingpy/data/templates/msa2html.html``.

    Examples
    --------
    Load the libary.

    >>> from lingpy import *
    
    Load an ``msq``-file from the test-sets.

    >>> msa = MSA('harry.msq')

    Align the data progressively and carry out a check for swapped sites.

    >>> msa.prog_align()
    >>> msa.swap_check()
    >>> print(msa)
    w    o    l    -    d    e    m    o    r    t
    w    a    l    -    d    e    m    a    r    -
    v    -    l    a    d    i    m    i    r    -

    Save the data to the file ``harry.msa``.

    >>> msa.output('msa',filename='harry')

    Save the ``msa``-object as ``html``.

    >>> msa.output('html',filename='harry')
    
    Notes
    -----
    The coloring of sound segments with respect to the sound class they belong
    to is based on the definitions given in the ``color``
    :py:class:`~lingpy.data.model.Model`. It can easily be changed and adapted.
    

    See also
    --------
    lingpy.convert.html.alm2html
    """
    util.setdefaults(
        keywords,
        pid_mode=1,
        stress=rcParams['stress'],
        css=False,
        js=False,
        compact=False,
        class_sort=True,
        write_to_file=True,
    )

    # while alm-format can be read from the text-file without problems,
    # msa-format should be loaded first (once this is already provided), the
    # loss in speed won't matter much, since output of data is not a daily task

    # load templates
    template = template or template_path('msa2html.html')
    if template == 'js':
        template = template_path('msa2html.js.html')
    html = util.read_text_file(template)
    css = util.read_text_file(keywords['css'] or template_path('msa.css'))
    js = util.read_text_file(keywords['js'] or template_path('msa.js'))

    # treat the msa-object as a file and try to load the file if this is the
    # case
    if isinstance(msa, string_types):
        msa = read_msa(msa, **keywords)
    else:
        raise ValueError('[!] No filename specified.')

    # load dataset, etc.
    dataset = msa['dataset']

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    infile = msa['infile']
    seq_id = msa['seq_id']

    # define the titles etc.
    if not shorttitle:
        shorttitle = 'SCA'

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    # format css file
    css = css.replace('TAXON_LENGTH', str(taxl * 10))

    out = ''
    tr = '<tr class="msa" unique="{1}" taxon={2} sequence={3}>{0}</tr>\n'
    td_taxon = '<td class="taxon">{0}</td>'
    perc = int(80 / len(msa['alignment'][0]) + 0.5)
    td_residue = '<td class="residue {1}">{0}</td>'
    td_swap = '<td class="residue swap {1}">{0}</td>'
    td_unaligned = '<td class="residue noalign {1}">{0}</td>'

    # check for swaps in the alignment
    if 'swaps' in msa:
        swaps = []
        for s in msa['swaps']:
            swaps.extend(s)
    else:
        swaps = []

    # check for
    local = ['*'] * len(msa['alignment'][0])
    if 'local' in msa:
        local = ['.'] * len(msa['alignment'][0])
        for i in msa['local']:
            local[i] = '*'

    # get two sorting schemas for the sequences
    if keywords['class_sort']:

        classes = [
            tokens2class(ipa2tokens(seq), rcParams['asjp'])
            for seq in msa['seqs']
        ]
        seqs = dict([
            (a[1], b) for a, b in zip(
                sorted(
                    zip(classes, msa['seqs']),
                    key=lambda x: x[0]  # list(zip(x[0],x[1]))
                ),
                range(1,
                      len(msa['seqs']) + 1))
        ])
    else:
        seqs = dict(zip(sorted(msa['seqs']), range(1, len(msa['seqs']) + 1)))
    taxa = dict(zip(sorted(msa['taxa']), range(1, len(msa['taxa']) + 1)))

    # set up a list to store unique alignments
    alignments = []

    # start iteration
    for i, taxon in enumerate(msa['taxa']):
        tmp = ''
        tmp += td_taxon.format(taxon)

        # append alignment to alignments
        alignment = ''.join(msa['alignment'][i])
        sequence = msa['seqs'][i]
        if alignment in alignments:
            unique = 'false'
        else:
            unique = 'true'
            alignments += [alignment]

        for j, char in enumerate(msa['alignment'][i]):
            if char == '-':
                d = 'dolgo_GAP'
                c = '#bbbbbb'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])
                c = token2class(char, rcParams['_color'])

                # bad check for three classes named differently
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'

            if j in swaps:
                tmp += td_swap.format(char, d)
            elif local[j] != '*':
                tmp += td_unaligned.format(char, d)
            else:
                tmp += td_residue.format(char, d)
        out += tr.format(tmp, unique, taxa[taxon], seqs[sequence])

    html = html.format(
        table=out,
        dataset=dataset,
        pid=pid_score,
        file=infile,
        sequence=seq_id,
        shorttitle=shorttitle,
        width=len(msa['alignment'][0]),
        table_width='{0}'.format(len(msa['alignment'][0]) * 50 + 8 * taxl),
        taxa=len(msa['alignment']),
        uniseqs=len(set(msa['seqs'])),
        css=css,
        js=js)

    if not filename:
        filename = rcParams['filename']

    if not filename.endswith('.html'):
        filename = filename + '.html'

    if keywords['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    if keywords['write_to_file']:
        # check, whether the outfile already exists
        util.write_text_file(filename, html)
    else:
        return html
Ejemplo n.º 6
0
def test_normalize_alignment(test_data):
    msa = MSA(read_msa(str(test_data / 'harry_unnormal.msa')))

    for line in msa.alignment[1:]:
        assert len(line) == len(msa.alignment[0])
Ejemplo n.º 7
0
def test_read_msa(test_data):
    msa = MSA(read_msa(str(test_data / 'harry.msa')))
    assert hasattr(msa, 'seqs')
Ejemplo n.º 8
0
def msa2tex(
    infile,
    template='',
    filename='',
    **keywords
):
    """
    Convert an MSA to a tabular representation which can easily be used in
    LaTeX documents.
    """
    util.setdefaults(keywords, pid_mode=1)

    # while alm-format can be read from the text-file without problems,
    # msa-format should be loaded first (once this is already provided), the
    # loss in speed won't matter much, since output of data is not a daily task
    # load msa
    msa = read_msa(infile)

    ## load templates
    tex = util.read_text_file(template or template_path('msa.tex'))

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    dataset = msa['dataset']
    infile = msa['infile']
    seq_id = msa['seq_id']

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    height = len(msa['alignment'])
    width = len(msa['alignment'][0])

    start = r'\tabular{l' + width * 'c' + '}\n'
    start += r'\bf\ttfamily Taxon & \multicolumn{' + str(
        width) + r'}{l}{\bf\ttfamily Alignment}\\' + '\n'

    # check for swaps in the alignment
    if 'swaps' in msa:
        swaps = []
        for s in msa['swaps']:
            swaps.extend(s)
    else:
        swaps = []

    body = start
    for i, taxon in enumerate(msa['taxa']):
        body += r'\ttfamily ' + taxon.replace('_', r'\_')
        for j, char in enumerate(msa['alignment'][i]):
            if char != '-':
                cls = token2class(char, rcParams['dolgo'])
            elif char == '-':
                cls = 'X'
            if char == '_':
                char = r'\#'
            if cls == '_':
                cls = '2'
            if j not in swaps:
                body += r'&\cellcolor{col' + cls + r'}' + char
            else:
                if char != '-':
                    body += r'&\cellcolor{col' + cls + r'}\color{white}\bf ' + char
                else:
                    body += r'&\cellcolor{col' + cls + r'}\bf ' + char
        body += r'\\' + '\n'

    body += r'&' + '&'.join([r'\color{white}XXX' for i in range(width)]) + r'\\' + '\n'
    body += r'\endtabular' + '\n'

    # create the parameters etc.
    w = 1.5 * width + taxl * 0.25
    h = 0.5 * height + 1.0

    tex = tex.replace('<+WIDTH+>', '{0:2f}'.format(w))
    tex = tex.replace('<+HEIGHT+>', '{0:2f}'.format(h))

    # create the rput stuff
    tex = tex.replace('<+NEWX+>', '{0:.2f}'.format(w / 2.0))
    tex = tex.replace('<+NEWY+>', '{0:.2f}'.format((h - 0.5) / 2.0))

    # insert the rest
    tex = tex.replace('<+CONTENT+>', body)

    # write to file
    if not filename:
        filename = 'lingpy-{0}'

    util.write_text_file(filename + '.tex', tex)
Ejemplo n.º 9
0
def msa2html(
    msa,
    shorttitle='',
    filename='',
    template='',
    **keywords
):
    """
    Convert files in ``msa``-format into colored ``html``-format.

    Parameters
    ----------
    msa : dict
        A dictionary object that contains all the information of an MSA object.

    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``SCA`` will be used.

    filename : str (default="")
        Define the name of the output file. If no name is defined, the name of
        the input file will be taken as a default.

    template : str (default="")
        The path to the template file. If no name is defined, the basic
        template will be used. The basic template currently used can be found
        under ``lingpy/data/templates/msa2html.html``.

    Examples
    --------
    Load the libary.

    >>> from lingpy import *
    
    Load an ``msq``-file from the test-sets.

    >>> msa = MSA('harry.msq')

    Align the data progressively and carry out a check for swapped sites.

    >>> msa.prog_align()
    >>> msa.swap_check()
    >>> print(msa)
    w    o    l    -    d    e    m    o    r    t
    w    a    l    -    d    e    m    a    r    -
    v    -    l    a    d    i    m    i    r    -

    Save the data to the file ``harry.msa``.

    >>> msa.output('msa',filename='harry')

    Save the ``msa``-object as ``html``.

    >>> msa.output('html',filename='harry')
    
    Notes
    -----
    The coloring of sound segments with respect to the sound class they belong
    to is based on the definitions given in the ``color``
    :py:class:`~lingpy.data.model.Model`. It can easily be changed and adapted.
    

    See also
    --------
    lingpy.convert.html.alm2html
    """
    util.setdefaults(
        keywords,
        pid_mode=1,
        stress=rcParams['stress'],
        css=False,
        js=False,
        compact=False,
        class_sort=True,
        write_to_file=True,
    )

    # while alm-format can be read from the text-file without problems,
    # msa-format should be loaded first (once this is already provided), the
    # loss in speed won't matter much, since output of data is not a daily task

    # load templates
    template = template or template_path('msa2html.html')
    if template == 'js':
        template = template_path('msa2html.js.html')
    html = util.read_text_file(template)
    css = util.read_text_file(keywords['css'] or template_path('msa.css'))
    js = util.read_text_file(keywords['js'] or template_path('msa.js'))

    # treat the msa-object as a file and try to load the file if this is the
    # case
    if isinstance(msa, string_types):
        msa = read_msa(msa, **keywords)
    else:
        raise ValueError('[!] No filename specified.')

    # load dataset, etc.
    dataset = msa['dataset']

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    infile = msa['infile']
    seq_id = msa['seq_id']

    # define the titles etc.
    if not shorttitle:
        shorttitle = 'SCA'

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    # format css file 
    css = css.replace('TAXON_LENGTH', str(taxl * 10))

    out = ''
    tr = '<tr class="msa" unique="{1}" taxon={2} sequence={3}>{0}</tr>\n'
    td_taxon = '<td class="taxon">{0}</td>'
    perc = int(80 / len(msa['alignment'][0]) + 0.5)
    td_residue = '<td class="residue {1}">{0}</td>'
    td_swap = '<td class="residue swap {1}">{0}</td>'
    td_unaligned = '<td class="residue noalign {1}">{0}</td>'

    # check for swaps in the alignment
    if 'swaps' in msa:
        swaps = []
        for s in msa['swaps']:
            swaps.extend(s)
    else:
        swaps = []

    # check for 
    local = ['*'] * len(msa['alignment'][0])
    if 'local' in msa:
        local = ['.'] * len(msa['alignment'][0])
        for i in msa['local']:
            local[i] = '*'

    # get two sorting schemas for the sequences
    if keywords['class_sort']:

        classes = [tokens2class(ipa2tokens(seq), rcParams['asjp']) for seq in msa['seqs']]
        seqs = dict(
            [(a[1], b) for a, b in zip(
                sorted(
                    zip(classes, msa['seqs']),
                    key=lambda x: x[0]  # list(zip(x[0],x[1]))
                ),
                range(1, len(msa['seqs']) + 1)
            )]
        )
    else:
        seqs = dict(zip(sorted(msa['seqs']), range(1, len(msa['seqs']) + 1)))
    taxa = dict(zip(sorted(msa['taxa']), range(1, len(msa['taxa']) + 1)))

    # set up a list to store unique alignments
    alignments = []

    # start iteration
    for i, taxon in enumerate(msa['taxa']):
        tmp = ''
        tmp += td_taxon.format(taxon)

        # append alignment to alignments
        alignment = ''.join(msa['alignment'][i])
        sequence = msa['seqs'][i]
        if alignment in alignments:
            unique = 'false'
        else:
            unique = 'true'
            alignments += [alignment]

        for j, char in enumerate(msa['alignment'][i]):
            if char == '-':
                d = 'dolgo_GAP'
                c = '#bbbbbb'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])
                c = token2class(char, rcParams['_color'])

                # bad check for three classes named differently
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'

            if j in swaps:
                tmp += td_swap.format(char, d)
            elif local[j] != '*':
                tmp += td_unaligned.format(char, d)
            else:
                tmp += td_residue.format(char, d)
        out += tr.format(tmp, unique, taxa[taxon], seqs[sequence])

    html = html.format(
        table=out,
        dataset=dataset,
        pid=pid_score,
        file=infile,
        sequence=seq_id,
        shorttitle=shorttitle,
        width=len(msa['alignment'][0]),
        table_width='{0}'.format(len(msa['alignment'][0]) * 50 + 8 * taxl),
        taxa=len(msa['alignment']),
        uniseqs=len(set(msa['seqs'])),
        css=css,
        js=js
    )

    if not filename:
        filename = rcParams['filename']

    if not filename.endswith('.html'):
        filename = filename + '.html'

    if keywords['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    if keywords['write_to_file']:
        # check, whether the outfile already exists
        util.write_text_file(filename, html)
    else:
        return html
Ejemplo n.º 10
0
    def test_normalize_alignment(self):
        msa = MSA(read_msa(test_data('harry_unnormal.msa')))

        for line in msa.alignment[1:]:
            assert len(line) == len(msa.alignment[0])
Ejemplo n.º 11
0
 def test_read_msa(self):
     msa = MSA(read_msa(test_data('harry.msa')))
     assert hasattr(msa, 'seqs')