Example #1
    def evaluate_string(self, string, tokens=False, **keywords):
        setdefaults(keywords, stress=rcParams['stress'],
                diacritics=rcParams['diacritics'], cldf=False)
        if not tokens:
            tokens = ipa2tokens(string)
        score = 1
        dist = self.dist['#']

        prostring = prosodic_string(tokens, rcParams['art'], cldf=keywords['cldf'],
                        diacritics=keywords['diacritics'],
                        stress=keywords['stress'])
        if self.classes:
            c = tokens2class(tokens, self.model, cldf=keywords['cldf'],
                        diacritics=keywords['diacritics'],
                        stress=keywords['stress'])
            teststring = list(zip(prostring, c))
        else:
            teststring = list(zip(prostring, tokens))

        scores = []

        while len(teststring) > 0:
            segment = teststring.pop(0)
            freq = dist.count(segment)
            allf = len(dist)
            s = freq / allf
            score = score * s
            scores += [s]
            dist = self.dist[segment]
        score = score * s
        scores += [s]
        lscore = np.log10(score) / len(tokens)
        return score, lscore
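
A note on the helper used throughout these examples: setdefaults fills in missing keys on the keyword dict without overwriting values the caller supplied. A minimal sketch of the behavior assumed here (lingpy ships its own implementation in lingpy.util):

def setdefaults(d, **kwargs):
    # fill in default values without overwriting caller-supplied keywords
    for key, value in kwargs.items():
        d.setdefault(key, value)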
Example #2
    def __init__(self,
                 words,
                 tokens=False,
                 prostrings=[],
                 classes=False,
                 class_model=rcParams['model'],
                 **keywords):
        setdefaults(keywords,
                    stress=rcParams['stress'],
                    diacritics=rcParams['diacritics'],
                    cldf=False)
        self.model = class_model
        self.words = words
        self.tokens = []
        self.bigrams = []
        self.classes = []

        # start filling the dictionary
        for i, w in enumerate(words):

            # check for tokenized string
            if not tokens:
                tk = ipa2tokens(w, **keywords)
            else:
                tk = w[:]
            self.tokens += [tk]

            # create prosodic string
            if prostrings:
                p = prostrings[i]
            else:
                p = prosodic_string(tk,
                                    rcParams['art'],
                                    cldf=keywords['cldf'],
                                    diacritics=keywords['diacritics'],
                                    stress=keywords['stress'])
            # create classes
            if classes:
                c = tokens2class(tk,
                                 class_model,
                                 cldf=keywords['cldf'],
                                 diacritics=keywords['diacritics'],
                                 stress=keywords['stress'])
                bigrams = list(zip(p, c))
                self.classes += [c]
            else:
                # zip the prosodic string with the plain tokens
                bigrams = list(zip(p, tk))

            # append the bigrams of the current word
            self.bigrams += [bigrams]

        # init the mother object once all words have been processed
        MCBasic.__init__(self, self.bigrams)
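
A hypothetical usage sketch, assuming this constructor belongs to lingpy's MCPhon Markov-chain class (the import path and the get_string call are assumptions, not shown above):

# hypothetical usage; MCPhon and get_string are assumptions
from lingpy.sequence.generate import MCPhon

mc = MCPhon(['hand', 'fus', 'kopf'])  # builds (prosody, segment) bigrams per word
new_word = mc.get_string()            # sample a new word from the chain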
Example #3
def check_tokens(tokens, **keywords):
    """
    Check whether tokens are given in a consistent input format.
    """
    setdefaults(keywords, stress=rcParams['stress'],
            diacritics=rcParams['diacritics'], cldf=False)
    errors = []
    for i, token in enumerate(tokens):
        # check for conversion within the articulation-model
        cls = token2class(token, rcParams['art'], stress=keywords['stress'],
                cldf=keywords['cldf'], diacritics=keywords['diacritics'])
        if cls == '0':
            errors.append((i, token))

    return errors
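
A hedged usage sketch (the input word and the top-level import are illustrative assumptions):

# hypothetical usage of check_tokens
from lingpy import ipa2tokens

tokens = ipa2tokens('tʰɔxtər')  # illustrative input
for position, token in check_tokens(tokens):
    print('unconvertible token {0} at position {1}'.format(token, position))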
Example #4
    def diff(self, **keywords):
        """
        Write all differences between two sets to a file.

        Parameters
        ----------
        filename : str
            Name of the output file. Defaults to the input file of the gold
            standard; a ``.diff`` extension is appended if missing.

        """
        setdefaults(keywords, filename=self.gold.infile)
        if not keywords['filename'].endswith('.diff'):
            keywords['filename'] = keywords['filename'] + '.diff'

        out = []
        for i, (a, b) in enumerate(
                zip(self.gold.alignments, self.test.alignments)):
            g1, g2, g3 = a
            t1, t2, t3 = b
            maxL = max([len(g1), len(t1)])
            if g1 != t1 or g2 != t2:
                taxA, taxB = self.gold.taxa[i]
                taxlen = max(len(taxA), len(taxB))
                seq_id = self.gold.seq_ids[i]
                out.append(
                    '{0}\n{1}\t{2}\n{3}\t{4}\n{5}\n{1}\t{6}\n{3}\t{7}\n\n'.
                    format(
                        seq_id,
                        taxA,
                        '\t'.join(g1),
                        taxB,
                        '\t'.join(g2),
                        '{0}\t{1}'.format(
                            taxlen * ' ',
                            '\t'.join(['==' for x in range(maxL)])),
                        '\t'.join(t1),
                        '\t'.join(t2),
                    ))
        log.file_written(keywords['filename'])
        write_text_file(keywords['filename'], out)
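
A hypothetical call, assuming this method lives on lingpy's EvalPSA evaluation class and that gold and test alignments come from PSA objects (class locations and file names are assumptions):

# hypothetical usage; EvalPSA, PSA and the file names are assumptions
from lingpy import PSA
from lingpy.evaluate.apa import EvalPSA

evaluator = EvalPSA(PSA('gold.psa'), PSA('test.psa'))
evaluator.diff(filename='gold_vs_test')  # writes gold_vs_test.diff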
Example #5
def check_tokens(tokens, **keywords):
    """
    Check whether tokens are given in a consistent input format.
    """
    setdefaults(keywords, stress=rcParams['stress'])
    errors = []

    for i, token in enumerate(tokens):
        # check for conversion within the articulation-model
        try:
            rcParams['art'].converter[token]
        except KeyError:
            # fall back to the first character of a complex token
            try:
                rcParams['art'].converter[token[0]]
            except KeyError:
                # if the token starts with a stress mark, try the character
                # that follows it before reporting an error
                if token[0] in keywords['stress']:
                    try:
                        rcParams['art'].converter[token[1]]
                    except KeyError:
                        errors.append((i, token))
                else:
                    errors.append((i, token))

    return errors
Example #6
    def align(self, **keywords):
        """
        Align a pair of sequences or multiple sequence pairs.

        Parameters
        ----------
        gop : int (default=-1)
            The gap opening penalty (GOP).
        scale : float (default=0.5)
            The gap extension penalty (GEP), calculated with help of a scaling
            factor.
        mode : {"global","local","overlap","dialign"}
            The alignment mode, see :evobib:`List2012a` for details.
        factor : float (default = 0.3)
            The factor by which matches in identical prosodic position are
            increased.
        restricted_chars : str (default="T\_")
            The restricted chars that function as an indicator of syllable or
            morpheme breaks for secondary alignment, see :evobib:`List2012c`
            for details.
        distance : bool (default=False)
            If set to *True*, return the distance instead of the similarity
            score. Distance is calculated using the formula by
            :evobib:`Downey2008`.
        model : { None, ~lingpy.data.model.Model }
            Specify the sound class model that shall be used for the analysis.
            If no model is specified, the default model of :evobib:`List2012a`
            will be used.
        pprint : bool (default=False)
            If set to *True*, the alignments are printed to the screen.

        """
        setdefaults(
            keywords,
            gop=-1,
            scale=0.5,
            mode='global',
            factor=0.3,
            restricted_chars='T_',
            distance=False,
            model=rcParams['sca'],
            pprint=False,
            transform=rcParams['align_transform'])

        if hasattr(self, 'model'):
            if keywords['model'] != self.model:
                self._set_model(**keywords)
        else:
            self._set_model(**keywords)

        # create the alignments array
        self._alignments = calign.align_pairs(
            self.classes,
            self.weights,
            self.prostrings,
            keywords['gop'],
            keywords['scale'],
            keywords['factor'],
            self.scoredict,
            keywords['mode'],
            keywords['restricted_chars'],
            distance=1 if keywords['distance'] else 0)

        # switch back to alignments
        self.alignments = []
        for i, (almA, almB, sim) in enumerate(self._alignments):
            self.alignments.append((
                class2tokens(self.tokens[i][0], almA, local=keywords['mode'] == "local"),
                class2tokens(self.tokens[i][1], almB, local=keywords['mode'] == "local"),
                sim))

        # print the alignments, if this is chosen
        as_string(self, pprint=keywords['pprint'])
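
A hedged usage sketch, assuming the method above belongs to lingpy's Pairwise class (which its keywords suggest):

# hypothetical usage; the Pairwise class is an assumption
from lingpy import Pairwise

pair = Pairwise('woldemort', 'waldemar')
pair.align(distance=True, pprint=True)  # align and print with a distance score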
Example #7
def psa2html(infile, **kw):
    """
    Convert a PSA file into colored HTML format.
    """
    util.setdefaults(kw,
                     template=False,
                     css=False,
                     comment='#',
                     filename=infile[:-4] + '.html',
                     compact=True)

    template = util.read_text_file(kw['template'] or template_path('psa.html'))
    css = util.read_text_file(kw['css'] or template_path('psa.css'))

    data = []
    for line in util.read_text_file(infile, lines=True):
        if not line.startswith(kw['comment']):
            data.append(line)

    seq_ids = []
    pairs = []
    taxa = []
    alignments = []

    del data[0]  # drop the dataset header line

    i = 0
    while i <= len(data) - 3:
        try:
            seq_ids.append(data[i])

            datA = data[i + 1].split('\t')
            datB = data[i + 2].split('\t')

            taxonA = datA[0].strip('.')
            taxonB = datB[0].strip('.')
            almA = datA[1:]
            almB = datB[1:]

            taxa.append((taxonA, taxonB))
            pairs.append(('.'.join([k for k in almA if k != '-']),
                          '.'.join([k for k in almB if k != '-'])))
            alignments.append(
                ([str(a) for a in almA], [str(b) for b in almB], 0))
            assert len(alignments[-1][0]) == len(alignments[-1][1])
            i += 4
        except AssertionError:
            log.warning("Line {0} of the data is probably miscoded.".format(i + 1))
            i += 1

    def get_classes(alm):
        classes = []
        residue = '<div class="residue {1}">{0}</div>'
        for j, char in enumerate(alm):
            if char == '-':
                d = 'dolgo_GAP'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])

                # bad check for three classes named differently
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'
            classes += [residue.format(char, d)]
        return ''.join(classes)

    out = '<table>\n'
    for i, (a, b, c) in enumerate(alignments):
        clsA = get_classes(a)
        clsB = get_classes(b)

        ids = int(100 * pid(a, b) + 0.5)

        out += '<tr class="head">'
        out += '<td colspan=2 class="head"><b>Alignment {0}:</b> <i>{1}</i>, PID: {2}</td></tr>'.format(
            i + 1, seq_ids[i], ids)
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][0])
        out += '<td class="psa">{0}</td>'.format(clsA)
        out += '</tr>'
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][1])
        out += '<td class="psa">{0}</td>'.format(clsB)
        out += '</tr>'
        out += '<tr><td colspan=2></td></tr>'

    out += '</table>'

    html = template.format(alignments=out, css=css)

    if kw['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    util.write_text_file(kw['filename'], html)
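
A hypothetical call (the input file name is an assumption):

psa2html('alignments.psa')  # writes alignments.html next to the input file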
Example #8
def msa2tex(infile, template='', filename='', **keywords):
    """
    Convert an MSA to a tabular representation which can easily be used in
    LaTeX documents.
    """
    util.setdefaults(keywords, pid_mode=1)

    # while alm-format can be read from the text-file without problems,
    # msa-format should be loaded first (once this is already provided), the
    # loss in speed won't matter much, since output of data is not a daily task
    # load msa
    msa = read_msa(infile)

    # load templates
    tex = util.read_text_file(template or template_path('msa.tex'))

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    dataset = msa['dataset']
    infile = msa['infile']
    seq_id = msa['seq_id']

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    height = len(msa['alignment'])
    width = len(msa['alignment'][0])

    start = r'\tabular{l' + width * 'c' + '}\n'
    start += r'\bf\ttfamily Taxon & \multicolumn{' + str(
        width) + r'}{l}{\bf\ttfamily Alignment}\\' + '\n'

    # check for swaps in the alignment
    if 'swaps' in msa:
        swaps = []
        for s in msa['swaps']:
            swaps.extend(s)
    else:
        swaps = []

    body = start
    for i, taxon in enumerate(msa['taxa']):
        body += r'\ttfamily ' + taxon.replace('_', r'\_')
        for j, char in enumerate(msa['alignment'][i]):
            if char != '-':
                cls = token2class(char, rcParams['dolgo'])
            else:
                cls = 'X'
            if char == '_':
                char = r'\#'
            if cls == '_':
                cls = '2'
            if j not in swaps:
                body += r'&\cellcolor{col' + cls + r'}' + char
            else:
                if char != '-':
                    body += r'&\cellcolor{col' + cls + r'}\color{white}\bf ' + char
                else:
                    body += r'&\cellcolor{col' + cls + r'}\bf ' + char
        body += r'\\' + '\n'

    body += r'&' + '&'.join([r'\color{white}XXX'
                             for i in range(width)]) + r'\\' + '\n'
    body += r'\endtabular' + '\n'

    # create the parameters etc.
    w = 1.5 * width + taxl * 0.25
    h = 0.5 * height + 1.0

    tex = tex.replace('<+WIDTH+>', '{0:2f}'.format(w))
    tex = tex.replace('<+HEIGHT+>', '{0:2f}'.format(h))

    # create the rput stuff
    tex = tex.replace('<+NEWX+>', '{0:.2f}'.format(w / 2.0))
    tex = tex.replace('<+NEWY+>', '{0:.2f}'.format((h - 0.5) / 2.0))

    # insert the rest
    tex = tex.replace('<+CONTENT+>', body)

    # write to file
    if not filename:
        filename = 'lingpy-{0}'

    util.write_text_file(filename + '.tex', tex)
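
A hypothetical call (file names are assumptions):

msa2tex('harry.msa', filename='harry')  # writes harry.tex for inclusion in LaTeX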
Example #9
def alm2html(infile,
             title='',
             shorttitle='',
             filename='',
             colored=False,
             main_template='',
             table_template='',
             dataset='',
             confidence=False,
             **keywords):
    """
    Convert files in ``alm``-format into colored ``html``-format.

    Parameters
    ----------

    title : str
        Define the title of the output file. If no title is provided, the
        default title ``LexStat - Automatic Cognate Judgments`` will be used.

    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``LexStat`` will be used.
    
    Notes
    -----
    The coloring of sound segments with respect to the sound class they belong
    to is based on the definitions given in the
    ``color`` :py:class:`~lingpy.data.model.Model`. It can easily be changed
    and adapted. 

    See also
    --------
    lingpy.convert.html.msa2html
    lingpy.convert.html.msa2tex

    """
    util.setdefaults(keywords, json="", labels={})

    # open the infile
    if not os.path.exists(infile):
        infile = infile + '.alm'
    data = util.read_text_file(infile)

    # create the outfile
    if not filename:
        filename = rcParams['filename']

    # read in the templates
    html = util.read_text_file(main_template or template_path('alm2html.html'))
    if not table_template:
        table_template = template_path(
            'alm2html.table.js.html' if confidence else 'alm2html.table.html')
    table = util.read_text_file(table_template)
    css = util.read_text_file(template_path('alm.css'))
    js = util.read_text_file(template_path('alm.js'))

    # define a label function for the taxa
    label = lambda x: keywords['labels'][x] if x in keywords['labels'] else x

    # check for windows-compatibility
    data = data.replace(os.linesep, '\n')[:-1]

    # split the data into blocks
    blocks = data.split('\n\n')

    # retrieve the dataset
    dataset = dataset or blocks[0]

    # create the outstring
    tmp_str = ''

    for block in blocks[1:]:
        lines = block.split('\n')
        m = [l.split('\t') for l in lines]

        # create colordict for different colors
        dc = len(set([l[0] for l in m]))

        if colored:
            colors = {
                a: b
                for a, b in zip(
                    sorted(set([int(l[0]) for l in m])),
                    colorRange(dc, brightness=400),
                )
            }
        else:
            colors = []
            white = True
            for i in sorted(set([abs(int(l[0])) for l in m])):
                if white:
                    colors.append((i, 'white'))
                    white = False
                else:
                    colors.append((i, 'gray'))
                    white = True
            colors = dict(colors)

        # get the basic item and its id
        iName = m[0][2]
        iID = m[0][3]

        # start writing the stuff to string
        tmp_str += table.format(NAME=iName, ID=iID)
        # define the basic string for the insertion
        bas = ' <tr class="{0}{2} taxon" taxon="{3}">\n{1}'

        for tracer, l in enumerate(m):
            # check whether the current line is a borrowing
            if int(l[0]) < 0:
                loan_line = ' loan'
            else:
                loan_line = ''

            # assign the cognate id
            tmp = '  <td>{0}</td>\n'.format(l[0])
            tmp += '  <td>{0}</td>\n'.format(label(l[1].strip('.')))

            # check alignments for confidence scores
            ipa_string = ''.join([cell.split('/')[0]
                                  for cell in l[4:]]).replace('-', '')

            tmp += '  <td>{0}</td>\n'.format(ipa_string)
            tmp += '  <td class="{0}">\n'.format(colors[abs(int(l[0]))])
            tmp += '   <table class="{0}">\n'.format(colors[abs(int(l[0]))])
            tmp += '    <tr>\n{0}    </tr>\n   </table>\n  </td>\n </tr>\n'

            # check whether another entry follows that is also an alignment,
            # otherwise, there's no need to display a word as an alignment
            cognate_set = False
            if tracer < len(m) - 1:
                if abs(int(m[tracer + 1][0])) == abs(int(l[0])):
                    cognate_set = True
            if tracer > 0:
                if abs(int(m[tracer - 1][0])) == abs(int(l[0])):
                    cognate_set = True

            # fill out html for the cognate sets
            if cognate_set:

                alm = ''
                for char in l[4:]:

                    # check for confidence scores
                    if '/' in char:
                        try:
                            char, conf, num = char.split('/')
                            conf = int(conf)
                        except ValueError:
                            raise ValueError("Something is wrong with %s." % char)

                    else:
                        char, conf, rgb = char, (255, 255, 255), 0.0

                    if char == '-':
                        d = 'dolgo_GAP'
                    else:
                        d = 'dolgo_' + token2class(char, rcParams['dolgo'])

                        # bad check for three classes named differently
                        if d == 'dolgo__':
                            d = 'dolgo_X'
                        elif d == 'dolgo_1':
                            d = 'dolgo_TONE'
                        elif d == 'dolgo_0':
                            d = 'dolgo_ERROR'

                    if confidence:
                        alm += '     '
                        alm += '<td class="char {1}" confidence={0} '.format(
                            conf, d)
                        alm += 'char="{0}" '.format(char)
                        alm += 'onclick="' + "show('{0}')".format(num) + '" '
                        alm += 'num="{0}"'.format(num)
                        alm += '>\n      {0}\n     </td>\n'.format(char)
                    else:
                        alm += '     '
                        alm += '<td class="char {0}">{1}</td>\n'.format(
                            d, char)
            else:
                alm = '      '
                alm += '<td class="{0}">--</td>\n'.format(colors[abs(int(
                    l[0]))])

            # format the alignment
            try:
                tmp = tmp.format(alm)
            except ValueError:
                raise ValueError("Unknown problem in matchin %s and %s." %
                                 (alm, tmp))

            # on the last line, close the confidence table and insert a
            # separating row (not the fastest solution, but plotting is not a
            # matter of time, and it suffices for its current purpose)
            if tracer == len(m) - 1:
                if confidence:
                    tmp += ' </table>\n'

                tmp += ' <tr class="empty"><td colspan="4" class="empty">'
                tmp += '<hr class="empty" /></td></tr>\n'

            # format the whole string
            tmp_str += bas.format(colors[abs(int(l[0]))], tmp, loan_line, l[1])

    if not title:
        title = "LexStat - Automatic Cognate Judgments"
    if not shorttitle:
        shorttitle = "LexStat"

    # check for json-attribute
    if keywords['json']:
        keywords['json'] = 'var myjson = ' + json.dumps(keywords['json'],
                                                        indent=1)

    html = html.format(shorttitle=shorttitle,
                       title=title,
                       table=tmp_str,
                       dataset=dataset,
                       javascript=js,
                       css=css,
                       **keywords)
    util.write_text_file(filename + '.html', html)
    return
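
A hypothetical call (the input file name is an assumption):

alm2html('cognates.alm', title='Cognate Judgments', filename='cognates')
# writes cognates.html, coloring segments by their Dolgopolsky sound class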
Example #10
def msa2html(msa, shorttitle='', filename='', template='', **keywords):
    """
    Convert files in ``msa``-format into colored ``html``-format.

    Parameters
    ----------
    msa : dict
        A dictionary object that contains all the information of an MSA object.

    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``SCA`` will be used.

    filename : str (default="")
        Define the name of the output file. If no name is defined, the name of
        the input file will be taken as a default.

    template : str (default="")
        The path to the template file. If no name is defined, the basic
        template will be used. The basic template currently used can be found
        under ``lingpy/data/templates/msa2html.html``.

    Examples
    --------
    Load the library.

    >>> from lingpy import *
    
    Load an ``msq``-file from the test-sets.

    >>> msa = MSA('harry.msq')

    Align the data progressively and carry out a check for swapped sites.

    >>> msa.prog_align()
    >>> msa.swap_check()
    >>> print(msa)
    w    o    l    -    d    e    m    o    r    t
    w    a    l    -    d    e    m    a    r    -
    v    -    l    a    d    i    m    i    r    -

    Save the data to the file ``harry.msa``.

    >>> msa.output('msa',filename='harry')

    Save the ``msa``-object as ``html``.

    >>> msa.output('html',filename='harry')
    
    Notes
    -----
    The coloring of sound segments with respect to the sound class they belong
    to is based on the definitions given in the ``color``
    :py:class:`~lingpy.data.model.Model`. It can easily be changed and adapted.
    

    See also
    --------
    lingpy.convert.html.alm2html
    """
    util.setdefaults(
        keywords,
        pid_mode=1,
        stress=rcParams['stress'],
        css=False,
        js=False,
        compact=False,
        class_sort=True,
        write_to_file=True,
    )

    # while alm-format can be read from the text-file without problems,
    # msa-format should be loaded first (once this is already provided), the
    # loss in speed won't matter much, since output of data is not a daily task

    # load templates
    template = template or template_path('msa2html.html')
    if template == 'js':
        template = template_path('msa2html.js.html')
    html = util.read_text_file(template)
    css = util.read_text_file(keywords['css'] or template_path('msa.css'))
    js = util.read_text_file(keywords['js'] or template_path('msa.js'))

    # treat the msa-object as a file and try to load the file if this is the
    # case
    if isinstance(msa, string_types):
        msa = read_msa(msa, **keywords)
    else:
        raise ValueError('[!] No filename specified.')

    # load dataset, etc.
    dataset = msa['dataset']

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    infile = msa['infile']
    seq_id = msa['seq_id']

    # define the titles etc.
    if not shorttitle:
        shorttitle = 'SCA'

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    # format css file
    css = css.replace('TAXON_LENGTH', str(taxl * 10))

    out = ''
    tr = '<tr class="msa" unique="{1}" taxon={2} sequence={3}>{0}</tr>\n'
    td_taxon = '<td class="taxon">{0}</td>'
    perc = int(80 / len(msa['alignment'][0]) + 0.5)
    td_residue = '<td class="residue {1}">{0}</td>'
    td_swap = '<td class="residue swap {1}">{0}</td>'
    td_unaligned = '<td class="residue noalign {1}">{0}</td>'

    # check for swaps in the alignment
    if 'swaps' in msa:
        swaps = []
        for s in msa['swaps']:
            swaps.extend(s)
    else:
        swaps = []

    # check for local alignments: columns not listed in msa['local'] count as
    # unaligned
    local = ['*'] * len(msa['alignment'][0])
    if 'local' in msa:
        local = ['.'] * len(msa['alignment'][0])
        for i in msa['local']:
            local[i] = '*'

    # get two sorting schemas for the sequences
    if keywords['class_sort']:

        classes = [
            tokens2class(ipa2tokens(seq), rcParams['asjp'])
            for seq in msa['seqs']
        ]
        seqs = dict([
            (a[1], b) for a, b in zip(
                sorted(
                    zip(classes, msa['seqs']),
                    key=lambda x: x[0]  # list(zip(x[0],x[1]))
                ),
                range(1,
                      len(msa['seqs']) + 1))
        ])
    else:
        seqs = dict(zip(sorted(msa['seqs']), range(1, len(msa['seqs']) + 1)))
    taxa = dict(zip(sorted(msa['taxa']), range(1, len(msa['taxa']) + 1)))

    # set up a list to store unique alignments
    alignments = []

    # start iteration
    for i, taxon in enumerate(msa['taxa']):
        tmp = ''
        tmp += td_taxon.format(taxon)

        # append alignment to alignments
        alignment = ''.join(msa['alignment'][i])
        sequence = msa['seqs'][i]
        if alignment in alignments:
            unique = 'false'
        else:
            unique = 'true'
            alignments += [alignment]

        for j, char in enumerate(msa['alignment'][i]):
            if char == '-':
                d = 'dolgo_GAP'
                c = '#bbbbbb'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])
                c = token2class(char, rcParams['_color'])

                # bad check for three classes named differently
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'

            if j in swaps:
                tmp += td_swap.format(char, d)
            elif local[j] != '*':
                tmp += td_unaligned.format(char, d)
            else:
                tmp += td_residue.format(char, d)
        out += tr.format(tmp, unique, taxa[taxon], seqs[sequence])

    html = html.format(
        table=out,
        dataset=dataset,
        pid=pid_score,
        file=infile,
        sequence=seq_id,
        shorttitle=shorttitle,
        width=len(msa['alignment'][0]),
        table_width='{0}'.format(len(msa['alignment'][0]) * 50 + 8 * taxl),
        taxa=len(msa['alignment']),
        uniseqs=len(set(msa['seqs'])),
        css=css,
        js=js)

    if not filename:
        filename = rcParams['filename']

    if not filename.endswith('.html'):
        filename = filename + '.html'

    if keywords['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    if keywords['write_to_file']:
        # check, whether the outfile already exists
        util.write_text_file(filename, html)
    else:
        return html
Example #11
def calculate_data(
        wordlist,
        data,
        taxa='taxa',
        concepts='concepts',
        ref='cogid',
        **keywords):
    """
    Manipulate a wordlist object by adding different kinds of data.

    Parameters
    ----------
    data : str
        The type of data that shall be calculated. Currently supports

        * "tree": calculate a reference tree based on shared cognates
        * "dst": get distances between taxa based on shared cognates
        * "cluster": cluster the taxa into groups using different methods


    """
    logger = log.get_logger()
    util.setdefaults(
        keywords,
        distances=False,
        tree_calc="upgma",
        cluster="upgma",
        force=False,
        threshold=0.5,
        cluster_method='upgma')

    # get taxa for current calculation
    these_taxa = getattr(wordlist, taxa)

    # calculate distances
    if data in ['distances', 'dst']:
        wordlist._meta['distances'] = wl2dst(
                wordlist, taxa, concepts, ref, **keywords)
    elif data in ['diversity', 'div']:
        etd = wordlist.get_etymdict(ref=ref)
        wordlist._meta['diversity'] = \
            (len(etd) - wordlist.height) / (len(wordlist) - wordlist.height)
    elif data in ['tre', 'tree', 'nwk']:
        if 'distances' not in wordlist._meta:
            wordlist._meta['distances'] = \
                wl2dst(wordlist, taxa, concepts, ref, **keywords)
        distances = wordlist._meta['distances']
        if 'tree' in wordlist._meta and not keywords['force']:
            logger.warning(
                    "Reference tree has already been calculated, "
                    "force overwrite by "
                    "setting 'force' to 'True'.")
            return
        wordlist._meta['tree'] = clustering.matrix2tree(
            distances, these_taxa, keywords['tree_calc'],
            keywords['distances'])

    elif data in ['groups', 'cluster']:
        if 'distances' not in wordlist._meta:
            distances = wl2dst(wordlist, taxa, concepts, ref, **keywords)
        else:
            distances = wordlist._meta['distances']
        if 'groups' in wordlist._meta and not keywords['force']:
            logger.warning(
                    "Groups have already been calculated, "
                    "force overwrite by "
                    "setting 'force' to 'True'.")
            return
        wordlist._meta['groups'] = clustering.matrix2groups(
            keywords['threshold'], distances, these_taxa,
            keywords['cluster_method'])
    log.info("Successfully calculated {0}.".format(data))
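
A hypothetical usage sketch (the Wordlist input file is an assumption):

# hypothetical usage; data.tsv is an assumption
from lingpy import Wordlist

wl = Wordlist('data.tsv')
calculate_data(wl, 'tree')                    # reference tree in wl._meta['tree']
calculate_data(wl, 'cluster', threshold=0.5)  # flat clusters in wl._meta['groups']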
Example #12
    def _export(
            self,
            fileformat,
            sections=None,
            entries=None,
            entry_sep='',
            item_sep='',
            template='',
            exclude=None,
            entry_start='',
            entry_close='',
            **keywords):
        """
        Export a wordlist to various file formats.
        """
        if not sections:
            if fileformat == 'txt':
                sections = dict(
                    h1=('concept', '\n# Concept: {0}\n'),
                    h2=('cogid', '## Cognate-ID: {0}\n'))
            elif fileformat == 'tex':
                sections = dict(
                    h1=('concept', r'\section{{Concept: ``{0}"}}' + '\n'),
                    h2=('cogid', r'\subsection{{Cognate Set: ``{0}"}}' + '\n'))
            elif fileformat == 'html':
                sections = dict(
                    h1=('concept', '<h1>Concept: {0}</h1>'),
                    h2=('cogid', '<h2>Cognate Set: {0}</h2>'))

        if not entries:
            if fileformat == 'txt':
                entries = [('language', '{0} '), ('ipa', '{0}\n')]
            elif fileformat == 'tex':
                entries = [('language', '{0} '), ('ipa', '[{0}]' + '\n')]
            elif fileformat == 'html':
                entries = [('language', '{0}&nbsp;'), ('ipa', '[{0}]\n')]

        util.setdefaults(keywords, filename=rcParams['filename'])

        # get the temporary dictionary
        out = wl2dict(self, sections, entries, exclude)

        # assign the output string
        out_string = ''

        # iterate over the dictionary and start to fill the string
        for key in sorted(out, key=lambda x: str(x).lower()):
            # write key to file
            out_string += key[1]

            # reassign tmp
            tmp = out[key]

            # set the pointer and the index
            pointer = {0: [tmp, sorted(tmp.keys())]}

            while True:
                idx = max(pointer.keys())

                # check for type of current point
                if isinstance(tmp, dict):
                    if pointer[idx][1]:
                        next_key = pointer[idx][1].pop()
                        out_string += next_key[1]
                        tmp = pointer[idx][0][next_key]
                        if isinstance(tmp, dict):
                            pointer[idx + 1] = [tmp, sorted(tmp.keys())]
                        else:
                            pointer[idx + 1] = [tmp, tmp]
                    else:
                        del pointer[idx]
                        if idx == 0:
                            break
                else:
                    tmp_strings = []
                    for line in sorted(tmp):
                        tmp_strings += [item_sep.join(line)]
                    out_string += entry_start + entry_sep.join(tmp_strings) + entry_close
                    tmp = pointer[idx - 1][0]
                    del pointer[idx]

        if fileformat == 'tex':
            out_string = out_string.replace('_', r'\_')
        tmpl = util.read_text_file(template) if template else '{0}'
        _write_file(keywords['filename'], tmpl.format(out_string), fileformat)
Example #13
    def _output(self, fileformat, **keywords):
        """
        Internal function that eases its modification by daughter classes.
        """
        # check for stamp attribute
        keywords["stamp"] = getattr(self, '_stamp', '')

        # add the default parameters, they will be checked against the keywords
        util.setdefaults(
            keywords,
            cols=False,
            distances=False,
            entries=("concept", "counterpart"),
            entry='concept',
            fileformat=fileformat,
            filename=rcParams['filename'],
            formatter='concept',
            modify_ref=False,
            meta=self._meta,
            missing=0,
            prettify='false',
            ignore='all',
            ref='cogid',
            rows=False,
            subset=False,  # setup a subset of the data,
            taxa='taxa',
            threshold=0.6,  # threshold for flat clustering
            tree_calc='neighbor')

        if fileformat in ['triple', 'triples', 'triples.tsv']:
            return tsv2triple(self, keywords['filename'] + '.' + fileformat)

        if fileformat in ['paps.nex', 'paps.csv']:
            paps = self.get_paps(
                ref=keywords['ref'], entry=keywords['entry'], missing=keywords['missing'])
            kw = dict(filename=keywords['filename'] + '.paps')
            if fileformat == 'paps.nex':
                kw['missing'] = keywords['missing']
                return pap2nex(self.cols, paps, **kw)
            return pap2csv(self.cols, paps, **kw)

        # simple printing of taxa
        if fileformat == 'taxa':
            assert hasattr(self, 'taxa')
            return util.write_text_file(keywords['filename'] + '.taxa', self.cols)

        # csv-output
        if fileformat in ['csv', 'qlc', 'tsv']:

            # get the header line
            header = sorted(
                [s for s in set(self._alias.values()) if s in self._header],
                key=lambda x: self._header[x])
            header = [h.upper() for h in header]

            self._meta.setdefault('taxa', self.cols)

            # get the data, in case a subset is chosen
            if not keywords['subset']:
                # write stuff to file
                return wl2qlc(header, self._data, **keywords)

            cols, rows = keywords['cols'], keywords['rows']

            if not isinstance(cols, (list, tuple, bool)):
                raise ValueError("[i] Argument 'cols' should be list or tuple.")
            if not isinstance(rows, (dict, bool)):
                raise ValueError("[i] Argument 'rows' should be a dictionary.")

            # check for chosen header
            if cols:
                # get indices for header
                indices = [self._header[x] for x in cols]
                header = [c.upper() for c in cols]
            else:
                indices = [r for r in range(len(self.header))]

            if rows:
                stmts = []
                for key, value in rows.items():
                    if key == 'ID':
                        stmts += ["key " + value]
                    else:
                        idx = self._header[key]
                        stmts += ["line[{0}] ".format(idx) + value]

            log.debug("calculated what should be excluded")

            # get the data
            out = {}
            for key, line in self._data.items():
                log.debug(key)

                if rows:
                    if eval(" and ".join(stmts)):
                        out[key] = [line[i] for i in indices]
                else:
                    out[key] = [line[i] for i in indices]

            log.debug("passing data to wl2qlc")
            return wl2qlc(header, out, **keywords)

        # output dst-format (phylip)
        if fileformat == 'dst':
            # check for distances as keyword
            if 'distances' not in self._meta:
                self._meta['distances'] = wl2dst(self, **keywords)

            out = matrix2dst(self._meta['distances'], self.taxa,
                    stamp=keywords['stamp'], taxlen=keywords.get('taxlen', 0))
            return _write_file(keywords['filename'], out, fileformat)

        # output tre-format (newick)
        if fileformat in ['tre', 'nwk']:
            if 'tree' not in self._meta:
                # check for distances
                if 'distances' not in self._meta:
                    self._meta['distances'] = wl2dst(self)
                # we look up a function to calculate a tree in the cluster module:
                tree = getattr(cluster, keywords['tree_calc'])(
                    self._meta['distances'], self.cols, distances=keywords['distances'])
            else:
                tree = self._meta['tree']

            return _write_file(keywords['filename'], '{0}'.format(tree), fileformat)

        if fileformat in ['cluster', 'groups']:
            if 'distances' not in self._meta:
                self._meta['distances'] = wl2dst(self)  # check for keywords

            if 'groups' not in self._meta:
                self._meta['groups'] = cluster.matrix2groups(
                    keywords['threshold'], self._meta['distances'], self.taxa)
            lines = []
            for taxon, group in sorted(self._meta['groups'].items(), key=lambda x: x[0]):
                lines.append('{0}\t{1}'.format(taxon, group))
            return _write_file(keywords['filename'], lines, fileformat)

        if fileformat in ['starling', 'star.csv']:
            # inline helper for the data check: write '-' for empty (0) entries
            l = lambda x: '-' if x == 0 else x

            lines = []
            if 'cognates' not in keywords:
                lines.append('ID\tConcept\t' + '\t'.join(self.taxa))
                for i, concept in enumerate(self.concepts):
                    for line in self.get_list(row=concept, entry=keywords['entry']):
                        lines.append(
                            str(i + 1) + '\t' + concept + '\t' + '\t'.join(
                                [l(t) for t in line]))
            else:
                lines.append(
                    'ID\tConcept\t' + '\t'.join(
                        ['{0}\t COG'.format(t) for t in self.taxa]))
                for i, concept in enumerate(self.concepts):
                    cogs = self.get_list(row=concept, entry=keywords['cognates'])
                    for j, line in enumerate(
                            self.get_list(row=concept, entry=keywords['entry'])):
                        part = '\t'.join(
                            '{0}\t{1}'.format(l(a), b) for a, b in zip(line, cogs[j]))
                        lines.append(util.tabjoin(i + 1, concept, part))

            return _write_file(
                keywords['filename'], lines, 'starling_' + keywords['entry'] + '.csv')

        if fileformat == 'multistate.nex':
            if not keywords['filename'].endswith('.multistate.nex'):
                keywords['filename'] += '.multistate.nex'

            matrix = wl2multistate(self, keywords['ref'], keywords['missing'])
            return multistate2nex(self.taxa, matrix, keywords['filename'])

        if fileformat == 'separated':
            if not os.path.isdir(keywords['filename']):
                os.mkdir(keywords['filename'])

            for l in self.cols:
                lines = [''] if 'ignore_keys' in keywords else ['ID\t']
                lines[0] += '\t'.join(x.upper() for x in keywords['entries'])
                for key in self.get_list(col=l, flat=True):
                    line = [] if 'ignore_keys' in keywords else [key]
                    for entry in keywords['entries']:
                        tmp = self[key, entry]
                        if isinstance(tmp, list):
                            tmp = ' '.join([str(x) for x in tmp])
                        line += [tmp]
                    lines.append('\t'.join('{0}'.format(x) for x in line))
                _write_file('{0}/{1}'.format(keywords['filename'], l), lines, 'tsv')
Beispiel #20
0
def alm2html(
    infile,
    title='',
    shorttitle='',
    filename='',
    colored=False,
    main_template='',
    table_template='',
    dataset='',
    confidence=False,
    **keywords
):
    """
    Convert files in ``alm``-format into colored ``html``-format.

    Parameters
    ----------

    title : str
        Define the title of the output file. If no title is provided, the
        default title ``LexStat - Automatic Cognate Judgments`` will be used.

    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``LexStat`` will be used.
    
    Notes
    -----
    The coloring of sound segments with respect to the sound class they belong
    to is based on the definitions given in the
    ``color`` :py:class:`~lingpy.data.model.Model`. It can easily be changed
    and adapted. 

    See also
    --------
    lingpy.convert.html.msa2html
    lingpy.convert.html.msa2tex

    """
    util.setdefaults(keywords, json="", labels={})

    # open the infile
    if not os.path.exists(infile):
        infile = infile + '.alm'
    data = util.read_text_file(infile)

    # create the outfile
    if not filename:
        filename = rcParams['filename']

    # read in the templates
    html = util.read_text_file(main_template or template_path('alm2html.html'))
    if not table_template:
        table_template = template_path(
            'alm2html.table.js.html' if confidence else 'alm2html.table.html')
    table = util.read_text_file(table_template)
    css = util.read_text_file(template_path('alm.css'))
    js = util.read_text_file(template_path('alm.js'))

    # define a label function for the taxa
    label = lambda x: keywords['labels'][x] if x in keywords['labels'] else x

    # check for windows-compatibility
    data = data.replace(os.linesep, '\n')[:-1]

    # split the data into blocks
    blocks = data.split('\n\n')

    # retrieve the dataset
    dataset = dataset or blocks[0]

    # create the outstring
    tmp_str = ''

    for block in blocks[1:]:
        lines = block.split('\n')
        m = [l.split('\t') for l in lines]

        # create colordict for different colors
        dc = len(set([l[0] for l in m]))

        if colored:
            colors = {a: b for a, b in zip(
                sorted(set([int(l[0]) for l in m])),
                colorRange(dc, brightness=400),
            )}
        else:
            colors = []
            white = True
            for i in sorted(set([abs(int(l[0])) for l in m])):
                if white:
                    colors.append((i, 'white'))
                    white = False
                else:
                    colors.append((i, 'gray'))
                    white = True
            colors = dict(colors)

        # get the basic item and its id
        iName = m[0][2]
        iID = m[0][3]

        # start writing the stuff to string
        tmp_str += table.format(NAME=iName, ID=iID)
        # define the basic string for the insertion
        bas = ' <tr class="{0}{2} taxon" taxon="{3}">\n{1}'

        for tracer, l in enumerate(m):
            # check whether the current line is a borrowing
            if int(l[0]) < 0:
                loan_line = ' loan'
            else:
                loan_line = ''

            # assign the cognate id
            tmp = '  <td>{0}</td>\n'.format(l[0])
            tmp += '  <td>{0}</td>\n'.format(label(l[1].strip('.')))

            # strip confidence scores and gaps to recover the plain ipa string
            ipa_string = ''.join([cell.split('/')[0] for cell in
                                  l[4:]]).replace('-', '')

            tmp += '  <td>{0}</td>\n'.format(ipa_string)
            tmp += '  <td class="{0}">\n'.format(colors[abs(int(l[0]))])
            tmp += '   <table class="{0}">\n'.format(colors[abs(int(l[0]))])
            tmp += '    <tr>\n{0}    </tr>\n   </table>\n  </td>\n </tr>\n'

            # check whether a neighboring entry belongs to the same cognate
            # set; otherwise there is no need to display the word as an
            # alignment
            cognate_set = False
            if tracer < len(m) - 1:
                if abs(int(m[tracer + 1][0])) == abs(int(l[0])):
                    cognate_set = True
            if tracer > 0:
                if abs(int(m[tracer - 1][0])) == abs(int(l[0])):
                    cognate_set = True

            # fill out html for the cognate sets
            if cognate_set:

                alm = ''
                for char in l[4:]:

                    # check for confidence scores
                    if '/' in char:
                        try:
                            char, conf, num = char.split('/')
                            conf = int(conf)
                        except ValueError:
                            print(char.split('/'))
                            raise ValueError("Something is wrong with %s." % (char))

                    else:
                        # default values when no confidence score is given
                        conf, num = 0, ''

                    if char == '-':
                        d = 'dolgo_GAP'
                    else:
                        d = 'dolgo_' + token2class(char, rcParams['dolgo'])

                        # ad-hoc remapping of three classes that are named
                        # differently
                        if d == 'dolgo__':
                            d = 'dolgo_X'
                        elif d == 'dolgo_1':
                            d = 'dolgo_TONE'
                        elif d == 'dolgo_0':
                            d = 'dolgo_ERROR'

                    if confidence:
                        alm += '     '
                        alm += '<td class="char {1}" confidence={0} '.format(
                            conf,
                            d
                        )
                        alm += 'char="{0}" '.format(char)
                        alm += 'onclick="' + "show('{0}')".format(num) + '" '
                        alm += 'num="{0}"'.format(num)
                        alm += '>\n      {0}\n     </td>\n'.format(char)
                    else:
                        alm += '     '
                        alm += '<td class="char {0}">{1}</td>\n'.format(d, char)
            else:
                alm = '      '
                alm += '<td class="{0}">--</td>\n'.format(colors[abs(int(l[0]))])

            # format the alignment
            try:
                tmp = tmp.format(alm)
            except ValueError:
                raise ValueError("Unknown problem in matchin %s and %s." % (alm, tmp))

            # check for the last line, where a separating row should be
            # inserted (not the fastest solution, but plotting is not
            # time-critical, and it suffices for its current purpose)
            if tracer == len(m) - 1:
                if confidence:
                    tmp += ' </table>\n'

                tmp += ' <tr class="empty"><td colspan="4" class="empty">'
                tmp += '<hr class="empty" /></td></tr>\n'

            # format the whole string
            tmp_str += bas.format(
                colors[abs(int(l[0]))],
                tmp,
                loan_line,
                l[1]
            )

    if not title:
        title = "LexStat - Automatic Cognate Judgments"
    if not shorttitle:
        shorttitle = "LexStat"

    # check for json-attribute
    if keywords['json']:
        keywords['json'] = 'var myjson = ' + json.dumps(keywords['json'],
                                                        indent=1)

    html = html.format(
        shorttitle=shorttitle,
        title=title,
        table=tmp_str,
        dataset=dataset,
        javascript=js,
        css=css,
        **keywords
    )
    util.write_text_file(filename + '.html', html)
    return
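
A hedged usage sketch for alm2html: 'patterns.alm' is a hypothetical input file, and the keyword values merely illustrate the parameters defined above.

# assumes a file 'patterns.alm' in alm-format exists in the working directory
from lingpy.convert.html import alm2html

alm2html(
    'patterns',                  # '.alm' is appended if the path is not found
    title='Demo - Automatic Cognate Judgments',
    shorttitle='Demo',
    filename='patterns',         # writes 'patterns.html'
    colored=True)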
Example #21
def msa2html(
    msa,
    shorttitle='',
    filename='',
    template='',
    **keywords
):
    """
    Convert files in ``msa``-format into colored ``html``-format.

    Parameters
    ----------
    msa : dict
        A dictionary object that contains all the information of an MSA object.

    shorttitle : str
        Define the short title of the ``html``-page. If no short title is
        provided, the default title ``SCA`` will be used.

    filename : str (default="")
        Define the name of the output file. If no name is defined, the name of
        the input file will be taken as a default.

    template : str (default="")
        The path to the template file. If no name is defined, the basic
        template will be used. The basic template currently used can be found
        under ``lingpy/data/templates/msa2html.html``.

    Examples
    --------
    Load the library.

    >>> from lingpy import *
    
    Load an ``msq``-file from the test-sets.

    >>> msa = MSA('harry.msq')

    Align the data progressively and carry out a check for swapped sites.

    >>> msa.prog_align()
    >>> msa.swap_check()
    >>> print(msa)
    w    o    l    -    d    e    m    o    r    t
    w    a    l    -    d    e    m    a    r    -
    v    -    l    a    d    i    m    i    r    -

    Save the data to the file ``harry.msa``.

    >>> msa.output('msa',filename='harry')

    Save the ``msa``-object as ``html``.

    >>> msa.output('html',filename='harry')
    
    Notes
    -----
    The coloring of sound segments with respect to the sound class they belong
    to is based on the definitions given in the ``color``
    :py:class:`~lingpy.data.model.Model`. It can easily be changed and adapted.
    

    See also
    --------
    lingpy.convert.html.alm2html
    """
    util.setdefaults(
        keywords,
        pid_mode=1,
        stress=rcParams['stress'],
        css=False,
        js=False,
        compact=False,
        class_sort=True,
        write_to_file=True,
    )

    # while the alm-format can be read from a text file directly, the
    # msa-format should first be loaded into an msa-object; the loss in speed
    # won't matter much, since data output is not a frequent task

    # load templates
    template = template or template_path('msa2html.html')
    if template == 'js':
        template = template_path('msa2html.js.html')
    html = util.read_text_file(template)
    css = util.read_text_file(keywords['css'] or template_path('msa.css'))
    js = util.read_text_file(keywords['js'] or template_path('msa.js'))

    # if the msa-object is passed as a filename, try to load the file
    if isinstance(msa, string_types):
        msa = read_msa(msa, **keywords)
    elif not isinstance(msa, dict):
        raise ValueError('[!] Expected an MSA dictionary or a filename.')

    # load dataset, etc.
    dataset = msa['dataset']

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    infile = msa['infile']
    seq_id = msa['seq_id']

    # define the titles etc.
    if not shorttitle:
        shorttitle = 'SCA'

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    # format css file 
    css = css.replace('TAXON_LENGTH', str(taxl * 10))

    out = ''
    tr = '<tr class="msa" unique="{1}" taxon={2} sequence={3}>{0}</tr>\n'
    td_taxon = '<td class="taxon">{0}</td>'
    perc = int(80 / len(msa['alignment'][0]) + 0.5)
    td_residue = '<td class="residue {1}">{0}</td>'
    td_swap = '<td class="residue swap {1}">{0}</td>'
    td_unaligned = '<td class="residue noalign {1}">{0}</td>'

    # check for swaps in the alignment
    if 'swaps' in msa:
        swaps = []
        for s in msa['swaps']:
            swaps.extend(s)
    else:
        swaps = []

    # check for local (unaligned) positions in the alignment
    local = ['*'] * len(msa['alignment'][0])
    if 'local' in msa:
        local = ['.'] * len(msa['alignment'][0])
        for i in msa['local']:
            local[i] = '*'

    # determine the order of the sequences, either by sound class or
    # alphabetically
    if keywords['class_sort']:

        classes = [tokens2class(ipa2tokens(seq), rcParams['asjp']) for seq in msa['seqs']]
        seqs = dict(
            [(a[1], b) for a, b in zip(
                sorted(
                    zip(classes, msa['seqs']),
                    key=lambda x: x[0]
                ),
                range(1, len(msa['seqs']) + 1)
            )]
        )
    else:
        seqs = dict(zip(sorted(msa['seqs']), range(1, len(msa['seqs']) + 1)))
    taxa = dict(zip(sorted(msa['taxa']), range(1, len(msa['taxa']) + 1)))

    # set up a list to store unique alignments
    alignments = []

    # start iteration
    for i, taxon in enumerate(msa['taxa']):
        tmp = ''
        tmp += td_taxon.format(taxon)

        # append alignment to alignments
        alignment = ''.join(msa['alignment'][i])
        sequence = msa['seqs'][i]
        if alignment in alignments:
            unique = 'false'
        else:
            unique = 'true'
            alignments += [alignment]

        for j, char in enumerate(msa['alignment'][i]):
            if char == '-':
                d = 'dolgo_GAP'
                c = '#bbbbbb'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])
                c = token2class(char, rcParams['_color'])

                # ad-hoc remapping of three classes that are named differently
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'

            if j in swaps:
                tmp += td_swap.format(char, d)
            elif local[j] != '*':
                tmp += td_unaligned.format(char, d)
            else:
                tmp += td_residue.format(char, d)
        out += tr.format(tmp, unique, taxa[taxon], seqs[sequence])

    html = html.format(
        table=out,
        dataset=dataset,
        pid=pid_score,
        file=infile,
        sequence=seq_id,
        shorttitle=shorttitle,
        width=len(msa['alignment'][0]),
        table_width='{0}'.format(len(msa['alignment'][0]) * 50 + 8 * taxl),
        taxa=len(msa['alignment']),
        uniseqs=len(set(msa['seqs'])),
        css=css,
        js=js
    )

    if not filename:
        filename = rcParams['filename']

    if not filename.endswith('.html'):
        filename = filename + '.html'

    if keywords['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    if keywords['write_to_file']:
        util.write_text_file(filename, html)
    else:
        return html
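
A hedged usage sketch for calling msa2html directly rather than through msa.output; 'harry.msa' is the file produced in the doctest above, and the keyword values are illustrative.

# assumes 'harry.msa' was written as in the doctest above
from lingpy.convert.html import msa2html

markup = msa2html('harry.msa', shorttitle='SCA', filename='harry',
                  write_to_file=False)  # return the markup instead of writing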
Example #22
def _list2msa(msa_lines, ids=False, header=True, normalize=False, **keywords):
    """
    Function retrieves a dictionary from a list of MSA strings.

    """
    setdefaults(keywords, seq_id='-', dataset='-', input_file='dummy')
    d = dict(ID=[], taxa=[], alignment=[], seqs=[], infile=keywords['input_file'])

    if header:
        start = 2
        d['dataset'] = msa_lines[0]
        d['seq_id'] = msa_lines[1]
    else:
        start = 0
        d['dataset'] = keywords['dataset']
        d['seq_id'] = keywords['seq_id']

    for i, line in enumerate(msa_lines[start:]):
        idx = 1 if ids else 0

        # check for specific id
        if line[0] in ['0', 'LOCAL', 'CROSSED', 'SWAPS', 'MERGE', 'COMPLEX', ]:
            if line[idx] == 'LOCAL':
                d['local'] = []
                for j, x in enumerate(line[idx + 1:]):
                    if x == '*':
                        d['local'] += [j]
            elif line[idx] in ['CROSSED', 'SWAPS']:
                d['swaps'] = []
                swapline = [x for x in line[idx + 1:]]
                j = 0
                while swapline:
                    x = swapline.pop(0)
                    if x == '+':
                        d['swaps'] += [(j, j + 1, j + 2)]
                        swapline.pop(0)
                        swapline.pop(0)
                        j += 2
                    j += 1
            elif line[idx] in ['COMPLEX', 'MERGE']:
                d['merge'] = {}
                mergeline = [x for x in line[idx + 1:]]
                k = 0
                merge = False
                for j, m in enumerate(mergeline):
                    if m == '<':
                        merge = True
                    if m == '>':
                        merge = False

                    d['merge'][j] = k
                    if not merge:
                        k += 1
            else:
                d[line[idx].lower()] = line[idx + 1:]

        else:
            if ids:
                try:
                    d['ID'] += [int(line[0])]
                except ValueError:
                    d['ID'] += [line[0]]
            else:
                d["ID"] += [i]
            d["taxa"] += [line[idx].rstrip('.')]
            d["seqs"] += [' '.join([l for l in line[idx + 1:] if l != '-'])]
            d["alignment"] += [line[idx + 1:]]

    # normalize the alignment if the option is chosen
    if normalize:
        d['alignment'] = normalize_alignment(d['alignment'])

    return d
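
A hedged sketch of the input _list2msa expects: after the two header lines (dataset and sequence id), each line is a list whose first element is the taxon and whose remaining elements are the aligned segments. The toy data below is invented for illustration.

msa_lines = [
    'Test dataset',                                   # header: dataset
    'woldemort',                                      # header: seq_id
    ['Harry', 'w', 'o', 'l', 'd', 'e', 'm', 'o', 'r', 't'],
    ['German', 'w', 'a', 'l', 'd', 'e', 'm', 'a', 'r', '-'],
]
d = _list2msa(msa_lines)
print(d['taxa'])   # ['Harry', 'German']
print(d['seqs'])   # ['w o l d e m o r t', 'w a l d e m a r']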
Example #23
    def _output(self, fileformat, **keywords):
        """
        Internal function that eases its modification by daughter classes.
        """
        # check for stamp attribute
        keywords["stamp"] = getattr(self, '_stamp', '')

        # add the default parameters, they will be checked against the keywords
        util.setdefaults(
            keywords,
            cols=False,
            distances=False,
            entries=("concept", "counterpart"),
            entry='concept',
            fileformat=fileformat,
            filename=rcParams['filename'],
            formatter='concept',
            modify_ref=False,
            meta=self._meta,
            missing=0,
            prettify='false',
            ignore='all',
            ref='cogid',
            rows=False,
            subset=False,  # select only a subset of the data
            taxa='taxa',
            threshold=0.6,  # threshold for flat clustering
            tree_calc='neighbor')

        if fileformat in ['triple', 'triples', 'triples.tsv']:
            return tsv2triple(self, keywords['filename'] + '.' + fileformat)

        if fileformat in ['paps.nex', 'paps.csv']:
            paps = self.get_paps(
                ref=keywords['ref'], entry=keywords['entry'], missing=keywords['missing'])
            kw = dict(filename=keywords['filename'] + '.paps')
            if fileformat == 'paps.nex':
                kw['missing'] = keywords['missing']
                return pap2nex(self.cols, paps, **kw)
            return pap2csv(self.cols, paps, **kw)

        # simple printing of taxa
        if fileformat == 'taxa':
            assert hasattr(self, 'taxa')
            return util.write_text_file(keywords['filename'] + '.taxa', self.cols)

        # csv-output
        if fileformat in ['csv', 'qlc', 'tsv']:

            # get the header line
            header = sorted(
                [s for s in set(self._alias.values()) if s in self._header],
                key=lambda x: self._header[x])
            header = [h.upper() for h in header]

            self._meta.setdefault('taxa', self.cols)

            # get the data, in case a subset is chosen
            if not keywords['subset']:
                # write stuff to file
                return wl2qlc(header, self._data, **keywords)

            cols, rows = keywords['cols'], keywords['rows']

            if not isinstance(cols, (list, tuple, bool)):
                raise ValueError("[i] Argument 'cols' should be list or tuple.")
            if not isinstance(rows, (dict, bool)):
                raise ValueError("[i] Argument 'rows' should be a dictionary.")

            # check for chosen header
            if cols:
                # get indices for header
                indices = [self._header[x] for x in cols]
                header = [c.upper() for c in cols]
            else:
                indices = [r for r in range(len(self.header))]

            if rows:
                stmts = []
                for key, value in rows.items():
                    if key == 'ID':
                        stmts += ["key " + value]
                    else:
                        idx = self._header[key]
                        stmts += ["line[{0}] ".format(idx) + value]

            log.debug("calculated what should be excluded")

            # get the data
            out = {}
            for key, line in self._data.items():
                log.debug(key)

                if rows:
                    # the row restrictions are code fragments that are
                    # evaluated for each data line
                    if eval(" and ".join(stmts)):
                        out[key] = [line[i] for i in indices]
                else:
                    out[key] = [line[i] for i in indices]

            log.debug("passing data to wl2qlc")
            return wl2qlc(header, out, **keywords)

        # output dst-format (phylip)
        if fileformat == 'dst':
            # check for distances as keyword
            if 'distances' not in self._meta:
                self._meta['distances'] = wl2dst(self, **keywords)

            out = matrix2dst(self._meta['distances'], self.taxa,
                    stamp=keywords['stamp'], taxlen=keywords.get('taxlen', 0))
            return _write_file(keywords['filename'], out, fileformat)

        # output tre-format (newick)
        if fileformat in ['tre', 'nwk']:
            if 'tree' not in self._meta:
                # check for distances
                if 'distances' not in self._meta:
                    self._meta['distances'] = wl2dst(self)
                # we look up a function to calculate a tree in the cluster module:
                tree = getattr(cluster, keywords['tree_calc'])(
                    self._meta['distances'], self.cols, distances=keywords['distances'])
            else:
                tree = self._meta['tree']

            return _write_file(keywords['filename'], '{0}'.format(tree), fileformat)

        if fileformat in ['cluster', 'groups']:
            if 'distances' not in self._meta:
                self._meta['distances'] = wl2dst(self)  # check for keywords

            if 'groups' not in self._meta:
                self._meta['groups'] = cluster.matrix2groups(
                    keywords['threshold'], self._meta['distances'], self.taxa)
            lines = []
            for taxon, group in sorted(self._meta['groups'].items(), key=lambda x: x[0]):
                lines.append('{0}\t{1}'.format(taxon, group))
            return _write_file(keywords['filename'], lines, fileformat)

        if fileformat in ['starling', 'star.csv']:
            # inline data-check: render empty (zero) cells as '-'
            l = lambda x: '-' if x == 0 else x

            lines = []
            if 'cognates' not in keywords:
                lines.append('ID\tConcept\t' + '\t'.join(self.taxa))
                for i, concept in enumerate(self.concepts):
                    for line in self.get_list(row=concept, entry=keywords['entry']):
                        lines.append(
                            str(i + 1) + '\t' + concept + '\t' + '\t'.join(
                                [l(t) for t in line]))
            else:
                lines.append(
                    'ID\tConcept\t' + '\t'.join(
                        ['{0}\t COG'.format(t) for t in self.taxa]))
                for i, concept in enumerate(self.concepts):
                    cogs = self.get_list(row=concept, entry=keywords['cognates'])
                    for j, line in enumerate(
                            self.get_list(row=concept, entry=keywords['entry'])):
                        part = '\t'.join(
                            '{0}\t{1}'.format(l(a), b) for a, b in zip(line, cogs[j]))
                        lines.append(util.tabjoin(i + 1, concept, part))

            return _write_file(
                keywords['filename'], lines, 'starling_' + keywords['entry'] + '.csv')

        if fileformat == 'multistate.nex':
            if not keywords['filename'].endswith('.multistate.nex'):
                keywords['filename'] += '.multistate.nex'

            matrix = wl2multistate(self, keywords['ref'], keywords['missing'])
            return multistate2nex(self.taxa, matrix, keywords['filename'])

        if fileformat == 'separated':
            if not os.path.isdir(keywords['filename']):
                os.mkdir(keywords['filename'])

            for l in self.cols:
                lines = [''] if 'ignore_keys' in keywords else ['ID\t']
                lines[0] += '\t'.join(x.upper() for x in keywords['entries'])
                for key in self.get_list(col=l, flat=True):
                    line = [] if 'ignore_keys' in keywords else [key]
                    for entry in keywords['entries']:
                        tmp = self[key, entry]
                        if isinstance(tmp, list):
                            tmp = ' '.join([str(x) for x in tmp])
                        line += [tmp]
                    lines.append('\t'.join('{0}'.format(x) for x in line))
                _write_file('{0}/{1}'.format(keywords['filename'], l), lines, 'tsv')
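 
_output is internal; it is normally reached through the public output method of a wordlist. A hedged sketch under that assumption, with a hypothetical qlc file that carries a 'cogid' column:

# 'harry.qlc' is a hypothetical input file with cognate judgments
from lingpy import Wordlist

wl = Wordlist('harry.qlc')
wl.output('tsv', filename='harry', ignore='all', prettify=False)
wl.output('dst', filename='harry')   # phylip distances, requires cognate ids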
Example #24
def wl2qlc(header, data, filename='', formatter='concept', **keywords):
    """
    Write the basic data of a wordlist to file.
    """
    util.setdefaults(keywords,
                     ignore=['taxa', 'doculects', 'msa'],
                     fileformat='qlc',
                     prettify=True)
    if keywords['ignore'] == 'all':
        keywords['ignore'] = [
            'taxa', 'scorer', 'meta', 'distances', 'doculects', 'msa', 'json'
        ]

    formatter = formatter.upper()
    if not filename:
        filename = rcParams['filename']

    # create output string
    out = '# Wordlist\n' if keywords['prettify'] else ''

    # write meta to file
    meta = keywords.get("meta", {})
    kvpairs = {}
    jsonpairs = {}
    msapairs = {}
    trees = {}
    distances = ''
    taxa = ''
    scorer = ''

    for k, v in meta.items():
        # simple key-value-pairs
        if isinstance(v, (str, int)) or k == "tree":
            kvpairs[k] = v
        elif k == 'msa' and k not in keywords['ignore']:
            # go a level deeper, checking for keys
            for ref in v:
                if ref not in msapairs:
                    msapairs[ref] = {}
                for a, b in v[ref].items():
                    msapairs[ref][a] = b
        elif k == 'distances':
            distances = matrix2dst(v, meta['taxa'])
        elif k in ['taxa', 'doculect', 'taxon', 'doculects']:
            # we need to find a better solution here, since it is not nice to
            # have taxa written to json again and again
            pass
        elif k == 'trees' and k not in keywords['ignore']:
            trees = ''
            for key, value in v.items():
                trees += '<tre id="{0}">\n{1}\n</tre>\n'.format(key, value)
        elif k == 'scorer' and k not in keywords['ignore']:
            for key, value in v.items():
                scorer += '<{2} id="{0}">\n{1}</{2}>\n\n'.format(
                    key, scorer2str(value), k)
        else:
            # check whether serialization works
            try:
                json.dumps(v)
                jsonpairs[k] = v
            except TypeError:
                pass

    if kvpairs and 'meta' not in keywords['ignore']:
        out += '\n# META\n' if keywords['prettify'] else ''
        for k, v in sorted(kvpairs.items(), key=lambda x: x[0]):
            out += '@{0}:{1}\n'.format(k, v)
    if taxa and keywords['taxa']:
        out += '\n# TAXA\n<taxa>\n' + taxa + '\n</taxa>\n'
    if jsonpairs and 'json' not in keywords['ignore']:
        out += "@json: " + json.dumps(jsonpairs) + '\n'
    if msapairs and 'msa' not in keywords['ignore']:
        for ref in msapairs:
            out += "\n# MSA reference: {0}\n".format(ref)
            for k, v in msapairs[ref].items():
                if 'consensus' in v:
                    out += '#\n<msa '
                    out += 'id="{0}" ref="{1}" consensus="{2}">\n'.format(
                        k, ref, ' '.join(v['consensus']))
                else:
                    out += '#\n<msa id="{0}" ref="{1}">\n'.format(k, ref)
                outs = msa2str(v, wordlist=True)
                out += outs
                out += "</msa>\n"

    if distances and 'distances' not in keywords['ignore']:
        out += '\n# DISTANCES\n<dst>\n'
        out += distances + '</dst>\n'

    if trees:
        out += '\n# TREES\n' + trees

    if scorer and 'scorer' not in keywords['ignore']:
        out += '\n# SCORER\n' + scorer

    out += '\n# DATA\n' if keywords['prettify'] else ''
    out += 'ID\t' + '\t'.join(header) + '\n'

    # check for gloss in header to create nice output format
    if formatter in header:
        idx = header.index(formatter)
        formatter = None
        sorted_data = sorted(data.keys(), key=lambda x: data[x][idx])
    elif len(formatter.split(',')) == 2:
        idxA, idxB = formatter.split(',')
        idxA = header.index(idxA)
        idxB = header.index(idxB)
        idx = idxA
        formatter = None
        sorted_data = sorted(data.keys(),
                             key=lambda x: (data[x][idxA], data[x][idxB]))
    else:
        idx = False
        formatter = ''
        sorted_data = sorted(data.keys())

    for key in sorted_data:
        # get the line
        line = data[key]

        # check for formatter (idx is False when no formatter column is used)
        if idx is not False and idx in range(len(line)):
            if line[idx] != formatter:
                out += '#\n' if keywords['prettify'] else ''
                formatter = line[idx]

        # add the key
        out += str(key)

        # add the rest of the values
        for value in line:
            if type(value) == list:
                try:
                    out += '\t' + ' '.join(value)
                except TypeError:
                    out += '\t' + ' '.join([str(v) for v in value])
            elif type(value) == int:
                out += '\t' + str(value)
            elif type(value) == float:
                out += '\t{0:.4f}'.format(value)
            elif value is None:
                out += '\t'
            else:
                out += '\t{:}'.format(value)
        out += '\n'

    util.write_text_file(filename + '.' + keywords['fileformat'],
                         out + keywords.get('stamp', ''),
                         normalize="NFC")
    return
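
A hedged sketch with a toy header/data pair: keys of data are row ids, values are lists aligned with header. Note that the header entries are expected in upper case, matching the formatter lookup above; the data below is invented for illustration.

# writes 'toy.qlc', grouped by concept; assumes wl2qlc is in scope
header = ['DOCULECT', 'CONCEPT', 'IPA']
data = {
    1: ['German', 'hand', 'hant'],
    2: ['English', 'hand', 'hænd'],
}
wl2qlc(header, data, filename='toy', formatter='concept')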
Example #25
def psa2html(infile, **kw):
    """
    Function converts a PSA-file into colored html-format.
    """
    util.setdefaults(
        kw,
        template=False,
        css=False,
        comment='#',
        filename=infile[:-4]+'.html',
        compact=True)

    template = util.read_text_file(kw['template'] or template_path('psa.html'))
    css = util.read_text_file(kw['css'] or template_path('psa.css'))

    data = []
    for line in util.read_text_file(infile, lines=True):
        if not line.startswith(kw['comment']):
            data.append(line)

    seq_ids = []
    pairs = []
    taxa = []
    alignments = []

    # the first non-comment line of a psa file holds the dataset name
    del data[0]

    i = 0
    while i <= len(data) - 3:
        try:
            seq_ids.append(data[i])

            datA = data[i + 1].split('\t')
            datB = data[i + 2].split('\t')

            taxonA = datA[0].strip('.')
            taxonB = datB[0].strip('.')
            almA = datA[1:]
            almB = datB[1:]

            taxa.append((taxonA, taxonB))
            pairs.append(
                (
                    '.'.join([k for k in almA if k != '-']),
                    '.'.join([k for k in almB if k != '-'])
                )
            )
            alignments.append(
                (
                    [str(a) for a in almA],
                    [str(b) for b in almB],
                    0)
            )
            assert len(alignments[-1][0]) == len(alignments[-1][1])
            i += 4
        except AssertionError:
            log.warning("Line {0} of the data is probably miscoded.".format(i + 1))
            i += 1

    def get_classes(alm):
        classes = []
        residue = '<div class="residue {1}">{0}</div>'
        for j, char in enumerate(alm):
            if char == '-':
                d = 'dolgo_GAP'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])

                # ad-hoc remapping of three classes that are named differently
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'
            classes += [residue.format(char, d)]
        return ''.join(classes)

    out = '<table>\n'
    for i, (a, b, c) in enumerate(alignments):
        clsA = get_classes(a)
        clsB = get_classes(b)

        ids = int(100 * pid(a, b) + 0.5)

        out += '<tr class="head">'
        out += '<td colspan=2 class="head"><b>Alignment {0}:</b> <i>{1}</i>, PID: {2}</td></tr>'.format(
            i + 1,
            seq_ids[i],
            ids
        )
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][0])
        out += '<td class="psa">{0}</td>'.format(clsA)
        out += '</tr>'
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][1])
        out += '<td class="psa">{0}</td>'.format(clsB)
        out += '</tr>'
        out += '<tr><td colspan=2></td></tr>'

    out += '</table>'

    html = template.format(alignments=out, css=css)

    if kw['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    util.write_text_file(kw['filename'], html)
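
A hedged usage sketch for psa2html; note that the default output name simply strips the last four characters of the input path, so a '.psa' suffix is assumed.

# 'harry.psa' is a hypothetical pairwise-alignment file
from lingpy.convert.html import psa2html

psa2html('harry.psa', compact=True)   # writes 'harry.html'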
Example #26
def wl2qlc(
        header,
        data,
        filename='',
        formatter='concept',
        **keywords):
    """
    Write the basic data of a wordlist to file.
    """
    util.setdefaults(
        keywords,
        ignore=['taxa', 'doculects', 'msa'],
        fileformat='qlc',
        prettify=True)
    if keywords['ignore'] == 'all':
        keywords['ignore'] = [
            'taxa', 'scorer', 'meta', 'distances', 'doculects', 'msa', 'json']

    formatter = formatter.upper()
    if not filename:
        filename = rcParams['filename']

    # create output string
    out = '# Wordlist\n' if keywords['prettify'] else ''

    # write meta to file
    meta = keywords.get("meta", {})
    kvpairs = {}
    jsonpairs = {}
    msapairs = {}
    trees = {}
    distances = ''
    taxa = ''
    scorer = ''

    for k, v in meta.items():
        # simple key-value-pairs
        if isinstance(v, (text_type, int)) or k == "tree":
            kvpairs[k] = v
        elif k == 'msa' and k not in keywords['ignore']:
            # go a level deeper, checking for keys
            for ref in v:
                if ref not in msapairs:
                    msapairs[ref] = {}
                for a, b in v[ref].items():
                    msapairs[ref][a] = b
        elif k == 'distances':
            distances = matrix2dst(v, meta['taxa'])
        elif k in ['taxa', 'doculect', 'taxon', 'doculects']:
            # we need to find a better solution here, since it is not nice to
            # have taxa written to json again and again
            pass
        elif k == 'trees' and k not in keywords['ignore']:
            trees = ''
            for key, value in v.items():
                trees += '<tre id="{0}">\n{1}\n</tre>\n'.format(key, value)
        elif k == 'scorer' and k not in keywords['ignore']:
            for key, value in v.items():
                scorer += '<{2} id="{0}">\n{1}</{2}>\n\n'.format(
                    key, scorer2str(value), k)
        else:
            # check whether serialization works
            try:
                json.dumps(v)
                jsonpairs[k] = v
            except TypeError:
                pass

    if kvpairs and 'meta' not in keywords['ignore']:
        out += '\n# META\n' if keywords['prettify'] else ''
        for k, v in sorted(kvpairs.items(), key=lambda x: x[0]):
            out += '@{0}:{1}\n'.format(k, v)
    if taxa and keywords['taxa']:
        out += '\n# TAXA\n<taxa>\n' + taxa + '\n</taxa>\n'
    if jsonpairs and 'json' not in keywords['ignore']:
        out += "@json: " + json.dumps(jsonpairs) + '\n'
    if msapairs and 'msa' not in keywords['ignore']:
        for ref in msapairs:
            out += "\n# MSA reference: {0}\n".format(ref)
            for k, v in msapairs[ref].items():
                if 'consensus' in v:
                    out += '#\n<msa '
                    out += 'id="{0}" ref="{1}" consensus="{2}">\n'.format(
                        k, ref, ' '.join(v['consensus']))
                else:
                    out += '#\n<msa id="{0}" ref="{1}">\n'.format(k, ref)
                outs = msa2str(v, wordlist=True)
                out += outs
                out += "</msa>\n"

    if distances and 'distances' not in keywords['ignore']:
        out += '\n# DISTANCES\n<dst>\n'
        out += distances + '</dst>\n'

    if trees:
        out += '\n# TREES\n' + trees

    if scorer and 'scorer' not in keywords['ignore']:
        out += '\n# SCORER\n' + scorer

    out += '\n# DATA\n' if keywords['prettify'] else ''
    out += 'ID\t' + '\t'.join(header) + '\n'

    # check for gloss in header to create nice output format
    if formatter in header:
        idx = header.index(formatter)
        formatter = None
        sorted_data = sorted(data.keys(), key=lambda x: data[x][idx])
    elif len(formatter.split(',')) == 2:
        idxA, idxB = formatter.split(',')
        idxA = header.index(idxA)
        idxB = header.index(idxB)
        idx = idxA
        formatter = None
        sorted_data = sorted(data.keys(), key=lambda x: (
            data[x][idxA], data[x][idxB]))
    else:
        idx = False
        formatter = ''
        sorted_data = sorted(data.keys())

    for key in sorted_data:
        # get the line
        line = data[key]

        # check for formatter (idx is False when no formatter column is used)
        if idx is not False and idx in range(len(line)):
            if line[idx] != formatter:
                out += '#\n' if keywords['prettify'] else ''
                formatter = line[idx]

        # add the key
        out += text_type(key)

        # add the rest of the values
        for value in line:
            if type(value) == list:
                try:
                    out += '\t' + ' '.join(value)
                except TypeError:
                    out += '\t' + ' '.join([text_type(v) for v in value])
            elif type(value) == int:
                out += '\t' + text_type(value)
            elif type(value) == float:
                out += '\t{0:.4f}'.format(value)
            elif value is None:
                out += '\t'
            else:
                out += '\t{:}'.format(value)
        out += '\n'

    util.write_text_file(
        filename + '.' + keywords['fileformat'],
        out + keywords.get('stamp', ''),
        normalize="NFC")
    return
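
A hedged sketch of the two-key formatter branch above: passing 'concept,cogid' sorts the rows by concept first and by cognate id second, inserting a prettified separator whenever the formatter value changes. The data is invented for illustration.

# writes 'toy2.qlc'; assumes wl2qlc is in scope
header = ['DOCULECT', 'CONCEPT', 'COGID']
data = {
    1: ['German', 'hand', 1],
    2: ['English', 'hand', 1],
    3: ['French', 'hand', 2],
}
wl2qlc(header, data, filename='toy2', formatter='concept,cogid')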
Example #27
def _list2msa(msa_lines, ids=False, header=True, normalize=False, **keywords):
    """
    Function retrieves a dictionary from a list of MSA strings.

    """
    setdefaults(keywords, seq_id='-', dataset='-', input_file='dummy')
    d = dict(ID=[],
             taxa=[],
             alignment=[],
             seqs=[],
             infile=keywords['input_file'])

    if header:
        start = 2
        d['dataset'] = msa_lines[0]
        d['seq_id'] = msa_lines[1]
    else:
        start = 0
        d['dataset'] = keywords['dataset']
        d['seq_id'] = keywords['seq_id']

    for i, line in enumerate(msa_lines[start:]):
        idx = 1 if ids else 0

        # check for specific id
        if line[0] in [
                '0',
                'LOCAL',
                'CROSSED',
                'SWAPS',
                'MERGE',
                'COMPLEX',
        ]:
            if line[idx] == 'LOCAL':
                d['local'] = []
                for j, x in enumerate(line[idx + 1:]):
                    if x == '*':
                        d['local'] += [j]
            elif line[idx] in ['CROSSED', 'SWAPS']:
                d['swaps'] = []
                swapline = [x for x in line[idx + 1:]]
                j = 0
                while swapline:
                    x = swapline.pop(0)
                    if x == '+':
                        d['swaps'] += [(j, j + 1, j + 2)]
                        swapline.pop(0)
                        swapline.pop(0)
                        j += 2
                    j += 1
            elif line[idx] in ['COMPLEX', 'MERGE']:
                d['merge'] = {}
                mergeline = [x for x in line[idx + 1:]]
                k = 0
                merge = False
                for j, m in enumerate(mergeline):
                    if m == '<':
                        merge = True
                    if m == '>':
                        merge = False

                    d['merge'][j] = k
                    if not merge:
                        k += 1
            else:
                d[line[idx].lower()] = line[idx + 1:]

        else:
            if ids:
                try:
                    d['ID'] += [int(line[0])]
                except ValueError:
                    d['ID'] += [line[0]]
            else:
                d["ID"] += [i]
            d["taxa"] += [line[idx].rstrip('.')]
            d["seqs"] += [' '.join([l for l in line[idx + 1:] if l != '-'])]
            d["alignment"] += [line[idx + 1:]]

    # normalize the alignment if the option is chosen
    if normalize:
        d['alignment'] = normalize_alignment(d['alignment'])

    return d
Example #28
def calculate_data(wordlist,
                   data,
                   taxa='taxa',
                   concepts='concepts',
                   ref='cogid',
                   **keywords):
    """
    Manipulate a wordlist object by adding different kinds of data.

    Parameters
    ----------
    data : str
        The type of data that shall be calculated. Currently supports

        * "tree": calculate a reference tree based on shared cognates
        * "dst": get distances between taxa based on shared cognates
        * "cluster": cluster the taxa into groups using different methods


    """
    logger = log.get_logger()
    util.setdefaults(keywords,
                     distances=False,
                     tree_calc="upgma",
                     cluster="upgma",
                     force=False,
                     threshold=0.5,
                     cluster_method='upgma')

    # get taxa for current calculation
    these_taxa = getattr(wordlist, taxa)

    # calculate distances
    if data in ['distances', 'dst']:
        wordlist._meta['distances'] = wl2dst(wordlist, taxa, concepts, ref,
                                             **keywords)
    elif data in ['diversity', 'div']:
        etd = wordlist.get_etymdict(ref=ref)
        wordlist._meta['diversity'] = \
            (len(etd) - wordlist.height) / (len(wordlist) - wordlist.height)
    elif data in ['tre', 'tree', 'nwk']:
        if 'distances' not in wordlist._meta:
            wordlist._meta['distances'] = \
                wl2dst(wordlist, taxa, concepts, ref, **keywords)
        distances = wordlist._meta['distances']
        if 'tree' in wordlist._meta and not keywords['force']:
            logger.warning("Reference tree has already been calculated, "
                           "force overwrite by "
                           "setting 'force' to 'True'.")
            return
        wordlist._meta['tree'] = clustering.matrix2tree(
            distances, these_taxa, keywords['tree_calc'],
            keywords['distances'])

    elif data in ['groups', 'cluster']:
        if 'distances' not in wordlist._meta:
            distances = wl2dst(wordlist, taxa, concepts, ref, **keywords)
        else:
            distances = wordlist._meta['distances']
        if 'groups' in wordlist._meta and not keywords['force']:
            logger.warning("Groups have already been calculated, "
                           "force overwrite by "
                           "setting 'force' to 'True'.")
            return
        wordlist._meta['groups'] = clustering.matrix2groups(
            keywords['threshold'], distances, these_taxa,
            keywords['cluster_method'])
    log.info("Successfully calculated {0}.".format(data))
Example #29
    def _export(
            self,
            fileformat,
            sections=None,
            entries=None,
            entry_sep='',
            item_sep='',
            template='',
            exclude=None,
            entry_start='',
            entry_close='',
            **keywords):
        """
        Export a wordlist to various file formats.
        """
        if not sections:
            if fileformat == 'txt':
                sections = dict(
                    h1=('concept', '\n# Concept: {0}\n'),
                    h2=('cogid', '## Cognate-ID: {0}\n'))
            elif fileformat == 'tex':
                sections = dict(
                    h1=('concept', r"\section{{Concept: ``{0}''}}" + '\n'),
                    h2=('cogid', r"\subsection{{Cognate Set: ``{0}''}}" + '\n'))
            elif fileformat == 'html':
                sections = dict(
                    h1=('concept', '<h1>Concept: {0}</h1>'),
                    h2=('cogid', '<h2>Cognate Set: {0}</h2>'))

        if not entries:
            if fileformat == 'txt':
                entries = [('language', '{0} '), ('ipa', '{0}\n')]
            elif fileformat == 'tex':
                entries = [('language', '{0} '), ('ipa', '[{0}]' + '\n')]
            elif fileformat == 'html':
                entries = [('language', '{0}&nbsp;'), ('ipa', '[{0}]\n')]

        util.setdefaults(keywords, filename=rcParams['filename'])

        # get the temporary dictionary
        out = wl2dict(self, sections, entries, exclude)

        # assign the output string
        out_string = ''

        # iterate over the dictionary and start to fill the string
        for key in sorted(out, key=lambda x: str(x).lower()):
            # write key to file
            out_string += key[1]

            # reassign tmp
            tmp = out[key]

            # set the pointer and the index
            pointer = {0: [tmp, sorted(tmp.keys())]}

            while True:
                idx = max(pointer.keys())

                # check for type of current point
                if isinstance(tmp, dict):
                    if pointer[idx][1]:
                        next_key = pointer[idx][1].pop()
                        out_string += next_key[1]
                        tmp = pointer[idx][0][next_key]
                        if isinstance(tmp, dict):
                            pointer[idx + 1] = [tmp, sorted(tmp.keys())]
                        else:
                            pointer[idx + 1] = [tmp, tmp]
                    else:
                        del pointer[idx]
                        if idx == 0:
                            break
                else:
                    tmp_strings = []
                    for line in sorted(tmp):
                        tmp_strings += [item_sep.join(line)]
                    out_string += entry_start + entry_sep.join(tmp_strings) + entry_close
                    tmp = pointer[idx - 1][0]
                    del pointer[idx]

        if fileformat == 'tex':
            out_string = out_string.replace('_', r'\_')
        tmpl = util.read_text_file(template) if template else '{0}'
        _write_file(keywords['filename'], tmpl.format(out_string), fileformat)
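
_export, too, is internal; a hedged sketch of the public route, assuming a Wordlist whose columns include the defaults used above ('concept', 'cogid', 'language', 'ipa'):

# 'harry.qlc' is a hypothetical input file
from lingpy import Wordlist

wl = Wordlist('harry.qlc')
wl.export('txt', filename='harry')   # concept/cognate-grouped plain text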
Example #30
    def align(self, **keywords):
        """
        Align a pair of sequences or multiple sequence pairs.

        Parameters
        ----------
        gop : int (default=-1)
            The gap opening penalty (GOP).
        scale : float (default=0.5)
            The gap extension scale; the gap extension penalty (GEP) is
            derived from the GOP with the help of this scaling factor.
        mode : {"global","local","overlap","dialign"}
            The alignment mode, see :evobib:`List2012a` for details.
        factor : float (default = 0.3)
            The factor by which matches in identical prosodic position are
            increased.
        restricted_chars : str (default="T_")
            The restricted chars that function as an indicator of syllable or
            morpheme breaks for secondary alignment, see :evobib:`List2012c`
            for details.
        distance : bool (default=False)
            If set to *True*, return the distance instead of the similarity
            score. Distance is calculated using the formula by
            :evobib:`Downey2008`.
        model : { None, ~lingpy.data.model.Model }
            Specify the sound class model that shall be used for the analysis.
            If no model is specified, the default model of :evobib:`List2012a`
            will be used.
        pprint : bool (default=False)
            If set to *True*, the alignments are printed to the screen.

        """
        setdefaults(
            keywords,
            gop=-1,
            scale=0.5,
            mode='global',
            factor=0.3,
            restricted_chars='T_',
            distance=False,
            model=rcParams['sca'],
            pprint=False,
            transform=rcParams['align_transform'])

        if hasattr(self, 'model'):
            if keywords['model'] != self.model:
                self._set_model(**keywords)
        else:
            self._set_model(**keywords)

        # create the alignments array
        self._alignments = calign.align_pairs(
            self.classes,
            self.weights,
            self.prostrings,
            keywords['gop'],
            keywords['scale'],
            keywords['factor'],
            self.scoredict,
            keywords['mode'],
            keywords['restricted_chars'],
            distance=1 if keywords['distance'] else 0)

        # switch back to alignments
        self.alignments = []
        for i, (almA, almB, sim) in enumerate(self._alignments):
            self.alignments.append((
                class2tokens(self.tokens[i][0], almA, local=keywords['mode'] == "local"),
                class2tokens(self.tokens[i][1], almB, local=keywords['mode'] == "local"),
                sim))

        # print the alignments, if this is chosen
        as_string(self, pprint=keywords['pprint'])
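
A hedged usage sketch for the align method, following the keyword table in the docstring above; the two input strings are toy data.

from lingpy import Pairwise

pair = Pairwise('waldemar', 'woldemort')   # a pair of toy sequences
pair.align(mode='global', distance=True, pprint=True)
print(pair.alignments)                     # [(almA, almB, distance)]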