Example no. 1
def tsv2triple(wordlist, outfile=None):
    """
    Function converts a wordlist to a triple data structure.

    Notes
    -----
    The basic values of which the triples consist are:
      * ID (the ID in the TSV file)
      * COLUMN (the column in the TSV file)
      * VALUE (the entry in the TSV file)
    """
    tstore = []
    for head in wordlist.header:
        log.debug('tsv2triple: ' + head)
        for key in wordlist:
            tstore.append((key, head.upper(), wordlist[key, head]))

    if outfile:
        out = ''
        for a, b, c in tstore:
            if isinstance(c, list):
                c = ' '.join([text_type(x) for x in c])
            if c != '-':
                out += '{0}\t{1}\t{2}\n'.format(a, b, c)
        util.write_text_file(outfile, out, normalize='NFC')
    return tstore
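A minimal sketch of the (ID, COLUMN, VALUE) structure described in the docstring, using a plain dict as a hypothetical stand-in for a LingPy wordlist:

rows = {
    1: {'doculect': 'German', 'concept': 'hand', 'ipa': 'hant'},
    2: {'doculect': 'English', 'concept': 'hand', 'ipa': 'hænd'},
}

tstore = []
for key, row in rows.items():
    for head, value in row.items():
        # one triple per cell: (row ID, column name, cell value)
        tstore.append((key, head.upper(), value))

print(tstore[0])  # (1, 'DOCULECT', 'German')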
Example no. 2
def pap2nex(taxa, paps, missing=0, filename='', datatype='STANDARD'):
    """
    Function converts a list of paps into nexus file format.

    Parameters
    ----------
    taxa : list
        List of taxa.
    paps : {list, dict}
        A two-dimensional list with the first dimension being identical to the
        number of taxa and the second dimension being identical to the number
        of paps. If a dictionary is passed, each key represents a given pap.
        The following two structures will thus be treated identically::
            
          >>> paps = [[1,0],[1,0],[1,0]] # two languages, three paps
          >>> paps = {1:[1,0], 2:[1,0], 3:[1,0]} # two languages, three paps
    
    missing : {str, int} (default=0)
        Indicate how missing characters are represented in the original data.

    """
    out = '#NEXUS\n\nBEGIN DATA;\nDIMENSIONS ntax={0} NCHAR={1};\n'
    out += "FORMAT DATATYPE={5} GAP=- MISSING={2} interleave=yes;\n"
    out += "MATRIX\n\n{3}\n;\n\nEND;\n"
    out += "[PAPS-REFERENCE]\n{4}"

    # get longest taxon
    maxTax = max([len(taxon) for taxon in taxa])
    paps_ref = ""

    # check whether paps are dict or list
    if hasattr(paps, 'keys'):
        new_paps = [paps[k] for k in sorted(paps)]
        reference = [k for k in sorted(paps)]
    else:
        new_paps = paps
        reference = [k for k in range(1, len(paps) + 1)]

    # create reference
    ref_string = ''
    for i, ref in enumerate(reference):
        ref_string += '[{0} :: {1}]\n'.format(i, ref)
    # create the matrix
    matrix = ""

    for i, taxon in enumerate(taxa):
        tmp = '{0:XXX} '
        matrix += tmp.replace('XXX', str(maxTax)).format(taxon)
        matrix += ''.join([str(itm[i]) for itm in new_paps])
        matrix += '\n'

    if not filename:
        return out.format(len(taxa), len(paps), missing, matrix, ref_string,
                          datatype)
    util.write_text_file(
        filename + '.nex',
        out.format(len(taxa), len(paps), missing, matrix, ref_string,
                   datatype))
    return
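A small sketch of the two `paps` layouts the docstring declares equivalent; the `normalize` helper below is hypothetical, but mirrors the `hasattr(paps, 'keys')` branch above:

paps_list = [[1, 0], [1, 0], [1, 0]]           # two languages, three paps
paps_dict = {1: [1, 0], 2: [1, 0], 3: [1, 0]}  # same data, keyed by pap ID

def normalize(paps):
    # dicts are ordered by key; plain lists get a 1-based reference
    if hasattr(paps, 'keys'):
        return [paps[k] for k in sorted(paps)], sorted(paps)
    return paps, list(range(1, len(paps) + 1))

assert normalize(paps_list) == normalize(paps_dict)
print(normalize(paps_dict)[1])  # [1, 2, 3]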
Example no. 3
def multistate2nex(taxa, matrix, filename='', missing="?"):
    """
    Convert the data in a given wordlist to NEXUS-format for multistate analyses in PAUP.
    
    Parameters
    ----------
    taxa : list
        The list of taxa that shall be written to file.
    matrix : list
        The multi-state matrix with the first dimension indicating the taxa,
        and the second their states.
    filename : str (default="")
        The name of the file to which the data will be written. If no
        filename is specified, a ValueError is raised.
    """

    # set up the nexus template
    nexus = """#NEXUS

BEGIN DATA;
DIMENSIONS ntax={ntax} NCHAR={nchar};
FORMAT RESPECTCASE DATATYPE=STANDARD symbols="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOP0123456789" GAP=? MISSING={missing} interleave=yes;
OPTIONS MSTAXA = POLYMORPH;

MATRIX

{matrix}

END;
"""

    # calculate maximal length of taxon strings
    tlen = max([len(t) for t in taxa])

    # calculate the matrix-text in the nexus template
    matrix_text = ""
    for taxon, line in zip(taxa, matrix):
        ntaxon = taxon + tlen * ' ' + ' '
        ntaxon = ntaxon[:tlen]
        matrix_text += "{0} {1}\n".format(ntaxon, ''.join(line))

    if filename:
        util.write_text_file(
            filename,
            nexus.format(
                ntax=len(taxa),
                nchar=len(matrix[0]),
                matrix=matrix_text,
                missing=missing
            )
        )
    else:
        raise ValueError("[!] A wrong filename was specified!")
    return
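A quick illustration of the padding trick used for the taxon column above: each name is right-padded with spaces and then cut to the width of the longest name, so the matrix rows line up (toy names):

taxa = ['German', 'English', 'Dutch']
tlen = max(len(t) for t in taxa)
for taxon in taxa:
    # pad with spaces, then truncate to the common width
    ntaxon = (taxon + tlen * ' ')[:tlen]
    print(repr(ntaxon))
# 'German '
# 'English'
# 'Dutch  '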
Example no. 4
def _export_score_dict(score_dict):
    """
    Function exports a scoring dictionary to a csv-file.

    @todo: This function should be moved to another module.
    """
    letters = list(set([key[0] for key in score_dict.keys()]))
    rows = [['+'] + letters]
    for l1 in letters:
        rows.append([l1] + [str(score_dict[(l1, l2)]) for l2 in letters])
    util.write_text_file('score_dict.csv', '\n'.join('\t'.join(row) for row in rows))
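A toy scoring dictionary showing the tab-separated layout this function writes: a header row starting with '+', then one row of scores per letter:

score_dict = {
    ('a', 'a'): 5, ('a', 'b'): -2,
    ('b', 'a'): -2, ('b', 'b'): 5,
}
letters = sorted(set(key[0] for key in score_dict))
rows = [['+'] + letters]
for l1 in letters:
    rows.append([l1] + [str(score_dict[l1, l2]) for l2 in letters])
print('\n'.join('\t'.join(row) for row in rows))
# +    a    b
# a    5    -2
# b    -2   5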
Example no. 5
def check_stats(models, wordlist, filename='results.txt', pprint=False):
    results = []
    for m in models:
        p, z = tstats(wordlist, m, return_dists=True)
        results += [[m, p, z]]

    txt = ''
    for a, b, c in results:
        txt += '{0}\t{1:.2f}\t{2:.2f}\n'.format(a, b, c)
    as_string(txt, pprint)
    if filename: write_text_file(filename, txt)
Example no. 6
def pap2csv(taxa, paps, filename=''):
    """
    Write paps created by the Wordlist class to a csv-file.
    """

    out = "ID\t" + '\t'.join(taxa) + '\n'

    for key in sorted(paps):
        out += '{0}\t{1}\n'.format(key, '\t'.join(str(i) for i in paps[key]))

    if not filename:
        return out
    util.write_text_file(filename + '.csv', out)
    return
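What the returned string looks like for a toy input (the function returns it directly when no filename is given):

taxa = ['German', 'English']
paps = {1: [1, 0], 2: [1, 1]}

out = "ID\t" + '\t'.join(taxa) + '\n'
for key in sorted(paps):
    out += '{0}\t{1}\n'.format(key, '\t'.join(str(i) for i in paps[key]))
print(out)
# ID    German  English
# 1     1       0
# 2     1       1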
Example no. 7
    def test_output(self):
        fpsa = self.tmp_path('test.psa')
        write_text_file(fpsa, '\n')
        psa = PSA(text_type(fpsa))
        fname = text_type(self.tmp_path('test'))
        psa.output(fileformat='psa', filename=fname)

        psq = self.tmp_path('test.psq')
        write_text_file(psq, '\n')
        psa = PSA(text_type(psq))
        fname = text_type(self.tmp_path('test'))
        psa.output(fileformat='psq', filename=fname)

        psa = PSA(text_type(test_data('harry_potter.psa')))
        psa.align()
        psa.output(fileformat="psa", filename=fname, scores=True)
        psa.output(fileformat="psq", filename=fname)
Example no. 8
def test_output(tmppath, test_data):
    fpsa = tmppath / 'test.psa'
    write_text_file(fpsa, '\n')
    psa = PSA(str(fpsa))
    fname = str(tmppath / 'test')
    psa.output(fileformat='psa', filename=fname)

    psq = tmppath / 'test.psq'
    write_text_file(psq, '\n')
    psa = PSA(str(psq))
    fname = str(tmppath / 'test')
    psa.output(fileformat='psq', filename=fname)

    psa = PSA(str(test_data / 'harry_potter.psa'))
    psa.align()
    psa.output(fileformat="psa", filename=fname, scores=True)
    psa.output(fileformat="psq", filename=fname)
Example no. 9
def matrix2tree(matrix,
                taxa,
                tree_calc="neighbor",
                distances=True,
                filename=""):
    """
    Calculate a tree from a given distance matrix.

    Parameters
    ----------
    matrix : list
        The distance matrix to be used.
    taxa : list
        A list of the taxa in the distance matrix.
    tree_calc : str (default="neighbor")
        The method for tree calculation that shall be used. Select between:

        * "neighbor": Neighbor-joining method (:evobib:`Saitou1987`)
        * "upgma" : UPGMA method (:evobib:`Sokal1958`)

    distances : bool (default=True)
        If set to ``True``, distances will be included in the
        tree representation.
    filename : str (default='')
        If a filename is specified, the data will be written to that file.

    Returns
    -------
    tree : ~lingpy.thirdparty.cogent.tree.PhyloNode
        A ~lingpy.thirdparty.cogent.tree.PhyloNode object for handling tree
        files.
    """

    if tree_calc == 'upgma':
        algorithm = cluster.upgma
    elif tree_calc == 'neighbor':
        algorithm = cluster.neighbor
    else:
        raise ValueError(tree_calc)

    tree = cg.LoadTree(treestring=algorithm(matrix, taxa, distances))

    if not filename:
        return tree
    util.write_text_file(filename + '.nwk', text_type(tree))
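A hypothetical usage sketch, assuming `matrix2tree` as defined above and its LingPy dependencies (`cluster`, `cg`) are importable; the matrix is a toy symmetric distance matrix, and the printed Newick string is only indicative:

matrix = [
    [0.0, 0.3, 0.8],
    [0.3, 0.0, 0.7],
    [0.8, 0.7, 0.0],
]
taxa = ['German', 'English', 'Dutch']
# UPGMA joins the two closest taxa first (German/English at 0.3)
tree = matrix2tree(matrix, taxa, tree_calc='upgma', distances=False)
print(tree)  # e.g. ((German,English),Dutch);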
Example no. 10
    def diff(self, **keywords):
        """
        Write all differences between two sets to a file.

        Parameters
        ----------

        filename : str (default='eval_psa_diff')
            The name of the file to which the differences are written.

        """
        setdefaults(keywords, filename=self.gold.infile)
        if not keywords['filename'].endswith('.diff'):
            keywords['filename'] = keywords['filename'] + '.diff'

        out = []
        for i, (a,
                b) in enumerate(zip(self.gold.alignments,
                                    self.test.alignments)):
            g1, g2, g3 = a
            t1, t2, t3 = b
            maxL = max([len(g1), len(t1)])
            if g1 != t1 or g2 != t2:
                taxA, taxB = self.gold.taxa[i]
                taxlen = max(len(taxA), len(taxB))
                seq_id = self.gold.seq_ids[i]
                out.append(
                    '{0}\n{1}\t{2}\n{3}\t{4}\n{5}\n{1}\t{6}\n{3}\t{7}\n\n'.
                    format(
                        seq_id,
                        taxA,
                        '\t'.join(g1),
                        taxB,
                        '\t'.join(g2),
                        '{0}\t{1}'.format(
                            taxlen * ' ',
                            '\t'.join(['==' for x in range(maxL)])),
                        '\t'.join(t1),
                        '\t'.join(t2),
                    ))
        log.file_written(keywords['filename'])
        write_text_file(keywords['filename'], out)
Example no. 11
def pap2csv(
    taxa,
    paps,
    filename=''
):
    """
    Write paps created by the Wordlist class to a csv-file.
    """

    out = "ID\t" + '\t'.join(taxa) + '\n'
    for key in sorted(paps, key=lambda x: int(re.sub(r'[^0-9]+', '', str(x)))):
        out += '{0}\t{1}\n'.format(
            key,
            '\t'.join(str(i) for i in paps[key])
        )

    if not filename:
        return out
    util.write_text_file(filename + '.csv', out)
    return
Example no. 12
def test_write_text_file(tmppath):
    def lines_generator(n):
        for i in range(n):
            yield 'line%s' % i

    path = tmppath / 'test'
    util.write_text_file(path, 'test')
    assert util.read_text_file(path) == 'test'

    util.write_text_file(path, ['line1', 'line2'])
    assert len(util.read_text_file(path, lines=True)) == 2

    util.write_text_file(path, lines_generator(5))
    assert len(util.read_text_file(path, lines=True)) == 5
Example no. 13
    def test_write_text_file(self):
        def lines_generator(n):
            for i in range(n):
                yield 'line%s' % i

        path = self.tmp_path('test')
        util.write_text_file(path, 'test')
        self.assertEqual(util.read_text_file(path), 'test')

        util.write_text_file(path, ['line1', 'line2'])
        self.assertEqual(len(util.read_text_file(path, lines=True)), 2)

        util.write_text_file(path, lines_generator(5))
        self.assertEqual(len(util.read_text_file(path, lines=True)), 5)
Example no. 14
    def cognate_detection(self, **keywords):
        """
        Method runs a cognate detection analysis.
        """
        kw = dict(
            align_method='progressive',
            align_mode=rcParams['align_mode'],
            align_modes=rcParams['align_modes'],
            cluster_method=rcParams['lexstat_cluster_method'],
            cognate_method='sca',
            cognate_mode='overlap',
            defaults=False,
            factor=rcParams['align_factor'],
            gap_weight=rcParams['gap_weight'],
            gop=rcParams['align_gop'],
            iteration=False,
            lexstat_modes=rcParams['lexstat_modes'],
            limit=rcParams['lexstat_limit'],
            merge_vowels=rcParams['merge_vowels'],
            model=rcParams['sca'],
            export="html",
            preprocessing=False,
            preprocessing_method=rcParams['lexstat_preprocessing_method'],
            preprocessing_threshold=rcParams[
                'lexstat_preprocessing_threshold'],
            rands=rcParams['lexstat_rands'],
            ratio=rcParams['lexstat_ratio'],
            ref="customid",
            restricted_chars=rcParams['restricted_chars'],
            restriction='',
            runs=rcParams['lexstat_runs'],
            scale=rcParams['align_scale'],
            scoring_method=rcParams['lexstat_scoring_method'],
            swap_check=False,
            threshold=rcParams['lexstat_threshold'],
            tree_calc=rcParams['align_tree_calc'],
            vscale=rcParams['lexstat_vscale'],
            outfile=False,
            sonar=True,
        )

        # first load
        kw.update(keywords)
        if kw['defaults']:
            return kw

        # carry out lexstat cluster analysis
        self.lex = LexStat(self.infile, **kw)

        # reset filename if it is not defined
        kw['outfile'] = kw['outfile'] or self.lex.filename + '_lingpy'

        # check for traditional lexstat analysis
        if kw['cognate_method'] == 'lexstat':
            self.lex.get_scorer(method=kw['scoring_method'],
                                modes=kw['lexstat_modes'],
                                **kw)

        self.lex.cluster(method=kw['cognate_method'],
                         mode=kw['cognate_mode'],
                         **kw)

        # align the data
        self.alms = Alignments(self.lex, **kw)
        kw['scoredict'] = self.lex.cscorer \
            if kw['cognate_method'] == 'lexstat' else self.lex.bscorer

        self.alms.align(method=kw['align_method'],
                        mode=kw['align_mode'],
                        modes=kw['align_modes'],
                        **kw)

        if 'tsv' in kw['export']:
            self.alms.output('tsv',
                             filename=kw['outfile'],
                             ignore=['scorer', 'json', 'taxa', 'msa'],
                             **kw)
        if 'html' in kw['export']:
            corrs, occs = get_correspondences(self.alms, kw['ref'])

            # serialize the wordlist
            wl = {}
            for concept in self.alms.concepts:
                entries = self.alms.get_list(concept=concept, flat=True)
                cogids = [self.alms[idx, kw['ref']] for idx in entries]
                words = [self.alms[idx, 'ipa'] for idx in entries]
                alms = [self.alms[idx, 'alignment'] for idx in entries]
                langs = [self.alms[idx, 'doculect'] for idx in entries]

                checkalm = lambda x: x if type(x) == str else ' '.join(x)

                wl[concept] = [
                    list(k) for k in sorted(zip(
                        langs,
                        [str(x) for x in entries],
                        words,
                        [str(x) for x in cogids],
                        [checkalm(x) for x in alms],
                    ),
                                            key=lambda x: int(x[3]))
                ]

            # make simple gloss id for internal use as id
            gloss2id = list(
                zip(self.alms.concepts,
                    [str(x) for x in range(1, len(self.alms.concepts) + 1)]))
            id2gloss = dict([[b, a] for a, b in gloss2id])
            gloss2id = dict(gloss2id)

            txt = ''
            txt += 'CORRS = ' + json.dumps(corrs) + ';\n'
            txt += 'LANGS = ' + json.dumps(self.alms.taxa) + ';\n'
            txt += 'OCCS = ' + json.dumps(occs) + ';\n'
            txt += 'WLS = ' + json.dumps(wl) + ';\n'
            txt += 'GlossId = ' + json.dumps(gloss2id) + ';\n'
            txt += 'IdGloss = ' + json.dumps(id2gloss) + ';\n'
            txt += 'FILE = "' + kw['outfile'] + '.tsv";\n'

            tpath = partial(util.data_path, 'templates')

            tname = 'jcov.{0}.html'.format(
                'remote' if 'remote' in kw['export'] else 'direct')
            content = util.read_text_file(tpath(tname))

            util.write_text_file(
                kw['outfile'] + '.html',
                content.format(
                    CORRS=txt,
                    JCOV=util.read_text_file(tpath('jcov.js')),
                    STYLE=util.read_text_file(tpath('jcov.css')),
                    VENDOR=util.read_text_file(tpath('jcov.vendor.js')),
                    DIGHL=util.read_text_file(tpath('jcov.dighl.js'))))
Example no. 15
def compare_conceptlists(
        list1,
        list2,
        output='',
        match=None,
        filename='matches',
        debug=False,
        **keywords):
    """
    Function compares two concept lists and outputs suggestions for mapping.

    Notes
    -----
    The idea is to take one concept list as the base list and then to search
    for a plausible mapping of concepts in the second list to the first list.
    All suggestions can then be output in various forms, with multiple matches
    either excluded or included, and as plain text or in other formats.

    Importantly, the output contains all matches, including non-matched items
    which occur **in the second list but not in the first list**. Non-matched
    items which occur in the first list but not in the second list are
    ignored.

    The syntax for matching types is organized as follows:

    * 1 indicates a full match between glosses, including information on part
      of speech and the like
    * 2 indicates a very good match between a full gloss and the main part of a
      gloss or the two main parts of a gloss
    * 3 indicates a very good match between the main parts of two glosses with
      non-matching information regarding part of speech
    * 4 indicates that the longest part of two glosses matches along with the
      part-of-speech information.
    * 5 indicates that the longest part of two glosses matches with
      non-matching part-of-speech information.
    * 6 indicates that the longest part of the first list is matched by one of
      the parts in the second list
    * 7 indicates that the longest part of the second list is matched by one of
      the parts in the first list
    * 8 indicates that no match could be found.
    """
    # check for match quality
    if not match:
        match = [1, 2, 3, 4, 5]

    # check for keywords
    defaults = dict(
        id_name='CONCEPTICON_ID',
        gloss_name='CONCEPTICON_GLOSS',
        match_quality='MATCH_QUALITY',
        gloss='GLOSS',
        number='NUMBER')
    defaults.update(keywords)

    # take first list as basic list
    base = csv2list(list1)
    comp = csv2list(list2)

    # get headers
    baseh, base = base[0], base[1:]
    comph, comp = comp[0], comp[1:]

    # make sure to raise if 'gloss' is not in the headers
    if (not defaults["gloss"] in baseh and not defaults["gloss"] in comph) or \
            (not defaults["number"] in baseh and not defaults["number"] in comph):
        raise ValueError(
            "[!] There is no field for '{0}' or '{1}'".format(
                defaults['gloss'],
                defaults['number']
            ) + " in the header of the input lists.")

    # get gloss indices
    bidx = baseh.index(defaults['gloss'])
    cidx = comph.index(defaults['gloss'])
    bnum = baseh.index(defaults['number'])
    cnum = comph.index(defaults['number'])

    # extract glossing information from the data
    B = {}
    idx = 1
    for i, line in enumerate(base):
        gloss = line[bidx]
        gdata = parse_gloss(gloss, output='dict')
        for gdatum in gdata:
            gdatum['number'] = line[bnum]  # we won't need "enumerate" XXX
            B[idx] = gdatum
            idx += 1

    idx = 1
    line2idx = {}
    C = {}
    for i, line in enumerate(comp):
        gloss = line[cidx]
        gdata = parse_gloss(gloss, output='dict')
        for gdatum in gdata:
            gdatum['number'] = line[cnum]  # we won't need "enumerate" XXX
            C[idx] = gdatum
            try:
                line2idx[i] += [idx]
            except KeyError:
                line2idx[i] = [idx]
            idx += 1

    # now that we have prepared the glossed lists, we compare them item by
    # item and check for similarity
    sims = []
    for i, a in sorted(B.items()):
        for j, b in sorted(C.items()):
            # first-order-match: identical glosses
            if a['gloss'] == b['gloss']:
                sims += [(i, j, 1)]
            # second-order match: identical main-parts
            elif a['main'] == b['gloss'] or a['gloss'] == b['main'] or \
                    a['main'] == b['main']:
                # best match if pos matches
                if a['pos'] == b['pos']:
                    sims += [(i, j, 2)]

                # less good match if pos mismatches
                else:
                    sims += [(i, j, 3)]
            elif a['longest_part'] == b['longest_part']:
                if a['pos'] == b['pos'] and a['pos']:
                    sims += [(i, j, 4)]
                else:
                    sims += [(i, j, 5)]
            elif b['longest_part'] in a['parts']:
                sims += [(i, j, 6)]
            elif a['longest_part'] in b['parts']:
                sims += [(i, j, 7)]

    # get the number of items which were not matched in the second list
    matched = [x[1] for x in sims if x[2] in match]
    not_matched = [idx_ for idx_ in C if idx_ not in matched]
    for idx in not_matched:
        sims += [(0, idx, 8)]

    # sort the matches, add them to a dictionary
    best = {}
    for a, b, c in sims:
        try:
            best[b] += [(a, c)]
        except KeyError:
            best[b] = [(a, c)]

    for k, v in best.items():
        best[k] = sorted(set(v), key=lambda x: x[1])

        if best[k][0][1] in matched:
            best[k] = [best[k][0]]

    # prepare the output
    out = []
    for b in best:  # in sims:
        for a, c in best[b]:
            if c in match:
                out += [(c, B[a]['gloss'], B[a]['number'], C[b]['gloss'], C[b]['number'])]
            elif c == 0:
                out += [(c, '?', '0', C[b]['gloss'], C[b]['number'])]

    if not output:
        return out

    elif output == 'tsv':
        added = []
        txt = ['\t'.join(comph) + '\t{0}\t{1}\t{2}\n'.format(
            defaults['id_name'],
            defaults['gloss_name'],
            defaults['match_quality'])]
        for i, line in enumerate(comp):
            for idx in line2idx[i]:
                if idx in best:
                    data = best[idx]
                else:
                    data = [('?', '0')]

                for a, b in data:
                    if b in match or b == 8:
                        try:
                            base_gloss = B[a]['gloss']
                            base_num = B[a]['number']
                        except KeyError:
                            base_gloss = '???'
                            base_num = '0'

                        nline = '\t'.join(line) + '\t' + str(base_num) + '\t' + \
                                base_gloss + '\t' + str(b) + '\n'
                        if nline not in added:
                            txt += [nline]
                            added += [nline]
                    else:
                        nline = '\t'.join(line) + '\t???\t???\t8\n'
                        if nline not in added:
                            txt += [nline]
                            added += [nline]

            txt[-1] += '\n'

        out = [txt[0]] + sorted(txt[1:], key=lambda x: x[x.index('\t')])
        write_text_file(filename, ''.join(out))

    if debug:
        return sims
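A toy walk-through of the matching cascade, with hand-built dicts standing in for `parse_gloss` output (`match_quality` is a hypothetical helper that mirrors the checks above):

def match_quality(a, b):
    # mirrors the cascade of checks in compare_conceptlists
    if a['gloss'] == b['gloss']:
        return 1
    if a['main'] == b['gloss'] or a['gloss'] == b['main'] or a['main'] == b['main']:
        return 2 if a['pos'] == b['pos'] else 3
    if a['longest_part'] == b['longest_part']:
        return 4 if a['pos'] == b['pos'] and a['pos'] else 5
    if b['longest_part'] in a['parts']:
        return 6
    if a['longest_part'] in b['parts']:
        return 7
    return 8

a = {'gloss': 'the hand', 'main': 'hand', 'pos': 'noun',
     'longest_part': 'hand', 'parts': ['the', 'hand']}
b = {'gloss': 'hand (noun)', 'main': 'hand', 'pos': 'noun',
     'longest_part': 'hand', 'parts': ['hand']}
print(match_quality(a, b))  # 2: identical main parts, matching pos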
Example no. 16
def psa2html(infile, **kw):
    """
    Function converts a PSA-file into colored html-format.
    """
    util.setdefaults(kw,
                     template=False,
                     css=False,
                     comment='#',
                     filename=infile[:-4] + '.html',
                     compact=True)

    template = util.read_text_file(kw['template'] or template_path('psa.html'))
    css = util.read_text_file(kw['css'] or template_path('psa.css'))

    data = []
    for line in util.read_text_file(infile, lines=True):
        if not line.startswith(kw['comment']):
            data.append(line)

    seq_ids = []
    pairs = []
    taxa = []
    alignments = []

    del data[0]

    i = 0
    while i <= len(data) - 3:
        try:
            seq_ids.append(data[i])

            datA = data[i + 1].split('\t')
            datB = data[i + 2].split('\t')

            taxonA = datA[0].strip('.')
            taxonB = datB[0].strip('.')
            almA = datA[1:]
            almB = datB[1:]

            taxa.append((taxonA, taxonB))
            pairs.append(('.'.join([k for k in almA if k != '-']),
                          '.'.join([k for k in almB if k != '-'])))
            alignments.append(
                ([str(a) for a in almA], [str(b) for b in almB], 0))
            assert len(alignments[-1][0]) == len(alignments[-1][1])
            i += 4
        except AssertionError:
            log.warning("Line {0} of the data is probably miscoded.".format(i +
                                                                            1))
            i += 1

    def get_classes(alm):
        classes = []
        residue = '<div class="residue {1}">{0}</div>'
        for j, char in enumerate(alm):
            if char == '-':
                d = 'dolgo_GAP'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])

                # bad check for three classes named differently
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'
            classes += [residue.format(char, d)]
        return ''.join(classes)

    out = '<table>\n'  # codecs.open(kw['filename'], 'w', 'utf-8')
    for i, (a, b, c) in enumerate(alignments):
        clsA = get_classes(a)
        clsB = get_classes(b)

        ids = int(100 * pid(a, b) + 0.5)

        out += '<tr class="head">'
        out += '<td colspan=2 class="head"><b>Alignment {0}:</b> <i>{1}</i>, PID: {2}</td></tr>'.format(
            i + 1, seq_ids[i], ids)
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][0])
        out += '<td class="psa">{0}</td>'.format(clsA)
        out += '</tr>'
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][1])
        out += '<td class="psa">{0}</td>'.format(clsB)
        out += '</tr>'
        out += '<tr><td colspan=2></td></tr>'

    out += '</table>'

    html = template.format(alignments=out, css=css)

    if kw['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    util.write_text_file(kw['filename'], html)
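A sketch of the four-line PSA block the parsing loop above expects: a sequence ID, two tab-separated alignment rows led by the taxon names (dot-padded in real files, hence the `.strip('.')`), and a separator line; the content here is invented for illustration:

block = [
    '1 (toy alignment)',          # sequence ID line
    'German.\th\ta\tn\tt',        # taxon A (dot-padded), then aligned tokens
    'English\th\tæ\tn\td',        # taxon B, then aligned tokens
    '',                           # separator line
]
datA, datB = block[1].split('\t'), block[2].split('\t')
print(datA[0].strip('.'), datA[1:])  # German ['h', 'a', 'n', 't']
print(datB[0].strip('.'), datB[1:])  # English ['h', 'æ', 'n', 'd']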
Example no. 17
def diff(wordlist,
         gold='cogid',
         test='lexstatid',
         modify_ref=False,
         pprint=True,
         filename='',
         tofile=True,
         transcription="ipa",
         concepts=False):
    r"""
    Write differences in classifications on an item-basis to file.

    Parameters
    ----------
    wordlist : :py:class:`~lingpy.compare.lexstat.LexStat`
        The :py:class:`~lingpy.compare.lexstat.LexStat` class used for the
        computation. It should have two columns indicating cognate IDs.
    gold : str (default='cogid')
        The name of the column containing the gold standard cognate
        assignments.
    test : str (default='lexstatid')
        The name of the column containing the automatically implemented cognate
        assignments.
    modify_ref : function (default=False)
        Use a function to modify the reference. If your cognate identifiers
        are numerical, for example, and negative values are assigned as
        loans, but you want to suppress this behaviour, just set this
        keyword to "abs", and all cognate IDs will be converted to their
        absolute value.
    pprint : bool (default=True)
        Print out the results.
    filename : str (default='')
        Name of the output file. If not specified, it is identical with the
        name of the :py:class:`~lingpy.compare.lexstat.LexStat`, but with the
        extension ``diff``.
    tofile : bool (default=True)
        If set to c{False}, no data will be written to file, but instead, the
        data will be returned.
    transcription : str (default="ipa")
        The column in which the transcriptions are located (they should be
        plain strings, not segmentized, for convenience of writing to file).

    Returns
    -------
    t : tuple
        A nested tuple consisting of two further tuples. The first
        containing precision, recall, and harmonic mean
        (F-scores), the second containing the same values for the pair-scores.

    Notes
    -----
    If the **tofile** option is chosen, the results are written to a specific
    file with the extension ``diff``. This file contains all cognate sets in
    which there are differences between gold standard and test sets. It also
    gives detailed information regarding false positives, false negatives, and
    the words involved in these wrong decisions.

    See also
    --------
    bcubes
    pairs
    """
    filename = filename or wordlist.filename
    loan = modify_ref if modify_ref else identity

    # open file
    lines = []

    # concepts, allow to check scores for only one concept
    concepts = concepts or [c for c in wordlist.rows]

    # get a formatter for language names
    lform = '{0:' + str(max([len(l) for l in wordlist.cols])) + '}'

    preT, recT = [], []
    preB, recB = [], []
    preP, recP = [], []

    def get_pairs(cogs, idxs):
        tmp = defaultdict(list)
        for x, y in zip(cogs, idxs):
            tmp[x].append(y)
        for x in tmp:
            for yA, yB in combinations(tmp[x], r=2):
                yield tuple(sorted([yA, yB]))

    for concept in concepts:
        idxs = wordlist.get_list(row=concept, flat=True)
        # get the basic index for all seqs
        bidx = [i + 1 for i in range(len(idxs))]

        cogsG = _get_cogs(gold, concept, loan, wordlist)
        cogsT = _get_cogs(test, concept, loan, wordlist)

        if cogsG != cogsT:
            # calculate the transformation distance of the sets
            tramGT = len(set(zip(cogsG, cogsT)))
            tramG = len(set(cogsG))
            tramT = len(set(cogsT))
            preT += [tramT / tramGT]
            recT += [tramG / tramGT]

            # calculate the bcubed precision for the sets
            preB += [_get_bcubed_score(cogsT, cogsG)]

            # calculate b-cubed recall
            recB += [_get_bcubed_score(cogsG, cogsT)]

            # calculate pair precision
            pairsG = set(get_pairs(cogsG, idxs))
            pairsT = set(get_pairs(cogsT, idxs))

            preP.append(
                len(pairsT.intersection(pairsG)) /
                len(pairsT) if pairsT else 1.0)
            recP.append(
                len(pairsT.intersection(pairsG)) /
                len(pairsG) if pairsG else 1.0)
            fp = "no" if preP[-1] == 1.0 else "yes"
            fn = "no" if recP[-1] == 1.0 else "yes"

            lines.append(
                "Concept: {0}, False Positives: {1}, False Negatives: {2}".
                format(concept, fp, fn))

            # get the words
            words = [wordlist[i, transcription] for i in idxs]
            langs = [wordlist[i, 'taxa'] for i in idxs]

            # get a word-formater
            wform = '{0:' + str(max([len(w) for w in words])) + '}'

            # write differences to file
            for word, lang, cG, cT in sorted(zip(words, langs, cogsG, cogsT),
                                             key=lambda x: (x[2], x[3])):
                lines.append('{0}\t{1}\t{2:4}\t{3:4}'.format(
                    lform.format(lang), wform.format(word), cG, cT))
            lines.append('#')
        else:
            preT += [1.0]
            recT += [1.0]
            preB += [1.0]
            recB += [1.0]
            preP += [1.0]
            recP += [1.0]

    bp = sum(preB) / len(preB)
    br = sum(recB) / len(recB)
    bf = 2 * (bp * br) / (bp + br)
    pp = sum(preP) / len(preP)
    pr = sum(recP) / len(recP)
    pf = 2 * (pp * pr) / (pp + pr)

    as_string('\n'.join(lines), pprint=pprint)

    if tofile:
        write_text_file(filename + '.diff', lines)
    return (bp, br, bf), (pp, pr, pf)
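A toy illustration of the pair-based scores computed above: pairs of word indices sharing a cognate ID are collected for gold and test and compared as sets:

from collections import defaultdict
from itertools import combinations

def get_pairs(cogs, idxs):
    # group word indices by cognate ID, then emit all within-group pairs
    tmp = defaultdict(list)
    for cog, idx in zip(cogs, idxs):
        tmp[cog].append(idx)
    for members in tmp.values():
        for pair in combinations(members, r=2):
            yield tuple(sorted(pair))

idxs = [1, 2, 3, 4]
gold = [1, 1, 1, 2]   # gold: words 1-3 are cognate
test = [1, 1, 2, 2]   # test: lumps word 3 with 4 instead

pairsG = set(get_pairs(gold, idxs))   # {(1, 2), (1, 3), (2, 3)}
pairsT = set(get_pairs(test, idxs))   # {(1, 2), (3, 4)}
print(len(pairsT & pairsG) / len(pairsT))  # pair precision: 0.5
print(len(pairsT & pairsG) / len(pairsG))  # pair recall: 0.333...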
Example no. 18
def compile_model(model, path=None):
    """
    Function compiles customized sound-class models.

    Parameters
    ----------

    model : str
        A string indicating the name of the model which shall be created.

    path : str
        A string indicating the path where the model folder is stored.

    Notes
    -----
    A model is defined by a folder placed in :file:`data/models` directory of
    the LingPy package. The name of the folder reflects the name of the model.
    It contains three files: the file :file:`converter`, the file :file:`INFO`,
    and the optional file :file:`scorer`. The format requirements for these
    files are as follows:

    :file:`INFO`
        The ``INFO``-file serves as a reference for a given sound-class model.
        It can contain arbitrary information (and also be empty). If one wants
        to define specific characteristics, like the ``source``, the
        ``compiler``, the ``date``, or a ``description`` of a given model,
        this can be done by employing a key-value structure in which the key is
        preceded by an ``@`` and followed by a colon and the value is written
        right next to the key in the same line, e.g.::
            
            @source: Dolgopolsky (1986)

        This information will then be read from the ``INFO`` file and rendered
        when printing the model to screen with help of the :py:func:`print`
        function.

    :file:`converter`
        The ``converter`` file contains all sound classes which are matched
        with their respective sound values. Each line is reserved for one
        class, preceded by the key (preferably an ASCII letter) representing the
        class::

            B : ɸ, β, f, p͡f, p͜f, ƀ
            E : ɛ, æ, ɜ, ɐ, ʌ, e, ᴇ, ə, ɘ, ɤ, è, é, ē, ě, ê, ɚ
            D : θ, ð, ŧ, þ, đ
            G : x, ɣ, χ
            ...
    
    :file:`matrix`
        A scoring matrix indicating the alignment scores of all sound-class
        characters defined by the model. The scoring is structured as a simple
        tab-delimited text file. The first cell contains the character names,
        the following cells contain the scores in redundant form (with both
        triangles being filled)::

            B  10.0 -10.0   5.0 ...
            E -10.0   5.0 -10.0 ...
            F   5.0 -10.0  10.0 ...
            ...

    :file:`scorer`
        The ``scorer`` file (which is optional) contains the graph of
        class-transitions which is used for the calculation of the scoring
        dictionary. Each class is listed in a separate line, followed by the
        symbols ``v``, ``c``, or ``t`` (indicating whether the class
        represents vowels, consonants, or tones), and by the classes it is
        directly connected to. The strength of this connection is indicated by
        digits (the smaller the value, the shorter the path between the
        classes)::

            A : v, E:1, O:1
            C : c, S:2
            B : c, W:2
            E : v, A:1, I:1
            D : c, S:2
            ...
        
        The information in such a file is automatically converted into a
        scoring dictionary (see :evobib:`List2012b` for details).

    Based on the information provided by the files, a dictionary for the
    conversion of IPA-characters to sound classes and a scoring dictionary are
    created and stored as a binary.  The model can be loaded with help of the
    :py:class:`~lingpy.data.model.Model` class and used in the various classes
    and functions provided by the library.
    
    See also
    --------
    lingpy.data.model.Model
    compile_dvt

    """
    log.info("Compiling model <" + model + ">...")
    # get the path to the models
    new_path = lambda *cmps: os.path.join(path or util.data_path('models'), model, *cmps)

    log.debug("Model-Path: %s" % new_path)

    # load the sound classes
    sound_classes = _import_sound_classes(new_path('converter'))

    # dump the data
    cache.dump(sound_classes, model + '.converter')
    log.info("... successfully created the converter.")

    # try to load the scoring function or the score tree
    scorer = False

    if os.path.isfile(new_path('matrix')):
        scorer = read_scorer(new_path('matrix'))
    elif os.path.isfile(new_path('scorer')):
        score_tree = _import_score_tree(new_path('scorer'))

        # calculate the scoring dictionary
        score_dict = _make_scoring_dictionary(score_tree)

        # make score_dict a ScoreDict instance
        chars = sorted(set([s[0] for s in score_dict.keys()]))
        matrix = [[0 for i in range(len(chars))] for j in
                  range(len(chars))]
        for (i, charA), (j, charB) in util.multicombinations2(enumerate(chars)):
            if i < j:
                matrix[i][j] = score_dict.get((charA, charB), -100)
                matrix[j][i] = score_dict.get((charB, charA), -100)
            elif i == j:
                matrix[i][j] = score_dict[charA, charB]

        scorer = misc.ScoreDict(chars, matrix)
        util.write_text_file(new_path('matrix'), scorer2str(scorer))

    if scorer:
        cache.dump(scorer, model + '.scorer')
        log.info("... successfully created the scorer.")
    else:
        log.info("... no scoring dictionary defined.")

    log.info("Model <" + model + "> was compiled successfully.")
Example no. 19
def _graph_or_file(graph, filename):
    if filename:
        util.write_text_file(filename + '.gml', nx.generate_gml(graph))
        return
    return graph
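A small sketch of what ends up in the `.gml` file: `nx.generate_gml` yields GML lines, which `write_text_file` (see the tests above) accepts as a generator; the graph here is a toy example and requires networkx:

import networkx as nx

g = nx.Graph()
g.add_edge('German', 'English', weight=0.3)
# generate_gml yields the GML lines that _graph_or_file writes to disk
print('\n'.join(nx.generate_gml(g)))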
Example no. 20
    def _output(self, fileformat, **keywords):
        """
        Internal function that eases its modification by daughter classes.
        """
        # check for stamp attribute
        keywords["stamp"] = getattr(self, '_stamp', '')

        # add the default parameters, they will be checked against the keywords
        util.setdefaults(
            keywords,
            cols=False,
            distances=False,
            entries=("concept", "counterpart"),
            entry='concept',
            fileformat=fileformat,
            filename=rcParams['filename'],
            formatter='concept',
            modify_ref=False,
            meta=self._meta,
            missing=0,
            prettify='false',
            ignore='all',
            ref='cogid',
            rows=False,
            subset=False,  # setup a subset of the data,
            taxa='taxa',
            threshold=0.6,  # threshold for flat clustering
            tree_calc='neighbor')

        if fileformat in ['triple', 'triples', 'triples.tsv']:
            return tsv2triple(self, keywords['filename'] + '.' + fileformat)

        if fileformat in ['paps.nex', 'paps.csv']:
            paps = self.get_paps(
                ref=keywords['ref'], entry=keywords['entry'], missing=keywords['missing'])
            kw = dict(filename=keywords['filename'] + '.paps')
            if fileformat == 'paps.nex':
                kw['missing'] = keywords['missing']
                return pap2nex(self.cols, paps, **kw)
            return pap2csv(self.cols, paps, **kw)

        # simple printing of taxa
        if fileformat == 'taxa':
            assert hasattr(self, 'taxa')
            return util.write_text_file(keywords['filename'] + '.taxa', self.cols)

        # csv-output
        if fileformat in ['csv', 'qlc', 'tsv']:

            # get the header line
            header = sorted(
                [s for s in set(self._alias.values()) if s in self._header],
                key=lambda x: self._header[x])
            header = [h.upper() for h in header]

            self._meta.setdefault('taxa', self.cols)

            # get the data, in case a subset is chosen
            if not keywords['subset']:
                # write stuff to file
                return wl2qlc(header, self._data, **keywords)

            cols, rows = keywords['cols'], keywords['rows']

            if not isinstance(cols, (list, tuple, bool)):
                raise ValueError("[i] Argument 'cols' should be list or tuple.")
            if not isinstance(rows, (dict, bool)):
                raise ValueError("[i] Argument 'rows' should be a dictionary.")

            # check for chosen header
            if cols:
                # get indices for header
                indices = [self._header[x] for x in cols]
                header = [c.upper() for c in cols]
            else:
                indices = [r for r in range(len(self.header))]

            if rows:
                stmts = []
                for key, value in rows.items():
                    if key == 'ID':
                        stmts += ["key " + value]
                    else:
                        idx = self._header[key]
                        stmts += ["line[{0}] ".format(idx) + value]

            log.debug("calculated what should be excluded")

            # get the data
            out = {}
            for key, line in self._data.items():
                log.debug(key)

                if rows:
                    if eval(" and ".join(stmts)):
                        out[key] = [line[i] for i in indices]
                else:
                    out[key] = [line[i] for i in indices]

            log.debug("passing data to wl2qlc")
            return wl2qlc(header, out, **keywords)

        # output dst-format (phylip)
        if fileformat == 'dst':
            # check for distances as keyword
            if 'distances' not in self._meta:
                self._meta['distances'] = wl2dst(self, **keywords)

            out = matrix2dst(self._meta['distances'], self.taxa,
                    stamp=keywords['stamp'], taxlen=keywords.get('taxlen', 0))
            return _write_file(keywords['filename'], out, fileformat)

        # output tre-format (newick)
        if fileformat in ['tre', 'nwk']:  # ,'cluster','groups']:
            if 'tree' not in self._meta:
                # check for distances
                if 'distances' not in self._meta:
                    self._meta['distances'] = wl2dst(self)
                # we look up a function to calculate a tree in the cluster module:
                tree = getattr(cluster, keywords['tree_calc'])(
                    self._meta['distances'], self.cols, distances=keywords['distances'])
            else:
                tree = self._meta['tree']

            return _write_file(keywords['filename'], '{0}'.format(tree), fileformat)

        if fileformat in ['cluster', 'groups']:
            if 'distances' not in self._meta:
                self._meta['distances'] = wl2dst(self)  # check for keywords

            if 'groups' not in self._meta:
                self._meta['groups'] = cluster.matrix2groups(
                    keywords['threshold'], self._meta['distances'], self.taxa)
            lines = []
            for taxon, group in sorted(self._meta['groups'].items(), key=lambda x: x[0]):
                lines.append('{0}\t{1}'.format(taxon, group))
            return _write_file(keywords['filename'], lines, fileformat)

        if fileformat in ['starling', 'star.csv']:
            # inline helper for the data check: zero entries become '-'
            l = lambda x: '-' if x == 0 else x

            lines = []
            if 'cognates' not in keywords:
                lines.append('ID\tConcept\t' + '\t'.join(self.taxa))
                for i, concept in enumerate(self.concepts):
                    for line in self.get_list(row=concept, entry=keywords['entry']):
                        lines.append(
                            str(i + 1) + '\t' + concept + '\t' + '\t'.join(
                                [l(t) for t in line]))
            else:
                lines.append(
                    'ID\tConcept\t' + '\t'.join(
                        ['{0}\t COG'.format(t) for t in self.taxa]))
                for i, concept in enumerate(self.concepts):
                    cogs = self.get_list(row=concept, entry=keywords['cognates'])
                    for j, line in enumerate(
                            self.get_list(row=concept, entry=keywords['entry'])):
                        part = '\t'.join(
                            '{0}\t{1}'.format(l(a), b) for a, b in zip(line, cogs[j]))
                        lines.append(util.tabjoin(i + 1, concept, part))

            return _write_file(
                keywords['filename'], lines, 'starling_' + keywords['entry'] + '.csv')

        if fileformat == 'multistate.nex':
            if not keywords['filename'].endswith('.multistate.nex'):
                keywords['filename'] += '.multistate.nex'

            matrix = wl2multistate(self, keywords['ref'], keywords['missing'])
            return multistate2nex(self.taxa, matrix, keywords['filename'])

        if fileformat == 'separated':
            if not os.path.isdir(keywords['filename']):
                os.mkdir(keywords['filename'])

            for l in self.cols:
                lines = [''] if 'ignore_keys' in keywords else ['ID\t']
                lines[0] += '\t'.join(x.upper() for x in keywords['entries'])
                for key in self.get_list(col=l, flat=True):
                    line = [] if 'ignore_keys' in keywords else [key]
                    for entry in keywords['entries']:
                        tmp = self[key, entry]
                        if isinstance(tmp, list):
                            tmp = ' '.join([str(x) for x in tmp])
                        line += [tmp]
                    lines.append('\t'.join('{0}'.format(x) for x in line))
                _write_file('{0}/{1}'.format(keywords['filename'], l), lines, 'tsv')
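Since _output is an internal dispatcher, it is normally reached through the public output() method of the wordlist class. A hedged call sketch (the input file and the 'cogid' column are placeholders, not guaranteed by the code above):

from lingpy import Wordlist

wl = Wordlist('harry_potter.tsv')          # placeholder input file
wl.output('tsv', filename='harry_out')     # plain tab-separated dump
wl.output('dst', filename='harry_out')     # Phylip distance matrix
wl.output('paps.nex', filename='harry_out', ref='cogid')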
Example no. 32
def write_nexus(wordlist,
                mode='mrbayes',
                filename="mrbayes.nex",
                ref="cogid",
                missing="?",
                gap="-",
                custom=None,
                custom_name='lingpy',
                commands=None,
                commands_name="mrbayes"):
    """Write a nexus file for phylogenetic analyses.

    Parameters
    ----------
    wordlist : lingpy.basic.wordlist.Wordlist
        A Wordlist object containing cognate IDs.
    mode : str (default="mrbayes")
        The name of the output nexus style. Valid values are:
            * 'MRBAYES': a MrBayes formatted nexus file.
            * 'SPLITSTREE': a SPLITSTREE formatted nexus file.
            * 'BEAST': a BEAST formatted nexus file.
            * 'BEASTWORDS': a BEAST formatted nexus for word-partitioned
               analyses.
            * 'TRAITLAB': a TRAITLab formatted nexus.
    filename : str (default="mrbayes.nex")
        Name of the file to which the nexus file will be written.
        If set to c{None}, then this function will not write the nexus content
        to a file, but simply return the content as a string.
    ref : str (default="cogid")
        Column in which you store the cognate sets in your data.
    gap : str (default="-")
        The symbol for gaps (not relevant for linguistic analyses).
    missing : str (default="?")
        The symbol for missing characters.
    custom : list (default=None)
        This keyword allows you to add custom information to the nexus file, like,
        for example, the structure of the characters, their original concept, or
        their type, and it will be written into a custom block in the nexus file.
        The name of the custom block can be specified with the help of the
        `custom_name` keyword. The content is a list of strings which will be
        written line by line into the custom block.
    custom_name : str (default="lingpy")
        The name of the custom block which will be written to the file.
    commands : list (default=None)
        If specified, will write an additional block containing commands for
        phylogenetic software. The commands are passed as a list, containing
        strings. The name of the block is given by the keyword `commands_name`.
    commands_name : str (default="mrbayes")
        Determines how the block will be called to which the commands will be
        written.

    Returns
    -------
    nexus : str
        A string containing nexus file output
    """
    templates = {
        'BEAST': 'beast.nex',
        'BEASTWORDS': 'beast.nex',
        'SPLITSTREE': 'splitstree.nex',
        'MRBAYES': 'mrbayes.nex',
        'TRAITLAB': 'splitstree.nex',
    }

    block = "\n\nBEGIN {0};\n{1}\nEND;\n"  # template for nexus blocks

    # check for valid mode
    mode = mode.upper()
    if mode not in templates.keys():
        raise ValueError("Unknown output mode %s" % mode)

    # check for valid template
    template = templates.get(mode)
    tpath = util.Path(template_path(template))
    if tpath.exists():
        _template = util.read_text_file(tpath.as_posix())
    else:  # pragma: no cover
        raise IOError("Unknown template %s" % template)

    # check that `ref` is a valid column
    if ref not in wordlist._alias:
        raise KeyError("Unknown _ref_ column in wordlist '%s'" % ref)

    # retrieve the matrix
    matrix = [[] for x in range(wordlist.width)]
    etd = wordlist.get_etymdict(ref=ref)
    concepts = sorted(
        [(cogid, wordlist[[x[0] for x in vals if x][0]][wordlist._rowIdx])
         for (cogid, vals) in etd.items()],
        key=lambda x: (x[1], x[0]))
    # and missing data..
    missing_ = {
        t: [
            concept for (cogid, concept) in concepts if concept not in
            wordlist.get_list(col=t, entry=wordlist._row_name, flat=True)
        ]
        for t in wordlist.cols
    }

    # add ascertainment character for mode=BEAST
    if mode == 'BEAST':
        matrix = [['0'] for m in matrix]

    # skip the constant sites for traitlab
    if mode == 'TRAITLAB':
        concepts = [(i, c) for (i, c) in concepts if not _is_constant(etd[i])]

    # fill matrix
    for i, t in enumerate(wordlist.cols):
        previous = ''
        for cogid, concept in concepts:
            if previous != concept:
                previous = concept
                # add ascertainment character for mode=BEASTWORDS. Note that if
                # a given word:language is missing, then its ascertainment
                # character is the `missing` character.
                if mode == "BEASTWORDS":
                    matrix[i] += ['0'] if concept not in missing_[t] else [
                        missing
                    ]
            matrix[i] += ['1'] if etd[cogid][i] else ['0'] if concept not in \
                missing_[t] else [missing]

    # parse characters into `charsets` (a dict of word=>siteindex positions),
    # and `chars` (a list of characters).
    charsets, chars, previous = defaultdict(list), [], ''
    for i, (cogid, concept) in enumerate(concepts, 1):
        char = util.nexus_slug(concept)
        # add label for ascertainment character in BEAST mode
        if i == 1 and mode == 'BEAST':
            chars.append("_ascertainment")
        # add label for per-word ascertainment characters in BEASTWORDS
        if mode == 'BEASTWORDS' and previous != concept:
            chars.append("%s_ascertainment" % char)
            charsets[char].append(len(chars))
        # finally add label.
        chars.append(char)
        charsets[char].append(len(chars))
        previous = concept

    # create character labels block if needed
    if mode in ('BEAST', 'BEASTWORDS'):
        charblock = ",\n".join(["\t%d %s" % o for o in enumerate(chars, 1)])
    else:
        charblock = ""

    # create charsets block
    blockname, assumptions = None, ""
    if mode in ('BEASTWORDS', 'MRBAYES'):
        charsets = [
            "\tcharset %s = %d-%d;" % (c, min(m), max(m))
            for (c, m) in charsets.items()
        ]
        blockname = 'ASSUMPTIONS' if mode == 'BEASTWORDS' else 'MRBAYES'
        assumptions = "\n".join(charsets)

    # commands
    if commands_name.upper() == blockname and len(assumptions) and commands:
        # merge commands specified in function call into output blockname
        assumptions += "\n" + "\n".join("\t%s" % c for c in commands)
    else:
        # different commands block set in commands_name.
        assumptions += block.format(commands_name,
                                    '\n'.join(commands)) if commands else ''

    # convert state matrix to string.
    _matrix = ""
    maxtaxlen = max([len(util.nexus_slug(t)) for t in wordlist.cols]) + 1
    for i, (taxon, m) in enumerate(zip(wordlist.cols, matrix)):
        _matrix += str(util.nexus_slug(taxon) +
                       maxtaxlen * ' ')[:maxtaxlen] + ' '
        _matrix += ''.join(
            ['({0})'.format(c) if len(c) > 1 else str(c) for c in m]) + '\n'
    _matrix = _matrix.rstrip()  # remove trailing whitespace

    # TODO: symbols could be more than "01", but this function doesn't handle
    # multistate data, so we just specify them here.
    symbols = '01'

    text = _template.format(
        matrix=_matrix,
        ntax=wordlist.width,
        nchar=len(matrix[0]),
        gap=gap,
        missing=missing,
        dtype='RESTRICTION' if mode == 'MRBAYES' else 'STANDARD',
        commands=block.format(blockname, assumptions),
        custom=block.format(custom_name, '\n'.join(custom)) if custom else '',
        symbols=symbols,
        chars=charblock)
    text = text.replace("\t", " " * 4)  # normalise tab-stops
    for i, (cogid, concept) in enumerate(concepts, 1):
        text += '\n[MATRIX:{0}=COGID:{1}=CONCEPT:{2}]'.format(
            i, cogid, concept)
    if filename:
        util.write_text_file(filename, text)
    return text
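A hedged usage sketch for write_nexus as defined above; the input file is a placeholder, and the wordlist must provide the column named by ref:

from lingpy import Wordlist

wl = Wordlist('cognates.tsv')              # placeholder input file
nex = write_nexus(
    wl, mode='MRBAYES', filename='analysis.nex', ref='cogid',
    commands=['mcmc ngen=1000000;'])
print(nex[:60])                            # the nexus text is also returned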
Example no. 33
def matrix2dst(
    matrix,
    taxa=None,
    stamp='',
    filename='',
    taxlen=10,
    comment='#'
):
    """
    Convert matrix to dst-format.

    Parameters
    ----------
    matrix : list
        A two-dimensional list containing the distance scores.
    taxa : {None, list}
        List of taxon names corresponding to the distances. Make sure that you
        only use alphanumeric characters and the understroke for assigning the
        taxon names. Especially avoid the usage of brackets, since this will
        confuse many phylogenetic programs.
    stamp : str (default='')
        Convenience stamp passed as a comment that can be used to indicate how
        the matrix was created.
    filename : str
        If you specify a filename, the data will be written to file.
    taxlen : int (default=10)
        Indicate how long the taxon names are allowed to be. The Phylip package
        only allows taxon names consisting of maximally 10 characters. Other
        packages, however, allow more. If Phylip compatibility is not important
        for you and you just want to allow for as long taxon names as possible,
        set this value to 0.
    comment : str (default = '#')
        The comment character to be used when adding additional information in
        the "stamp".
    
    Returns
    -------
    output : {str or file}
        Depending on your settings, this function returns a string in DST
        (=Phylip) format, or a file containing the string.
        
    """
    if not taxa:
        taxa = ['t_{0}'.format(i + 1) for i in range(len(matrix))]

    out = ' {0}\n'.format(len(taxa))
    for i, taxon in enumerate(taxa):

        # check for zero-taxlen
        if taxlen == 0:
            dummy = '{0}\t'
            idx = len(taxon)
            joinchar = '\t'  # normally in Phylip this is a space
        else:
            dummy = '{0:' + str(taxlen) + '}'
            idx = taxlen + 1
            joinchar = ' '

        out += dummy.format(taxon)[:idx] + joinchar
        out += joinchar.join(['{0:.2f}'.format(d) for d in
                              matrix[i]])
        out += '\n'
    if stamp:
        out += '{1} {0}'.format(stamp, comment)
    if not filename:
        return out
    else:
        util.write_text_file(filename + '.dst', out)
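Because matrix2dst returns the string when no filename is given, the Phylip output can be inspected directly. A minimal sketch with a hand-made distance matrix:

dst = [
    [0.00, 0.37, 0.62],
    [0.37, 0.00, 0.55],
    [0.62, 0.55, 0.00],
]
print(matrix2dst(dst, taxa=['German', 'English', 'Russian'],
                 stamp='toy example'))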
Example no. 34
def write_nexus(
        wordlist,
        mode='mrbayes',
        filename="mrbayes.nex",
        ref="cogid",
        missing="?", gap="-",
        custom=None,
        custom_name='lingpy',
        commands=None, commands_name="mrbayes"):
    """Write a nexus file for phylogenetic analyses.

    Parameters
    ----------
    wordlist : lingpy.basic.wordlist.Wordlist
        A Wordlist object containing cognate IDs.
    mode : str (default="mrbayes")
        The name of the output nexus style. Valid values are:
            * 'MRBAYES': a MrBayes formatted nexus file.
            * 'SPLITSTREE': a SPLITSTREE formatted nexus file.
            * 'BEAST': a BEAST formatted nexus file.
            * 'BEASTWORDS': a BEAST formatted nexus for word-partitioned
               analyses.
            * 'TRAITLAB': a TRAITLab formatted nexus.
    filename : str (default="mrbayes.nex")
        Name of the file to which the nexus file will be written.
        If set to c{None}, then this function will not write the nexus content
        to a file, but simply return the content as a string.
    ref : str (default="cogid")
        Column in which you store the cognate sets in your data.
    gap : str (default="-")
        The symbol for gaps (not relevant for linguistic analyses).
    missing : str (default="?")
        The symbol for missing characters.
    custom : list (default=None)
        This keyword allows you to add custom information to the nexus file, like,
        for example, the structure of the characters, their original concept, or
        their type, and it will be written into a custom block in the nexus file.
        The name of the custom block can be specified with the help of the
        `custom_name` keyword. The content is a list of strings which will be
        written line by line into the custom block.
    custom_name : str (default="lingpy")
        The name of the custom block which will be written to the file.
    commands : list (default=None)
        If specified, will write an additional block containing commands for
        phylogenetic software. The commands are passed as a list, containing
        strings. The name of the block is given by the keyword `commands_name`.
    commands_name : str (default="mrbayes")
        Determines how the block will be called to which the commands will be
        written.

    Returns
    -------
    nexus : str
        A string containing nexus file output
    """
    templates = {
        'BEAST': 'beast.nex',
        'BEASTWORDS': 'beast.nex',
        'SPLITSTREE': 'splitstree.nex',
        'MRBAYES': 'mrbayes.nex',
        'TRAITLAB': 'splitstree.nex',
    }
    
    block = "\n\nBEGIN {0};\n{1}\nEND;\n"  # template for nexus blocks
    
    # check for valid mode
    mode = mode.upper()
    if mode not in templates.keys():
        raise ValueError("Unknown output mode %s" % mode)

    # check for valid template
    template = templates.get(mode)
    tpath = util.Path(template_path(template))
    if tpath.exists():
        _template = util.read_text_file(tpath.as_posix())
    else:  # pragma: no cover
        raise IOError("Unknown template %s" % template)

    # check that `ref` is a valid column
    if ref not in wordlist._alias:
        raise KeyError("Unknown _ref_ column in wordlist '%s'" % ref)

    # retrieve the matrix
    matrix = [[] for x in range(wordlist.width)]
    etd = wordlist.get_etymdict(ref=ref)
    concepts = sorted([(cogid, wordlist[[
        x[0] for x in vals if x][0]][wordlist._rowIdx]) for (cogid, vals) in
        etd.items()],
        key=lambda x: (x[1], x[0]))
    # and missing data..
    missing_ = {t: [concept for (cogid, concept) in concepts if concept not in wordlist.get_list(
                col=t, entry=wordlist._row_name, flat=True)] for t in
                wordlist.cols}
    
    # add ascertainment character for mode=BEAST
    if mode == 'BEAST':
        matrix = [['0'] for m in matrix]
    
    # skip the constant sites for traitlab
    if mode == 'TRAITLAB':
        concepts = [(i, c) for (i, c) in concepts if not _is_constant(etd[i])]
    
    # fill matrix
    for i, t in enumerate(wordlist.cols):
        previous = ''
        for cogid, concept in concepts:
            if previous != concept:
                previous = concept
                # add ascertainment character for mode=BEASTWORDS. Note that if
                # a given word:language is missing, then its ascertainment
                # character is the `missing` character.
                if mode == "BEASTWORDS":
                    matrix[i] += ['0'] if concept not in missing_[t] else [missing]
            matrix[i] += ['1'] if etd[cogid][i] else ['0'] if concept not in \
                missing_[t] else [missing]

    # parse characters into `charsets` (a dict of word=>siteindex positions),
    # and `chars` (a list of characters).
    charsets, chars, previous = defaultdict(list), [], ''
    for i, (cogid, concept) in enumerate(concepts, 1):
        char = util.nexus_slug(concept)
        # add label for ascertainment character in BEAST mode
        if i == 1 and mode == 'BEAST':
            chars.append("_ascertainment")
        # add label for per-word ascertainment characters in BEASTWORDS
        if mode == 'BEASTWORDS' and previous != concept:
            chars.append("%s_ascertainment" % char)
            charsets[char].append(len(chars))
        # finally add label.
        chars.append(char)
        charsets[char].append(len(chars))
        previous = concept
    
    # create character labels block if needed
    if mode in ('BEAST', 'BEASTWORDS'):
        charblock = ",\n".join(["\t%d %s" % o for o in enumerate(chars, 1)])
    else:
        charblock = ""
    
    # create charsets block
    blockname, assumptions = None, ""
    if mode in ('BEASTWORDS', 'MRBAYES'):
        charsets = ["\tcharset %s = %d-%d;" % (
            c, min(m), max(m)) for (c, m) in charsets.items()
        ]
        blockname = 'ASSUMPTIONS' if mode == 'BEASTWORDS' else 'MRBAYES'
        assumptions = "\n".join(charsets)
    
    # commands
    if commands_name.upper() == blockname and len(assumptions) and commands:
        # merge commands specified in function call into output blockname
        assumptions += "\n" + "\n".join("\t%s" % c for c in commands)
    else:
        # different commands block set in commands_name.
        assumptions += block.format(commands_name, '\n'.join(commands)) if commands else ''
    
    # convert state matrix to string.
    _matrix = ""
    maxtaxlen = max([len(util.nexus_slug(t)) for t in wordlist.cols]) + 1
    for i, (taxon, m) in enumerate(zip(wordlist.cols, matrix)):
        _matrix += str(util.nexus_slug(taxon) + maxtaxlen * ' ')[:maxtaxlen] + ' '
        _matrix += ''.join([
            '({0})'.format(c) if len(c) > 1 else str(c) for c in m
        ]) + '\n'
    _matrix = _matrix.rstrip()  # remove trailing whitespace

    # TODO: symbols could be more than "01", but this function doesn't handle
    # multistate data, so we just specify them here.
    symbols = '01'

    text = _template.format(
        matrix=_matrix,
        ntax=wordlist.width,
        nchar=len(matrix[0]),
        gap=gap, missing=missing,
        dtype='RESTRICTION' if mode == 'MRBAYES' else 'STANDARD',
        commands=block.format(blockname, assumptions),
        custom=block.format(custom_name, '\n'.join(custom)) if custom else '',
        symbols=symbols, chars=charblock
    )
    text = text.replace("\t", " " * 4)  # normalise tab-stops
    for i, (cogid, concept) in enumerate(concepts, 1):
        text += '\n[MATRIX:{0}=COGID:{1}=CONCEPT:{2}]'.format(i, cogid, concept)
    if filename:
        util.write_text_file(filename, text)
    return text
Example no. 35
def diff(
        wordlist,
        gold='cogid',
        test='lexstatid',
        modify_ref=False,
        pprint=True,
        filename='',
        tofile=True,
        transcription="ipa"):
    r"""
    Write differences in classifications on an item-basis to file.

    Parameters
    ----------
    wordlist : :py:class:`lingpy.compare.lexstat.LexStat`
        The :py:class:`~lingpy.compare.lexstat.LexStat` class used for the
        computation. It should have two columns indicating cognate IDs.
    gold : str (default='cogid')
        The name of the column containing the gold standard cognate
        assignments.
    test : str (default='lexstatid')
        The name of the column containing the automatically implemented cognate
        assignments.
    modify_ref : function (default=False)
        Use a function to modify the reference. If your cognate identifiers
        are numerical, for example, and negative values are assigned as
        loans, but you want to suppress this behaviour, just set this
        keyword to "abs", and all cognate IDs will be converted to their
        absolute value.
    pprint : bool (default=True)
        Print out the results
    filename : str (default='')
        Name of the output file. If not specified, it is identical with the
        name of the :py:class:`~lingpy.compare.lexstat.LexStat`, but with the
        extension ``diff``.
    tofile : bool (default=True)
        If set to c{False}, no data will be written to file, but instead, the
        data will be returned.
    transcription : str (default="ipa")
        The column in which the transcriptions are located (the entries should
        be plain strings, not segmented versions, for convenience of writing
        to file).

    Returns
    -------
    t : tuple
        A nested tuple consisting of two further tuples. The first
        containing precision, recall, and harmonic mean
        (F-scores), the second containing the same values for the pair-scores.

    Notes
    -----
    If the **tofile** option is chosen, the results are written to a specific
    file with the extension ``diff``. This file contains all cognate sets in
    which there are differences between gold standard and test sets. It also
    gives detailed information regarding false positives, false negatives, and
    the words involved in these wrong decisions.

    .. This function also calculates the "transformation" score. This score is
    .. based on the calculation of steps that are needed to transform one cluster
    .. for one set of meanings into the other. Ideally, if there are *n* different
    .. cognate sets covering one gloss in the gold standard, the minimal length of
    .. a mapping to convert the *m* cognate sets of the test set into the gold standard
    .. is *n*. In this case, both gold standard and test set are identical.
    .. However, if gold standard and test set differ, the number of mappings
    .. necessarily exceeds *m* and *n*. Based on this, the transformation
    .. precision is defined as :math:`\frac{m}{M}`, where *m* is the number of
    .. distinct clusters in the test set and *M* is the length of the mapping.
    .. Accordingly, the recall is defined as :math:`\frac{n}{M}`, where *n* is the
    .. number of clusters in the gold standard.

    .. Note that if precision is lower than 1.0, this means there are false
    .. positive decisions in the test set. Accordingly, a recall lower than 1.0
    .. indicates that there are false negative decisions in the test set.
    .. The drawback of this score is that it is not sensitive regarding the
    .. distinct number of decisions in which gold standard and test set differ, so
    .. the recall can be very low although most of the words have been grouped
    .. accurately. The advantage is that it can be directly interpreted in terms
    .. of 'false positive/false negative' decisions.

    See also
    --------
    bcubes
    pairs
    """
    filename = filename or wordlist.filename
    loan = modify_ref if modify_ref else identity

    # open file
    lines = []

    # get a formatter for language names
    lform = '{0:' + str(max([len(l) for l in wordlist.cols])) + '}'
    
    preT, recT = [], []
    preB, recB = [], []
    preP, recP = [], []

    def get_pairs(cogs, idxs):
        tmp = defaultdict(list)
        for x, y in zip(cogs, idxs):
            tmp[x].append(y)
        for x in tmp:
            for yA, yB in combinations(tmp[x], r=2):
                yield tuple(sorted([yA, yB]))

    for concept in wordlist.rows:
        idxs = wordlist.get_list(row=concept, flat=True)
        # get the basic index for all seqs
        bidx = [i + 1 for i in range(len(idxs))]

        cogsG = _get_cogs(gold, concept, loan, wordlist)
        cogsT = _get_cogs(test, concept, loan, wordlist)

        if cogsG != cogsT:
            # calculate the transformation distance of the sets
            tramGT = len(set(zip(cogsG, cogsT)))
            tramG = len(set(cogsG))
            tramT = len(set(cogsT))
            preT += [tramT / tramGT]
            recT += [tramG / tramGT]

            # calculate the bcubed precision for the sets
            preB += [_get_bcubed_score(cogsT, cogsG)]

            # calculate b-cubed recall
            recB += [_get_bcubed_score(cogsG, cogsT)]

            # calculate pair precision
            pairsG = set(get_pairs(cogsG, idxs))
            pairsT = set(get_pairs(cogsT, idxs))

            preP.append(len(pairsT.intersection(pairsG)) / len(pairsT) if pairsT else 1.0)
            recP.append(len(pairsT.intersection(pairsG)) / len(pairsG) if pairsG else 1.0)
            fp = "no" if preP[-1] == 1.0 else "yes"
            fn = "no" if recP[-1] == 1.0 else "yes"

            lines.append("Concept: {0}, False Positives: {1}, False Negatives: {2}".format(
                concept, fp, fn))

            # get the words
            words = [wordlist[i, transcription] for i in idxs]
            langs = [wordlist[i, 'taxa'] for i in idxs]

            # get a word-formater
            wform = '{0:' + str(max([len(w) for w in words])) + '}'

            # write differences to file
            for word, lang, cG, cT in sorted(
                    zip(words, langs, cogsG, cogsT),
                    key=lambda x: (x[2], x[3])):
                lines.append('{0}\t{1}\t{2:4}\t{3:4}'.format(
                    lform.format(lang), wform.format(word), cG, cT))
            lines.append('#')
        else:
            preT += [1.0]
            recT += [1.0]
            preB += [1.0]
            recB += [1.0]
            preP += [1.0]
            recP += [1.0]

    bp = sum(preB) / len(preB)
    br = sum(recB) / len(recB)
    bf = 2 * (bp * br) / (bp + br)
    pp = sum(preP) / len(preP)
    pr = sum(recP) / len(recP)
    pf = 2 * (pp * pr) / (pp + pr)

    as_string(
        _format_results('B-Cubed', bp, br, bf) +
        _format_results('Pair', pp, pr, pf),
        pprint=pprint)

    lines.extend([
        'B-Cubed Scores:',
        'Precision: {0:.4f}'.format(bp),
        'Recall:    {0:.4f}'.format(br),
        'F-Score:   {0:.4f}'.format(bf),
        '#',
        'Pair Scores:',
        'Precision: {0:.4f}'.format(pp),
        'Recall:    {0:.4f}'.format(pr),
        'F-Score:   {0:.4f}'.format(pf),
    ])

    if tofile:
        write_text_file(filename + '.diff', lines)

    if pprint:
        return (bp, br, bf), (pp, pr, pf), lines
    else:
        return (bp, br, bf), (pp, pr, pf)
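A hypothetical evaluation sketch: cluster a dataset with LexStat, then compare the automatic 'lexstatid' partition against gold-standard 'cogid' assignments (the input file name is a placeholder):

from lingpy.compare.lexstat import LexStat

lex = LexStat('gold_standard.tsv')         # placeholder input file
lex.get_scorer(runs=100)
lex.cluster(method='lexstat', threshold=0.6, ref='lexstatid')
bcubed, pairscores = diff(lex, gold='cogid', test='lexstatid', pprint=False)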
Example no. 36
def _write_file(filename, content, ext=None):
    if ext:
        filename = filename + '.' + ext
    util.write_text_file(filename, content)
Example no. 37
    def output(self, dtype, filename=None, labels=None):
        """
        Parameters
        ----------
        dtype : str {"json", "html", "nwk"}
            Specify the type of the output:

            * *json*: JSON format, suitable for use in d3.
            * *nwk*: Newick format (identical with input upon initialization).
            * *html*: Simple interactive HTML representation with collapsible
              nodes.
        filename : str (default=None)
            If specified, the output is written to a file with the dtype as
            suffix; otherwise the output is returned as a string where
            supported.
        labels : dict (default=None)
            Optional mapping from node names to display labels (used by the
            html output).

        """

        if dtype == "json":
            if filename:
                with open(filename + "." + dtype, "w") as f:
                    f.write(json.dumps(self._dict, indent=2))
            else:
                return json.dumps(self._dict, indent=2)

        elif dtype == "html":

            # make simple label function
            get_label = lambda x: labels[x] if labels else x

            start = '<div id="root" class="node-container">root.content</div>'

            clean_label = lambda x: "".join([y for y in sort_tree(x) if y not in "();"]).replace(",", "_")

            template = '<div class="node-container"><div id="#node_name:label" class="node-label">#node_label</div><div class="node-content">#node_children:{node}</div></div>'

            leave = '<div id="#node_leave:label" class="node-leave"><div class="inner_leave">#node_leave</div></div>'

            txt = (
                template.format(node=self.root)
                .replace("#node_label", get_label(self[self.root]["label"]))
                .replace("#node_name", clean_label(self.root))
            )

            # transform function helps to make the transformation with check
            # for leave or child
            transform = (
                lambda x: template.format(node=x)
                .replace("#node_label", get_label(self[x]["label"]))
                .replace("#node_name", clean_label(x))
                if not self[x]["leave"]
                else leave.replace("#node_leave", get_label(x))
            )

            for i, node in enumerate(self.nodes):

                # write all children
                children = self[node]["children"]

                node_children = "\n".join([transform(child) for child in children])

                txt = txt.replace("#node_children:" + node, node_children)

            # get the templates
            html = util.read_text_file("lexical_change.html")
            css = util.read_text_file("lexical_change.css")
            js = util.read_text_file("lexical_change.js")
            title = "LingPy Tree Class"

            html = html.format(STYLE=css, SCRIPT=js, TITLE=title, TREE=txt)
            filename = filename or "lingpy.basic.newick"

            util.write_text_file(filename + ".html", html)
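The json branch above simply serializes the internal node dictionary. A standalone sketch of the same idea; the nested structure shown is only an assumption about what self._dict may look like:

import json

tree_dict = {
    'root': {'label': 'root', 'leave': False, 'children': ['A', 'B']},
    'A': {'label': 'A', 'leave': True, 'children': []},
    'B': {'label': 'B', 'leave': True, 'children': []},
}
print(json.dumps(tree_dict, indent=2))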
Example no. 38
def msa2html(msa, shorttitle='', filename='', template='', **keywords):
    """
    Convert files in ``msa``-format into colored ``html``-format.

    Parameters
    ----------
    msa : dict
        A dictionary object that contains all the information of an MSA object.

    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``SCA`` will be used.

    filename : str (default="")
        Define the name of the output file. If no name is defined, the name of
        the input file will be taken as a default.

    template : str (default="")
        The path to the template file. If no name is defined, the basic
        template will be used. The basic template currently used can be found
        under ``lingpy/data/templates/msa2html.html``.

    Examples
    --------
    Load the library.

    >>> from lingpy import *
    
    Load an ``msq``-file from the test-sets.

    >>> msa = MSA('harry.msq')

    Align the data progressively and carry out a check for swapped sites.

    >>> msa.prog_align()
    >>> msa.swap_check()
    >>> print(msa)
    w    o    l    -    d    e    m    o    r    t
    w    a    l    -    d    e    m    a    r    -
    v    -    l    a    d    i    m    i    r    -

    Save the data to the file ``harry.msa``.

    >>> msa.output('msa',filename='harry')

    Save the ``msa``-object as ``html``.

    >>> msa.output('html',filename='harry')
    
    Notes
    -----
    The coloring of sound segments with respect to the sound class they belong
    to is based on the definitions given in the ``color``
    :py:class:`~lingpy.data.model.Model`. It can easily be changed and adapted.
    

    See also
    --------
    lingpy.convert.html.alm2html
    """
    util.setdefaults(
        keywords,
        pid_mode=1,
        stress=rcParams['stress'],
        css=False,
        js=False,
        compact=False,
        class_sort=True,
        write_to_file=True,
    )

    # while the alm-format can be read from a text file without problems, the
    # msa-format should be loaded first; the loss in speed won't matter much,
    # since data output is not a daily task

    # load templates
    template = template or template_path('msa2html.html')
    if template == 'js':
        template = template_path('msa2html.js.html')
    html = util.read_text_file(template)
    css = util.read_text_file(keywords['css'] or template_path('msa.css'))
    js = util.read_text_file(keywords['js'] or template_path('msa.js'))

    # if the msa-object is a string, treat it as a file name and try to load
    # it; otherwise it is assumed to be an msa-dictionary already
    if isinstance(msa, string_types):
        msa = read_msa(msa, **keywords)

    # load dataset, etc.
    dataset = msa['dataset']

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    infile = msa['infile']
    seq_id = msa['seq_id']

    # define the titles etc.
    if not shorttitle:
        shorttitle = 'SCA'

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    # format css file
    css = css.replace('TAXON_LENGTH', str(taxl * 10))

    out = ''
    tr = '<tr class="msa" unique="{1}" taxon={2} sequence={3}>{0}</tr>\n'
    td_taxon = '<td class="taxon">{0}</td>'
    perc = int(80 / len(msa['alignment'][0]) + 0.5)
    td_residue = '<td class="residue {1}">{0}</td>'
    td_swap = '<td class="residue swap {1}">{0}</td>'
    td_unaligned = '<td class="residue noalign {1}">{0}</td>'

    # check for swaps in the alignment
    if 'swaps' in msa:
        swaps = []
        for s in msa['swaps']:
            swaps.extend(s)
    else:
        swaps = []

    # check for local (unaligned) parts of the alignment
    local = ['*'] * len(msa['alignment'][0])
    if 'local' in msa:
        local = ['.'] * len(msa['alignment'][0])
        for i in msa['local']:
            local[i] = '*'

    # get two sorting schemas for the sequences
    if keywords['class_sort']:

        classes = [
            tokens2class(ipa2tokens(seq), rcParams['asjp'])
            for seq in msa['seqs']
        ]
        seqs = dict([
            (a[1], b) for a, b in zip(
                sorted(
                    zip(classes, msa['seqs']),
                    key=lambda x: x[0]  # list(zip(x[0],x[1]))
                ),
                range(1,
                      len(msa['seqs']) + 1))
        ])
    else:
        seqs = dict(zip(sorted(msa['seqs']), range(1, len(msa['seqs']) + 1)))
    taxa = dict(zip(sorted(msa['taxa']), range(1, len(msa['taxa']) + 1)))

    # set up a list to store unique alignments
    alignments = []

    # start iteration
    for i, taxon in enumerate(msa['taxa']):
        tmp = ''
        tmp += td_taxon.format(taxon)

        # append alignment to alignments
        alignment = ''.join(msa['alignment'][i])
        sequence = msa['seqs'][i]
        if alignment in alignments:
            unique = 'false'
        else:
            unique = 'true'
            alignments += [alignment]

        for j, char in enumerate(msa['alignment'][i]):
            if char == '-':
                d = 'dolgo_GAP'
                c = '#bbbbbb'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])
                c = token2class(char, rcParams['_color'])

                # bad check for three classes named differently
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'

            if j in swaps:
                tmp += td_swap.format(char, d)
            elif local[j] != '*':
                tmp += td_unaligned.format(char, d)
            else:
                tmp += td_residue.format(char, d)
        out += tr.format(tmp, unique, taxa[taxon], seqs[sequence])

    html = html.format(
        table=out,
        dataset=dataset,
        pid=pid_score,
        file=infile,
        sequence=seq_id,
        shorttitle=shorttitle,
        width=len(msa['alignment'][0]),
        table_width='{0}'.format(len(msa['alignment'][0]) * 50 + 8 * taxl),
        taxa=len(msa['alignment']),
        uniseqs=len(set(msa['seqs'])),
        css=css,
        js=js)

    if not filename:
        filename = rcParams['filename']

    if not filename.endswith('.html'):
        filename = filename + '.html'

    if keywords['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    if keywords['write_to_file']:
        # write the html page to the output file
        util.write_text_file(filename, html)
    else:
        return html
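The pid block above averages pairwise percentage identity over all unordered pairs of aligned sequences and rounds the result to an integer. A standalone sketch of that computation (the import path for pid is an assumption):

from lingpy.sequence.sound_classes import pid  # import path assumed

alignment = [list('woldemort'), list('waldemar-'), list('vladimir-')]
scores = [pid(seqA, seqB, mode=1)
          for i, seqA in enumerate(alignment)
          for seqB in alignment[i + 1:]]
pid_score = int(100 * sum(scores) / len(scores) + 0.5)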
Example no. 39
def _write_file(filename, content, ext=None):
    if ext:
        filename = filename + '.' + ext
    util.write_text_file(filename, content)
Example no. 40
def alm2html(infile,
             title='',
             shorttitle='',
             filename='',
             colored=False,
             main_template='',
             table_template='',
             dataset='',
             confidence=False,
             **keywords):
    """
    Convert files in ``alm``-format into colored ``html``-format.

    Parameters
    ----------

    title : str
        Define the title of the output file. If no title is provided, the
        default title ``LexStat - Automatic Cognate Judgments`` will be used.

    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``LexStat`` will be used.
    
    Notes
    -----
    The coloring of sound segments with respect to the sound class they belong
    to is based on the definitions given in the
    ``color`` :py:class:`~lingpy.data.model.Model`. It can easily be changed
    and adapted. 

    See also
    --------
    lingpy.convert.html.msa2html
    lingpy.convert.html.msa2tex

    """
    util.setdefaults(keywords, json="", labels={})

    # open the infile
    if not os.path.exists(infile):
        infile = infile + '.alm'
    data = util.read_text_file(infile)

    # create the outfile
    if not filename:
        filename = rcParams['filename']

    # read in the templates
    html = util.read_text_file(main_template or template_path('alm2html.html'))
    if not table_template:
        table_template = template_path(
            'alm2html.table.js.html' if confidence else 'alm2html.table.html')
    table = util.read_text_file(table_template)
    css = util.read_text_file(template_path('alm.css'))
    js = util.read_text_file(template_path('alm.js'))

    # define a label function for the taxa
    label = lambda x: keywords['labels'][x] if x in keywords['labels'] else x

    # check for windows-compatibility
    data = data.replace(os.linesep, '\n')[:-1]

    # split the data into blocks
    blocks = data.split('\n\n')

    # retrieve the dataset
    dataset = dataset or blocks[0]

    # create the outstring
    tmp_str = ''

    for block in blocks[1:]:
        lines = block.split('\n')
        m = [l.split('\t') for l in lines]

        # create colordict for different colors
        dc = len(set([l[0] for l in m]))

        if colored:
            colors = {
                a: b
                for a, b in zip(
                    sorted(set([int(l[0]) for l in m])),
                    colorRange(dc, brightness=400),
                )
            }
        else:
            colors = []
            white = True
            for i in sorted(set([abs(int(l[0])) for l in m])):
                if white:
                    colors.append((i, 'white'))
                    white = False
                else:
                    colors.append((i, 'gray'))
                    white = True
            colors = dict(colors)

        # get the basic item and its id
        iName = m[0][2]
        iID = m[0][3]

        # start writing the stuff to string
        tmp_str += table.format(NAME=iName, ID=iID)
        # define the basic string for the insertion
        bas = ' <tr class="{0}{2} taxon" taxon="{3}">\n{1}'

        for tracer, l in enumerate(m):
            # check whether the current line is a borrowing
            if int(l[0]) < 0:
                loan_line = ' loan'
            else:
                loan_line = ''

            # assign the cognate id
            tmp = '  <td>{0}</td>\n'.format(l[0])
            tmp += '  <td>{0}</td>\n'.format(label(l[1].strip('.')))

            # check alignments for confidence scores
            ipa_string = ''.join([cell.split('/')[0]
                                  for cell in l[4:]]).replace('-', '')

            tmp += '  <td>{0}</td>\n'.format(ipa_string)
            tmp += '  <td class="{0}">\n'.format(colors[abs(int(l[0]))])
            tmp += '   <table class="{0}">\n'.format(colors[abs(int(l[0]))])
            tmp += '    <tr>\n{0}    </tr>\n   </table>\n  </td>\n </tr>\n'

            # check whether another entry follows that is also an alignment,
            # otherwise, there's no need to display a word as an alignment
            cognate_set = False
            if tracer < len(m) - 1:
                if abs(int(m[tracer + 1][0])) == abs(int(l[0])):
                    cognate_set = True
            if tracer > 0:
                if abs(int(m[tracer - 1][0])) == abs(int(l[0])):
                    cognate_set = True

            # fill out html for the cognate sets
            if cognate_set:

                alm = ''
                for char in l[4:]:

                    # check for confidence scores
                    if '/' in char:
                        try:
                            char, conf, num = char.split('/')
                            conf = int(conf)
                        except ValueError:
                            print(char.split('/'))
                            raise ValueError("Something is wrong with %s." %
                                             (char))

                    else:
                        char, conf, rgb = char, (255, 255, 255), 0.0

                    if char == '-':
                        d = 'dolgo_GAP'
                    else:
                        d = 'dolgo_' + token2class(char, rcParams['dolgo'])

                        # bad check for three classes named differently
                        if d == 'dolgo__':
                            d = 'dolgo_X'
                        elif d == 'dolgo_1':
                            d = 'dolgo_TONE'
                        elif d == 'dolgo_0':
                            d = 'dolgo_ERROR'

                    if confidence:
                        alm += '     '
                        alm += '<td class="char {1}" confidence={0} '.format(
                            conf, d)
                        alm += 'char="{0}" '.format(char)
                        alm += 'onclick="' + "show('{0}')".format(num) + '" '
                        alm += 'num="{0}"'.format(num)
                        alm += '>\n      {0}\n     </td>\n'.format(char)
                    else:
                        alm += '     '
                        alm += '<td class="char {0}">{1}</td>\n'.format(
                            d, char)
            else:
                alm = '      '
                alm += '<td class="{0}">--</td>\n'.format(colors[abs(int(
                    l[0]))])

            # format the alignment
            try:
                tmp = tmp.format(alm)
            except ValueError:
                raise ValueError("Unknown problem in matchin %s and %s." %
                                 (alm, tmp))

            # check for the last line, where a new line should be inserted
            # (not the fastest solution, but plotting is not a matter of
            # time, and it suffices for its current purpose)
            if tracer < len(m) - 1:
                pass
            else:
                if confidence:
                    tmp += ' </table>\n'

                tmp += ' <tr class="empty"><td colspan="4" class="empty">'
                tmp += '<hr class="empty" /></td></tr>\n'

            # format the whole string
            tmp_str += bas.format(colors[abs(int(l[0]))], tmp, loan_line, l[1])

    if not title:
        title = "LexStat - Automatic Cognate Judgments"
    if not shorttitle:
        shorttitle = "LexStat"

    # check for json-attribute
    if keywords['json']:
        keywords['json'] = 'var myjson = ' + json.dumps(keywords['json'],
                                                        indent=1)

    html = html.format(shorttitle=shorttitle,
                       title=title,
                       table=tmp_str,
                       dataset=dataset,
                       javascript=js,
                       css=css,
                       **keywords)
    util.write_text_file(filename + '.html', html)
    return
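A short hypothetical call for the function above: render an existing alm-file (the name is a placeholder) as an HTML page, mapping taxon names to display labels:

alm2html('harry_potter.alm', title='Cognate judgments', shorttitle='Demo',
         filename='harry_potter', labels={'German': 'DEU', 'English': 'ENG'})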
Example no. 41
def _graph_or_file(graph, filename):
    if filename:
        util.write_text_file(filename + '.gml', nx.generate_gml(graph))
        return
    return graph
Example no. 42
def msa2tex(infile, template='', filename='', **keywords):
    """
    Convert an MSA to a tabular representation which can easily be used in
    LaTeX documents.
    """
    util.setdefaults(keywords, pid_mode=1)

    # while the alm-format can be read from a text file without problems, the
    # msa-format should be loaded first; the loss in speed won't matter much,
    # since data output is not a daily task
    # load msa
    msa = read_msa(infile)

    # load templates
    tex = util.read_text_file(template or template_path('msa.tex'))

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    dataset = msa['dataset']
    infile = msa['infile']
    seq_id = msa['seq_id']

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    height = len(msa['alignment'])
    width = len(msa['alignment'][0])

    start = r'\tabular{l' + width * 'c' + '}\n'
    start += r'\bf\ttfamily Taxon & \multicolumn{' + str(
        width) + r'}{l}{\bf\ttfamily Alignment}\\' + '\n'

    # check for swaps in the alignment
    if 'swaps' in msa:
        swaps = []
        for s in msa['swaps']:
            swaps.extend(s)
    else:
        swaps = []

    body = start
    for i, taxon in enumerate(msa['taxa']):
        body += r'\ttfamily ' + taxon.replace('_', r'\_')
        for j, char in enumerate(msa['alignment'][i]):
            if char != '-':
                cls = token2class(char, rcParams['dolgo'])
            else:
                cls = 'X'
            if char == '_':
                char = r'\#'
            if cls == '_':
                cls = '2'
            if j not in swaps:
                body += r'&\cellcolor{col' + cls + r'}' + char
            else:
                if char != '-':
                    body += r'&\cellcolor{col' + cls + r'}\color{white}\bf ' + char
                else:
                    body += r'&\cellcolor{col' + cls + r'}\bf ' + char
        body += r'\\' + '\n'

    body += r'&' + '&'.join([r'\color{white}XXX'
                             for i in range(width)]) + r'\\' + '\n'
    body += r'\endtabular' + '\n'

    # create the parameters etc.
    w = 1.5 * width + taxl * 0.25
    h = 0.5 * height + 1.0

    tex = tex.replace('<+WIDTH+>', '{0:.2f}'.format(w))
    tex = tex.replace('<+HEIGHT+>', '{0:.2f}'.format(h))

    # create the rput stuff
    tex = tex.replace('<+NEWX+>', '{0:.2f}'.format(w / 2.0))
    tex = tex.replace('<+NEWY+>', '{0:.2f}'.format((h - 0.5) / 2.0))

    # insert the rest
    tex = tex.replace('<+CONTENT+>', body)

    # write to file
    if not filename:
        filename = rcParams['filename']

    util.write_text_file(filename + '.tex', tex)
Example no. 43
def matrix2dst(matrix,
               taxa=None,
               stamp='',
               filename='',
               taxlen=10,
               comment='#'):
    """
    Convert matrix to dst-format.

    Parameters
    ----------
    matrix : list
        A two-dimensional list containing the distance scores.
    taxa : {None, list}
        List of taxon names corresponding to the distances. Make sure that you
        only use alphanumeric characters and the understroke for assigning the
        taxon names. Especially avoid the usage of brackets, since this will
        confuse many phylogenetic programs.
    stamp : str (default='')
        Convenience stamp passed as a comment that can be used to indicate how
        the matrix was created.
    filename : str
        If you specify a filename, the data will be written to file.
    taxlen : int (default=10)
        Indicate how long the taxon names are allowed to be. The Phylip package
        only allows taxon names consisting of maximally 10 characters. Other
        packages, however, allow more. If Phylip compatibility is not important
        for you and you just want to allow for as long taxon names as possible,
        set this value to 0.
    comment : str (default = '#')
        The comment character to be used when adding additional information in
        the "stamp".
    
    Returns
    -------
    output : {str or file}
        Depending on your settings, this function returns a string in DST
        (=Phylip) format, or a file containing the string.
        
    """
    if not taxa:
        taxa = ['t_{0}'.format(i + 1) for i in range(len(matrix))]

    out = ' {0}\n'.format(len(taxa))
    for i, taxon in enumerate(taxa):

        # check for zero-taxlen
        if taxlen == 0:
            dummy = '{0}\t'
            idx = len(taxon)
            joinchar = '\t'  # normally in Phylip this is a space
        else:
            dummy = '{0:' + str(taxlen) + '}'
            idx = taxlen + 1
            joinchar = ' '

        out += dummy.format(taxon)[:idx] + joinchar
        out += joinchar.join(['{0:.2f}'.format(d) for d in matrix[i]])
        out += '\n'
    if stamp:
        out += '{1} {0}'.format(stamp, comment)
    if not filename:
        return out
    else:
        util.write_text_file(filename + '.dst', out)
Example no. 44
def wl2qlc(
        header,
        data,
        filename='',
        formatter='concept',
        **keywords):
    """
    Write the basic data of a wordlist to file.
    """
    util.setdefaults(
        keywords,
        ignore=['taxa', 'doculects', 'msa'],
        fileformat='qlc',
        prettify=True)
    if keywords['ignore'] == 'all':
        keywords['ignore'] = [
            'taxa', 'scorer', 'meta', 'distances', 'doculects', 'msa', 'json']

    formatter = formatter.upper()
    if not filename:
        filename = rcParams['filename']

    # create output string
    out = '# Wordlist\n' if keywords['prettify'] else ''

    # write meta to file
    meta = keywords.get("meta", {})
    kvpairs = {}
    jsonpairs = {}
    msapairs = {}
    trees = {}
    distances = ''
    taxa = ''
    scorer = ''

    for k, v in meta.items():
        # simple key-value-pairs
        if isinstance(v, (text_type, int)) or k == "tree":
            kvpairs[k] = v
        elif k == 'msa' and k not in keywords['ignore']:
            # go a level deeper, checking for keys
            for ref in v:
                if ref not in msapairs:
                    msapairs[ref] = {}
                for a, b in v[ref].items():
                    msapairs[ref][a] = b
        elif k == 'distances':
            distances = matrix2dst(v, meta['taxa'])
        elif k in ['taxa', 'doculect', 'taxon', 'doculects']:
            # we need to find a better solution here, since it is not nice to
            # have taxa written to json again and again
            pass
        elif k == 'trees' and k not in keywords['ignore']:
            trees = ''
            for key, value in v.items():
                trees += '<tre id="{0}">\n{1}\n</tre>\n'.format(key, value)
        elif k == 'scorer' and k not in keywords['ignore']:
            for key, value in v.items():
                scorer += '<{2} id="{0}">\n{1}</{2}>\n\n'.format(
                    key, scorer2str(value), k)
        else:
            # check whether serialization works
            try:
                json.dumps(v)
                jsonpairs[k] = v
            except TypeError:
                pass

    if kvpairs and 'meta' not in keywords['ignore']:
        out += '\n# META\n' if keywords['prettify'] else ''
        for k, v in sorted(kvpairs.items(), key=lambda x: x[0]):
            out += '@{0}:{1}\n'.format(k, v)
    if taxa and keywords['taxa']:
        out += '\n# TAXA\n<taxa>\n' + taxa + '\n</taxa>\n'
    if jsonpairs and 'json' not in keywords['ignore']:
        out += "@json: " + json.dumps(jsonpairs) + '\n'
    if msapairs and 'msa' not in keywords['ignore']:
        for ref in msapairs:
            out += "\n# MSA reference: {0}\n".format(ref)
            for k, v in msapairs[ref].items():
                if 'consensus' in v:
                    out += '#\n<msa '
                    out += 'id="{0}" ref="{1}" consensus="{2}">\n'.format(
                        k, ref, ' '.join(v['consensus']))
                else:
                    out += '#\n<msa id="{0}" ref="{1}">\n'.format(k, ref)
                outs = msa2str(v, wordlist=True)
                out += outs
                out += "</msa>\n"

    if distances and 'distances' not in keywords['ignore']:
        out += '\n# DISTANCES\n<dst>\n'
        out += distances + '</dst>\n'

    if trees:
        out += '\n# TREES\n' + trees

    if scorer and 'scorer' not in keywords['ignore']:
        out += '\n# SCORER\n' + scorer

    out += '\n# DATA\n' if keywords['prettify'] else ''
    out += 'ID\t' + '\t'.join(header) + '\n'

    # check for gloss in header to create nice output format
    if formatter in header:
        idx = header.index(formatter)
        formatter = None
        sorted_data = sorted(data.keys(), key=lambda x: data[x][idx])
    elif len(formatter.split(',')) == 2:
        idxA, idxB = formatter.split(',')
        idxA = header.index(idxA)
        idxB = header.index(idxB)
        idx = idxA
        formatter = None
        sorted_data = sorted(data.keys(), key=lambda x: (
            data[x][idxA], data[x][idxB]))
    else:
        idx = False
        formatter = ''
        sorted_data = sorted(data.keys())

    for key in sorted_data:
        # get the line
        line = data[key]

        # check for formatter
        if idx in range(len(line)):
            if line[idx] != formatter:
                out += '#\n' if keywords['prettify'] else ''
                formatter = line[idx]

        # add the key
        out += text_type(key)

        # add the rest of the values
        for value in line:
            if type(value) == list:
                try:
                    out += '\t' + ' '.join(value)
                except TypeError:
                    # the list contains non-string items
                    out += '\t' + ' '.join([text_type(v) for v in value])
            elif type(value) == int:
                out += '\t' + text_type(value)
            elif type(value) == float:
                out += '\t{0:.4f}'.format(value)
            elif value is None:
                out += '\t'
            else:
                out += '\t{:}'.format(value)
        out += '\n'

    util.write_text_file(
        filename + '.' + keywords['fileformat'],
        out + keywords.get('stamp', ''),
        normalize="NFC")
    return
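
# Usage sketch (added): a minimal wordlist body written to 'germanic.qlc'.
# Column names are assumed to be uppercase so that the default
# formatter='concept' matches the 'CONCEPT' column.
if __name__ == '__main__':
    header = ['DOCULECT', 'CONCEPT', 'IPA']
    data = {
        1: ['German', 'hand', 'hant'],
        2: ['English', 'hand', 'hænd'],
    }
    wl2qlc(header, data, filename='germanic')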
Example no. 45
0
def wl2qlc(header, data, filename='', formatter='concept', **keywords):
    """
    Write the basic data of a wordlist to file.
    """
    util.setdefaults(keywords,
                     ignore=['taxa', 'doculects', 'msa'],
                     fileformat='qlc',
                     prettify=True)
    if keywords['ignore'] == 'all':
        keywords['ignore'] = [
            'taxa', 'scorer', 'meta', 'distances', 'doculects', 'msa', 'json'
        ]

    formatter = formatter.upper()
    if not filename:
        filename = rcParams['filename']

    # create output string
    out = '# Wordlist\n' if keywords['prettify'] else ''

    # write meta to file
    meta = keywords.get("meta", {})
    kvpairs = {}
    jsonpairs = {}
    msapairs = {}
    trees = {}
    distances = ''
    taxa = ''
    scorer = ''

    for k, v in meta.items():
        # simple key-value-pairs
        if isinstance(v, (str, int)) or k == "tree":
            kvpairs[k] = v
        elif k == 'msa' and k not in keywords['ignore']:
            # go a level deeper, checking for keys
            for ref in v:
                if ref not in msapairs:
                    msapairs[ref] = {}
                for a, b in v[ref].items():
                    msapairs[ref][a] = b
        elif k == 'distances':
            distances = matrix2dst(v, meta['taxa'])
        elif k in ['taxa', 'doculect', 'taxon', 'doculects']:
            # we need to find a better solution here, since it is not nice to
            # have taxa written to json again and again
            pass
        elif k == 'trees' and k not in keywords['ignore']:
            trees = ''
            for key, value in v.items():
                trees += '<tre id="{0}">\n{1}\n</tre>\n'.format(key, value)
        elif k == 'scorer' and k not in keywords['ignore']:
            for key, value in v.items():
                scorer += '<{2} id="{0}">\n{1}</{2}>\n\n'.format(
                    key, scorer2str(value), k)
        else:
            # check whether serialization works
            try:
                json.dumps(v)
                jsonpairs[k] = v
            except TypeError:
                pass

    if kvpairs and 'meta' not in keywords['ignore']:
        out += '\n# META\n' if keywords['prettify'] else ''
        for k, v in sorted(kvpairs.items(), key=lambda x: x[0]):
            out += '@{0}:{1}\n'.format(k, v)
    if taxa and keywords['taxa']:
        out += '\n# TAXA\n<taxa>\n' + taxa + '\n</taxa>\n'
    if jsonpairs and 'json' not in keywords['ignore']:
        out += "@json: " + json.dumps(jsonpairs) + '\n'
    if msapairs and 'msa' not in keywords['ignore']:
        for ref in msapairs:
            out += "\n# MSA reference: {0}\n".format(ref)
            for k, v in msapairs[ref].items():
                if 'consensus' in v:
                    out += '#\n<msa '
                    out += 'id="{0}" ref="{1}" consensus="{2}">\n'.format(
                        k, ref, ' '.join(v['consensus']))
                else:
                    out += '#\n<msa id="{0}" ref="{1}">\n'.format(k, ref)
                outs = msa2str(v, wordlist=True)
                out += outs
                out += "</msa>\n"

    if distances and 'distances' not in keywords['ignore']:
        out += '\n# DISTANCES\n<dst>\n'
        out += distances + '</dst>\n'

    if trees:
        out += '\n# TREES\n' + trees

    if scorer and 'scorer' not in keywords['ignore']:
        out += '\n# SCORER\n' + scorer

    out += '\n# DATA\n' if keywords['prettify'] else ''
    out += 'ID\t' + '\t'.join(header) + '\n'

    # check for gloss in header to create nice output format
    if formatter in header:
        idx = header.index(formatter)
        formatter = None
        sorted_data = sorted(data.keys(), key=lambda x: data[x][idx])
    elif len(formatter.split(',')) == 2:
        idxA, idxB = formatter.split(',')
        idxA = header.index(idxA)
        idxB = header.index(idxB)
        idx = idxA
        formatter = None
        sorted_data = sorted(data.keys(),
                             key=lambda x: (data[x][idxA], data[x][idxB]))
    else:
        idx = False
        formatter = ''
        sorted_data = sorted(data.keys())

    for key in sorted_data:
        # get the line
        line = data[key]

        # check for formatter
        if idx in range(len(line)):
            if line[idx] != formatter:
                out += '#\n' if keywords['prettify'] else ''
                formatter = line[idx]

        # add the key
        out += str(key)

        # add the rest of the values
        for value in line:
            if type(value) == list:
                try:
                    out += '\t' + ' '.join(value)
                except TypeError:
                    # the list contains non-string items
                    out += '\t' + ' '.join([str(v) for v in value])
            elif type(value) == int:
                out += '\t' + str(value)
            elif type(value) == float:
                out += '\t{0:.4f}'.format(value)
            elif value is None:
                out += '\t'
            else:
                out += '\t{:}'.format(value)
        out += '\n'

    util.write_text_file(filename + '.' + keywords['fileformat'],
                         out + keywords.get('stamp', ''),
                         normalize="NFC")
    return
Example no. 46
0
def pap2nex(
    taxa,
    paps,
    missing=0,
    filename=''
):
    """
    Function converts a list of paps into nexus file format.

    Parameters
    ----------
    taxa : list
        List of taxa.
    paps : {list, dict}
        A two-dimensional list with the first dimension being identical to the
        number of taxa and the second dimension being identical to the number
        of paps. If a dictionary is passed, each key represents a given pap.
        The following two structures will thus be treated identically::
            
          >>> paps = [[1,0],[1,0],[1,0]] # two languages, three paps
          >>> paps = {1:[1,0], 2:[1,0], 3:[1,0]} # two languages, three paps
    
    missing : {str, int} (default=0)
        Indicate how missing characters are represented in the original data.

    """
    out = '#NEXUS\n\nBEGIN DATA;\nDIMENSIONS ntax={0} NCHAR={1};\n'
    out += "FORMAT DATATYPE=STANDARD GAP=- MISSING={2} interleave=yes;\n"
    out += "MATRIX\n\n{3}\n;\n\nEND;\n"
    out += "[PAPS-REFERENCE]\n{4}"

    # get longest taxon
    maxTax = max([len(taxon) for taxon in taxa])
    paps_ref = ""

    # check whether paps are dict or list
    if hasattr(paps, 'keys'):
        new_paps = [paps[k] for k in sorted(paps)]
        reference = [k for k in sorted(paps)]
    else:
        new_paps = paps
        reference = [k for k in range(1, len(paps)+1)]
    
    # create reference
    ref_string = ''
    for i, ref in enumerate(reference):
        ref_string += '[{0} :: {1}]\n'.format(i, ref)
    # create the matrix
    matrix = ""

    for i, taxon in enumerate(taxa):
        tmp = '{0:XXX} '
        matrix += tmp.replace('XXX', str(maxTax)).format(taxon)
        matrix += ''.join([str(itm[i]) for itm in new_paps])
        matrix += '\n'

    if not filename:
        return out.format(
            len(taxa),
            len(paps),
            missing,
            matrix,
            ref_string
        )
    util.write_text_file(
        filename + '.nex',
        out.format(len(taxa), len(paps), missing, matrix, ref_string))
    return
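
# Usage sketch (added): three paps over two taxa; since no filename is
# passed, the nexus-formatted string is returned rather than written.
if __name__ == '__main__':
    taxa = ['German', 'English']
    paps = {1: [1, 0], 2: [1, 0], 3: [1, 1]}
    print(pap2nex(taxa, paps))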
Example no. 47
0
def write_nexus(taxa,
                matrix,
                custom=None,
                custom_name='lingpy',
                missing="?",
                gap="-",
                template="mrbayes.nex",
                filename="mrbayes.nex",
                dtype="RESTRICTION",
                symbols="10",
                commands=None,
                commands_name="mrbayes"):
    """Write a nexus file for phylogenetic analyses.

    Parameters
    ----------
    taxa : list
        The taxonomic units in your data. They should be valid taxon names,
        only consisting of alphanumeric characters and an underscore, usually
        also not exceeding a length of 15 characters.
    matrix : list
        The matrix with the values for each taxon in one separate row. Usually,
        the matrix contains binary values which can be passed as strings or
        integers (1 and 0), but missing values are also possible. Given
        biological common restrictions, each character can only be one ASCII
        symbol.
    custom : list (default=None)
        Allows adding custom information to the nexus file, for example the
        structure of the characters, their original concept, or their type.
        This information is written into a custom block in the nexus file,
        whose name can be specified with help of the `custom_name` keyword.
        The content is a list of strings which will be written line by line
        into the custom block.
    custom_name : str (default="lingpy")
        The name of the custom block which will be written to the file.
    missing : str (default="?")
        The symbol for missing characters.
    gap : str (default="-")
        The symbol for gaps (not relevant for linguistic analyses).
    template : str (default="mrbayes.nex")
        The name of the template file. This file is located in the template/
        folder of the LingPy package, but a custom file can be specified by
        providing the path.
    filename : str (default="mrbayes.nex")
        The name of the file to which the nexus data will be written.
    dtype : str (default="RESTRICTION")
        The datatype, which is usually "STANDARD" or "RESTRICTION" in
        linguistic analyses, with "RESTRICTION" pointing to pure birth-death
        models.
    symbols : str (default="10")
        The symbols used for the characters.
    commands : list (default=None)
        If specified, will write an additional block containing commands for
        phylogenetic software. The commands are passed as a list, containing
        strings. The name of the block is given by the keywords commands_name.
    commands_name : str (default="mrbayes")
        Determines how the block will be called to which the commands will be
        written.

    """
    tpath = util.Path(template_path(template))
    if tpath.exists():
        _template = util.read_text_file(tpath.as_posix())
    else:
        _template = util.read_text_file(template)
    _commands = 'BEGIN {0};\n{1}\n\n'.format(
        commands_name, '\n'.join(commands)) if commands else ''
    _custom = 'BEGIN {0};\n{1}\n\n'.format(custom_name,
                                           '\n'.join(custom)) if custom else ''

    _matrix = ""
    mtl = max([len(t) for t in taxa]) + 1
    for i, (t, m) in enumerate(zip(taxa, matrix)):
        _matrix += str(t + mtl * ' ')[:mtl] + ' '
        _matrix += ''.join(
            ['({0})'.format(c) if len(c) > 1 else str(c) for c in m]) + '\n'

    text = _template.format(matrix=_matrix,
                            ntax=len(taxa),
                            nchar=len(matrix[0]),
                            gap=gap,
                            missing=missing,
                            dtype=dtype,
                            commands=_commands,
                            custom=_custom,
                            symbols=symbols)
    util.write_text_file(filename, text)
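
# Usage sketch (added): a binary matrix for two taxa, rendered through the
# default "mrbayes.nex" template, which is assumed to ship with the package.
if __name__ == '__main__':
    write_nexus(
        taxa=['German', 'English'],
        matrix=[['1', '0', '1'],
                ['1', '1', '?']],
        commands=['set autoclose=yes;', 'mcmc ngen=100000;'],
        filename='test-mrbayes.nex')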
Example no. 48
0
def msa2html(
    msa,
    shorttitle='',
    filename='',
    template='',
    **keywords
):
    """
    Convert files in ``msa``-format into colored ``html``-format.

    Parameters
    ----------
    msa : dict
        A dictionary object that contains all the information of an MSA object.

    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``SCA`` will be used.

    filename : str (default="")
        Define the name of the output file. If no name is defined, the name of
        the input file will be taken as a default.

    template : str (default="")
        The path to the template file. If no name is defined, the basic
        template will be used. The basic template currently used can be found
        under ``lingpy/data/templates/msa2html.html``.

    Examples
    --------
    Load the library.

    >>> from lingpy import *
    
    Load an ``msq``-file from the test-sets.

    >>> msa = MSA('harry.msq')

    Align the data progressively and carry out a check for swapped sites.

    >>> msa.prog_align()
    >>> msa.swap_check()
    >>> print(msa)
    w    o    l    -    d    e    m    o    r    t
    w    a    l    -    d    e    m    a    r    -
    v    -    l    a    d    i    m    i    r    -

    Save the data to the file ``harry.msa``.

    >>> msa.output('msa',filename='harry')

    Save the ``msa``-object as ``html``.

    >>> msa.output('html',filename='harry')
    
    Notes
    -----
    The coloring of sound segments with respect to the sound class they belong
    to is based on the definitions given in the ``color``
    :py:class:`~lingpy.data.model.Model`. It can easily be changed and adapted.
    

    See also
    --------
    lingpy.convert.html.alm2html
    """
    util.setdefaults(
        keywords,
        pid_mode=1,
        stress=rcParams['stress'],
        css=False,
        js=False,
        compact=False,
        class_sort=True,
        write_to_file=True,
    )

    # while alm-format can be read from the text-file without problems,
    # msa-format should be loaded first; since output of data is not a daily
    # task, the loss in speed won't matter much

    # load templates
    template = template or template_path('msa2html.html')
    if template == 'js':
        template = template_path('msa2html.js.html')
    html = util.read_text_file(template)
    css = util.read_text_file(keywords['css'] or template_path('msa.css'))
    js = util.read_text_file(keywords['js'] or template_path('msa.js'))

    # treat the msa-object as a file and try to load the file if this is the
    # case
    if isinstance(msa, string_types):
        msa = read_msa(msa, **keywords)
    elif not isinstance(msa, dict):
        raise ValueError('[!] msa must be a filename or an MSA dictionary.')

    # load dataset, etc.
    dataset = msa['dataset']

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    infile = msa['infile']
    seq_id = msa['seq_id']

    # define the titles etc.
    if not shorttitle:
        shorttitle = 'SCA'

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    # format css file 
    css = css.replace('TAXON_LENGTH', str(taxl * 10))

    out = ''
    tr = '<tr class="msa" unique="{1}" taxon={2} sequence={3}>{0}</tr>\n'
    td_taxon = '<td class="taxon">{0}</td>'
    perc = int(80 / len(msa['alignment'][0]) + 0.5)
    td_residue = '<td class="residue {1}">{0}</td>'
    td_swap = '<td class="residue swap {1}">{0}</td>'
    td_unaligned = '<td class="residue noalign {1}">{0}</td>'

    # check for swaps in the alignment
    if 'swaps' in msa:
        swaps = []
        for s in msa['swaps']:
            swaps.extend(s)
    else:
        swaps = []

    # check for local (unaligned) positions in the alignment
    local = ['*'] * len(msa['alignment'][0])
    if 'local' in msa:
        local = ['.'] * len(msa['alignment'][0])
        for i in msa['local']:
            local[i] = '*'

    # get two sorting schemas for the sequences
    if keywords['class_sort']:

        classes = [tokens2class(ipa2tokens(seq), rcParams['asjp']) for seq in msa['seqs']]
        seqs = dict(
            [(a[1], b) for a, b in zip(
                sorted(
                    zip(classes, msa['seqs']),
                    key=lambda x: x[0]  # list(zip(x[0],x[1]))
                ),
                range(1, len(msa['seqs']) + 1)
            )]
        )
    else:
        seqs = dict(zip(sorted(msa['seqs']), range(1, len(msa['seqs']) + 1)))
    taxa = dict(zip(sorted(msa['taxa']), range(1, len(msa['taxa']) + 1)))

    # set up a list to store unique alignments
    alignments = []

    # start iteration
    for i, taxon in enumerate(msa['taxa']):
        tmp = ''
        tmp += td_taxon.format(taxon)

        # append alignment to alignments
        alignment = ''.join(msa['alignment'][i])
        sequence = msa['seqs'][i]
        if alignment in alignments:
            unique = 'false'
        else:
            unique = 'true'
            alignments += [alignment]

        for j, char in enumerate(msa['alignment'][i]):
            if char == '-':
                d = 'dolgo_GAP'
                c = '#bbbbbb'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])
                c = token2class(char, rcParams['_color'])

                # bad check for three classes named differently
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'

            if j in swaps:
                tmp += td_swap.format(char, d)
            elif local[j] != '*':
                tmp += td_unaligned.format(char, d)
            else:
                tmp += td_residue.format(char, d)
        out += tr.format(tmp, unique, taxa[taxon], seqs[sequence])

    html = html.format(
        table=out,
        dataset=dataset,
        pid=pid_score,
        file=infile,
        sequence=seq_id,
        shorttitle=shorttitle,
        width=len(msa['alignment'][0]),
        table_width='{0}'.format(len(msa['alignment'][0]) * 50 + 8 * taxl),
        taxa=len(msa['alignment']),
        uniseqs=len(set(msa['seqs'])),
        css=css,
        js=js
    )

    if not filename:
        filename = rcParams['filename']

    if not filename.endswith('.html'):
        filename = filename + '.html'

    if keywords['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    if keywords['write_to_file']:
        # check, whether the outfile already exists
        util.write_text_file(filename, html)
    else:
        return html
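
# Usage sketch (added): msa2html can also be called directly on the path of
# a saved msa-file (the file 'harry.msa' is hypothetical).
if __name__ == '__main__':
    msa2html('harry.msa', shorttitle='SCA', filename='harry')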
Example no. 49
0
def compile_model(model, path=None):
    """
    Function compiles customized sound-class models.

    Parameters
    ----------

    model : str
        A string indicating the name of the model which shall be created.

    path : str
        A string indicating the path where the model-folder is stored.

    Notes
    -----
    A model is defined by a folder placed in :file:`data/models` directory of
    the LingPy package. The name of the folder reflects the name of the model.
    It contains the files :file:`converter` and :file:`INFO`, as well as
    the optional files :file:`matrix` and :file:`scorer`. The format
    requirements for these
    files are as follows:

    :file:`INFO`
        The ``INFO``-file serves as a reference for a given sound-class model.
        It can contain arbitrary information (and also be empty). If one wants
        to define specific characteristics, like the ``source``, the
        ``compiler``, the ``date``, or a ``description`` of a given model,
        this can be done by employing a key-value structure in which the key is
        preceded by an ``@`` and followed by a colon and the value is written
        right next to the key in the same line, e.g.::
            
            @source: Dolgopolsky (1986)

        This information will then be read from the ``INFO`` file and rendered
        when printing the model to screen with help of the :py:func:`print`
        function.

    :file:`converter`
        The ``converter`` file contains all sound classes which are matched
        with their respective sound values. Each line is reserved for one
        class, preceded by the key (preferably an ASCII-letter) representing the
        class::

            B : ɸ, β, f, p͡f, p͜f, ƀ
            E : ɛ, æ, ɜ, ɐ, ʌ, e, ᴇ, ə, ɘ, ɤ, è, é, ē, ě, ê, ɚ
            D : θ, ð, ŧ, þ, đ
            G : x, ɣ, χ
            ...
    
    :file:`matrix`
        A scoring matrix indicating the alignment scores of all sound-class
        characters defined by the model. The scoring is structured as a simple
        tab-delimited text file. The first cell contains the character names,
        the following cells contain the scores in redundant form (with both
        triangles being filled)::

            B  10.0 -10.0   5.0 ...
            E -10.0   5.0 -10.0 ...
            F   5.0 -10.0  10.0 ...
            ...

    :file:`scorer`
        The ``scorer`` file (which is optional) contains the graph of
        class-transitions which is used for the calculation of the scoring
        dictionary. Each class is listed in a separate line, followed by the
        symbols ``v``, ``c``, or ``t`` (indicating whether the class
        represents vowels, consonants, or tones), and by the classes it is
        directly connected to. The strength of this connection is indicated by
        digits (the smaller the value, the shorter the path between the
        classes)::

            A : v, E:1, O:1
            C : c, S:2
            B : c, W:2
            E : v, A:1, I:1
            D : c, S:2
            ...
        
        The information in such a file is automatically converted into a
        scoring dictionary (see :evobib:`List2012b` for details).

    Based on the information provided by the files, a dictionary for the
    conversion of IPA-characters to sound classes and a scoring dictionary are
    created and stored as a binary.  The model can be loaded with help of the
    :py:class:`~lingpy.data.model.Model` class and used in the various classes
    and functions provided by the library.
    
    See also
    --------
    lingpy.data.model.Model
    compile_dvt

    """
    log.info("Compiling model <" + model + ">...")
    # get the path to the models
    new_path = lambda *cmps: os.path.join(path or util.data_path('models'),
                                          model, *cmps)

    log.debug("Model-Path: %s" % new_path())

    # load the sound classes
    sound_classes = _import_sound_classes(new_path('converter'))

    # dump the data
    cache.dump(sound_classes, model + '.converter')
    log.info("... successfully created the converter.")

    # try to load the scoring function or the score tree
    scorer = False

    if os.path.isfile(new_path('matrix')):
        scorer = read_scorer(new_path('matrix'))
    elif os.path.isfile(new_path('scorer')):
        score_tree = _import_score_tree(new_path('scorer'))

        # calculate the scoring dictionary
        score_dict = _make_scoring_dictionary(score_tree)

        # make score_dict a ScoreDict instance
        chars = sorted(set([s[0] for s in score_dict.keys()]))
        matrix = [[0 for i in range(len(chars))] for j in range(len(chars))]
        for (i, charA), (j,
                         charB) in util.multicombinations2(enumerate(chars)):
            if i < j:
                matrix[i][j] = score_dict.get((charA, charB), -100)
                matrix[j][i] = score_dict.get((charB, charA), -100)
            elif i == j:
                matrix[i][j] = score_dict[charA, charB]

        scorer = misc.ScoreDict(chars, matrix)
        util.write_text_file(new_path('matrix'), scorer2str(scorer))

    if scorer:
        cache.dump(scorer, model + '.scorer')
        log.info("... successfully created the scorer.")
    else:
        log.info("... no scoring dictionary defined.")

    log.info("Model <" + model + "> was compiled successfully.")
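
# Usage sketch (added): compile a hypothetical model stored in
# ./models/custom/, which must contain at least a 'converter' file; the
# compiled converter (and scorer, if defined) is stored in lingpy's cache.
if __name__ == '__main__':
    compile_model('custom', path='./models')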
Example no. 50
0
def alm2html(
    infile,
    title='',
    shorttitle='',
    filename='',
    colored=False,
    main_template='',
    table_template='',
    dataset='',
    confidence=False,
    **keywords
):
    """
    Convert files in ``alm``-format into colored ``html``-format.

    Parameters
    ----------

    title : str
        Define the title of the output file. If no title is provided, the
        default title ``LexStat - Automatic Cognate Judgments`` will be used.

    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``LexStat`` will be used.
    
    Notes
    -----
    The coloring of sound segments with respect to the sound class they belong
    to is based on the definitions given in the
    ``color`` :py:class:`~lingpy.data.model.Model`. It can easily be changed
    and adapted. 

    See also
    --------
    lingpy.convert.html.msa2html
    lingpy.convert.html.msa2tex

    """
    util.setdefaults(keywords, json="", labels={})

    # open the infile
    if not os.path.exists(infile):
        infile = infile + '.alm'
    data = util.read_text_file(infile)

    # create the outfile
    if not filename:
        filename = rcParams['filename']

    # read in the templates
    html = util.read_text_file(main_template or template_path('alm2html.html'))
    if not table_template:
        table_template = template_path(
            'alm2html.table.js.html' if confidence else 'alm2html.table.html')
    table = util.read_text_file(table_template)
    css = util.read_text_file(template_path('alm.css'))
    js = util.read_text_file(template_path('alm.js'))

    # define a label function for the taxa
    label = lambda x: keywords['labels'][x] if x in keywords['labels'] else x

    # check for windows-compatibility
    data = data.replace(os.linesep, '\n')[:-1]

    # split the data into blocks
    blocks = data.split('\n\n')

    # retrieve the dataset
    dataset = dataset or blocks[0]

    # create the outstring
    tmp_str = ''

    for block in blocks[1:]:
        lines = block.split('\n')
        m = [l.split('\t') for l in lines]

        # create colordict for different colors
        dc = len(set([l[0] for l in m]))

        if colored:
            colors = {a: b for a, b in zip(
                sorted(set([int(l[0]) for l in m])),
                colorRange(dc, brightness=400),
            )}
        else:
            colors = []
            white = True
            for i in sorted(set([abs(int(l[0])) for l in m])):
                if white:
                    colors.append((i, 'white'))
                    white = False
                else:
                    colors.append((i, 'gray'))
                    white = True
            colors = dict(colors)

        # get the basic item and its id
        iName = m[0][2]
        iID = m[0][3]

        # start writing the stuff to string
        tmp_str += table.format(NAME=iName, ID=iID)
        # define the basic string for the insertion
        bas = ' <tr class="{0}{2} taxon" taxon="{3}">\n{1}'

        for tracer, l in enumerate(m):
            # check whether the current line is a borrowing
            if int(l[0]) < 0:
                loan_line = ' loan'
            else:
                loan_line = ''

            # assign the cognate id
            tmp = '  <td>{0}</td>\n'.format(l[0])
            tmp += '  <td>{0}</td>\n'.format(label(l[1].strip('.')))

            # check alignments for confidence scores
            ipa_string = ''.join([cell.split('/')[0] for cell in
                                  l[4:]]).replace('-', '')

            tmp += '  <td>{0}</td>\n'.format(ipa_string)
            tmp += '  <td class="{0}">\n'.format(colors[abs(int(l[0]))])
            tmp += '   <table class="{0}">\n'.format(colors[abs(int(l[0]))])
            tmp += '    <tr>\n{0}    </tr>\n   </table>\n  </td>\n </tr>\n'

            # check whether another entry follows that is also an alignment,
            # otherwise, there's no need to display a word as an alignment
            cognate_set = False
            if tracer < len(m) - 1:
                if abs(int(m[tracer + 1][0])) == abs(int(l[0])):
                    cognate_set = True
            if tracer > 0:
                if abs(int(m[tracer - 1][0])) == abs(int(l[0])):
                    cognate_set = True

            # fill out html for the cognate sets
            if cognate_set:

                alm = ''
                for char in l[4:]:

                    # check for confidence scores
                    if '/' in char:
                        try:
                            char, conf, num = char.split('/')
                            conf = int(conf)
                        except ValueError:
                            print(char.split('/'))
                            raise ValueError("Something is wrong with %s." % (char))

                    else:
                        # no confidence score is encoded in this cell
                        conf, num = 0, 0

                    if char == '-':
                        d = 'dolgo_GAP'
                    else:
                        d = 'dolgo_' + token2class(char, rcParams['dolgo'])

                        # bad check for three classes named differently
                        if d == 'dolgo__':
                            d = 'dolgo_X'
                        elif d == 'dolgo_1':
                            d = 'dolgo_TONE'
                        elif d == 'dolgo_0':
                            d = 'dolgo_ERROR'

                    if confidence:
                        alm += '     '
                        alm += '<td class="char {1}" confidence={0} '.format(
                            conf,
                            d
                        )
                        alm += 'char="{0}" '.format(char)
                        alm += 'onclick="' + "show('{0}')".format(num) + '" '
                        alm += 'num="{0}"'.format(num)
                        alm += '>\n      {0}\n     </td>\n'.format(char)
                    else:
                        alm += '     '
                        alm += '<td class="char {0}">{1}</td>\n'.format(d, char)
            else:
                alm = '      '
                alm += '<td class="{0}">--</td>\n'.format(colors[abs(int(l[0]))])

            # format the alignment
            try:
                tmp = tmp.format(alm)
            except ValueError:
                raise ValueError("Unknown problem in matching %s and %s." % (alm, tmp))

            # check for the last line, where a new line should be inserted
            # (not the fastest solution, but plotting is not time-critical,
            # and this suffices for its current purpose)
            if tracer < len(m) - 1:
                pass
            else:
                if confidence:
                    tmp += ' </table>\n'

                tmp += ' <tr class="empty"><td colspan="4" class="empty">'
                tmp += '<hr class="empty" /></td></tr>\n'

            # format the whole string
            tmp_str += bas.format(
                colors[abs(int(l[0]))],
                tmp,
                loan_line,
                l[1]
            )

    if not title:
        title = "LexStat - Automatic Cognate Judgments"
    if not shorttitle:
        shorttitle = "LexStat"

    # check for json-attribute
    if keywords['json']:
        keywords['json'] = 'var myjson = ' + json.dumps(keywords['json'],
                                                        indent=1)

    html = html.format(
        shorttitle=shorttitle,
        title=title,
        table=tmp_str,
        dataset=dataset,
        javascript=js,
        css=css,
        **keywords
    )
    util.write_text_file(filename + '.html', html)
    return
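
# Usage sketch (added): convert a hypothetical alignment file 'harry.alm'
# (as produced by lingpy's Alignments output) into a colored html page.
if __name__ == '__main__':
    alm2html('harry', filename='harry', title='Automatic Cognate Judgments')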
Example no. 51
0
    def _output(self, fileformat, **keywords):
        """
        Internal function that eases its modification by daughter classes.
        """
        # check for stamp attribute
        keywords["stamp"] = getattr(self, '_stamp', '')

        # add the default parameters, they will be checked against the keywords
        util.setdefaults(
            keywords,
            cols=False,
            distances=False,
            entries=("concept", "counterpart"),
            entry='concept',
            fileformat=fileformat,
            filename=rcParams['filename'],
            formatter='concept',
            modify_ref=False,
            meta=self._meta,
            missing=0,
            prettify=False,
            ignore='all',
            ref='cogid',
            rows=False,
            subset=False,  # setup a subset of the data,
            taxa='taxa',
            threshold=0.6,  # threshold for flat clustering
            tree_calc='neighbor')

        if fileformat in ['triple', 'triples', 'triples.tsv']:
            return tsv2triple(self, keywords['filename'] + '.' + fileformat)

        if fileformat in ['paps.nex', 'paps.csv']:
            paps = self.get_paps(
                ref=keywords['ref'], entry=keywords['entry'], missing=keywords['missing'])
            kw = dict(filename=keywords['filename'] + '.paps')
            if fileformat == 'paps.nex':
                kw['missing'] = keywords['missing']
                return pap2nex(self.cols, paps, **kw)
            return pap2csv(self.cols, paps, **kw)

        # simple printing of taxa
        if fileformat == 'taxa':
            assert hasattr(self, 'taxa')
            return util.write_text_file(keywords['filename'] + '.taxa', self.cols)

        # csv-output
        if fileformat in ['csv', 'qlc', 'tsv']:

            # get the header line
            header = sorted(
                [s for s in set(self._alias.values()) if s in self._header],
                key=lambda x: self._header[x])
            header = [h.upper() for h in header]

            self._meta.setdefault('taxa', self.cols)

            # get the data, in case a subset is chosen
            if not keywords['subset']:
                # write stuff to file
                return wl2qlc(header, self._data, **keywords)

            cols, rows = keywords['cols'], keywords['rows']

            if not isinstance(cols, (list, tuple, bool)):
                raise ValueError("[i] Argument 'cols' should be list or tuple.")
            if not isinstance(rows, (dict, bool)):
                raise ValueError("[i] Argument 'rows' should be a dictionary.")

            # check for chosen header
            if cols:
                # get indices for header
                indices = [self._header[x] for x in cols]
                header = [c.upper() for c in cols]
            else:
                indices = [r for r in range(len(self.header))]

            if rows:
                stmts = []
                for key, value in rows.items():
                    if key == 'ID':
                        stmts += ["key " + value]
                    else:
                        idx = self._header[key]
                        stmts += ["line[{0}] ".format(idx) + value]

            log.debug("calculated what should be excluded")

            # get the data
            out = {}
            for key, line in self._data.items():
                log.debug(key)

                if rows:
                    if eval(" and ".join(stmts)):
                        out[key] = [line[i] for i in indices]
                else:
                    out[key] = [line[i] for i in indices]

            log.debug("passing data to wl2qlc")
            return wl2qlc(header, out, **keywords)

        # output dst-format (phylip)
        if fileformat == 'dst':
            # check for distances as keyword
            if 'distances' not in self._meta:
                self._meta['distances'] = wl2dst(self, **keywords)

            out = matrix2dst(self._meta['distances'], self.taxa,
                    stamp=keywords['stamp'], taxlen=keywords.get('taxlen', 0))
            return _write_file(keywords['filename'], out, fileformat)

        # output tre-format (newick)
        if fileformat in ['tre', 'nwk']:  # ,'cluster','groups']:
            if 'tree' not in self._meta:
                # check for distances
                if 'distances' not in self._meta:
                    self._meta['distances'] = wl2dst(self)
                # we look up a function to calculate a tree in the cluster module:
                tree = getattr(cluster, keywords['tree_calc'])(
                    self._meta['distances'], self.cols, distances=keywords['distances'])
            else:
                tree = self._meta['tree']

            return _write_file(keywords['filename'], '{0}'.format(tree), fileformat)

        if fileformat in ['cluster', 'groups']:
            if 'distances' not in self._meta:
                self._meta['distances'] = wl2dst(self)  # check for keywords

            if 'groups' not in self._meta:
                self._meta['groups'] = cluster.matrix2groups(
                    keywords['threshold'], self._meta['distances'], self.taxa)
            lines = []
            for taxon, group in sorted(self._meta['groups'].items(), key=lambda x: x[0]):
                lines.append('{0}\t{1}'.format(taxon, group))
            return _write_file(keywords['filename'], lines, fileformat)

        if fileformat in ['starling', 'star.csv']:
            # make lambda inline for data-check
            l = lambda x: '-' if x == 0 else x

            lines = []
            if 'cognates' not in keywords:
                lines.append('ID\tConcept\t' + '\t'.join(self.taxa))
                for i, concept in enumerate(self.concepts):
                    for line in self.get_list(row=concept, entry=keywords['entry']):
                        lines.append(
                            str(i + 1) + '\t' + concept + '\t' + '\t'.join(
                                [l(t) for t in line]))
            else:
                lines.append(
                    'ID\tConcept\t' + '\t'.join(
                        ['{0}\t COG'.format(t) for t in self.taxa]))
                for i, concept in enumerate(self.concepts):
                    cogs = self.get_list(row=concept, entry=keywords['cognates'])
                    for j, line in enumerate(
                            self.get_list(row=concept, entry=keywords['entry'])):
                        part = '\t'.join(
                            '{0}\t{1}'.format(l(a), b) for a, b in zip(line, cogs[j]))
                        lines.append(util.tabjoin(i + 1, concept, part))

            return _write_file(
                keywords['filename'], lines, 'starling_' + keywords['entry'] + '.csv')

        if fileformat == 'multistate.nex':
            if not keywords['filename'].endswith('.multistate.nex'):
                keywords['filename'] += '.multistate.nex'

            matrix = wl2multistate(self, keywords['ref'], keywords['missing'])
            return multistate2nex(self.taxa, matrix, keywords['filename'])

        if fileformat == 'separated':
            if not os.path.isdir(keywords['filename']):
                os.mkdir(keywords['filename'])

            for l in self.cols:
                lines = [''] if 'ignore_keys' in keywords else ['ID\t']
                lines[0] += '\t'.join(x.upper() for x in keywords['entries'])
                for key in self.get_list(col=l, flat=True):
                    line = [] if 'ignore_keys' in keywords else [key]
                    for entry in keywords['entries']:
                        tmp = self[key, entry]
                        if isinstance(tmp, list):
                            tmp = ' '.join([str(x) for x in tmp])
                        line += [tmp]
                    lines.append('\t'.join('{0}'.format(x) for x in line))
                _write_file('{0}/{1}'.format(keywords['filename'], l), lines, 'tsv')
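
# Usage sketch (added): _output is the internal dispatcher behind the public
# output() method of lingpy's Wordlist, so typical calls look like this
# (assuming a wordlist file 'ksl.qlc'):
#
#     from lingpy import Wordlist
#     wl = Wordlist('ksl.qlc')
#     wl.output('tsv', filename='ksl-export', ignore='all')
#     wl.output('dst', filename='ksl-dist')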
Example no. 52
0
def msa2tex(
    infile,
    template='',
    filename='',
    **keywords
):
    """
    Convert an MSA to a tabular representation which can easily be used in
    LaTeX documents.
    """
    util.setdefaults(keywords, pid_mode=1)

    # while alm-format can be read from the text-file without problems,
    # msa-format should be loaded first; since output of data is not a daily
    # task, the loss in speed won't matter much
    # load msa
    msa = read_msa(infile)

    # load templates
    tex = util.read_text_file(template or template_path('msa.tex'))

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    dataset = msa['dataset']
    infile = msa['infile']
    seq_id = msa['seq_id']

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    height = len(msa['alignment'])
    width = len(msa['alignment'][0])

    start = r'\tabular{l' + width * 'c' + '}\n'
    start += r'\bf\ttfamily Taxon & \multicolumn{' + str(
        width) + r'}{l}{\bf\ttfamily Alignment}\\' + '\n'

    # check for swaps in the alignment
    if 'swaps' in msa:
        swaps = []
        for s in msa['swaps']:
            swaps.extend(s)
    else:
        swaps = []

    body = start
    for i, taxon in enumerate(msa['taxa']):
        body += r'\ttfamily ' + taxon.replace('_', r'\_')
        for j, char in enumerate(msa['alignment'][i]):
            if char != '-':
                cls = token2class(char, rcParams['dolgo'])
            else:
                cls = 'X'
            if char == '_':
                char = r'\#'
            if cls == '_':
                cls = '2'
            if j not in swaps:
                body += r'&\cellcolor{col' + cls + r'}' + char
            else:
                if char != '-':
                    body += r'&\cellcolor{col' + cls + r'}\color{white}\bf ' + char
                else:
                    body += r'&\cellcolor{col' + cls + r'}\bf ' + char
        body += r'\\' + '\n'

    body += r'&' + '&'.join([r'\color{white}XXX' for i in range(width)]) + r'\\' + '\n'
    body += r'\endtabular' + '\n'

    # create the parameters etc.
    w = 1.5 * width + taxl * 0.25
    h = 0.5 * height + 1.0

    tex = tex.replace('<+WIDTH+>', '{0:.2f}'.format(w))
    tex = tex.replace('<+HEIGHT+>', '{0:.2f}'.format(h))

    # create the rput stuff
    tex = tex.replace('<+NEWX+>', '{0:.2f}'.format(w / 2.0))
    tex = tex.replace('<+NEWY+>', '{0:.2f}'.format((h - 0.5) / 2.0))

    # insert the rest
    tex = tex.replace('<+CONTENT+>', body)

    # write to file
    if not filename:
        filename = rcParams['filename']

    util.write_text_file(filename + '.tex', tex)
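
# Usage sketch (added): render a hypothetical msa-file as a LaTeX table in
# 'harry.tex', using the 'msa.tex' template assumed to ship with lingpy.
if __name__ == '__main__':
    msa2tex('harry.msa', filename='harry')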
Example no. 53
0
    def cognate_detection(self, **keywords):
        """
        Method runs a cognate detection analysis.
        """
        kw = dict(
            align_method='progressive',
            align_mode=rcParams['align_mode'],
            align_modes=rcParams['align_modes'],
            cluster_method=rcParams['lexstat_cluster_method'],
            cognate_method='sca',
            cognate_mode='overlap',
            defaults=False,
            factor=rcParams['align_factor'],
            gap_weight=rcParams['gap_weight'],
            gop=rcParams['align_gop'],
            iteration=False,
            lexstat_modes=rcParams['lexstat_modes'],
            limit=rcParams['lexstat_limit'],
            merge_vowels=rcParams['merge_vowels'],
            model=rcParams['sca'],
            export="html",
            preprocessing=False,
            preprocessing_method=rcParams['lexstat_preprocessing_method'],
            preprocessing_threshold=rcParams['lexstat_preprocessing_threshold'],
            rands=rcParams['lexstat_rands'],
            ratio=rcParams['lexstat_ratio'],
            ref="customid",
            restricted_chars=rcParams['restricted_chars'],
            restriction='',
            runs=rcParams['lexstat_runs'],
            scale=rcParams['align_scale'],
            scoring_method=rcParams['lexstat_scoring_method'],
            swap_check=False,
            threshold=rcParams['lexstat_threshold'],
            tree_calc=rcParams['align_tree_calc'],
            vscale=rcParams['lexstat_vscale'],
            outfile=False,
            sonar=True,
        )

        # first load
        kw.update(keywords)
        if kw['defaults']:
            return kw

        # carry out lexstat cluster analysis
        self.lex = LexStat(self.infile, **kw)

        # reset filename if it is not defined
        kw['outfile'] = kw['outfile'] or self.lex.filename + '_lingpy'

        # check for traditional lexstat analysis
        if kw['cognate_method'] == 'lexstat':
            self.lex.get_scorer(
                method=kw['scoring_method'], modes=kw['lexstat_modes'], **kw)

        self.lex.cluster(method=kw['cognate_method'], mode=kw['cognate_mode'], **kw)

        # align the data
        self.alms = Alignments(self.lex, **kw)
        kw['scoredict'] = self.lex.cscorer \
            if kw['cognate_method'] == 'lexstat' else self.lex.bscorer

        self.alms.align(
            method=kw['align_method'],
            mode=kw['align_mode'],
            modes=kw['align_modes'],
            **kw)

        if 'tsv' in kw['export']:
            self.alms.output(
                'tsv',
                filename=kw['outfile'],
                ignore=['scorer', 'json', 'taxa', 'msa'],
                **kw)
        if 'html' in kw['export']:
            corrs, occs = get_correspondences(self.alms, kw['ref'])

            # serialize the wordlist
            wl = {}
            for concept in self.alms.concepts:
                entries = self.alms.get_list(concept=concept, flat=True)
                cogids = [self.alms[idx, kw['ref']] for idx in entries]
                words = [self.alms[idx, 'ipa'] for idx in entries]
                alms = [self.alms[idx, 'alignment'] for idx in entries]
                langs = [self.alms[idx, 'doculect'] for idx in entries]

                checkalm = lambda x: x if type(x) == str else ' '.join(x)

                wl[concept] = [list(k) for k in sorted(
                    zip(
                        langs,
                        [str(x) for x in entries],
                        words,
                        [str(x) for x in cogids],
                        [checkalm(x) for x in alms],
                    ),
                    key=lambda x: int(x[3]))]

            # make the simple gloss id for internal use as id; this block runs
            # once after the concept loop, since it only depends on the full
            # wordlist collected above
            gloss2id = list(
                zip(
                    self.alms.concepts,
                    [str(x) for x in range(1, len(self.alms.concepts) + 1)]))
            id2gloss = dict([[b, a] for a, b in gloss2id])
            gloss2id = dict(gloss2id)

            txt = ''
            txt += 'CORRS = ' + json.dumps(corrs) + ';\n'
            txt += 'LANGS = ' + json.dumps(self.alms.taxa) + ';\n'
            txt += 'OCCS = ' + json.dumps(occs) + ';\n'
            txt += 'WLS = ' + json.dumps(wl) + ';\n'
            txt += 'GlossId = ' + json.dumps(gloss2id) + ';\n'
            txt += 'IdGloss = ' + json.dumps(id2gloss) + ';\n'
            txt += 'FILE = "' + kw['outfile'] + '.tsv";\n'

            tpath = partial(util.data_path, 'templates')

            tname = 'jcov.{0}.html'.format(
                'remote' if 'remote' in kw['export'] else 'direct')
            content = util.read_text_file(tpath(tname))

            util.write_text_file(
                kw['outfile'] + '.html',
                content.format(
                    CORRS=txt,
                    JCOV=util.read_text_file(tpath('jcov.js')),
                    STYLE=util.read_text_file(tpath('jcov.css')),
                    VENDOR=util.read_text_file(tpath('jcov.vendor.js')),
                    DIGHL=util.read_text_file(tpath('jcov.dighl.js'))))
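
# Usage sketch (added): in lingpy this method belongs to the Workflow class,
# so a full run would look roughly like this (assuming an input wordlist
# 'ksl.qlc'):
#
#     from lingpy.basic.workflow import Workflow
#     wf = Workflow('ksl.qlc')
#     wf.cognate_detection(cognate_method='lexstat', export='tsv,html')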
Example no. 54
0
def psa2html(infile, **kw):
    """
    Function converts a PSA-file into colored html-format.
    """
    util.setdefaults(
        kw,
        template=False,
        css=False,
        comment='#',
        filename=infile[:-4]+'.html',
        compact=True)

    template = util.read_text_file(kw['template'] or template_path('psa.html'))
    css = util.read_text_file(kw['css'] or template_path('psa.css'))

    data = []
    for line in util.read_text_file(infile, lines=True):
        if not line.startswith(kw['comment']):
            data.append(line)

    seq_ids = []
    pairs = []
    taxa = []
    alignments = []

    del data[0]

    i = 0
    while i <= len(data) - 3:
        try:
            seq_ids.append(data[i])

            datA = data[i + 1].split('\t')
            datB = data[i + 2].split('\t')

            taxonA = datA[0].strip('.')
            taxonB = datB[0].strip('.')
            almA = datA[1:]
            almB = datB[1:]

            taxa.append((taxonA, taxonB))
            pairs.append(
                (
                    '.'.join([k for k in almA if k != '-']),
                    '.'.join([k for k in almB if k != '-'])
                )
            )
            alignments.append(
                (
                    [str(a) for a in almA],
                    [str(b) for b in almB],
                    0)
            )
            assert len(alignments[-1][0]) == len(alignments[-1][1])
            i += 4
        except AssertionError:
            log.warning("Line {0} of the data is probably miscoded.".format(i + 1))
            i += 1

    def get_classes(alm):
        classes = []
        residue = '<div class="residue {1}">{0}</div>'
        for j, char in enumerate(alm):
            if char == '-':
                d = 'dolgo_GAP'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])

                # bad check for three classes named differently
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'
            classes += [residue.format(char, d)]
        return ''.join(classes)

    out = '<table>\n'
    for i, (a, b, c) in enumerate(alignments):
        clsA = get_classes(a)
        clsB = get_classes(b)

        ids = int(100 * pid(a, b) + 0.5)

        out += '<tr class="head">'
        out += '<td colspan=2 class="head"><b>Alignment {0}:</b> <i>{1}</i>, PID: {2}</td></tr>'.format(
            i + 1,
            seq_ids[i],
            ids
        )
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][0])
        out += '<td class="psa">{0}</td>'.format(clsA)
        out += '</tr>'
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][1])
        out += '<td class="psa">{0}</td>'.format(clsB)
        out += '</tr>'
        out += '<tr><td colspan=2></td></tr>'

    out += '</table>'

    html = template.format(alignments=out, css=css)

    if kw['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    util.write_text_file(kw['filename'], html)
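
# Usage sketch (added): color a hypothetical pairwise alignment file; the
# output name defaults to the infile name with '.html' replacing the
# original extension.
if __name__ == '__main__':
    psa2html('harry.psa')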