Code Example #1
File: test_util.py Project: anukat2015/lingpy
    def test_write_text_file(self):
        def lines_generator(n):
            for i in range(n):
                yield 'line%s' % i

        path = self.tmp_path('test')
        util.write_text_file(path, 'test')
        self.assertEqual(util.read_text_file(path), 'test')

        util.write_text_file(path, ['line1', 'line2'])
        self.assertEqual(len(util.read_text_file(path, lines=True)), 2)

        util.write_text_file(path, lines_generator(5))
        self.assertEqual(len(util.read_text_file(path, lines=True)), 5)
Code Example #2
def test_write_text_file(tmppath):
    def lines_generator(n):
        for i in range(n):
            yield 'line%s' % i

    path = tmppath / 'test'
    util.write_text_file(path, 'test')
    assert util.read_text_file(path) == 'test'

    util.write_text_file(path, ['line1', 'line2'])
    assert len(util.read_text_file(path, lines=True)) == 2

    util.write_text_file(path, lines_generator(5))
    assert len(util.read_text_file(path, lines=True)) == 5
Code Example #3
File: ops.py Project: LinguList/lingpy
def triple2tsv(triples_or_fname, output="table"):
    """
    Function reads a triple file and converts it to a tabular data structure.
    """
    D = defaultdict(dict)
    idxs = set()
    cols = set()

    if not isinstance(triples_or_fname, list):
        triples_or_fname = util.read_text_file(
            triples_or_fname, normalize='NFD', lines=True)

    for line in triples_or_fname:
        if isinstance(line, (text_type, str)):
            line = line.split('\t')
        a, b, c = line
        D[a][b.upper()] = c
        idxs.add(a)
        cols.add(b.upper())

    idxs = sorted(idxs)
    cols = sorted(cols)
    table = [[idx] + [
        D.get(idx, {}).get(col, '') for col in cols] for idx in idxs]

    if output not in ['wordlist', 'dict']:
        return [["ID"] + cols] + table

    wlD = {int(line[0]): line[1:] for line in table}
    wlD[0] = cols
    return wlD
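
A quick round trip shows the data structure (a minimal sketch: the triples below are invented, and only the triple2tsv function above is assumed):

# each triple is (row id, column name, value); column names are upper-cased
triples = [
    ['1', 'doculect', 'German'],
    ['1', 'concept', 'hand'],
    ['2', 'doculect', 'English'],
    ['2', 'concept', 'hand'],
]
table = triple2tsv(triples)
# table == [['ID', 'CONCEPT', 'DOCULECT'],
#           ['1', 'hand', 'German'],
#           ['2', 'hand', 'English']]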
Code Example #4
def triple2tsv(triples_or_fname, output="table"):
    """
    Function reads a triple file and converts it to a tabular data structure.
    """
    D = defaultdict(dict)
    idxs = set()
    cols = set()

    if not isinstance(triples_or_fname, list):
        triples_or_fname = util.read_text_file(triples_or_fname,
                                               normalize='NFD',
                                               lines=True)

    for line in triples_or_fname:
        if isinstance(line, str):
            line = line.split('\t')
        a, b, c = line
        D[a][b.upper()] = c
        idxs.add(a)
        cols.add(b.upper())

    idxs = sorted(idxs)
    cols = sorted(cols)
    table = [[idx] + [D.get(idx, {}).get(col, '') for col in cols]
             for idx in idxs]

    if output not in ['wordlist', 'dict']:
        return [["ID"] + cols] + table

    wlD = {int(line[0]): line[1:] for line in table}
    wlD[0] = cols
    return wlD
Code Example #5
File: phylip.py Project: vermillionbee/lingpy
def read_dst(filename, taxlen=10, comment='#'):
    """
    Function reads files in Phylip dst-format.

    Parameters
    ----------
    filename : string
        Name of the file which should have the extension ``dst``.
    taxlen : int (default=10)
        Maximal length of the taxon names in the file you want to read. The
        Phylip package only allows taxon names of at most 10 characters (the
        default). Other packages, however, allow more. If Phylip
        compatibility is not important for you and you want taxon names of
        arbitrary length, set this value to 0 and make sure to use tab stops
        as separators between the values in your matrix file.
    comment : str (default = '#')
        The comment character to be used if your file contains additional
        information which should be ignored.

    Returns
    -------
    data : tuple
        A tuple consisting of a list of taxa and a matrix.

    """
    if '\n' in filename:
        lines = [f for f in filename.split('\n') if f.strip()]
    else:
        lines = read_text_file(filename, normalize="NFC", lines=True)

    taxa, matrix = [], []

    for line in lines[1:]:
        if not line.startswith(comment):
            if taxlen > 0:
                taxa.append(line[:taxlen].strip())
                matrix.append([
                    float(val)
                    for val in re.split(r'\s+', line[taxlen + 1:].strip())
                ])
            else:
                splits = line.split('\t')
                taxa.append(splits[0])
                matrix.append([float(val.strip()) for val in splits[1:]])

    return taxa, matrix
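
Since read_dst treats any argument that contains a newline as file content rather than a file name, it can be tried out directly on a string (a hedged sketch with made-up distances; with the default taxlen=10 the taxon names occupy the first ten characters of each row):

dst = """ 2
German     0.0 0.5
English    0.5 0.0"""
taxa, matrix = read_dst(dst)
# taxa == ['German', 'English']
# matrix == [[0.0, 0.5], [0.5, 0.0]]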
Code Example #6
File: phylip.py Project: LinguList/lingpy
def read_dst(filename, taxlen=10, comment='#'):
    """
    Function reads files in Phylip dst-format.

    Parameters
    ----------
    filename : string
        Name of the file which should have the extension ``dst``.
    taxlen : int (default=10)
        Maximal length of the taxon names in the file you want to read. The
        Phylip package only allows taxon names of at most 10 characters (the
        default). Other packages, however, allow more. If Phylip
        compatibility is not important for you and you want taxon names of
        arbitrary length, set this value to 0 and make sure to use tab stops
        as separators between the values in your matrix file.
    comment : str (default = '#')
        The comment character to be used if your file contains additional
        information which should be ignored.

    Returns
    -------
    data : tuple
        A tuple consisting of a list of taxa and a matrix.

    """
    if '\n' in filename:
        lines = [f for f in filename.split('\n') if f.strip()]
    else:
        lines = read_text_file(filename, normalize="NFC", lines=True)

    taxa, matrix = [], []

    for line in lines[1:]:
        if not line.startswith(comment):
            if taxlen > 0:
                taxa.append(line[:taxlen].strip())
                matrix.append([float(val) for val in
                               re.split(r'\s+', line[taxlen + 1:].strip())])
            else:
                splits = line.split('\t')
                taxa.append(splits[0])
                matrix.append([float(val.strip()) for val in splits[1:]])

    return taxa, matrix
Code Example #7
File: qlc.py Project: kadster/lingpy
def read_msa(infile,
             comment="#",
             ids=False,
             header=True,
             normalize=True,
             **keywords):
    """
    Simple function to load an MSA object.

    Parameters
    ----------
    infile : str
        The name of the input file.
    comment : str (default="#")
        The comment character. If a line starts with this character, it will be
        ignored.
    ids : bool (default=False)
        Indicate whether the MSA file contains unique IDs for all sequences or
        not.

    Returns
    -------
    d : dict
        A dictionary in which keys correspond to specific parts of a multiple
        alignment. This dictionary can be directly passed to alignment
        functions, such as :py:class:`lingpy.sca.MSA`.
    """
    if 'input_file' not in keywords:
        keywords['input_file'] = infile

    f = read_text_file(infile, normalize='NFC', lines=True)
    msa_lines = []
    for line in f:
        if line.strip() and not line.startswith(comment):
            newlines = [t.strip().rstrip('.') for t in line.split('\t')]
            if len(newlines) == 1:
                msa_lines += newlines
            else:
                msa_lines += [newlines]

    return _list2msa(msa_lines,
                     header=header,
                     ids=ids,
                     normalize=normalize,
                     **keywords)
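
The returned dictionary is what the conversion routines further down in this listing consume; a hedged call sketch (the file name is hypothetical, and the keys in the comment are the ones accessed by msa2html and msa2tex below):

msa = read_msa('harry.msa')
# typical keys: 'dataset', 'seq_id', 'taxa', 'seqs', 'alignment',
# plus optional 'swaps' and 'local'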
Code Example #8
File: model.py Project: anukat2015/lingpy
    def __init__(self, model, path=None):
        new_path = lambda *cmps: \
            os.path.join(path or util.data_path('models'), model, *cmps)
        self.name = model

        # try to load the converter
        try:
            self.converter = cache.load(model + '.converter')
        except:
            compile_model(model, path)
            self.converter = cache.load(model + '.converter')

        # give always preference to scorer matrix files
        if os.path.isfile(new_path('matrix')):
            self.scorer = read_scorer(new_path('matrix'))
        elif os.path.isfile(new_path('scorer.bin')):
            try:
                self.scorer = cache.load(model + '.scorer')
            except compat.FileNotFoundError:
                pass
        # if none of the above fits, leave it
        else:
            pass

        # read information from the info-file
        self.info = {}

        info = util.read_text_file(new_path('INFO'))
        data = ['description', 'compiler', 'source', 'date', 'vowels', 'tones']

        for line in data:
            try:
                self.info[line] = re.findall('@' + line + ': (.*)', info)[0]
            except:
                self.info[line] = 'unknown'

        # check for vowels and tones
        if "vowels" in self.info:
            self.vowels = self.info['vowels']
        if "tones" in self.info:
            self.tones = self.info['tones']
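
A hedged usage sketch (assuming a sound-class model such as 'sca' or 'dolgo' ships with the package, as the util.data_path('models') default suggests):

model = Model('sca')
print(model.name)                # 'sca'
print(model.info['compiler'])    # 'unknown' if the INFO file has no @compiler line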
Code Example #9
File: qlc.py Project: xrotwang/lingpy
def read_msa(infile, comment="#", ids=False, header=True, normalize=True, **keywords):
    """
    Simple function to load an MSA object.

    Parameters
    ----------
    infile : str
        The name of the input file.
    comment : str (default="#")
        The comment character. If a line starts with this character, it will be
        ignored.
    ids : bool (default=False)
        Indicate whether the MSA file contains unique IDs for all sequences or
        not.

    Returns
    -------
    d : dict
        A dictionary in which keys correspond to specific parts of a multiple
        alignment. This dictionary can be directly passed to alignment
        functions, such as :py:class:`lingpy.sca.MSA`.
    """
    if 'input_file' not in keywords:
        keywords['input_file'] = infile

    f = read_text_file(infile, normalize='NFC', lines=True)
    msa_lines = []
    for line in f:
        if line.strip() and not line.startswith(comment):
            newlines = [t.strip().rstrip('.') for t in line.split('\t')]
            if len(newlines) == 1:
                msa_lines += newlines
            else:
                msa_lines += [newlines]

    return _list2msa(msa_lines, header=header, ids=ids, normalize=normalize, **keywords)
Code Example #10
File: poepy.py Project: lingpy/poepy
def parser(filename):
    text = read_text_file(filename, normalize='NFD', lines=True)
    comment = '#'
    data = {
        0: [
            'poem', 'poem_number', 'stanza', 'line_in_source', 'line',
            'line_order', 'rhymeids', 'alignment', 'refrain', 'chords'
        ]
    }
    meta, M = {}, {}
    number, stanza, idx, order = 0, 0, 1, 1
    atzone = False
    for line in text:
        if line.startswith('@'):
            if not atzone:
                meta = {}
            atzone = True
            meta[line[1:line.index(':')]] = line[line.index(':') + 1:].strip()
            stanza = 0
        elif not line.strip():
            stanza += 1
            order = 1
            if atzone:
                number += 1
                atzone = False
                M[meta.get('title', 'poem-{0}'.format(number))] = {
                    k: v
                    for k, v in meta.items()
                }
                rhymes = {0: 0}
        elif line.startswith('[') and line.endswith(']'):
            pass
        else:
            if comment in line:
                # strip trailing comments, keeping the text before the marker
                line = line[:line.index(comment)]

            refrain = ''
            if line.startswith('  '):
                refrain = 'R'
            if [x for x in line if is_chinese(x)]:
                nline, bracket = [], 0
                for char in line:
                    if is_chinese(char):
                        if bracket:
                            bracket -= 1
                            nline[-1] += char
                        else:
                            nline += [char]
                    else:
                        if char == '[':
                            bracket += 1
                            nline += ['']
                        nline[-1] += char
            else:
                nline = line.strip().split()
            rhymeids, alignment, nline, chords = parse_line(nline, rhymes)
            data[idx] = [
                meta.get('title', 'poem-{0}'.format(number)),
                str(number),
                '{0}.{1}'.format(number, stanza),
                line,
                ' + '.join(nline),
                order, rhymeids, ' + '.join(alignment), refrain, chords
            ]
            idx += 1
            order += 1
    poe = Poems(data)
    poe._meta['poems'] = M
    return poe
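
The expected input format can be reconstructed from the branches above (a hypothetical snippet; real poepy sample files may differ):

@title: My Poem              <- '@key: value' lines collect poem metadata
                             <- a blank line closes the metadata zone or a stanza
[spoken]                     <- lines in square brackets are skipped
word1 word2 word3            <- anything else is parsed as a verse line

poems = parser('poem.txt')   # returns a Poems object; metadata ends up in _meta['poems']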
Code Example #11
    def test_TextFile(self):
        path = self.tmp_path('test')
        with util.TextFile(path) as fp:
            fp.writelines(['line1\n', 'line2\n'])
        self.assertEqual(len(util.read_text_file(path, lines=True)), 2)
Code Example #12
File: csv.py Project: LinguList/lingpy
def csv2list(
    filename,
    fileformat='',
    dtype=None,
    comment='#',
    sep='\t',
    strip_lines=True,
    header=False
):
    """
    Very simple function to get quick (and somewhat naive) access to CSV-files.

    Parameters
    ----------
    filename : str
        Name of the input file.
    fileformat : str (default="")
        If not specified, the file <filename> will be loaded as is. Otherwise,
        the fileformat is interpreted as the extension of the input file.
    dtype : list (default=None)
        If not specified, all data will be loaded as strings. Otherwise,
        provide a list of callables (one per column) that convert each cell.
    comment : string (default="#")
        A comment character at the beginning of a line forces this line to be
        ignored (set to None if you want to parse all lines of your file).
    sep : string (default="\t")
        Specify the separator for the CSV-file.
    strip_lines : bool (default=True)
        Specify whether each line should be stripped of surrounding
        whitespace before it is split. If set to ``False``, lines are split
        as is, which preserves empty "cells" at the edges of a line; the
        individual cells are stripped in both cases.
    header : bool (default=False)
        Indicate whether the data comes along with a header.

    Returns
    -------
    l : list
        A list-representation of the CSV file.

    """
    # check for correct fileformat
    if fileformat:
        infile = filename + '.' + fileformat
    else:
        infile = filename

    if dtype is None:
        dtype = []

    l = []

    # open the file
    infile = read_text_file(infile, lines=True, normalize="NFC")

    # check for header
    idx = 0 if header else -1

    for i, line in enumerate(infile):
        if line and (not comment or not line.startswith(comment)) and idx != i:
            if strip_lines:
                cells = [c.strip() for c in line.strip().split(sep)]
            else:
                cells = [c.strip() for c in line.split(sep)]
            if not dtype:
                l += [cells]
            else:
                l += [[f(c) for f, c in zip(dtype, cells)]]

    return l
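
A minimal round trip (hedged: the file content is invented, and only csv2list as defined above is assumed):

with open('test.csv', 'w', encoding='utf8') as f:
    f.write('# a comment line\n')
    f.write('1\tGerman\thand\n')
    f.write('2\tEnglish\thand\n')

rows = csv2list('test.csv')
# rows == [['1', 'German', 'hand'], ['2', 'English', 'hand']]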
Code Example #13
File: wordlist.py Project: LinguList/lingpy
    def _export(
            self,
            fileformat,
            sections=None,
            entries=None,
            entry_sep='',
            item_sep='',
            template='',
            exclude=None,
            entry_start='',
            entry_close='',
            **keywords):
        """
        Export a wordlist to various file formats.
        """
        if not sections:
            if fileformat == 'txt':
                sections = dict(
                    h1=('concept', '\n# Concept: {0}\n'),
                    h2=('cogid', '## Cognate-ID: {0}\n'))
            elif fileformat == 'tex':
                sections = dict(
                    h1=('concept', r'\section{{Concept: ``{0}"}}' + '\n'),
                    h2=('cogid', r'\subsection{{Cognate Set: ``{0}"}}' + '\n'))
            elif fileformat == 'html':
                sections = dict(
                    h1=('concept', '<h1>Concept: {0}</h1>'),
                    h2=('cogid', '<h2>Cognate Set: {0}</h2>'))

        if not entries:
            if fileformat == 'txt':
                entries = [('language', '{0} '), ('ipa', '{0}\n')]
            elif fileformat == 'tex':
                entries = [('language', '{0} '), ('ipa', '[{0}]' + '\n')]
            elif fileformat == 'html':
                entries = [('language', '{0}&nbsp;'), ('ipa', '[{0}]\n')]

        util.setdefaults(keywords, filename=rcParams['filename'])

        # get the temporary dictionary
        out = wl2dict(self, sections, entries, exclude)

        # assign the output string
        out_string = ''

        # iterate over the dictionary and start to fill the string
        for key in sorted(out, key=lambda x: str(x).lower()):
            # write key to file
            out_string += key[1]

            # reassign tmp
            tmp = out[key]

            # set the pointer and the index
            pointer = {0: [tmp, sorted(tmp.keys())]}

            while True:
                idx = max(pointer.keys())

                # check for type of current point
                if isinstance(tmp, dict):
                    if pointer[idx][1]:
                        next_key = pointer[idx][1].pop()
                        out_string += next_key[1]
                        tmp = pointer[idx][0][next_key]
                        if isinstance(tmp, dict):
                            pointer[idx + 1] = [tmp, sorted(tmp.keys())]
                        else:
                            pointer[idx + 1] = [tmp, tmp]
                    else:
                        del pointer[idx]
                        if idx == 0:
                            break
                else:
                    tmp_strings = []
                    for line in sorted(tmp):
                        tmp_strings += [item_sep.join(line)]
                    out_string += entry_start + entry_sep.join(tmp_strings) + entry_close
                    tmp = pointer[idx - 1][0]
                    del pointer[idx]

        if fileformat == 'tex':
            out_string = out_string.replace('_', r'\_')
        tmpl = util.read_text_file(template) if template else '{0}'
        _write_file(keywords['filename'], tmpl.format(out_string), fileformat)
Code Example #14
File: workflow.py Project: tjade273/lingpy
    def cognate_detection(self, **keywords):
        """
        Method runs a cognate detection analysis.
        """
        kw = dict(
            align_method='progressive',
            align_mode=rcParams['align_mode'],
            align_modes=rcParams['align_modes'],
            cluster_method=rcParams['lexstat_cluster_method'],
            cognate_method='sca',
            cognate_mode='overlap',
            defaults=False,
            factor=rcParams['align_factor'],
            gap_weight=rcParams['gap_weight'],
            gop=rcParams['align_gop'],
            iteration=False,
            lexstat_modes=rcParams['lexstat_modes'],
            limit=rcParams['lexstat_limit'],
            merge_vowels=rcParams['merge_vowels'],
            model=rcParams['sca'],
            export="html",
            preprocessing=False,
            preprocessing_method=rcParams['lexstat_preprocessing_method'],
            preprocessing_threshold=rcParams[
                'lexstat_preprocessing_threshold'],
            rands=rcParams['lexstat_rands'],
            ratio=rcParams['lexstat_ratio'],
            ref="customid",
            restricted_chars=rcParams['restricted_chars'],
            restriction='',
            runs=rcParams['lexstat_runs'],
            scale=rcParams['align_scale'],
            scoring_method=rcParams['lexstat_scoring_method'],
            swap_check=False,
            threshold=rcParams['lexstat_threshold'],
            tree_calc=rcParams['align_tree_calc'],
            vscale=rcParams['lexstat_vscale'],
            outfile=False,
            sonar=True,
        )

        # first load
        kw.update(keywords)
        if kw['defaults']:
            return kw

        # carry out lexstat cluster analysis
        self.lex = LexStat(self.infile, **kw)

        # reset filename if it is not defined
        kw['outfile'] = kw['outfile'] or self.lex.filename + '_lingpy'

        # check for traditional lexstat analysis
        if kw['cognate_method'] == 'lexstat':
            self.lex.get_scorer(method=kw['scoring_method'],
                                modes=kw['lexstat_modes'],
                                **kw)

        self.lex.cluster(method=kw['cognate_method'],
                         mode=kw['cognate_mode'],
                         **kw)

        # align the data
        self.alms = Alignments(self.lex, **kw)
        kw['scoredict'] = self.lex.cscorer \
            if kw['cognate_method'] == 'lexstat' else self.lex.bscorer

        self.alms.align(method=kw['align_method'],
                        mode=kw['align_mode'],
                        modes=kw['align_modes'],
                        **kw)

        if 'tsv' in kw['export']:
            self.alms.output('tsv',
                             filename=kw['outfile'],
                             ignore=['scorer', 'json', 'taxa', 'msa'],
                             **kw)
        if 'html' in kw['export']:
            corrs, occs = get_correspondences(self.alms, kw['ref'])

            # serialize the wordlist
            wl = {}
            for concept in self.alms.concepts:
                entries = self.alms.get_list(concept=concept, flat=True)
                cogids = [self.alms[idx, kw['ref']] for idx in entries]
                words = [self.alms[idx, 'ipa'] for idx in entries]
                alms = [self.alms[idx, 'alignment'] for idx in entries]
                langs = [self.alms[idx, 'doculect'] for idx in entries]

                checkalm = lambda x: x if type(x) == str else ' '.join(x)

                wl[concept] = [
                    list(k) for k in sorted(
                        zip(
                            langs,
                            [str(x) for x in entries],
                            words,
                            [str(x) for x in cogids],
                            [checkalm(x) for x in alms]),
                        key=lambda x: int(x[3]))]

            # make simple gloss id for internal use as id
            gloss2id = list(
                zip(self.alms.concepts,
                    [str(x) for x in range(1, len(self.alms.concepts) + 1)]))
            id2gloss = dict([[b, a] for a, b in gloss2id])
            gloss2id = dict(gloss2id)

            txt = ''
            txt += 'CORRS = ' + json.dumps(corrs) + ';\n'
            txt += 'LANGS = ' + json.dumps(self.alms.taxa) + ';\n'
            txt += 'OCCS = ' + json.dumps(occs) + ';\n'
            txt += 'WLS = ' + json.dumps(wl) + ';\n'
            txt += 'GlossId = ' + json.dumps(gloss2id) + ';\n'
            txt += 'IdGloss = ' + json.dumps(id2gloss) + ';\n'
            txt += 'FILE = "' + kw['outfile'] + '.tsv";\n'

            tpath = partial(util.data_path, 'templates')

            tname = 'jcov.{0}.html'.format(
                'remote' if 'remote' in kw['export'] else 'direct')
            content = util.read_text_file(tpath(tname))

            util.write_text_file(
                kw['outfile'] + '.html',
                content.format(
                    CORRS=txt,
                    JCOV=util.read_text_file(tpath('jcov.js')),
                    STYLE=util.read_text_file(tpath('jcov.css')),
                    VENDOR=util.read_text_file(tpath('jcov.vendor.js')),
                    DIGHL=util.read_text_file(tpath('jcov.dighl.js'))))
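
A hedged driver sketch for the method above (the Workflow import path and the input file are assumptions based on this fork):

from lingpy.basic.workflow import Workflow

wf = Workflow('wordlist.tsv')            # hypothetical input wordlist
wf.cognate_detection(cognate_method='lexstat',
                     export='tsv,html',  # triggers both the tsv and the html branch
                     outfile='results')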
Code Example #15
File: html.py Project: vermillionbee/lingpy
def msa2tex(infile, template='', filename='', **keywords):
    """
    Convert an MSA to a tabular representation which can easily be used in
    LaTeX documents.
    """
    util.setdefaults(keywords, pid_mode=1)

    # while alm-format can be read from the text-file without problems,
    # msa-format should be loaded first (once this is already provided), the
    # loss in speed won't matter much, since output of data is not a daily task
    # load msa
    msa = read_msa(infile)

    # load templates
    tex = util.read_text_file(template or template_path('msa.tex'))

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    dataset = msa['dataset']
    infile = msa['infile']
    seq_id = msa['seq_id']

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    height = len(msa['alignment'])
    width = len(msa['alignment'][0])

    start = r'\tabular{l' + width * 'c' + '}\n'
    start += r'\bf\ttfamily Taxon & \multicolumn{' + str(
        width) + r'}{l}{\bf\ttfamily Alignment}\\' + '\n'

    # check for swaps in the alignment
    if 'swaps' in msa:
        swaps = []
        for s in msa['swaps']:
            swaps.extend(s)
    else:
        swaps = []

    body = start
    for i, taxon in enumerate(msa['taxa']):
        body += r'\ttfamily ' + taxon.replace('_', r'\_')
        for j, char in enumerate(msa['alignment'][i]):
            if char == '-':
                cls = 'X'
            else:
                cls = token2class(char, rcParams['dolgo'])
            if char == '_':
                char = r'\#'
            if cls == '_':
                cls = '2'
            if j not in swaps:
                body += r'&\cellcolor{col' + cls + r'}' + char
            else:
                if char != '-':
                    body += r'&\cellcolor{col' + cls + r'}\color{white}\bf ' + char
                else:
                    body += r'&\cellcolor{col' + cls + r'}\bf ' + char
        body += r'\\' + '\n'

    body += r'&' + '&'.join([r'\color{white}XXX'
                             for i in range(width)]) + r'\\' + '\n'
    body += r'\endtabular' + '\n'

    # create the parameters etc.
    w = 1.5 * width + taxl * 0.25
    h = 0.5 * height + 1.0

    tex = tex.replace('<+WIDTH+>', '{0:2f}'.format(w))
    tex = tex.replace('<+HEIGHT+>', '{0:2f}'.format(h))

    # create the rput stuff
    tex = tex.replace('<+NEWX+>', '{0:.2f}'.format(w / 2.0))
    tex = tex.replace('<+NEWY+>', '{0:.2f}'.format((h - 0.5) / 2.0))

    # insert the rest
    tex = tex.replace('<+CONTENT+>', body)

    # write to file
    if not filename:
        filename = 'lingpy-{0}'

    util.write_text_file(filename + '.tex', tex)
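
Usage is a single call (hedged: the file names are placeholders):

msa2tex('harry.msa', filename='harry')   # writes harry.tex from the msa.tex template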
Code Example #16
File: derive.py Project: xrotwang/lingpy
def _read(filename, normalize=None):
    res = {}
    for line in util.read_text_file(filename, normalize=normalize, lines=True):
        k, v = line.split(' : ')
        res[k] = v.split(', ')
    return res
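
The expected line format is simply 'key : value1, value2, ...' (a minimal sketch with invented content):

with open('conversion.txt', 'w', encoding='utf8') as f:
    f.write('a : b, c\n')
    f.write('d : e\n')

res = _read('conversion.txt')
# res == {'a': ['b', 'c'], 'd': ['e']}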
Code Example #17
File: strings.py Project: kadster/lingpy
def write_nexus(wordlist,
                mode='mrbayes',
                filename="mrbayes.nex",
                ref="cogid",
                missing="?",
                gap="-",
                custom=None,
                custom_name='lingpy',
                commands=None,
                commands_name="mrbayes"):
    """Write a nexus file for phylogenetic analyses.

    Parameters
    ----------
    wordlist : lingpy.basic.wordlist.Wordlist
        A Wordlist object containing cognate IDs.
    mode : str (default="mrbayes")
        The name of the output nexus style. Valid values are:
            * 'MRBAYES': a MrBayes formatted nexus file.
            * 'SPLITSTREE': a SPLITSTREE formatted nexus file.
            * 'BEAST': a BEAST formatted nexus file.
            * 'BEASTWORDS': a BEAST formatted nexus for word-partitioned
               analyses.
            * 'TRAITLAB': a TRAITLab formatted nexus.
    filename : str (default="mrbayes.nex")
        Name of the file to which the nexus file will be written.
        If set to ``None``, this function will not write the nexus content
        to a file, but simply return the content as a string.
    ref: str (default="cogid")
        Column in which you store the cognate sets in your data.
    gap : str (default="-")
        The symbol for gaps (not relevant for linguistic analyses).
    missing : str (default="?")
        The symbol for missing characters.
    custom : list (default=None)
        This information allows to add custom information to the nexus file, like, for
        example, the structure of the characters, their original concept, or their
        type, and it will be written into a custom block in the nexus file. The name of
        the custom block can be specified with help of the `custom_name` keyword. The
        content is a list of strings which will be written line by line into the custom
        block.
    custom_name : str (default="lingpy")
        The name of the custom block which will be written to the file.
    commands : list (default=None)
        If specified, will write an additional block containing commands for
        phylogenetic software. The commands are passed as a list, containing
        strings. The name of the block is given by the keyword `commands_name`.
    commands_name : str (default="mrbayes")
        Determines how the block will be called to which the commands will be
        written.

    Returns
    -------
    nexus : str
        A string containing nexus file output
    """
    templates = {
        'BEAST': 'beast.nex',
        'BEASTWORDS': 'beast.nex',
        'SPLITSTREE': 'splitstree.nex',
        'MRBAYES': 'mrbayes.nex',
        'TRAITLAB': 'splitstree.nex',
    }

    block = "\n\nBEGIN {0};\n{1}\nEND;\n"  # template for nexus blocks

    # check for valid mode
    mode = mode.upper()
    if mode not in templates.keys():
        raise ValueError("Unknown output mode %s" % mode)

    # check for valid template
    template = templates.get(mode)
    tpath = util.Path(template_path(template))
    if tpath.exists():
        _template = util.read_text_file(tpath.as_posix())
    else:  # pragma: no cover
        raise IOError("Unknown template %s" % template)

    # check that `ref` is a valid column
    if ref not in wordlist._alias:
        raise KeyError("Unknown _ref_ column in wordlist '%s'" % ref)

    # retrieve the matrix
    matrix = [[] for x in range(wordlist.width)]
    etd = wordlist.get_etymdict(ref=ref)
    concepts = sorted(
        [(cogid, wordlist[[x[0] for x in vals if x][0]][wordlist._rowIdx])
         for (cogid, vals) in etd.items()],
        key=lambda x: (x[1], x[0]))
    # and missing data..
    missing_ = {
        t: [
            concept for (cogid, concept) in concepts if concept not in
            wordlist.get_list(col=t, entry=wordlist._row_name, flat=True)
        ]
        for t in wordlist.cols
    }

    # add ascertainment character for mode=BEAST
    if mode == 'BEAST':
        matrix = [['0'] for m in matrix]

    # skip the constant sites for traitlab
    if mode == 'TRAITLAB':
        concepts = [(i, c) for (i, c) in concepts if not _is_constant(etd[i])]

    # fill matrix
    for i, t in enumerate(wordlist.cols):
        previous = ''
        for cogid, concept in concepts:
            if previous != concept:
                previous = concept
                # add ascertainment character for mode=BEASTWORDS. Note that if
                # a given word:language is missing, then its ascertainment
                # character is the `missing` character.
                if mode == "BEASTWORDS":
                    matrix[i] += ['0'] if concept not in missing_[t] else [
                        missing
                    ]
            matrix[i] += ['1'] if etd[cogid][i] else ['0'] if concept not in \
                missing_[t] else [missing]

    # parse characters into `charsets` (a dict of word=>siteindex positions),
    # and `chars` (a list of characters).
    charsets, chars, previous = defaultdict(list), [], ''
    for i, (cogid, concept) in enumerate(concepts, 1):
        char = util.nexus_slug(concept)
        # add label for ascertainment character in BEAST mode
        if i == 1 and mode == 'BEAST':
            chars.append("_ascertainment")
        # add label for per-word ascertainment characters in BEASTWORDS
        if mode == 'BEASTWORDS' and previous != concept:
            chars.append("%s_ascertainment" % char)
            charsets[char].append(len(chars))
        # finally add label.
        chars.append(char)
        charsets[char].append(len(chars))
        previous = concept

    # create character labels block if needed
    if mode in ('BEAST', 'BEASTWORDS'):
        charblock = ",\n".join(["\t%d %s" % o for o in enumerate(chars, 1)])
    else:
        charblock = ""

    # create charsets block
    blockname, assumptions = None, ""
    if mode in ('BEASTWORDS', 'MRBAYES'):
        charsets = [
            "\tcharset %s = %d-%d;" % (c, min(m), max(m))
            for (c, m) in charsets.items()
        ]
        blockname = 'ASSUMPTIONS' if mode == 'BEASTWORDS' else 'MRBAYES'
        assumptions = "\n".join(charsets)

    # commands
    if commands_name.upper() == blockname and len(assumptions) and commands:
        # merge commands specified in function call into output blockname
        assumptions += "\n" + "\n".join("\t%s" % c for c in commands)
    else:
        # different commands block set in commands_name.
        assumptions += block.format(commands_name,
                                    '\n'.join(commands)) if commands else ''

    # convert state matrix to string.
    _matrix = ""
    maxtaxlen = max([len(util.nexus_slug(t)) for t in wordlist.cols]) + 1
    for i, (taxon, m) in enumerate(zip(wordlist.cols, matrix)):
        _matrix += str(util.nexus_slug(taxon) +
                       maxtaxlen * ' ')[:maxtaxlen] + ' '
        _matrix += ''.join(
            ['({0})'.format(c) if len(c) > 1 else str(c) for c in m]) + '\n'
    _matrix = _matrix.rstrip()  # remove trailing

    # TODO: symbols could be more than "01", but this function doesn't
    # handle multistate data, so we just specify them here.
    symbols = '01'

    text = _template.format(
        matrix=_matrix,
        ntax=wordlist.width,
        nchar=len(matrix[0]),
        gap=gap,
        missing=missing,
        dtype='RESTRICTION' if mode == 'MRBAYES' else 'STANDARD',
        commands=block.format(blockname, assumptions),
        custom=block.format(custom_name, '\n'.join(custom)) if custom else '',
        symbols=symbols,
        chars=charblock)
    text = text.replace("\t", " " * 4)  # normalise tab-stops
    for i, (cogid, concept) in enumerate(concepts, 1):
        text += '\n[MATRIX:{0}=COGID:{1}=CONCEPT:{2}]'.format(
            i, cogid, concept)
    if filename:
        util.write_text_file(filename, text)
    return text
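
A hedged call sketch (assuming a Wordlist with a cogid column, as the docstring requires; the input file is hypothetical):

from lingpy import Wordlist

wl = Wordlist('cognates.tsv')
nex = write_nexus(wl, mode='SPLITSTREE', filename='splits.nex')
# the full nexus text is also returned, so filename=None skips the write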
Code Example #18
def test_scorer2str(test_data):
    assert scorer2str(rc('dolgo').scorer) == read_text_file(str(test_data / 'dolgo.scorer'))
Code Example #19
def test_TextFile(tmppath):
    path = tmppath / 'test'
    with util.TextFile(path) as fp:
        fp.writelines(['line1\n', 'line2\n'])
    assert len(util.read_text_file(path, lines=True)) == 2
Code Example #20
File: qlc.py Project: xrotwang/lingpy
def read_qlc(infile, comment='#'):
    """
    Simple function that loads qlc-format into a dictionary.

    Parameters
    ----------
    infile : str
        The name of the input file.
    comment : str (default="#")
        The comment character. If a line starts with this character, it will be
        ignored.

    Returns
    -------
    d : dict
        A dictionary with integer keys corresponding to the order of the lines
        of the input file. The header is given 0 as a specific key.
    """
    lines = read_text_file(infile, lines=True, normalize="NFC")
    data, meta, dtype = [], {}, False

    while lines:
        line = lines.pop(0)
        if line.startswith(comment) or not line:
            continue

        if line.startswith('@'):
            key, value = [s.strip() for s in line[1:].split(':', 1)]
            if key == 'tree':
                meta["tree"] = cg.LoadTree(treestring=value)
            elif key == 'json':
                for j1, j2 in json.loads(value).items():
                    meta[j1] = j2
            else:
                if key not in meta:
                    meta[key] = value
                else:
                    if isinstance(meta[key], list):
                        meta[key].append(value)
                    else:
                        warn(
                            "Key '{0}' in input file is not unique! Use JSON-format for "
                            "these datatypes!".format(key))
                        meta[key] = [meta[key]] + [value]
        # line starts with complex stuff
        elif line.startswith('<'):
            tmp = line[1:line.index('>')]
            # check for specific keywords
            if ' ' in tmp:
                dtype = tmp.split(' ')[0]
                keys = {k: v[1:-1]
                        for k, v in [key.split('=') for key in tmp.split(' ')[1:]]}
            else:
                dtype = tmp.strip()
                keys = {}

            tmp = []

            while True:
                line = lines.pop(0)
                if line.startswith('</' + dtype + '>'):
                    break
                tmp += [line]

            tmp = '\n'.join(tmp)

            # check for data stuff
            if dtype == "json":
                tmp = json.loads(tmp)
                if not keys:
                    for key in tmp:
                        meta[key] = tmp[key]
                elif keys:
                    meta[keys["id"]] = {}
                    for k in tmp:
                        meta[keys["id"]][k] = tmp[k]
            elif dtype in ['tre', 'nwk']:
                if "trees" not in meta:
                    meta["trees"] = {}

                if not keys:
                    keys["id"] = "1"

                # XXX consider switching to Tree here XXX
                meta['trees'][keys["id"]] = cg.LoadTree(treestring=tmp)
            elif dtype in ['csv']:
                meta[keys["id"]] = {}
                ncol = int(keys.get('ncol', 2))

                if "dtype" in keys:
                    transf = eval(keys["dtype"])
                else:
                    transf = str

                # split tmp into lines
                tmp = tmp.split('\n')
                for l in tmp:
                    if ncol == 2:
                        a, b = l.split('\t')
                        b = transf(b)
                    else:
                        l = l.split('\t')
                        a = l[0]
                        b = [transf(b) for b in l[1:]]
                    meta[keys["id"]][a] = b
            elif dtype == 'msa':
                tmp = tmp.split('\n')
                if 'msa' not in meta:
                    meta['msa'] = {}

                ref = keys.get('ref', 'cogid')
                if ref not in meta['msa']:
                    meta['msa'][ref] = {}

                tmp_msa = {}
                try:
                    tmp_msa['dataset'] = meta['dataset']
                except:
                    tmp_msa['dataset'] = infile.replace('.csv', '')

                tmp_msa['seq_id'] = keys['id']

                # add consensus string to msa, if it appears in the keys
                if "consensus" in keys:
                    tmp_msa['consensus'] = keys['consensus']

                msad = []
                for l in tmp:
                    if not l.startswith(comment):
                        msad.append([x.strip().rstrip('.') for x in l.split('\t')])
                tmp_msa = _list2msa(msad, header=False, ids=True, **tmp_msa)

                try:
                    meta['msa'][ref][int(keys['id'])] = tmp_msa
                except ValueError:
                    meta['msa'][ref][keys['id']] = tmp_msa

            elif dtype == 'dst':
                taxa, matrix = read_dst(tmp)
                distances = [[0.0 for _ in matrix] for _ in matrix]
                for i, line in enumerate(matrix):
                    for j, cell in enumerate(line):
                        if i < j:
                            distances[i][j] = cell
                            distances[j][i] = cell
                meta['distances'] = distances
            elif dtype == 'scorer':
                scorer = read_scorer(tmp)
                if 'scorer' not in meta:
                    meta['scorer'] = {}
                if 'id' not in keys:
                    keys['id'] = 'basic'
                meta['scorer'][keys['id']] = scorer

            elif dtype == 'taxa':
                meta['taxa'] = [t.strip() for t in tmp.split('\n')]
        else:
            data += [[l.strip() for l in line.split('\t')]]

    # create the dictionary in which the data will be stored
    d = {}

    # check for first line, if a local ID is given in the header (or simply
    # "ID"), take this line as the ID, otherwise create it
    local_id = data[0][0].lower() in ['id', 'local_id', 'localid']

    # iterate over data and fill the dictionary (a bit inefficient, but enough
    # for the moment)
    try:
        i = 1
        for j, line in enumerate(data[1:]):
            if local_id:
                d[int(line[0])] = line[1:]
            else:
                d[i] = line
                i += 1
    except ValueError as e:
        raise Exception("Error processing line {0}:\n".format(j) +
                        str(data[1:][j]) + '\nOriginal error message: ' + str(e))

    # assign the header to d[0]
    if local_id:
        d[0] = [x.lower() for x in data[0][1:]]
    else:
        d[0] = [x.lower() for x in data[0]]

    for m in meta:
        d[m] = meta[m]

    if 'trees' in d and 'tree' not in d:
        d['tree'] = sorted(d['trees'].items(), key=lambda x: x[0])[0][1]

    return d
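
A minimal sketch of the header handling (the file content is invented; the leading ID column makes local_id true, so row ids are taken from the file):

with open('test.qlc', 'w', encoding='utf8') as f:
    f.write('ID\tDOCULECT\tCONCEPT\tIPA\n')
    f.write('1\tGerman\thand\thant\n')

d = read_qlc('test.qlc')
# d[0] == ['doculect', 'concept', 'ipa']
# d[1] == ['German', 'hand', 'hant']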
Code Example #21
File: test_strings.py Project: anukat2015/lingpy
    def test_scorer2str(self):
        """
        Test conversion of scorers to strings.
        """
        self.assertEqual(scorer2str(lingpy.rc('dolgo').scorer),
                         read_text_file(test_data('dolgo.scorer')))
Code Example #22
File: html.py Project: vermillionbee/lingpy
def msa2html(msa, shorttitle='', filename='', template='', **keywords):
    """
    Convert files in ``msa``-format into colored ``html``-format.

    Parameters
    ----------
    msa : dict
        A dictionary object that contains all the information of an MSA object.

    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``SCA`` will be used.

    filename : str (default="")
        Define the name of the output file. If no name is defined, the name of
        the input file will be taken as a default.

    template : str (default="")
        The path to the template file. If no name is defined, the basic
        template will be used. The basic template currently used can be found
        under ``lingpy/data/templates/msa2html.html``.

    Examples
    --------
    Load the library.

    >>> from lingpy import *
    
    Load an ``msq``-file from the test-sets.

    >>> msa = MSA('harry.msq')

    Align the data progressively and carry out a check for swapped sites.

    >>> msa.prog_align()
    >>> msa.swap_check()
    >>> print(msa)
    w    o    l    -    d    e    m    o    r    t
    w    a    l    -    d    e    m    a    r    -
    v    -    l    a    d    i    m    i    r    -

    Save the data to the file ``harry.msa``.

    >>> msa.output('msa',filename='harry')

    Save the ``msa``-object as ``html``.

    >>> msa.output('html',filename='harry')
    
    Notes
    -----
    The coloring of sound segments with respect to the sound class they belong
    to is based on the definitions given in the ``color``
    :py:class:`~lingpy.data.model.Model`. It can easily be changed and adapted.
    

    See also
    --------
    lingpy.convert.html.alm2html
    """
    util.setdefaults(
        keywords,
        pid_mode=1,
        stress=rcParams['stress'],
        css=False,
        js=False,
        compact=False,
        class_sort=True,
        write_to_file=True,
    )

    # while alm-format can be read from the text-file without problems,
    # msa-format should be loaded first (once this is already provided), the
    # loss in speed won't matter much, since output of data is not a daily task

    # load templates
    template = template or template_path('msa2html.html')
    if template == 'js':
        template = template_path('msa2html.js.html')
    html = util.read_text_file(template)
    css = util.read_text_file(keywords['css'] or template_path('msa.css'))
    js = util.read_text_file(keywords['js'] or template_path('msa.js'))

    # treat the msa-object as a file and try to load the file if this is the
    # case
    if isinstance(msa, string_types):
        msa = read_msa(msa, **keywords)
    else:
        raise ValueError('[!] No filename specified.')

    # load dataset, etc.
    dataset = msa['dataset']

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    infile = msa['infile']
    seq_id = msa['seq_id']

    # define the titles etc.
    if not shorttitle:
        shorttitle = 'SCA'

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    # format css file
    css = css.replace('TAXON_LENGTH', str(taxl * 10))

    out = ''
    tr = '<tr class="msa" unique="{1}" taxon={2} sequence={3}>{0}</tr>\n'
    td_taxon = '<td class="taxon">{0}</td>'
    perc = int(80 / len(msa['alignment'][0]) + 0.5)
    td_residue = '<td class="residue {1}">{0}</td>'
    td_swap = '<td class="residue swap {1}">{0}</td>'
    td_unaligned = '<td class="residue noalign {1}">{0}</td>'

    # check for swaps in the alignment
    if 'swaps' in msa:
        swaps = []
        for s in msa['swaps']:
            swaps.extend(s)
    else:
        swaps = []

    # check for locally aligned positions: columns not listed in msa['local']
    # are treated as unaligned
    local = ['*'] * len(msa['alignment'][0])
    if 'local' in msa:
        local = ['.'] * len(msa['alignment'][0])
        for i in msa['local']:
            local[i] = '*'

    # get two sorting schemas for the sequences
    if keywords['class_sort']:

        classes = [
            tokens2class(ipa2tokens(seq), rcParams['asjp'])
            for seq in msa['seqs']
        ]
        seqs = dict([
            (a[1], b) for a, b in zip(
                sorted(zip(classes, msa['seqs']), key=lambda x: x[0]),
                range(1, len(msa['seqs']) + 1))])
    else:
        seqs = dict(zip(sorted(msa['seqs']), range(1, len(msa['seqs']) + 1)))
    taxa = dict(zip(sorted(msa['taxa']), range(1, len(msa['taxa']) + 1)))

    # set up a list to store unique alignments
    alignments = []

    # start iteration
    for i, taxon in enumerate(msa['taxa']):
        tmp = ''
        tmp += td_taxon.format(taxon)

        # append alignment to alignments
        alignment = ''.join(msa['alignment'][i])
        sequence = msa['seqs'][i]
        if alignment in alignments:
            unique = 'false'
        else:
            unique = 'true'
            alignments += [alignment]

        for j, char in enumerate(msa['alignment'][i]):
            if char == '-':
                d = 'dolgo_GAP'
                c = '#bbbbbb'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])
                c = token2class(char, rcParams['_color'])

                # bad check for three classes named differently
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'

            if j in swaps:
                tmp += td_swap.format(char, d)
            elif local[j] != '*':
                tmp += td_unaligned.format(char, d)
            else:
                tmp += td_residue.format(char, d)
        out += tr.format(tmp, unique, taxa[taxon], seqs[sequence])

    html = html.format(
        table=out,
        dataset=dataset,
        pid=pid_score,
        file=infile,
        sequence=seq_id,
        shorttitle=shorttitle,
        width=len(msa['alignment'][0]),
        table_width='{0}'.format(len(msa['alignment'][0]) * 50 + 8 * taxl),
        taxa=len(msa['alignment']),
        uniseqs=len(set(msa['seqs'])),
        css=css,
        js=js)

    if not filename:
        filename = rcParams['filename']

    if not filename.endswith('.html'):
        filename = filename + '.html'

    if keywords['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    if keywords['write_to_file']:
        # check, whether the outfile already exists
        util.write_text_file(filename, html)
    else:
        return html
Code Example #23
File: qlc.py Project: kadster/lingpy
def read_qlc(infile, comment='#'):
    """
    Simple function that loads qlc-format into a dictionary.

    Parameters
    ----------
    infile : str
        The name of the input file.
    comment : str (default="#")
        The comment character. If a line starts with this character, it will be
        ignored.

    Returns
    -------
    d : dict
        A dictionary with integer keys corresponding to the order of the lines
        of the input file. The header is given 0 as a specific key.
    """
    lines = read_text_file(infile, lines=True, normalize="NFC")
    data, meta, dtype = [], {}, False

    while lines:
        line = lines.pop(0)
        if line.startswith(comment) or not line:
            continue

        if line.startswith('@'):
            key, value = [s.strip() for s in line[1:].split(':', 1)]
            if key == 'tree':
                meta["tree"] = cg.LoadTree(treestring=value)
            elif key == 'json':
                for j1, j2 in json.loads(value).items():
                    meta[j1] = j2
            else:
                if key not in meta:
                    meta[key] = value
                else:
                    if isinstance(meta[key], list):
                        meta[key].append(value)
                    else:
                        log.warning(
                            "Key '{0}' in input file is not unique! Use JSON-format for "
                            "these datatypes!".format(key))
                        meta[key] = [meta[key]] + [value]
        # line starts with complex stuff
        elif line.startswith('<'):
            tmp = line[1:line.index('>')]
            # check for specific keywords
            if ' ' in tmp:
                dtype = tmp.split(' ')[0]
                keys = {
                    k: v[1:-1]
                    for k, v in [key.split('=') for key in tmp.split(' ')[1:]]
                }
            else:
                dtype = tmp.strip()
                keys = {}

            tmp = []

            while True:
                line = lines.pop(0)
                if line.startswith('</' + dtype + '>'):
                    break
                tmp += [line]

            tmp = '\n'.join(tmp)

            # check for data stuff
            if dtype == "json":
                tmp = json.loads(tmp)
                if not keys:
                    for key in tmp:
                        meta[key] = tmp[key]
                elif keys:
                    meta[keys["id"]] = {}
                    for k in tmp:
                        meta[keys["id"]][k] = tmp[k]
            elif dtype in ['tre', 'nwk']:
                if "trees" not in meta:
                    meta["trees"] = {}

                if not keys:
                    keys["id"] = "1"

                # XXX consider switching to Tree here XXX
                meta['trees'][keys["id"]] = cg.LoadTree(treestring=tmp)
            elif dtype in ['csv']:
                meta[keys["id"]] = {}
                ncol = int(keys.get('ncol', 2))

                if "dtype" in keys:
                    transf = eval(keys["dtype"])
                else:
                    transf = str

                # split tmp into lines
                tmp = tmp.split('\n')
                for l in tmp:
                    if ncol == 2:
                        a, b = l.split('\t')
                        b = transf(b)
                    else:
                        l = l.split('\t')
                        a = l[0]
                        b = [transf(b) for b in l[1:]]
                    meta[keys["id"]][a] = b
            elif dtype == 'msa':
                tmp = tmp.split('\n')
                if 'msa' not in meta:
                    meta['msa'] = {}

                ref = keys.get('ref', 'cogid')
                if ref not in meta['msa']:
                    meta['msa'][ref] = {}

                tmp_msa = {}
                try:
                    tmp_msa['dataset'] = meta['dataset']
                except:
                    tmp_msa['dataset'] = infile.replace('.csv', '')

                tmp_msa['seq_id'] = keys['id']

                # add consensus string to msa, if it appears in the keys
                if "consensus" in keys:
                    tmp_msa['consensus'] = keys['consensus']

                msad = []
                for l in tmp:
                    if not l.startswith(comment):
                        msad.append(
                            [x.strip().rstrip('.') for x in l.split('\t')])
                tmp_msa = _list2msa(msad, header=False, ids=True, **tmp_msa)

                try:
                    meta['msa'][ref][int(keys['id'])] = tmp_msa
                except ValueError:
                    meta['msa'][ref][keys['id']] = tmp_msa

            elif dtype == 'dst':
                taxa, matrix = read_dst(tmp)
                distances = [[0.0 for _ in matrix] for _ in matrix]
                for i, line in enumerate(matrix):
                    for j, cell in enumerate(line):
                        if i < j:
                            distances[i][j] = cell
                            distances[j][i] = cell
                meta['distances'] = distances
            elif dtype == 'scorer':
                scorer = read_scorer(tmp)
                if 'scorer' not in meta:
                    meta['scorer'] = {}
                keys.setdefault('id', 'basic')
                meta['scorer'][keys['id']] = scorer

            elif dtype == 'taxa':
                meta['taxa'] = [t.strip() for t in tmp.split('\n')]
        else:
            data += [[l.strip() for l in line.split('\t')]]

    # create the dictionary in which the data will be stored
    d = {}

    # check the header: if a local ID column is given (simply "ID",
    # "LOCAL_ID", or "LOCALID"), use it as the key, otherwise create keys
    # automatically
    local_id = data[0][0].lower() in ['id', 'local_id', 'localid']

    # iterate over data and fill the dictionary (a bit inefficient, but enough
    # for the moment)
    try:
        i = 1
        for j, line in enumerate(data[1:]):
            if local_id:
                d[int(line[0])] = line[1:]
            else:
                d[i] = line
                i += 1
    except ValueError as e:  # pragma: no cover
        raise Exception("Error processing line {0}:\n".format(j) +
                        str(data[1:][j]) + '\nOriginal error message: ' +
                        str(e))

    # assign the header to d[0]
    if local_id:
        d[0] = [x.lower() for x in data[0][1:]]
    else:
        d[0] = [x.lower() for x in data[0]]

    for m in meta:
        d[m] = meta[m]

    if 'trees' in d and 'tree' not in d:
        d['tree'] = sorted(d['trees'].items(), key=lambda x: x[0])[0][1]

    return d
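To make the parser above concrete, here is a hedged sketch of the input format it consumes and the dictionary it returns (lingpy's qlc-style wordlist format; the file contents and values are made up for illustration):

# hypothetical tab-separated input:
#
#   @dataset: my_dataset
#   <json id="settings">
#   {"threshold": 0.6}
#   </json>
#   ID   DOCULECT   CONCEPT   IPA
#   1    German     hand      hant
#   2    English    hand      hænd
#
# ... would be parsed into:
#
#   {0: ['doculect', 'concept', 'ipa'],      # lower-cased header
#    1: ['German', 'hand', 'hant'],          # keyed by the ID column
#    2: ['English', 'hand', 'hænd'],
#    'dataset': 'my_dataset',                # from the @-line
#    'settings': {'threshold': 0.6}}         # from the <json> block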
Code example #26
File: html.py Project: vermillionbee/lingpy
def alm2html(infile,
             title='',
             shorttitle='',
             filename='',
             colored=False,
             main_template='',
             table_template='',
             dataset='',
             confidence=False,
             **keywords):
    """
    Convert files in ``alm``-format into colored ``html``-format.

    Parameters
    ----------

    title : str
        Define the title of the output file. If no title is provided, the
        default title ``LexStat - Automatic Cognate Judgments`` will be used.

    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``LexStat`` will be used.
    
    Notes
    -----
    The coloring of sound segments with respect to the sound class they belong
    to is based on the definitions given in the
    ``color`` :py:class:`~lingpy.data.model.Model`. It can easily be changed
    and adapted. 

    See also
    --------
    lingpy.convert.html.msa2html
    lingpy.convert.html.msa2tex

    """
    util.setdefaults(keywords, json="", labels={})

    # open the infile
    if not os.path.exists(infile):
        infile = infile + '.alm'
    data = util.read_text_file(infile)

    # create the outfile
    if not filename:
        filename = rcParams['filename']

    # read in the templates
    html = util.read_text_file(main_template or template_path('alm2html.html'))
    if not table_template:
        table_template = template_path(
            'alm2html.table.js.html' if confidence else 'alm2html.table.html')
    table = util.read_text_file(table_template)
    css = util.read_text_file(template_path('alm.css'))
    js = util.read_text_file(template_path('alm.js'))

    # define a label function for the taxa
    label = lambda x: keywords['labels'][x] if x in keywords['labels'] else x

    # check for windows-compatibility
    data = data.replace(os.linesep, '\n')[:-1]

    # split the data into blocks
    blocks = data.split('\n\n')

    # retrieve the dataset
    dataset = dataset or blocks[0]

    # create the outstring
    tmp_str = ''

    for block in blocks[1:]:
        lines = block.split('\n')
        m = [l.split('\t') for l in lines]

        # create colordict for different colors
        dc = len(set([l[0] for l in m]))

        if colored:
            colors = {
                a: b
                for a, b in zip(
                    sorted(set([int(l[0]) for l in m])),
                    colorRange(dc, brightness=400),
                )
            }
        else:
            colors = []
            white = True
            for i in sorted(set([abs(int(l[0])) for l in m])):
                if white:
                    colors.append((i, 'white'))
                    white = False
                else:
                    colors.append((i, 'gray'))
                    white = True
            colors = dict(colors)

        # get the basic item and its id
        iName = m[0][2]
        iID = m[0][3]

        # start writing the stuff to string
        tmp_str += table.format(NAME=iName, ID=iID)
        # define the basic string for the insertion
        bas = ' <tr class="{0}{2} taxon" taxon="{3}">\n{1}'

        for tracer, l in enumerate(m):
            # check whether the current line is a borrowing
            if int(l[0]) < 0:
                loan_line = ' loan'
            else:
                loan_line = ''

            # assign the cognate id
            tmp = '  <td>{0}</td>\n'.format(l[0])
            tmp += '  <td>{0}</td>\n'.format(label(l[1].strip('.')))

            # check alignments for confidence scores
            ipa_string = ''.join([cell.split('/')[0]
                                  for cell in l[4:]]).replace('-', '')

            tmp += '  <td>{0}</td>\n'.format(ipa_string)
            tmp += '  <td class="{0}">\n'.format(colors[abs(int(l[0]))])
            tmp += '   <table class="{0}">\n'.format(colors[abs(int(l[0]))])
            tmp += '    <tr>\n{0}    </tr>\n   </table>\n  </td>\n </tr>\n'

            # check whether another entry follows that is also an alignment,
            # otherwise, there's no need to display a word as an alignment
            cognate_set = False
            if tracer < len(m) - 1:
                if abs(int(m[tracer + 1][0])) == abs(int(l[0])):
                    cognate_set = True
            if tracer > 0:
                if abs(int(m[tracer - 1][0])) == abs(int(l[0])):
                    cognate_set = True

            # fill out html for the cognate sets
            if cognate_set:

                alm = ''
                for char in l[4:]:

                    # check for confidence scores
                    if '/' in char:
                        try:
                            char, conf, num = char.split('/')
                            conf = int(conf)
                        except ValueError:
                            print(char.split('/'))
                            raise ValueError("Something is wrong with %s." %
                                             (char))

                    else:
                        # no confidence annotation: use neutral defaults
                        char, conf, num = char, 0, 0

                    if char == '-':
                        d = 'dolgo_GAP'
                    else:
                        d = 'dolgo_' + token2class(char, rcParams['dolgo'])

                        # bad check for three classes named differently
                        if d == 'dolgo__':
                            d = 'dolgo_X'
                        elif d == 'dolgo_1':
                            d = 'dolgo_TONE'
                        elif d == 'dolgo_0':
                            d = 'dolgo_ERROR'

                    if confidence:
                        alm += '     '
                        alm += '<td class="char {1}" confidence={0} '.format(
                            conf, d)
                        alm += 'char="{0}" '.format(char)
                        alm += 'onclick="' + "show('{0}')".format(num) + '" '
                        alm += 'num="{0}"'.format(num)
                        alm += '>\n      {0}\n     </td>\n'.format(char)
                    else:
                        alm += '     '
                        alm += '<td class="char {0}">{1}</td>\n'.format(
                            d, char)
            else:
                alm = '      '
                alm += '<td class="{0}">--</td>\n'.format(colors[abs(int(
                    l[0]))])

            # format the alignment
            try:
                tmp = tmp.format(alm)
            except ValueError:
                raise ValueError("Unknown problem in matchin %s and %s." %
                                 (alm, tmp))

            # check for the last line, where a separator row should be
            # inserted (not the fastest solution, but plotting is not
            # time-critical, and it suffices for its current purpose)
            if tracer == len(m) - 1:
                if confidence:
                    tmp += ' </table>\n'

                tmp += ' <tr class="empty"><td colspan="4" class="empty">'
                tmp += '<hr class="empty" /></td></tr>\n'

            # format the whole string
            tmp_str += bas.format(colors[abs(int(l[0]))], tmp, loan_line, l[1])

    if not title:
        title = "LexStat - Automatic Cognate Judgments"
    if not shorttitle:
        shorttitle = "LexStat"

    # check for json-attribute
    if keywords['json']:
        keywords['json'] = 'var myjson = ' + json.dumps(keywords['json'],
                                                        indent=1)

    html = html.format(shorttitle=shorttitle,
                       title=title,
                       table=tmp_str,
                       dataset=dataset,
                       javascript=js,
                       css=css,
                       **keywords)
    util.write_text_file(filename + '.html', html)
    return
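A minimal usage sketch (hypothetical file names): the function appends the ``.alm`` suffix if the input path does not exist as given, and writes the result as FILENAME.html.

# hypothetical call: reads "germanic.alm" and writes "germanic.html"
alm2html('germanic', filename='germanic', title='Germanic cognates',
         shorttitle='Germanic', colored=True)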
Code example #27
def _read(filename, normalize=None):
    res = {}
    for line in util.read_text_file(filename, normalize=normalize, lines=True):
        k, v = line.split(' : ')
        res[k] = v.split(', ')
    return res
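A hedged sketch of the one-pair-per-line format this helper parses (the file name and contents are hypothetical):

# given a file "sound_classes.txt" containing, e.g.:
#
#   vowels : a, e, i, o, u
#   tones : 1, 2, 3
#
_read('sound_classes.txt')
# -> {'vowels': ['a', 'e', 'i', 'o', 'u'], 'tones': ['1', '2', '3']}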
Code example #28
File: html.py Project: vermillionbee/lingpy
def psa2html(infile, **kw):
    """
    Function converts a PSA-file into colored html-format.
    """
    util.setdefaults(kw,
                     template=False,
                     css=False,
                     comment='#',
                     filename=infile[:-4] + '.html',
                     compact=True)

    template = util.read_text_file(kw['template'] or template_path('psa.html'))
    css = util.read_text_file(kw['css'] or template_path('psa.css'))

    data = []
    for line in util.read_text_file(infile, lines=True):
        if not line.startswith(kw['comment']):
            data.append(line)

    seq_ids = []
    pairs = []
    taxa = []
    alignments = []

    del data[0]

    i = 0
    while i <= len(data) - 3:
        try:
            seq_ids.append(data[i])

            datA = data[i + 1].split('\t')
            datB = data[i + 2].split('\t')

            taxonA = datA[0].strip('.')
            taxonB = datB[0].strip('.')
            almA = datA[1:]
            almB = datB[1:]

            taxa.append((taxonA, taxonB))
            pairs.append(('.'.join([k for k in almA if k != '-']),
                          '.'.join([k for k in almB if k != '-'])))
            alignments.append(
                ([str(a) for a in almA], [str(b) for b in almB], 0))
            assert len(alignments[-1][0]) == len(alignments[-1][1])
            i += 4
        except AssertionError:
            log.warning("Line {0} of the data is probably miscoded.".format(i +
                                                                            1))
            i += 1

    def get_classes(alm):
        classes = []
        residue = '<div class="residue {1}">{0}</div>'
        for j, char in enumerate(alm):
            if char == '-':
                d = 'dolgo_GAP'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])

                # bad check for three classes named differently
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'
            classes += [residue.format(char, d)]
        return ''.join(classes)

    out = '<table>\n'  # codecs.open(kw['filename'], 'w', 'utf-8')
    for i, (a, b, c) in enumerate(alignments):
        clsA = get_classes(a)
        clsB = get_classes(b)

        ids = int(100 * pid(a, b) + 0.5)

        out += '<tr class="head">'
        out += '<td colspan=2 class="head"><b>Alignment {0}:</b> <i>{1}</i>, PID: {2}</td></tr>'.format(
            i + 1, seq_ids[i], ids)
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][0])
        out += '<td class="psa">{0}</td>'.format(clsA)
        out += '</tr>'
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][1])
        out += '<td class="psa">{0}</td>'.format(clsB)
        out += '</tr>'
        out += '<tr><td colspan=2></td></tr>'

    out += '</table>'

    html = template.format(alignments=out, css=css)

    if kw['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    util.write_text_file(kw['filename'], html)
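A minimal usage sketch (hypothetical file name); by default the output name is derived from the input by swapping the suffix:

# hypothetical call: renders "pairs.psa" as "pairs.html"
psa2html('pairs.psa', compact=True)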
Code example #29
File: wordlist.py Project: kadster/lingpy
    def _export(
            self,
            fileformat,
            sections=None,
            entries=None,
            entry_sep='',
            item_sep='',
            template='',
            exclude=None,
            entry_start='',
            entry_close='',
            **keywords):
        """
        Export a wordlist to various file formats.
        """
        if not sections:
            if fileformat == 'txt':
                sections = dict(
                    h1=('concept', '\n# Concept: {0}\n'),
                    h2=('cogid', '## Cognate-ID: {0}\n'))
            elif fileformat == 'tex':
                sections = dict(
                    h1=('concept', r'\section{{Concept: ``{0}"}}' + '\n'),
                    h2=('cogid', r'\subsection{{Cognate Set: ``{0}"}}' + '\n'))
            elif fileformat == 'html':
                sections = dict(
                    h1=('concept', '<h1>Concept: {0}</h1>'),
                    h2=('cogid', '<h2>Cognate Set: {0}</h2>'))

        if not entries:
            if fileformat == 'txt':
                entries = [('language', '{0} '), ('ipa', '{0}\n')]
            elif fileformat == 'tex':
                entries = [('language', '{0} '), ('ipa', '[{0}]' + '\n')]
            elif fileformat == 'html':
                entries = [('language', '{0}&nbsp;'), ('ipa', '[{0}]\n')]

        util.setdefaults(keywords, filename=rcParams['filename'])

        # get the temporary dictionary
        out = wl2dict(self, sections, entries, exclude)

        # assign the output string
        out_string = ''

        # iterate over the dictionary and start to fill the string
        for key in sorted(out, key=lambda x: str(x).lower()):
            # write key to file
            out_string += key[1]

            # reassign tmp
            tmp = out[key]

            # set the pointer and the index
            pointer = {0: [tmp, sorted(tmp.keys())]}

            while True:
                idx = max(pointer.keys())

                # check for type of current point
                if isinstance(tmp, dict):
                    if pointer[idx][1]:
                        next_key = pointer[idx][1].pop()
                        out_string += next_key[1]
                        tmp = pointer[idx][0][next_key]
                        if isinstance(tmp, dict):
                            pointer[idx + 1] = [tmp, sorted(tmp.keys())]
                        else:
                            pointer[idx + 1] = [tmp, tmp]
                    else:
                        del pointer[idx]
                        if idx == 0:
                            break
                else:
                    tmp_strings = []
                    for line in sorted(tmp):
                        tmp_strings += [item_sep.join(line)]
                    out_string += entry_start + entry_sep.join(tmp_strings) + entry_close
                    tmp = pointer[idx - 1][0]
                    del pointer[idx]

        if fileformat == 'tex':
            out_string = out_string.replace('_', r'\_')
        tmpl = util.read_text_file(template) if template else '{0}'
        _write_file(keywords['filename'], tmpl.format(out_string), fileformat)
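A hedged usage sketch, assuming `wl` is a Wordlist-like object carrying the 'concept', 'cogid', 'language' and 'ipa' entries used by the defaults above; the file name is hypothetical, and the protected method is called directly only for illustration:

# writes "mylist" in txt format, grouped by concept and cognate set
wl._export('txt', filename='mylist')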
Code example #30
from csvw.metadata import TableGroup
from lingpy import util
from lingpy.convert.html import template_path

# receive the template path from lingpy for splitstree
tpath = util.Path(template_path('splitstree.nex'))
if tpath.exists():
    _template = util.read_text_file(tpath.as_posix())
else:  # pragma: no cover
    raise IOError("Unknown template %s" % tpath)

tbg = TableGroup.from_file('cldf/StructureDataset-metadata.json')
taxa = {t['ID']: (i, t['Name']) for i, t in
        enumerate(tbg.tabledict['languages.csv'])}
params = {t['ID']: (i, t['Name']) for i, t in
          enumerate(tbg.tabledict['parameters.csv'])}
matrix = [[0 for p in params] for t in taxa]
for row in tbg.tabledict['values.csv']:
    tidx, tname = taxa[row['Language_ID']]
    pidx, pname = params[row['Parameter_ID']]
    if row['Value'] == '+':
        matrix[tidx][pidx] = 1
        
alpha = 'abcdefghijklmnopqrstuvwxyz'
alpha += alpha.upper()
alpha += '0123456789'

matrix_string = ''
tax_list = sorted([t[1] for t in taxa.items()], key=lambda x: x[0])
for i, line in enumerate(matrix):
    matrix_string += '{0:12}'.format(
        ''.join([x for x in tax_list[i][1] if x in alpha]))
Code example #31
File: test_util.py Project: anukat2015/lingpy
 def test_TextFile(self):
     path = self.tmp_path('test')
     with util.TextFile(path) as fp:
         fp.writelines(['line1\n', 'line2\n'])
     self.assertEqual(len(util.read_text_file(path, lines=True)), 2)
Code example #32
File: test_strings.py Project: vermillionbee/lingpy
 def test_scorer2str(self):
     """
     Test conversion of scorers to strings.
     """
     self.assertEqual(scorer2str(rc('dolgo').scorer),
                      read_text_file(test_data('dolgo.scorer')))
Code example #33
File: html.py Project: LinguList/lingpy
def msa2tex(
    infile,
    template='',
    filename='',
    **keywords
):
    """
    Convert an MSA to a tabular representation which can easily be used in
    LaTeX documents.
    """
    util.setdefaults(keywords, pid_mode=1)

    # while alm-format can be read from the text-file without problems,
    # msa-format should be loaded first (once this is already provided), the
    # loss in speed won't matter much, since output of data is not a daily task
    # load msa
    msa = read_msa(infile)

    # load templates
    tex = util.read_text_file(template or template_path('msa.tex'))

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    dataset = msa['dataset']
    infile = msa['infile']
    seq_id = msa['seq_id']

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    height = len(msa['alignment'])
    width = len(msa['alignment'][0])

    start = r'\tabular{l' + width * 'c' + '}\n'
    start += r'\bf\ttfamily Taxon & \multicolumn{' + str(
        width) + r'}{l}{\bf\ttfamily Alignment}\\' + '\n'

    # check for swaps in the alignment
    if 'swaps' in msa:
        swaps = []
        for s in msa['swaps']:
            swaps.extend(s)
    else:
        swaps = []

    body = start
    for i, taxon in enumerate(msa['taxa']):
        body += r'\ttfamily ' + taxon.replace('_', r'\_')
        for j, char in enumerate(msa['alignment'][i]):
            if char != '-':
                cls = token2class(char, rcParams['dolgo'])
            else:
                cls = 'X'
            if char == '_':
                char = r'\#'
            if cls == '_':
                cls = '2'
            if j not in swaps:
                body += r'&\cellcolor{col' + cls + r'}' + char
            else:
                if char != '-':
                    body += r'&\cellcolor{col' + cls + r'}\color{white}\bf ' + char
                else:
                    body += r'&\cellcolor{col' + cls + r'}\bf ' + char
        body += r'\\' + '\n'

    body += r'&' + '&'.join([r'\color{white}XXX' for i in range(width)]) + r'\\' + '\n'
    body += r'\endtabular' + '\n'

    # create the parameters etc.
    w = 1.5 * width + taxl * 0.25
    h = 0.5 * height + 1.0

    tex = tex.replace('<+WIDTH+>', '{0:.2f}'.format(w))
    tex = tex.replace('<+HEIGHT+>', '{0:.2f}'.format(h))

    # create the rput stuff
    tex = tex.replace('<+NEWX+>', '{0:.2f}'.format(w / 2.0))
    tex = tex.replace('<+NEWY+>', '{0:.2f}'.format((h - 0.5) / 2.0))

    # insert the rest
    tex = tex.replace('<+CONTENT+>', body)

    # write to file
    if not filename:
        filename = 'lingpy-{0}'

    util.write_text_file(filename + '.tex', tex)
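A minimal usage sketch (hypothetical file names):

# typesets the alignment in "harry.msa" as "harry.tex"
msa2tex('harry.msa', filename='harry')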
Code example #34
def csv2list(filename,
             fileformat='',
             dtype=None,
             comment='#',
             sep='\t',
             strip_lines=True,
             header=False):
    """
    Very simple function to get quick (and somewhat naive) access to CSV-files.

    Parameters
    ----------
    filename : str
        Name of the input file.
    fileformat : {None, str}
        If not specified, the file <filename> will be loaded. Otherwise, the
        fileformat is interpreted as the specific extension of the input file.
    dtype : {list}
        If not specified, all data will be loaded as strings. Otherwise, a
        list specifying the data for each line should be provided.
    comment : string (default="#")
        Comment character in the begin of a line forces this line to be
        ignored (set to None  if you want to parse all lines of your file).
    sep : string (default = "\t")
        Specify the separator for the CSV-file.
    strip_lines : bool (default=True)
        Specify whether empty "cells" in the input file should be preserved.
        If set to True, each line will be stripped first, and all whitespace
        will be cleaned. Otherwise, each line will be separated using the
        specified separator, and no stripping of whitespace will be carried
        out.
    header : bool (default=False)
        Indicate, whether the data comes along with a header.

    Returns
    -------
    l : list
        A list-representation of the CSV file.

    """
    # check for correct fileformat
    if fileformat:
        infile = filename + '.' + fileformat
    else:
        infile = filename

    if dtype is None:
        dtype = []

    l = []

    # open the file
    infile = read_text_file(infile, lines=True, normalize="NFC")

    # check for header
    idx = 0 if header else -1

    for i, line in enumerate(infile):
        if line and (not comment or not line.startswith(comment)) and idx != i:
            if strip_lines:
                cells = [c.strip() for c in line.strip().split(sep)]
            else:
                cells = [c.strip() for c in line.split(sep)]
            if not dtype:
                l += [cells]
            else:
                l += [[f(c) for f, c in zip(dtype, cells)]]

    return l
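A minimal usage sketch with a hypothetical tab-separated file, skipping the header row and casting the first column to int:

# given "words.tsv" containing:
#
#   ID   DOCULECT   IPA
#   1    German     hant
#
rows = csv2list('words', fileformat='tsv', header=True,
                dtype=[int, str, str])
# -> [[1, 'German', 'hant']]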
Code example #36
File: derive.py Project: xrotwang/lingpy
 def _read_string(name):
     # normalize stuff
     # TODO: this is potentially dangerous and it is important to decide whether
     # TODO: switching to NFD might not be a better choice
     return util.read_text_file(
         os.path.join(file_path, name), normalize='NFC').replace('\n', '')
Code example #37
File: newick.py Project: pombredanne/parsimony
    def output(self, dtype, filename=None, labels=None):
        """
        Parameters
        ----------
        dtype : str {"json", "html", "nwk" }
            Specify the type of the output:
            
            * *json*: JSON format, suitable for use in d3.
            * *nwk*: Newick format (identical with input upon initialization).
            * *html*: Simple interactive HTML-representation with collapsible nodes.

        """

        if dtype == "json":
            if filename:
                with open(filename + "." + dtype, "w") as f:
                    f.write(json.dumps(self._dict, indent=2))
            else:
                return json.dumps(self._dict, indent=2)

        elif dtype == "html":

            # make simple label function
            get_label = lambda x: labels[x] if labels else x

            start = '<div id="root" class="node-container">root.content</div>'

            clean_label = lambda x: "".join([y for y in sort_tree(x) if y not in "();"]).replace(",", "_")

            template = '<div class="node-container"><div id="#node_name:label" class="node-label">#node_label</div><div class="node-content">#node_children:{node}</div></div>'

            leave = '<div id="#node_leave:label" class="node-leave"><div class="inner_leave">#node_leave</div></div>'

            txt = (
                template.format(node=self.root)
                .replace("#node_label", get_label(self[self.root]["label"]))
                .replace("#node_name", clean_label(self.root))
            )

            # the transform function renders a node, checking whether it is a
            # leaf or an inner node with children
            transform = (
                lambda x: template.format(node=x)
                .replace("#node_label", get_label(self[x]["label"]))
                .replace("#node_name", clean_label(x))
                if not self[x]["leave"]
                else leave.replace("#node_leave", get_label(x))
            )

            for i, node in enumerate(self.nodes):

                # write all children
                children = self[node]["children"]

                node_children = "\n".join([transform(child) for child in children])

                txt = txt.replace("#node_children:" + node, node_children)

            # get the templates
            html = util.read_text_file("lexical_change.html")
            css = util.read_text_file("lexical_change.css")
            js = util.read_text_file("lexical_change.js")
            title = "LingPy Tree Class"

            html = html.format(STYLE=css, SCRIPT=js, TITLE=title, TREE=txt)
            filename = filename or "lingpy.basic.newick"

            util.write_text_file(filename + ".html", html)
Code example #38
def write_nexus(taxa,
                matrix,
                custom=None,
                custom_name='lingpy',
                missing="?",
                gap="-",
                template="mrbayes.nex",
                filename="mrbayes.nex",
                dtype="RESTRICTION",
                symbols="10",
                commands=None,
                commands_name="mrbayes"):
    """Write a nexus file for phylogenetic analyses.

    Parameters
    ----------
    taxa : list
        The taxonomic units in your data. They should be valid taxon names,
        only consisting of alphanumeric characters and an underscore, usually
        also not exceeding a length of 15 characters.
    matrix : list
        The matrix with the values for each taxon in one separate row. Usually,
        the matrix contains binary values which can be passed as strings or
        integers (1 and 0), but missing values are also possible. Given
        biological common restrictions, each character can only be one ASCII
        symbol.
    custom : list (default=None)
        Custom information to be added to the nexus file, like, for example,
        the structure of the characters, their original concept, or their
        type; it will be written into a custom block in the nexus file. The
        name of the custom block can be specified with help of the
        `custom_name` keyword. The content is a list of strings which will be
        written line by line into the custom block.
    custom_name : str (default="lingpy")
        The name of the custom block which will be written to the file.
    missing : str (default="?")
        The symbol for missing characters.
    gap : str (default="-")
        The symbol for gaps (not relevant for linguistic analyses).
    template : str (default="mrbayes.nex")
        The name of the template file. This file is located in the template/
        folder of the LingPy package, but a custom file can be specified by
        providing the path.
    dtype : str (default="RESTRICTION")
        The datatype, which is usually "STANDARD" or "RESTRICTION" in
        linguistic analyses, with "RESTRICTION" pointing to pure birth-death
        models.
    symbols : str (default="10")
        The symbols used for the characters.
    commands : list (default=None)
        If specified, will write an additional block containing commands for
        phylogenetic software. The commands are passed as a list, containing
        strings. The name of the block is given by the keywords commands_name.
    commands_name : str (default="mrbayes")
        Determines how the block will be called to which the commands will be
        written.

    """
    tpath = util.Path(template_path(template))
    if tpath.exists():
        _template = util.read_text_file(tpath.as_posix())
    else:
        _template = util.read_text_file(template)
    _commands = 'BEGIN {0};\n{1}\n\n'.format(
        commands_name, '\n'.join(commands)) if commands else ''
    _custom = 'BEGIN {0};\n{1}\n\n'.format(custom_name,
                                           '\n'.join(custom)) if custom else ''

    _matrix = ""
    mtl = max([len(t) for t in taxa]) + 1
    for i, (t, m) in enumerate(zip(taxa, matrix)):
        _matrix += str(t + mtl * ' ')[:mtl] + ' '
        _matrix += ''.join(
            ['({0})'.format(c) if len(c) > 1 else str(c) for c in m]) + '\n'

    text = _template.format(matrix=_matrix,
                            ntax=len(taxa),
                            nchar=len(matrix[0]),
                            gap=gap,
                            missing=missing,
                            dtype=dtype,
                            commands=_commands,
                            custom=_custom,
                            symbols=symbols)
    util.write_text_file(filename, text)
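A minimal usage sketch with a hypothetical binary matrix; the command string is an illustrative MrBayes setting, not a required value:

taxa = ['German', 'English', 'Dutch']
matrix = [
    ['1', '0', '1', '1'],
    ['1', '0', '1', '0'],
    ['0', '1', '1', '1'],
]
write_nexus(taxa, matrix, filename='example.nex',
            commands=['lset coding=noabsencesites;'])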
Code example #39
File: html.py Project: LinguList/lingpy
def msa2html(
    msa,
    shorttitle='',
    filename='',
    template='',
    **keywords
):
    """
    Convert files in ``msa``-format into colored ``html``-format.

    Parameters
    ----------
    msa : dict
        A dictionary object that contains all the information of an MSA object.

    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``SCA`` will be used.

    filename : str (default="")
        Define the name of the output file. If no name is defined, the name of
        the input file will be taken as a default.

    template : str (default="")
        The path to the template file. If no name is defined, the basic
        template will be used. The basic template currently used can be found
        under ``lingpy/data/templates/msa2html.html``.

    Examples
    --------
    Load the library.

    >>> from lingpy import *
    
    Load an ``msq``-file from the test-sets.

    >>> msa = MSA('harry.msq')

    Align the data progressively and carry out a check for swapped sites.

    >>> msa.prog_align()
    >>> msa.swap_check()
    >>> print(msa)
    w    o    l    -    d    e    m    o    r    t
    w    a    l    -    d    e    m    a    r    -
    v    -    l    a    d    i    m    i    r    -

    Save the data to the file ``harry.msa``.

    >>> msa.output('msa',filename='harry')

    Save the ``msa``-object as ``html``.

    >>> msa.output('html',filename='harry')
    
    Notes
    -----
    The coloring of sound segments with respect to the sound class they belong
    to is based on the definitions given in the ``color``
    :py:class:`~lingpy.data.model.Model`. It can easily be changed and adapted.
    

    See also
    --------
    lingpy.convert.html.alm2html
    """
    util.setdefaults(
        keywords,
        pid_mode=1,
        stress=rcParams['stress'],
        css=False,
        js=False,
        compact=False,
        class_sort=True,
        write_to_file=True,
    )

    # while alm-format can be read from the text-file without problems,
    # msa-format should be loaded first (once this is already provided), the
    # loss in speed won't matter much, since output of data is not a daily task

    # load templates
    template = template or template_path('msa2html.html')
    if template == 'js':
        template = template_path('msa2html.js.html')
    html = util.read_text_file(template)
    css = util.read_text_file(keywords['css'] or template_path('msa.css'))
    js = util.read_text_file(keywords['js'] or template_path('msa.js'))

    # treat the msa-object as a file name and try to load the file in this
    # case; otherwise it is expected to be an msa-dictionary already
    if isinstance(msa, str):
        msa = read_msa(msa, **keywords)
    elif not isinstance(msa, dict):
        raise ValueError('[!] No filename or MSA dictionary specified.')

    # load dataset, etc.
    dataset = msa['dataset']

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    infile = msa['infile']
    seq_id = msa['seq_id']

    # define the titles etc.
    if not shorttitle:
        shorttitle = 'SCA'

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    # format css file 
    css = css.replace('TAXON_LENGTH', str(taxl * 10))

    out = ''
    tr = '<tr class="msa" unique="{1}" taxon={2} sequence={3}>{0}</tr>\n'
    td_taxon = '<td class="taxon">{0}</td>'
    perc = int(80 / len(msa['alignment'][0]) + 0.5)
    td_residue = '<td class="residue {1}">{0}</td>'
    td_swap = '<td class="residue swap {1}">{0}</td>'
    td_unaligned = '<td class="residue noalign {1}">{0}</td>'

    # check for swaps in the alignment
    if 'swaps' in msa:
        swaps = []
        for s in msa['swaps']:
            swaps.extend(s)
    else:
        swaps = []

    # check for columns that fall outside the local alignment
    local = ['*'] * len(msa['alignment'][0])
    if 'local' in msa:
        local = ['.'] * len(msa['alignment'][0])
        for i in msa['local']:
            local[i] = '*'

    # get two sorting schemas for the sequences
    if keywords['class_sort']:

        classes = [tokens2class(ipa2tokens(seq), rcParams['asjp']) for seq in msa['seqs']]
        seqs = dict(
            [(a[1], b) for a, b in zip(
                sorted(
                    zip(classes, msa['seqs']),
                    key=lambda x: x[0]  # list(zip(x[0],x[1]))
                ),
                range(1, len(msa['seqs']) + 1)
            )]
        )
    else:
        seqs = dict(zip(sorted(msa['seqs']), range(1, len(msa['seqs']) + 1)))
    taxa = dict(zip(sorted(msa['taxa']), range(1, len(msa['taxa']) + 1)))

    # set up a list to store unique alignments
    alignments = []

    # start iteration
    for i, taxon in enumerate(msa['taxa']):
        tmp = ''
        tmp += td_taxon.format(taxon)

        # append alignment to alignments
        alignment = ''.join(msa['alignment'][i])
        sequence = msa['seqs'][i]
        if alignment in alignments:
            unique = 'false'
        else:
            unique = 'true'
            alignments += [alignment]

        for j, char in enumerate(msa['alignment'][i]):
            if char == '-':
                d = 'dolgo_GAP'
                c = '#bbbbbb'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])
                c = token2class(char, rcParams['_color'])

                # bad check for three classes named differently
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'

            if j in swaps:
                tmp += td_swap.format(char, d)
            elif local[j] != '*':
                tmp += td_unaligned.format(char, d)
            else:
                tmp += td_residue.format(char, d)
        out += tr.format(tmp, unique, taxa[taxon], seqs[sequence])

    html = html.format(
        table=out,
        dataset=dataset,
        pid=pid_score,
        file=infile,
        sequence=seq_id,
        shorttitle=shorttitle,
        width=len(msa['alignment'][0]),
        table_width='{0}'.format(len(msa['alignment'][0]) * 50 + 8 * taxl),
        taxa=len(msa['alignment']),
        uniseqs=len(set(msa['seqs'])),
        css=css,
        js=js
    )

    if not filename:
        filename = rcParams['filename']

    if not filename.endswith('.html'):
        filename = filename + '.html'

    if keywords['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    if keywords['write_to_file']:
        # check, whether the outfile already exists
        util.write_text_file(filename, html)
    else:
        return html
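Besides the MSA-object route shown in the docstring, the function can be called on a file directly; a minimal sketch (hypothetical file name):

# reads "harry.msa" and writes "harry.html"
msa2html('harry.msa', filename='harry', shorttitle='SCA')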
Code example #42
File: workflow.py Project: xrotwang/lingpy
    def cognate_detection(self, **keywords):
        """
        Method runs a cognate detection analysis.
        """
        kw = dict(
            align_method='progressive',
            align_mode=rcParams['align_mode'],
            align_modes=rcParams['align_modes'],
            cluster_method=rcParams['lexstat_cluster_method'],
            cognate_method='sca',
            cognate_mode='overlap',
            defaults=False,
            factor=rcParams['align_factor'],
            gap_weight=rcParams['gap_weight'],
            gop=rcParams['align_gop'],
            iteration=False,
            lexstat_modes=rcParams['lexstat_modes'],
            limit=rcParams['lexstat_limit'],
            merge_vowels=rcParams['merge_vowels'],
            model=rcParams['sca'],
            export="html",
            preprocessing=False,
            preprocessing_method=rcParams['lexstat_preprocessing_method'],
            preprocessing_threshold=rcParams['lexstat_preprocessing_threshold'],
            rands=rcParams['lexstat_rands'],
            ratio=rcParams['lexstat_ratio'],
            ref="customid",
            restricted_chars=rcParams['restricted_chars'],
            restriction='',
            runs=rcParams['lexstat_runs'],
            scale=rcParams['align_scale'],
            scoring_method=rcParams['lexstat_scoring_method'],
            swap_check=False,
            threshold=rcParams['lexstat_threshold'],
            tree_calc=rcParams['align_tree_calc'],
            vscale=rcParams['lexstat_vscale'],
            outfile=False,
            sonar=True,
        )

        # first load
        kw.update(keywords)
        if kw['defaults']:
            return kw

        # carry out lexstat cluster analysis
        self.lex = LexStat(self.infile, **kw)

        # reset filename if it is not defined
        kw['outfile'] = kw['outfile'] or self.lex.filename + '_lingpy'

        # check for traditional lexstat analysis
        if kw['cognate_method'] == 'lexstat':
            self.lex.get_scorer(
                method=kw['scoring_method'], modes=kw['lexstat_modes'], **kw)

        self.lex.cluster(method=kw['cognate_method'], mode=kw['cognate_mode'], **kw)

        # align the data
        self.alms = Alignments(self.lex, **kw)
        kw['scoredict'] = self.lex.cscorer \
            if kw['cognate_method'] == 'lexstat' else self.lex.bscorer

        self.alms.align(
            method=kw['align_method'],
            mode=kw['align_mode'],
            modes=kw['align_modes'],
            **kw)

        if 'tsv' in kw['export']:
            self.alms.output(
                'tsv',
                filename=kw['outfile'],
                ignore=['scorer', 'json', 'taxa', 'msa'],
                **kw)
        if 'html' in kw['export']:
            corrs, occs = get_correspondences(self.alms, kw['ref'])

            # serialize the wordlist
            wl = {}
            for concept in self.alms.concepts:
                entries = self.alms.get_list(concept=concept, flat=True)
                cogids = [self.alms[idx, kw['ref']] for idx in entries]
                words = [self.alms[idx, 'ipa'] for idx in entries]
                alms = [self.alms[idx, 'alignment'] for idx in entries]
                langs = [self.alms[idx, 'doculect'] for idx in entries]

                checkalm = lambda x: x if type(x) == str else ' '.join(x)

                wl[concept] = [list(k) for k in sorted(
                    zip(
                        langs,
                        [str(x) for x in entries],
                        words,
                        [str(x) for x in cogids],
                        [checkalm(x) for x in alms],
                    ),
                    key=lambda x: int(x[3]))]

            # make a simple gloss id for internal use as id and write the
            # interactive HTML page once the wordlist has been serialized
            gloss2id = list(
                zip(
                    self.alms.concepts,
                    [str(x) for x in range(1, len(self.alms.concepts) + 1)]))
            id2gloss = dict([[b, a] for a, b in gloss2id])
            gloss2id = dict(gloss2id)

            txt = ''
            txt += 'CORRS = ' + json.dumps(corrs) + ';\n'
            txt += 'LANGS = ' + json.dumps(self.alms.taxa) + ';\n'
            txt += 'OCCS = ' + json.dumps(occs) + ';\n'
            txt += 'WLS = ' + json.dumps(wl) + ';\n'
            txt += 'GlossId = ' + json.dumps(gloss2id) + ';\n'
            txt += 'IdGloss = ' + json.dumps(id2gloss) + ';\n'
            txt += 'FILE = "' + kw['outfile'] + '.tsv";\n'

            tpath = partial(util.data_path, 'templates')

            tname = 'jcov.{0}.html'.format(
                'remote' if 'remote' in kw['export'] else 'direct')
            content = util.read_text_file(tpath(tname))

            util.write_text_file(
                kw['outfile'] + '.html',
                content.format(
                    CORRS=txt,
                    JCOV=util.read_text_file(tpath('jcov.js')),
                    STYLE=util.read_text_file(tpath('jcov.css')),
                    VENDOR=util.read_text_file(tpath('jcov.vendor.js')),
                    DIGHL=util.read_text_file(tpath('jcov.dighl.js'))))
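A hedged usage sketch, assuming this method belongs to lingpy's Workflow class and that "mydata.tsv" is a wordlist file (hypothetical name); since `export` is checked by substring, both formats can be requested at once:

wf = Workflow('mydata.tsv')
wf.cognate_detection(cognate_method='lexstat', export='tsv,html')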
Code example #43
File: strings.py Project: lingpy/lingpy
def write_nexus(
        wordlist,
        mode='mrbayes',
        filename="mrbayes.nex",
        ref="cogid",
        missing="?", gap="-",
        custom=None,
        custom_name='lingpy',
        commands=None, commands_name="mrbayes"):
    """Write a nexus file for phylogenetic analyses.

    Parameters
    ----------
    wordlist : lingpy.basic.wordlist.Wordlist
        A Wordlist object containing cognate IDs.
    mode : str (default="mrbayes")
        The name of the output nexus style. Valid values are:
            * 'MRBAYES': a MrBayes formatted nexus file.
            * 'SPLITSTREE': a SPLITSTREE formatted nexus file.
            * 'BEAST': a BEAST formatted nexus file.
            * 'BEASTWORDS': a BEAST formatted nexus for word-partitioned
               analyses.
            * 'TRAITLAB': a TRAITLab formatted nexus.
    filename : str (default="mrbayes.nex")
        Name of the file to which the nexus file will be written.
        If set to None, then this function will not write the nexus content
        to a file, but simply return the content as a string.
    ref: str (default="cogid")
        Column in which you store the cognate sets in your data.
    gap : str (default="-")
        The symbol for gaps (not relevant for linguistic analyses).
    missing : str (default="?")
        The symbol for missing characters.
    custom : list (default=None)
        This keyword allows you to add custom information to the nexus file,
        such as the structure of the characters, their original concept, or
        their type; it will be written into a custom block in the nexus file.
        The name of the custom block can be specified with the `custom_name`
        keyword. The content is a list of strings which will be written line
        by line into the custom block.
    custom_name : str (default="lingpy")
        The name of the custom block which will be written to the file.
    commands : list (default=None)
        If specified, an additional block containing commands for
        phylogenetic software will be written. The commands are passed as a
        list of strings. The name of the block is given by the keyword
        `commands_name`.
    commands_name : str (default="mrbayes")
        Determines the name of the block to which the commands will be
        written.

    Returns
    -------
    nexus : str
        A string containing nexus file output
    """
    templates = {
        'BEAST': 'beast.nex',
        'BEASTWORDS': 'beast.nex',
        'SPLITSTREE': 'splitstree.nex',
        'MRBAYES': 'mrbayes.nex',
        'TRAITLAB': 'splitstree.nex',
    }
    
    block = "\n\nBEGIN {0};\n{1}\nEND;\n"  # template for nexus blocks
    
    # check for valid mode
    mode = mode.upper()
    if mode not in templates:
        raise ValueError("Unknown output mode %s" % mode)

    # check for valid template
    template = templates.get(mode)
    tpath = util.Path(template_path(template))
    if tpath.exists():
        _template = util.read_text_file(tpath.as_posix())
    else:  # pragma: no cover
        raise IOError("Unknown template %s" % template)

    # check that `ref` is a valid column
    if ref not in wordlist._alias:
        raise KeyError("Unknown _ref_ column in wordlist '%s'" % ref)

    # retrieve the matrix
    matrix = [[] for x in range(wordlist.width)]
    etd = wordlist.get_etymdict(ref=ref)
    concepts = sorted([(cogid, wordlist[[
        x[0] for x in vals if x][0]][wordlist._rowIdx]) for (cogid, vals) in
        etd.items()],
        key=lambda x: (x[1], x[0]))
    # ... and determine which concepts are missing from each doculect.
    missing_ = {
        t: [concept for (cogid, concept) in concepts
            if concept not in wordlist.get_list(
                col=t, entry=wordlist._row_name, flat=True)]
        for t in wordlist.cols}
    
    # add ascertainment character for mode=BEAST
    if mode == 'BEAST':
        matrix = [['0'] for m in matrix]
    
    # skip the constant sites for traitlab
    if mode == 'TRAITLAB':
        concepts = [(i, c) for (i, c) in concepts if not _is_constant(etd[i])]
    
    # fill matrix
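    # State coding: '1' if the cognate set is attested in the doculect, '0'
    # if the concept is attested there with a different cognate set, and the
    # `missing` symbol if the concept is unattested in the doculect at all.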
    for i, t in enumerate(wordlist.cols):
        previous = ''
        for cogid, concept in concepts:
            if previous != concept:
                previous = concept
                # add ascertainment character for mode=BEASTWORDS. Note that if
                # a given word:language is missing, then its ascertainment
                # character is the `missing` character.
                if mode == "BEASTWORDS":
                    matrix[i] += ['0'] if concept not in missing_[t] else [missing]
            matrix[i] += ['1'] if etd[cogid][i] else ['0'] if concept not in \
                missing_[t] else [missing]

    # parse characters into `charsets` (a dict of word=>siteindex positions),
    # and `chars` (a list of characters).
    charsets, chars, previous = defaultdict(list), [], ''
    for i, (cogid, concept) in enumerate(concepts, 1):
        char = util.nexus_slug(concept)
        # add label for ascertainment character in BEAST mode
        if i == 1 and mode == 'BEAST':
            chars.append("_ascertainment")
        # add label for per-word ascertainment characters in BEASTWORDS
        if mode == 'BEASTWORDS' and previous != concept:
            chars.append("%s_ascertainment" % char)
            charsets[char].append(len(chars))
        # finally add label.
        chars.append(char)
        charsets[char].append(len(chars))
        previous = concept
    
    # create character labels block if needed
    if mode in ('BEAST', 'BEASTWORDS'):
        charblock = ",\n".join(["\t%d %s" % o for o in enumerate(chars, 1)])
    else:
        charblock = ""
    
    # create charsets block
    blockname, assumptions = None, ""
    if mode in ('BEASTWORDS', 'MRBAYES'):
        charsets = ["\tcharset %s = %d-%d;" % (
            c, min(m), max(m)) for (c, m) in charsets.items()
        ]
        blockname = 'ASSUMPTIONS' if mode == 'BEASTWORDS' else 'MRBAYES'
        assumptions = "\n".join(charsets)
    
    # commands
    if commands_name.upper() == blockname and len(assumptions) and commands:
        # merge commands specified in function call into output blockname
        assumptions += "\n" + "\n".join("\t%s" % c for c in commands)
    else:
        # different commands block set in commands_name.
        assumptions += block.format(commands_name, '\n'.join(commands)) if commands else ''
    
    # convert state matrix to string.
    _matrix = ""
    maxtaxlen = max([len(util.nexus_slug(t)) for t in wordlist.cols]) + 1
    for taxon, m in zip(wordlist.cols, matrix):
        _matrix += (util.nexus_slug(taxon) + maxtaxlen * ' ')[:maxtaxlen] + ' '
        _matrix += ''.join([
            '({0})'.format(c) if len(c) > 1 else str(c) for c in m
        ]) + '\n'
    _matrix = _matrix.rstrip()  # remove trailing whitespace

    # TODO: symbols could be more than "01", but this function doesn't handle
    # multistate data, so we just specify them here.
    symbols = '01'

    text = _template.format(
        matrix=_matrix,
        ntax=wordlist.width,
        nchar=len(matrix[0]),
        gap=gap, missing=missing,
        dtype='RESTRICTION' if mode == 'MRBAYES' else 'STANDARD',
        commands=block.format(blockname, assumptions),
        custom=block.format(custom_name, '\n'.join(custom)) if custom else '',
        symbols=symbols, chars=charblock
    )
    text = text.replace("\t", " " * 4)  # normalise tab-stops
    for i, (cogid, concept) in enumerate(concepts, 1):
        text += '\n[MATRIX:{0}=COGID:{1}=CONCEPT:{2}]'.format(i, cogid, concept)
    if filename:
        util.write_text_file(filename, text)
    return text
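
A brief usage sketch for write_nexus, assuming the module path implied by the
header above (lingpy.convert.strings); the input file name and the MrBayes
command are illustrative only:

from lingpy import Wordlist
from lingpy.convert.strings import write_nexus

# Hypothetical wordlist file; it must contain a cognate-set column ("cogid").
wl = Wordlist('cognates.tsv')

# Writes a MrBayes-formatted nexus file and returns the content as a string.
# The extra command is merged into the generated MRBAYES block.
nexus = write_nexus(wl, mode='mrbayes', filename='cognates.nex',
                    ref='cogid', commands=['lset rates=gamma;'])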