Python tokens2classの例、lingpy.sequence.sound_classes.tokens2class Pythonの例

コード例 #1

0

ファイルを表示

    def evaluate_string(self, string, tokens=False, **keywords):
        """
        Score a word against the transition lists stored in ``self.dist``.

        Parameters
        ----------
        string : str
            The word to evaluate. It is only segmented (with ``ipa2tokens``)
            when no ``tokens`` are passed.
        tokens : list or bool (default=False)
            The segmented form of the word. If falsy, ``string`` is segmented.
        **keywords
            Handed on to ``tokens2class`` (articulation model) and to
            ``prosodic_string``.

        Returns
        -------
        score, lscore : tuple
            ``score`` is the product of the relative frequency of every
            (prosody, symbol) pair within the list of its preceding context;
            ``lscore`` is ``log10(score)`` divided by the number of tokens.

        Notes
        -----
        The factor of the last segment is multiplied into ``score`` twice.
        NOTE(review): this may have been meant to model the word end; confirm
        before changing it.
        """
        if not tokens:
            tokens = ipa2tokens(string)

        # the walk starts in the context stored under '#'
        context = self.dist['#']

        art_classes = tokens2class(tokens, model=rcParams['art'], **keywords)
        prostring = prosodic_string(art_classes, **keywords)

        # pair the prosodic symbols with sound classes or with the raw tokens
        if self.classes:
            symbols = tokens2class(tokens, model=self.model)
        else:
            symbols = tokens

        score = 1
        for segment in list(zip(prostring, symbols)):
            s = context.count(segment) / len(context)
            score = score * s
            context = self.dist[segment]

        # the last factor is applied once more (see Notes)
        score = score * s

        lscore = np.log10(score) / len(tokens)
        return score, lscore

コード例 #2

0

ファイルを表示

    def __init__(self,
                 words,
                 tokens=False,
                 prostrings=[],
                 classes=False,
                 class_model=rcParams['model'],
                 **keywords):
        """
        Collect (prosody, symbol) pairs for all words and set up the base class.

        Parameters
        ----------
        words : list
            The words to learn from. Each item is segmented with
            ``ipa2tokens`` unless ``tokens`` is true.
        tokens : bool (default=False)
            If true, every item of ``words`` is taken to be segmented already
            and is only copied.
        prostrings : list (default=[])
            Prosodic strings, one per word and in the same order. If empty,
            they are computed with ``prosodic_string``. The default list is
            never modified here.
        classes : bool (default=False)
            If true, the tokens are converted with ``class_model`` and the
            classes (instead of the tokens) are paired with the prosody.
        class_model : Model (default=rcParams['model'])
            The sound-class model used when ``classes`` is true.
        **keywords
            ``stress``, ``diacritics`` and ``cldf`` are filled with defaults
            and passed on to the segmentation and conversion helpers.
        """
        setdefaults(keywords,
                    stress=rcParams['stress'],
                    diacritics=rcParams['diacritics'],
                    cldf=False)
        self.model = class_model
        self.words = words
        self.tokens = []
        self.bigrams = []
        self.classes = []

        for i, w in enumerate(words):

            # segment the word unless it is tokenized already
            tk = w[:] if tokens else ipa2tokens(w, **keywords)
            self.tokens.append(tk)

            # create the prosodic string
            if prostrings:
                p = prostrings[i]
            else:
                # Result unused; the call is kept because it presumably makes
                # tokens unknown to the articulation model fail right here.
                # TODO(review): drop it if prosodic_string validates itself.
                tokens2class(tk, rcParams['art'])
                p = prosodic_string(tk,
                                    rcParams['art'],
                                    cldf=keywords['cldf'],
                                    diacritics=keywords['diacritics'],
                                    stress=keywords['stress'])

            # pair the prosody with sound classes or with the tokens
            if classes:
                c = tokens2class(tk,
                                 class_model,
                                 cldf=keywords['cldf'],
                                 diacritics=keywords['diacritics'],
                                 stress=keywords['stress'])
                self.classes.append(c)
                bigrams = list(zip(p, c))
            else:
                bigrams = list(zip(p, tk))

            self.bigrams.append(bigrams)

        # Initialise the base class once, with the complete data. This used to
        # run on every iteration and not at all for empty input; it assumes
        # MCBasic.__init__ builds its state from scratch on each call.
        MCBasic.__init__(self, self.bigrams)

コード例 #3

0

ファイルを表示

ファイル: generate.py プロジェクト: LinguList/lingpy

    def __init__(
        self,
        words,
        tokens=False,
        prostrings=[],
        classes=False,
        class_model=rcParams['model'],
        **keywords
    ):
        """
        Collect (prosody, symbol) pairs for all words and set up the base class.

        Parameters
        ----------
        words : list
            The words to learn from. Each item is segmented with
            ``ipa2tokens`` unless ``tokens`` is true.
        tokens : bool (default=False)
            If true, every item of ``words`` is taken to be segmented already
            and is only copied.
        prostrings : list (default=[])
            Prosodic strings, one per word and in the same order. If empty,
            they are computed with ``prosodic_string``. The default list is
            never modified here.
        classes : bool (default=False)
            If true, the tokens are converted with ``class_model`` and the
            classes (instead of the tokens) are paired with the prosody.
        class_model : Model (default=rcParams['model'])
            The sound-class model used when ``classes`` is true.
        **keywords
            ``stress``, ``diacritics`` and ``cldf`` are filled with defaults
            and passed on to the segmentation and conversion helpers.
        """
        setdefaults(keywords, stress=rcParams['stress'],
                    diacritics=rcParams['diacritics'], cldf=False)
        self.model = class_model
        self.words = words
        self.tokens = []
        self.bigrams = []
        self.classes = []

        for i, w in enumerate(words):

            # segment the word unless it is tokenized already
            tk = w[:] if tokens else ipa2tokens(w, **keywords)
            self.tokens.append(tk)

            # create the prosodic string
            if prostrings:
                p = prostrings[i]
            else:
                # Result unused; the call is kept because it presumably makes
                # tokens unknown to the articulation model fail right here.
                # TODO(review): drop it if prosodic_string validates itself.
                tokens2class(tk, rcParams['art'])
                p = prosodic_string(
                    tk,
                    rcParams['art'],
                    cldf=keywords['cldf'],
                    diacritics=keywords['diacritics'],
                    stress=keywords['stress'])

            # pair the prosody with sound classes or with the tokens
            if classes:
                c = tokens2class(tk, class_model, cldf=keywords['cldf'],
                                 diacritics=keywords['diacritics'],
                                 stress=keywords['stress'])
                self.classes.append(c)
                bigrams = list(zip(p, c))
            else:
                bigrams = list(zip(p, tk))

            self.bigrams.append(bigrams)

        # Initialise the base class once, with the complete data. This used to
        # run on every iteration and not at all for empty input; it assumes
        # MCBasic.__init__ builds its state from scratch on each call.
        MCBasic.__init__(self, self.bigrams)

コード例 #4

0

ファイルを表示

def test_tokens2class():
    """Check class conversion, the ``clpa`` switch and the error on empty tokens."""
    tokens = 'tʰ ɔ x ˈth ə r A ˈI ʲ'.split(' ')
    with_alternative = 'th o ?/x a'.split(' ')
    without_alternative = 'th o ?/ a'.split(' ')

    # the last three tokens are expected to map to the class '0'
    dolgo = tokens2class(tokens, 'dolgo')
    assert dolgo == list('TVKTVR000')

    # index 2 addresses the token that contains the slash
    plain = tokens2class(with_alternative, 'cv')
    assert plain[2] == '0'
    resolved = tokens2class(with_alternative, 'cv', clpa=True)
    assert resolved[2] == 'C'
    unresolved = tokens2class(without_alternative, 'cv', clpa=True)
    assert unresolved[2] == '0'

    # the doubled blank produces an empty token in the middle
    assert_raises(IndexError, tokens2class, 'b  l'.split(' '), 'dolgo')

コード例 #5

0

ファイルを表示

    def test_tokens2class(self):
        """Check class conversion, the ``cldf`` switch and invalid input."""
        tokens = 'tʰ ɔ x ˈth ə r A ˈI ʲ'.split(' ')
        with_alternative = 'th o ?/x a'.split(' ')
        without_alternative = 'th o ?/ a'.split(' ')

        # the last three tokens are expected to map to the class '0'
        dolgo = tokens2class(tokens, 'dolgo')
        assert dolgo == list('TVKTVR000')

        # index 2 addresses the token that contains the slash
        plain = tokens2class(with_alternative, 'cv', cldf=False)
        assert plain[2] == '0'
        resolved = tokens2class(with_alternative, 'cv')
        assert resolved[2] == 'C'
        unresolved = tokens2class(without_alternative, 'cv', cldf=True)
        assert unresolved[2] == '0'

        # a sequence of unknown tokens only, and a plain string as input
        assert_raises(ValueError, tokens2class, ['A'], 'dolgo')
        assert_raises(ValueError, tokens2class, 'bla', 'sca')

コード例 #6

0

ファイルを表示

ファイル: test_sound_classes.py プロジェクト: LinguList/lingpy

    def test_tokens2class(self):
        """Check class conversion, the ``cldf`` switch and invalid input."""
        tokens = 'tʰ ɔ x ˈth ə r A ˈI ʲ'.split(' ')
        with_alternative = 'th o ?/x a'.split(' ')
        without_alternative = 'th o ?/ a'.split(' ')

        # the last three tokens are expected to map to the class '0'
        dolgo = tokens2class(tokens, 'dolgo')
        assert dolgo == list('TVKTVR000')

        # index 2 addresses the token that contains the slash
        plain = tokens2class(with_alternative, 'cv', cldf=False)
        assert plain[2] == '0'
        resolved = tokens2class(with_alternative, 'cv')
        assert resolved[2] == 'C'
        unresolved = tokens2class(without_alternative, 'cv', cldf=True)
        assert unresolved[2] == '0'

        # a sequence of unknown tokens only, and a plain string as input
        assert_raises(ValueError, tokens2class, ['A'], 'dolgo')
        assert_raises(ValueError, tokens2class, 'bla', 'sca')

コード例 #7

0

ファイルを表示

def test_tokens2class():
    """Check the Dolgopolsky classes of a token list and the error on empty tokens."""
    tokens = 'tʰ ɔ x ˈth ə r A ˈI'.split(' ')

    # the last two tokens are expected to map to the class '0'
    dolgo = tokens2class(tokens, 'dolgo')
    assert dolgo == list('TVKTVR00')

    # the doubled blank produces an empty token in the middle
    assert_raises(ValueError, tokens2class, 'b  l'.split(' '), 'dolgo')

コード例 #8

0

ファイルを表示

ファイル: test_sound_classes.py プロジェクト: sflavier/lingpy

def test_tokens2class():
    """Check the Dolgopolsky classes of a token list and the error on empty tokens."""
    tokens = 'tʰ ɔ x ˈth ə r A ˈI'.split(' ')

    # the last two tokens are expected to map to the class '0'
    dolgo = tokens2class(tokens, 'dolgo')
    assert dolgo == list('TVKTVR00')

    # the doubled blank produces an empty token in the middle
    assert_raises(ValueError, tokens2class, 'b  l'.split(' '), 'dolgo')

コード例 #9

0

ファイルを表示

ファイル: pairwise.py プロジェクト: anukat2015/lingpy

def turchin(seqA, seqB, model='dolgo', **keywords):
    """
    Return cognate judgment based on the method by :evobib:`Turchin2010`.

    Two words are judged cognate when their first two consonant classes are
    identical. A word-initial vowel class is replaced by ``'H'`` so that it
    takes part in the comparison; all other vowel classes are ignored.

    Parameters
    ----------
    seqA, seqB : {str, list, tuple}
        The input strings. These should be iterables, so you can use tuples,
        lists, or strings. Each string is segmented with ``ipa2tokens``
        independently of the type of the other argument.
    model : {"asjp", "sca", "dolgo"} (default="dolgo")
        A sound-class model instance or a string that denotes one of the
        standard sound class models used in LingPy.
    **keywords
        Accepted for compatibility; not used by this function.

    Returns
    -------
    cognacy : {0, 1}
        The cognacy assertion which is either 0 (words are probably cognate) or
        1 (words are not likely to be cognate).

    Raises
    ------
    ValueError
        If ``model`` is neither a string nor an object with an ``info``
        attribute.

    """
    if text_type(model) == model:
        model = rcParams[model]
    elif not hasattr(model, 'info'):
        raise ValueError("[!] No valid model instance selected.")

    # Tokenize each input on its own: previously seqB was only segmented when
    # seqA happened to be a string, which broke mixed str/list input.
    if isinstance(seqA, (text_type, str)):
        seqA = ipa2tokens(seqA)
    if isinstance(seqB, (text_type, str)):
        seqB = ipa2tokens(seqB)

    classA = tokens2class(seqA, model)
    classB = tokens2class(seqB, model)

    # an initial vowel counts as a consonant-like onset
    if classA[0] in model.vowels:
        classA[0] = 'H'
    if classB[0] in model.vowels:
        classB[0] = 'H'

    consonantsA = ''.join([k for k in classA if k not in model.vowels])[:2]
    consonantsB = ''.join([k for k in classB if k not in model.vowels])[:2]
    return 0 if consonantsA == consonantsB else 1

コード例 #10

0

ファイルを表示

ファイル: utility.py プロジェクト: peterdekker/prediction-histling

def ipa_to_asjp(w):
    """
    Convert an IPA word to ASJP with LingPy and tidy up the result.

    Double quotes, hyphens and blanks are removed before the word is
    segmented (vowels are not merged) and mapped to ASJP classes. The class
    string is then cleaned (``0`` dropped, ``I`` -> ``3``, ``H`` -> ``N``,
    followed by ``clean_asjp``) and reduced to the symbols contained in the
    module-level ``sounds`` collection.

    This function is called on IPA datasets.
    """
    stripped = w.replace('\"', '').replace('-', '').replace(' ', '')
    segments = ipa2tokens(stripped, merge_vowels=False)
    raw_asjp = ''.join(tokens2class(segments, 'asjp'))
    remapped = raw_asjp.replace('0', '').replace('I', '3').replace('H', 'N')
    cleaned = clean_asjp(remapped)
    return ''.join(symbol for symbol in cleaned if symbol in sounds)

コード例 #11

0

ファイルを表示

    def _set_model(self, **keywords):
        """
        Define the sequence model for the calculation.

        Converts every token pair in ``self.tokens`` to sound classes, derives
        prosodic weights for every pair in ``self.prostrings`` and stores the
        scorer of the model in ``self.scoredict``.

        Parameters
        ----------
        model : { str, Model } (default=rcParams['sca'])
            The sound-class model to which the strings shall be converted. A
            string is looked up in ``rcParams``.
        stress : (default=rcParams['stress'])
            Passed to ``tokens2class`` as ``stress``.
        transform : (default=rcParams['align_transform'])
            Passed to ``prosodic_weights`` as ``_transform``.
        """
        fallback = dict(
            model=rcParams['sca'],
            stress=rcParams['stress'],
            transform=rcParams['align_transform'])
        for name, value in fallback.items():
            keywords.setdefault(name, value)

        # a model given by name is resolved through rcParams
        chosen = keywords['model']
        if isinstance(chosen, (text_type, str)):
            self.model = rcParams[chosen]
        else:
            self.model = chosen

        # sound classes for both members of each token pair
        self.classes = []
        for pair in self.tokens:
            converted = (
                tokens2class(pair[0], self.model, stress=keywords['stress']),
                tokens2class(pair[1], self.model, stress=keywords['stress']))
            self.classes.append(converted)

        # prosodic weights for both members of each prosodic-string pair
        self.weights = []
        for prA, prB in self.prostrings:
            weighted = (
                prosodic_weights(prA, _transform=keywords['transform']),
                prosodic_weights(prB, _transform=keywords['transform']))
            self.weights.append(weighted)

        self.scoredict = self.model.scorer

コード例 #12

0

ファイルを表示

ファイル: pairwise.py プロジェクト: LinguList/lingpy

    def _set_model(self, **keywords):
        """
        Define the sequence model for the calculation.

        Converts every token pair in ``self.tokens`` to sound classes, derives
        prosodic weights for every pair in ``self.prostrings`` and stores the
        scorer of the model in ``self.scoredict``.

        Parameters
        ----------
        model : { str, Model } (default=rcParams['sca'])
            The sound-class model to which the strings shall be converted. A
            string is looked up in ``rcParams``.
        stress : (default=rcParams['stress'])
            Passed to ``tokens2class`` as ``stress``.
        transform : (default=rcParams['align_transform'])
            Passed to ``prosodic_weights`` as ``_transform``.
        """
        fallback = dict(
            model=rcParams['sca'],
            stress=rcParams['stress'],
            transform=rcParams['align_transform'])
        for name, value in fallback.items():
            keywords.setdefault(name, value)

        # a model given by name is resolved through rcParams
        chosen = keywords['model']
        if isinstance(chosen, (text_type, str)):
            self.model = rcParams[chosen]
        else:
            self.model = chosen

        # sound classes for both members of each token pair
        self.classes = []
        for pair in self.tokens:
            converted = (
                tokens2class(pair[0], self.model, stress=keywords['stress']),
                tokens2class(pair[1], self.model, stress=keywords['stress']))
            self.classes.append(converted)

        # prosodic weights for both members of each prosodic-string pair
        self.weights = []
        for prA, prB in self.prostrings:
            weighted = (
                prosodic_weights(prA, _transform=keywords['transform']),
                prosodic_weights(prB, _transform=keywords['transform']))
            self.weights.append(weighted)

        self.scoredict = self.model.scorer

コード例 #13

0

ファイルを表示

def turchin(seqA, seqB, model='dolgo', **keywords):
    """
    Return cognate judgment based on the method by :evobib:`Turchin2010`.

    Two words are judged cognate when their first two consonant classes are
    identical. A word-initial vowel class is replaced by ``'H'`` so that it
    takes part in the comparison; all other vowel classes are ignored.

    Parameters
    ----------
    seqA, seqB : {str, list, tuple}
        The input strings. These should be iterables, so you can use tuples,
        lists, or strings. Each string is segmented with ``ipa2tokens``
        independently of the type of the other argument.
    model : {"asjp", "sca", "dolgo"} (default="dolgo")
        A sound-class model instance or a string that denotes one of the
        standard sound class models used in LingPy.
    **keywords
        Accepted for compatibility; not used by this function.

    Returns
    -------
    cognacy : {0, 1}
        The cognacy assertion which is either 0 (words are probably cognate) or
        1 (words are not likely to be cognate).

    Raises
    ------
    ValueError
        If ``model`` is neither a string nor an object with an ``info``
        attribute.

    """
    if text_type(model) == model:
        model = rcParams[model]
    elif not hasattr(model, 'info'):
        raise ValueError("[!] No valid model instance selected.")

    # Tokenize each input on its own: previously seqB was only segmented when
    # seqA happened to be a string, which broke mixed str/list input.
    if isinstance(seqA, string_types):
        seqA = ipa2tokens(seqA)
    if isinstance(seqB, string_types):
        seqB = ipa2tokens(seqB)

    classA = tokens2class(seqA, model)
    classB = tokens2class(seqB, model)

    # an initial vowel counts as a consonant-like onset
    if classA[0] in model.vowels:
        classA[0] = 'H'
    if classB[0] in model.vowels:
        classB[0] = 'H'

    consonantsA = ''.join([k for k in classA if k not in model.vowels])[:2]
    consonantsB = ''.join([k for k in classB if k not in model.vowels])[:2]
    return int(consonantsA != consonantsB)

コード例 #14

0

ファイルを表示

ファイル: utils.py プロジェクト: PhyloStar/svmcc

def ipa_to_asjp(w, params):
    """
    Convert an IPA word to ASJP with LingPy and tidy up the result.

    Double quotes, hyphens and blanks are removed before the word is
    segmented (vowels are not merged) and mapped to ASJP classes. The class
    string is then cleaned (``0`` dropped, ``I`` -> ``3``, ``H`` -> ``N``,
    followed by ``clean_asjp``) and reduced to the symbols listed under
    ``params['sounds']``. An assertion guards against an empty result.

    This function is called on IPA datasets.
    """
    stripped = w.replace('\"', '').replace('-', '').replace(' ', '')
    segments = ipa2tokens(stripped, merge_vowels=False)
    raw_asjp = ''.join(tokens2class(segments, 'asjp'))
    remapped = raw_asjp.replace('0', '').replace('I', '3').replace('H', 'N')
    cleaned = clean_asjp(remapped)
    asjp = ''.join(symbol for symbol in cleaned if symbol in params['sounds'])
    assert len(asjp) > 0
    return asjp

コード例 #15

0

ファイルを表示

def get_structure(word,
                  sep='+',
                  zipped=False,
                  semi_diacritics='hsʃʂʒʐzθɕʑfvθðnmȵ'):
    """
    Yield the slot structure of every morpheme of a word (generator).

    Parameters
    ----------
    word : str, list or tuple
        The word. Anything that is not a list or tuple is segmented with
        ``lingpy.ipa2tokens`` (nasals expanded, vowels not merged).
    sep : str (default='+')
        Morpheme separator looked for among the tokens.
    zipped : bool (default=False)
        Selects the shape of the yielded items, see below.
    semi_diacritics : str
        Passed to ``lingpy.ipa2tokens`` when the word is segmented here.

    Yields
    ------
    list or tuple
        If ``zipped`` is false, the labels from ``'imnct'`` whose segment is
        not ``'-'``. If ``zipped`` is true, a tuple of the (label, segment)
        pairs whose segment is not ``'-'`` and the morpheme itself. For a
        morpheme that ``parse_chinese_morphemes`` cannot handle, exactly one
        placeholder is yielded: ``['NULL']`` or
        ``([('NULL', 'NULL')], morpheme)``.

    Notes
    -----
    If the tokens cannot be converted to sound classes (``ValueError``), a
    message is printed and nothing is yielded.
    """
    if not isinstance(word, (list, tuple)):
        word = lingpy.ipa2tokens(word,
                                 expand_nasals=True,
                                 merge_vowels=False,
                                 semi_diacritics=semi_diacritics)

    # check for unknown chars
    try:
        tokens2class(word, 'cv', cldf=True)
    except ValueError:
        print('problem with {0}'.format(''.join(word)))
        return

    # get the morphemes
    if sep in word:
        morphemes = []
        for part in tokens2morphemes(word, cldf=True):
            morphemes += tokens2morphemes(part, sep=sep)
    else:
        morphemes = tokens2morphemes(word, cldf=True)

    # get the basic structure for each morpheme
    for morpheme in morphemes:
        try:
            segments = parse_chinese_morphemes(morpheme)
        except Exception:
            # Unparsable morpheme: emit the placeholder and move on. Without
            # the ``continue`` the code below ran with ``segments`` unbound
            # (first morpheme) or left over from the previous morpheme.
            if not zipped:
                yield ['NULL']
            else:
                yield ([('NULL', 'NULL')], morpheme)
            continue

        if not zipped:
            yield [x for x, y in zip('imnct', segments) if y != '-']
        else:
            yield ([x for x in zip('imnct', segments)
                    if x[1] != '-'], morpheme)

コード例 #16

0

ファイルを表示

ファイル: lingpy_util.py プロジェクト: marctang/lexibank-data-old

def iter_cognates(dataset,
                  column='Segments',
                  method='turchin',
                  threshold=0.5,
                  **keywords):
    """
    Compute cognates automatically for a given dataset (generator).

    With ``method='turchin'`` every row of ``dataset.rows`` is keyed by the
    slug of its ``Parameter_name`` plus its first two non-vowel Dolgopolsky
    classes (a word starting with a vowel class gets a leading ``H``); rows
    whose key contains the class ``0`` are skipped. With ``'sca'`` or
    ``'lexstat'`` the data is clustered through a LexStat object, using
    ``threshold``; ``keywords`` are only handed to ``get_scorer`` for
    ``'lexstat'``. Any other ``method`` yields nothing.

    Every yielded item is a 10-tuple: ID, dataset name, value, cognate ID, an
    empty field, the method label, and four empty fields (cognate source,
    alignment, alignment method, alignment source).
    """
    if method == 'turchin':
        for row in dataset.rows:
            classes = ''.join(tokens2class(row[column].split(' '), 'dolgo'))
            if classes.startswith('V'):
                classes = 'H' + classes
            key = '-'.join([cls for cls in classes if cls != 'V'][:2])
            cogid = slug(row['Parameter_name']) + '-' + key
            if '0' in key:
                # unknown sound among the first two consonant classes
                continue
            yield (
                row['ID'],
                dataset.name,
                row['Value'],
                cogid,
                '',
                'CMM',
                '',  # cognate source
                '',  # alignment
                '',  # alignment method
                '',  # alignment source
            )

    if method not in ['sca', 'lexstat']:
        return

    lex = _cldf2lexstat(dataset)
    if method == 'lexstat':
        lex.get_scorer(**keywords)
    lex.cluster(method=method, threshold=threshold, ref='cogid')
    label = method + '-t{0:.2f}'.format(threshold)
    for k in lex:
        yield (
            lex[k, 'lid'],
            dataset.name,
            lex[k, 'value'],
            lex[k, 'cogid'],
            '',
            label,
            '',  # cognate source
            '',  # alignment
            '',  # alignment method
            '',  # alignment source
        )

コード例 #17

0

ファイルを表示

ファイル: lingpy_util.py プロジェクト: marctang/lexibank-data-old

def test_sequence(sequence, **keywords):
    """
    Test a sequence for compatibility with CLPA and LingPy.

    The sequence is cleaned with ``clean_string`` (only the first result is
    used), split on blanks and checked twice: segments whose Dolgopolsky class
    is ``0`` count as LingPy errors, and the segments are run through
    ``clpa.check_sequence``. If cleaning or checking raises ``ValueError``,
    ``IndexError`` or ``AttributeError``, the sequence is counted as invalid
    and no segments are reported.

    Returns
    -------
    tuple
        ``(segments, clpa_segments, invalid, segment_count, lingpy_errors,
        clpa_errors, clpa_repl, general_errors)``, where ``clpa_segments`` is
        the CLPA analysis passed through ``clpa.segment2clpa``.
    """
    invalid = Counter()
    segment_count = Counter()
    lingpy_errors = set()
    clpa_errors = set()
    clpa_repl = defaultdict(set)
    general_errors = 0

    # clean the string at first, we only take the first item, ignore the rest
    try:
        segments = clean_string(sequence, **keywords)[0].split(' ')
        dolgo = tokens2class(segments, 'dolgo')
        lingpy_analysis = [
            '?' if cls == '0' else seg for seg, cls in zip(segments, dolgo)
        ]
        clpa_analysis, _sounds, _errors = clpa.check_sequence(segments)
        # positions at which at least one of the two analyses reports '?'
        general_errors = sum(
            1 for pair in zip(lingpy_analysis, clpa_analysis) if '?' in pair)
    except (ValueError, IndexError, AttributeError):
        invalid[sequence] += 1
        segments, clpa_analysis = [], []

    if segments:
        for seg, lingpy_seg, clpa_seg in zip(
                segments, lingpy_analysis, clpa_analysis):
            # compare without a leading accent mark
            if seg[0] in clpa.accents:
                seg = seg[1:]
            if clpa_seg[0] in clpa.accents:
                clpa_seg = clpa_seg[1:]

            segment_count[seg] += 1
            if lingpy_seg == '?':
                lingpy_errors.add(seg)
            if clpa_seg == seg:
                continue
            if clpa_seg == '?':
                clpa_errors.add(seg)
            else:
                clpa_repl[seg].add(clpa_seg)

    clpa_segments = [clpa.segment2clpa(x) for x in clpa_analysis]
    return (segments, clpa_segments, invalid, segment_count,
            lingpy_errors, clpa_errors, clpa_repl, general_errors)

コード例 #18

0

ファイルを表示

ファイル: html.py プロジェクト: LinguList/lingpy

def msa2html(
    msa,
    shorttitle='',
    filename='',
    template='',
    **keywords
):
    """
    Convert files in ``msa``-format into colored ``html``-format.

    Parameters
    ----------
    msa : str
        The path of a file in ``msa``-format, which is loaded with
        ``read_msa``. NOTE(review): as the code stands, any non-string value
        (including an already loaded dictionary) raises ``ValueError``.

    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``SCA`` will be used.

    filename : str (default="")
        Define the name of the output file. If no name is defined,
        ``rcParams['filename']`` is used. The suffix ``.html`` is appended
        when it is missing.

    template : str (default="")
        The path to the template file. If no name is defined, the basic
        template will be used. The basic template currently used can be found
        under ``lingpy/data/templates/msa2html.html``. The value ``"js"``
        selects the template ``msa2html.js.html``.

    Other Parameters
    ----------------
    pid_mode : int (default=1)
        Passed as ``mode`` to ``pid`` when the identity score is computed.
    pid_score : int
        A precomputed identity score; if given, none is calculated.
    css, js : str or False (default=False)
        Paths of alternative style and script files; the bundled ``msa.css``
        and ``msa.js`` are used otherwise.
    compact : bool (default=False)
        Collapse whitespace in the resulting page.
    class_sort : bool (default=True)
        Rank the sequences by their ASJP sound classes instead of by their
        plain text.
    write_to_file : bool (default=True)
        Write the page to ``filename``. If false, the page is returned.

    Returns
    -------
    html : str or None
        The page as a string if ``write_to_file`` is false, otherwise
        ``None``.

    Raises
    ------
    ValueError
        If ``msa`` is not a string.

    Examples
    --------
    Load the library.

    >>> from lingpy import *

    Load an ``msq``-file from the test-sets.

    >>> msa = MSA('harry.msq')

    Align the data progressively and carry out a check for swapped sites.

    >>> msa.prog_align()
    >>> msa.swap_check()
    >>> print(msa)
    w    o    l    -    d    e    m    o    r    t
    w    a    l    -    d    e    m    a    r    -
    v    -    l    a    d    i    m    i    r    -

    Save the data to the file ``harry.msa``.

    >>> msa.output('msa',filename='harry')

    Save the ``msa``-object as ``html``.

    >>> msa.output('html',filename='harry')

    Notes
    -----
    Every residue cell carries a CSS class derived from the Dolgopolsky sound
    class of its segment (``dolgo_<class>``); the coloring itself is defined
    in the style sheet.

    See also
    --------
    lingpy.convert.html.alm2html
    """
    util.setdefaults(
        keywords,
        pid_mode=1,
        stress=rcParams['stress'],
        css=False,
        js=False,
        compact=False,
        class_sort=True,
        write_to_file=True,
    )

    # load templates
    template = template or template_path('msa2html.html')
    if template == 'js':
        template = template_path('msa2html.js.html')
    html = util.read_text_file(template)
    css = util.read_text_file(keywords['css'] or template_path('msa.css'))
    js = util.read_text_file(keywords['js'] or template_path('msa.js'))

    # treat the msa-object as a file and try to load the file if this is the
    # case
    if isinstance(msa, string_types):
        msa = read_msa(msa, **keywords)
    else:
        raise ValueError('[!] No filename specified.')

    # load dataset, etc.
    dataset = msa['dataset']

    # calculate pid score, if it is not passed as argument
    # NOTE(review): an alignment with fewer than two rows has no pairs, so the
    # division below raises ZeroDivisionError; pass ``pid_score`` in that case.
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    infile = msa['infile']
    seq_id = msa['seq_id']

    # define the titles etc.
    if not shorttitle:
        shorttitle = 'SCA'

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    # format css file
    css = css.replace('TAXON_LENGTH', str(taxl * 10))

    # number of columns of the alignment
    width = len(msa['alignment'][0])

    tr = '<tr class="msa" unique="{1}" taxon={2} sequence={3}>{0}</tr>\n'
    td_taxon = '<td class="taxon">{0}</td>'
    td_residue = '<td class="residue {1}">{0}</td>'
    td_swap = '<td class="residue swap {1}">{0}</td>'
    td_unaligned = '<td class="residue noalign {1}">{0}</td>'

    # flatten the swapped sites of the alignment into one list of columns
    swaps = []
    if 'swaps' in msa:
        for s in msa['swaps']:
            swaps.extend(s)

    # mark the columns that belong to the aligned part: all of them unless
    # the msa lists specific columns under 'local'
    if 'local' in msa:
        local = ['.'] * width
        for column in msa['local']:
            local[column] = '*'
    else:
        local = ['*'] * width

    # get two sorting schemas for the sequences
    if keywords['class_sort']:
        classes = [
            tokens2class(ipa2tokens(seq), rcParams['asjp'])
            for seq in msa['seqs']
        ]
        ranked = sorted(zip(classes, msa['seqs']), key=lambda x: x[0])
        seqs = {seq: rank for rank, (_, seq) in enumerate(ranked, 1)}
    else:
        seqs = dict(zip(sorted(msa['seqs']), range(1, len(msa['seqs']) + 1)))
    taxa = dict(zip(sorted(msa['taxa']), range(1, len(msa['taxa']) + 1)))

    # alignments already written, to flag the first occurrence as unique
    seen = set()

    rows = []
    for i, taxon in enumerate(msa['taxa']):
        cells = [td_taxon.format(taxon)]

        alignment = ''.join(msa['alignment'][i])
        sequence = msa['seqs'][i]
        if alignment in seen:
            unique = 'false'
        else:
            unique = 'true'
            seen.add(alignment)

        for j, char in enumerate(msa['alignment'][i]):
            if char == '-':
                d = 'dolgo_GAP'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])

                # bad check for three classes named differently
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'

            if j in swaps:
                cells.append(td_swap.format(char, d))
            elif local[j] != '*':
                cells.append(td_unaligned.format(char, d))
            else:
                cells.append(td_residue.format(char, d))
        rows.append(
            tr.format(''.join(cells), unique, taxa[taxon], seqs[sequence]))
    out = ''.join(rows)

    html = html.format(
        table=out,
        dataset=dataset,
        pid=pid_score,
        file=infile,
        sequence=seq_id,
        shorttitle=shorttitle,
        width=width,
        table_width='{0}'.format(width * 50 + 8 * taxl),
        taxa=len(msa['alignment']),
        uniseqs=len(set(msa['seqs'])),
        css=css,
        js=js
    )

    if not filename:
        filename = rcParams['filename']

    if not filename.endswith('.html'):
        filename = filename + '.html'

    if keywords['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    if keywords['write_to_file']:
        util.write_text_file(filename, html)
    else:
        return html

コード例 #19

0

ファイルを表示

ファイル: html.py プロジェクト: vermillionbee/lingpy

def msa2html(msa, shorttitle='', filename='', template='', **keywords):
    """
    Convert files in ``msa``-format into colored ``html``-format.

    Parameters
    ----------
    msa : str
        The path of a file in ``msa``-format, which is loaded with
        ``read_msa``. NOTE(review): as the code stands, any non-string value
        (including an already loaded dictionary) raises ``ValueError``.

    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``SCA`` will be used.

    filename : str (default="")
        Define the name of the output file. If no name is defined,
        ``rcParams['filename']`` is used. The suffix ``.html`` is appended
        when it is missing.

    template : str (default="")
        The path to the template file. If no name is defined, the basic
        template will be used. The basic template currently used can be found
        under ``lingpy/data/templates/msa2html.html``. The value ``"js"``
        selects the template ``msa2html.js.html``.

    Other Parameters
    ----------------
    pid_mode : int (default=1)
        Passed as ``mode`` to ``pid`` when the identity score is computed.
    pid_score : int
        A precomputed identity score; if given, none is calculated.
    css, js : str or False (default=False)
        Paths of alternative style and script files; the bundled ``msa.css``
        and ``msa.js`` are used otherwise.
    compact : bool (default=False)
        Collapse whitespace in the resulting page.
    class_sort : bool (default=True)
        Rank the sequences by their ASJP sound classes instead of by their
        plain text.
    write_to_file : bool (default=True)
        Write the page to ``filename``. If false, the page is returned.

    Returns
    -------
    html : str or None
        The page as a string if ``write_to_file`` is false, otherwise
        ``None``.

    Raises
    ------
    ValueError
        If ``msa`` is not a string.

    Examples
    --------
    Load the library.

    >>> from lingpy import *

    Load an ``msq``-file from the test-sets.

    >>> msa = MSA('harry.msq')

    Align the data progressively and carry out a check for swapped sites.

    >>> msa.prog_align()
    >>> msa.swap_check()
    >>> print(msa)
    w    o    l    -    d    e    m    o    r    t
    w    a    l    -    d    e    m    a    r    -
    v    -    l    a    d    i    m    i    r    -

    Save the data to the file ``harry.msa``.

    >>> msa.output('msa',filename='harry')

    Save the ``msa``-object as ``html``.

    >>> msa.output('html',filename='harry')

    Notes
    -----
    Every residue cell carries a CSS class derived from the Dolgopolsky sound
    class of its segment (``dolgo_<class>``); the coloring itself is defined
    in the style sheet.

    See also
    --------
    lingpy.convert.html.alm2html
    """
    util.setdefaults(
        keywords,
        pid_mode=1,
        stress=rcParams['stress'],
        css=False,
        js=False,
        compact=False,
        class_sort=True,
        write_to_file=True,
    )

    # load templates
    template = template or template_path('msa2html.html')
    if template == 'js':
        template = template_path('msa2html.js.html')
    html = util.read_text_file(template)
    css = util.read_text_file(keywords['css'] or template_path('msa.css'))
    js = util.read_text_file(keywords['js'] or template_path('msa.js'))

    # treat the msa-object as a file and try to load the file if this is the
    # case
    if isinstance(msa, string_types):
        msa = read_msa(msa, **keywords)
    else:
        raise ValueError('[!] No filename specified.')

    # load dataset, etc.
    dataset = msa['dataset']

    # calculate pid score, if it is not passed as argument
    # NOTE(review): an alignment with fewer than two rows has no pairs, so the
    # division below raises ZeroDivisionError; pass ``pid_score`` in that case.
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    infile = msa['infile']
    seq_id = msa['seq_id']

    # define the titles etc.
    if not shorttitle:
        shorttitle = 'SCA'

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    # format css file
    css = css.replace('TAXON_LENGTH', str(taxl * 10))

    # number of columns of the alignment
    width = len(msa['alignment'][0])

    tr = '<tr class="msa" unique="{1}" taxon={2} sequence={3}>{0}</tr>\n'
    td_taxon = '<td class="taxon">{0}</td>'
    td_residue = '<td class="residue {1}">{0}</td>'
    td_swap = '<td class="residue swap {1}">{0}</td>'
    td_unaligned = '<td class="residue noalign {1}">{0}</td>'

    # flatten the swapped sites of the alignment into one list of columns
    swaps = []
    if 'swaps' in msa:
        for s in msa['swaps']:
            swaps.extend(s)

    # mark the columns that belong to the aligned part: all of them unless
    # the msa lists specific columns under 'local'
    if 'local' in msa:
        local = ['.'] * width
        for column in msa['local']:
            local[column] = '*'
    else:
        local = ['*'] * width

    # get two sorting schemas for the sequences
    if keywords['class_sort']:
        classes = [
            tokens2class(ipa2tokens(seq), rcParams['asjp'])
            for seq in msa['seqs']
        ]
        ranked = sorted(zip(classes, msa['seqs']), key=lambda x: x[0])
        seqs = {seq: rank for rank, (_, seq) in enumerate(ranked, 1)}
    else:
        seqs = dict(zip(sorted(msa['seqs']), range(1, len(msa['seqs']) + 1)))
    taxa = dict(zip(sorted(msa['taxa']), range(1, len(msa['taxa']) + 1)))

    # alignments already written, to flag the first occurrence as unique
    seen = set()

    rows = []
    for i, taxon in enumerate(msa['taxa']):
        cells = [td_taxon.format(taxon)]

        alignment = ''.join(msa['alignment'][i])
        sequence = msa['seqs'][i]
        if alignment in seen:
            unique = 'false'
        else:
            unique = 'true'
            seen.add(alignment)

        for j, char in enumerate(msa['alignment'][i]):
            if char == '-':
                d = 'dolgo_GAP'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])

                # bad check for three classes named differently
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'

            if j in swaps:
                cells.append(td_swap.format(char, d))
            elif local[j] != '*':
                cells.append(td_unaligned.format(char, d))
            else:
                cells.append(td_residue.format(char, d))
        rows.append(
            tr.format(''.join(cells), unique, taxa[taxon], seqs[sequence]))
    out = ''.join(rows)

    html = html.format(
        table=out,
        dataset=dataset,
        pid=pid_score,
        file=infile,
        sequence=seq_id,
        shorttitle=shorttitle,
        width=width,
        table_width='{0}'.format(width * 50 + 8 * taxl),
        taxa=len(msa['alignment']),
        uniseqs=len(set(msa['seqs'])),
        css=css,
        js=js)

    if not filename:
        filename = rcParams['filename']

    if not filename.endswith('.html'):
        filename = filename + '.html'

    if keywords['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    if keywords['write_to_file']:
        util.write_text_file(filename, html)
    else:
        return html