Ejemplo n.º 1
0
def test_token2class():
    """token2class maps single IPA tokens onto Dolgopolsky sound classes."""
    tokens = 'tʰ ɔ x ˈth ə r A'.split()

    # the sound-class model may be passed as an rc-loaded object or by name
    assert token2class(tokens[0], rc('dolgo')) == 'T'
    assert token2class(tokens[3], 'dolgo') == 'T'
    # unrecognized input falls back onto the error class '0'
    assert token2class(tokens[-1], 'dolgo') == '0'
Ejemplo n.º 2
0
    def test_token2class(self):
        """token2class maps single IPA tokens onto Dolgopolsky sound classes."""
        tokens = 'tʰ ɔ x ˈth ə r A'.split()

        # the sound-class model may be passed as an rc-loaded object or by name
        assert token2class(tokens[0], rc('dolgo')) == 'T'
        assert token2class(tokens[3], 'dolgo') == 'T'
        # unrecognized or empty input falls back onto the error class '0'
        assert token2class(tokens[-1], 'dolgo') == '0'
        assert token2class('', 'dolgo') == '0'
Ejemplo n.º 3
0
def seg2class(segment, sca=False):
    """Map a segment onto a coarse class label.

    With ``sca=False`` (default) returns ``'vowel'`` or ``'cons'`` based on
    the Dolgopolsky class; with ``sca=True`` returns the SCA class and, as a
    side effect, records the (class, segment) pair in the module-level
    ``contexts`` set.  Boundary/gap markers pass through unchanged.
    """
    # boundary and gap symbols are returned as-is
    if segment in ('#', '-'):
        return segment
    if sca:
        # compute the class once instead of calling token2class twice
        cls = token2class(segment, 'sca')
        contexts.add((cls, segment))
        return cls
    return 'vowel' if token2class(segment, 'dolgo') == 'V' else 'cons'
Ejemplo n.º 4
0
def _make_package(args):  # pragma: no cover
    """Prepare transcriptiondata from the transcription sources.

    Reads every source of type 'td' from ``args.repos``, maps each of its
    graphemes onto the BIPA transcription system and writes one TSV file per
    source; finally writes the LingPy sound-class table covering all
    non-alias BIPA sounds.
    """
    from lingpy.sequence.sound_classes import token2class
    from lingpy.data import Model

    # extra metadata columns carried over from the source rows, when present
    columns = ['LATEX', 'FEATURES', 'SOUND', 'IMAGE', 'COUNT', 'NOTE']
    bipa = TranscriptionSystem('bipa')
    for src, rows in args.repos.iter_sources(type='td'):
        args.log.info('TranscriptionData {0} ...'.format(src['NAME']))
        # optional URI template used to build per-grapheme links
        uritemplate = URITemplate(
            src['URITEMPLATE']) if src['URITEMPLATE'] else None
        out = [[
            'BIPA_GRAPHEME', 'CLTS_NAME', 'GENERATED', 'EXPLICIT', 'GRAPHEME',
            'URL'
        ] + columns]
        graphemes = set()
        for row in rows:
            # only the first occurrence of each grapheme is kept
            if row['GRAPHEME'] in graphemes:
                args.log.warn('skipping duplicate grapheme: {0}'.format(
                    row['GRAPHEME']))
                continue
            graphemes.add(row['GRAPHEME'])
            # 'explicit' is '+' when the source provides its own BIPA mapping
            if not row['BIPA']:
                bipa_sound = bipa[row['GRAPHEME']]
                explicit = ''
            else:
                bipa_sound = bipa[row['BIPA']]
                explicit = '+'
            generated = '+' if bipa_sound.generated else ''
            if is_valid_sound(bipa_sound, bipa):
                bipa_grapheme = bipa_sound.s
                bipa_name = bipa_sound.name
            else:
                # sounds that do not validate are marked as not available
                bipa_grapheme, bipa_name = '<NA>', '<NA>'
            url = uritemplate.expand(
                **row) if uritemplate else row.get('URL', '')
            out.append([
                bipa_grapheme, bipa_name, generated, explicit, row['GRAPHEME'],
                url
            ] + [row.get(c, '') for c in columns])
        # coverage statistics for this source
        # NOTE(review): 'out' still contains the header row here, so the
        # reported total and percentage are off by one — confirm intended
        found = len([o for o in out if o[0] != '<NA>'])
        args.log.info('... {0} of {1} graphemes found ({2:.0f}%)'.format(
            found, len(out), found / len(out) * 100))
        with UnicodeWriter(pkg_path('transcriptiondata',
                                    '{0}.tsv'.format(src['NAME'])),
                           delimiter='\t') as writer:
            writer.writerows(out)

    # write the sound-class mappings for all sound-class systems
    count = 0
    with UnicodeWriter(pkg_path('soundclasses', 'lingpy.tsv'),
                       delimiter='\t') as writer:
        writer.writerow(['CLTS_NAME', 'BIPA_GRAPHEME'] + SOUNDCLASS_SYSTEMS)
        for grapheme, sound in sorted(bipa.sounds.items()):
            # aliases are skipped so each sound is written only once
            if not sound.alias:
                writer.writerow([sound.name, grapheme] + [
                    token2class(grapheme, Model(cls))
                    for cls in SOUNDCLASS_SYSTEMS
                ])
                count += 1
    args.log.info('SoundClasses: {0} written to file.'.format(count))
Ejemplo n.º 5
0
    def get_classes(alm):
        """Wrap each residue of an alignment row in a class-tagged HTML div."""
        cell = '<div class="residue {1}">{0}</div>'
        out = []
        for token in alm:
            if token == '-':
                css = 'dolgo_GAP'
            else:
                css = 'dolgo_' + token2class(token, rcParams['dolgo'])
                # remap awkward class symbols onto readable CSS class names
                css = {'dolgo__': 'dolgo_X',
                       'dolgo_1': 'dolgo_TONE',
                       'dolgo_0': 'dolgo_ERROR'}.get(css, css)
            out.append(cell.format(token, css))
        return ''.join(out)
Ejemplo n.º 6
0
    def get_classes(alm):
        """Wrap each residue of an alignment row in a class-tagged HTML div."""
        cell = '<div class="residue {1}">{0}</div>'
        out = []
        for token in alm:
            if token == '-':
                css = 'dolgo_GAP'
            else:
                css = 'dolgo_' + token2class(token, rcParams['dolgo'])
                # remap awkward class symbols onto readable CSS class names
                css = {'dolgo__': 'dolgo_X',
                       'dolgo_1': 'dolgo_TONE',
                       'dolgo_0': 'dolgo_ERROR'}.get(css, css)
            out.append(cell.format(token, css))
        return ''.join(out)
Ejemplo n.º 7
0
 # NOTE(review): this fragment appears truncated by the scrape — the
 # tabulate() call at the end is missing its closing arguments/parenthesis
 # tally each source > target sound change with raw and distinct-pair counts
 for sA, dictB in sorted(all_changes.items(),
                         key=lambda x: len(x[1]),
                         reverse=True):
     for sB, items in dictB.items():
         table += [[
             sA, sB,
             len(items),
             len(set(items)), ', '.join([
                 '{0} > {1} ({2})'.format(a, b, items.count((a, b)))
                 for a, b in set(items)
             ])
         ]]
 # order by frequency, then by the sound-class profile of source and target
 table = sorted(
     table,
     key=lambda x:
     (x[2], x[3], token2class(x[0], 'cv'), token2class(x[1], 'cv'),
      token2class(x[0], 'dolgo'), token2class(x[1], 'dolgo'),
      token2class(x[0], 'sca'), token2class(x[1], 'sca')),
     reverse=True)
 print('[i] found {0} distinct changes in the data'.format(len(table)))
 with codecs.open("sound-change-frequencies.tsv", 'w', 'utf-8') as f:
     f.write(
         '\t'.join(['Source', 'Target', 'Frequency', 'RelFreq', 'Pairs']) +
         '\n')
     for line in table:
         f.write('\t'.join([str(x) for x in line]) + '\n')
 print('[i] most frequent 10 changes')
 print(
     tabulate(
         [line[:-1] for line in table][:10],
         tablefmt='pipe',
# layout: one pie chart per wordlist, plus two extra grid rows
gs = gridspec.GridSpec(len(wordlists)+2, 1)
all_cols = []
all_sounds = defaultdict(int)
all_colors = {}
for i, w in enumerate(wordlists):
    wl = Wordlist(w)
    colors = {}
    tmp = defaultdict(int)
    sylen = []
    clen = []
    for k in wl:
        # Dolgopolsky class for every token of the entry
        dolgos = tokens2class(wl[k, 'tokens'], 'dolgo')
        for idx, t in zip(dolgos, wl[k, 'tokens']):
            # '+' marks morpheme boundaries and is skipped
            if idx not in '+':
                tmp[idx] += 1
                # 'color' is defined elsewhere — presumably a sound-class
                # model mapping tokens to plot colors; confirm
                colors[idx] = token2class(t, color)
                all_cols += [(k, colors[idx])]
                all_sounds[idx] += 1
                all_colors[idx] = colors[idx]
        # per-entry syllable count and count of non-tone, non-vowel classes
        sylen += [len(syllabify(' '.join(wl[k, 'tokens']), output='nested'))]
        clen += [len([x for x in dolgos if x not in '1V'])]
    # report average syllable and consonant counts for this wordlist
    print(w, sum(sylen) / len(sylen), sum(clen) / len(clen))
    ax = plt.subplot(gs[i])
    labels = [x for x, y in sorted(tmp.items(), key=lambda x: x[0])]
    ax.pie([y for x, y in sorted(tmp.items(), key=lambda x: x[0])],
            colors=[y for x, y in sorted(colors.items(), key=lambda x: x[0])],
            radius = 0.95, frame=True, shadow=True)
    ax.set_autoscale_on(False)
    plt.ylim(-1, 1)
    plt.xlim(-1, 1)
    plt.title(w.split('_')[2].split('-')[0])
Ejemplo n.º 9
0
# fixed display colors for the tone-contour graphemes
for _tone, _col in [
        ("³¹", "Brown"),
        ("¹", "White"),
        ("²¹", "DarkOrange"),
        ("³³", "CornflowerBlue"),
        ("⁵³", "#c86496"),
        ("⁵¹", "cyan")]:
    color.converter[_tone] = _col

# display colors for the SCA vowel classes
_conv = {
    "A": "LightBlue",
    "E": "Orange",
    "I": "LightGreen",
    "O": "white",
    "U": "Crimson",
    "Y": "LightYellow",
}

# recolor every registered sound whose SCA class is one of the vowel classes
for sound in color.converter:
    cls = token2class(sound, "sca")
    if cls in "AEIOUY":
        color.converter[sound] = _conv[cls]


def contains(syllable, sound):
    """Return True if *sound* occurs in the NFD-normalized syllable.

    *syllable* may be a string or an iterable of strings; it is joined and
    decomposed (NFD) so that combining marks can be found even when the
    input uses precomposed characters.
    """
    # 'sound in s' already yields a bool — no need for if/return True/False
    return sound in normalize("NFD", "".join(syllable))


def is_aspirated(syllable):
    """Return True if the syllable contains the aspiration diacritic 'ʰ'."""
    return contains(syllable, "ʰ")

Ejemplo n.º 10
0
def get_confidence(alms, scorer, ref='lexstatid', gap_weight=1):
    """
    Function creates confidence scores for a given set of alignments.

    Parameters
    ----------
    alms : :py:class:`~lingpy.align.sca.Alignments`
        An *Alignments* object containing already aligned strings.
    scorer : :py:class:`~lingpy.algorithm._misc.ScoreDict`
        A *ScoreDict* object which gives similarity scores for all segments in
        the alignment.
    ref : str (default="lexstatid")
        The reference entry-type, referring to the cognate-set to be used for
        the analysis.
    gap_weight : number (default=1)
        Weight by which scores and counts are multiplied whenever one of the
        two aligned residues is a gap.

    Returns
    -------
    jsond : dict
        Maps character strings to a pair of (concatenated HTML table rows of
        their best correspondences, list of concepts the character occurs
        in).  As a side effect, a rescaled ``confidence`` matrix and a
        ``_charmat`` character matrix are attached to every MSA in
        ``alms.msa[ref]``.
    """
    # store all values for average scores
    values = []

    # correspondence counts: charA -> {charB: count}
    corrs = {}

    # occurrences: character -> list of concepts it appears in
    occs = {}

    for key, msa in alms.msa[ref].items():
        # get basic stuff
        idxs = msa['ID']
        taxa = msa['taxa']
        # HTML-escape the concept for later embedding in table cells;
        # cgi.escape(s, True) was removed in Python 3.8, so its exact
        # replacements are inlined here
        concept = alms[idxs[0], 'concept'].replace(
            '&', '&amp;').replace('<', '&lt;').replace(
                '>', '&gt;').replace('"', '&quot;')

        # get numerical representation of alignments
        if scorer:
            alignment = [class2tokens(
                alms[idxs[i], 'numbers'],
                msa['alignment'][i]) for i in range(len(idxs))]
        else:
            alignment = msa['alignment']

        # create new array for confidence
        confidence_matrix = []
        character_matrix = []

        # iterate over each taxon
        for i, taxon in enumerate(taxa):
            idx = alms.taxa.index(taxon) + 1

            # get the numerical sequence
            nums = alignment[i]

            # store confidences per line
            confidences = []

            # store chars per line
            chars = []

            # iterate over the sequence
            for j, num in enumerate(nums):
                col = [alm[j] for alm in alignment]
                score = 0
                count = 0

                # get the char and register the concept it occurs in
                if num != '-':
                    charA = dotjoin(taxa[i], msa['alignment'][i][j], num.split('.')[2])
                    chars += [charA]
                    occs.setdefault(charA, []).append(concept)
                else:
                    chars += ['-']

                for k, numB in enumerate(col):
                    if k != i:
                        if num == '-' and numB == '-':
                            pass
                        else:
                            if numB != '-' and num != '-':
                                # get the second char and count the
                                # correspondence, creating nested entries
                                # on demand instead of bare try/except
                                charB = dotjoin(
                                    taxa[k], msa['alignment'][k][j], numB.split('.')[2])
                                bucket = corrs.setdefault(charA, {})
                                bucket[charB] = bucket.get(charB, 0) + 1

                            # gap-residue pairs are scored like residues,
                            # but weighted by gap_weight
                            gaps = False
                            if num == '-' and numB != '-':
                                numA = charstring(idx)
                                gaps = True
                            elif numB == '-' and num != '-':
                                numB = charstring(alms.taxa.index(taxa[k]))
                                numA = num
                                gaps = True
                            else:
                                numA = num

                            # the scorer may be asymmetric: take the best
                            scoreA = scorer[numA, numB]
                            scoreB = scorer[numB, numA]
                            this_score = max(scoreA, scoreB)

                            if not gaps:
                                score += this_score
                                count += 1
                            else:
                                score += this_score * gap_weight
                                count += gap_weight

                if count:
                    score = score / count
                else:
                    # no comparable residues at all: strong penalty
                    score = -25

                confidences += [int(score + 0.5)]
                values += [int(score + 0.5)]
            confidence_matrix += [confidences]
            character_matrix += [chars]

        # append confidence matrix to alignments
        alms.msa[ref][key]['confidence'] = confidence_matrix
        alms.msa[ref][key]['_charmat'] = character_matrix

    # sort the values
    values = sorted(set(values + [1]))

    # make conversion to scale of 100 values: scores below 1 are squeezed
    # into the low range, scores from 1 upwards into 50-100
    converter = {}
    valsA = values[:values.index(1)]
    valsB = values[values.index(1):]
    stepA = 50 / (len(valsA) + 1)
    stepB = 75 / (len(valsB) + 1)
    for i, score in enumerate(valsA):
        converter[score] = int((stepA * i) / 4 + 0.5)
    for i, score in enumerate(valsB):
        converter[score] = int(stepB * i + 0.5) + 50

    # iterate over keys again and rescale all confidence cells
    for key, msa in alms.msa[ref].items():
        for i, line in enumerate(msa['confidence']):
            for j, cell in enumerate(line):
                alms.msa[ref][key]['confidence'][i][j] = converter[cell]

    jsond = {}
    for key, corr in corrs.items():
        # char strings are dot-joined "taxon.segment.class" triples; append
        # the correspondence count as fourth element
        splits = [c.split('.') + [o] for c, o in corr.items()]
        sorts = sorted(splits, key=lambda x: (x[0], -x[3]))
        new_sorts = []

        # check for rowspan: keep at most three correspondences per
        # language, and only those attested more than once
        spans = {}
        for a, b, c, d in sorts:
            if a in spans:
                if spans[a] < 3 and d > 1:
                    spans[a] += 1
                    new_sorts += [[a, b, c, d]]
            else:
                if d > 1:
                    spans[a] = 1
                    new_sorts += [[a, b, c, d]]

        bestis = []
        old_lang = ''
        counter = 0
        for a, b, c, d in new_sorts:
            new_lang = a
            if new_lang != old_lang:
                old_lang = new_lang

                # first row for a language carries the rowspanning language cell
                tmp = '<tr class="display">'
                tmp += '<td class="display" rowspan={0}>'.format(spans[a])
                tmp += a + '</td>'
                tmp += '<td class="display" onclick="show({0});"><span '.format(
                    "'" + dotjoin(a, b, c) + "'")
                tmp += 'class="char {0}">' + b + '</span></td>'
                tmp += '<td class="display">'
                tmp += c + '</td>'
                tmp += '<td class="display">' + str(d) + '</td>'
                tmp += '<td class="display">' + str(len(occs[dotjoin(a, b, c)])) + '</td>'
                tmp += '</tr>'
                t = 'dolgo_' + token2class(b, rcParams['dolgo'])

                # bad check for three classes named differently
                if t == 'dolgo__':
                    t = 'dolgo_X'
                elif t == 'dolgo_1':
                    t = 'dolgo_TONE'
                elif t == 'dolgo_0':
                    t = 'dolgo_ERROR'

                bestis += [tmp.format(t)]
                counter += 1

            elif counter > 0:
                # follow-up row for the same language, without the language cell
                tmp = '<tr class="display">'
                tmp += '<td class="display" onclick="show({0});"><span '.format(
                    "'" + dotjoin(a, b, c) + "'")
                tmp += 'class="char {0}">' + b + '</span></td>'
                tmp += '<td class="display">' + c + '</td>'
                tmp += '<td class="display">' + str(d) + '</td>'
                tmp += '<td class="display">' + str(len(occs[dotjoin(a, b, c)])) + '</td>'
                tmp += '</tr>'

                t = 'dolgo_' + token2class(b, rcParams['dolgo'])

                # bad check for three classes named differently
                if t == 'dolgo__':
                    t = 'dolgo_X'
                elif t == 'dolgo_1':
                    t = 'dolgo_TONE'
                elif t == 'dolgo_0':
                    t = 'dolgo_ERROR'

                bestis += [tmp.format(t)]
                counter += 1
                old_lang = new_lang
            else:
                old_lang = new_lang
                counter = 0

        jsond[key] = [''.join(bestis), occs[key]]

    return jsond
Ejemplo n.º 11
0
def msa2tex(infile, template='', filename='', **keywords):
    """
    Convert an MSA to a tabular representation which can easily be used in
    LaTeX documents.

    Parameters
    ----------
    infile : str
        Path of the MSA file that is read with ``read_msa``.
    template : str (default='')
        Path of a LaTeX template file; when empty, the bundled ``msa.tex``
        template is used.
    filename : str (default='')
        Name of the output file (without the ``.tex`` suffix).
    """
    util.setdefaults(keywords, pid_mode=1)

    # while alm-format can be read from the text-file without problems,
    # msa-format should be loaded first (once this is already provided), the
    # loss in speed won't matter much, since output of data is not a daily task
    # load msa
    msa = read_msa(infile)

    ## load templates
    tex = util.read_text_file(template or template_path('msa.tex'))

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        # average pairwise percentage identity over all sequence pairs
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    dataset = msa['dataset']
    infile = msa['infile']
    seq_id = msa['seq_id']

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    height = len(msa['alignment'])
    width = len(msa['alignment'][0])

    start = r'\tabular{l' + width * 'c' + '}\n'
    start += r'\bf\ttfamily Taxon & \multicolumn{' + str(
        width) + r'}{l}{\bf\ttfamily Alignment}\\' + '\n'

    # check for swaps in the alignment
    if 'swaps' in msa:
        swaps = []
        for s in msa['swaps']:
            swaps.extend(s)
    else:
        swaps = []

    body = start
    for i, taxon in enumerate(msa['taxa']):
        body += r'\ttfamily ' + taxon.replace('_', r'\_')
        for j, char in enumerate(msa['alignment'][i]):
            # color each cell by the Dolgopolsky class of its residue;
            # gaps get the dedicated class 'X'
            if char != '-':
                cls = token2class(char, rcParams['dolgo'])
            elif char == '-':
                cls = 'X'
            # escape characters/classes that would break LaTeX
            if char == '_':
                char = r'\#'
            if cls == '_':
                cls = '2'
            # swapped positions are highlighted in bold
            if j not in swaps:
                body += r'&\cellcolor{col' + cls + r'}' + char
            else:
                if char != '-':
                    body += r'&\cellcolor{col' + cls + r'}\color{white}\bf ' + char
                else:
                    body += r'&\cellcolor{col' + cls + r'}\bf ' + char
        body += r'\\' + '\n'

    body += r'&' + '&'.join([r'\color{white}XXX'
                             for i in range(width)]) + r'\\' + '\n'
    body += r'\endtabular' + '\n'

    # create the parameters etc.
    w = 1.5 * width + taxl * 0.25
    h = 0.5 * height + 1.0

    tex = tex.replace('<+WIDTH+>', '{0:2f}'.format(w))
    tex = tex.replace('<+HEIGHT+>', '{0:2f}'.format(h))

    # create the rput stuff
    tex = tex.replace('<+NEWX+>', '{0:.2f}'.format(w / 2.0))
    tex = tex.replace('<+NEWY+>', '{0:.2f}'.format((h - 0.5) / 2.0))

    # insert the rest
    tex = tex.replace('<+CONTENT+>', body)

    # write to file
    # NOTE(review): the '{0}' placeholder below is never filled in, so the
    # default output file is literally 'lingpy-{0}.tex' — confirm intended
    if not filename:
        filename = 'lingpy-{0}'

    util.write_text_file(filename + '.tex', tex)
Ejemplo n.º 12
0
def simple_profile(wordlist,
                   ref='ipa',
                   semi_diacritics='hsʃ̢ɕʂʐʑʒw',
                   merge_vowels=False,
                   brackets=None,
                   splitters='/,;~',
                   merge_geminates=True,
                   bad_word="<???>",
                   bad_sound="<?>",
                   clts=None,
                   unknown_sound="!{0}"):
    """
    Create an initial Orthography Profile using Lingpy's clean_string procedure.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A wordlist from which you want to derive an initial
        orthography profile.
    ref : str (default="ipa")
        The name of the reference column in which the words are stored.
    semi_diacritics : str
        Indicate characters which can occur both as "diacritics" (second part
        in a sound) or alone.
    merge_vowels : bool (default=False)
        Indicate whether consecutive vowels should be merged.
    brackets : dict
        A dictionary with opening brackets as key and closing brackets as
        values. Defaults to a pre-defined set of frequently occurring brackets.
    splitters : str
        The characters which force the automatic splitting of an entry.
    merge_geminates : bool (default=True)
        Indicate whether geminates should be merged into single segments.
    clts : dict (default=None)
        A dictionary(like) object that converts a given source sound into a
        potential target sound, using the get()-method of the dictionary.
        Normally, we think of a CLTS instance here (that is: a cross-linguistic
        transcription system as defined in the pyclts package).
    bad_word : str (default="<???>")
        Indicate how words that could not be parsed should be handled. Note
        that both "bad_word" and "bad_sound" are format-strings, so you can add
        formatting information here.
    bad_sound : str (default="<?>")
        Indicate how sounds that could not be converted to a sound class be
        handled. Note that both "bad_word" and "bad_sound" are format-strings,
        so you can add formatting information here.
    unknown_sound : str (default="!{0}")
        If with_clts is set to True, use this string to indicate that sounds
        are classified as "unknown sound" in the CLTS framework.

    Yields
    ------
    profile : generator
        Tuples of four items: the segment, its suggested conversion (IPA or
        marker string), its frequency (as a string), and its unicode
        codepoints.
    """
    clts = clts or {}
    nulls = set()
    bad_words = set()
    brackets = brackets or "([{『(₍⁽«)]})』⁾₎"
    profile = defaultdict(int)
    words = [wordlist[idx, ref] for idx in wordlist]
    for word in pb(words, desc='iterating over words'):
        if isinstance(word, list):
            word = ' '.join(word)
        # NOTE(review): clean_string is called with brackets=None (its own
        # default bracket set) rather than this function's *brackets*
        # argument — confirm this is intended
        cleaned_string = clean_string(word,
                                      semi_diacritics=semi_diacritics,
                                      merge_vowels=merge_vowels,
                                      brackets=None,
                                      ignore_brackets=False,
                                      split_entries=False,
                                      preparse=None,
                                      rules=None,
                                      merge_geminates=merge_geminates)[0]

        # retain whole word if there are splitters in the word
        if [x for x in cleaned_string if x in brackets + splitters]:
            profile[word] += 1
            bad_words.add(word)
        else:
            for segment in cleaned_string.split(' '):
                profile[segment] += 1
            # characters dropped by clean_string are later marked as NULL
            for segment in [x for x in word if x not in cleaned_string]:
                profile[segment] += 1
                nulls.add(segment)

    for s, f in pb(sorted(profile.items(), key=lambda x: x[1], reverse=True),
                   desc='preparing profile'):
        sclass = token2class(s, 'dolgo')
        if s in bad_words:
            ipa = bad_word.format(s)
        elif sclass == '0' and s not in nulls:
            # class '0' marks tokens unknown to the sound-class model
            ipa = bad_sound.format(s)
        elif s in nulls:
            ipa = 'NULL'
        elif clts:
            sound = clts.get(s, False)
            if not sound:
                ipa = '!' + s
            else:
                ipa = text_type(sound)
        else:
            ipa = s
        yield s, ipa, text_type(f), codepoint(s)
Ejemplo n.º 13
0
# build one aligned character string per language for phylip/RAxML export
# NOTE(review): this snippet uses Python 2 print statements and will not
# run under Python 3 as-is
for msa, vals in alm.msa[target].items():

    langs = vals['taxa']
    seqs = vals['alignment']

    # all rows of one MSA share the same alignment length
    alm_len = len(seqs[0])
    len_alms += alm_len
    #print alm_len
    for i, lang in enumerate(alm.cols):
        raxml_alm_str = ""
        if lang not in langs:
            # language missing from this cognate set: pad with '?'
            alm_str = alm_len * '?'
            raxml_alm_str = list(alm_str)
        else:
            # convert each residue to its SCA class, keeping gaps as '-'
            raxml_alm_str = [
                token2class(x, 'sca') if x != '-' else '-'
                for x in seqs[langs.index(lang)]
            ]
            # collect the alphabet of distinct SCA classes seen so far
            for ch in raxml_alm_str:
                if ch == '-':
                    continue
                if ch not in uniq_chars:
                    uniq_chars.append(ch)
            #raxml_alm_str = ' '.join([map_chars[uniq_chars.index(x)] if x != '-' else '-' for x in alm_str])
            #print raxml_alm_str

        phylip[lang] += raxml_alm_str
#        phylip[lang] += alm_str

print len(uniq_chars), " ALPHABET"
print sorted(uniq_chars)
Ejemplo n.º 14
0
def get_confidence(alms, scorer, ref='lexstatid', gap_weight=1):
    """
    Function creates confidence scores for a given set of alignments.

    Parameters
    ----------
    alms : :py:class:`~lingpy.align.sca.Alignments`
        An *Alignments* object containing already aligned strings.
    scorer : :py:class:`~lingpy.algorithm._misc.ScoreDict`
        A *ScoreDict* object which gives similarity scores for all segments in
        the alignment.
    ref : str (default="lexstatid")
        The reference entry-type, referring to the cognate-set to be used for
        the analysis.
    gap_weight : number (default=1)
        Weight by which scores and counts are multiplied whenever one of the
        two aligned residues is a gap.

    Returns
    -------
    jsond : dict
        Maps character strings to a pair of (concatenated HTML table rows of
        their best correspondences, list of concepts the character occurs
        in).  As a side effect, a rescaled ``confidence`` matrix and a
        ``_charmat`` character matrix are attached to every MSA in
        ``alms.msa[ref]``.
    """
    # store all values for average scores
    values = []

    # correspondence counts: charA -> {charB: count}
    corrs = {}

    # occurrences: character -> list of concepts it appears in
    occs = {}

    for key, msa in alms.msa[ref].items():
        # get basic stuff
        idxs = msa['ID']
        taxa = msa['taxa']
        # HTML-escape the concept for later embedding in table cells;
        # cgi.escape(s, True) was removed in Python 3.8, so its exact
        # replacements are inlined here
        concept = alms[idxs[0], 'concept'].replace(
            '&', '&amp;').replace('<', '&lt;').replace(
                '>', '&gt;').replace('"', '&quot;')

        # get numerical representation of alignments
        if scorer:
            alignment = [class2tokens(
                alms[idxs[i], 'numbers'],
                msa['alignment'][i]) for i in range(len(idxs))]
        else:
            alignment = msa['alignment']

        # create new array for confidence
        confidence_matrix = []
        character_matrix = []

        # iterate over each taxon
        for i, taxon in enumerate(taxa):
            idx = alms.taxa.index(taxon) + 1

            # get the numerical sequence
            nums = alignment[i]

            # store confidences per line
            confidences = []

            # store chars per line
            chars = []

            # iterate over the sequence
            for j, num in enumerate(nums):
                col = [alm[j] for alm in alignment]
                score = 0
                count = 0

                # get the char and register the concept it occurs in
                if num != '-':
                    charA = dotjoin(taxa[i], msa['alignment'][i][j], num.split('.')[2])
                    chars += [charA]
                    occs.setdefault(charA, []).append(concept)
                else:
                    chars += ['-']

                for k, numB in enumerate(col):
                    if k != i:
                        if num == '-' and numB == '-':
                            pass
                        else:
                            if numB != '-' and num != '-':
                                # get the second char and count the
                                # correspondence, creating nested entries
                                # on demand instead of bare try/except
                                charB = dotjoin(
                                    taxa[k], msa['alignment'][k][j], numB.split('.')[2])
                                bucket = corrs.setdefault(charA, {})
                                bucket[charB] = bucket.get(charB, 0) + 1

                            # gap-residue pairs are scored like residues,
                            # but weighted by gap_weight
                            gaps = False
                            if num == '-' and numB != '-':
                                numA = charstring(idx)
                                gaps = True
                            elif numB == '-' and num != '-':
                                numB = charstring(alms.taxa.index(taxa[k]))
                                numA = num
                                gaps = True
                            else:
                                numA = num

                            # the scorer may be asymmetric: take the best
                            scoreA = scorer[numA, numB]
                            scoreB = scorer[numB, numA]
                            this_score = max(scoreA, scoreB)

                            if not gaps:
                                score += this_score
                                count += 1
                            else:
                                score += this_score * gap_weight
                                count += gap_weight

                if count:
                    score = score / count
                else:
                    # no comparable residues at all: strong penalty
                    score = -25

                confidences += [int(score + 0.5)]
                values += [int(score + 0.5)]
            confidence_matrix += [confidences]
            character_matrix += [chars]

        # append confidence matrix to alignments
        alms.msa[ref][key]['confidence'] = confidence_matrix
        alms.msa[ref][key]['_charmat'] = character_matrix

    # sort the values
    values = sorted(set(values + [1]))

    # make conversion to scale of 100 values: scores below 1 are squeezed
    # into the low range, scores from 1 upwards into 50-100
    converter = {}
    valsA = values[:values.index(1)]
    valsB = values[values.index(1):]
    stepA = 50 / (len(valsA) + 1)
    stepB = 75 / (len(valsB) + 1)
    for i, score in enumerate(valsA):
        converter[score] = int((stepA * i) / 4 + 0.5)
    for i, score in enumerate(valsB):
        converter[score] = int(stepB * i + 0.5) + 50

    # iterate over keys again and rescale all confidence cells
    for key, msa in alms.msa[ref].items():
        for i, line in enumerate(msa['confidence']):
            for j, cell in enumerate(line):
                alms.msa[ref][key]['confidence'][i][j] = converter[cell]

    jsond = {}
    for key, corr in corrs.items():
        # char strings are dot-joined "taxon.segment.class" triples; append
        # the correspondence count as fourth element
        splits = [c.split('.') + [o] for c, o in corr.items()]
        sorts = sorted(splits, key=lambda x: (x[0], -x[3]))
        new_sorts = []

        # check for rowspan: keep at most three correspondences per
        # language, and only those attested more than once
        spans = {}
        for a, b, c, d in sorts:
            if a in spans:
                if spans[a] < 3 and d > 1:
                    spans[a] += 1
                    new_sorts += [[a, b, c, d]]
            else:
                if d > 1:
                    spans[a] = 1
                    new_sorts += [[a, b, c, d]]

        bestis = []
        old_lang = ''
        counter = 0
        for a, b, c, d in new_sorts:
            new_lang = a
            if new_lang != old_lang:
                old_lang = new_lang

                # first row for a language carries the rowspanning language cell
                tmp = '<tr class="display">'
                tmp += '<td class="display" rowspan={0}>'.format(spans[a])
                tmp += a + '</td>'
                tmp += '<td class="display" onclick="show({0});"><span '.format(
                    "'" + dotjoin(a, b, c) + "'")
                tmp += 'class="char {0}">' + b + '</span></td>'
                tmp += '<td class="display">'
                tmp += c + '</td>'
                tmp += '<td class="display">' + str(d) + '</td>'
                tmp += '<td class="display">' + str(len(occs[dotjoin(a, b, c)])) + '</td>'
                tmp += '</tr>'
                t = 'dolgo_' + token2class(b, rcParams['dolgo'])

                # bad check for three classes named differently
                if t == 'dolgo__':
                    t = 'dolgo_X'
                elif t == 'dolgo_1':
                    t = 'dolgo_TONE'
                elif t == 'dolgo_0':
                    t = 'dolgo_ERROR'

                bestis += [tmp.format(t)]
                counter += 1

            elif counter > 0:
                # follow-up row for the same language, without the language cell
                tmp = '<tr class="display">'
                tmp += '<td class="display" onclick="show({0});"><span '.format(
                    "'" + dotjoin(a, b, c) + "'")
                tmp += 'class="char {0}">' + b + '</span></td>'
                tmp += '<td class="display">' + c + '</td>'
                tmp += '<td class="display">' + str(d) + '</td>'
                tmp += '<td class="display">' + str(len(occs[dotjoin(a, b, c)])) + '</td>'
                tmp += '</tr>'

                t = 'dolgo_' + token2class(b, rcParams['dolgo'])

                # bad check for three classes named differently
                if t == 'dolgo__':
                    t = 'dolgo_X'
                elif t == 'dolgo_1':
                    t = 'dolgo_TONE'
                elif t == 'dolgo_0':
                    t = 'dolgo_ERROR'

                bestis += [tmp.format(t)]
                counter += 1
                old_lang = new_lang
            else:
                old_lang = new_lang
                counter = 0

        jsond[key] = [''.join(bestis), occs[key]]

    return jsond
Ejemplo n.º 15
0
def msa2tex(
    infile,
    template='',
    filename='',
    **keywords
):
    """
    Convert an MSA to a tabular representation which can easily be used in
    LaTeX documents.

    Parameters
    ----------
    infile : str
        Path to the MSA file to convert.
    template : str (default="")
        Path to a custom LaTeX template; if empty, the bundled ``msa.tex``
        template is used.
    filename : str (default="")
        Name of the output file (without the ``.tex`` suffix). If empty, a
        name derived from the alignment's sequence identifier is used.

    Notes
    -----
    The keyword ``pid_mode`` (default: 1) selects the mode used by ``pid()``
    when computing the percentage identity; ``pid_score`` may be passed
    directly to skip that computation.
    """
    util.setdefaults(keywords, pid_mode=1)

    # while alm-format can be read from the text-file without problems,
    # msa-format should be loaded first (once this is already provided), the
    # loss in speed won't matter much, since output of data is not a daily task
    # load msa
    msa = read_msa(infile)

    # load the LaTeX template
    tex = util.read_text_file(template or template_path('msa.tex'))

    # calculate pid score, if it is not passed as argument
    # NOTE(review): pid_score is computed but never inserted into the
    # template below — confirm whether the template should receive it
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    dataset = msa['dataset']
    infile = msa['infile']
    seq_id = msa['seq_id']

    # determine the length of the longest taxon (used to scale the canvas)
    taxl = max([len(t) for t in msa['taxa']])

    height = len(msa['alignment'])
    width = len(msa['alignment'][0])

    start = r'\tabular{l' + width * 'c' + '}\n'
    start += r'\bf\ttfamily Taxon & \multicolumn{' + str(
        width) + r'}{l}{\bf\ttfamily Alignment}\\' + '\n'

    # collect the column indices of swapped sites for highlighting
    if 'swaps' in msa:
        swaps = []
        for s in msa['swaps']:
            swaps.extend(s)
    else:
        swaps = []

    body = start
    for i, taxon in enumerate(msa['taxa']):
        body += r'\ttfamily ' + taxon.replace('_', r'\_')
        for j, char in enumerate(msa['alignment'][i]):
            # gaps get the special color class "X"; all other segments are
            # colored by their Dolgopolsky sound class
            if char != '-':
                cls = token2class(char, rcParams['dolgo'])
            else:
                cls = 'X'
            if char == '_':
                char = r'\#'
            if cls == '_':
                cls = '2'
            if j not in swaps:
                body += r'&\cellcolor{col' + cls + r'}' + char
            else:
                # swapped sites are rendered bold (white on colored cells)
                if char != '-':
                    body += r'&\cellcolor{col' + cls + r'}\color{white}\bf ' + char
                else:
                    body += r'&\cellcolor{col' + cls + r'}\bf ' + char
        body += r'\\' + '\n'

    body += r'&' + '&'.join([r'\color{white}XXX' for i in range(width)]) + r'\\' + '\n'
    body += r'\endtabular' + '\n'

    # compute canvas dimensions from the table size
    w = 1.5 * width + taxl * 0.25
    h = 0.5 * height + 1.0

    # fixed: "{0:2f}" (min-width 2, default 6-digit precision) was a typo for
    # "{0:.2f}", matching the NEWX/NEWY replacements below
    tex = tex.replace('<+WIDTH+>', '{0:.2f}'.format(w))
    tex = tex.replace('<+HEIGHT+>', '{0:.2f}'.format(h))

    # position of the rput node in the center of the canvas
    tex = tex.replace('<+NEWX+>', '{0:.2f}'.format(w / 2.0))
    tex = tex.replace('<+NEWY+>', '{0:.2f}'.format((h - 0.5) / 2.0))

    # insert the table itself
    tex = tex.replace('<+CONTENT+>', body)

    # derive a default filename; previously the literal placeholder
    # 'lingpy-{0}' was written unformatted, producing a file "lingpy-{0}.tex"
    if not filename:
        filename = 'lingpy-{0}'.format(seq_id)

    util.write_text_file(filename + '.tex', tex)
Ejemplo n.º 16
0
def alm2html(
    infile,
    title='',
    shorttitle='',
    filename='',
    colored=False,
    main_template='',
    table_template='',
    dataset='',
    confidence=False,
    **keywords
):
    """
    Convert files in ``alm``-format into colored ``html``-format.

    Parameters
    ----------
    infile : str
        Path to the input file in ``alm``-format; if the path does not exist
        as given, the extension ``.alm`` is appended automatically.
    title : str
        Define the title of the output file. If no title is provided, the
        default title ``LexStat - Automatic Cognate Judgments`` will be used.
    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``LexStat`` will be used.
    filename : str (default="")
        Name of the output file (without the ``.html`` suffix). Defaults to
        ``rcParams['filename']``.
    colored : bool (default=False)
        If True, assign each cognate set its own color; otherwise alternate
        white and gray row backgrounds per cognate-set id.
    main_template : str (default="")
        Path to a custom main-page template.
    table_template : str (default="")
        Path to a custom table template.
    dataset : str (default="")
        Name of the dataset; if empty, the first block of the input file is
        used.
    confidence : bool (default=False)
        If True, render per-segment confidence information (cells of the
        form ``char/confidence/id``) into the output.

    Notes
    -----
    The coloring of sound segments with respect to the sound class they belong
    to is based on the definitions given in the
    ``color`` :py:class:`~lingpy.data.model.Model`. It can easily be changed
    and adapted.

    See also
    --------
    lingpy.convert.html.msa2html
    lingpy.convert.html.msa2tex
    """
    util.setdefaults(keywords, json="", labels={})

    # open the infile
    if not os.path.exists(infile):
        infile = infile + '.alm'
    data = util.read_text_file(infile)

    # create the outfile
    if not filename:
        filename = rcParams['filename']

    # read in the templates
    html = util.read_text_file(main_template or template_path('alm2html.html'))
    if not table_template:
        table_template = template_path(
            'alm2html.table.js.html' if confidence else 'alm2html.table.html')
    table = util.read_text_file(table_template)
    css = util.read_text_file(template_path('alm.css'))
    js = util.read_text_file(template_path('alm.js'))

    # define a label function for the taxa
    label = lambda x: keywords['labels'][x] if x in keywords['labels'] else x

    # check for windows-compatibility (also strips the trailing newline)
    data = data.replace(os.linesep, '\n')[:-1]

    # split the data into blocks: the first block holds the dataset name,
    # each further block one cognate set
    blocks = data.split('\n\n')

    # retrieve the dataset
    dataset = dataset or blocks[0]

    # create the outstring
    tmp_str = ''

    for block in blocks[1:]:
        lines = block.split('\n')
        m = [l.split('\t') for l in lines]

        # create colordict for different colors
        dc = len(set([l[0] for l in m]))

        if colored:
            colors = {a: b for a, b in zip(
                sorted(set([int(l[0]) for l in m])),
                colorRange(dc, brightness=400),
            )}
        else:
            # alternate white/gray backgrounds per cognate-set id
            colors = []
            white = True
            for i in sorted(set([abs(int(l[0])) for l in m])):
                if white:
                    colors.append((i, 'white'))
                    white = False
                else:
                    colors.append((i, 'gray'))
                    white = True
            colors = dict(colors)

        # get the basic item and its id
        iName = m[0][2]
        iID = m[0][3]

        # start writing the stuff to string
        tmp_str += table.format(NAME=iName, ID=iID)
        # define the basic string for the insertion
        bas = ' <tr class="{0}{2} taxon" taxon="{3}">\n{1}'

        for tracer, l in enumerate(m):
            # check whether the current line is a borrowing (negative id)
            if int(l[0]) < 0:
                loan_line = ' loan'
            else:
                loan_line = ''

            # assign the cognate id
            tmp = '  <td>{0}</td>\n'.format(l[0])
            tmp += '  <td>{0}</td>\n'.format(label(l[1].strip('.')))

            # check alignments for confidence scores
            ipa_string = ''.join([cell.split('/')[0] for cell in
                                  l[4:]]).replace('-', '')

            tmp += '  <td>{0}</td>\n'.format(ipa_string)
            tmp += '  <td class="{0}">\n'.format(colors[abs(int(l[0]))])
            tmp += '   <table class="{0}">\n'.format(colors[abs(int(l[0]))])
            tmp += '    <tr>\n{0}    </tr>\n   </table>\n  </td>\n </tr>\n'

            # check whether another entry follows that is also an alignment,
            # otherwise, there's no need to display a word as an alignment
            cognate_set = False
            if tracer < len(m) - 1:
                if abs(int(m[tracer + 1][0])) == abs(int(l[0])):
                    cognate_set = True
            if tracer > 0:
                if abs(int(m[tracer - 1][0])) == abs(int(l[0])):
                    cognate_set = True

            # fill out html for the cognate sets
            if cognate_set:

                alm = ''
                for char in l[4:]:

                    # check for confidence scores
                    if '/' in char:
                        try:
                            char, conf, num = char.split('/')
                            conf = int(conf)
                        except ValueError:
                            print(char.split('/'))
                            raise ValueError("Something is wrong with %s." % (char))

                    else:
                        # NOTE(review): this assignment looks swapped — conf
                        # receives the RGB tuple and rgb the float; 'num' is
                        # also left unset on this path even though the
                        # confidence branch below uses it — confirm intent
                        char, conf, rgb = char, (255, 255, 255), 0.0

                    if char == '-':
                        d = 'dolgo_GAP'
                    else:
                        d = 'dolgo_' + token2class(char, rcParams['dolgo'])

                        # bad check for three classes named differently
                        if d == 'dolgo__':
                            d = 'dolgo_X'
                        elif d == 'dolgo_1':
                            d = 'dolgo_TONE'
                        elif d == 'dolgo_0':
                            d = 'dolgo_ERROR'

                    if confidence:
                        alm += '     '
                        alm += '<td class="char {1}" confidence={0} '.format(
                            conf,
                            d
                        )
                        alm += 'char="{0}" '.format(char)
                        alm += 'onclick="' + "show('{0}')".format(num) + '" '
                        alm += 'num="{0}"'.format(num)
                        alm += '>\n      {0}\n     </td>\n'.format(char)
                    else:
                        alm += '     '
                        alm += '<td class="char {0}">{1}</td>\n'.format(d, char)
            else:
                # singleton: no alignment to display
                alm = '      '
                alm += '<td class="{0}">--</td>\n'.format(colors[abs(int(l[0]))])

            # format the alignment
            try:
                tmp = tmp.format(alm)
            except ValueError:
                raise ValueError("Unknown problem in matchin %s and %s." % (alm, tmp))

            # check for last line, where a new line should be inserted (not the
            # fastest solution, but plotting is not a matter of time, and it
            # suffices it's current purpose
            if tracer < len(m) - 1:
                pass
            else:
                if confidence:
                    tmp += ' </table>\n'

                tmp += ' <tr class="empty"><td colspan="4" class="empty">'
                tmp += '<hr class="empty" /></td></tr>\n'

            # format the whole string
            tmp_str += bas.format(
                colors[abs(int(l[0]))],
                tmp,
                loan_line,
                l[1]
            )

    if not title:
        title = "LexStat - Automatic Cognate Judgments"
    if not shorttitle:
        shorttitle = "LexStat"

    # check for json-attribute
    if keywords['json']:
        keywords['json'] = 'var myjson = ' + json.dumps(keywords['json'],
                                                        indent=1)

    html = html.format(
        shorttitle=shorttitle,
        title=title,
        table=tmp_str,
        dataset=dataset,
        javascript=js,
        css=css,
        **keywords
    )
    util.write_text_file(filename + '.html', html)
    return
Ejemplo n.º 17
0
def msa2html(
    msa,
    shorttitle='',
    filename='',
    template='',
    **keywords
):
    """
    Convert files in ``msa``-format into colored ``html``-format.

    Parameters
    ----------
    msa : dict
        A dictionary object that contains all the information of an MSA object.

    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``SCA`` will be used.

    filename : str (default="")
        Define the name of the output file. If no name is defined, the name of
        the input file will be taken as a default.

    template : str (default="")
        The path to the template file. If no name is defined, the basic
        template will be used. The basic template currently used can be found
        under ``lingpy/data/templates/msa2html.html``.

    Other recognized keywords: ``pid_mode`` (mode for the percentage-identity
    computation), ``pid_score`` (skip the computation and use this value),
    ``css``/``js`` (paths to custom style/script files), ``compact``
    (whitespace-compress the output), ``class_sort`` (sort sequences by ASJP
    sound classes), ``write_to_file`` (if False, return the HTML string
    instead of writing it to disk).

    Examples
    --------
    Load the libary.

    >>> from lingpy import *

    Load an ``msq``-file from the test-sets.

    >>> msa = MSA('harry.msq')

    Align the data progressively and carry out a check for swapped sites.

    >>> msa.prog_align()
    >>> msa.swap_check()
    >>> print(msa)
    w    o    l    -    d    e    m    o    r    t
    w    a    l    -    d    e    m    a    r    -
    v    -    l    a    d    i    m    i    r    -

    Save the data to the file ``harry.msa``.

    >>> msa.output('msa',filename='harry')

    Save the ``msa``-object as ``html``.

    >>> msa.output('html',filename='harry')

    Notes
    -----
    The coloring of sound segments with respect to the sound class they belong
    to is based on the definitions given in the ``color``
    :py:class:`~lingpy.data.model.Model`. It can easily be changed and adapted.

    See also
    --------
    lingpy.convert.html.alm2html
    """
    util.setdefaults(
        keywords,
        pid_mode=1,
        stress=rcParams['stress'],
        css=False,
        js=False,
        compact=False,
        class_sort=True,
        write_to_file=True,
    )

    # while alm-format can be read from the text-file without problems,
    # msa-format should be loaded first (once this is already provided), the
    # loss in speed won't matter much, since output of data is not a daily task

    # load templates
    template = template or template_path('msa2html.html')
    if template == 'js':
        template = template_path('msa2html.js.html')
    html = util.read_text_file(template)
    css = util.read_text_file(keywords['css'] or template_path('msa.css'))
    js = util.read_text_file(keywords['js'] or template_path('msa.js'))

    # treat the msa-object as a file and try to load the file if this is the
    # case
    # NOTE(review): despite the docstring, non-string msa arguments are
    # rejected with a ValueError here — confirm whether dicts should be
    # accepted directly
    if isinstance(msa, string_types):
        msa = read_msa(msa, **keywords)
    else:
        raise ValueError('[!] No filename specified.')

    # load dataset, etc.
    dataset = msa['dataset']

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    infile = msa['infile']
    seq_id = msa['seq_id']

    # define the titles etc.
    if not shorttitle:
        shorttitle = 'SCA'

    # determine the length of the longest taxon (used to size the taxon column)
    taxl = max([len(t) for t in msa['taxa']])

    # format css file
    css = css.replace('TAXON_LENGTH', str(taxl * 10))

    out = ''
    tr = '<tr class="msa" unique="{1}" taxon={2} sequence={3}>{0}</tr>\n'
    td_taxon = '<td class="taxon">{0}</td>'
    # NOTE(review): 'perc' is computed but never used below — likely dead code
    perc = int(80 / len(msa['alignment'][0]) + 0.5)
    td_residue = '<td class="residue {1}">{0}</td>'
    td_swap = '<td class="residue swap {1}">{0}</td>'
    td_unaligned = '<td class="residue noalign {1}">{0}</td>'

    # check for swaps in the alignment
    if 'swaps' in msa:
        swaps = []
        for s in msa['swaps']:
            swaps.extend(s)
    else:
        swaps = []

    # check for locally aligned (core) columns: '.' marks unaligned positions
    local = ['*'] * len(msa['alignment'][0])
    if 'local' in msa:
        local = ['.'] * len(msa['alignment'][0])
        for i in msa['local']:
            local[i] = '*'

    # get two sorting schemas for the sequences
    if keywords['class_sort']:

        classes = [tokens2class(ipa2tokens(seq), rcParams['asjp']) for seq in msa['seqs']]
        seqs = dict(
            [(a[1], b) for a, b in zip(
                sorted(
                    zip(classes, msa['seqs']),
                    key=lambda x: x[0]  # list(zip(x[0],x[1]))
                ),
                range(1, len(msa['seqs']) + 1)
            )]
        )
    else:
        seqs = dict(zip(sorted(msa['seqs']), range(1, len(msa['seqs']) + 1)))
    taxa = dict(zip(sorted(msa['taxa']), range(1, len(msa['taxa']) + 1)))

    # set up a list to store unique alignments
    alignments = []

    # start iteration
    for i, taxon in enumerate(msa['taxa']):
        tmp = ''
        tmp += td_taxon.format(taxon)

        # append alignment to alignments
        alignment = ''.join(msa['alignment'][i])
        sequence = msa['seqs'][i]
        if alignment in alignments:
            unique = 'false'
        else:
            unique = 'true'
            alignments += [alignment]

        for j, char in enumerate(msa['alignment'][i]):
            if char == '-':
                d = 'dolgo_GAP'
                c = '#bbbbbb'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])
                # NOTE(review): 'c' is assigned but never used below —
                # likely dead code
                c = token2class(char, rcParams['_color'])

                # bad check for three classes named differently
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'

            if j in swaps:
                tmp += td_swap.format(char, d)
            elif local[j] != '*':
                tmp += td_unaligned.format(char, d)
            else:
                tmp += td_residue.format(char, d)
        out += tr.format(tmp, unique, taxa[taxon], seqs[sequence])

    html = html.format(
        table=out,
        dataset=dataset,
        pid=pid_score,
        file=infile,
        sequence=seq_id,
        shorttitle=shorttitle,
        width=len(msa['alignment'][0]),
        table_width='{0}'.format(len(msa['alignment'][0]) * 50 + 8 * taxl),
        taxa=len(msa['alignment']),
        uniseqs=len(set(msa['seqs'])),
        css=css,
        js=js
    )

    if not filename:
        filename = rcParams['filename']

    if not filename.endswith('.html'):
        filename = filename + '.html'

    if keywords['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    if keywords['write_to_file']:
        # check, whether the outfile already exists
        util.write_text_file(filename, html)
    else:
        return html
Ejemplo n.º 18
0
def run(args):
    """
    Plot a heatmap of sound-correspondence frequencies for tone segments
    (CV-class 'T') across the aligned cognate sets of the dataset.

    Parameters
    ----------
    args : argparse.Namespace-like
        CLI arguments; only ``args.log`` is used, for progress reporting.

    Side effects
    ------------
    Prints per-sound binning diagnostics and writes ``workflow/plots.pdf``
    inside the dataset directory.
    """

    ds = Dataset()

    # load the aligned wordlist; cognate sets are referenced via 'cogids'
    alms = Alignments(ds.dir.joinpath('workflow',
                                      'D_Chen_aligned.tsv').as_posix(),
                      ref='cogids',
                      transcription='form')

    # count, for every pair of aligned columns, how often two sounds
    # correspond (symmetrically)
    sounds = defaultdict(lambda: defaultdict(int))
    for cogid, msa in alms.msa['cogids'].items():
        for (i, tA), (j, tB) in combinations(enumerate(msa['taxa']), r=2):
            for soundA, soundB in zip(msa['alignment'][i],
                                      msa['alignment'][j]):
                # segments may carry a "source/target" annotation: use the
                # part after the slash
                soundA = soundA.split('/')[1] if '/' in soundA else soundA
                soundB = soundB.split('/')[1] if '/' in soundB else soundB
                sounds[soundA][soundB] += 1
                sounds[soundB][soundA] += 1

    #args.log.info('found {0} sounds in data'.format(len(sounds)))

    # restrict to tones (CV-class 'T'), ordered by a cascade of sound-class
    # models so the axis order is stable and linguistically sensible
    soundlist = [
        s for s in sorted(sounds,
                          key=lambda x: (token2class(x, 'cv', cldf=True),
                                         token2class(x, 'dolgo', cldf=True),
                                         token2class(x, 'sca', cldf=True),
                                         token2class(x, 'asjp')),
                          reverse=True)
        if token2class(s, 'cv', cldf=True) in 'T'
    ]  #['K', 'G', 'C', 'D', 'T']]
    # position lookup: replaces the O(n) soundlist.index()/membership tests
    # that made the loops below quadratic per cell
    positions = {sound: idx for idx, sound in enumerate(soundlist)}
    matrix = [[0 for x in soundlist] for y in soundlist]

    # fill the upper triangle with percentage-binned correspondence scores
    for i, soundA in enumerate(soundlist):
        targets = sounds[soundA]
        soundsB = [
            s
            for s in sorted(targets.items(), key=lambda x: x[1], reverse=True)
            if s[0] in positions
        ]
        total = sum([targets[x[0]] for x in soundsB])
        bins = [(a, int(round(b / total * 100, 0))) for a, b in soundsB]
        print(total, soundA, sum([x[1] for x in bins]), bins)

        for soundB, score in bins:
            j = positions[soundB]
            if i < j:
                matrix[i][j] = score
    # fill the lower triangle and the diagonal the same way
    for i, soundA in enumerate(soundlist):
        targets = sounds[soundA]
        soundsB = [
            s
            for s in sorted(targets.items(), key=lambda x: x[1], reverse=True)
            if s[0] in positions
        ]
        total = sum([targets[x[0]] for x in soundsB])
        print(total, soundA, soundsB)
        bins = [(a, int(round(b / total * 100, 0))) for a, b in soundsB]
        for soundB, score in bins:
            j = positions[soundB]
            if i >= j:
                matrix[i][j] = score

    args.log.info('calculated the matrix')
    plt.imshow(matrix, cmap='jet', vmax=100)
    plt.title('Sound correspondence frequency across Hmong-Mien languages')
    cb = plt.colorbar()
    cb.set_label('Frequency')
    plt.xticks(range(0, len(soundlist)), soundlist, fontsize=3)
    plt.yticks(range(0, len(soundlist)), soundlist, fontsize=3)
    plt.savefig(ds.dir.joinpath('workflow', 'plots.pdf').as_posix())
Ejemplo n.º 19
0
def context_profile(wordlist,
                    ref='ipa',
                    col="doculect",
                    semi_diacritics='hsʃ̢ɕʂʐʑʒw',
                    merge_vowels=False,
                    brackets=None,
                    splitters='/,;~',
                    merge_geminates=True,
                    clts=False,
                    bad_word="<???>",
                    bad_sound="<?>",
                    unknown_sound="!{0}",
                    examples=2,
                    max_entries=100):
    """
    Create an advanced Orthography Profile with context and doculect information.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A wordlist from which you want to derive an initial
        orthography profile.
    ref : str (default="ipa")
        The name of the reference column in which the words are stored.
    col : str (default="doculect")
        Indicate in which column the information on the language variety is
        stored.
    semi_diacritics : str
        Indicate characters which can occur both as "diacritics" (second part
        in a sound) or alone.
    merge_vowels : bool (default=False)
        Indicate whether consecutive vowels should be merged.
    brackets : dict
        A dictionary with opening brackets as key and closing brackets as
        values. Defaults to a pre-defined set of frequently occurring brackets.
    splitters : str
        The characters which force the automatic splitting of an entry.
    merge_geminates : bool (default=True)
        Indicate whether geminates should be merged into one segment.
    clts : dict (default=False)
        A dictionary(like) object that converts a given source sound into a
        potential target sound, using the get()-method of the dictionary.
        Normally, we think of a CLTS instance here (that is: a cross-linguistic
        transcription system as defined in the pyclts package).
    bad_word : str (default="<???>")
        Indicate how words that could not be parsed should be handled. Note
        that both "bad_word" and "bad_sound" are format-strings, so you can add
        formatting information here.
    bad_sound : str (default="<?>")
        Indicate how sounds that could not be converted to a sound class be
        handled. Note that both "bad_word" and "bad_sound" are format-strings,
        so you can add formatting information here.
    unknown_sound : str (default="!{0}")
        If clts is given, use this format-string to indicate sounds that are
        classified as "unknown sound" in the CLTS framework.
    examples : int (default=2)
        Indicate the number of examples that should be printed out.
    max_entries : int (default=100)
        Maximal number of (language, word) entries considered per segment when
        computing examples, languages, and frequency.

    Returns
    -------
    profile : generator
        A generator of six-item tuples: the segment (with ``^``/``$`` context
        markers), its IPA interpretation, example words, the languages it
        occurs in, its frequency, and its unicode codepoints.
    """
    clts_ = clts or {}
    nulls = set()
    bad_words = set()
    brackets = brackets or "([{『(₍⁽«)]})』⁾₎"
    profile = defaultdict(list)
    errors = set()
    for idx, word, language in pb(wordlist.iter_rows(ref, col),
                                  desc='iter words',
                                  total=len(wordlist)):
        log.info('processing {0}-{1}'.format(idx, word))
        if isinstance(word, list):
            word = ' '.join(word)
        if word.strip():
            try:
                cleaned_string = clean_string(
                    word,
                    semi_diacritics=semi_diacritics,
                    merge_vowels=merge_vowels,
                    brackets=None,
                    ignore_brackets=False,
                    split_entries=False,
                    preparse=None,
                    rules=None,
                    merge_geminates=merge_geminates)[0].split(' ')

                # retain whole word if there are splitters in the word
                if [x for x in cleaned_string if x in brackets + splitters]:
                    profile[word] += [(language, word)]
                    bad_words.add(word)
                else:
                    # record each segment with its preceding/following context
                    # marker ('^' word-initial, '$' word-final)
                    context_pre = ['^'] + (len(cleaned_string) - 1) * ['']
                    context_post = (len(cleaned_string) - 1) * [''] + ['$']
                    for ctxA, ctxB, segment in zip(context_pre, context_post,
                                                   cleaned_string):
                        profile[ctxA + segment + ctxB] += [(language, word)]
                    # characters dropped by clean_string become NULL segments
                    for segment in [
                            x for x in word
                            if x not in ' '.join(cleaned_string)
                    ]:
                        profile[segment] += [(language, word)]
                        nulls.add(segment)
            # was a bare "except:", which also swallowed KeyboardInterrupt
            # and SystemExit; keep the best-effort behavior but only for
            # ordinary exceptions
            except Exception:
                errors.add(idx)
                log.warn('problem parsing {0}'.format(word))

    for s in '^$':
        yield s, 'NULL', '', '', '', ''

    for idx, (s, entries) in pb(enumerate(
            sorted(profile.items(), key=lambda x: len(x[1]), reverse=True)),
                                desc='yielding entries',
                                total=len(profile)):
        sclass = token2class(s.strip('^$'), 'dolgo')
        words, langs = [l[1] for l in entries
                        ][:max_entries], [l[0] for l in entries][:max_entries]
        languages = ', '.join(
            sorted(set(langs), key=lambda x: langs.count(x), reverse=True))
        frequency = str(len(langs))
        codepoints = codepoint(s)
        examples_ = ', '.join(
            sorted(set(words), key=lambda x: words.count(x),
                   reverse=True)[:examples])
        if s in bad_words:
            ipa = bad_word.format(s)
        elif sclass == '0':
            ipa = bad_sound.format(s)
        elif s in nulls:
            ipa = 'NULL'
        elif clts_:
            sound = clts_.get(s.strip('^$'), False)
            if not sound:
                # use the unknown_sound format-string (default "!{0}" keeps
                # the previous hard-coded '!' + segment output)
                ipa = unknown_sound.format(s.strip('^$'))
            else:
                ipa = text_type(sound)
        else:
            ipa = s.strip('^$')

        yield s, ipa, examples_, languages, frequency, codepoints
Ejemplo n.º 20
0
def msa2html(msa, shorttitle='', filename='', template='', **keywords):
    """
    Convert files in ``msa``-format into colored ``html``-format.

    Parameters
    ----------
    msa : dict
        A dictionary object that contains all the information of an MSA object.

    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``SCA`` will be used.

    filename : str (default="")
        Define the name of the output file. If no name is defined, the name of
        the input file will be taken as a default.

    template : str (default="")
        The path to the template file. If no name is defined, the basic
        template will be used. The basic template currently used can be found
        under ``lingpy/data/templates/msa2html.html``.

    Other recognized keywords: ``pid_mode`` (mode for the percentage-identity
    computation), ``pid_score`` (skip the computation and use this value),
    ``css``/``js`` (paths to custom style/script files), ``compact``
    (whitespace-compress the output), ``class_sort`` (sort sequences by ASJP
    sound classes), ``write_to_file`` (if False, return the HTML string
    instead of writing it to disk).

    Examples
    --------
    Load the libary.

    >>> from lingpy import *

    Load an ``msq``-file from the test-sets.

    >>> msa = MSA('harry.msq')

    Align the data progressively and carry out a check for swapped sites.

    >>> msa.prog_align()
    >>> msa.swap_check()
    >>> print(msa)
    w    o    l    -    d    e    m    o    r    t
    w    a    l    -    d    e    m    a    r    -
    v    -    l    a    d    i    m    i    r    -

    Save the data to the file ``harry.msa``.

    >>> msa.output('msa',filename='harry')

    Save the ``msa``-object as ``html``.

    >>> msa.output('html',filename='harry')

    Notes
    -----
    The coloring of sound segments with respect to the sound class they belong
    to is based on the definitions given in the ``color``
    :py:class:`~lingpy.data.model.Model`. It can easily be changed and adapted.

    See also
    --------
    lingpy.convert.html.alm2html
    """
    util.setdefaults(
        keywords,
        pid_mode=1,
        stress=rcParams['stress'],
        css=False,
        js=False,
        compact=False,
        class_sort=True,
        write_to_file=True,
    )

    # while alm-format can be read from the text-file without problems,
    # msa-format should be loaded first (once this is already provided), the
    # loss in speed won't matter much, since output of data is not a daily task

    # load templates
    template = template or template_path('msa2html.html')
    if template == 'js':
        template = template_path('msa2html.js.html')
    html = util.read_text_file(template)
    css = util.read_text_file(keywords['css'] or template_path('msa.css'))
    js = util.read_text_file(keywords['js'] or template_path('msa.js'))

    # treat the msa-object as a file and try to load the file if this is the
    # case
    # NOTE(review): despite the docstring, non-string msa arguments are
    # rejected with a ValueError here — confirm whether dicts should be
    # accepted directly
    if isinstance(msa, string_types):
        msa = read_msa(msa, **keywords)
    else:
        raise ValueError('[!] No filename specified.')

    # load dataset, etc.
    dataset = msa['dataset']

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    infile = msa['infile']
    seq_id = msa['seq_id']

    # define the titles etc.
    if not shorttitle:
        shorttitle = 'SCA'

    # determine the length of the longest taxon (used to size the taxon column)
    taxl = max([len(t) for t in msa['taxa']])

    # format css file
    css = css.replace('TAXON_LENGTH', str(taxl * 10))

    out = ''
    tr = '<tr class="msa" unique="{1}" taxon={2} sequence={3}>{0}</tr>\n'
    td_taxon = '<td class="taxon">{0}</td>'
    # NOTE(review): 'perc' is computed but never used below — likely dead code
    perc = int(80 / len(msa['alignment'][0]) + 0.5)
    td_residue = '<td class="residue {1}">{0}</td>'
    td_swap = '<td class="residue swap {1}">{0}</td>'
    td_unaligned = '<td class="residue noalign {1}">{0}</td>'

    # check for swaps in the alignment
    if 'swaps' in msa:
        swaps = []
        for s in msa['swaps']:
            swaps.extend(s)
    else:
        swaps = []

    # check for locally aligned (core) columns: '.' marks unaligned positions
    local = ['*'] * len(msa['alignment'][0])
    if 'local' in msa:
        local = ['.'] * len(msa['alignment'][0])
        for i in msa['local']:
            local[i] = '*'

    # get two sorting schemas for the sequences
    if keywords['class_sort']:

        classes = [
            tokens2class(ipa2tokens(seq), rcParams['asjp'])
            for seq in msa['seqs']
        ]
        seqs = dict([
            (a[1], b) for a, b in zip(
                sorted(
                    zip(classes, msa['seqs']),
                    key=lambda x: x[0]  # list(zip(x[0],x[1]))
                ),
                range(1,
                      len(msa['seqs']) + 1))
        ])
    else:
        seqs = dict(zip(sorted(msa['seqs']), range(1, len(msa['seqs']) + 1)))
    taxa = dict(zip(sorted(msa['taxa']), range(1, len(msa['taxa']) + 1)))

    # set up a list to store unique alignments
    alignments = []

    # start iteration
    for i, taxon in enumerate(msa['taxa']):
        tmp = ''
        tmp += td_taxon.format(taxon)

        # append alignment to alignments
        alignment = ''.join(msa['alignment'][i])
        sequence = msa['seqs'][i]
        if alignment in alignments:
            unique = 'false'
        else:
            unique = 'true'
            alignments += [alignment]

        for j, char in enumerate(msa['alignment'][i]):
            if char == '-':
                d = 'dolgo_GAP'
                c = '#bbbbbb'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])
                # NOTE(review): 'c' is assigned but never used below —
                # likely dead code
                c = token2class(char, rcParams['_color'])

                # bad check for three classes named differently
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'

            if j in swaps:
                tmp += td_swap.format(char, d)
            elif local[j] != '*':
                tmp += td_unaligned.format(char, d)
            else:
                tmp += td_residue.format(char, d)
        out += tr.format(tmp, unique, taxa[taxon], seqs[sequence])

    html = html.format(
        table=out,
        dataset=dataset,
        pid=pid_score,
        file=infile,
        sequence=seq_id,
        shorttitle=shorttitle,
        width=len(msa['alignment'][0]),
        table_width='{0}'.format(len(msa['alignment'][0]) * 50 + 8 * taxl),
        taxa=len(msa['alignment']),
        uniseqs=len(set(msa['seqs'])),
        css=css,
        js=js)

    if not filename:
        filename = rcParams['filename']

    if not filename.endswith('.html'):
        filename = filename + '.html'

    if keywords['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    if keywords['write_to_file']:
        # check, whether the outfile already exists
        util.write_text_file(filename, html)
    else:
        return html
Ejemplo n.º 21
0
def alm2html(infile,
             title='',
             shorttitle='',
             filename='',
             colored=False,
             main_template='',
             table_template='',
             dataset='',
             confidence=False,
             **keywords):
    """
    Convert files in ``alm``-format into colored ``html``-format.

    Parameters
    ----------
    infile : str
        Path to the input file in ``alm``-format. If the path does not
        exist, the suffix ``.alm`` is appended and the file is read from
        there.

    title : str
        Define the title of the output file. If no title is provided, the
        default title ``LexStat - Automatic Cognate Judgments`` will be used.

    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``LexStat`` will be used.

    filename : str
        Name of the output file (the suffix ``.html`` is appended). Falls
        back to ``rcParams['filename']`` when empty.

    colored : bool
        If True, each cognate set receives a distinct color from
        ``colorRange``; otherwise rows alternate between white and gray.

    main_template : str
        Path to a custom main HTML template (the packaged template is used
        when empty).

    table_template : str
        Path to a custom table template; when empty, the JavaScript-enabled
        template is chosen if *confidence* is set, the plain one otherwise.

    dataset : str
        Name of the dataset; read from the first block of the input file
        when empty.

    confidence : bool
        If True, alignment cells of the form ``char/conf/num`` are rendered
        with per-segment confidence attributes and click handlers.

    Notes
    -----
    The coloring of sound segments with respect to the sound class they belong
    to is based on the definitions given in the
    ``color`` :py:class:`~lingpy.data.model.Model`. It can easily be changed
    and adapted.

    See also
    --------
    lingpy.convert.html.msa2html
    lingpy.convert.html.msa2tex

    """
    # supported extra keywords: an optional JSON payload embedded into the
    # page, and a mapping from taxon names to display labels
    util.setdefaults(keywords, json="", labels={})

    # open the infile
    if not os.path.exists(infile):
        infile = infile + '.alm'
    data = util.read_text_file(infile)

    # create the outfile
    if not filename:
        filename = rcParams['filename']

    # read in the templates
    html = util.read_text_file(main_template or template_path('alm2html.html'))
    if not table_template:
        # the JS-enabled table template is only needed for confidence display
        table_template = template_path(
            'alm2html.table.js.html' if confidence else 'alm2html.table.html')
    table = util.read_text_file(table_template)
    css = util.read_text_file(template_path('alm.css'))
    js = util.read_text_file(template_path('alm.js'))

    # define a label function for the taxa
    label = lambda x: keywords['labels'][x] if x in keywords['labels'] else x

    # check for windows-compatibility (normalize line endings; the trailing
    # slice drops the file's final newline)
    data = data.replace(os.linesep, '\n')[:-1]

    # split the data into blocks (blocks are separated by blank lines)
    blocks = data.split('\n\n')

    # retrieve the dataset (the first block carries the dataset name)
    dataset = dataset or blocks[0]

    # create the outstring
    tmp_str = ''

    for block in blocks[1:]:
        # tab-separated columns per line: [0] integer cognate id (negative
        # for borrowings), [1] taxon label, [2]-[3] item name and id (read
        # from the first line only), [4:] alignment cells
        lines = block.split('\n')
        m = [l.split('\t') for l in lines]

        # create colordict for different colors
        dc = len(set([l[0] for l in m]))

        if colored:
            # one distinct color per cognate id
            colors = {
                a: b
                for a, b in zip(
                    sorted(set([int(l[0]) for l in m])),
                    colorRange(dc, brightness=400),
                )
            }
        else:
            # alternate white/gray across cognate ids; negative (loan) ids
            # share the color of their positive counterpart via abs()
            colors = []
            white = True
            for i in sorted(set([abs(int(l[0])) for l in m])):
                if white:
                    colors.append((i, 'white'))
                    white = False
                else:
                    colors.append((i, 'gray'))
                    white = True
            colors = dict(colors)

        # get the basic item and its id
        iName = m[0][2]
        iID = m[0][3]

        # start writing the stuff to string
        tmp_str += table.format(NAME=iName, ID=iID)
        # define the basic string for the insertion
        bas = ' <tr class="{0}{2} taxon" taxon="{3}">\n{1}'

        for tracer, l in enumerate(m):
            # check whether the current line is a borrowing
            # (borrowings carry a negative cognate id)
            if int(l[0]) < 0:
                loan_line = ' loan'
            else:
                loan_line = ''

            # assign the cognate id
            tmp = '  <td>{0}</td>\n'.format(l[0])
            tmp += '  <td>{0}</td>\n'.format(label(l[1].strip('.')))

            # check alignments for confidence scores: strip the "/conf/num"
            # suffix and gap symbols to recover the plain IPA string
            ipa_string = ''.join([cell.split('/')[0]
                                  for cell in l[4:]]).replace('-', '')

            tmp += '  <td>{0}</td>\n'.format(ipa_string)
            tmp += '  <td class="{0}">\n'.format(colors[abs(int(l[0]))])
            tmp += '   <table class="{0}">\n'.format(colors[abs(int(l[0]))])
            tmp += '    <tr>\n{0}    </tr>\n   </table>\n  </td>\n </tr>\n'

            # check whether another entry follows that is also an alignment,
            # otherwise, there's no need to display a word as an alignment
            cognate_set = False
            if tracer < len(m) - 1:
                if abs(int(m[tracer + 1][0])) == abs(int(l[0])):
                    cognate_set = True
            if tracer > 0:
                if abs(int(m[tracer - 1][0])) == abs(int(l[0])):
                    cognate_set = True

            # fill out html for the cognate sets
            if cognate_set:

                alm = ''
                for char in l[4:]:

                    # check for confidence scores
                    if '/' in char:
                        try:
                            char, conf, num = char.split('/')
                            conf = int(conf)
                        except ValueError:
                            print(char.split('/'))
                            raise ValueError("Something is wrong with %s." %
                                             (char))

                    else:
                        # NOTE(review): `conf` is assigned an RGB-looking
                        # tuple and `rgb` the number 0.0 here, which looks
                        # swapped; also `num` is not (re)assigned in this
                        # branch, so the confidence markup below may use a
                        # stale or undefined `num` — confirm against callers
                        char, conf, rgb = char, (255, 255, 255), 0.0

                    if char == '-':
                        d = 'dolgo_GAP'
                    else:
                        d = 'dolgo_' + token2class(char, rcParams['dolgo'])

                        # bad check for three classes named differently
                        if d == 'dolgo__':
                            d = 'dolgo_X'
                        elif d == 'dolgo_1':
                            d = 'dolgo_TONE'
                        elif d == 'dolgo_0':
                            d = 'dolgo_ERROR'

                    if confidence:
                        # interactive cell: clicking invokes show(num) in the
                        # page's JavaScript
                        alm += '     '
                        alm += '<td class="char {1}" confidence={0} '.format(
                            conf, d)
                        alm += 'char="{0}" '.format(char)
                        alm += 'onclick="' + "show('{0}')".format(num) + '" '
                        alm += 'num="{0}"'.format(num)
                        alm += '>\n      {0}\n     </td>\n'.format(char)
                    else:
                        alm += '     '
                        alm += '<td class="char {0}">{1}</td>\n'.format(
                            d, char)
            else:
                # singleton entry: just a placeholder cell, no alignment
                alm = '      '
                alm += '<td class="{0}">--</td>\n'.format(colors[abs(int(
                    l[0]))])

            # format the alignment
            try:
                tmp = tmp.format(alm)
            except ValueError:
                # NOTE(review): typo "matchin" in this error message
                raise ValueError("Unknown problem in matchin %s and %s." %
                                 (alm, tmp))

            # check for last line, where a new line should be inserted (not the
            # fastest solution, but plotting is not a matter of time, and it
            # suffices it's current purpose
            if tracer < len(m) - 1:
                pass
            else:
                if confidence:
                    tmp += ' </table>\n'

                tmp += ' <tr class="empty"><td colspan="4" class="empty">'
                tmp += '<hr class="empty" /></td></tr>\n'

            # format the whole string
            tmp_str += bas.format(colors[abs(int(l[0]))], tmp, loan_line, l[1])

    if not title:
        title = "LexStat - Automatic Cognate Judgments"
    if not shorttitle:
        shorttitle = "LexStat"

    # check for json-attribute (serialize the payload for embedding)
    if keywords['json']:
        keywords['json'] = 'var myjson = ' + json.dumps(keywords['json'],
                                                        indent=1)

    # fill the main template and write the page to disk
    html = html.format(shorttitle=shorttitle,
                       title=title,
                       table=tmp_str,
                       dataset=dataset,
                       javascript=js,
                       css=css,
                       **keywords)
    util.write_text_file(filename + '.html', html)
    return
Ejemplo n.º 22
0
# Override the default display colors for a handful of tone contours.
for _tone, _tone_color in [
        ('³¹', 'Brown'),
        ('¹', 'White'),
        ('²¹', 'DarkOrange'),
        ('³³', 'CornflowerBlue'),
        ('⁵³', '#c86496'),
        ('⁵¹', 'cyan')]:
    color.converter[_tone] = _tone_color

# Display colors for the vowel sound classes of the SCA model.
_conv = {
    'A': 'LightBlue',
    'E': 'Orange',
    'I': 'LightGreen',
    'O': 'white',
    'U': 'Crimson',
    'Y': 'LightYellow',
}

# Recolor every grapheme whose SCA class is one of the vowel classes.
# (Only values are modified during iteration, which is safe for dicts.)
for sound in color.converter:
    sound_class = token2class(sound, 'sca')
    if sound_class in 'AEIOUY':
        color.converter[sound] = _conv[sound_class]


def contains(syllable, sound):
    """Check whether *sound* occurs in the NFD-normalized *syllable*.

    Parameters
    ----------
    syllable : iterable of str
        Sequence of segments; the segments are joined before the check.
    sound : str
        Substring to look for (e.g. a diacritic such as ``ʰ``).

    Returns
    -------
    bool
        True if *sound* occurs in the decomposed syllable string.
    """
    # NFD decomposition splits precomposed characters into base character
    # plus combining marks, so individual diacritics become searchable.
    # Returning the membership test directly replaces the former
    # `if ...: return True / return False` pattern with identical results.
    return sound in normalize('NFD', ''.join(syllable))


def is_aspirated(syllable):
    """Return True if *syllable* carries the aspiration mark ``ʰ``."""
    # Decompose to NFD first so the superscript-h is matched even when it
    # is part of a precomposed sequence of characters.
    decomposed = normalize('NFD', ''.join(syllable))
    return 'ʰ' in decomposed