Ejemplo n.º 1
0
def get_confidence(alms, scorer, ref='lexstatid', gap_weight=1):
    """
    Function creates confidence scores for a given set of alignments.

    Parameters
    ----------
    alms : :py:class`~lingpy.align.sca.Alignments`
        An *Alignments* object containing already aligned strings.
    scorer : :py:class:`~lingpy.algorithm._misc.ScoreDict`
        A *ScoreDict* object which gives similarity scores for all segments in
        the alignment.
    ref : str (default="lexstatid")
        The reference entry-type, referring to the cognate-set to be used for
        the analysis.
    """
    # store all values for average scores
    values = []

    # store all correspondences
    corrs = {}

    # store occurrences
    occs = {}

    for key, msa in alms.msa[ref].items():
        # get basic stuff
        idxs = msa['ID']
        taxa = msa['taxa']
        concept = cgi.escape(alms[idxs[0], 'concept'], True)

        # get numerical representation of alignments
        if scorer:
            alignment = [class2tokens(
                alms[idxs[i], 'numbers'],
                msa['alignment'][i]) for i in range(len(idxs))]
        else:
            alignment = msa['alignment']

        # create new array for confidence
        confidence_matrix = []
        character_matrix = []

        # iterate over each taxon
        for i, taxon in enumerate(taxa):
            idx = alms.taxa.index(taxon) + 1

            # get the numerical sequence
            nums = alignment[i]

            # store confidences per line
            confidences = []

            # store chars per line
            chars = []

            # iterate over the sequence
            for j, num in enumerate(nums):
                col = [alm[j] for alm in alignment]
                score = 0
                count = 0

                # get the char
                if num != '-':
                    charA = dotjoin(taxa[i], msa['alignment'][i][j], num.split('.')[2])
                    chars += [charA]
                    try:
                        occs[charA] += [concept]
                    except:
                        occs[charA] = [concept]
                else:
                    chars += ['-']

                for k, numB in enumerate(col):
                    if k != i:
                        if num == '-' and numB == '-':
                            pass
                        else:
                            if numB != '-' and num != '-':
                                # get the second char
                                charB = dotjoin(
                                    taxa[k], msa['alignment'][k][j], numB.split('.')[2])
                                try:
                                    corrs[charA][charB] += 1
                                except:
                                    try:
                                        corrs[charA][charB] = 1
                                    except:
                                        corrs[charA] = {charB: 1}

                            gaps = False
                            if num == '-' and numB != '-':
                                numA = charstring(idx)
                                gaps = True
                            elif numB == '-' and num != '-':
                                numB = charstring(alms.taxa.index(taxa[k]))
                                numA = num
                                gaps = True
                            else:
                                numA = num

                            scoreA = scorer[numA, numB]
                            scoreB = scorer[numB, numA]
                            this_score = max(scoreA, scoreB)

                            if not gaps:
                                score += this_score
                                count += 1
                            else:
                                score += this_score * gap_weight
                                count += gap_weight

                if count:
                    score = score / count
                else:
                    score = -25

                confidences += [int(score + 0.5)]
                values += [int(score + 0.5)]
            confidence_matrix += [confidences]
            character_matrix += [chars]

        # append confidence matrix to alignments
        alms.msa[ref][key]['confidence'] = confidence_matrix
        alms.msa[ref][key]['_charmat'] = character_matrix

    # sort the values
    values = sorted(set(values + [1]))

    # make conversion to scale of 100 values
    converter = {}
    valsA = values[:values.index(1)]
    valsB = values[values.index(1):]
    stepA = 50 / (len(valsA) + 1)
    stepB = 75 / (len(valsB) + 1)
    for i, score in enumerate(valsA):  # values[:values.index(0)):
        converter[score] = int((stepA * i) / 4 + 0.5)
    for i, score in enumerate(valsB):
        converter[score] = int(stepB * i + 0.5) + 50

    # iterate over keys again
    for key, msa in alms.msa[ref].items():
        # get basic stuff
        for i, line in enumerate(msa['confidence']):
            for j, cell in enumerate(line):
                alms.msa[ref][key]['confidence'][i][j] = converter[cell]

    jsond = {}
    for key, corr in corrs.items():
        splits = [c.split('.') + [o] for c, o in corr.items()]
        sorts = sorted(splits, key=lambda x: (x[0], -x[3]))
        new_sorts = []

        # check for rowspan
        spans = {}
        for a, b, c, d in sorts:
            if a in spans:
                if spans[a] < 3 and d > 1:
                    spans[a] += 1
                    new_sorts += [[a, b, c, d]]
            else:
                if d > 1:
                    spans[a] = 1
                    new_sorts += [[a, b, c, d]]

        bestis = []
        old_lang = ''
        counter = 0
        for a, b, c, d in new_sorts:
            new_lang = a
            if new_lang != old_lang:
                old_lang = new_lang

                tmp = '<tr class="display">'
                tmp += '<td class="display" rowspan={0}>'.format(spans[a])
                tmp += a + '</td>'
                tmp += '<td class="display" onclick="show({0});"><span '.format(
                    "'" + dotjoin(a, b, c) + "'")
                tmp += 'class="char {0}">' + b + '</span></td>'
                tmp += '<td class="display">'
                tmp += c + '</td>'
                tmp += '<td class="display">' + str(d) + '</td>'
                tmp += '<td class="display">' + str(len(occs[dotjoin(a, b, c)])) + '</td>'
                tmp += '</tr>'
                t = 'dolgo_' + token2class(b, rcParams['dolgo'])

                # bad check for three classes named differently
                if t == 'dolgo__':
                    t = 'dolgo_X'
                elif t == 'dolgo_1':
                    t = 'dolgo_TONE'
                elif t == 'dolgo_0':
                    t = 'dolgo_ERROR'

                bestis += [tmp.format(t)]
                counter += 1

            elif counter > 0:
                tmp = '<tr class="display">'
                tmp += '<td class="display" onclick="show({0});"><span '.format(
                    "'" + dotjoin(a, b, c) + "'")
                tmp += 'class="char {0}">' + b + '</span></td>'
                tmp += '<td class="display">' + c + '</td>'
                tmp += '<td class="display">' + str(d) + '</td>'
                tmp += '<td class="display">' + str(len(occs[dotjoin(a, b, c)])) + '</td>'
                tmp += '</tr>'

                t = 'dolgo_' + token2class(b, rcParams['dolgo'])

                # bad check for three classes named differently
                if t == 'dolgo__':
                    t = 'dolgo_X'
                elif t == 'dolgo_1':
                    t = 'dolgo_TONE'
                elif t == 'dolgo_0':
                    t = 'dolgo_ERROR'

                bestis += [tmp.format(t)]
                counter += 1
                old_lang = new_lang
            else:
                old_lang = new_lang
                counter = 0

        jsond[key] = [''.join(bestis), occs[key]]

    return jsond
Ejemplo n.º 2
0
    def get_partial_scorer(self, **keywords):
        """
        Create a scoring function based on sound correspondences.

        Parameters
        ----------
        method : str (default='shuffle')
            Select between "markov", for automatically generated random
            strings, and "shuffle", for random strings taken directly from the
            data.
        ratio : tuple (default=3,2)
            Define the ratio between derived and original score for
            sound-matches.
        vscale : float (default=0.5)
            Define a scaling factor for vowels, in order to decrease their
            score in the calculations.
        runs : int (default=1000)
            Choose the number of random runs that shall be made in order to
            derive the random distribution.
        threshold : float (default=0.7)
            The threshold which used to select those words that are compared
            in order to derive the attested distribution.
        modes : list (default = [("global",-2,0.5),("local",-1,0.5)])
            The modes which are used in order to derive the distributions from
            pairwise alignments.
        factor : float (default=0.3)
            The scaling factor for sound segments with identical prosodic
            environment.
        force : bool (default=False)
            Force recalculation of existing distribution.
        preprocessing: bool (default=False)
            Select whether SCA-analysis shall be used to derive a preliminary
            set of cognates from which the attested distribution shall be
            derived.
        rands : int (default=1000)
            If "method" is set to "markov", this parameter defines the number
            of strings to produce for the calculation of the random
            distribution.
        limit : int (default=10000)
            If "method" is set to "markov", this parameter defines the limit
            above which no more search for unique strings will be carried out.
        cluster_method : {"upgma" "single" "complete"} (default="upgma")
            Select the method to be used for the calculation of cognates in the
            preprocessing phase, if "preprocessing" is set to c{True}.
        gop : int (default=-2)
            If "preprocessing" is selected, define the gap opening penalty for
            the preprocessing calculation of cognates.
        unattested : {int, float} (default=-5)
            If a pair of sounds is not attested in the data, but expected by
            the alignment algorithm that computes the expected distribution,
            the score would be -infinity. Yet in order to allow to smooth this
            behaviour and to reduce the strictness, we set a default negative
            value which does not necessarily need to be too high, since it may
            well be that we miss a potentially good pairing in the first runs
            of alignment analyses. Use this keyword to adjust this parameter.
        unexpected : {int, float} (default=0.000001)
            If a pair is encountered in a given alignment but not expected
            according to the randomized alignments, the score would be not
            calculable, since we had to divide by zero. For this reason, we set
            a very small constant, by which the score is divided in this case.
            Not that this constant is only relevant in those cases where the
            shuffling procedure was not carried out long enough.

        """
        kw = dict(
            method=rcParams['lexstat_scoring_method'],
            ratio=rcParams['lexstat_ratio'],
            vscale=rcParams['lexstat_vscale'],
            runs=rcParams['lexstat_runs'],
            threshold=rcParams['lexstat_scoring_threshold'],
            modes=rcParams['lexstat_modes'],
            factor=rcParams['align_factor'],
            restricted_chars=rcParams['restricted_chars'],
            force=False,
            preprocessing=False,
            rands=rcParams['lexstat_rands'],
            limit=rcParams['lexstat_limit'],
            cluster_method=rcParams['lexstat_cluster_method'],
            gop=rcParams['align_gop'],
            preprocessing_threshold=rcParams[
                'lexstat_preprocessing_threshold'],
            preprocessing_method=rcParams['lexstat_preprocessing_method'],
            subset=False,
            defaults=False,
            unattested=-5,
            unexpected=0.00001,
            smooth=1)
        kw.update(keywords)
        if kw['defaults']:
            return kw

        # get parameters and store them in string
        params = dict(
            ratio=kw['ratio'],
            vscale=kw['vscale'],
            runs=kw['runs'],
            scoring_threshold=kw['threshold'],
            preprocessing_threshold=kw['preprocessing_threshold'],
            modestring=':'.join('{0}-{1}-{2:.2f}'.format(a, abs(b), c)
                                for a, b, c in kw['modes']),
            factor=kw['factor'],
            restricted_chars=kw['restricted_chars'],
            method=kw['method'],
            preprocessing='{0}:{1}:{2}'.format(kw['preprocessing'],
                                               kw['cluster_method'],
                                               kw['gop']),
            unattested=kw['unattested'],
            unexpected=kw['unexpected'],
            smooth=kw['smooth'])

        parstring = '_'.join([
            '{ratio[0]}:{ratio[1]}', '{vscale:.2f}', '{runs}',
            '{scoring_threshold:.2f}', '{modestring}', '{factor:.2f}',
            '{restricted_chars}', '{method}', '{preprocessing}',
            '{preprocessing_threshold}', '{unexpected:.2f}', '{unattested:.2f}'
        ]).format(**params)

        # check for existing attributes
        if hasattr(self, 'cscorer') and not kw['force']:
            log.warning(
                "An identical scoring function has already been calculated, "
                "force recalculation by setting 'force' to 'True'.")
            return

        # check for attribute
        if hasattr(self, 'params') and not kw['force']:
            if 'cscorer' in self.params:
                if self.params['cscorer'] == params:
                    log.warning(
                        "An identical scoring function has already been "
                        "calculated, force recalculation by setting 'force'"
                        " to 'True'.")
                    return
            else:
                log.warning(
                    "A different scoring function has already been calculated, "
                    "overwriting previous settings.")

        # store parameters
        self.params = {'cscorer': params}
        self._meta['params'] = self.params
        self._stamp += "# Parameters: " + parstring + '\n'

        # get the correspondence distribution
        self._corrdist = self._get_partial_corrdist(**kw)
        # get the random distribution
        self._randist = self._get_partial_randist(**kw)

        # get the average gop
        gop = sum([m[1] for m in kw['modes']]) / len(kw['modes'])

        # create the new scoring matrix
        matrix = [[c for c in line] for line in self.bscorer.matrix]
        char_dict = self.bscorer.chars2int

        for (i, tA), (j, tB) in util.multicombinations2(enumerate(self.cols)):
            for charA, charB in product(
                    list(self.freqs[tA]) + [util.charstring(i + 1)],
                    list(self.freqs[tB]) + [util.charstring(j + 1)]):
                exp = self._randist.get((tA, tB), {}).get((charA, charB),
                                                          False)
                att = self._corrdist.get((tA, tB), {}).get((charA, charB),
                                                           False)
                # in the following we follow the former lexstat protocol
                if att <= kw['smooth'] and i != j:
                    att = False

                if att and exp:
                    score = np.log2((att**2) / (exp**2))
                elif att and not exp:
                    score = np.log2((att**2) / kw['unexpected'])
                elif exp and not att:
                    score = kw['unattested']  # XXX gop ???
                else:  # elif not exp and not att:
                    score = -90  # ???

                # combine the scores
                if rcParams['gap_symbol'] not in charA + charB:
                    sim = self.bscorer[charA, charB]
                else:
                    sim = gop

                # get the real score
                rscore = (kw['ratio'][0] * score + kw['ratio'][1] * sim) \
                    / sum(kw['ratio'])

                try:
                    iA = char_dict[charA]
                    iB = char_dict[charB]

                    # use the vowel scale
                    if charA[4] in self.vowels and charB[4] in self.vowels:
                        matrix[iA][iB] = matrix[iB][iA] = kw['vscale'] * rscore
                    else:
                        matrix[iA][iB] = matrix[iB][iA] = rscore
                except:
                    pass

        self.cscorer = misc.ScoreDict(self.chars, matrix)
        self._meta['scorer']['cscorer'] = self.cscorer
Ejemplo n.º 3
0
    def _get_partial_corrdist(self, **keywords):
        """
        Use alignments to get a correspondences statistics.
        """
        kw = dict(
            cluster_method='upgma',
            factor=rcParams['align_factor'],
            gop=rcParams['align_gop'],
            modes=rcParams['lexstat_modes'],
            preprocessing=False,
            preprocessing_method=rcParams['lexstat_preprocessing_method'],
            preprocessing_threshold=rcParams[
                'lexstat_preprocessing_threshold'],
            split_on_tones=False,
            ref='scaid',
            restricted_chars=rcParams['restricted_chars'],
            threshold=rcParams['lexstat_scoring_threshold'],
            subset=False)
        kw.update(keywords)

        self._included = {}
        corrdist = {}

        if kw['preprocessing']:
            if kw['ref'] not in self.header:
                self.cluster(method=kw['preprocessing_method'],
                             threshold=kw['preprocessing_threshold'],
                             gop=kw['gop'],
                             cluster_method=kw['cluster_method'],
                             ref=kw['ref'])

        with util.pb(desc='CORRESPONDENCE CALCULATION',
                     total=self.width**2 / 2) as pb:
            for (i, tA), (j,
                          tB) in util.multicombinations2(enumerate(self.cols)):
                pb.update(1)
                log.info("Calculating alignments for pair {0} / {1}.".format(
                    tA, tB))

                corrdist[tA, tB] = defaultdict(float)
                for mode, gop, scale in kw['modes']:
                    pairs = self.pairs[tA, tB]
                    if kw['subset']:
                        pairs = [
                            pair for pair in pairs
                            if pair in self.subsets[tA, tB]
                        ]

                    # threshold and preprocessing, make sure threshold is
                    # different from pre-processing threshold when
                    # preprocessing is set to false
                    if kw['preprocessing']:
                        pairs = [
                            pair for pair in pairs
                            if self[pair, kw['ref']][0] == self[pair,
                                                                kw['ref']][1]
                        ]
                        threshold = 10.0
                    else:
                        threshold = kw['threshold']

                    # create morpheme-segmented pairs
                    new_nums, new_weights, new_pros = [], [], []
                    for idxA, idxB in pairs:
                        for iA, iB in self._slices[idxA]:
                            for jA, jB in self._slices[idxB]:
                                new_nums += [(self[idxA, self._numbers][iA:iB],
                                              self[idxB,
                                                   self._numbers][jA:jB])]
                                new_weights += [(self[idxA,
                                                      self._weights][iA:iB],
                                                 self[idxB,
                                                      self._weights][jA:jB])]
                                new_pros += [(self[idxA,
                                                   self._prostrings][iA:iB],
                                              self[idxB,
                                                   self._prostrings][jA:jB])]

                    corrs, self._included[tA, tB] = calign.corrdist(
                        threshold, new_nums, new_weights, new_pros, gop, scale,
                        kw['factor'], self.bscorer, mode,
                        kw['restricted_chars'])

                    # change representation of gaps
                    for (a, b), d in corrs.items():
                        # XXX check for bias XXX
                        if a == '-':
                            a = util.charstring(i + 1)
                        elif b == '-':
                            b = util.charstring(j + 1)
                        corrdist[tA, tB][a, b] += d / float(len(kw['modes']))

        return corrdist
Ejemplo n.º 4
0
    def _get_partial_randist(self, **keywords):
        """
        Return the aligned results of randomly aligned sequences.
        """
        kw = dict(modes=rcParams['lexstat_modes'],
                  factor=rcParams['align_factor'],
                  restricted_chars=rcParams['restricted_chars'],
                  runs=rcParams['lexstat_runs'],
                  rands=rcParams['lexstat_rands'],
                  limit=rcParams['lexstat_limit'],
                  method=rcParams['lexstat_scoring_method'])
        kw.update(keywords)

        # determine the mode
        method = 'markov' if kw['method'] in ['markov', 'markov-chain', 'mc'] \
            else 'shuffle'

        corrdist = {}
        tasks = (self.width**2) / 2
        with util.pb(desc='RANDOM CORRESPONDENCE CALCULATION',
                     total=tasks) as progress:
            for (i, tA), (j,
                          tB) in util.multicombinations2(enumerate(self.cols)):
                progress.update(1)
                log.info("Calculating random alignments"
                         "for pair {0}/{1}.".format(tA, tB))
                corrdist[tA, tB] = defaultdict(float)

                # create morpheme-segmented pairs
                pairs = self.pairs[tA, tB]
                new_nums, new_weights, new_pros = [], [], []
                for idxA, idxB in pairs:
                    for iA, iB in self._slices[idxA]:
                        for jA, jB in self._slices[idxB]:
                            new_nums += [(self[idxA, self._numbers][iA:iB],
                                          self[idxB, self._numbers][jA:jB])]
                            new_weights += [(self[idxA, self._weights][iA:iB],
                                             self[idxB, self._weights][jA:jB])]
                            new_pros += [(self[idxA, self._prostrings][iA:iB],
                                          self[idxB, self._prostrings][jA:jB])]
                # get the number pairs etc.
                sample = [(x, y) for x in range(len(new_nums))
                          for y in range(len(new_nums))]
                if len(sample) > kw['runs']:
                    sample = random.sample(sample, kw['runs'])

                for mode, gop, scale in kw['modes']:
                    corrs, included = calign.corrdist(
                        10.0, [(new_nums[s[0]][0], new_nums[s[1]][1])
                               for s in sample],
                        [(new_weights[s[0]][0], new_weights[s[1]][1])
                         for s in sample],
                        [(new_pros[s[0]][0], new_pros[s[1]][1])
                         for s in sample], gop, scale, kw['factor'],
                        self.bscorer, mode, kw['restricted_chars'])

                    # change representation of gaps
                    for a, b in list(corrs.keys()):
                        # get the correspondence count
                        d = corrs[a, b] * self._included[tA, tB] / included
                        # XXX check XXX* len(self.pairs[tA,tB]) / runs

                        # check for gaps
                        if a == '-':
                            a = util.charstring(i + 1)
                        elif b == '-':
                            b = util.charstring(j + 1)

                        corrdist[tA, tB][a, b] += d / len(kw['modes'])
        return corrdist
Ejemplo n.º 5
0
def get_confidence(alms, scorer, ref='lexstatid', gap_weight=1):
    """
    Function creates confidence scores for a given set of alignments.

    Parameters
    ----------
    alms : :py:class`~lingpy.align.sca.Alignments`
        An *Alignments* object containing already aligned strings.
    scorer : :py:class:`~lingpy.algorithm._misc.ScoreDict`
        A *ScoreDict* object which gives similarity scores for all segments in
        the alignment.
    ref : str (default="lexstatid")
        The reference entry-type, referring to the cognate-set to be used for
        the analysis.
    """
    # store all values for average scores
    values = []

    # store all correspondences
    corrs = {}

    # store occurrences
    occs = {}

    for key, msa in alms.msa[ref].items():
        # get basic stuff
        idxs = msa['ID']
        taxa = msa['taxa']
        concept = cgi.escape(alms[idxs[0], 'concept'], True)

        # get numerical representation of alignments
        if scorer:
            alignment = [class2tokens(
                alms[idxs[i], 'numbers'],
                msa['alignment'][i]) for i in range(len(idxs))]
        else:
            alignment = msa['alignment']

        # create new array for confidence
        confidence_matrix = []
        character_matrix = []

        # iterate over each taxon
        for i, taxon in enumerate(taxa):
            idx = alms.taxa.index(taxon) + 1

            # get the numerical sequence
            nums = alignment[i]

            # store confidences per line
            confidences = []

            # store chars per line
            chars = []

            # iterate over the sequence
            for j, num in enumerate(nums):
                col = [alm[j] for alm in alignment]
                score = 0
                count = 0

                # get the char
                if num != '-':
                    charA = dotjoin(taxa[i], msa['alignment'][i][j], num.split('.')[2])
                    chars += [charA]
                    try:
                        occs[charA] += [concept]
                    except:
                        occs[charA] = [concept]
                else:
                    chars += ['-']

                for k, numB in enumerate(col):
                    if k != i:
                        if num == '-' and numB == '-':
                            pass
                        else:
                            if numB != '-' and num != '-':
                                # get the second char
                                charB = dotjoin(
                                    taxa[k], msa['alignment'][k][j], numB.split('.')[2])
                                try:
                                    corrs[charA][charB] += 1
                                except:
                                    try:
                                        corrs[charA][charB] = 1
                                    except:
                                        corrs[charA] = {charB: 1}

                            gaps = False
                            if num == '-' and numB != '-':
                                numA = charstring(idx)
                                gaps = True
                            elif numB == '-' and num != '-':
                                numB = charstring(alms.taxa.index(taxa[k]))
                                numA = num
                                gaps = True
                            else:
                                numA = num

                            scoreA = scorer[numA, numB]
                            scoreB = scorer[numB, numA]
                            this_score = max(scoreA, scoreB)

                            if not gaps:
                                score += this_score
                                count += 1
                            else:
                                score += this_score * gap_weight
                                count += gap_weight

                if count:
                    score = score / count
                else:
                    score = -25

                confidences += [int(score + 0.5)]
                values += [int(score + 0.5)]
            confidence_matrix += [confidences]
            character_matrix += [chars]

        # append confidence matrix to alignments
        alms.msa[ref][key]['confidence'] = confidence_matrix
        alms.msa[ref][key]['_charmat'] = character_matrix

    # sort the values
    values = sorted(set(values + [1]))

    # make conversion to scale of 100 values
    converter = {}
    valsA = values[:values.index(1)]
    valsB = values[values.index(1):]
    stepA = 50 / (len(valsA) + 1)
    stepB = 75 / (len(valsB) + 1)
    for i, score in enumerate(valsA):  # values[:values.index(0)):
        converter[score] = int((stepA * i) / 4 + 0.5)
    for i, score in enumerate(valsB):
        converter[score] = int(stepB * i + 0.5) + 50

    # iterate over keys again
    for key, msa in alms.msa[ref].items():
        # get basic stuff
        for i, line in enumerate(msa['confidence']):
            for j, cell in enumerate(line):
                alms.msa[ref][key]['confidence'][i][j] = converter[cell]

    jsond = {}
    for key, corr in corrs.items():
        splits = [c.split('.') + [o] for c, o in corr.items()]
        sorts = sorted(splits, key=lambda x: (x[0], -x[3]))
        new_sorts = []

        # check for rowspan
        spans = {}
        for a, b, c, d in sorts:
            if a in spans:
                if spans[a] < 3 and d > 1:
                    spans[a] += 1
                    new_sorts += [[a, b, c, d]]
            else:
                if d > 1:
                    spans[a] = 1
                    new_sorts += [[a, b, c, d]]

        bestis = []
        old_lang = ''
        counter = 0
        for a, b, c, d in new_sorts:
            new_lang = a
            if new_lang != old_lang:
                old_lang = new_lang

                tmp = '<tr class="display">'
                tmp += '<td class="display" rowspan={0}>'.format(spans[a])
                tmp += a + '</td>'
                tmp += '<td class="display" onclick="show({0});"><span '.format(
                    "'" + dotjoin(a, b, c) + "'")
                tmp += 'class="char {0}">' + b + '</span></td>'
                tmp += '<td class="display">'
                tmp += c + '</td>'
                tmp += '<td class="display">' + str(d) + '</td>'
                tmp += '<td class="display">' + str(len(occs[dotjoin(a, b, c)])) + '</td>'
                tmp += '</tr>'
                t = 'dolgo_' + token2class(b, rcParams['dolgo'])

                # bad check for three classes named differently
                if t == 'dolgo__':
                    t = 'dolgo_X'
                elif t == 'dolgo_1':
                    t = 'dolgo_TONE'
                elif t == 'dolgo_0':
                    t = 'dolgo_ERROR'

                bestis += [tmp.format(t)]
                counter += 1

            elif counter > 0:
                tmp = '<tr class="display">'
                tmp += '<td class="display" onclick="show({0});"><span '.format(
                    "'" + dotjoin(a, b, c) + "'")
                tmp += 'class="char {0}">' + b + '</span></td>'
                tmp += '<td class="display">' + c + '</td>'
                tmp += '<td class="display">' + str(d) + '</td>'
                tmp += '<td class="display">' + str(len(occs[dotjoin(a, b, c)])) + '</td>'
                tmp += '</tr>'

                t = 'dolgo_' + token2class(b, rcParams['dolgo'])

                # bad check for three classes named differently
                if t == 'dolgo__':
                    t = 'dolgo_X'
                elif t == 'dolgo_1':
                    t = 'dolgo_TONE'
                elif t == 'dolgo_0':
                    t = 'dolgo_ERROR'

                bestis += [tmp.format(t)]
                counter += 1
                old_lang = new_lang
            else:
                old_lang = new_lang
                counter = 0

        jsond[key] = [''.join(bestis), occs[key]]

    return jsond