Example 1
    def partial_cluster(
            self,
            method='sca',
            threshold=0.45,
            scale=0.5,
            factor=0.3,
            restricted_chars='_T',
            mode='overlap',
            cluster_method='infomap',
            gop=-1,
            restriction='',
            ref='',
            external_function=None,
            split_on_tones=True,
            **keywords):
        """
        Cluster the words into partial cognate sets.

        Function for flat clustering of words into cognate sets.

        Parameters
        ----------
        method : {'sca','lexstat','edit-dist','turchin'} (default='sca')
            Select the method that shall be used for the calculation.
        cluster_method : {'upgma','single','complete','ward','mcl','infomap'} (default='infomap')
            Select the cluster method. 'upgma' (:evobib:`Sokal1958`) refers to
            average linkage clustering, 'mcl' to the "Markov Clustering
            Algorithm" (:evobib:`Dongen2000`), and 'infomap' to the Infomap
            community-detection algorithm.
        threshold : float (default=0.45)
            Select the threshold for the cluster approach. If set to False,
            an automatic threshold is estimated from the average distance of
            unrelated sequences (use with care).
        scale : float (default=0.5)
            Select the scale for the gap extension penalty.
        factor : float (default=0.3)
            Select the factor for extra scores for identical prosodic segments.
        restricted_chars : str (default="T_")
            Select the restricted chars (boundary markers) in the prosodic
            strings in order to enable secondary alignment.
        mode : {'global','local','overlap','dialign'} (default='overlap')
            Select the mode for the alignment analysis.
        verbose : bool (default=False)
            Define whether verbose output should be used or not.
        gop : int (default=-1)
            If 'sca' is selected as a method, define the gap opening penalty.
        restriction : {'cv'} (default="")
            Specify the restriction for calculations using the edit-distance.
            Currently, only "cv" is supported. If *edit-dist* is selected as
            *method* and *restriction* is set to *cv*, consonant-vowel matches
            will be prohibited in the calculations and the edit distance will
            be normalized by the length of the alignment rather than the length
            of the longest sequence, as described in :evobib:`Heeringa2006`.
        inflation : {int, float} (default=2)
            Specify the inflation parameter for the use of the MCL algorithm.
        expansion : int (default=2)
            Specify the expansion parameter for the use of the MCL algorithm.
        
        """
        kw = dict(
                imap_mode = True,
                post_processing = False,
                inflation=2,
                expansion=2,
                max_steps=1000,
                add_self_loops=True,
                sep=lingpy.settings.rcParams['morpheme_separator'],
                word_sep=lingpy.settings.rcParams['word_separator'],
                word_seps=lingpy.settings.rcParams['word_separators'],
                seps=lingpy.settings.rcParams['morpheme_separators'],
                mcl_logs=lambda x: -np.log2((1 - x) ** 2)
                )
        kw.update(keywords)        

        # check for parameters and add clustering, in order to make sure that
        # analyses are not repeated
        if not hasattr(self, 'params'):
            self.params = {}
        self.params['partial_cluster'] = "{0}_{1}_{2:.2f}".format(
            method, cluster_method, threshold)
        self._stamp += '# Partial Cluster: ' + self.params['partial_cluster']

        matrices = self._get_partial_matrices(method=method, scale=scale,
                factor=factor, restricted_chars=restricted_chars, mode=mode,
                gop=gop, imap_mode=kw['imap_mode'],
                split_on_tones=split_on_tones)
        k = 0
        C = defaultdict(list) # stores the pcogids
        G = {} # stores the graphs
        with pb(desc='PARTIAL SEQUENCE CLUSTERING', total=len(self.rows)) as progress:
            for concept, trace, matrix in matrices:
                progress.update(1)
                lingpy.log.info('Analyzing concept {0}...'.format(concept))
                if external_function:
                    c = external_function(threshold, matrix,
                            taxa=list(range(len(matrix))), revert=True)
                elif cluster_method == 'infomap':
                    c = extra.infomap_clustering(threshold,
                            matrix, taxa=list(range(len(matrix))), 
                            revert=True)
                elif cluster_method == 'mcl':
                    c = clustering.mcl(threshold, matrix, 
                            taxa = list(range(len(matrix))),
                            max_steps=kw['max_steps'],
                            inflation=kw['inflation'],
                            expansion=kw['expansion'],
                            add_self_loops=kw['add_self_loops'],
                            logs=kw['mcl_logs'],
                            revert=True)
                elif cluster_method in ['upgma', 'single', 'complete', 'ward']:
                    c = clustering.flat_cluster(cluster_method,
                            threshold, matrix,
                            revert=True)
                else:
                    raise ValueError("No suitable cluster method specified.")
                
                for i,(idx,pos,slc) in enumerate(trace):
                    C[idx] += [c[i] + k]
                if kw['post_processing']:
                    _g = nx.Graph()
                    for i,(idx,pos,slc) in enumerate(trace):
                        _g.add_node((i,idx,pos))
                    remove_edges = []
                    for (i, n1), (j, n2) in combinations2(enumerate(_g.nodes())):
                        if C[n1[1]][n1[2]] == C[n2[1]][n2[2]]:
                            _g.add_edge(n1, n2)
                            if n1[1] == n2[1]:
                                # get scores for n1 and n2 with all the rest in
                                # the matrix to decide for one
                                sn1, sn2 = 0, 0
                                for i,row in enumerate(matrix):
                                    sn1 += matrix[i][n1[0]]
                                    sn2 += matrix[i][n2[0]]
                                sn1 = sn1 / len(matrix)
                                sn2 = sn2 / len(matrix)
                                if sn1 <= sn2:
                                    remove_edges += [n2]
                                else:
                                    remove_edges += [n1]
                    for node in remove_edges:
                        for edge in sorted(_g[node]):
                            _g.remove_edge(node, edge)

                    for i,coms in enumerate(nx.connected_components(_g)):
                        cogid = i + 1 + k
                        for j,idx,pos in coms:
                            C[idx][pos] = cogid
                    
                    G[concept] = _g

                k += max(c.values())
        self.add_entries(ref or self._partials, C, lambda x: x)
        self.graphs = G
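
A minimal usage sketch for the method above (a hedged illustration, not part of the library source): it assumes a Partial instance built from an illustrative wordlist file 'data.tsv' with morpheme-segmented entries, and an illustrative target column 'pcogids'. Note that the 'infomap' default relies on lingpy's extra module, which needs the igraph package installed.

from lingpy.compare.partial import Partial

# 'data.tsv' is an illustrative file name for a segmented wordlist
part = Partial('data.tsv')
# flat clustering of morphemes into partial cognate sets; the resulting
# partial cognate ids are written to the column passed as `ref`
part.partial_cluster(method='sca', threshold=0.45,
                     cluster_method='infomap', ref='pcogids')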
Example 2
def simple_profile(wordlist,
                   ref='ipa',
                   semi_diacritics='hsʃ̢ɕʂʐʑʒw',
                   merge_vowels=False,
                   brackets=None,
                   splitters='/,;~',
                   merge_geminates=True,
                   bad_word="<???>",
                   bad_sound="<?>",
                   clts=None,
                   unknown_sound="!{0}"):
    """
    Create an initial Orthography Profile using Lingpy's clean_string procedure.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A wordlist from which you want to derive an initial
        orthography profile.
    ref : str (default="ipa")
        The name of the reference column in which the words are stored.
    semi_diacritics : str
        Indicate characters which can occur both as "diacritics" (second part
        in a sound) or alone.
    merge_vowels : bool (default=False)
        Indicate whether consecutive vowels should be merged.
    brackets : str (default=None)
        A string listing opening and closing bracket characters; words in
        which any of these remain after cleaning are retained as unparsable.
        Defaults to a pre-defined set of frequently occurring brackets.
    splitters : str
        The characters which force the automatic splitting of an entry.
    clts : dict (default=None)
        A dictionary(like) object that converts a given source sound into a
        potential target sound, using the get()-method of the dictionary.
        Normally, we think of a CLTS instance here (that is: a cross-linguistic
        transcription system as defined in the pyclts package).
    bad_word : str (default="«???»")
        Indicate how words that could not be parsed should be handled. Note
        that both "bad_word" and "bad_sound" are format-strings, so you can add
        formatting information here.
    bad_sound : str (default="«?»")
        Indicate how sounds that could not be converted to a sound class be
        handled. Note that both "bad_word" and "bad_sound" are format-strings,
        so you can add formatting information here.
    unknown_sound : str (default="!{0}")
        If with_clts is set to True, use this string to indicate that sounds
        are classified as "unknown sound" in the CLTS framework.    
    
    Returns
    -------
    profile : generator
        A generator of tuples (four items): the segment, its suggested
        conversion (IPA, "NULL", or a bad-word/bad-sound marker), its
        frequency, and its unicode code points.
    """
    clts = clts or {}
    nulls = set()
    bad_words = set()
    brackets = brackets or "([{『(₍⁽«)]})』⁾₎"
    profile = defaultdict(int)
    words = [wordlist[idx, ref] for idx in wordlist]
    for word in pb(words, desc='iterating over words'):
        if isinstance(word, list):
            word = ' '.join(word)
        cleaned_string = clean_string(word,
                                      semi_diacritics=semi_diacritics,
                                      merge_vowels=merge_vowels,
                                      brackets=None,
                                      ignore_brackets=False,
                                      split_entries=False,
                                      preparse=None,
                                      rules=None,
                                      merge_geminates=merge_geminates)[0]

        # retain the whole word if brackets or splitters remain in it
        if [x for x in cleaned_string if x in brackets + splitters]:
            profile[word] += 1
            bad_words.add(word)
        else:
            for segment in cleaned_string.split(' '):
                profile[segment] += 1
            for segment in [x for x in word if x not in cleaned_string]:
                profile[segment] += 1
                nulls.add(segment)

    for s, f in pb(sorted(profile.items(), key=lambda x: x[1], reverse=True),
                   desc='preparing profile'):
        sclass = token2class(s, 'dolgo')
        if s in bad_words:
            ipa = bad_word.format(s)
        elif sclass == '0' and s not in nulls:
            ipa = bad_sound.format(s)
        elif s in nulls:
            ipa = 'NULL'
        elif clts:
            sound = clts.get(s, False)
            if not sound:
                ipa = '!' + s
            else:
                ipa = text_type(sound)
        else:
            ipa = s
        yield s, ipa, text_type(f), codepoint(s)
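
A short consumption sketch for the generator above, assuming a Wordlist loaded from an illustrative 'data.tsv' with an 'ipa' column; each yielded tuple carries four string fields (grapheme, conversion, frequency, code points), so it can be written straight to a tab-separated profile:

from lingpy import Wordlist

wl = Wordlist('data.tsv')  # illustrative file name
with open('profile.tsv', 'w', encoding='utf-8') as f:
    f.write('Grapheme\tIPA\tFrequency\tCodepoints\n')
    for grapheme, ipa, frequency, codepoints in simple_profile(wl, ref='ipa'):
        f.write('\t'.join([grapheme, ipa, frequency, codepoints]) + '\n')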
Example 3
def context_profile(wordlist,
                    ref='ipa',
                    col="doculect",
                    semi_diacritics='hsʃ̢ɕʂʐʑʒw',
                    merge_vowels=False,
                    brackets=None,
                    splitters='/,;~',
                    merge_geminates=True,
                    clts=False,
                    bad_word="<???>",
                    bad_sound="<?>",
                    unknown_sound="!{0}",
                    examples=2,
                    max_entries=100):
    """
    Create an advanced Orthography Profile with context and doculect information.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A wordlist from which you want to derive an initial
        orthography profile.
    ref : str (default="ipa")
        The name of the reference column in which the words are stored.
    col : str (default="doculect")
        Indicate in which column the information on the language variety is
        stored.
    semi_diacritics : str
        Indicate characters which can occur both as "diacritics" (second part
        in a sound) or alone.
    merge_vowels : bool (default=False)
        Indicate whether consecutive vowels should be merged.
    brackets : str (default=None)
        A string listing opening and closing bracket characters; words in
        which any of these remain after cleaning are retained as unparsable.
        Defaults to a pre-defined set of frequently occurring brackets.
    splitters : str
        The characters which force the automatic splitting of an entry.
    clts : dict (default=False)
        A dictionary(like) object that converts a given source sound into a
        potential target sound, using the get()-method of the dictionary.
        Normally, we think of a CLTS instance here (that is: a cross-linguistic
        transcription system as defined in the pyclts package).
    bad_word : str (default="«???»")
        Indicate how words that could not be parsed should be handled. Note
        that both "bad_word" and "bad_sound" are format-strings, so you can add
        formatting information here.
    bad_sound : str (default="«?»")
        Indicate how sounds that could not be converted to a sound class be
        handled. Note that both "bad_word" and "bad_sound" are format-strings,
        so you can add formatting information here.
    unknown_sound : str (default="!{0}")
        If with_clts is set to True, use this string to indicate that sounds
        are classified as "unknown sound" in the CLTS framework.
    examples : int (default=2)
        Indicate the number of examples that should be printed out.
    max_entries : int (default=100)
        Maximal number of wordlist entries per segment that are considered
        when compiling the example words and language lists.

    Returns
    -------
    profile : generator
        A generator of tuples (six items): the segment (with '^' and '$'
        context markers where applicable), its suggested conversion, example
        words, languages, frequency, and unicode code points.
    """
    clts_ = clts or {}
    nulls = set()
    bad_words = set()
    brackets = brackets or "([{『(₍⁽«)]})』⁾₎"
    profile = defaultdict(list)
    errors = set()
    for idx, word, language in pb(wordlist.iter_rows(ref, col),
                                  desc='iter words',
                                  total=len(wordlist)):
        log.info('processing {0}-{1}'.format(idx, word))
        if isinstance(word, list):
            word = ' '.join(word)
        if word.strip():
            try:
                cleaned_string = clean_string(
                    word,
                    semi_diacritics=semi_diacritics,
                    merge_vowels=merge_vowels,
                    brackets=None,
                    ignore_brackets=False,
                    split_entries=False,
                    preparse=None,
                    rules=None,
                    merge_geminates=merge_geminates)[0].split(' ')

                # retain the whole word if brackets or splitters remain in it
                if [x for x in cleaned_string if x in brackets + splitters]:
                    profile[word] += [(language, word)]
                    bad_words.add(word)
                else:
                    context_pre = ['^'] + (len(cleaned_string) - 1) * ['']
                    context_post = (len(cleaned_string) - 1) * [''] + ['$']
                    for ctxA, ctxB, segment in zip(context_pre, context_post,
                                                   cleaned_string):
                        profile[ctxA + segment + ctxB] += [(language, word)]
                    for segment in [
                            x for x in word
                            if x not in ' '.join(cleaned_string)
                    ]:
                        profile[segment] += [(language, word)]
                        nulls.add(segment)
            except Exception:
                errors.add(idx)
                log.warn('problem parsing {0}'.format(word))

    for s in '^$':
        yield s, 'NULL', '', '', '', ''

    for idx, (s, entries) in pb(enumerate(
            sorted(profile.items(), key=lambda x: len(x[1]), reverse=True)),
                                desc='yielding entries',
                                total=len(profile)):
        sclass = token2class(s.strip('^$'), 'dolgo')
        words = [entry[1] for entry in entries][:max_entries]
        langs = [entry[0] for entry in entries][:max_entries]
        languages = ', '.join(
            sorted(set(langs), key=lambda x: langs.count(x), reverse=True))
        frequency = str(len(langs))
        codepoints = codepoint(s)
        examples_ = ', '.join(
            sorted(set(words), key=lambda x: words.count(x),
                   reverse=True)[:examples])
        if s in bad_words:
            ipa = bad_word.format(s)
        elif sclass == '0':
            ipa = bad_sound.format(s)
        elif s in nulls:
            ipa = 'NULL'
        elif clts_:
            sound = clts_.get(s.strip('^$'), False)
            if not sound:
                ipa = '!' + s.strip('^$')
            else:
                ipa = text_type(sound)
        else:
            ipa = s.strip('^$')

        yield s, ipa, examples_, languages, frequency, codepoints
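
The context-aware variant can be consumed the same way; here every tuple has six string fields, and '^'/'$' around a grapheme mark word-initial and word-final context. A hedged sketch with an illustrative file name:

from lingpy import Wordlist

wl = Wordlist('data.tsv')  # illustrative file name
with open('profile-context.tsv', 'w', encoding='utf-8') as f:
    f.write('Grapheme\tIPA\tExamples\tLanguages\tFrequency\tCodepoints\n')
    for row in context_profile(wl, ref='ipa', examples=2):
        f.write('\t'.join(row) + '\n')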
Example 4
    def partial_cluster(self,
                        method='sca',
                        threshold=0.45,
                        scale=0.5,
                        factor=0.3,
                        restricted_chars='_T',
                        mode='overlap',
                        cluster_method='infomap',
                        gop=-1,
                        restriction='',
                        ref='',
                        external_function=None,
                        split_on_tones=False,
                        **keywords):
        """
        Cluster the words into partial cognate sets.

        Function for flat clustering of words into cognate sets.

        Parameters
        ----------
        method : {'sca','lexstat','edit-dist','turchin'} (default='sca')
            Select the method that shall be used for the calculation.
        cluster_method : {'upgma','single','complete','ward','mcl','infomap'} (default='infomap')
            Select the cluster method. 'upgma' (:evobib:`Sokal1958`) refers to
            average linkage clustering, 'mcl' to the "Markov Clustering
            Algorithm" (:evobib:`Dongen2000`), and 'infomap' to the Infomap
            community-detection algorithm.
        threshold : float (default=0.45)
            Select the threshold for the cluster approach. If set to False,
            an automatic threshold is estimated from the average distance of
            unrelated sequences (use with care).
        scale : float (default=0.5)
            Select the scale for the gap extension penalty.
        factor : float (default=0.3)
            Select the factor for extra scores for identical prosodic segments.
        restricted_chars : str (default="T_")
            Select the restricted chars (boundary markers) in the prosodic
            strings in order to enable secondary alignment.
        mode : {'global','local','overlap','dialign'} (default='overlap')
            Select the mode for the alignment analysis.
        verbose : bool (default=False)
            Define whether verbose output should be used or not.
        gop : int (default=-1)
            If 'sca' is selected as a method, define the gap opening penalty.
        restriction : {'cv'} (default="")
            Specify the restriction for calculations using the edit-distance.
            Currently, only "cv" is supported. If *edit-dist* is selected as
            *method* and *restriction* is set to *cv*, consonant-vowel matches
            will be prohibited in the calculations and the edit distance will
            be normalized by the length of the alignment rather than the length
            of the longest sequence, as described in :evobib:`Heeringa2006`.
        inflation : {int, float} (default=2)
            Specify the inflation parameter for the use of the MCL algorithm.
        expansion : int (default=2)
            Specify the expansion parameter for the use of the MCL algorithm.
        
        """
        kw = dict(imap_mode=True,
                  post_processing=True,
                  inflation=2,
                  expansion=2,
                  max_steps=1000,
                  add_self_loops=True,
                  sep=lingpy.settings.rcParams['morpheme_separator'],
                  word_sep=lingpy.settings.rcParams['word_separator'],
                  word_seps=lingpy.settings.rcParams['word_separators'],
                  seps=lingpy.settings.rcParams['morpheme_separators'],
                  mcl_logs=lambda x: -np.log2((1 - x)**2))
        kw.update(keywords)

        # check for parameters and add clustering, in order to make sure that
        # analyses are not repeated
        if not hasattr(self, 'params'):
            self.params = {}
        self.params['partial_cluster'] = "{0}_{1}_{2:.2f}".format(
            method, cluster_method, threshold)
        self._stamp += '# Partial Cluster: ' + self.params['partial_cluster']

        matrices = self._get_partial_matrices(
            method=method,
            scale=scale,
            factor=factor,
            restricted_chars=restricted_chars,
            mode=mode,
            gop=gop,
            imap_mode=kw['imap_mode'],
            split_on_tones=split_on_tones)
        k = 0
        C = defaultdict(list)  # stores the pcogids
        G = {}  # stores the graphs
        with util.pb(desc='PARTIAL SEQUENCE CLUSTERING',
                     total=len(self.rows)) as progress:
            for concept, trace, matrix in matrices:
                progress.update(1)
                lingpy.log.info('Analyzing concept {0}...'.format(concept))
                if external_function:
                    c = external_function(threshold,
                                          matrix,
                                          taxa=list(range(len(matrix))),
                                          revert=True)
                elif cluster_method == 'infomap':
                    c = extra.infomap_clustering(threshold,
                                                 matrix,
                                                 taxa=list(range(len(matrix))),
                                                 revert=True)
                elif cluster_method == 'mcl':
                    c = clustering.mcl(threshold,
                                       matrix,
                                       taxa=list(range(len(matrix))),
                                       max_steps=kw['max_steps'],
                                       inflation=kw['inflation'],
                                       expansion=kw['expansion'],
                                       add_self_loops=kw['add_self_loops'],
                                       logs=kw['mcl_logs'],
                                       revert=True)
                elif cluster_method in ['upgma', 'single', 'complete', 'ward']:
                    c = clustering.flat_cluster(cluster_method,
                                                threshold,
                                                matrix,
                                                revert=True)
                else:
                    raise ValueError("No suitable cluster method specified.")

                for i, (idx, pos, slc) in enumerate(trace):
                    C[idx] += [c[i] + k]
                if kw['post_processing']:
                    _g = nx.Graph()
                    for i, (idx, pos, slc) in enumerate(trace):
                        _g.add_node((i, idx, pos))
                    remove_edges = []
                    for (i, n1), (j, n2) in util.combinations2(
                            enumerate(_g.nodes())):
                        if C[n1[1]][n1[2]] == C[n2[1]][n2[2]]:
                            _g.add_edge(n1, n2)
                            if n1[1] == n2[1]:
                                # get scores for n1 and n2 with all the rest in
                                # the matrix to decide for one
                                sn1, sn2 = 0, 0
                                for i, row in enumerate(matrix):
                                    sn1 += matrix[i][n1[0]]
                                    sn2 += matrix[i][n2[0]]
                                sn1 = sn1 / len(matrix)
                                sn2 = sn2 / len(matrix)
                                if sn1 <= sn2:
                                    remove_edges += [n2]
                                else:
                                    remove_edges += [n1]
                    for node in remove_edges:
                        for edge in sorted(_g[node]):
                            _g.remove_edge(node, edge)

                    for i, coms in enumerate(nx.connected_components(_g)):
                        cogid = i + 1 + k
                        for j, idx, pos in coms:
                            C[idx][pos] = cogid

                    G[concept] = _g

                k += len(matrix) + 1
        self.add_entries(ref or self._partials, C, lambda x: x)
        self.graphs = G
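
This variant enables post_processing by default and stores one networkx graph per concept in self.graphs. A hedged sketch of inspecting those graphs after clustering (file and column names illustrative):

import networkx as nx
from lingpy.compare.partial import Partial

part = Partial('data.tsv')  # illustrative file name
part.partial_cluster(method='sca', threshold=0.45, ref='pcogids')
# connected components of each concept graph correspond to the merged
# partial cognate sets; nodes are (matrix-row, wordlist-index, position)
for concept, graph in part.graphs.items():
    for component in nx.connected_components(graph):
        print(concept, sorted(component))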
Example 5
    def _get_partial_randist(self, **keywords):
        """
        Return correspondence statistics for randomly aligned sequences.
        """
        kw = dict(modes=rcParams['lexstat_modes'],
                  factor=rcParams['align_factor'],
                  restricted_chars=rcParams['restricted_chars'],
                  runs=rcParams['lexstat_runs'],
                  rands=rcParams['lexstat_rands'],
                  limit=rcParams['lexstat_limit'],
                  method=rcParams['lexstat_scoring_method'])
        kw.update(keywords)

        # determine the mode
        method = 'markov' if kw['method'] in ['markov', 'markov-chain', 'mc'] \
            else 'shuffle'

        corrdist = {}
        tasks = (self.width**2) / 2
        with util.pb(desc='RANDOM CORRESPONDENCE CALCULATION',
                     total=tasks) as progress:
            for (i, tA), (j, tB) in util.multicombinations2(
                    enumerate(self.cols)):
                progress.update(1)
                log.info("Calculating random alignments"
                         "for pair {0}/{1}.".format(tA, tB))
                corrdist[tA, tB] = defaultdict(float)

                # create morpheme-segmented pairs
                pairs = self.pairs[tA, tB]
                new_nums, new_weights, new_pros = [], [], []
                for idxA, idxB in pairs:
                    for iA, iB in self._slices[idxA]:
                        for jA, jB in self._slices[idxB]:
                            new_nums += [(self[idxA, self._numbers][iA:iB],
                                          self[idxB, self._numbers][jA:jB])]
                            new_weights += [(self[idxA, self._weights][iA:iB],
                                             self[idxB, self._weights][jA:jB])]
                            new_pros += [(self[idxA, self._prostrings][iA:iB],
                                          self[idxB, self._prostrings][jA:jB])]
                # get the number pairs etc.
                sample = [(x, y) for x in range(len(new_nums))
                          for y in range(len(new_nums))]
                if len(sample) > kw['runs']:
                    sample = random.sample(sample, kw['runs'])

                for mode, gop, scale in kw['modes']:
                    corrs, included = calign.corrdist(
                        10.0, [(new_nums[s[0]][0], new_nums[s[1]][1])
                               for s in sample],
                        [(new_weights[s[0]][0], new_weights[s[1]][1])
                         for s in sample],
                        [(new_pros[s[0]][0], new_pros[s[1]][1])
                         for s in sample], gop, scale, kw['factor'],
                        self.bscorer, mode, kw['restricted_chars'])

                    # change representation of gaps
                    for a, b in list(corrs.keys()):
                        # get the correspondence count
                        d = corrs[a, b] * self._included[tA, tB] / included
                        # XXX check XXX* len(self.pairs[tA,tB]) / runs

                        # check for gaps
                        if a == '-':
                            a = util.charstring(i + 1)
                        elif b == '-':
                            b = util.charstring(j + 1)

                        corrdist[tA, tB][a, b] += d / len(kw['modes'])
        return corrdist
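
The core of the randomization above is plain pair sampling: all index pairs over the morpheme slices are enumerated and then thinned to at most `runs` random pairs before scoring. A self-contained sketch of just that step, with illustrative sizes standing in for the real data:

import random

segments = list(range(50))   # stand-in for the morpheme-segmented pairs
runs = 1000                  # plays the role of rcParams['lexstat_runs']
sample = [(x, y) for x in range(len(segments)) for y in range(len(segments))]
if len(sample) > runs:
    sample = random.sample(sample, runs)
print(len(sample))           # min(50 * 50, 1000) == 1000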
Example 6
    def _get_partial_corrdist(self, **keywords):
        """
        Use alignments to compute correspondence statistics.
        """
        kw = dict(
            cluster_method='upgma',
            factor=rcParams['align_factor'],
            gop=rcParams['align_gop'],
            modes=rcParams['lexstat_modes'],
            preprocessing=False,
            preprocessing_method=rcParams['lexstat_preprocessing_method'],
            preprocessing_threshold=rcParams[
                'lexstat_preprocessing_threshold'],
            split_on_tones=False,
            ref='scaid',
            restricted_chars=rcParams['restricted_chars'],
            threshold=rcParams['lexstat_scoring_threshold'],
            subset=False)
        kw.update(keywords)

        self._included = {}
        corrdist = {}

        if kw['preprocessing']:
            if kw['ref'] not in self.header:
                self.cluster(method=kw['preprocessing_method'],
                             threshold=kw['preprocessing_threshold'],
                             gop=kw['gop'],
                             cluster_method=kw['cluster_method'],
                             ref=kw['ref'])

        with util.pb(desc='CORRESPONDENCE CALCULATION',
                     total=self.width**2 / 2) as progress:
            for (i, tA), (j, tB) in util.multicombinations2(
                    enumerate(self.cols)):
                progress.update(1)
                log.info("Calculating alignments for pair {0} / {1}.".format(
                    tA, tB))

                corrdist[tA, tB] = defaultdict(float)
                for mode, gop, scale in kw['modes']:
                    pairs = self.pairs[tA, tB]
                    if kw['subset']:
                        pairs = [
                            pair for pair in pairs
                            if pair in self.subsets[tA, tB]
                        ]

                    # threshold and preprocessing, make sure threshold is
                    # different from pre-processing threshold when
                    # preprocessing is set to false
                    if kw['preprocessing']:
                        pairs = [
                            pair for pair in pairs
                            if self[pair, kw['ref']][0] == self[pair, kw['ref']][1]
                        ]
                        threshold = 10.0
                    else:
                        threshold = kw['threshold']

                    # create morpheme-segmented pairs
                    new_nums, new_weights, new_pros = [], [], []
                    for idxA, idxB in pairs:
                        for iA, iB in self._slices[idxA]:
                            for jA, jB in self._slices[idxB]:
                                new_nums += [(self[idxA, self._numbers][iA:iB],
                                              self[idxB, self._numbers][jA:jB])]
                                new_weights += [(self[idxA, self._weights][iA:iB],
                                                 self[idxB, self._weights][jA:jB])]
                                new_pros += [(self[idxA, self._prostrings][iA:iB],
                                              self[idxB, self._prostrings][jA:jB])]

                    corrs, self._included[tA, tB] = calign.corrdist(
                        threshold, new_nums, new_weights, new_pros, gop, scale,
                        kw['factor'], self.bscorer, mode,
                        kw['restricted_chars'])

                    # change representation of gaps
                    for (a, b), d in corrs.items():
                        # XXX check for bias XXX
                        if a == '-':
                            a = util.charstring(i + 1)
                        elif b == '-':
                            b = util.charstring(j + 1)
                        corrdist[tA, tB][a, b] += d / float(len(kw['modes']))

        return corrdist
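
Neither helper is normally called directly: in lingpy they back the partial-cognate scorer, which turns attested (_get_partial_corrdist) and expected (_get_partial_randist) correspondence counts into log-odds scores. A hedged end-to-end sketch (file and column names illustrative):

from lingpy.compare.partial import Partial

part = Partial('data.tsv')  # illustrative file name
# derives the sound-correspondence scorer from attested vs. random alignments
part.get_partial_scorer(runs=1000)
# 'lexstat' clustering then uses that scorer
part.partial_cluster(method='lexstat', threshold=0.55, ref='lexstat_pcogids')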