Ejemplo n.º 1
0
    def _get_partial_randist(self, **keywords):
        """
        Return the aligned results of randomly aligned sequences.

        For every pair of taxa yielded by
        ``util.multicombinations2(enumerate(self.cols))`` the word pairs in
        ``self.pairs[tA, tB]`` are cut into segments along ``self._slices``.
        The first-taxon half of one segment pair is then recombined with the
        second-taxon half of another, and the recombined pairs are handed to
        ``calign.corrdist`` once per alignment mode.

        Parameters
        ----------
        **keywords
            Overrides for the defaults taken from ``rcParams``. The values
            actually read are ``modes`` (an iterable of ``(mode, gop, scale)``
            triples that supports ``len``), ``factor``, ``restricted_chars``
            and ``runs`` (upper bound on the number of recombined pairs per
            taxon pair). ``rands``, ``limit`` and ``method`` are accepted for
            compatibility but are not used by this method.

        Returns
        -------
        dict
            Maps ``(tA, tB)`` to a ``defaultdict(float)`` keyed by ``(a, b)``
            that holds the rescaled counts, averaged over all modes.

        Notes
        -----
        ``self._included[tA, tB]`` is read for rescaling. In this file it is
        filled by ``_get_partial_corrdist``, so that method is expected to
        have run first -- NOTE(review): confirm no other code path sets it.
        """
        kw = dict(modes=rcParams['lexstat_modes'],
                  factor=rcParams['align_factor'],
                  restricted_chars=rcParams['restricted_chars'],
                  runs=rcParams['lexstat_runs'],
                  rands=rcParams['lexstat_rands'],
                  limit=rcParams['lexstat_limit'],
                  method=rcParams['lexstat_scoring_method'])
        kw.update(keywords)

        # every mode contributes an equal share to the accumulated counts
        n_modes = float(len(kw['modes']))

        corrdist = {}
        tasks = (self.width**2) / 2
        with util.pb(desc='RANDOM CORRESPONDENCE CALCULATION',
                     total=tasks) as progress:
            for (i, tA), (j, tB) in util.multicombinations2(
                    enumerate(self.cols)):
                progress.update(1)
                log.info("Calculating random alignments "
                         "for pair {0}/{1}.".format(tA, tB))
                corrdist[tA, tB] = defaultdict(float)

                # create morpheme-segmented pairs
                new_nums, new_weights, new_pros = [], [], []
                for idxA, idxB in self.pairs[tA, tB]:
                    for iA, iB in self._slices[idxA]:
                        for jA, jB in self._slices[idxB]:
                            new_nums.append(
                                (self[idxA, self._numbers][iA:iB],
                                 self[idxB, self._numbers][jA:jB]))
                            new_weights.append(
                                (self[idxA, self._weights][iA:iB],
                                 self[idxB, self._weights][jA:jB]))
                            new_pros.append(
                                (self[idxA, self._prostrings][iA:iB],
                                 self[idxB, self._prostrings][jA:jB]))

                # Choose which (x, y) recombinations to align. A flat index
                # p stands for the combination (p // n, p % n), so the n * n
                # tuples never have to be materialised just to sample from
                # them.
                n_segs = len(new_nums)
                n_combos = n_segs * n_segs
                if n_combos > kw['runs']:
                    picks = random.sample(range(n_combos), kw['runs'])
                else:
                    picks = range(n_combos)
                sample = [divmod(pick, n_segs) for pick in picks]

                # Take the first-taxon half of segment pair x and the
                # second-taxon half of segment pair y. This does not depend
                # on the alignment mode, so it is built once per taxon pair.
                rand_nums = [(new_nums[x][0], new_nums[y][1])
                             for x, y in sample]
                rand_weights = [(new_weights[x][0], new_weights[y][1])
                                for x, y in sample]
                rand_pros = [(new_pros[x][0], new_pros[y][1])
                             for x, y in sample]

                for mode, gop, scale in kw['modes']:
                    # 10.0 sits in the position that _get_partial_corrdist
                    # fills with its threshold; presumably high enough to
                    # keep every pair -- TODO confirm.
                    corrs, included = calign.corrdist(
                        10.0, rand_nums, rand_weights, rand_pros, gop, scale,
                        kw['factor'], self.bscorer, mode,
                        kw['restricted_chars'])

                    # change representation of gaps
                    for (a, b), count in corrs.items():
                        # rescale the random count by the ratio of the
                        # stored self._included value to this run's value
                        d = count * self._included[tA, tB] / included
                        # TODO(review): the original note asks whether
                        # len(self.pairs[tA, tB]) / runs should be used
                        # for scaling instead.

                        # a gap is replaced by a taxon-specific character;
                        # only one side is rewritten per correspondence
                        if a == '-':
                            a = util.charstring(i + 1)
                        elif b == '-':
                            b = util.charstring(j + 1)

                        corrdist[tA, tB][a, b] += d / n_modes
        return corrdist
Ejemplo n.º 2
0
    def _get_partial_corrdist(self, **keywords):
        """
        Use alignments to get a correspondences statistics.

        For every pair of taxa yielded by
        ``util.multicombinations2(enumerate(self.cols))`` the word pairs in
        ``self.pairs[tA, tB]`` are optionally filtered, cut into segments
        along ``self._slices`` and passed to ``calign.corrdist`` once per
        alignment mode. The resulting counts are averaged over the modes.

        Parameters
        ----------
        **keywords
            Overrides for the defaults below. The values read by this method
            are ``modes`` (an iterable of ``(mode, gop, scale)`` triples
            that supports ``len``), ``factor``, ``restricted_chars``,
            ``threshold``, ``subset`` (keep only pairs found in
            ``self.subsets[tA, tB]``), ``preprocessing`` and ``ref``.
            ``preprocessing_method``, ``preprocessing_threshold``, ``gop``
            and ``cluster_method`` are only forwarded to ``self.cluster``.
            ``split_on_tones`` is accepted but not used here.

        Returns
        -------
        dict
            Maps ``(tA, tB)`` to a ``defaultdict(float)`` keyed by ``(a, b)``
            that holds the counts averaged over all modes.

        Notes
        -----
        Side effects: ``self._included`` is reset to an empty dict and then
        receives, per taxon pair, the second value returned by
        ``calign.corrdist``. The entry is overwritten on every mode, so the
        value of the last mode is the one that remains. When
        ``preprocessing`` is set and ``ref`` is not in ``self.header``,
        ``self.cluster`` is called first.
        """
        kw = dict(
            cluster_method='upgma',
            factor=rcParams['align_factor'],
            gop=rcParams['align_gop'],
            modes=rcParams['lexstat_modes'],
            preprocessing=False,
            preprocessing_method=rcParams['lexstat_preprocessing_method'],
            preprocessing_threshold=rcParams[
                'lexstat_preprocessing_threshold'],
            split_on_tones=False,
            ref='scaid',
            restricted_chars=rcParams['restricted_chars'],
            threshold=rcParams['lexstat_scoring_threshold'],
            subset=False)
        kw.update(keywords)

        self._included = {}
        corrdist = {}

        if kw['preprocessing'] and kw['ref'] not in self.header:
            self.cluster(method=kw['preprocessing_method'],
                         threshold=kw['preprocessing_threshold'],
                         gop=kw['gop'],
                         cluster_method=kw['cluster_method'],
                         ref=kw['ref'])

        # With preprocessing the pairs are already restricted to matching
        # ``ref`` values, so a fixed threshold of 10.0 is used instead of
        # the scoring threshold.
        threshold = 10.0 if kw['preprocessing'] else kw['threshold']

        # every mode contributes an equal share to the accumulated counts
        n_modes = float(len(kw['modes']))

        with util.pb(desc='CORRESPONDENCE CALCULATION',
                     total=self.width**2 / 2) as pb:
            for (i, tA), (j, tB) in util.multicombinations2(
                    enumerate(self.cols)):
                pb.update(1)
                log.info("Calculating alignments for pair {0} / {1}.".format(
                    tA, tB))

                corrdist[tA, tB] = defaultdict(float)

                # Pair selection and segmentation do not depend on the
                # alignment mode, so they are done once per taxon pair
                # rather than once per mode.
                pairs = self.pairs[tA, tB]
                if kw['subset']:
                    allowed = self.subsets[tA, tB]
                    pairs = [pair for pair in pairs if pair in allowed]

                if kw['preprocessing']:
                    # keep only pairs whose two members share a ``ref`` value
                    matching = []
                    for pair in pairs:
                        refs = self[pair, kw['ref']]
                        if refs[0] == refs[1]:
                            matching.append(pair)
                    pairs = matching

                # create morpheme-segmented pairs
                new_nums, new_weights, new_pros = [], [], []
                for idxA, idxB in pairs:
                    for iA, iB in self._slices[idxA]:
                        for jA, jB in self._slices[idxB]:
                            new_nums.append(
                                (self[idxA, self._numbers][iA:iB],
                                 self[idxB, self._numbers][jA:jB]))
                            new_weights.append(
                                (self[idxA, self._weights][iA:iB],
                                 self[idxB, self._weights][jA:jB]))
                            new_pros.append(
                                (self[idxA, self._prostrings][iA:iB],
                                 self[idxB, self._prostrings][jA:jB]))

                for mode, gop, scale in kw['modes']:
                    corrs, self._included[tA, tB] = calign.corrdist(
                        threshold, new_nums, new_weights, new_pros, gop, scale,
                        kw['factor'], self.bscorer, mode,
                        kw['restricted_chars'])

                    # change representation of gaps
                    for (a, b), d in corrs.items():
                        # XXX check for bias XXX
                        # a gap is replaced by a taxon-specific character;
                        # only one side is rewritten per correspondence
                        if a == '-':
                            a = util.charstring(i + 1)
                        elif b == '-':
                            b = util.charstring(j + 1)
                        corrdist[tA, tB][a, b] += d / n_modes

        return corrdist