def _get_partial_randist(self, **keywords):
    """
    Return correspondence counts for randomly re-paired morpheme segments.

    For every pair of columns in ``self.cols`` the word pairs stored in
    ``self.pairs`` are cut into morpheme segments via ``self._slices``.
    The A-side segment of one segment pair is then combined with the B-side
    segment of another (all combinations, or ``runs`` randomly drawn ones if
    there are more than that), the combinations are passed to
    ``calign.corrdist`` once per entry of ``modes``, and the scaled counts
    are averaged over the modes.

    Parameters
    ----------
    **keywords
        Overrides for the defaults read from ``rcParams``. Used here:
        ``modes`` (sequence of ``(mode, gop, scale)`` triples), ``factor``,
        ``restricted_chars`` and ``runs``. ``rands``, ``limit`` and
        ``method`` are accepted but not used by this method.

    Returns
    -------
    dict
        Maps each ``(tA, tB)`` column pair to a ``defaultdict(float)`` of
        counts keyed by ``(a, b)``.

    Notes
    -----
    Reads ``self._included[tA, tB]``, which ``_get_partial_corrdist`` fills
    in, so that method has to be run first.
    """
    kw = dict(
        modes=rcParams['lexstat_modes'],
        factor=rcParams['align_factor'],
        restricted_chars=rcParams['restricted_chars'],
        runs=rcParams['lexstat_runs'],
        rands=rcParams['lexstat_rands'],
        limit=rcParams['lexstat_limit'],
        method=rcParams['lexstat_scoring_method'])
    kw.update(keywords)

    n_modes = len(kw['modes'])
    corrdist = {}
    tasks = (self.width ** 2) / 2
    with util.pb(desc='RANDOM CORRESPONDENCE CALCULATION',
                 total=tasks) as progress:
        for (i, tA), (j, tB) in util.multicombinations2(
                enumerate(self.cols)):
            progress.update(1)
            log.info(
                "Calculating random alignments "
                "for pair {0}/{1}.".format(tA, tB))
            corrdist[tA, tB] = defaultdict(float)

            # create morpheme-segmented pairs
            new_nums, new_weights, new_pros = [], [], []
            for idxA, idxB in self.pairs[tA, tB]:
                for iA, iB in self._slices[idxA]:
                    for jA, jB in self._slices[idxB]:
                        new_nums.append((
                            self[idxA, self._numbers][iA:iB],
                            self[idxB, self._numbers][jA:jB]))
                        new_weights.append((
                            self[idxA, self._weights][iA:iB],
                            self[idxB, self._weights][jA:jB]))
                        new_pros.append((
                            self[idxA, self._prostrings][iA:iB],
                            self[idxB, self._prostrings][jA:jB]))

            # Choose which (A-side, B-side) segment combinations to align.
            # When sampling, draw from a flat index range and decode it with
            # divmod so the n * n index tuples are never all materialised.
            n_segs = len(new_nums)
            n_combos = n_segs * n_segs
            if n_combos > kw['runs']:
                sample = [
                    divmod(flat, n_segs)
                    for flat in random.sample(range(n_combos), kw['runs'])]
            else:
                sample = [
                    (x, y) for x in range(n_segs) for y in range(n_segs)]

            # The re-paired inputs do not depend on the mode, so build them
            # once per column pair.
            # NOTE(review): assumes calign.corrdist does not modify the
            # lists it is given -- confirm.
            rand_nums = [
                (new_nums[x][0], new_nums[y][1]) for x, y in sample]
            rand_weights = [
                (new_weights[x][0], new_weights[y][1]) for x, y in sample]
            rand_pros = [
                (new_pros[x][0], new_pros[y][1]) for x, y in sample]

            for mode, gop, scale in kw['modes']:
                corrs, included = calign.corrdist(
                    10.0,  # fixed threshold for the random run
                    rand_nums,
                    rand_weights,
                    rand_pros,
                    gop,
                    scale,
                    kw['factor'],
                    self.bscorer,
                    mode,
                    kw['restricted_chars'])

                # change representation of gaps
                for (a, b), count in corrs.items():
                    # Rescale by the ratio of the value stored in
                    # self._included for this column pair to the value
                    # returned for the random run.
                    # XXX check XXX * len(self.pairs[tA, tB]) / runs
                    d = count * self._included[tA, tB] / included

                    # a '-' on either side is replaced by a symbol derived
                    # from that column's position
                    if a == '-':
                        a = util.charstring(i + 1)
                    elif b == '-':
                        b = util.charstring(j + 1)

                    corrdist[tA, tB][a, b] += d / n_modes

    return corrdist
def _get_partial_corrdist(self, **keywords):
    """
    Use alignments of attested word pairs to get correspondence statistics.

    For every pair of columns in ``self.cols`` the word pairs stored in
    ``self.pairs`` are optionally filtered, cut into morpheme segments via
    ``self._slices`` and passed to ``calign.corrdist`` once per entry of
    ``modes``. The returned counts are averaged over the modes.

    Parameters
    ----------
    **keywords
        Overrides for the defaults below (most are read from ``rcParams``):

        ``modes``
            Sequence of ``(mode, gop, scale)`` triples, one alignment run
            each.
        ``factor``, ``restricted_chars``
            Passed through to ``calign.corrdist``.
        ``threshold``
            Threshold passed to ``calign.corrdist`` when ``preprocessing``
            is off; with preprocessing on, ``10.0`` is used instead.
        ``subset``
            If true, keep only pairs that are also in
            ``self.subsets[tA, tB]``.
        ``preprocessing``
            If true, keep only pairs whose two values in column ``ref`` are
            equal. If ``ref`` is not in ``self.header``, ``self.cluster`` is
            called first with ``preprocessing_method``,
            ``preprocessing_threshold``, ``gop``, ``cluster_method`` and
            ``ref``.
        ``split_on_tones``
            Accepted but not used by this method.

    Returns
    -------
    dict
        Maps each ``(tA, tB)`` column pair to a ``defaultdict(float)`` of
        counts keyed by ``(a, b)``.

    Notes
    -----
    Resets ``self._included`` and stores, per column pair, the second value
    returned by ``calign.corrdist`` (the value from the last mode wins).
    """
    kw = dict(
        cluster_method='upgma',
        factor=rcParams['align_factor'],
        gop=rcParams['align_gop'],
        modes=rcParams['lexstat_modes'],
        preprocessing=False,
        preprocessing_method=rcParams['lexstat_preprocessing_method'],
        preprocessing_threshold=rcParams[
            'lexstat_preprocessing_threshold'],
        split_on_tones=False,
        ref='scaid',
        restricted_chars=rcParams['restricted_chars'],
        threshold=rcParams['lexstat_scoring_threshold'],
        subset=False)
    kw.update(keywords)

    self._included = {}
    corrdist = {}

    if kw['preprocessing'] and kw['ref'] not in self.header:
        self.cluster(
            method=kw['preprocessing_method'],
            threshold=kw['preprocessing_threshold'],
            gop=kw['gop'],
            cluster_method=kw['cluster_method'],
            ref=kw['ref'])

    # Make sure the threshold is different from the pre-processing
    # threshold when preprocessing is set to false.
    threshold = 10.0 if kw['preprocessing'] else kw['threshold']
    n_modes = float(len(kw['modes']))

    with util.pb(desc='CORRESPONDENCE CALCULATION',
                 total=self.width ** 2 / 2) as pb:
        for (i, tA), (j, tB) in util.multicombinations2(
                enumerate(self.cols)):
            pb.update(1)
            log.info("Calculating alignments for pair {0} / {1}.".format(
                tA, tB))
            corrdist[tA, tB] = defaultdict(float)

            # Pair selection and segmentation do not depend on the mode, so
            # they are done once per column pair rather than once per mode.
            pairs = self.pairs[tA, tB]
            if kw['subset']:
                pairs = [
                    pair for pair in pairs
                    if pair in self.subsets[tA, tB]]
            if kw['preprocessing']:
                kept = []
                for pair in pairs:
                    refs = self[pair, kw['ref']]
                    if refs[0] == refs[1]:
                        kept.append(pair)
                pairs = kept

            # create morpheme-segmented pairs
            new_nums, new_weights, new_pros = [], [], []
            for idxA, idxB in pairs:
                for iA, iB in self._slices[idxA]:
                    for jA, jB in self._slices[idxB]:
                        new_nums.append((
                            self[idxA, self._numbers][iA:iB],
                            self[idxB, self._numbers][jA:jB]))
                        new_weights.append((
                            self[idxA, self._weights][iA:iB],
                            self[idxB, self._weights][jA:jB]))
                        new_pros.append((
                            self[idxA, self._prostrings][iA:iB],
                            self[idxB, self._prostrings][jA:jB]))

            # NOTE(review): the same lists are now passed to every mode's
            # call; this assumes calign.corrdist does not modify its
            # inputs -- confirm.
            for mode, gop, scale in kw['modes']:
                corrs, self._included[tA, tB] = calign.corrdist(
                    threshold,
                    new_nums,
                    new_weights,
                    new_pros,
                    gop,
                    scale,
                    kw['factor'],
                    self.bscorer,
                    mode,
                    kw['restricted_chars'])

                # change representation of gaps
                for (a, b), d in corrs.items():
                    # XXX check for bias XXX
                    # a '-' on either side is replaced by a symbol derived
                    # from that column's position
                    if a == '-':
                        a = util.charstring(i + 1)
                    elif b == '-':
                        b = util.charstring(j + 1)
                    corrdist[tA, tB][a, b] += d / n_modes

    return corrdist