Esempio n. 1
0
def pw_align(
        seqA,
        seqB,
        gop=-1,
        scale=0.5,
        scorer=False,
        mode='global',
        distance=False,
        **keywords):
    """
    Compute a pairwise alignment of two sequences.

    Parameters
    ----------
    seqA, seqB : {text_type, list, tuple}
        The sequences to be aligned. Any iterable of segments can be used,
        e.g. strings, lists, or tuples.
    gop : int (default=-1)
        The gap opening penalty.
    scale : float (default=0.5)
        The gap extension scale. Unlike a traditional gap extension penalty,
        this value "scales" the gap opening penalty.
    scorer : dict (default=False)
        A dictionary covering all segment matches between the two input
        sequences. If a false value is passed, a scorer is created
        automatically.
    mode : {"global", "local", "dialign", "overlap"} (default="global")
        The alignment mode, one of the four modes regularly implemented in
        LingPy, see :evobib:`List2012a` for details.
    distance : bool (default=False)
        If ``True``, return the distance score following the formula by
        :evobib:`Downey2008`, otherwise return the basic similarity score.
    **keywords
        Accepted for compatibility; not used by this function.

    Returns
    -------
    The result of ``talign.align_pair``; as the example below shows, this is
    a tuple of the two aligned sequences and the score.

    Examples
    --------
    Align two words using the dialign algorithm::
        >>> seqA = 'fat cat'
        >>> seqB = 'catfat'
        >>> pw_align(seqA, seqB, mode='dialign')
        (['f', 'a', 't', ' ', 'c', 'a', 't', '-', '-', '-'],
         ['-', '-', '-', '-', 'c', 'a', 't', 'f', 'a', 't'],
         3.0)

    """
    seqA, seqB = _as_lists(seqA, seqB)

    # normalise the flag to 0/1 before handing it to the aligner
    as_distance = 1 if distance else 0

    if not scorer:
        if as_distance:
            # identity scorer: 1.0 for identical segments, -1.0 otherwise,
            # stored symmetrically for every pair of occurring segments
            scorer = {}
            segments = sorted(set(seqA + seqB))
            for a, b in multicombinations2(segments):
                value = 1.0 if a == b else -1.0
                scorer[b, a] = value
                scorer[a, b] = value
        else:
            scorer = _get_scorer(seqA, seqB)

    # start alignment
    return talign.align_pair(
        seqA, seqB, gop, scale, scorer, mode, as_distance)
Esempio n. 2
0
def test_combinations2():
    """Compare ``util.combinations2`` and ``util.multicombinations2`` with
    straightforward reference implementations."""

    def strict_pairs(items):
        # reference: every pair whose first position precedes the second
        return [
            (first, second)
            for i, first in enumerate(items)
            for j, second in enumerate(items)
            if i < j]

    def pairs_with_self(items):
        # reference: as above, but an item is also paired with itself
        return [
            (first, second)
            for i, first in enumerate(items)
            for j, second in enumerate(items)
            if i <= j]

    for ch in (list(range(5)), 'abcdefg'):
        assert list(util.combinations2(ch)) == strict_pairs(ch)
        assert list(util.multicombinations2(ch)) == pairs_with_self(ch)
Esempio n. 3
0
    def test_combinations2(self):
        """Compare ``util.combinations2`` and ``util.multicombinations2``
        with straightforward reference implementations."""

        def expected_combinations(seq):
            # every pair whose first position precedes the second
            return (
                (left, right)
                for pos_left, left in enumerate(seq)
                for pos_right, right in enumerate(seq)
                if pos_left < pos_right)

        def expected_multicombinations(seq):
            # as above, but an item is also paired with itself
            return (
                (left, right)
                for pos_left, left in enumerate(seq)
                for pos_right, right in enumerate(seq)
                if pos_left <= pos_right)

        for sample in (list(range(5)), 'abcdefg'):
            self.assertEqual(
                list(util.combinations2(sample)),
                list(expected_combinations(sample)))
            self.assertEqual(
                list(util.multicombinations2(sample)),
                list(expected_multicombinations(sample)))
Esempio n. 4
0
def compile_model(model, path=None):
    """
    Function compiles customized sound-class models.

    Parameters
    ----------

    model : str
        A string indicating the name of the model which shall be created.

    path : str
        A string indicating the path where the model-folder is stored. If no
        path is given, the :file:`models` folder of the package data is used.

    Notes
    -----
    A model is defined by a folder placed in :file:`data/models` directory of
    the LingPy package. The name of the folder reflects the name of the model.
    It contains the file :file:`converter`, the file :file:`INFO`, and,
    optionally, the files :file:`matrix` and :file:`scorer`. The format
    requirements for these files are as follows:

    :file:`INFO`
        The ``INFO``-file serves as a reference for a given sound-class model.
        It can contain arbitrary information (and also be empty). If one wants
        to define specific characteristics, like the ``source``, the
        ``compiler``, the ``date``, or a ``description`` of a given model,
        this can be done by employing a key-value structure in which the key is
        preceded by an ``@`` and followed by a colon and the value is written
        right next to the key in the same line, e.g.::

            @source: Dolgopolsky (1986)

        This information will then be read from the ``INFO`` file and rendered
        when printing the model to screen with help of the :py:func:`print`
        function.

    :file:`converter`
        The ``converter`` file contains all sound classes which are matched
        with their respective sound values. Each line is reserved for one
        class, preceded by the key (preferably an ASCII-letter) representing
        the class::

            B : ɸ, β, f, p͡f, p͜f, ƀ
            E : ɛ, æ, ɜ, ɐ, ʌ, e, ᴇ, ə, ɘ, ɤ, è, é, ē, ě, ê, ɚ
            D : θ, ð, ŧ, þ, đ
            G : x, ɣ, χ
            ...

    :file:`matrix`
        A scoring matrix indicating the alignment scores of all sound-class
        characters defined by the model. The scoring is structured as a simple
        tab-delimited text file. The first cell contains the character names,
        the following cells contain the scores in redundant form (with both
        triangles being filled)::

            B  10.0 -10.0   5.0 ...
            E -10.0   5.0 -10.0 ...
            F   5.0 -10.0  10.0 ...
            ...

        If this file exists, it is used directly and the ``scorer`` file is
        not consulted.

    :file:`scorer`
        The ``scorer`` file (which is optional) contains the graph of
        class-transitions which is used for the calculation of the scoring
        dictionary. Each class is listed in a separate line, followed by the
        symbols ``v``,``c``, or ``t`` (indicating whether the class
        represents vowels, consonants, or tones), and by the classes it is
        directly connected to. The strength of this connection is indicated by
        digits (the smaller the value, the shorter the path between the
        classes)::

            A : v, E:1, O:1
            C : c, S:2
            B : c, W:2
            E : v, A:1, I:1
            D : c, S:2
            ...

        The information in such a file is automatically converted into a
        scoring dictionary (see :evobib:`List2012b` for details). The
        resulting matrix is additionally written to the :file:`matrix` file
        of the model folder.

    Based on the information provided by the files, a dictionary for the
    conversion of IPA-characters to sound classes and a scoring dictionary are
    created and stored as a binary.  The model can be loaded with help of the
    :py:class:`~lingpy.data.model.Model` class and used in the various classes
    and functions provided by the library.

    See also
    --------
    lingpy.data.model.Model
    compile_dvt

    """
    log.info("Compiling model <" + model + ">...")

    # folder that holds the source files of this model
    model_dir = os.path.join(path or util.data_path('models'), model)

    def new_path(*cmps):
        """Return the path of an item inside the model folder."""
        return os.path.join(model_dir, *cmps)

    # log the actual folder (formatting the helper itself would only show
    # the repr of a function object)
    log.debug("Model-Path: %s" % model_dir)

    # load the sound classes
    sound_classes = _import_sound_classes(new_path('converter'))

    # dump the data
    cache.dump(sound_classes, model + '.converter')
    log.info("... successfully created the converter.")

    # try to load the scoring function or the score tree
    scorer = False

    if os.path.isfile(new_path('matrix')):
        # a ready-made matrix takes precedence over the score tree
        scorer = read_scorer(new_path('matrix'))
    elif os.path.isfile(new_path('scorer')):
        score_tree = _import_score_tree(new_path('scorer'))

        # calculate the scoring dictionary
        score_dict = _make_scoring_dictionary(score_tree)

        # make score_dict a ScoreDict instance
        chars = sorted({key[0] for key in score_dict})
        matrix = [[0] * len(chars) for _ in chars]
        for (i, charA), (j, charB) in util.multicombinations2(
                enumerate(chars)):
            if i < j:
                # pairs missing from the dictionary get a strongly negative
                # default score
                matrix[i][j] = score_dict.get((charA, charB), -100)
                matrix[j][i] = score_dict.get((charB, charA), -100)
            elif i == j:
                matrix[i][j] = score_dict[charA, charB]

        scorer = misc.ScoreDict(chars, matrix)
        # store the derived matrix next to the score tree
        util.write_text_file(new_path('matrix'), scorer2str(scorer))

    if scorer:
        cache.dump(scorer, model + '.scorer')
        log.info("... successfully created the scorer.")
    else:
        log.info("... no scoring dictionary defined.")

    log.info("Model <" + model + "> was compiled successfully.")
Esempio n. 5
0
    def get_partial_scorer(self, **keywords):
        """
        Create a scoring function based on sound correspondences.

        Parameters
        ----------
        method : str (default='shuffle')
            Select between "markov", for automatically generated random
            strings, and "shuffle", for random strings taken directly from the
            data.
        ratio : tuple (default=3,2)
            Define the ratio between derived and original score for
            sound-matches.
        vscale : float (default=0.5)
            Define a scaling factor for vowels, in order to decrease their
            score in the calculations.
        runs : int (default=1000)
            Choose the number of random runs that shall be made in order to
            derive the random distribution.
        threshold : float (default=0.7)
            The threshold which is used to select those words that are
            compared in order to derive the attested distribution.
        modes : list (default = [("global",-2,0.5),("local",-1,0.5)])
            The modes which are used in order to derive the distributions from
            pairwise alignments.
        factor : float (default=0.3)
            The scaling factor for sound segments with identical prosodic
            environment.
        force : bool (default=False)
            Force recalculation of existing distribution.
        preprocessing: bool (default=False)
            Select whether SCA-analysis shall be used to derive a preliminary
            set of cognates from which the attested distribution shall be
            derived.
        rands : int (default=1000)
            If "method" is set to "markov", this parameter defines the number
            of strings to produce for the calculation of the random
            distribution.
        limit : int (default=10000)
            If "method" is set to "markov", this parameter defines the limit
            above which no more search for unique strings will be carried out.
        cluster_method : {"upgma" "single" "complete"} (default="upgma")
            Select the method to be used for the calculation of cognates in the
            preprocessing phase, if "preprocessing" is set to ``True``.
        gop : int (default=-2)
            If "preprocessing" is selected, define the gap opening penalty for
            the preprocessing calculation of cognates.
        unattested : {int, float} (default=-5)
            If a pair of sounds is not attested in the data, but expected by
            the alignment algorithm that computes the expected distribution,
            the score would be -infinity. Yet in order to allow to smooth this
            behaviour and to reduce the strictness, we set a default negative
            value which does not necessarily need to be too high, since it may
            well be that we miss a potentially good pairing in the first runs
            of alignment analyses. Use this keyword to adjust this parameter.
        unexpected : {int, float} (default=0.00001)
            If a pair is encountered in a given alignment but not expected
            according to the randomized alignments, the score would be not
            calculable, since we had to divide by zero. For this reason, we set
            a very small constant, by which the score is divided in this case.
            Note that this constant is only relevant in those cases where the
            shuffling procedure was not carried out long enough.
        smooth : {int, float} (default=1)
            Attested counts which do not exceed this value are discarded for
            pairs of different columns.
        defaults : bool (default=False)
            If set to ``True``, return the dictionary of resolved keyword
            values without computing anything.

        Returns
        -------
        The dictionary of keyword values if "defaults" is set, otherwise
        ``None``.

        Notes
        -----
        All keywords (including "restricted_chars", "subset",
        "preprocessing_threshold" and "preprocessing_method") are passed on to
        ``_get_partial_corrdist`` and ``_get_partial_randist``. On success the
        method stores the parameters in ``self.params`` and
        ``self._meta['params']``, extends ``self._stamp``, sets
        ``self._corrdist`` and ``self._randist``, and stores the new scorer in
        ``self.cscorer`` and ``self._meta['scorer']['cscorer']``.

        """
        kw = dict(
            method=rcParams['lexstat_scoring_method'],
            ratio=rcParams['lexstat_ratio'],
            vscale=rcParams['lexstat_vscale'],
            runs=rcParams['lexstat_runs'],
            threshold=rcParams['lexstat_scoring_threshold'],
            modes=rcParams['lexstat_modes'],
            factor=rcParams['align_factor'],
            restricted_chars=rcParams['restricted_chars'],
            force=False,
            preprocessing=False,
            rands=rcParams['lexstat_rands'],
            limit=rcParams['lexstat_limit'],
            cluster_method=rcParams['lexstat_cluster_method'],
            gop=rcParams['align_gop'],
            preprocessing_threshold=rcParams[
                'lexstat_preprocessing_threshold'],
            preprocessing_method=rcParams['lexstat_preprocessing_method'],
            subset=False,
            defaults=False,
            unattested=-5,
            unexpected=0.00001,
            smooth=1)
        kw.update(keywords)
        if kw['defaults']:
            return kw

        # get parameters and store them in string
        params = dict(
            ratio=kw['ratio'],
            vscale=kw['vscale'],
            runs=kw['runs'],
            scoring_threshold=kw['threshold'],
            preprocessing_threshold=kw['preprocessing_threshold'],
            modestring=':'.join('{0}-{1}-{2:.2f}'.format(a, abs(b), c)
                                for a, b, c in kw['modes']),
            factor=kw['factor'],
            restricted_chars=kw['restricted_chars'],
            method=kw['method'],
            preprocessing='{0}:{1}:{2}'.format(kw['preprocessing'],
                                               kw['cluster_method'],
                                               kw['gop']),
            unattested=kw['unattested'],
            unexpected=kw['unexpected'],
            smooth=kw['smooth'])

        parstring = '_'.join([
            '{ratio[0]}:{ratio[1]}', '{vscale:.2f}', '{runs}',
            '{scoring_threshold:.2f}', '{modestring}', '{factor:.2f}',
            '{restricted_chars}', '{method}', '{preprocessing}',
            '{preprocessing_threshold}', '{unexpected:.2f}', '{unattested:.2f}'
        ]).format(**params)

        # check for existing attributes
        if hasattr(self, 'cscorer') and not kw['force']:
            log.warning(
                "An identical scoring function has already been calculated, "
                "force recalculation by setting 'force' to 'True'.")
            return

        # check for attribute
        if hasattr(self, 'params') and not kw['force']:
            if 'cscorer' in self.params:
                if self.params['cscorer'] == params:
                    log.warning(
                        "An identical scoring function has already been "
                        "calculated, force recalculation by setting 'force'"
                        " to 'True'.")
                    return
            else:
                log.warning(
                    "A different scoring function has already been calculated, "
                    "overwriting previous settings.")

        # store parameters
        self.params = {'cscorer': params}
        self._meta['params'] = self.params
        self._stamp += "# Parameters: " + parstring + '\n'

        # get the correspondence distribution
        self._corrdist = self._get_partial_corrdist(**kw)
        # get the random distribution
        self._randist = self._get_partial_randist(**kw)

        # get the average gop
        gop = sum(m[1] for m in kw['modes']) / len(kw['modes'])

        # create the new scoring matrix as a copy of the basic one
        matrix = [list(line) for line in self.bscorer.matrix]
        char_dict = self.bscorer.chars2int

        # values which do not change inside the loops below
        weight_corr, weight_sim = kw['ratio'][0], kw['ratio'][1]
        weight_sum = sum(kw['ratio'])
        gap_symbol = rcParams['gap_symbol']

        for (i, tA), (j, tB) in util.multicombinations2(enumerate(self.cols)):
            # the sounds of each column plus the column-specific gap symbol
            for charA, charB in product(
                    list(self.freqs[tA]) + [util.charstring(i + 1)],
                    list(self.freqs[tB]) + [util.charstring(j + 1)]):
                exp = self._randist.get((tA, tB), {}).get((charA, charB),
                                                          False)
                att = self._corrdist.get((tA, tB), {}).get((charA, charB),
                                                           False)
                # in the following we follow the former lexstat protocol
                if att <= kw['smooth'] and i != j:
                    att = False

                if att and exp:
                    score = np.log2((att**2) / (exp**2))
                elif att and not exp:
                    score = np.log2((att**2) / kw['unexpected'])
                elif exp and not att:
                    score = kw['unattested']  # XXX gop ???
                else:  # elif not exp and not att:
                    score = -90  # ???

                # combine the scores
                if gap_symbol not in charA + charB:
                    sim = self.bscorer[charA, charB]
                else:
                    sim = gop

                # get the real score
                rscore = (weight_corr * score + weight_sim * sim) / weight_sum

                try:
                    iA = char_dict[charA]
                    iB = char_dict[charB]

                    # use the vowel scale
                    if charA[4] in self.vowels and charB[4] in self.vowels:
                        matrix[iA][iB] = matrix[iB][iA] = kw['vscale'] * rscore
                    else:
                        matrix[iA][iB] = matrix[iB][iA] = rscore
                except (KeyError, IndexError):
                    # best effort: characters without a slot in the basic
                    # scorer (or too short to be inspected) are skipped;
                    # anything else, e.g. an interrupt, is not swallowed
                    pass

        self.cscorer = misc.ScoreDict(self.chars, matrix)
        self._meta['scorer']['cscorer'] = self.cscorer
Esempio n. 6
0
    def _get_partial_randist(self, **keywords):
        """
        Return the aligned results of randomly aligned sequences.

        For every pair of columns, the morpheme slices of all word pairs are
        recombined (the first member of one slice pair with the second member
        of another), aligned in every requested mode, and the resulting
        correspondence counts are averaged over the modes.

        Parameters
        ----------
        modes : list
            Triples of alignment mode, gap opening penalty, and gap scale.
        factor, restricted_chars
            Passed on to ``calign.corrdist``.
        runs : int
            Maximal number of recombined pairs drawn per pair of columns; if
            there are more candidates, a random sample of this size is taken.
        method, rands, limit
            Accepted, but not used by this method.

        Returns
        -------
        dict
            Maps each pair of columns to a ``defaultdict(float)`` that maps
            pairs of segments to their expected frequency. Gaps are replaced
            by the column-specific symbol from ``util.charstring``.

        Notes
        -----
        Reads ``self._included``, which is set by ``_get_partial_corrdist``,
        so that method has to be run first.
        """
        kw = dict(modes=rcParams['lexstat_modes'],
                  factor=rcParams['align_factor'],
                  restricted_chars=rcParams['restricted_chars'],
                  runs=rcParams['lexstat_runs'],
                  rands=rcParams['lexstat_rands'],
                  limit=rcParams['lexstat_limit'],
                  method=rcParams['lexstat_scoring_method'])
        kw.update(keywords)

        n_modes = float(len(kw['modes']))

        corrdist = {}
        tasks = (self.width**2) / 2
        with util.pb(desc='RANDOM CORRESPONDENCE CALCULATION',
                     total=tasks) as progress:
            for (i, tA), (j, tB) in util.multicombinations2(
                    enumerate(self.cols)):
                progress.update(1)
                log.info("Calculating random alignments "
                         "for pair {0}/{1}.".format(tA, tB))
                corrdist[tA, tB] = defaultdict(float)

                # create morpheme-segmented pairs
                new_nums, new_weights, new_pros = [], [], []
                for idxA, idxB in self.pairs[tA, tB]:
                    for iA, iB in self._slices[idxA]:
                        for jA, jB in self._slices[idxB]:
                            new_nums.append(
                                (self[idxA, self._numbers][iA:iB],
                                 self[idxB, self._numbers][jA:jB]))
                            new_weights.append(
                                (self[idxA, self._weights][iA:iB],
                                 self[idxB, self._weights][jA:jB]))
                            new_pros.append(
                                (self[idxA, self._prostrings][iA:iB],
                                 self[idxB, self._prostrings][jA:jB]))

                # choose which slice pairs are recombined; when sampling,
                # draw flat indices so that the full square of index pairs
                # never has to be held in memory
                size = len(new_nums)
                if size * size > kw['runs']:
                    sample = [
                        divmod(flat, size) for flat in
                        random.sample(range(size * size), kw['runs'])]
                else:
                    sample = [(x, y) for x in range(size)
                              for y in range(size)]

                # the recombined input is the same for every mode
                rand_nums = [
                    (new_nums[x][0], new_nums[y][1]) for x, y in sample]
                rand_weights = [
                    (new_weights[x][0], new_weights[y][1]) for x, y in sample]
                rand_pros = [
                    (new_pros[x][0], new_pros[y][1]) for x, y in sample]

                for mode, gop, scale in kw['modes']:
                    corrs, included = calign.corrdist(
                        10.0, rand_nums, rand_weights, rand_pros, gop, scale,
                        kw['factor'], self.bscorer, mode,
                        kw['restricted_chars'])

                    # change representation of gaps
                    for (a, b), count in corrs.items():
                        # rescale the count to the number of attested pairs
                        d = count * self._included[tA, tB] / included
                        # XXX check XXX* len(self.pairs[tA,tB]) / runs

                        # check for gaps
                        if a == '-':
                            a = util.charstring(i + 1)
                        elif b == '-':
                            b = util.charstring(j + 1)

                        corrdist[tA, tB][a, b] += d / n_modes
        return corrdist
Esempio n. 7
0
    def _get_partial_corrdist(self, **keywords):
        """
        Use alignments to get correspondence statistics.

        For every pair of columns, the morpheme slices of all word pairs are
        aligned in every requested mode and the resulting correspondence
        counts are averaged over the modes.

        Parameters
        ----------
        modes : list
            Triples of alignment mode, gap opening penalty, and gap scale.
        threshold : float
            Threshold passed to ``calign.corrdist`` when no preprocessing is
            used.
        preprocessing : bool (default=False)
            If set, only word pairs with identical values in the "ref" column
            are aligned, and a fixed threshold of 10.0 is used. If the "ref"
            column does not exist yet, it is created with ``self.cluster``
            using "preprocessing_method", "preprocessing_threshold", "gop"
            and "cluster_method".
        ref : str (default='scaid')
            Name of the column that holds the preliminary cognate sets.
        subset : bool (default=False)
            If set, only pairs which also occur in ``self.subsets`` are used.
        factor, restricted_chars
            Passed on to ``calign.corrdist``.

        Returns
        -------
        dict
            Maps each pair of columns to a ``defaultdict(float)`` that maps
            pairs of segments to their attested frequency. Gaps are replaced
            by the column-specific symbol from ``util.charstring``.

        Notes
        -----
        Resets ``self._included`` and stores, for each pair of columns, the
        second value returned by ``calign.corrdist``; the value is overwritten
        for every mode, so that the one of the last mode is kept.
        """
        kw = dict(
            cluster_method='upgma',
            factor=rcParams['align_factor'],
            gop=rcParams['align_gop'],
            modes=rcParams['lexstat_modes'],
            preprocessing=False,
            preprocessing_method=rcParams['lexstat_preprocessing_method'],
            preprocessing_threshold=rcParams[
                'lexstat_preprocessing_threshold'],
            split_on_tones=False,
            ref='scaid',
            restricted_chars=rcParams['restricted_chars'],
            threshold=rcParams['lexstat_scoring_threshold'],
            subset=False)
        kw.update(keywords)

        self._included = {}
        corrdist = {}

        if kw['preprocessing'] and kw['ref'] not in self.header:
            self.cluster(method=kw['preprocessing_method'],
                         threshold=kw['preprocessing_threshold'],
                         gop=kw['gop'],
                         cluster_method=kw['cluster_method'],
                         ref=kw['ref'])

        # threshold and preprocessing, make sure threshold is different from
        # pre-processing threshold when preprocessing is set to false
        threshold = 10.0 if kw['preprocessing'] else kw['threshold']
        n_modes = float(len(kw['modes']))

        with util.pb(desc='CORRESPONDENCE CALCULATION',
                     total=self.width**2 / 2) as pb:
            for (i, tA), (j, tB) in util.multicombinations2(
                    enumerate(self.cols)):
                pb.update(1)
                log.info("Calculating alignments for pair {0} / {1}.".format(
                    tA, tB))

                corrdist[tA, tB] = defaultdict(float)

                # the selection of word pairs does not depend on the
                # alignment mode, so it is done once per pair of columns
                pairs = self.pairs[tA, tB]
                if kw['subset']:
                    subset = self.subsets[tA, tB]
                    pairs = [pair for pair in pairs if pair in subset]
                if kw['preprocessing']:
                    # keep only pairs judged cognate in the preprocessing
                    pairs = [
                        pair for pair in pairs
                        if self[pair, kw['ref']][0] == self[pair,
                                                            kw['ref']][1]
                    ]

                # create morpheme-segmented pairs
                new_nums, new_weights, new_pros = [], [], []
                for idxA, idxB in pairs:
                    for iA, iB in self._slices[idxA]:
                        for jA, jB in self._slices[idxB]:
                            new_nums.append(
                                (self[idxA, self._numbers][iA:iB],
                                 self[idxB, self._numbers][jA:jB]))
                            new_weights.append(
                                (self[idxA, self._weights][iA:iB],
                                 self[idxB, self._weights][jA:jB]))
                            new_pros.append(
                                (self[idxA, self._prostrings][iA:iB],
                                 self[idxB, self._prostrings][jA:jB]))

                for mode, gop, scale in kw['modes']:
                    corrs, self._included[tA, tB] = calign.corrdist(
                        threshold, new_nums, new_weights, new_pros, gop, scale,
                        kw['factor'], self.bscorer, mode,
                        kw['restricted_chars'])

                    # change representation of gaps
                    for (a, b), d in corrs.items():
                        # XXX check for bias XXX
                        if a == '-':
                            a = util.charstring(i + 1)
                        elif b == '-':
                            b = util.charstring(j + 1)
                        corrdist[tA, tB][a, b] += d / n_modes

        return corrdist