Example #1
0
def check_stats(models, wordlist, filename='results.txt', pprint=False):
    """
    Run ``tstats`` for every model and report the results as a table.

    Parameters
    ----------
    models : iterable
        The models to evaluate; each one is passed to ``tstats`` in turn.
    wordlist : object
        The wordlist handed to ``tstats`` together with each model.
    filename : str (default='results.txt')
        File the table is written to. A falsy value suppresses writing.
    pprint : bool (default=False)
        Forwarded to ``as_string``.

    Notes
    -----
    Each output line holds the model followed by the two values returned by
    ``tstats`` (formatted with two decimals), separated by tabs.
    """
    rows = []
    for model in models:
        p, z = tstats(wordlist, model, return_dists=True)
        rows.append((model, p, z))

    # build the whole table in one go instead of growing a string
    txt = ''.join('{0}\t{1:.2f}\t{2:.2f}\n'.format(*row) for row in rows)

    as_string(txt, pprint)
    if filename:
        write_text_file(filename, txt)
Example #2
0
def check_stats(models, wordlist, filename='results.txt', pprint=False):
    """
    Collect the ``tstats`` results for a series of models in a text table.

    Parameters
    ----------
    models : iterable
        The models to evaluate; each one is passed to ``tstats`` in turn.
    wordlist : object
        The wordlist handed to ``tstats`` together with each model.
    filename : str (default='results.txt')
        File the table is written to. A falsy value suppresses writing.
    pprint : bool (default=False)
        Forwarded to ``as_string``.

    Notes
    -----
    One tab-separated line is produced per model: the model itself and the
    two values returned by ``tstats``, each formatted with two decimals.
    """
    template = '{0}\t{1:.2f}\t{2:.2f}\n'

    scores = []
    for model in models:
        p, z = tstats(wordlist, model, return_dists=True)
        scores.append([model, p, z])

    lines = [template.format(model, p, z) for model, p, z in scores]
    txt = ''.join(lines)

    as_string(txt, pprint)
    if filename:
        write_text_file(filename, txt)
Example #3
0
def bcubes(wordlist,
           gold='cogid',
           test='lexstatid',
           modify_ref=False,
           pprint=True,
           per_concept=False):
    """
    Compute B-Cubed scores for test and reference datasets.

    Parameters
    ----------
    wordlist : :py:class:`lingpy.basic.wordlist.Wordlist`
        A :py:class:`lingpy.basic.wordlist.Wordlist` class or a daughter class,
        (like the :py:class:`~lingpy.compare.lexstat.LexStat` class used for the
        computation). It should have two columns indicating cognate IDs.
    gold : str (default='cogid')
        The name of the column containing the gold standard cognate
        assignments.
    test : str (default='lexstatid')
        The name of the column containing the automatically implemented cognate
        assignments.
    modify_ref : function (default=False)
        Use a function to modify the reference. If your cognate identifiers
        are numerical, for example, and negative values are assigned as
        loans, but you want to suppress this behaviour, just set this
        keyword to "abs", and all cognate IDs will be converted to their
        absolute value.
    pprint : bool (default=True)
        Print out the results
    per_concept : bool (default=False)
        Compute b-cubed scores per concept and not for the whole data in one
        piece.

    Returns
    -------
    t : tuple
        A tuple consisting of the precision, the recall, and the harmonic mean
        (F-scores).

    Notes
    -----
    B-Cubed scores were first described by :evobib:`Bagga1998` as part of an
    algorithm. Later on, :evobib:`Amigo2009` showed that they can also be used
    to compare cluster decisions. :evobib:`Hauer2011` applied the B-Cubed
    scores first to the task of automatic cognate detection.

    See also
    --------
    diff
    pairs
    """
    # without a modifier, cognate IDs are passed through ``identity``
    evl = modify_ref if modify_ref else identity

    def get_scores(one, other):
        # yields one score per item of every cluster found in column `one`
        etymdict = wordlist.get_etymdict(ref=one, modify_ref=modify_ref)
        for _, entries in etymdict.items():
            members = [evl(entry[0]) for entry in entries if entry != 0]

            # clusters with at most one member always score 1.0
            if len(members) <= 1:
                yield 1.0
                continue

            # cognate IDs which the members carry in the other column
            mapped = [evl(wordlist[idx, other]) for idx in members]
            for cog in mapped:
                yield mapped.count(cog) / len(members)

    if not per_concept:
        # b-cubed recall first, then b-cubed precision
        bcr = list(get_scores(gold, test))
        bcp = list(get_scores(test, gold))
        fsc = []
    else:
        bcr, bcp, fsc = [], [], []
        for concept in wordlist.rows:
            cogs_gold = _get_cogs(gold, concept, evl, wordlist)
            cogs_test = _get_cogs(test, concept, evl, wordlist)
            recall = _get_bcubed_score(cogs_gold, cogs_test)
            precision = _get_bcubed_score(cogs_test, cogs_gold)
            fscore = 2 * ((recall * precision) / (precision + recall))
            bcr.append(recall)
            bcp.append(precision)
            fsc.append(fscore)

            row = '{0:15}\t{1:.2f}\t{2:.2f}\t{3:.2f}'.format(
                concept, precision, recall, fscore)
            as_string(row, pprint=pprint)

    # calculate general scores
    BCP = sum(bcp) / len(bcp)
    BCR = sum(bcr) / len(bcr)
    if fsc:
        # per-concept mode: average the individual F-scores
        FSC = sum(fsc) / len(fsc)
    else:
        FSC = 2 * ((BCP * BCR) / (BCP + BCR))

    as_string(_format_results('B-Cubed', BCP, BCR, FSC), pprint=pprint)

    return BCP, BCR, FSC
Example #4
0
def diff(wordlist,
         gold='cogid',
         test='lexstatid',
         modify_ref=False,
         pprint=True,
         filename='',
         tofile=True,
         transcription="ipa"):
    r"""
    Write differences in classifications on an item-basis to file.

    Parameters
    ----------
    wordlist : :py:class:`lingpy.compare.lexstat.LexStat`
        The :py:class:`~lingpy.compare.lexstat.LexStat` class used for the
        computation. It should have two columns indicating cognate IDs.
    gold : str (default='cogid')
        The name of the column containing the gold standard cognate
        assignments.
    test : str (default='lexstatid')
        The name of the column containing the automatically implemented cognate
        assignments.
    modify_ref : function (default=False)
        Use a function to modify the reference. If your cognate identifiers
        are numerical, for example, and negative values are assigned as
        loans, but you want to suppress this behaviour, just set this
        keyword to "abs", and all cognate IDs will be converted to their
        absolute value.
    pprint : bool (default=True)
        Print out the results
    filename : str (default='')
        Name of the output file. If not specified, it is identical with the
        name of the :py:class:`~lingpy.compare.lexstat.LexStat`, but with the
        extension ``diff``.
    tofile : bool (default=True)
        If set to ``False``, no data will be written to file, but instead, the
        data will be returned.
    transcription : str (default="ipa")
        The column in which the transcriptions are located (should be a string,
        no segmentized version, for convenience of writing to file).

    Returns
    -------
    t : tuple
        Only returned if **tofile** is ``False``. A nested tuple consisting of
        two further tuples. The first containing precision, recall, and
        harmonic mean (F-scores), the second containing the same values for
        the pair-scores.

    Notes
    -----
    If the **tofile** option is chosen, the results are written to a specific
    file with the extension ``diff``. This file contains all cognate sets in
    which there are differences between gold standard and test sets. It also
    gives detailed information regarding false positives, false negatives, and
    the words involved in these wrong decisions.

    See also
    --------
    bcubes
    pairs
    """
    filename = filename or wordlist.filename
    loan = modify_ref if modify_ref else identity

    # The report is buffered and written in one go at the very end, so no
    # file handle stays open if an error occurs while the scores are computed.
    report = []

    # get a formatter for language names
    lform = '{0:' + str(max([len(l) for l in wordlist.cols])) + '}'

    preB, recB = [], []
    preP, recP = [], []

    def get_pairs(cogs, idxs):
        # group the word indices by cognate ID, then yield all index pairs
        # which share a cognate ID
        tmp = defaultdict(list)
        for x, y in zip(cogs, idxs):
            tmp[x].append(y)
        for x in tmp:
            for yA, yB in combinations(tmp[x], r=2):
                yield tuple(sorted([yA, yB]))

    for concept in wordlist.rows:
        idxs = wordlist.get_list(row=concept, flat=True)

        cogsG = _get_cogs(gold, concept, loan, wordlist)
        cogsT = _get_cogs(test, concept, loan, wordlist)

        if cogsG != cogsT:
            # b-cubed precision and recall for the concept
            preB.append(_get_bcubed_score(cogsT, cogsG))
            recB.append(_get_bcubed_score(cogsG, cogsT))

            # pair precision and recall; an empty pair set counts as perfect
            pairsG = set(get_pairs(cogsG, idxs))
            pairsT = set(get_pairs(cogsT, idxs))
            shared = len(pairsT.intersection(pairsG))
            preP.append(shared / len(pairsT) if pairsT else 1.0)
            recP.append(shared / len(pairsG) if pairsG else 1.0)

            if tofile:
                fp = "no" if preP[-1] == 1.0 else "yes"
                fn = "no" if recP[-1] == 1.0 else "yes"
                report.append(
                    "Concept: {0}, False Positives: {1}, False Negatives: {2}\n"
                    .format(concept, fp, fn))

                # get the words from the requested transcription column
                words = [wordlist[i, transcription] for i in idxs]
                langs = [wordlist[i, 'taxa'] for i in idxs]

                # get a word-formatter
                wform = '{0:' + str(max([len(w) for w in words])) + '}'

                # list the items of the concept, sorted by gold and test ID
                for word, lang, cG, cT in sorted(zip(words, langs, cogsG,
                                                     cogsT),
                                                 key=lambda x: (x[2], x[3])):
                    report.append('{0}\t{1}\t{2:4}\t{3:4}\n'.format(
                        lform.format(lang), wform.format(word), cG, cT))
                report.append('#\n')
        else:
            # identical classifications score perfectly on every measure
            preB.append(1.0)
            recB.append(1.0)
            preP.append(1.0)
            recP.append(1.0)

    bp = sum(preB) / len(preB)
    br = sum(recB) / len(recB)
    bf = 2 * (bp * br) / (bp + br)
    pp = sum(preP) / len(preP)
    pr = sum(recP) / len(recP)
    pf = 2 * (pp * pr) / (pp + pr)

    as_string(_format_results('B-Cubed', bp, br, bf) +
              _format_results('Pair', pp, pr, pf),
              pprint=pprint)

    if tofile:
        report.extend([
            'B-Cubed Scores:\n',
            'Precision: {0:.4f}\n'.format(bp),
            'Recall:    {0:.4f}\n'.format(br),
            'F-Score:   {0:.4f}\n'.format(bf),
            '#\n',
            'Pair Scores:\n',
            'Precision: {0:.4f}\n'.format(pp),
            'Recall:    {0:.4f}\n'.format(pr),
            'F-Score:   {0:.4f}\n'.format(pf),
        ])
        # the context manager closes the file even if writing fails
        with codecs.open(filename + '.diff', 'w', 'utf-8') as f:
            f.write(''.join(report))
        log.file_written(filename + '.diff')
    else:
        return (bp, br, bf), (pp, pr, pf)
Example #5
0
def pairs(lex,
          gold='cogid',
          test='lexstatid',
          modify_ref=False,
          pprint=True,
          _return_string=False):
    """
    Compute pair scores for the evaluation of cognate detection algorithms.

    Parameters
    ----------
    lex : :py:class:`lingpy.compare.lexstat.LexStat`
        The :py:class:`~lingpy.compare.lexstat.LexStat` class used for the
        computation. It should have two columns indicating cognate IDs.
    gold : str (default='cogid')
        The name of the column containing the gold standard cognate
        assignments.
    test : str (default='lexstatid')
        The name of the column containing the automatically implemented cognate
        assignments.
    modify_ref : function (default=False)
        Use a function to modify the reference. If your cognate identifiers
        are numerical, for example, and negative values are assigned as
        loans, but you want to suppress this behaviour, just set this
        keyword to "abs", and all cognate IDs will be converted to their
        absolute value.
    pprint : bool (default=True)
        Print out the results
    _return_string : bool (default=False)
        Not read by this function; kept so that existing calls keep working.

    Returns
    -------
    t : tuple
        A tuple consisting of the precision, the recall, and the harmonic mean
        (F-scores).

    Notes
    -----
    Pair-scores can be computed in different ways, with often different
    results. This variant follows the description by :evobib:`Bouchard-Cote2013`.

    If one of the two columns yields no pairs at all (for example, because
    every word forms its own cluster), the corresponding score is set to 1.0,
    the same convention :py:func:`diff` uses for its per-concept pair scores.
    If precision and recall are both 0, the F-score is reported as 0.0.

    See also
    --------
    diff
    bcubes
    """
    # if loans are treated as homologs
    evl = modify_ref if modify_ref else identity

    def get_pairs(ref):
        # yield every pair of items assigned to the same cluster in `ref`
        for key, line in lex.get_etymdict(ref=ref,
                                          modify_ref=modify_ref).items():
            line = [evl(x[0]) for x in line if x != 0]
            for a, b in combinations(line, r=2):
                yield tuple(sorted([a, b]))

    pairsG = set(get_pairs(gold))
    pairsT = set(get_pairs(test))
    shared = len(pairsG.intersection(pairsT))

    # calculate precision and recall, guarding against empty pair sets
    pp = shared / len(pairsT) if pairsT else 1.0
    pr = shared / len(pairsG) if pairsG else 1.0
    # no shared pair at all: the harmonic mean is defined as 0 here
    fs = 2 * (pp * pr) / (pp + pr) if (pp + pr) else 0.0

    # print the results if this option is chosen
    as_string(_format_results('Pairs', pp, pr, fs), pprint=pprint)

    return pp, pr, fs
Example #6
0
def partial_bcubes(wordlist, gold, test, pprint=True):
    """
    Compute B-Cubed scores for test and reference datasets for partial
    cognate detection.

    Parameters
    ----------
    wordlist : :py:class:`~lingpy.basic.wordlist.Wordlist`
        A :py:class:`~lingpy.basic.wordlist.Wordlist`, or one of its daughter
        classes (like, e.g., the :py:class:`~lingpy.compare.partial.Partial`
        class used for computation of partial cognates). It should have two
        columns indicating cognate IDs.
    gold : str
        The name of the column containing the gold standard cognate
        assignments.
    test : str
        The name of the column containing the automatically implemented cognate
        assignments.
    pprint : bool (default=True)
        Print out the results

    Returns
    -------
    t : tuple
        A tuple consisting of the precision, the recall, and the harmonic mean
        (F-scores).

    Notes
    -----
    B-Cubed scores were first described by :evobib:`Bagga1998` as part of an
    algorithm. Later on, :evobib:`Amigo2009` showed that they can also be used
    to compare cluster decisions. :evobib:`Hauer2011` applied the B-Cubed
    scores first to the task of automatic cognate detection.

    See also
    --------
    bcubes
    diff
    pairs
    """

    # here's the point with bcubes for fuzzy: if we compare, we need to make
    # sure we count whether one instance is identical, not whether all of them
    # are identical!

    def get_scores(one, other):
        scores = []
        for k, v in wordlist.get_etymdict(ref=one).items():
            # positions (inside each word's ID list) and word indices of all
            # occurrences of cognate ID `k`, kept in parallel
            poss, idxs = [], []
            for val in v:
                if val != 0:
                    for idx in val:
                        new_pos = [
                            i for i, cog in enumerate(wordlist[idx, one])
                            if cog == k
                        ]
                        idxs += [idx] * len(new_pos)
                        poss += new_pos
            if len(idxs) > 1:
                # IDs found at the same positions in the other column
                other_idxs = [
                    wordlist[idx, other][pos] for pos, idx in zip(poss, idxs)
                ]
                for idx in other_idxs:
                    scores.append(other_idxs.count(idx) / len(idxs))
            else:
                scores.append(1)
        return sum(scores) / len(scores)

    bcr = get_scores(gold, test)
    bcp = get_scores(test, gold)
    bcf = 2 * ((bcp * bcr) / (bcp + bcr))

    as_string(_format_results('B-Cubed', bcp, bcr, bcf), pprint=pprint)
    return bcp, bcr, bcf
Example #7
0
    def align(self, **keywords):
        r"""
        Align a pair of sequences or multiple sequence pairs.

        Parameters
        ----------
        gop : int (default=-1)
            The gap opening penalty (GOP).
        scale : float (default=0.5)
            The gap extension penalty (GEP), calculated with help of a scaling
            factor.
        mode : {"global","local","overlap","dialign"}
            The alignment mode, see :evobib:`List2012a` for details.
        factor : float (default = 0.3)
            The factor by which matches in identical prosodic position are
            increased.
        restricted_chars : str (default="T\_")
            The restricted chars that function as an indicator of syllable or
            morpheme breaks for secondary alignment, see :evobib:`List2012c`
            for details.
        distance : bool (default=False)
            If set to *True*, return the distance instead of the similarity
            score. Distance is calculated using the formula by
            :evobib:`Downey2008`.
        model : { None, ~lingpy.data.model.Model }
            Specify the sound class model that shall be used for the analysis.
            If no model is specified, the default model of :evobib:`List2012a`
            will be used.
        pprint : bool (default=False)
            If set to *True*, the alignments are printed to the screen.

        """
        # NOTE: the docstring is a raw string so that the backslash in "T\_"
        # is not treated as an (invalid) escape sequence.
        setdefaults(
            keywords,
            gop=-1,
            scale=0.5,
            mode='global',
            factor=0.3,
            restricted_chars='T_',
            distance=False,
            model=rcParams['sca'],
            pprint=False,
            transform=rcParams['align_transform'])

        # (re)set the model if none is stored yet or a different one is asked for
        if not hasattr(self, 'model') or keywords['model'] != self.model:
            self._set_model(**keywords)

        # create the alignments array
        self._alignments = calign.align_pairs(
            self.classes,
            self.weights,
            self.prostrings,
            keywords['gop'],
            keywords['scale'],
            keywords['factor'],
            self.scoredict,
            keywords['mode'],
            keywords['restricted_chars'],
            distance=1 if keywords['distance'] else 0)

        # switch back to alignments; the mode does not change inside the loop
        local = keywords['mode'] == "local"
        self.alignments = []
        for i, (almA, almB, sim) in enumerate(self._alignments):
            tokA, tokB = self.tokens[i][0], self.tokens[i][1]
            self.alignments.append((
                class2tokens(tokA, almA, local=local),
                class2tokens(tokB, almB, local=local),
                sim))

        # print the alignments, if this is chosen
        as_string(self, pprint=keywords['pprint'])
Example #8
0
def test_as_string():
    """Check that ``util.as_string`` hands back 'text' when pprint is off."""
    assert util.as_string('text', pprint=False) == 'text'
Example #9
0
def diff(wordlist,
         gold='cogid',
         test='lexstatid',
         modify_ref=False,
         pprint=True,
         filename='',
         tofile=True,
         transcription="ipa",
         concepts=False):
    r"""
    Write differences in classifications on an item-basis to file.

    Parameters
    ----------
    wordlist : :py:class:`lingpy.compare.lexstat.LexStat`
        The :py:class:`~lingpy.compare.lexstat.LexStat` class used for the
        computation. It should have two columns indicating cognate IDs.
    gold : str (default='cogid')
        The name of the column containing the gold standard cognate
        assignments.
    test : str (default='lexstatid')
        The name of the column containing the automatically implemented cognate
        assignments.
    modify_ref : function (default=False)
        Use a function to modify the reference. If your cognate identifiers
        are numerical, for example, and negative values are assigned as
        loans, but you want to suppress this behaviour, just set this
        keyword to "abs", and all cognate IDs will be converted to their
        absolute value.
    pprint : bool (default=True)
        Print out the results
    filename : str (default='')
        Name of the output file. If not specified, it is identical with the
        name of the :py:class:`~lingpy.compare.lexstat.LexStat`, but with the
        extension ``diff``.
    tofile : bool (default=True)
        If set to ``False``, no data will be written to file. The scores are
        returned in either case.
    transcription : str (default="ipa")
        The column in which the transcriptions are located (should be a string,
        no segmentized version, for convenience of writing to file).
    concepts : list (default=False)
        Restrict the comparison to the given concepts. If no value is passed,
        all concepts of the wordlist are compared.

    Returns
    -------
    t : tuple
        A nested tuple consisting of two further tuples. The first
        containing precision, recall, and harmonic mean
        (F-scores), the second containing the same values for the pair-scores.

    Notes
    -----
    If the **tofile** option is chosen, the results are written to a specific
    file with the extension ``diff``. This file contains all cognate sets in
    which there are differences between gold standard and test sets. It also
    gives detailed information regarding false positives, false negatives, and
    the words involved in these wrong decisions.

    See also
    --------
    bcubes
    pairs
    """
    filename = filename or wordlist.filename
    loan = modify_ref if modify_ref else identity

    # lines of the report, joined for printing and handed to the file writer
    lines = []

    # concepts, allow to check scores for only one concept
    concepts = concepts or [c for c in wordlist.rows]

    # get a formatter for language names
    lform = '{0:' + str(max([len(l) for l in wordlist.cols])) + '}'

    preB, recB = [], []
    preP, recP = [], []

    def get_pairs(cogs, idxs):
        # group the word indices by cognate ID, then yield all index pairs
        # which share a cognate ID
        tmp = defaultdict(list)
        for x, y in zip(cogs, idxs):
            tmp[x].append(y)
        for x in tmp:
            for yA, yB in combinations(tmp[x], r=2):
                yield tuple(sorted([yA, yB]))

    for concept in concepts:
        idxs = wordlist.get_list(row=concept, flat=True)

        cogsG = _get_cogs(gold, concept, loan, wordlist)
        cogsT = _get_cogs(test, concept, loan, wordlist)

        if cogsG != cogsT:
            # b-cubed precision and recall for the concept
            preB.append(_get_bcubed_score(cogsT, cogsG))
            recB.append(_get_bcubed_score(cogsG, cogsT))

            # pair precision and recall; an empty pair set counts as perfect
            pairsG = set(get_pairs(cogsG, idxs))
            pairsT = set(get_pairs(cogsT, idxs))
            shared = len(pairsT.intersection(pairsG))
            preP.append(shared / len(pairsT) if pairsT else 1.0)
            recP.append(shared / len(pairsG) if pairsG else 1.0)
            fp = "no" if preP[-1] == 1.0 else "yes"
            fn = "no" if recP[-1] == 1.0 else "yes"

            lines.append(
                "Concept: {0}, False Positives: {1}, False Negatives: {2}".
                format(concept, fp, fn))

            # get the words from the requested transcription column
            words = [wordlist[i, transcription] for i in idxs]
            langs = [wordlist[i, 'taxa'] for i in idxs]

            # get a word-formatter
            wform = '{0:' + str(max([len(w) for w in words])) + '}'

            # list the items of the concept, sorted by gold and test ID
            for word, lang, cG, cT in sorted(zip(words, langs, cogsG, cogsT),
                                             key=lambda x: (x[2], x[3])):
                lines.append('{0}\t{1}\t{2:4}\t{3:4}'.format(
                    lform.format(lang), wform.format(word), cG, cT))
            lines.append('#')
        else:
            # identical classifications score perfectly on every measure
            preB.append(1.0)
            recB.append(1.0)
            preP.append(1.0)
            recP.append(1.0)

    bp = sum(preB) / len(preB)
    br = sum(recB) / len(recB)
    bf = 2 * (bp * br) / (bp + br)
    pp = sum(preP) / len(preP)
    pr = sum(recP) / len(recP)
    pf = 2 * (pp * pr) / (pp + pr)

    as_string('\n'.join(lines), pprint=pprint)

    if tofile:
        write_text_file(filename + '.diff', lines)
    return (bp, br, bf), (pp, pr, pf)
Example #10
0
def bcubes(wordlist, gold='cogid', test='lexstatid', modify_ref=False, pprint=True, 
        per_concept=False):
    """
    Compute B-Cubed scores for test and reference datasets.

    Parameters
    ----------
    wordlist : :py:class:`lingpy.basic.wordlist.Wordlist`
        A :py:class:`lingpy.basic.wordlist.Wordlist` class or a daughter class,
        (like the :py:class:`~lingpy.compare.lexstat.LexStat` class used for the
        computation). It should have two columns indicating cognate IDs.
    gold : str (default='cogid')
        The name of the column containing the gold standard cognate
        assignments.
    test : str (default='lexstatid')
        The name of the column containing the automatically implemented cognate
        assignments.
    modify_ref : function (default=False)
        Use a function to modify the reference. If your cognate identifiers
        are numerical, for example, and negative values are assigned as
        loans, but you want to suppress this behaviour, just set this
        keyword to "abs", and all cognate IDs will be converted to their
        absolute value.
    pprint : bool (default=True)
        Print out the results
    per_concept : bool (default=False)
        Compute b-cubed scores per concept and not for the whole data in one
        piece.

    Returns
    -------
    t : tuple
        A tuple consisting of the precision, the recall, and the harmonic mean
        (F-scores).

    Notes
    -----
    B-Cubed scores were first described by :evobib:`Bagga1998` as part of an
    algorithm. Later on, :evobib:`Amigo2009` showed that they can also be used
    to compare cluster decisions. :evobib:`Hauer2011` applied the B-Cubed
    scores first to the task of automatic cognate detection.

    See also
    --------
    diff
    pairs
    """
    # without a modifier, cognate IDs are passed through ``identity``
    evl = modify_ref if modify_ref else identity

    def get_scores(one, other):
        # yields one score per item of every cluster found in column `one`
        clusters = wordlist.get_etymdict(ref=one, modify_ref=modify_ref)
        for _, cells in clusters.items():
            items = [evl(cell[0]) for cell in cells if cell != 0]
            if len(items) > 1:
                # cognate IDs which the items carry in the other column
                counterpart = [evl(wordlist[idx, other]) for idx in items]
                for cog in counterpart:
                    yield counterpart.count(cog) / len(items)
            else:
                # clusters with at most one item always score 1.0
                yield 1.0

    if per_concept:
        bcr, bcp, fsc = [], [], []
        template = '{0:15}\t{1:.2f}\t{2:.2f}\t{3:.2f}'
        for concept in wordlist.rows:
            cogs_gold = _get_cogs(gold, concept, evl, wordlist)
            cogs_test = _get_cogs(test, concept, evl, wordlist)
            recall = _get_bcubed_score(cogs_gold, cogs_test)
            precision = _get_bcubed_score(cogs_test, cogs_gold)
            fscore = 2 * ((recall * precision) / (precision + recall))
            bcr.append(recall)
            bcp.append(precision)
            fsc.append(fscore)

            as_string(template.format(concept, precision, recall, fscore),
                      pprint=pprint)
    else:
        # b-cubed recall first, then b-cubed precision
        bcr = list(get_scores(gold, test))
        bcp = list(get_scores(test, gold))
        fsc = []

    # calculate general scores
    BCP = sum(bcp) / len(bcp)
    BCR = sum(bcr) / len(bcr)
    if fsc:
        # per-concept mode: average the individual F-scores
        FSC = sum(fsc) / len(fsc)
    else:
        FSC = 2 * ((BCP * BCR) / (BCP + BCR))

    as_string(_format_results('B-Cubed', BCP, BCR, FSC), pprint=pprint)

    return BCP, BCR, FSC
Example #11
0
def diff(
        wordlist,
        gold='cogid',
        test='lexstatid',
        modify_ref=False,
        pprint=True,
        filename='',
        tofile=True,
        transcription="ipa"):
    r"""
    Write differences in classifications on an item-basis to file.

    Parameters
    ----------
    wordlist : :py:class:`lingpy.compare.lexstat.LexStat`
        The :py:class:`~lingpy.compare.lexstat.LexStat` class used for the
        computation. It should have two columns indicating cognate IDs.
    gold : str (default='cogid')
        The name of the column containing the gold standard cognate
        assignments.
    test : str (default='lexstatid')
        The name of the column containing the automatically implemented cognate
        assignments.
    modify_ref : function (default=False)
        Use a function to modify the reference. If your cognate identifiers
        are numerical, for example, and negative values are assigned as
        loans, but you want to suppress this behaviour, just set this
        keyword to "abs", and all cognate IDs will be converted to their
        absolute value.
    pprint : bool (default=True)
        Print out the results
    filename : str (default='')
        Name of the output file (without extension). If not specified, the
        ``filename`` attribute of the wordlist is used. The extension
        ``.diff`` is always appended.
    tofile : bool (default=True)
        If set to ``False``, no data will be written to file, but instead, the
        scores will be returned.
    transcription : str (default="ipa")
        The column in which the transcriptions are located (should be a
        string, no segmentized version, for convenience of writing to file).

    Returns
    -------
    t : tuple or None
        If **tofile** is ``False``, a nested tuple consisting of two further
        tuples. The first containing precision, recall, and harmonic mean
        (F-scores) of the B-Cubed scores, the second containing the same
        values for the pair-scores. If **tofile** is ``True``, nothing is
        returned.

    Notes
    -----
    If the **tofile** option is chosen, the results are written to a specific
    file with the extension ``diff``. This file contains all cognate sets in
    which there are differences between gold standard and test sets. It also
    gives detailed information regarding false positives, false negatives, and
    the words involved in these wrong decisions.

    The scores are averaged over concepts: a concept whose gold and test
    assignments are identical contributes 1.0 to every score. An F-score is
    reported as 0.0 when precision and recall are both zero.

    See also
    --------
    bcubes
    pairs
    """
    filename = filename or wordlist.filename
    loan = modify_ref if modify_ref else identity

    # get a formatter for language names
    lform = '{0:' + str(max(len(l) for l in wordlist.cols)) + '}'

    # The report is buffered and written in one go at the very end, so the
    # file handle cannot be left open when the comparison fails half-way.
    report = []

    preB, recB = [], []
    preP, recP = [], []

    def get_pairs(cogs, idxs):
        # yield every (sorted) pair of word indices sharing a cognate ID
        tmp = defaultdict(list)
        for cog, idx in zip(cogs, idxs):
            tmp[cog].append(idx)
        for members in tmp.values():
            for yA, yB in combinations(members, r=2):
                yield tuple(sorted([yA, yB]))

    for concept in wordlist.rows:
        idxs = wordlist.get_list(row=concept, flat=True)

        cogsG = _get_cogs(gold, concept, loan, wordlist)
        cogsT = _get_cogs(test, concept, loan, wordlist)

        if cogsG == cogsT:
            # nothing to report for this concept: perfect scores throughout
            preB.append(1.0)
            recB.append(1.0)
            preP.append(1.0)
            recP.append(1.0)
            continue

        # b-cubed precision and recall for this concept
        preB.append(_get_bcubed_score(cogsT, cogsG))
        recB.append(_get_bcubed_score(cogsG, cogsT))

        # pair precision and recall; an empty pair set counts as perfect
        pairsG = set(get_pairs(cogsG, idxs))
        pairsT = set(get_pairs(cogsT, idxs))
        shared = len(pairsT & pairsG)
        preP.append(shared / len(pairsT) if pairsT else 1.0)
        recP.append(shared / len(pairsG) if pairsG else 1.0)

        if not tofile:
            continue

        fp = "no" if preP[-1] == 1.0 else "yes"
        fn = "no" if recP[-1] == 1.0 else "yes"
        report.append(
            "Concept: {0}, False Positives: {1}, False Negatives: {2}\n".format(
                concept, fp, fn))

        # get the words (from the requested transcription column) and languages
        words = [wordlist[i, transcription] for i in idxs]
        langs = [wordlist[i, 'taxa'] for i in idxs]

        # get a word-formatter
        wform = '{0:' + str(max(len(w) for w in words)) + '}'

        # list the words of this concept, grouped by gold and test cognate ID
        for word, lang, cG, cT in sorted(
                zip(words, langs, cogsG, cogsT),
                key=lambda x: (x[2], x[3])):
            report.append('{0}\t{1}\t{2:4}\t{3:4}\n'.format(
                lform.format(lang), wform.format(word), cG, cT))
        report.append('#\n')

    bp = sum(preB) / len(preB)
    br = sum(recB) / len(recB)
    # guard the harmonic mean against precision == recall == 0
    bf = 2 * (bp * br) / (bp + br) if bp + br else 0.0
    pp = sum(preP) / len(preP)
    pr = sum(recP) / len(recP)
    pf = 2 * (pp * pr) / (pp + pr) if pp + pr else 0.0

    as_string(_format_results('B-Cubed', bp, br, bf) +
              _format_results('Pair', pp, pr, pf),
              pprint=pprint)

    if tofile:
        report.extend([
            'B-Cubed Scores:\n',
            'Precision: {0:.4f}\n'.format(bp),
            'Recall:    {0:.4f}\n'.format(br),
            'F-Score:   {0:.4f}\n'.format(bf),
            '#\n',
            'Pair Scores:\n',
            'Precision: {0:.4f}\n'.format(pp),
            'Recall:    {0:.4f}\n'.format(pr),
            'F-Score:   {0:.4f}\n'.format(pf),
        ])
        with codecs.open(filename + '.diff', 'w', 'utf-8') as handle:
            handle.write(''.join(report))
        log.file_written(filename + '.diff')
    else:
        return (bp, br, bf), (pp, pr, pf)
Example #12
0
def pairs(lex, gold='cogid', test='lexstatid', modify_ref=False, pprint=True,
        _return_string=False):
    """
    Compute pair scores for the evaluation of cognate detection algorithms.

    Parameters
    ----------
    lex : :py:class:`lingpy.compare.lexstat.LexStat`
        The :py:class:`~lingpy.compare.lexstat.LexStat` class used for the
        computation. It should have two columns indicating cognate IDs.
    gold : str (default='cogid')
        The name of the column containing the gold standard cognate
        assignments.
    test : str (default='lexstatid')
        The name of the column containing the automatically implemented cognate
        assignments.
    modify_ref : function (default=False)
        Use a function to modify the reference. If your cognate identifiers
        are numerical, for example, and negative values are assigned as
        loans, but you want to suppress this behaviour, just set this
        keyword to "abs", and all cognate IDs will be converted to their
        absolute value.
    pprint : bool (default=True)
        Print out the results
    _return_string : bool (default=False)
        Currently unused; kept so that existing callers passing it keep
        working.

    Returns
    -------
    t : tuple
        A tuple consisting of the precision, the recall, and the harmonic mean
        (F-scores).

    Notes
    -----
    Pair-scores can be computed in different ways, with often different
    results. This variant follows the description by :evobib:`Bouchard-Cote2013`.

    If one of the two columns yields no pairs at all, the corresponding score
    (precision for the test column, recall for the gold column) is set to 1.0,
    in line with how :py:func:`diff` treats empty pair sets. The F-score is
    0.0 when precision and recall are both zero.

    See also
    --------
    diff
    bcubes
    """
    # if loans are treated as homologs
    evl = modify_ref if modify_ref else identity

    def get_pairs(ref):
        # every etymdict entry holds one slot per language; empty slots are 0
        for line in lex.get_etymdict(ref=ref, modify_ref=modify_ref).values():
            # NOTE(review): only the first index of each slot is used --
            # confirm this is intended when a language has several reflexes.
            members = [evl(x[0]) for x in line if x != 0]
            for a, b in combinations(members, r=2):
                yield tuple(sorted([a, b]))

    pairsG = set(get_pairs(gold))
    pairsT = set(get_pairs(test))
    shared = len(pairsG & pairsT)

    # calculate precision and recall, guarding against empty pair sets
    pp = shared / len(pairsT) if pairsT else 1.0
    pr = shared / len(pairsG) if pairsG else 1.0
    fs = 2 * (pp * pr) / (pp + pr) if pp + pr else 0.0

    # print the results if this option is chosen
    as_string(_format_results('Pairs', pp, pr, fs), pprint=pprint)

    return pp, pr, fs
Example #13
0
def partial_bcubes(wordlist, gold, test, pprint=True):
    """
    Compute B-Cubed scores for test and reference datasets for partial cognate
    detection.

    Parameters
    ----------
    wordlist : :py:class:`~lingpy.basic.wordlist.Wordlist`
        A :py:class:`~lingpy.basic.wordlist.Wordlist`, or one of its daughter
        classes (like, e.g., the :py:class:`~lingpy.compare.partial.Partial`
        class used for computation of partial cognates). It should have two
        columns indicating cognate IDs.
    gold : str
        The name of the column containing the gold standard cognate
        assignments.
    test : str
        The name of the column containing the automatically implemented cognate
        assignments.
    pprint : bool (default=True)
        Print out the results

    Returns
    -------
    t : tuple
        A tuple consisting of the precision, the recall, and the harmonic mean
        (F-scores).

    Notes
    -----
    B-Cubed scores were first described by :evobib:`Bagga1998` as part of an
    algorithm. Later on, :evobib:`Amigo2009` showed that they can also be used
    to compare cluster decisions. :evobib:`Hauer2011` was the first to apply
    B-Cubed scores to the task of automatic cognate detection.

    See also
    --------
    bcubes
    diff
    pairs
    """

    # here's the point with bcubes for fuzzy: if we compare, we need to make
    # sure we count whether one instance is identical, not whether all of them
    # are identical!

    def get_scores(one, other):
        scores = []
        for cogid, slots in wordlist.get_etymdict(ref=one).items():
            # collect, for every word in this cognate set, the position(s) at
            # which `cogid` occurs inside the word's entry in column `one`
            poss, idxs = [], []
            for slot in slots:
                if slot == 0:
                    # empty slot in the etymdict entry
                    continue
                for idx in slot:
                    new_pos = [
                        i for i, cog in enumerate(wordlist[idx, one])
                        if cog == cogid]
                    idxs += [idx] * len(new_pos)
                    poss += new_pos
            if len(idxs) > 1:
                # look up what the other column has at the same positions
                other_idxs = [
                    wordlist[idx, other][pos] for pos, idx in zip(poss, idxs)]
                for idx in other_idxs:
                    scores.append(other_idxs.count(idx) / len(idxs))
            else:
                # singletons are trivially correct
                scores.append(1.0)
        return sum(scores) / len(scores)

    bcr = get_scores(gold, test)
    bcp = get_scores(test, gold)
    bcf = 2 * ((bcp * bcr) / (bcp + bcr))

    as_string(_format_results('B-Cubed', bcp, bcr, bcf),
              pprint=pprint)
    return bcp, bcr, bcf
Example #14
0
 def test_as_string(self):
     # with pprint=False the call is expected to hand the text back
     assert util.as_string('text', pprint=False) == 'text'
Example #15
0
    def align(self, **keywords):
        """
        Align a pair of sequences or multiple sequence pairs.

        The raw result of the alignment routine is stored in
        ``self._alignments``; the token-level alignments together with their
        scores are stored in ``self.alignments``.

        Parameters
        ----------
        gop : int (default=-1)
            The gap opening penalty (GOP).
        scale : float (default=0.5)
            The gap extension penalty (GEP), calculated with help of a scaling
            factor.
        mode : {"global","local","overlap","dialign"}
            The alignment mode, see :evobib:`List2012a` for details.
        factor : float (default = 0.3)
            The factor by which matches in identical prosodic position are
            increased.
        restricted_chars : str (default="T_")
            The restricted chars that function as an indicator of syllable or
            morpheme breaks for secondary alignment, see :evobib:`List2012c`
            for details.
        distance : bool (default=False)
            If set to *True*, return the distance instead of the similarity
            score. Distance is calculated using the formula by
            :evobib:`Downey2008`.
        model : { None, ~lingpy.data.model.Model }
            Specify the sound class model that shall be used for the analysis.
            If no model is specified, the default model of :evobib:`List2012a`
            will be used.
        pprint : bool (default=False)
            If set to *True*, the alignments are printed to the screen.
        transform : (default=rcParams['align_transform'])
            Not read in this method itself; it is passed on with the other
            keywords when the model is (re)set.

        """
        setdefaults(
            keywords,
            gop=-1,
            scale=0.5,
            mode='global',
            factor=0.3,
            restricted_chars='T_',
            distance=False,
            model=rcParams['sca'],
            pprint=False,
            transform=rcParams['align_transform'])

        # (re)set the model unless one is already set and equals the request
        if not hasattr(self, 'model') or keywords['model'] != self.model:
            self._set_model(**keywords)

        # create the alignments array
        self._alignments = calign.align_pairs(
            self.classes,
            self.weights,
            self.prostrings,
            keywords['gop'],
            keywords['scale'],
            keywords['factor'],
            self.scoredict,
            keywords['mode'],
            keywords['restricted_chars'],
            distance=int(bool(keywords['distance'])))

        # convert the class-based alignments back to token alignments
        is_local = keywords['mode'] == "local"
        self.alignments = []
        for pos, (alm_a, alm_b, score) in enumerate(self._alignments):
            tokens_a = class2tokens(self.tokens[pos][0], alm_a, local=is_local)
            tokens_b = class2tokens(self.tokens[pos][1], alm_b, local=is_local)
            self.alignments.append((tokens_a, tokens_b, score))

        # print the alignments, if this is chosen
        as_string(self, pprint=keywords['pprint'])