Example #1

# stdlib and LingPy imports used by the class below
import json
from functools import partial

from lingpy import util
from lingpy.align.sca import Alignments, get_correspondences
from lingpy.compare.lexstat import LexStat
from lingpy.settings import rcParams


class Workflow(object):
    """
    Class provides access to generic workflows.

    Parameters
    ----------
    infile : str
        A TSV file providing the input data for the given workflow.
    """
    def __init__(self, infile):

        # we don't do anything specific here, we just assign the input file as
        # an attribute of the Workflow class
        self.infile = infile

    def cognate_detection(self, **keywords):
        """
        Method runs a cognate detection analysis.
        """
        kw = dict(
            align_method='progressive',
            align_mode=rcParams['align_mode'],
            align_modes=rcParams['align_modes'],
            cluster_method=rcParams['lexstat_cluster_method'],
            cognate_method='sca',
            cognate_mode='overlap',
            defaults=False,
            factor=rcParams['align_factor'],
            gap_weight=rcParams['gap_weight'],
            gop=rcParams['align_gop'],
            iteration=False,
            lexstat_modes=rcParams['lexstat_modes'],
            limit=rcParams['lexstat_limit'],
            merge_vowels=rcParams['merge_vowels'],
            model=rcParams['sca'],
            export="html",
            preprocessing=False,
            preprocessing_method=rcParams['lexstat_preprocessing_method'],
            preprocessing_threshold=rcParams[
                'lexstat_preprocessing_threshold'],
            rands=rcParams['lexstat_rands'],
            ratio=rcParams['lexstat_ratio'],
            ref="customid",
            restricted_chars=rcParams['restricted_chars'],
            restriction='',
            runs=rcParams['lexstat_runs'],
            scale=rcParams['align_scale'],
            scoring_method=rcParams['lexstat_scoring_method'],
            swap_check=False,
            threshold=rcParams['lexstat_threshold'],
            tree_calc=rcParams['align_tree_calc'],
            vscale=rcParams['lexstat_vscale'],
            outfile=False,
            sonar=True,
        )

        # merge user-supplied keywords into the defaults; with defaults=True,
        # just report the effective settings without running the analysis
        kw.update(keywords)
        if kw['defaults']:
            return kw

        # carry out lexstat cluster analysis
        self.lex = LexStat(self.infile, **kw)

        # fall back to a default output filename if none was given
        kw['outfile'] = kw['outfile'] or self.lex.filename + '_lingpy'

        # check for traditional lexstat analysis
        if kw['cognate_method'] == 'lexstat':
            self.lex.get_scorer(method=kw['scoring_method'],
                                modes=kw['lexstat_modes'],
                                **kw)

        self.lex.cluster(method=kw['cognate_method'],
                         mode=kw['cognate_mode'],
                         **kw)

        # align the data
        self.alms = Alignments(self.lex, **kw)
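        # choose the scorer for the alignments: the correspondence-based
        # scorer computed by get_scorer for 'lexstat', otherwise the basic
        # sound-class scorer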
        kw['scoredict'] = (self.lex.cscorer
                           if kw['cognate_method'] == 'lexstat'
                           else self.lex.bscorer)

        self.alms.align(method=kw['align_method'],
                        mode=kw['align_mode'],
                        modes=kw['align_modes'],
                        **kw)

        if 'tsv' in kw['export']:
            self.alms.output('tsv',
                             filename=kw['outfile'],
                             ignore=['scorer', 'json', 'taxa', 'msa'],
                             **kw)
        if 'html' in kw['export']:
            corrs, occs = get_correspondences(self.alms, kw['ref'])

            # serialize the wordlist
            wl = {}
            for concept in self.alms.concepts:
                entries = self.alms.get_list(concept=concept, flat=True)
                cogids = [self.alms[idx, kw['ref']] for idx in entries]
                words = [self.alms[idx, 'ipa'] for idx in entries]
                alms = [self.alms[idx, 'alignment'] for idx in entries]
                langs = [self.alms[idx, 'doculect'] for idx in entries]

                checkalm = lambda x: x if isinstance(x, str) else ' '.join(x)

                wl[concept] = [list(k) for k in sorted(
                    zip(
                        langs,
                        [str(x) for x in entries],
                        words,
                        [str(x) for x in cogids],
                        [checkalm(x) for x in alms],
                    ),
                    key=lambda x: int(x[3]))]

            # make a simple numeric gloss id for internal use; this only
            # depends on the full concept list, so it belongs outside the
            # per-concept loop above
            gloss2id = list(zip(
                self.alms.concepts,
                [str(x) for x in range(1, len(self.alms.concepts) + 1)]))
            id2gloss = dict([[b, a] for a, b in gloss2id])
            gloss2id = dict(gloss2id)

            # serialize the results as JavaScript variable assignments for
            # the jcov HTML template
            txt = ''
            txt += 'CORRS = ' + json.dumps(corrs) + ';\n'
            txt += 'LANGS = ' + json.dumps(self.alms.taxa) + ';\n'
            txt += 'OCCS = ' + json.dumps(occs) + ';\n'
            txt += 'WLS = ' + json.dumps(wl) + ';\n'
            txt += 'GlossId = ' + json.dumps(gloss2id) + ';\n'
            txt += 'IdGloss = ' + json.dumps(id2gloss) + ';\n'
            txt += 'FILE = "' + kw['outfile'] + '.tsv";\n'

            tpath = partial(util.data_path, 'templates')

            tname = 'jcov.{0}.html'.format(
                'remote' if 'remote' in kw['export'] else 'direct')
            content = util.read_text_file(tpath(tname))

            util.write_text_file(
                kw['outfile'] + '.html',
                content.format(
                    CORRS=txt,
                    JCOV=util.read_text_file(tpath('jcov.js')),
                    STYLE=util.read_text_file(tpath('jcov.css')),
                    VENDOR=util.read_text_file(tpath('jcov.vendor.js')),
                    DIGHL=util.read_text_file(tpath('jcov.dighl.js'))))
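
A minimal usage sketch for the class above, assuming a LingPy-formatted
wordlist in a hypothetical file `wordlist.tsv`:

wf = Workflow('wordlist.tsv')

# inspect the effective default settings without running anything
defaults = wf.cognate_detection(defaults=True)

# run an SCA-based analysis and export both the TSV and the HTML report
# (the export check is a substring test, so 'tsv html' triggers both)
wf.cognate_detection(cognate_method='sca', export='tsv html', outfile='demo')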
Example #3
    else:
        wordlist = args.wordlist
    try:
        wordlist.add_component("CognateTable")
    except ValueError:
        if not args.overwrite:
            print("DataSet already has a CognateTable. "
                  "To drop existing cognate data, use `--overwrite`.")
            sys.exit(2)
    lpwl = to_lingpy(wordlist)

    # Hand the wordlist over to LingPy; check=False skips LexStat's
    # token-consistency check, and segments="tokens" names the column
    # holding the segmented forms
    lexstat = LexStat(lpwl, check=False, segments="tokens")
    if args.bad_tokens_log:
        json.dump(find_bad_tokens(lexstat), args.bad_tokens_log)

    # Prepare the analysis: the permutation-based scorer is only needed for
    # methods other than plain 'sca'
    if args.method != 'sca':
        lexstat.get_scorer(preprocessing=False, runs=10000, ratio=(2, 1),
                           vscale=1.0)
    lexstat.cluster(method=args.method, cluster_method=args.cluster_method,
                    ref="cogid", threshold=args.threshold)
    # Align the clustered cognate sets; use a fresh name instead of
    # rebinding `lexstat` to an Alignments object
    alignments = Alignments(lexstat, segments="tokens")
    alignments.align(model="sca")
    alignments.output("tsv", filename="with_lexstat_and_alignment")

    # Write the inferred cognate judgements to the new CognateTable
    cognate_table = wordlist["CognateTable"]
    cognate_table.write(cognatetable_from_lingpy(alignments))
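
For orientation, the `args` namespace used in this fragment could come from
an argument parser along these lines. This is a hedged sketch: the option
names are inferred from the attributes the fragment reads, while the types,
defaults, and help texts are assumptions, not the script's actual ones.

import argparse

# Hypothetical parser reconstructed from the attributes used above
parser = argparse.ArgumentParser(
    description="Add LingPy cognate codes and alignments to a wordlist.")
parser.add_argument("--wordlist",
                    help="input wordlist (the elided if-branch above "
                         "presumably supplies a default)")
parser.add_argument("--overwrite", action="store_true",
                    help="replace an existing CognateTable instead of aborting")
parser.add_argument("--method", default="sca",
                    help="cognate detection method, e.g. 'sca' or 'lexstat'")
parser.add_argument("--cluster-method", default="upgma",
                    help="cluster algorithm passed to LexStat.cluster")
parser.add_argument("--threshold", type=float, default=0.55,
                    help="similarity threshold for clustering")
parser.add_argument("--bad-tokens-log", type=argparse.FileType("w"),
                    help="optional file for a JSON report of bad tokens")
args = parser.parse_args()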