def iter_alignments(dataset, cognate_sets, column='Segments', method='library'):
    """
    Compute automatic alignments and add them to the cognate sets in place.
    """
    if not isinstance(dataset, lingpy.basic.parser.QLCParser):
        wordlist = _cldf2wordlist(dataset)
        cognates = {r['Form_ID']: r for r in cognate_sets}
        wordlist.add_entries(
            'cogid',
            'lid',
            lambda x: cognates[x]['Cognateset_ID'] if x in cognates else 0)
        alm = lingpy.Alignments(
            wordlist,
            ref='cogid',
            row='parameter_id',
            col='language_id',
            transcription='form',
            segments=column.lower())
        alm.align(method=method)
        for k in alm:
            if alm[k, 'lid'] in cognates:
                cognate = cognates[alm[k, 'lid']]
                cognate['Alignment'] = alm[k, 'alignment']
                cognate['Alignment_Method'] = method
    else:
        alm = lingpy.Alignments(dataset, ref='cogid')
        alm.align(method=method)
        for cognate in cognate_sets:
            idx = cognate['ID'] or cognate['Form_ID']
            cognate['Alignment'] = alm[int(idx), 'alignment']
            cognate['Alignment_Method'] = 'SCA-' + method
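# A minimal usage sketch for the in-place variant above; not part of the
# original code. It assumes a plain lingpy wordlist file with 'cogid' and
# 'tokens' columns ('wordlist.tsv' is a hypothetical path) and cognate-set
# dicts shaped the way iter_alignments expects.
import lingpy

wl = lingpy.Wordlist('wordlist.tsv')
cognate_sets = [{'ID': k, 'Form_ID': k} for k in wl]
iter_alignments(wl, cognate_sets, method='library')
for cog in cognate_sets[:3]:
    print(cog['ID'], cog['Alignment'], cog['Alignment_Method'])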
def test_partial_alignments_with_lexstat():
    lex = lp.LexStat(
        test_data('test-partial-alignments.tsv'), segments='tokens')
    alms = lp.Alignments(
        test_data('test-partial-alignments.tsv'),
        fuzzy=True,
        ref='cogids',
        sonar=True,
        segments='tokens')
    alms.align(scorer=lex.bscorer)
    assert '-' in alms.msa['cogids'][12]['alignment'][-1]
def iter_alignments(dataset, cognate_sets, column='Segments', method='library',
                    prefix=''):
    """
    Compute automatic alignments and yield the enriched cognate rows.
    """
    if not isinstance(dataset, lp.basic.parser.QLCParser):
        wordlist = _cldf2wordlist(dataset)
        cognates = {r[0]: r for r in cognate_sets}
        wordlist.add_entries(
            'cogid', 'lid', lambda x: cognates[x][3] if x in cognates else '')
        for i, k in enumerate(wordlist):
            if not wordlist[k, 'cogid']:
                wordlist[k][wordlist.header['cogid']] = 'empty-%s' % i
        alm = lp.Alignments(
            wordlist,
            ref='cogid',
            row='parameter_name',
            col='language_name',
            segments=column.lower())
        alm.align(method=method)
        for k in alm:
            if alm[k, 'lid'] in cognates:
                row = list(cognates[alm[k, 'lid']])
                # positional cognate row: index 7 holds the alignment,
                # index 8 the alignment method
                row[7] = alm[k, 'alignment']
                row[8] = method
                yield row
    else:
        alm = lp.Alignments(dataset, ref='cogid')
        alm.align(method=method)
        for row in cognate_sets:
            try:
                idx = int(row[0].split('-')[1])
            except (IndexError, ValueError):
                print(row)
                raise
            row[7] = alm[idx, 'alignment']
            row[8] = 'SCA-' + method
            yield row
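# The positional cognate rows consumed by the generator above are created by
# the cldf() converters later in this section. Reading those call sites, the
# layout appears to be (an inference from this code, not a documented schema):
#   0: form/word ID      5: cognate detection method (e.g. 'expert')
#   1: dataset name      6: cognacy source key
#   2: form (IPA)        7: alignment (filled in by iter_alignments)
#   3: cognate-set ID    8: alignment method (filled in by iter_alignments)
#   4: doubt/loan note   9: alignment source
example_row = ['abc-1', 'mydataset', 'haus', 'house-1', '', 'expert',
               'Source2016', '', '', '']  # illustrative values only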
def cldf(dataset, concepticon, **kw):
    gloss2con = {x['GLOSS']: x['CONCEPTICON_ID'] for x in dataset.concepts}
    lang2glot = {x['NAME']: x['GLOTTOCODE'] for x in dataset.languages}

    for dset, srckey in zip(DSETS, sources):
        wl = lp.Wordlist(dataset.raw.joinpath(dset).as_posix())
        if 'tokens' not in wl.header:
            wl.add_entries(
                'tokens', 'ipa', lp.ipa2tokens,
                merge_vowels=False, expand_nasals=True)
        src = getEvoBibAsSource(srckey)

        with CldfDataset((
                'ID', 'Language_ID', 'Language_name', 'Language_iso',
                'Parameter_ID', 'Parameter_name', 'Value', 'Source',
                'Segments', 'Cognacy', 'Loan'),
                dataset, subset=dset.split('.')[0]) as ds:
            ds.sources.add(src)
            errors = []
            cognates = []
            for k in wl:
                concept = wl[k, 'concept']
                if '(V)' in concept:
                    concept = concept[:-4]
                concept = correct_concepts.get(concept, concept)
                if concept not in gloss2con:
                    errors += [concept]
                doculect = correct_languages.get(
                    wl[k, 'doculect'], wl[k, 'doculect'])
                loan = wl[k, 'cogid'] < 0
                cogid = abs(wl[k, 'cogid'])
                wid = '{0}-{1}'.format(dset.split('.')[0], k)
                ds.add_row([
                    wid,
                    lang2glot[doculect],
                    wl[k, 'doculect'],
                    '',
                    gloss2con.get(concept, ''),
                    wl[k, 'concept'],
                    wl[k, 'ipa'],
                    srckey,
                    ' '.join(wl[k, 'tokens'] or ['']),
                    cogid,
                    wl[k, 'loan']
                ])
                cognates.append([
                    wid, ds.name, wl[k, 'ipa'], cogid,
                    'borrowed' if loan else '', 'expert', srckey,
                    '', '', ''
                ])
            dataset.cognates.extend(
                iter_alignments(lp.Alignments(wl), cognates, method='library'))
        for er in sorted(set(errors)):
            print(er, dset)
def cldf(dataset, concepticon, **kw):
    concepts = {x['GLOSS']: x['CONCEPTICON_ID'] for x in dataset.concepts}

    # dictionary to be passed to lingpy
    D = {}
    D[0] = [
        'doculect', 'glottolog', 'concept', 'concepticon', 'ipa', 'segments',
        'cogid', 'alignment'
    ]
    idx = 1
    for f in FILES:
        msa = lp.MSA(
            dataset.raw.joinpath('phonalign_{0}.msa'.format(f)).as_posix())
        concept = msa.seq_id[1:-1]  # strip quotation marks from the concept
        cid = concepts.get(concept, '')
        for i, taxon in enumerate(msa.taxa):
            if taxon in languages:
                tid = languages[taxon]
                alignment = ' '.join(msa.alignment[i])
                tokens = ' '.join([x for x in msa.alignment[i] if x != '-'])
                ipa = tokens.replace(' ', '')
                cogid = '{0}-{1}'.format(concept, f)
                D[idx] = [
                    taxon, tid, concept, cid, ipa, tokens, cogid, alignment
                ]
                idx += 1

    with CldfDataset(
            ('ID', 'Language_ID', 'Language_name', 'Language_iso',
             'Parameter_ID', 'Parameter_name', 'Value', 'Segments',
             'Cognacy', 'Source'),
            dataset) as ds:
        src = getEvoBibAsSource('Heggarty2007')
        ds.sources.add(src)
        src = getEvoBibAsSource('List2014e')
        ds.sources.add(src)
        alm = lp.Alignments(D)
        for k in alm:
            ds.add_row(
                ['Heggarty2007-{0}'.format(k)]
                + [alm[k, x] or '' for x in [
                    'glottolog', 'taxon', 'iso', 'concepticon', 'concept',
                    'ipa']]
                + [' '.join(alm[k, 'tokens']), alm[k, 'cogid'],
                   'Heggarty2007'])
            dataset.cognates += [[
                'Heggarty2007-{0}'.format(k),
                ds.name,
                alm[k, 'ipa'],
                alm[k, 'cogid'],
                '',
                'expert',
                'Heggarty2007',
                alm[k, 'alignment'],
                'expert',
                'List2014e'
            ]]
    dataset.write_cognates()
def cldf(dataset, concepticon, **kw):
    orig_ds = Dataset.from_name('baidial')
    orig_ds.commands.cldf(dataset, concepticon, **kw)
    for cldfds in dataset.iter_cldf_datasets():
        for attr in ['dc:isVersionOf', 'dc:provenance']:
            cldfds.table[attr] = dataset.md[attr]
        cldfds.write(outdir=dataset.cldf_dir)
    # We only load the wordlist, align it in lingpy, and create cognates and
    # alignments. There is currently no real source, so we use the placeholder
    # source "List2016i"; ideally the dataset should be published via Zenodo.
    alm = lp.Alignments(dataset.raw.joinpath('BDS-cognates.tsv').as_posix())
    cognates = wordlist2cognates(alm, cldfds, 'List2016i')
    dataset.cognates.extend(iter_alignments(alm, cognates))
def cldf(dataset, concepticon, **kw):
    wl = lp.Alignments(dataset.raw.joinpath('tukano.tsv').as_posix())
    src1 = getEvoBibAsSource('Chacon2014')
    src2 = getEvoBibAsSource('Chacon2015')
    gloss2conc = {r['GLOSS']: r['CONCEPTICON_ID'] for r in dataset.concepts}

    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Language_iso',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Source',
            'Segments',
            'Cognacy',
    ), dataset) as ds:
        ds.sources.add(src1)
        ds.sources.add(src2)
        for k in wl:
            lid = wl[k, 'language']
            cogid = wl[k, 'cogid']
            concept = wl[k, 'concept']
            segments = wl[k, 'tokens']
            value = wl[k, 'ipa']
            alignment = wl[k, 'alignment']
            name, iso = abbr2lang[lid]
            cid = gloss2conc.get(concept)
            ds.add_row((
                'Chacon2014-' + str(k),
                dataset.glottocode_by_iso.get(iso, ''),
                name,
                iso,
                cid,
                concept,
                value,
                'Chacon2014',
                ' '.join(segments),
                str(cogid)))
            cogid = '-'.join([slug(concept), '%s' % cogid])
            dataset.cognates.append([
                'Chacon2014-' + str(k),
                ds.name,
                value,
                cogid,
                '',
                'expert',
                'Chacon2014',
                alignment,
                'expert',
                'Chacon2015'
            ])
def cognate_code_to_file(
    metadata: Path,
    ratio: float,
    soundclass: str,
    cluster_method: str,
    threshold: float,
    initial_threshold: float,
    gop: float,
    mode: str,
    output_file: Path,
) -> None:
    dataset = pycldf.Wordlist.from_metadata(metadata)
    assert (
        dataset.column_names.forms.segments is not None
    ), "Dataset must have a CLDF #segments column."

    def filter(row: t.Dict[str, t.Any]) -> bool:
        row["tokens"] = [
            str(x)
            for x in clean_segments(
                row[dataset.column_names.forms.segments.lower()])
        ]
        row["tokens"] = ["+" if x == "_" else x for x in row["tokens"]]
        # TODO: Find the official LingPy way to consider word boundaries to
        # also be morpheme boundaries – just adding them in
        # `partial_cluster(sep=...+'_')` did not work, and why isn't it the
        # default anyway?
        row["doculect"] = row[
            dataset.column_names.forms.languageReference.lower()]
        row["concept"] = row[
            dataset.column_names.forms.parameterReference.lower()]
        return row["segments"] and row["concept"]

    lex = lingpy.compare.partial.Partial.from_cldf(
        metadata,
        filter=filter,
        columns=["doculect", "concept", "tokens"],
        model=lingpy.data.model.Model(soundclass),
        check=True,
    )

    if ratio != 1.5:
        if ratio == float("inf"):
            ratio_pair = (1, 0)
            ratio_str = "-inf"
        elif ratio == int(ratio) >= 0:
            r = int(ratio)
            ratio_pair = (r, 1)
            ratio_str = "-{:d}".format(r)
        elif ratio > 0:
            ratio_pair = (ratio, 1)
            ratio_str = "-" + str(ratio)
        else:
            raise ValueError("LexStat ratio must be in [0, ∞]")
    else:
        ratio_pair = (3, 2)
        ratio_str = ""
    if initial_threshold != 0.7:
        ratio_str += "-t{:02d}".format(int(initial_threshold * 100))

    try:
        scorers_etc = lingpy.compare.lexstat.LexStat(
            filename="lexstats-{:}-{:s}{:s}.tsv".format(
                sha1(metadata), soundclass, ratio_str
            )
        )
        lex.scorer = scorers_etc.scorer
        lex.cscorer = scorers_etc.cscorer
        lex.bscorer = scorers_etc.bscorer
    except (OSError, ValueError):
        lex.get_scorer(
            runs=10000, ratio=ratio_pair, threshold=initial_threshold)
        lex.output(
            "tsv",
            filename="lexstats-{:}-{:s}{:s}".format(
                sha1(metadata), soundclass, ratio_str
            ),
            ignore=[],
        )

    # For some purposes it is useful to have monolithic cognate classes.
    lex.cluster(
        method="lexstat",
        threshold=threshold,
        ref="cogid",
        cluster_method=cluster_method,
        verbose=True,
        override=True,
        gop=gop,
        mode=mode,
    )
    # But actually, in most cases partial cognates are much more useful.
    lex.partial_cluster(
        method="lexstat",
        threshold=threshold,
        cluster_method=cluster_method,
        ref="partialcognateids",
        override=True,
        verbose=True,
        gop=gop,
        mode=mode,
    )
    lex.output("tsv", filename="auto-clusters")

    alm = lingpy.Alignments(lex, ref="partialcognateids", fuzzy=True)
    alm.align(method="progressive")
    alm.output("tsv", filename=str(output_file), ignore="all", prettify=False)

    try:
        dataset.add_component("CognateTable")
    except ValueError:
        ...
    try:
        dataset.add_component("CognatesetTable")
    except ValueError:
        ...

    read_back = csv.DictReader(
        open(str(output_file) + ".tsv", encoding="utf-8"), delimiter="\t")
    cognatesets = {}
    judgements = []
    i = 1
    for line in read_back:
        partial = line["PARTIALCOGNATEIDS"].split()
        alignment = line["ALIGNMENT"].split(" + ")
        slice_start = 0
        for cs, alm in zip(partial, alignment):
            cognatesets.setdefault(cs, {"ID": cs})
            length = len(alm.split())
            judgements.append(
                {
                    "ID": i,
                    "Form_ID": line["ID"],
                    "Cognateset_ID": cs,
                    "Segment_Slice": [
                        "{:d}:{:d}".format(slice_start, slice_start + length)
                    ],
                    "Alignment": alm.split(),
                    "Source": ["LexStat"],
                }
            )
            i += 1
            slice_start += length
    dataset.write(CognatesetTable=cognatesets.values())
    dataset.write(CognateTable=judgements)
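# A minimal invocation sketch for cognate_code_to_file; the metadata path and
# parameter values below are hypothetical (ratio=1.5 and initial_threshold=0.7
# are the defaults implied by the cache-name checks above; threshold and
# cluster_method mirror the standalone script at the end of this section).
from pathlib import Path

cognate_code_to_file(
    metadata=Path('cldf/Wordlist-metadata.json'),
    ratio=1.5,
    soundclass='sca',
    cluster_method='infomap',
    threshold=0.55,
    initial_threshold=0.7,
    gop=-2.0,
    mode='overlap',
    output_file=Path('aligned'),
)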
def cldf(dataset, concepticon, **kw):
    concepticon = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    concepticon['you (sing.)'] = concepticon['you (sing.) (thou)']
    concepticon['you (pl.)'] = concepticon['you (pl.) (ye)']
    concepticon['to itch/itchy'] = concepticon['to itch/to be itchy']
    concepticon['medicine'] = concepticon['medicine/juice']
    concepticon['excrement/shit'] = concepticon['feces/excrement/shit']

    language_map = {
        'Tampuon': 'Tampuan',
        'Palaung-Namhsan-Taunggyi': 'Palaung-Namhsan',
        'Jru-Laven\u02d0': 'Jru-Laven',
        'Pnar-Jaintia': 'Pnar',
        'K-Surin': 'Khmer-Surin',
    }

    languages = {}
    words = []
    with UnicodeReader(dataset.raw.joinpath('ds.Sheet1.csv')) as reader:
        for i, row in enumerate(reader):
            if 3 <= i < 125:
                languages[row[1]] = row
            elif i > 334:
                words.append(row)

    lids = [int(float(r[0])) for r in languages.values()]
    assert min(lids) == 1 and max(lids) == 122

    glottolog = dataset.glottocode_by_iso
    glottolog.update(
        {l['NAME']: l['GLOTTOCODE'] or None for l in dataset.languages})

    sources = {}
    for src, langs in groupby(
            sorted(languages.values(), key=lambda r: r[6]), lambda r: r[6]):
        langs = [l[1] for l in langs]
        src = Source('misc', '_'.join(map(slug, langs)), title=src)
        for lang in langs:
            sources[lang] = src
    sources['cognates'] = getEvoBibAsSource(SOURCE)

    unmapped = Unmapped()
    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Language_iso',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Segments',
            'Source',
            'Comment',
    ), dataset) as ds:
        ds.sources.add(*sources.values())
        D = {0: ['lid', 'doculect', 'concept', 'ipa', 'tokens', 'cog']}
        for i, row in enumerate(words):
            form = row[4]
            if not form or form in '*-':
                continue
            assert row[1] in concepticon
            lang = language_map.get(row[3], row[3].strip())
            assert lang in languages
            gc = glottolog.get(lang, glottolog.get(languages[lang][7]))
            if not gc:
                unmapped.languages.add(('', lang, languages[lang][7]))
            # get segments
            segments = clean_string(form)[0]
            # get cognate identifier
            cogid = row[5] if row[5].strip() and row[5].strip() != '*' else (
                'e%s' % i)
            cogid = row[1] + '-' + cogid
            lid = '{0}-{1}'.format(ds.name, i + 1)
            ds.add_row([
                lid,
                glottolog.get(lang, glottolog.get(languages[lang][7])),
                lang,
                languages[lang][7],
                concepticon[row[1]],
                row[1],
                form,
                segments,
                sources[lang].id,
                None
            ])
            D[i + 1] = [lid, lang, row[1], form, segments, cogid]

        wl = lp.Wordlist(D)
        wl.renumber('cog')
        alm = lp.Alignments(wl)
        dataset.cognates.extend(
            iter_alignments(alm, wordlist2cognates(wl, ds, SOURCE)))
    unmapped.pprint()
lex.output(
    'tsv',
    filename='lexstats-{:}-{:s}{:s}'.format(
        sha1(args.wordlist), args.soundclass, ratio_str),
    ignore=[])

# For some purposes it is useful to have monolithic cognate classes.
lex.cluster(
    method='lexstat',
    threshold=args.threshold,
    ref='cogid',
    cluster_method=args.cluster_method,
    verbose=True,
    override=True,
    gop=args.gop,
    mode=args.mode)

# But actually, in most cases partial cognates are much more useful.
lex.partial_cluster(
    method='lexstat',
    threshold=args.threshold,
    cluster_method=args.cluster_method,
    ref='partialcognateids',
    override=True,
    verbose=True,
    gop=args.gop,
    mode=args.mode)

lex.output("tsv", filename="auto-clusters")
alm = lingpy.Alignments(lex, ref="partialcognateids", fuzzy=True)
alm.align(method='progressive')
alm.output('tsv', filename=args.output, ignore='all', prettify=False)

try:
    dataset.add_component("CognateTable")
except ValueError:
    ...
try:
    dataset.add_component("CognatesetTable")
except ValueError:
    ...

read_back = csv.DictReader(open(args.output + ".tsv"), delimiter="\t")
cognatesets = {}
judgements = []
def cldf(dataset, concepticon, **kw):
    concepticon = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    language_map = {
        l['NAME']: l['GLOTTOCODE'] or None for l in dataset.languages
    }

    header, rows = None, []
    with UnicodeReader(
            dataset.raw.joinpath(
                'Semitic.Wordlists.ActualWordlists.csv')) as reader:
        for i, row in enumerate(reader):
            row = [c.strip() for c in row]
            if i == 0:
                header = row
            if i > 0:
                rows.append(row)

    cheader, crows = None, []
    with UnicodeReader(
            dataset.raw.joinpath(
                'Semitic.Codings.Multistate.Sheet1.csv')) as reader:
        for i, row in enumerate(reader):
            row = [c.strip() for c in row]
            if i == 0:
                cheader = row
            if i > 0:
                crows.append(row)

    langs = header[1:]
    clean_langs = {
        """Gɛ'ɛz""": "Ge'ez",
        "Tigrɛ": "Tigre",
        'ʷalani': "Walani",
        "Ogadɛn Arabic": "Ogaden Arabic",
        "Mɛhri": "Mehri",
        "Gibbali": "Jibbali",
    }
    correct_concepts = {
        'Cold (air)': 'Cold (of air)',
    }
    src = getEvoBibAsSource('Kitchen2012')

    with CldfDataset(
            ('ID', 'Language_ID', 'Language_name', 'Parameter_ID',
             'Parameter_name', 'Value', 'Segments'),
            dataset) as ds:
        D = {0: ['doculect', 'concept', 'ipa', 'tokens']}
        idx = 1
        ds.sources.add(src)
        for row in rows:
            concept = row[0]
            for i, col in enumerate(row[1:]):
                lang = langs[i]
                if col != '---':
                    cleaned_string = clean_string(
                        col,
                        merge_vowels=False,
                        preparse=PREPARSE,
                        rules=CONVERSION,
                        semi_diacritics='')[0]
                    ds.add_row([
                        'Kitchen2012-' + str(idx),
                        language_map[lang],
                        clean_langs.get(lang, lang),
                        concepticon[concept],
                        concept,
                        col,
                        cleaned_string
                    ])
                    D[idx] = [
                        clean_langs.get(lang, lang), concept, col,
                        cleaned_string
                    ]
                    idx += 1

        wl = lp.Wordlist(D)
        id2cog = {}
        errors = []
        for row in crows:
            taxon = row[0]
            for i, (concept, cog) in enumerate(zip(cheader[1:], row[1:])):
                nconcept = rows[i][0]
                if cog != '-':
                    idxs = wl.get_dict(taxon=taxon)
                    if idxs.get(nconcept, ''):
                        id2cog[idxs[nconcept][0]] = concept + '-' + cog
                    else:
                        errors += [(concept, nconcept, taxon)]

        # assign fallback cognate identifiers to entries without a coding
        bad_cogs = 1
        for k in wl:
            if k in id2cog:
                cogid = id2cog[k]
            else:
                cogid = str(bad_cogs)
                bad_cogs += 1
                id2cog[k] = cogid

        wl.add_entries('cog', id2cog, lambda x: x)
        wl.renumber('cog')
        cognates = []
        for k in wl:
            cognates += [[
                'Kitchen2012-' + str(k),
                ds.name,
                wl[k, 'ipa'],
                wl[k, 'concept'] + '-' + str(wl[k, 'cogid']),
                '',
                'expert',
                'Kitchen2012',
                '',
                '',
                ''
            ]]
        dataset.cognates.extend(iter_alignments(lp.Alignments(wl), cognates))
def setUp(self):
    self.alm = lp.Alignments(
        os.path.join('data', 'kessler.qlc'), loans=False)
def cldf(dataset, concepticon, **kw):
    """
    Implements the conversion of the raw data to CLDF dataset(s).

    :param dataset: provides access to the information in supplementary \
            files as follows:
        - the JSON object from `metadata.json` is available as `dataset.md`
        - items from languages.csv are available as `dataset.languages`
        - items from concepts.csv are available as `dataset.concepts`
        - if a Concepticon conceptlist was specified in metadata.json, \
            its ID is available as `dataset.conceptlist`
    :param concepticon: a `pyconcepticon.api.Concepticon` instance.
    :param kw: All arguments passed on the command line.
    """
    wl = lp.Wordlist(dataset.raw.joinpath(DSET).as_posix())

    # get language identifiers
    lids, cids, coords = {}, {}, {}
    for row in dataset.languages:
        language = row['NAME']
        lids[language] = row['GLOTTOCODE']
    coords = dict([wl.coords[taxon] for taxon in lids])

    modify = {
        'thunder (verb)': 'thunder',
        'flash (verb)': 'lightning',
        'room': 'flat',
        'have diarrea': 'have diarrhoea',
        'watery': 'light'
    }
    for row in dataset.concepts:
        concept = modify[row['CONCEPT']] if row['CONCEPT'] in modify else \
            row['CONCEPT']
        cids[concept] = row['CONCEPT_SET']

    # sources
    src = getEvoBibAsSource(SOURCE)
    src2 = getEvoBibAsSource('List2014b')

    # get partial identifiers
    partial_ids = defaultdict(list)
    partial_converter = {}
    idx = 1
    for k in wl:
        for char in wl[k, 'counterpart']:
            if char in partial_converter:
                pidx = partial_converter[char]
            else:
                pidx = idx
                partial_converter[char] = idx
                idx += 1
            partial_ids[k] += [pidx]

    # track whether the Middle Chinese (proto-language) row for a cognate
    # set was already added
    visited = []
    idx = max([k for k in wl]) + 1
    with CldfDataset(
            ('ID', 'Language_ID', 'Language_name', 'Language_iso',
             'Parameter_ID', 'Parameter_name', 'Parameter_Chinese_name',
             'Value', 'Value_Chinese_characters', 'Source', 'Segments',
             'Cognacy', 'Rank', 'Comment'),
            dataset) as ds:
        ds.sources.add(src)
        ds.sources.add(src2)
        D = {0: ['doculect', 'concept', 'ipa', 'tokens', 'cogid']}
        for k in wl:
            tokens = lp.ipa2tokens(
                wl[k, 'ipa'], merge_vowels=False, expand_nasals=True)
            # remove sandhi annotation in tokens, as it is confusing clpa
            for i, t in enumerate(tokens):
                if '⁻' in t:
                    tokens[i] = t[:t.index('⁻')]
            ds.add_row([
                '{0}-{1}'.format(SOURCE, k),
                lids[wl[k, 'doculect']],
                wl[k, 'doculect'],
                '',
                cids[wl[k, 'concept']],
                wl[k, 'concept'],
                wl[k, 'mandarin'],
                wl[k, 'ipa'],
                wl[k, 'counterpart'],
                SOURCE,
                ' '.join(tokens),
                wl[k, 'cogid'],
                wl[k, 'order'],
                wl[k, 'note'] if wl[k, 'note'] != '-' else '',
            ])
            D[k] = [
                wl[k, 'doculect'], wl[k, 'concept'], wl[k, 'ipa'], tokens,
                wl[k, 'cogid']
            ]
            if wl[k, 'cogid'] not in visited:
                # we need to add missing tones, otherwise tokenization won't
                # work: split into syllables first, then add '¹' (or '⁴'
                # after final p/t/k) to any syllable not yet ending in a tone
                syllables = wl[k, 'mch'].split('.')
                for i, s in enumerate(syllables):
                    if s[-1] not in '²³':
                        if s[-1] not in 'ptk':
                            syllables[i] += '¹'
                        else:
                            syllables[i] += '⁴'
                tokens = lp.ipa2tokens(''.join(syllables))
                ds.add_row([
                    '{0}-{1}'.format(wl[k, 'concept'], idx),
                    'sini1245',
                    'Middle Chinese',
                    '',
                    cids[wl[k, 'concept']],
                    wl[k, 'concept'],
                    '',
                    wl[k, 'proto'],
                    wl[k, 'counterpart'],
                    SOURCE,
                    ' '.join(tokens),
                    wl[k, 'cogid'],
                    '',
                    ''
                ])
                D[idx] = [
                    'Middle Chinese', wl[k, 'concept'], wl[k, 'mch'],
                    tokens, wl[k, 'cogid']
                ]
                idx += 1
                visited += [wl[k, 'cogid']]

        alms = lp.Alignments(D)
        cognates = [[
            '{0}-{1}'.format(SOURCE, k),
            ds.name,
            alms[k, 'ipa'],
            '-'.join([slug(alms[k, 'concept']), str(alms[k, 'cogid'])]),
            '',
            'expert',
            SOURCE,
            '',
            '',
            ''
        ] for k in alms]
        dataset.cognates.extend(
            iter_alignments(alms, cognates, method='library'))
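# A standalone sketch of the Middle Chinese tone-repair step above: every
# syllable lacking a final tone mark gets '¹', or '⁴' after entering-tone
# finals p/t/k, so each syllable carries exactly one tone before
# tokenization. The sample transcription is made up for illustration.
def repair_tones(mch):
    syllables = mch.split('.')
    for i, s in enumerate(syllables):
        if s[-1] not in '²³':
            syllables[i] += '⁴' if s[-1] in 'ptk' else '¹'
    return ''.join(syllables)

assert repair_tones('kap.ma²') == 'kap⁴ma²'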
def cognate_code_to_file(
    metadata: Path,
    ratio: float,
    soundclass: str,
    cluster_method: str,
    threshold: float,
    initial_threshold: float,
    gop: float,
    mode: str,
    output_file: Path,
) -> None:
    dataset = pycldf.Wordlist.from_metadata(metadata)
    assert (
        dataset.column_names.forms.segments is not None
    ), "Dataset must have a CLDF #segments column."
    lex = lingpy.compare.partial.Partial.from_cldf(
        metadata,
        filter=filter_function_factory(dataset),
        columns=["doculect", "concept", "tokens"],
        model=lingpy.data.model.Model(soundclass),
        check=True,
    )

    if ratio != 1.5:
        if ratio == float("inf"):
            ratio_pair = (1, 0)
            ratio_str = "-inf"
        elif ratio == int(ratio) >= 0:
            r = int(ratio)
            ratio_pair = (r, 1)
            ratio_str = "-{:d}".format(r)
        elif ratio > 0:
            ratio_pair = (ratio, 1)
            ratio_str = "-" + str(ratio)
        else:
            raise ValueError("LexStat ratio must be in [0, ∞]")
    else:
        ratio_pair = (3, 2)
        ratio_str = ""
    if initial_threshold != 0.7:
        ratio_str += "-t{:02d}".format(int(initial_threshold * 100))

    try:
        scorers_etc = lingpy.compare.lexstat.LexStat(
            filename="lexstats-{:}-{:s}{:s}.tsv".format(
                sha1(metadata), soundclass, ratio_str))
        lex.scorer = scorers_etc.scorer
        lex.cscorer = scorers_etc.cscorer
        lex.bscorer = scorers_etc.bscorer
    except (OSError, ValueError):
        lex.get_scorer(
            runs=10000, ratio=ratio_pair, threshold=initial_threshold)
        lex.output(
            "tsv",
            filename="lexstats-{:}-{:s}{:s}".format(
                sha1(metadata), soundclass, ratio_str),
            ignore=[],
        )

    # For some purposes it is useful to have monolithic cognate classes.
    lex.cluster(
        method="lexstat",
        threshold=threshold,
        ref="cogid",
        cluster_method=cluster_method,
        verbose=True,
        override=True,
        gop=gop,
        mode=mode,
    )
    # But actually, in most cases partial cognates are much more useful.
    lex.partial_cluster(
        method="lexstat",
        threshold=threshold,
        cluster_method=cluster_method,
        ref="partialcognateids",
        override=True,
        verbose=True,
        gop=gop,
        mode=mode,
    )
    lex.output("tsv", filename="auto-clusters")

    alm = lingpy.Alignments(lex, ref="partialcognateids", fuzzy=True)
    alm.align(method="progressive")
    alm.output("tsv", filename=str(output_file), ignore="all", prettify=False)

    try:
        dataset.add_component("CognateTable")
    except ValueError:
        ...
    try:
        dataset.add_component("CognatesetTable")
    except ValueError:
        ...

    read_back = csv.DictReader(
        open(str(output_file) + ".tsv", encoding="utf-8"), delimiter="\t")
    cognatesets = {}
    judgements = []
    i = 1
    for line in read_back:
        partial = line["PARTIALCOGNATEIDS"].split()
        alignment = line["ALIGNMENT"].split(" + ")
        slice_start = 0
        for cs, alm in zip(partial, alignment):
            # TODO: @Gereon: is it alright to add the same content to Name
            # and ID?
            cognatesets.setdefault(cs, {"ID": cs, "Name": cs})
            length = len(alm.split())
            judgements.append({
                "ID": i,
                "Form_ID": line["ID"],
                "Cognateset_ID": cs,
                "Segment_Slice": [
                    "{:d}:{:d}".format(slice_start, slice_start + length)],
                "Alignment": alm.split(),
                "Source": ["LexStat"],
            })
            i += 1
            slice_start += length
    dataset.write(CognatesetTable=cognatesets.values())
    dataset.write(CognateTable=judgements)
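# filter_function_factory is called above but not shown in this section. A
# plausible sketch, reconstructed from the inline `filter` in the earlier
# variant of this function (the factory would simply close over the dataset
# to look up its CLDF column names); an assumption, not the verified original:
import typing as t

def filter_function_factory(dataset) -> t.Callable[[t.Dict[str, t.Any]], bool]:
    def filter(row: t.Dict[str, t.Any]) -> bool:
        row["tokens"] = [
            str(x)
            for x in clean_segments(
                row[dataset.column_names.forms.segments.lower()])
        ]
        # word boundaries are treated as morpheme boundaries
        row["tokens"] = ["+" if x == "_" else x for x in row["tokens"]]
        row["doculect"] = row[
            dataset.column_names.forms.languageReference.lower()]
        row["concept"] = row[
            dataset.column_names.forms.parameterReference.lower()]
        return row["segments"] and row["concept"]

    return filter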
lex = lingpy.compare.partial.Partial(
    out,
    col="doculect",
    row="concept",
    segments="tokens",
    transcription="ipa")
lex.get_scorer(runs=1000)
lex.output('tsv', filename='lexstats', ignore=[])

# For some purposes it is useful to have monolithic cognate classes.
lex.cluster(
    method='lexstat',
    threshold=0.55,
    ref='cogid',
    cluster_method="infomap",
    verbose=True)

# But actually, in most cases partial cognates are much more useful.
lex.partial_cluster(
    method='lexstat',
    threshold=0.55,
    ref='partialids',
    cluster_method="infomap",
    verbose=True)

lex.output("tsv", filename="auto-clusters")
alm = lingpy.Alignments(
    lex,
    row="concept",
    col="doculect",
    segments="tokens",
    transcription="ipa",
    ref="partialids",
    fuzzy=True)
alm.align(method='progressive')
alm.output('tsv', filename='aligned', ignore='all', prettify=False)
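# A short follow-up sketch: reading the prettify=False TSV written above back
# in with the csv module. lingpy upper-cases its column headers on output,
# as the read-back loops in the earlier variants also assume.
import csv

with open('aligned.tsv', encoding='utf-8') as f:
    for line in csv.DictReader(f, delimiter='\t'):
        print(line['DOCULECT'], line['CONCEPT'], line['ALIGNMENT'])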