def __init__(self, repos):
    """Store a Glottolog API object on ``self.api``.

    :param repos: Either an existing ``pyglottolog.Glottolog`` instance, \
    which is used as-is, or any other value, which is passed to the \
    ``pyglottolog.Glottolog`` constructor.
    """
    if isinstance(repos, pyglottolog.Glottolog):
        # Already an API object: reuse it instead of building a second one.
        self.api = repos
    else:
        self.api = pyglottolog.Glottolog(repos)
import json from pathlib import Path from collections import OrderedDict, Counter import xlrd import pybtex import pyglottolog from pyglottolog.fts import search_langs from pylexirumah import get_dataset from pylexirumah.geo_lookup import geonames, get_region from pylexirumah.util import identifier gl = pyglottolog.Glottolog( Path(pyglottolog.__file__).parent.parent.parent.parent / "glottolog") lr = get_dataset() # The concepts.json matches Indonesian glosses to LexiRumah concepts and # necessary comments. Most of the matches there were found automatically # through very close matches of the Indonesian or English gloss, with some # manual corrections. concepts = json.load((Path(__file__).parent / "concepts.json").open()) new_sources = pybtex.database.BibliographyData() new_lects = list(lr["LanguageTable"].iterdicts()) new_forms = list(lr["FormTable"].iterdicts()) synonym_counts = Counter() header = None for row in xlrd.open_workbook(
def run(args):
    """
    Entry point for command-line call.

    Loads the dataset named by ``args.dataset``, groups contribution IDs per
    catalog (once by glottocode, once by glottocode and source), writes the
    sound tables and finally runs the pairwise catalog comparisons.

    :param args: Parsed command-line namespace. The attributes read here are \
    ``log``, ``clts.dir``, ``glottolog.dir`` and ``dataset``; the namespace \
    is also handed on unchanged to ``iterate_combinations``.
    :return: None; the results are produced by the helper functions called.
    """
    # Instantiate BIPA and Glottolog
    args.log.info("Instantiating CLTS and Glottolog...")
    bipa = pyclts.CLTS(args.clts.dir).bipa
    glottolog = pyglottolog.Glottolog(args.glottolog.dir)

    args.log.info("Loading dataset...")
    ds = get_dataset(args.dataset).cldf_reader()

    # Collect list of catalogs for the comparison
    # TODO: allow to set from command-line
    catalogs = sorted({row["Catalog"] for row in ds["ValueTable"]})
    # Constant-time membership test for the per-row filter below; the sorted
    # list is kept for the helpers that receive `catalogs`.
    selected_catalogs = frozenset(catalogs)

    # Collect mapping of language ids to glottocodes
    glottocode_map = {
        row["ID"]: row["Glottocode"] for row in ds["LanguageTable"]
    }

    # Collect parameters, which include unicode and bipa
    parameter_map = {
        row["ID"]: {"unicode": row["Name"], "bipa": row["BIPA"]}
        for row in ds["ParameterTable"]
    }

    # Collect, in a single pass over the values:
    #  - all contribution IDs mapped to a given glottocode (`inventories`),
    #    for glottocode comparison;
    #  - all contribution IDs mapped to a (glottocode, source) pair
    #    (`inventories_source`), for source comparison.
    inventories = defaultdict(lambda: defaultdict(set))
    inventories_source = defaultdict(lambda: defaultdict(set))
    for row in ds["ValueTable"]:
        glottocode = glottocode_map.get(row["Language_ID"])
        source = ";".join(row["Source"])
        if not glottocode:
            # Rows whose language has no glottocode take part in neither
            # comparison.
            continue

        catalog = row["Catalog"]
        contribution_id = row["Contribution_ID"]
        # Only the glottocode comparison is restricted to the selected
        # catalogs; the source comparison accepts every catalog.
        if catalog in selected_catalogs:
            inventories[catalog][glottocode].add(contribution_id)
        if source:
            inventories_source[catalog][glottocode, source].add(
                contribution_id)

    # Collect value for glottocode comparison (values) and for
    # for source comparison (values_source)
    values = collect_inventory_values(ds, inventories, parameter_map)
    values_source = collect_inventory_values(ds, inventories_source,
                                             parameter_map)

    # Write all sounds for both comparisons in single tables
    args.log.info("Writing sound tables...")
    write_soundtable(values, inventories, glottolog)
    write_soundtable(values_source, inventories_source, glottolog,
                     source=True)

    # Iterate over all combinations and write results
    iterate_combinations(values, inventories, catalogs, bipa, glottolog, args)
    iterate_combinations(values_source, inventories_source, catalogs, bipa,
                         glottolog, args, source=True)
def caching_api(repos_path):
    """Return a Glottolog instance on the shared directory, for read-only tests.

    The repository path is converted to ``str`` and the instance is created
    with ``cache=True``.
    """
    shared_repos = str(repos_path)
    return pyglottolog.Glottolog(shared_repos, cache=True)
def api_copy(tmpdir, repos_path):
    """Return a Glottolog instance working on an isolated directory copy.

    The tree at ``repos_path`` is copied to ``tmpdir / 'repos'`` and the
    returned instance is opened on that copy.
    """
    isolated_repos = str(tmpdir / 'repos')
    shared_repos = str(repos_path)
    path.copytree(shared_repos, isolated_repos)
    api_on_copy = pyglottolog.Glottolog(isolated_repos)
    return api_on_copy
import numpy import xml.etree.ElementTree as ET import itertools import pyglottolog from cldfcatalog import Config cfg = Config.from_file() glottolog = pyglottolog.Glottolog(cfg.get_clone("glottolog")) languoids = {l.id: l for l in glottolog.languoids()} def macroarea(family): macroareas = set(family.macroareas) for c in family.descendants_from_nodemap(languoids): macroareas |= set(c.macroareas) if len(macroareas) > 1: return [] return macroareas families_by_macroarea = {} for toplevel in glottolog.tree.glob("*"): # These are the top-level families, I guess there is a better way to access # them, but I don't find it documented or by reading the pyglottolog code. glottocode = toplevel.stem # A few glottocodes are bookkeeping families, not actual language families family = glottolog.languoid(glottocode) if family.category == "Pseudo Family": continue macroarea = { 'japo1237': 'eurasia',
from csvw.dsv import UnicodeWriter

import pyglottolog

# This script relies on the pyglottolog API from major version 2 onwards.
assert int(pyglottolog.__version__.split('.')[0]) >= 2


def locations(glottolog, fid, outpath):
    """Write name, glottocode and coordinates of the languages below ``fid``.

    A languoid is written when its level equals
    ``glottolog.languoid_levels.language``, its latitude is not ``None`` and
    ``fid`` occurs as the second item of one of its lineage entries.

    :param glottolog: Glottolog API object providing ``languoids()`` and \
    ``languoid_levels``.
    :param fid: Identifier looked up in position 1 of each lineage entry.
    :param outpath: Target handed to ``UnicodeWriter``.
    """
    with UnicodeWriter(outpath) as writer:
        writer.writerow(['name', 'glottocode', 'latitude', 'longitude'])
        for lang in glottolog.languoids():
            is_language = lang.level == glottolog.languoid_levels.language
            if not (is_language and lang.latitude is not None):
                continue
            # Keep only languoids that have `fid` among their ancestors.
            if fid not in (node[1] for node in lang.lineage):
                continue
            record = [lang.name, lang.id, lang.latitude, lang.longitude]
            writer.writerow(record)


if __name__ == '__main__':
    import sys

    # Usage: <script> GLOTTOLOG_REPOS FAMILY_ID OUTPUT_PATH
    api = pyglottolog.Glottolog(sys.argv[1])
    locations(api, sys.argv[2], sys.argv[3])
coding: t.Dict[int, str] = {} for code in wals["CodeTable"]: if code["Parameter_ID"] != "81A": continue coding[code["ID"]] = code["Name"] missearched = { "Albanian": "alba1267", } print("Accessing Glottolog…") languoids: t.Dict[Language_ID, t.Optional[pyglottolog.languoids.Languoid]] = {} # Activate a specific version of Glottolog with Catalog.from_config("glottolog", tag="v4.3") as glottolog_repo: glottolog = pyglottolog.Glottolog(glottolog_repo.dir) build_langs_index(glottolog, logging) languoids_by_code = glottolog.languoids_by_code() print("Getting macroareas from WALS supplemented by Glottolog…") for language in tqdm(wals["LanguageTable"], total=wals["LanguageTable"].common_props["dc:extent"]): languoids[language["ID"]] = languoids_by_code.get(missearched.get(language["Name"])) if languoids[language["ID"]] is None and language["Glottocode"]: try: languoids[language["ID"]] = languoids_by_code.get(language["Glottocode"]) except (AttributeError, IndexError): pass if languoids[language["ID"]] is None: n, langs = search_langs(glottolog, language["Name"]) if n >= 1: print(language["Name"], langs[0], end="\n\n")
import sys import argparse from clldutils.path import Path from pycldf.sources import Source from pycldf.dataset import Wordlist, Dataset from csvw.metadata import Column from urllib.error import HTTPError from urllib.request import urlopen import newick from pybtex.database import BibliographyData, Entry try: import pyglottolog local_glottolog = pyglottolog.Glottolog() except (ValueError, ImportError): local_glottolog = None from . import get_dataset, repository REPLACE = { " ": "_", '’': "'", '-': "_", '.': "_", "'": "'", "*": "", '´': "'", 'µ': "_", 'ǎ': "a",
# I have a self-imposed deadline for this but something else just came up, so here be dragons. Sorry. # The eventual plan is to have everything be in Haskell - this is only for v0.1. import psycopg2 import pyglottolog import os.path from glob import glob from collections import OrderedDict import iphon_configparser from commit import parse_phoneme, parse_allophonic_rule, validate, INI_DEFAULTS from add import maybe GLOTTOLOG_PATH = os.path.expanduser('~/Documents/glottolog-4.0') glottolog = pyglottolog.Glottolog(GLOTTOLOG_PATH) # Harmonize with Pshrimp for now; fix them both later. DOCULECT_NAME_COL = 'language_name' DOC_SEG_JOIN_TBL = 'doculect_segments' SEGMENT_COL = 'phoneme' schema = '''\ languages ( \ id SERIAL PRIMARY KEY, \ name VARCHAR(255), \ glottocode VARCHAR(255) NOT NULL, \ iso6393 VARCHAR(255), \ family VARCHAR(255), \ genus VARCHAR(255), \ macroarea VARCHAR(255), \ latitude FLOAT, \ longitude FLOAT \
def api_copy(tmp_path, repos_path):
    """Return a Glottolog instance working on an isolated directory copy.

    The tree at ``repos_path`` is copied to ``tmp_path / 'repos'`` with
    ``shutil.copytree`` and the returned instance is opened on that copy.
    """
    isolated_repos = tmp_path / 'repos'
    shutil.copytree(repos_path, isolated_repos)
    api_on_copy = pyglottolog.Glottolog(isolated_repos)
    return api_on_copy
def api(repos_path):
    """Return a Glottolog instance on the shared directory, for read-only tests.

    ``repos_path`` is passed to ``pyglottolog.Glottolog`` unchanged.
    """
    shared_api = pyglottolog.Glottolog(repos_path)
    return shared_api