Ejemplo n.º 1
0
 def __init__(self, repos):
     """Store a Glottolog API object on ``self.api``.

     :param repos: Either an existing ``pyglottolog.Glottolog`` instance,
         which is used as-is, or any other value, which is passed to the
         ``pyglottolog.Glottolog`` constructor.
     """
     if isinstance(repos, pyglottolog.Glottolog):
         # Already an API object: reuse it instead of building a new one.
         self.api = repos
     else:
         self.api = pyglottolog.Glottolog(repos)
Ejemplo n.º 2
0
import json
from pathlib import Path
from collections import OrderedDict, Counter

import xlrd

import pybtex
import pyglottolog
from pyglottolog.fts import search_langs

from pylexirumah import get_dataset
from pylexirumah.geo_lookup import geonames, get_region
from pylexirumah.util import identifier

# Glottolog API rooted at a "glottolog" directory that is looked up four
# `.parent` steps above the pyglottolog package file.
gl = pyglottolog.Glottolog(
    Path(pyglottolog.__file__).parent.parent.parent.parent / "glottolog")

lr = get_dataset()
# The concepts.json matches Indonesian glosses to LexiRumah concepts and
# necessary comments. Most of the matches there were found automatically
# through very close matches of the Indonesian or English gloss, with some
# manual corrections.
# read_text() closes the file after reading; a bare .open() handed to
# json.load() would leave the handle open until garbage collection.
concepts = json.loads((Path(__file__).parent / "concepts.json").read_text())

# Start from the existing dataset content: the language and form tables are
# copied into plain lists of row dicts, next to an empty bibliography and an
# empty counter.
new_sources = pybtex.database.BibliographyData()
new_lects = list(lr["LanguageTable"].iterdicts())
new_forms = list(lr["FormTable"].iterdicts())
synonym_counts = Counter()

header = None
for row in xlrd.open_workbook(
Ejemplo n.º 3
0
def run(args):
    """
    Entry point for command-line call.

    Loads the dataset named by ``args.dataset``, groups the contribution IDs
    of its values per catalog and glottocode (and, separately, per catalog
    and glottocode/source pair), writes the sound tables for both groupings
    and finally iterates over the catalog combinations to write the results.

    :param args: Parsed command-line namespace. The attributes read here are
        ``log``, ``clts.dir``, ``glottolog.dir`` and ``dataset``; the whole
        namespace is also passed on to ``iterate_combinations``.
    """

    # Instantiate BIPA and Glottolog
    args.log.info("Instantiating CLTS and Glottolog...")
    bipa = pyclts.CLTS(args.clts.dir).bipa
    glottolog = pyglottolog.Glottolog(args.glottolog.dir)

    args.log.info("Loading dataset...")
    ds = get_dataset(args.dataset).cldf_reader()

    # Collect list of catalogs for the comparison
    # TODO: allow to set from command-line
    catalogs = sorted({row["Catalog"] for row in ds["ValueTable"]})

    # Collect mapping of language ids to glottocodes
    glottocode_map = {
        row["ID"]: row["Glottocode"]
        for row in ds["LanguageTable"]
    }

    # Collect parameters, which include unicode and bipa
    parameter_map = {
        row["ID"]: {
            "unicode": row["Name"],
            "bipa": row["BIPA"]
        }
        for row in ds["ParameterTable"]
    }

    # Collect all contribution IDs mapped to a given glottocode, for
    # glottocode comparison
    inventories = defaultdict(lambda: defaultdict(set))
    for row in ds["ValueTable"]:
        # Currently always true, since `catalogs` is built from this same
        # table; the check becomes relevant once the TODO above is done.
        if row["Catalog"] in catalogs:
            glottocode = glottocode_map.get(row["Language_ID"])
            # Languages without a glottocode cannot be compared; skip them.
            if glottocode:
                inventories[row["Catalog"]][glottocode].add(
                    row["Contribution_ID"])

    # Collect unique references per glottocode, for source comparison
    inventories_source = defaultdict(lambda: defaultdict(set))
    for row in ds['ValueTable']:
        glottocode = glottocode_map.get(row['Language_ID'])
        # All sources of a value are joined into one string, which becomes
        # part of the (glottocode, source) key.
        source = ";".join(row['Source'])
        if glottocode and source:
            inventories_source[row['Catalog']][glottocode, source].add(
                row['Contribution_ID'])

    # Collect values for the glottocode comparison (values) and for
    # the source comparison (values_source)
    values = collect_inventory_values(ds, inventories, parameter_map)
    values_source = collect_inventory_values(ds, inventories_source,
                                             parameter_map)

    # Write all sounds for both comparisons in single tables
    args.log.info("Writing sound tables...")
    write_soundtable(values, inventories, glottolog)
    write_soundtable(values_source, inventories_source, glottolog, source=True)

    # Iterate over all combinations and write results
    iterate_combinations(values, inventories, catalogs, bipa, glottolog, args)
    iterate_combinations(values_source,
                         inventories_source,
                         catalogs,
                         bipa,
                         glottolog,
                         args,
                         source=True)
Ejemplo n.º 4
0
def caching_api(repos_path):
    """Return a Glottolog API with ``cache=True`` for the shared repository.

    Intended for read-only tests: the repository at ``repos_path`` is used
    in place, with its path converted to ``str`` first.
    """
    repos_dir = str(repos_path)
    return pyglottolog.Glottolog(repos_dir, cache=True)
Ejemplo n.º 5
0
def api_copy(tmpdir, repos_path):
    """Return a Glottolog API working on a private copy of the repository.

    The tree at ``repos_path`` is copied to ``tmpdir / 'repos'`` and the API
    is created on that copy, so the shared directory is not the one opened.
    """
    destination = str(tmpdir / 'repos')
    origin = str(repos_path)
    path.copytree(origin, destination)
    return pyglottolog.Glottolog(destination)
import numpy

import xml.etree.ElementTree as ET
import itertools
import pyglottolog

from cldfcatalog import Config
# Open the Glottolog clone registered under "glottolog" in the cldfcatalog
# configuration file.
cfg = Config.from_file()
glottolog = pyglottolog.Glottolog(cfg.get_clone("glottolog"))
# Index every languoid by its id; macroarea() below hands this map to
# descendants_from_nodemap().
languoids = {l.id: l for l in glottolog.languoids()}


def macroarea(family):
    """Collect the macroareas of a family and of its descendants.

    Starts from ``family.macroareas`` and merges in the macroareas of each
    descendant (resolved through the module-level ``languoids`` map).

    Returns an empty list as soon as merging a descendant leaves more than
    one macroarea; otherwise returns the collected set.
    """
    collected = set(family.macroareas)
    for descendant in family.descendants_from_nodemap(languoids):
        collected = collected | set(descendant.macroareas)
        if len(collected) > 1:
            # Family spans several macroareas: no unambiguous answer.
            return []
    return collected

families_by_macroarea = {}
for toplevel in glottolog.tree.glob("*"):
    # These are the top-level families, I guess there is a better way to access
    # them, but I don't find it documented or by reading the pyglottolog code.
    glottocode = toplevel.stem
    # A few glottocodes are bookkeeping families, not actual language families
    family = glottolog.languoid(glottocode)
    if family.category == "Pseudo Family":
        continue
    macroarea = {
        'japo1237': 'eurasia',
from csvw.dsv import UnicodeWriter
import pyglottolog

# Refuse to run with a pyglottolog whose major version is below 2.
assert int(pyglottolog.__version__.split('.')[0]) >= 2


def locations(glottolog, fid, outpath):
    """Write name, glottocode and coordinates of a lineage's languages.

    :param glottolog: Glottolog API object; ``languoids()`` and
        ``languoid_levels`` are used.
    :param fid: Identifier matched against the second item of every entry
        in a languoid's ``lineage``.
    :param outpath: Destination handed to ``UnicodeWriter``.

    Only languoids at language level with a latitude are written.
    """
    columns = ['name', 'glottocode', 'latitude', 'longitude']
    with UnicodeWriter(outpath) as writer:
        writer.writerow(columns)
        for lang in glottolog.languoids():
            is_language = lang.level == glottolog.languoid_levels.language
            if not is_language or lang.latitude is None:
                continue
            lineage_ids = [node[1] for node in lang.lineage]
            if fid in lineage_ids:
                record = [lang.name, lang.id, lang.latitude, lang.longitude]
                writer.writerow(record)


if __name__ == '__main__':
    import sys

    # Positional command-line arguments: Glottolog repository, fid, outpath.
    api = pyglottolog.Glottolog(sys.argv[1])
    locations(api, sys.argv[2], sys.argv[3])
Ejemplo n.º 8
0
# Map code ID -> code name for the codes of WALS parameter 81A only.
coding: t.Dict[int, str] = {}
for code in wals["CodeTable"]:
    if code["Parameter_ID"] != "81A":
        continue
    coding[code["ID"]] = code["Name"]

# Manual overrides (language name -> Glottocode); these are consulted first
# in the lookup loop below.
missearched = {
    "Albanian": "alba1267",
}

print("Accessing Glottolog…")
# Languoid found for each WALS language ID; None while no lookup has matched.
languoids: t.Dict[Language_ID, t.Optional[pyglottolog.languoids.Languoid]] = {}
# Activate a specific version of Glottolog
with Catalog.from_config("glottolog", tag="v4.3") as glottolog_repo:
    glottolog = pyglottolog.Glottolog(glottolog_repo.dir)
    # NOTE(review): presumably builds the index that search_langs() queries
    # further down — confirm against pyglottolog.fts.
    build_langs_index(glottolog, logging)
    languoids_by_code = glottolog.languoids_by_code()

    print("Getting macroareas from WALS supplemented by Glottolog…")
    for language in tqdm(wals["LanguageTable"], total=wals["LanguageTable"].common_props["dc:extent"]):
        # 1. Manual override by language name (None when there is none).
        languoids[language["ID"]] = languoids_by_code.get(missearched.get(language["Name"]))
        # 2. The Glottocode recorded in the WALS language table.
        if languoids[language["ID"]] is None and language["Glottocode"]:
            try:
                languoids[language["ID"]] = languoids_by_code.get(language["Glottocode"])
            except (AttributeError, IndexError):
                pass
        # 3. Still nothing: search Glottolog by language name and print the
        #    first hit.
        if languoids[language["ID"]] is None:
            n, langs = search_langs(glottolog, language["Name"])
            if n >= 1:
                print(language["Name"], langs[0], end="\n\n")
Ejemplo n.º 9
0
import sys
import argparse

from clldutils.path import Path
from pycldf.sources import Source
from pycldf.dataset import Wordlist, Dataset
from csvw.metadata import Column

from urllib.error import HTTPError
from urllib.request import urlopen

import newick
from pybtex.database import BibliographyData, Entry
# Glottolog is optional: local_glottolog is None when it cannot be set up.
try:
    import pyglottolog
    # NOTE(review): no repository path is given; presumably pyglottolog then
    # locates a clone itself and raises ValueError when it cannot — confirm.
    local_glottolog = pyglottolog.Glottolog()
except (ValueError, ImportError):
    local_glottolog = None

from . import get_dataset, repository

REPLACE = {
    " ": "_",
    '’': "'",
    '-': "_",
    '.': "_",
    "'": "'",
    "*": "",
    '´': "'",
    'µ': "_",
    'ǎ': "a",
Ejemplo n.º 10
0
# I have a self-imposed deadline for this but something else just came up, so here be dragons. Sorry.
# The eventual plan is to have everything be in Haskell - this is only for v0.1.

import psycopg2
import pyglottolog
import os.path
from glob import glob
from collections import OrderedDict
import iphon_configparser
from commit import parse_phoneme, parse_allophonic_rule, validate, INI_DEFAULTS
from add import maybe

# Hard-coded location of the Glottolog data, under the current user's home
# directory; the API object is created from it at import time.
GLOTTOLOG_PATH = os.path.expanduser('~/Documents/glottolog-4.0')
glottolog = pyglottolog.Glottolog(GLOTTOLOG_PATH)

# Harmonize with Pshrimp for now; fix them both later.
# NOTE(review): presumably database column/table names — confirm against the
# schema and the queries that use them.
DOCULECT_NAME_COL = 'language_name'
DOC_SEG_JOIN_TBL = 'doculect_segments'
SEGMENT_COL = 'phoneme'

schema = '''\
    languages (                                 \
        id SERIAL PRIMARY KEY,                  \
        name VARCHAR(255),                      \
        glottocode VARCHAR(255) NOT NULL,       \
        iso6393 VARCHAR(255),                   \
        family VARCHAR(255),                    \
        genus VARCHAR(255),                     \
        macroarea VARCHAR(255),                 \
        latitude FLOAT,                         \
        longitude FLOAT                         \
Ejemplo n.º 11
0
def api_copy(tmp_path, repos_path):
    """Return a Glottolog API working on a private copy of the repository.

    The tree at ``repos_path`` is copied to ``tmp_path / 'repos'`` and the
    API is created on that copy.
    """
    destination = tmp_path / 'repos'
    shutil.copytree(repos_path, destination)
    isolated_api = pyglottolog.Glottolog(destination)
    return isolated_api
Ejemplo n.º 12
0
def api(repos_path):
    """Return a Glottolog API on the shared repository (read-only tests)."""
    shared_api = pyglottolog.Glottolog(repos_path)
    return shared_api