Beispiel #1
0
def read_lff(level, fp=None, dry_run=False):
    """Read a language forming file (lff/dff), yielding Languoid instances.

    :param level: `Level` instance selecting which file to read.
    :param fp: optional open file-like object; defaults to the built \
"<l>ff.txt" file for the given level.
    :param dry_run: passed through to `Languoid.from_lff`.
    :raises ValueError: for a line matching neither a language, isolate, \
nor classification pattern.
    """
    assert isinstance(level, Level)
    # Raw strings: the original non-raw patterns contained invalid string
    # escapes (\s, \[, \_) which raise DeprecationWarning/SyntaxWarning on
    # modern Python. "\_" inside "NOCODE\_" is just "_" to the regex engine,
    # so the pattern below is behaviorally identical.
    lang_line = re.compile(r"\s+" + NAME_AND_ID_REGEX + r"(\[([a-z]{3}|NOCODE_[^\]]+)?\])$")
    class_line = re.compile(NAME_AND_ID_REGEX + r"(,\s*" + NAME_AND_ID_REGEX + r")*$")
    isolate_line = re.compile(r"([^\[]+)(\[-isolate-\])$")

    path = None
    with fp or build_path("%sff.txt" % level.name[0]).open(encoding="utf8") as fp:
        for line in fp:
            line = line.rstrip()
            if line.startswith("#") or not line.strip():
                # ignore comments or empty lines
                continue
            match = lang_line.match(line)
            if match:
                # A language line must be preceded by a classification (or
                # isolate) line which sets `path`.
                assert path
                yield Languoid.from_lff(None if path == "isolate" else path, line.strip(), level, dry_run=dry_run)
            else:
                match = isolate_line.match(line)
                if match:
                    path = "isolate"
                else:
                    # assert it matches a classification line!
                    if not class_line.match(line):
                        raise ValueError(line)
                    path = line.strip()
Beispiel #2
0
def read_lff(level, fp=None, dry_run=False):
    """Read a language forming file (lff/dff), yielding Languoid instances.

    :param level: `Level` instance selecting which file to read.
    :param fp: optional open file-like object; defaults to the built \
"<l>ff.txt" file for the given level.
    :param dry_run: passed through to `Languoid.from_lff`.
    :raises ValueError: for a line matching neither a language, isolate, \
nor classification pattern.
    """
    assert isinstance(level, Level)
    # Raw strings: the original non-raw patterns contained invalid string
    # escapes (\s, \[, \_) which raise DeprecationWarning/SyntaxWarning on
    # modern Python. '\_' inside 'NOCODE\_' is just '_' to the regex engine,
    # so the pattern below is behaviorally identical.
    lang_line = re.compile(r'\s+' + NAME_AND_ID_REGEX +
                           r'(\[([a-z]{3}|NOCODE_[^\]]+)?\])$')
    class_line = re.compile(NAME_AND_ID_REGEX + r'(,\s*' + NAME_AND_ID_REGEX +
                            r')*$')
    isolate_line = re.compile(r'([^\[]+)(\[-isolate-\])$')

    path = None
    with fp or build_path(
            '%sff.txt' % level.name[0]).open(encoding='utf8') as fp:
        for line in fp:
            line = line.rstrip()
            if line.startswith('#') or not line.strip():
                # ignore comments or empty lines
                continue
            match = lang_line.match(line)
            if match:
                # A language line must be preceded by a classification (or
                # isolate) line which sets `path`.
                assert path
                yield Languoid.from_lff(None if path == 'isolate' else path,
                                        line.strip(),
                                        level,
                                        dry_run=dry_run)
            else:
                match = isolate_line.match(line)
                if match:
                    path = 'isolate'
                else:
                    # assert it matches a classification line!
                    if not class_line.match(line):
                        raise ValueError(line)
                    path = line.strip()
Beispiel #3
0
def read_lff(level, fp=None):
    """Read a language forming file (lff/dff), yielding Languoid instances.

    :param level: level name (e.g. 'language' or 'dialect'); its first \
character selects the default file.
    :param fp: optional open file-like object; defaults to the built \
"<l>ff.txt" file.
    :raises ValueError: for a line matching neither a language, isolate, \
nor classification pattern.
    """
    # Raw strings: the original non-raw patterns contained invalid string
    # escapes (\s, \[) which raise DeprecationWarning/SyntaxWarning on
    # modern Python; the compiled regexes are identical.
    lang_line = re.compile(r'\s+' + NAME_AND_ID_REGEX + r'(\[([a-z]{3})?\])$')
    class_line = re.compile(NAME_AND_ID_REGEX + r'(,\s*' + NAME_AND_ID_REGEX + r')*$')
    isolate_line = re.compile(r'([^\[]+)(\[-isolate-\])$')

    path = None
    with fp or build_path('%sff.txt' % level[0]).open(encoding='utf8') as fp:
        for line in fp:
            line = line.rstrip()
            if line.startswith('#') or not line.strip():
                # ignore comments or empty lines
                continue
            match = lang_line.match(line)
            if match:
                # A language line must be preceded by a classification (or
                # isolate) line which sets `path`.
                assert path
                yield Languoid.from_lff(
                    None if path == 'isolate' else path, line.strip(), level)
            else:
                match = isolate_line.match(line)
                if match:
                    path = 'isolate'
                else:
                    # assert it matches a classification line!
                    if not class_line.match(line):
                        raise ValueError(line)
                    path = line.strip()
Beispiel #4
0
def tree2lff(tree=TREE):
    """Write the lff and dff files from the languoid tree.

    Walks *tree*, groups languages and dialects by their lff group, and
    writes one "<l>ff.txt" file per level.
    """
    languoids = dict(dialect=defaultdict(list), language=defaultdict(list))
    nodes = {}

    for node in walk_tree(tree=tree, nodes=nodes):
        if node.level in languoids:
            languoids[node.level][node.lff_group()].append(node.lff_language())

    for level, by_group in languoids.items():
        target = build_path('%sff.txt' % level[0])
        with target.open('w', encoding='utf8') as fp:
            fp.write('# -*- coding: utf-8 -*-\n')
            for group in sorted(by_group):
                fp.write(group + '\n')
                for entry in sorted(by_group[group]):
                    fp.write(entry + '\n')
Beispiel #5
0
def lff2tree(tree=TREE, outdir=None, builddir=None, lffs=None):
    """Recreate the languoid directory tree from lff/dff files.

    - get mapping glottocode -> Languoid from old tree
    - assemble new directory tree
      - for each path component in lff/dff:
        - create new dir
        - copy info file from old tree (possibly updating the name) or
        - create info file
      - for each language/dialect in lff/dff:
        - create new dir
        - copy info file from old tree (possibly updating the name) or
        - create info file
    - rm old tree
    - copy new tree

    :raises ValueError: if a stale build dir cannot be removed, or a \
dialect is not attached to a known language.
    """
    # FIXME: instead of removing trees, we should just move the current one
    # from outdir to build, and then recreate in outdir.
    builddir = Path(builddir) if builddir else build_path('tree')
    old_tree = {l.id: l for l in walk_tree(tree)} if tree else {}
    out = Path(outdir or tree)
    if not out.parent.exists():
        out.parent.mkdir()

    if out.exists():
        if builddir.exists():
            try:
                rmtree(builddir)
            # Narrowed from a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit; rmtree failures raise OSError.
            # Best effort only: a surviving builddir is detected below.
            except OSError:  # pragma: no cover
                pass
            if builddir.exists():  # pragma: no cover
                raise ValueError('please remove %s before proceeding' %
                                 builddir)
        # move the old tree out of the way
        shutil.move(out.as_posix(), builddir.as_posix())
    out.mkdir()

    lffs = lffs or {}
    languages = {}
    for lang in read_lff(Level.language, fp=lffs.get(Level.language)):
        languages[lang.id] = lang
        lang2tree(lang, lang.lineage, out, old_tree)

    for lang in read_lff(Level.dialect, fp=lffs.get(Level.dialect)):
        # A dialect's first lineage entry must point at a known language.
        if not lang.lineage or lang.lineage[0][1] not in languages:
            raise ValueError('unattached dialect')  # pragma: no cover

        lang2tree(lang, languages[lang.lineage[0][1]].lineage + lang.lineage,
                  out, old_tree)
Beispiel #6
0
def lff2tree(tree=TREE, outdir=None, builddir=None, lffs=None):
    """Recreate the languoid directory tree from lff/dff files.

    - get mapping glottocode -> Languoid from old tree
    - assemble new directory tree
      - for each path component in lff/dff:
        - create new dir
        - copy info file from old tree (possibly updating the name) or
        - create info file
      - for each language/dialect in lff/dff:
        - create new dir
        - copy info file from old tree (possibly updating the name) or
        - create info file
    - rm old tree
    - copy new tree

    :raises ValueError: if a stale build dir cannot be removed, or a \
dialect is not attached to a known language.
    """
    # FIXME: instead of removing trees, we should just move the current one
    # from outdir to build, and then recreate in outdir.
    builddir = Path(builddir) if builddir else build_path("tree")
    old_tree = {l.id: l for l in walk_tree(tree)} if tree else {}
    out = Path(outdir or tree)
    if not out.parent.exists():
        out.parent.mkdir()

    if out.exists():
        if builddir.exists():
            try:
                rmtree(builddir)
            # Narrowed from a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit; rmtree failures raise OSError.
            # Best effort only: a surviving builddir is detected below.
            except OSError:  # pragma: no cover
                pass
            if builddir.exists():  # pragma: no cover
                raise ValueError("please remove %s before proceeding" % builddir)
        # move the old tree out of the way
        shutil.move(out.as_posix(), builddir.as_posix())
    out.mkdir()

    lffs = lffs or {}
    languages = {}
    for lang in read_lff(Level.language, fp=lffs.get(Level.language)):
        languages[lang.id] = lang
        lang2tree(lang, lang.lineage, out, old_tree)

    for lang in read_lff(Level.dialect, fp=lffs.get(Level.dialect)):
        # A dialect's first lineage entry must point at a known language.
        if not lang.lineage or lang.lineage[0][1] not in languages:
            raise ValueError("unattached dialect")  # pragma: no cover

        lang2tree(lang, languages[lang.lineage[0][1]].lineage + lang.lineage, out, old_tree)
Beispiel #7
0
def tree2lff(tree=TREE, out_paths=None):
    """Serialize the language/dialect grouping of *tree* into lff/dff files.

    :param tree: root of the languoid tree to walk.
    :param out_paths: optional mapping of Level -> output path, overriding
        the default "<l>ff.txt" build paths.
    """
    out_paths = out_paths or {}
    languoids = {Level.dialect: defaultdict(list), Level.language: defaultdict(list)}
    nodes = {}

    for node in walk_tree(tree=tree, nodes=nodes):
        if node.level in languoids:
            languoids[node.level][node.lff_group()].append(node.lff_language())

    for level, by_group in languoids.items():
        target = out_paths.get(level, build_path("%sff.txt" % level.name[0]))
        with target.open("w", encoding="utf8") as fp:
            fp.write("# -*- coding: utf-8 -*-\n")
            for group in sorted(by_group):
                fp.write(group + "\n")
                for entry in sorted(by_group[group]):
                    fp.write(entry + "\n")
Beispiel #8
0
def tree2lff(tree=TREE, out_paths=None):
    """Dump the languages and dialects of *tree* to lff/dff files.

    :param tree: root of the languoid tree to walk.
    :param out_paths: optional mapping of Level -> output path, overriding
        the default "<l>ff.txt" build paths.
    """
    if not out_paths:
        out_paths = {}
    grouped = {
        Level.dialect: defaultdict(list),
        Level.language: defaultdict(list)
    }
    nodes = {}

    for languoid in walk_tree(tree=tree, nodes=nodes):
        if languoid.level in grouped:
            grouped[languoid.level][languoid.lff_group()].append(
                languoid.lff_language())

    for level, languages in grouped.items():
        out_path = out_paths.get(level, build_path('%sff.txt' % level.name[0]))
        with out_path.open('w', encoding='utf8') as fp:
            fp.write('# -*- coding: utf-8 -*-\n')
            for group_path in sorted(languages):
                fp.write(group_path + '\n')
                for entry in sorted(languages[group_path]):
                    fp.write(entry + '\n')
Beispiel #9
0
def lff2tree(tree=TREE, outdir=None, test=False):
    """
    - get mapping glottocode -> Languoid from old tree
    - assemble new directory tree
      - for each path component in lff/dff:
        - create new dir
        - copy info file from old tree (possibly updating the name) or
        - create info file
      - for each language/dialect in lff/dff:
        - create new dir
        - copy info file from old tree (possibly updating the name) or
        - create info file
    - rm old tree
    - copy new tree
    """
    out = Path(outdir or build_path('tree'))
    if not out.parent.exists():
        out.parent.mkdir()
    if out.exists():
        rmtree(out)
    out.mkdir()

    # Index the current tree by glottocode so existing info files can be
    # reused when assembling the new tree.
    old_tree = {}
    if tree:
        for node in walk_tree(tree):
            old_tree[node.id] = node

    languages = {}
    for lang in read_lff('language'):
        languages[lang.id] = lang
        lang2tree(lang, lang.lineage, out, old_tree)

    for dialect in read_lff('dialect'):
        lineage = dialect.lineage
        if not lineage or lineage[0][1] not in languages:
            raise ValueError('unattached dialect')
        # Prepend the parent language's lineage to attach the dialect.
        full_lineage = languages[lineage[0][1]].lineage + lineage
        lang2tree(dialect, full_lineage, out, old_tree)

    if not test:
        rmtree(TREE, ignore_errors=True)
        copytree(out, TREE)
Beispiel #10
0
import difflib
import operator
import itertools
import contextlib
import collections

from six import string_types, viewkeys
from clldutils.dsv import UnicodeWriter
from clldutils import jsonlib

from pyglottolog.util import build_path, unique, group_first
from pyglottolog.monsterlib import _bibtex

__all__ = ['Database']

# Default path of the sqlite3 database built from the bibfile collection.
DBFILE = build_path('_bibfiles.sqlite3').as_posix()

# NOTE(review): presumably UNION_FIELDS are unioned across merged entries
# and IGNORE_FIELDS skipped during merge — confirm against the merge logic.
UNION_FIELDS = {'fn', 'asjp_name', 'isbn'}
IGNORE_FIELDS = {'crossref', 'numnote', 'glotto_id'}


class Database(object):
    """Bibfile collection parsed into an sqlite3 file."""

    @staticmethod
    def _get_bibfiles(bibfiles):
        # Return the given collection unchanged; only fall back to the
        # default Collection when none was supplied.
        if bibfiles is not None:
            return bibfiles
        from _bibfiles import Collection  # pragma: no cover
        return Collection()  # pragma: no cover
Beispiel #11
0
def main(repos=DATA_DIR, rebuild=False):
    """Compile the monster bibfile from the individual bibtex files.

    Pipeline (each stage logs a ctime timestamp): build/open the sqlite db
    of merged bibfiles, annotate the merged entries (macro_area, hhtype,
    lgcode, inlg), print entry statistics, write the previous-mappings CSV
    and the replacements JSON, trickle annotations back into the bibfiles,
    and save the merged result as UTF-8.

    :param repos: path to the data repository.
    :param rebuild: if True, rebuild the sqlite db instead of reusing it.
    """
    bibfiles = _bibfiles.Collection(references_path('bibtex', repos=repos))
    previous = references_path('monster.csv', repos=repos)
    replacements = build_path('monster-replacements.json', repos=repos)
    monster = _bibfiles.BibFile(
        build_path('monster-utf8.bib', repos=repos), encoding='utf-8', sortkey='bibkey')
    tree = languoids_path('tree', repos=repos)
    hht = HHTypes(repos=repos)

    print('%s open/rebuild bibfiles db' % time.ctime())
    db = bibfiles.to_sqlite(
        build_path('_bibfiles.sqlite3', repos=repos).as_posix(),
        rebuild=rebuild)

    print('%s compile_monster' % time.ctime())
    # m maps bibkey -> (entrytype, fields) — see the (t, f) unpacking below.
    m = dict(db.merged())

    print('%s load hh.bib' % time.ctime())
    hhbib = bibfiles['hh.bib'].load()

    # Annotate with macro_area from lgcode when lgcode is assigned manually
    print('%s macro_area_from_lgcode' % time.ctime())
    m = macro_area_from_lgcode(m, tree)

    # Annotate with hhtype
    print('%s annotate hhtype' % time.ctime())
    m = markconservative(
        m,
        hht.triggers,
        hhbib,
        hht,
        build_path('monstermark-hht.txt', repos=repos),
        rank=lambda l: hht[l])

    ltriggers = languoids.load_triggers(tree=tree)

    # Annotate with lgcode
    print('%s annotate lgcode' % time.ctime())
    m = markconservative(
        m,
        ltriggers['lgcode'],
        hhbib,
        hht,
        build_path('monstermark-lgc.txt', repos=repos))

    # Annotate with inlg
    print('%s add_inlg_e' % time.ctime())
    m = add_inlg_e(m, ltriggers['inlg'])

    # Print some statistics
    stats = Counter()
    print(time.ctime())
    for t, f in m.values():
        stats.update(['entry'])
        for field in ['lgcode', 'hhtype', 'macro_area']:
            if field in f:
                stats.update([field])
    print("# entries", stats['entry'])
    for field in ['lgcode', 'hhtype', 'macro_area']:
        print("with " + field, stats[field])

    # Update the CSV with the previous mappings for later reference
    print('%s update_previous' % time.ctime())
    db.to_csvfile(previous)

    print('%s save_replacements' % time.ctime())
    db.to_replacements(replacements)

    # Trickling back
    print('%s trickle' % time.ctime())
    db.trickle(bibfiles)

    # Save
    print('%s save as utf8' % time.ctime())
    monster.save(m, verbose=False)

    print('%s done.' % time.ctime())
Beispiel #12
0
import difflib
import operator
import itertools
import contextlib
import collections

from six import string_types
from clldutils.dsv import UnicodeWriter
from clldutils import jsonlib

from pyglottolog.util import build_path, unique, group_first
import _bibtex

__all__ = ['Database']

# Default path of the sqlite3 database built from the bibfile collection.
DBFILE = build_path('_bibfiles.sqlite3').as_posix()

# NOTE(review): presumably UNION_FIELDS are unioned across merged entries
# and IGNORE_FIELDS skipped during merge — confirm against the merge logic.
UNION_FIELDS = {'fn', 'asjp_name', 'isbn'}
IGNORE_FIELDS = {'crossref', 'numnote', 'glotto_id'}


class Database(object):
    """Bibfile collection parsed into an sqlite3 file."""
    @staticmethod
    def _get_bibfiles(bibfiles):
        if bibfiles is None:  # pragma: no cover
            from _bibfiles import Collection
            return Collection()
        return bibfiles

    @staticmethod
Beispiel #13
0
def main(repos=DATA_DIR, rebuild=False):
    """Compile the monster bibfile from the individual bibtex files.

    Pipeline (each stage logs a ctime timestamp): build/open the sqlite db
    of merged bibfiles, annotate the merged entries (macro_area, hhtype,
    lgcode, inlg), print entry statistics, write the previous-mappings CSV
    and the replacements JSON, trickle annotations back into the bibfiles,
    and save the merged result as UTF-8.

    :param repos: path to the data repository.
    :param rebuild: if True, rebuild the sqlite db instead of reusing it.
    """
    bibfiles = _bibfiles.Collection(references_path('bibtex', repos=repos))
    previous = references_path('monster.csv', repos=repos)
    replacements = build_path('monster-replacements.json', repos=repos)
    monster = _bibfiles.BibFile(
        build_path('monster-utf8.bib', repos=repos), encoding='utf-8', sortkey='bibkey')
    tree = languoids_path('tree', repos=repos)
    hht = HHTypes(repos=repos)

    print('%s open/rebuild bibfiles db' % time.ctime())
    db = bibfiles.to_sqlite(
        build_path('_bibfiles.sqlite3', repos=repos).as_posix(),
        rebuild=rebuild)

    print('%s compile_monster' % time.ctime())
    # m maps bibkey -> (entrytype, fields) — see the (t, f) unpacking below.
    m = dict(db.merged())

    print('%s load hh.bib' % time.ctime())
    hhbib = bibfiles['hh.bib'].load()

    # Annotate with macro_area from lgcode when lgcode is assigned manually
    print('%s macro_area_from_lgcode' % time.ctime())
    m = macro_area_from_lgcode(m, tree)

    # Annotate with hhtype
    print('%s annotate hhtype' % time.ctime())
    m = markconservative(
        m,
        hht.triggers,
        hhbib,
        hht,
        build_path('monstermark-hht.txt', repos=repos),
        rank=lambda l: hht[l])

    ltriggers = languoids.load_triggers(tree=tree)

    # Annotate with lgcode
    print('%s annotate lgcode' % time.ctime())
    m = markconservative(
        m,
        ltriggers['lgcode'],
        hhbib,
        hht,
        build_path('monstermark-lgc.txt', repos=repos))

    # Annotate with inlg
    print('%s add_inlg_e' % time.ctime())
    m = add_inlg_e(m, ltriggers['inlg'])

    # Print some statistics
    stats = Counter()
    print(time.ctime())
    for t, f in m.values():
        stats.update(['entry'])
        for field in ['lgcode', 'hhtype', 'macro_area']:
            if field in f:
                stats.update([field])
    print("# entries", stats['entry'])
    for field in ['lgcode', 'hhtype', 'macro_area']:
        print("with " + field, stats[field])

    # Update the CSV with the previous mappings for later reference
    print('%s update_previous' % time.ctime())
    db.to_csvfile(previous)

    print('%s save_replacements' % time.ctime())
    db.to_replacements(replacements)

    # Trickling back
    print('%s trickle' % time.ctime())
    db.trickle(bibfiles)

    # Save
    print('%s save as utf8' % time.ctime())
    monster.save(m, verbose=False)

    print('%s done.' % time.ctime())
Beispiel #14
0
import os
import csv
import json
import sqlite3
import difflib
import operator
import itertools
import contextlib
import collections

from pyglottolog.util import references_path, build_path
import _bibtex

__all__ = ['Database']

# Default file locations derived from the repository layout.
DBFILE = build_path('_bibfiles.sqlite3').as_posix()
BIBFILE = build_path('monster-utf8.bib').as_posix()
CSVFILE = references_path('monster.csv').as_posix()
REPLACEMENTSFILE = build_path('monster-replacements.json').as_posix()

# NOTE(review): presumably fields whose values are unioned across merged
# entries — confirm against the merge logic.
UNION_FIELDS = {'fn', 'asjp_name', 'isbn'}

# NOTE(review): presumably fields skipped during merge/comparison — confirm.
IGNORE_FIELDS = {'crossref', 'numnote', 'glotto_id'}


class Database(object):
    """Bibfile collection parsed into an sqlite3 file."""

    @staticmethod
    def _get_bibfiles(bibfiles):
        if bibfiles is None: