Example #1
0
    def languoid(self, id_: typing.Union[str, lls.Languoid]) -> lls.Languoid:
        """
        Look up a single languoid by its identifier.

        :param id_: Glottocode or ISO code. A `Languoid` instance is handed back unchanged.
        :return: The matching languoid, or `None` if no directory of the tree matches.
        """
        if isinstance(id_, lls.Languoid):
            return id_

        if self.cache and id_ in self.cache:
            return self.cache[id_]

        # Decide once whether we compare ISO codes or directory names (Glottocodes).
        by_iso_code = ISO_CODE_PATTERN.match(id_)

        if self.cache:
            directories = self._tree_dirs
        else:
            directories = walk(self.tree, mode='dirs')

        for directory in directories:
            if by_iso_code:
                # The ISO code is only known after instantiating the languoid.
                if self.cache:
                    candidate = self.cache.add(directory, self)
                else:
                    candidate = lls.Languoid.from_dir(directory, _api=self)
                if candidate.iso_code == id_:
                    return candidate
                continue

            # With a cache, every directory we pass is instantiated and stored on the way.
            cached = self.cache.add(directory, self) if self.cache else None
            if directory.name != id_:
                continue
            if self.cache:
                return cached
            return lls.Languoid.from_dir(directory, _api=self)

        return None
Example #2
0
    def test_walk(self):
        # `walk` must list only files in 'files' mode and only directories in 'dirs' mode.
        from clldutils.path import walk

        self.tmp_path('testdir').mkdir()
        self.make_file('testfile')

        file_names = [entry.name for entry in walk(self.tmp_path(), mode='files')]
        self.assertNotIn('testdir', file_names)
        self.assertIn('testfile', file_names)

        dir_names = [entry.name for entry in walk(self.tmp_path(), mode='dirs')]
        self.assertIn('testdir', dir_names)
        self.assertNotIn('testfile', dir_names)
Example #3
0
def test_walk(tmppath):
    """`walk` lists only files in 'files' mode and only directories in 'dirs' mode."""
    from clldutils.path import walk

    subdir = tmppath / 'testdir'
    subdir.mkdir()
    make_file(tmppath, name='testfile')

    file_names = [entry.name for entry in walk(subdir.parent, mode='files')]
    assert 'testdir' not in file_names
    assert 'testfile' in file_names

    dir_names = [entry.name for entry in walk(subdir.parent, mode='dirs')]
    assert 'testdir' in dir_names
    assert 'testfile' not in dir_names
Example #4
0
    def languoid(self, id_):
        if isinstance(id_, languoids.Languoid):
            return id_

        if ISO_CODE_PATTERN.match(id_):
            for d in walk(self.tree, mode='dirs'):
                l_ = languoids.Languoid.from_dir(d)
                if l_.iso_code == id_:
                    return l_
        else:
            for d in walk(self.tree, mode='dirs'):
                if d.name == id_:
                    return languoids.Languoid.from_dir(d)
Example #5
0
    def languoids(
        self,
        ids: set = None,
        maxlevel: typing.Union[int, config.LanguoidLevel, str] = None,
        exclude_pseudo_families: bool = False
    ) -> typing.Generator[lls.Languoid, None, None]:
        """
        Iterate over the languoids of the tree.

        :param ids: `set` of Glottocodes to restrict the result to. Directories whose name is
            not listed are skipped before any languoid is instantiated.
        :param maxlevel: Either an `int`, compared against the length of a languoid's lineage,
            or a level descriptor that is looked up in `self.languoid_levels` (falling back to
            'dialect') and compared against `Languoid.level`.
        :param exclude_pseudo_families: If set, languoids whose category starts with 'Pseudo'
            are left out.
        """
        numeric_limit = isinstance(maxlevel, int)
        if not numeric_limit:
            # Anything that is not an int is treated as a `Languoid.level` descriptor.
            maxlevel = self.languoid_levels.get(maxlevel or 'dialect')

        # The tree is traversed top-down, so this mapping of Languoid.id to
        # (name, id, level) triples can be shared for populating `Languoid.lineage`.
        lineage_nodes = {}

        if self.cache:
            directories = self._tree_dirs
        else:
            directories = walk(self.tree, mode='dirs')

        for directory in directories:
            if ids is not None and directory.name not in ids:
                continue

            if self.cache:
                languoid = self.cache.add(directory, self)
            else:
                languoid = lls.Languoid.from_dir(directory, nodes=lineage_nodes, _api=self)

            if numeric_limit:
                within_limit = len(languoid.lineage) <= maxlevel
            else:
                within_limit = languoid.level <= maxlevel
            if not within_limit:
                continue

            if exclude_pseudo_families and languoid.category.startswith('Pseudo'):
                continue
            yield languoid
Example #6
0
    def create(self, path, metadata, filter_=filter_hidden, object_class=None):
        """
        Create objects in CDSTAR and register them in the catalog.

        Note that we guess the mimetype based on the filename extension, using
        `mimetypes.guess_type`. Thus, it is the caller's responsibility to add custom or
        otherwise uncommon types to the list of known types using `mimetypes.add_type`.

        :param path:
        :param metadata:
        :param filter_:
        :return:
        """
        path = Path(path)
        if path.is_file():
            fnames = [path]
        elif path.is_dir():
            fnames = list(walk(path, mode='files'))
        else:
            raise ValueError(
                'path must be a file or directory')  # pragma: no cover
        for fname in fnames:
            if not filter_ or filter_(fname):
                created, obj = self._create(fname,
                                            metadata,
                                            object_class=object_class)
                yield fname, created, obj
Example #7
0
 def languoid(self, id_):
     """
     Look up a single languoid by ISO code or by directory name.

     :param id_: ISO code (anything matching `ISO_CODE_PATTERN`) or a directory name of the tree.
     :return: The matching languoid, or `None` if nothing matches.
     """
     if not ISO_CODE_PATTERN.match(id_):
         # Not an ISO code: compare against directory names, instantiate only the match.
         for directory in walk(self.tree, mode='dirs'):
             if directory.name == id_:
                 return languoids.Languoid.from_dir(directory)
         return None

     for lang in languoids.walk_tree(tree=self.tree):
         if lang.iso_code == id_:
             return lang
     return None
Example #8
0
 def languoid(self, id_):
     """
     Look up a single languoid by ISO code or by directory name.

     :param id_: ISO code (anything matching `ISO_CODE_PATTERN`) or a directory name of the tree.
     :return: The first matching languoid, or `None` if nothing matches.
     """
     if ISO_CODE_PATTERN.match(id_):
         by_iso = (
             lang for lang in languoids.walk_tree(tree=self.tree)
             if lang.iso_code == id_)
         return next(by_iso, None)

     # Only the directory whose name matches is turned into a languoid.
     by_name = (
         languoids.Languoid.from_dir(directory)
         for directory in walk(self.tree, mode='dirs')
         if directory.name == id_)
     return next(by_name, None)
Example #9
0
def iter_datasets(d):
    """
    Discover CLDF datasets in a directory.

    Every file below `d` is passed to `sniff`; files it accepts are treated as metadata
    files and loaded lazily, one at a time.

    :param d: directory
    :return: generator of `Dataset` instances.
    """
    metadata_files = (path for path in walk(d, mode='files') if sniff(path))
    for metadata_file in metadata_files:
        yield Dataset.from_metadata(metadata_file)
Example #10
0
def get_file_paths(raw_htmls, n=None):
    """
    Build a sorted list of the HTML paths found below a directory, e.g. numerals/raw/.

    A path is kept when its suffix starts with ".htm", its name does not start with
    "Copy of", its name is not listed in SKIP and its name does not match SKIP_RE.

    :param raw_htmls: Path to raw numerals HTML files.
    :param n: If truthy, only the first `n` entries of the sorted list are returned; \
useful for debugging.
    :return: A sorted list of path objects.
    """
    def _is_wanted(path):
        # Single place for the selection rules shared by the full and the truncated result.
        return (path.suffix.startswith(".htm")
                and not path.name.startswith("Copy of")
                and path.name not in SKIP
                and not re.search(SKIP_RE, path.name))

    paths = sorted(path for path in walk(raw_htmls) if _is_wanted(path))
    if n:  # pragma: no cover
        return paths[:n]
    return paths
Example #11
0
def run():
    """
    Report property URLs in the JSON files of the repository that are not known terms.

    The known terms are the `rdf:about` attribute values of all elements returned by
    `read_terms()`. For each JSON file below the 'components' and 'modules' directories of
    `REPO_DIR`, every 'propertyUrl' or 'dc:conformsTo' value that is not among these terms
    is printed, preceded by the path of the file.
    """
    terms = [
        element.attrib[ns('rdf:about')]
        for element in read_terms().iter()
        if ns('rdf:about') in element.attrib
    ]

    checked_keys = ['propertyUrl', 'dc:conformsTo']
    for dirname in ['components', 'modules']:
        for path in walk(REPO_DIR.joinpath(dirname)):
            if path.suffix != '.json':
                continue
            for key, value in iterproperties(load(path)):
                if key in checked_keys and value not in terms:
                    print(path)
                    print(value)
Example #12
0
 def languoids(self, ids=None):
     """
     Yield a languoid for each directory of the tree.

     :param ids: Optional container of directory names; when given, directories whose name
         is not contained are skipped without being instantiated.
     """
     for directory in walk(self.tree, mode='dirs'):
         if ids is not None and directory.name not in ids:
             continue
         yield languoids.Languoid.from_dir(directory)
Example #13
0
def walk_tree(tree=TREE, **kw):
    """
    Yield a `Languoid` for every '.ini' file below `tree`, following symlinks.

    :param tree: Root directory to walk; defaults to `TREE`.
    :param kw: Keyword arguments passed on to `Languoid.from_ini`.
    """
    for path in walk(tree, mode='files', followlinks=True):
        if path.suffix != '.ini':
            continue
        yield Languoid.from_ini(path, **kw)
Example #14
0
    def test_lff2tree(self):
        # Exercises `lff2tree` (and once `tree2lff`) by rewriting the 'lff.txt' / 'dff.txt'
        # fixtures step by step and rebuilding the tree after each change. All steps operate
        # on the same `self.api`, so each one starts from the state the previous ones left.
        #
        # The return value of `_set_lff` is string-like (`.replace` is called on it below) -
        # presumably the text that was written; verify against the helper.
        #
        # NOTE(review): `assertRaisesRegexp` is a deprecated alias of `assertRaisesRegex`
        # and was removed in Python 3.12 - rename once Python 2 support is not required.

        # Step 1: initial classification. Judging by the level assertions below, lff.txt
        # lists languages and dff.txt lists the dialects below them.
        lfftext = self._set_lff(
            """# -*- coding: utf-8 -*-
Abkhaz-Adyge [abkh1242] aaa
    Ubykh [ubyk1235]uby
Abkhaz-Adyge [abkh1242] aaa; Abkhaz-Abaza [abkh1243]
    Abaza [abaz1241]abq
    Abkhazian [abkh1244]abk
Abkhaz-Adyge [abkh1242] aaa; Circassian [circ1239]
    Adyghe [adyg1241]ady
    Kabardian [kaba1278]kbd
""", 'lff.txt')

        self._set_lff(
            """# -*- coding: utf-8 -*-
Abaza [abaz1241] abq
    Ashkaraua [ashk1247]
    Bezshagh [bezs1238]
    Tapanta [tapa1256]
Abkhazian [abkh1244] abk
    Abzhui [abzh1238]
    Bzyb [bzyb1238]
    Samurzakan [samu1242]
""", 'dff.txt')

        lff2tree(self.api)
        self.assertEqual(self.api.languoid('abkh1242').iso, 'aaa')
        self.assertEqual(self.api.languoid('ashk1247').level, Level.dialect)
        self.assertEqual(self.api.languoid('abaz1241').level, Level.language)
        self.assertEqual(self.api.languoid('abaz1241').hid, 'abq')

        # Step 2: rename a node. No directory name may show up twice in the tree
        # afterwards, and the languoid must carry the new name.
        self._set_lff(lfftext.replace('Abkhaz-Abaza', 'Abkhaz-Abazzza'),
                      'lff.txt')
        lff2tree(self.api)
        glottocodes = [d.name for d in walk(self.api.tree, mode='dirs')]
        self.assertEqual(len(glottocodes), len(set(glottocodes)))
        self.assertEqual(self.api.languoid('abkh1243').name, 'Abkhaz-Abazzza')

        # Step 3: restructure the classification. abaz1241 moves into a classification
        # line (asserted to become a family below), ISO code 'aaa' moves to a new dialect
        # entry, and nodes with empty brackets (no Glottocode yet) are introduced.
        lfftext = self._set_lff(
            """# -*- coding: utf-8 -*-
Abkhaz-Adyge [abkh1242]
    Ubykh [ubyk1235]
Abkhaz-Adyge [abkh1242]; Abkhaz-Abaza [abkh1243]; Abaza [abaz1241]
    Ashkaraua [ashk1247]xyz
    Abkhazian [abkh1244]
Abkhaz-Adyge [abkh1242]; Circassian [circ1239]
    Adyghe [adyg1241]ady
    Kabardian [kaba1278]
Abkhaz-Adyge [abkh1242]; Circassian [circ1239]; New Group []
    New name []NOCODE_New-name
    Another one []
""", 'lff.txt')

        self._set_lff(
            """# -*- coding: utf-8 -*-
Ashkaraua [ashk1247]xyz
    Bezshagh [bezs1238]
    Tapanta [tapa1256]
Abkhazian [abkh1244]
    Abzhui [abzh1238]
    Bzyb [bzyb1238]
    Samurzakan [samu1242]
Kabardian [kaba1278]
    Dia []aaa
""", 'dff.txt')

        lff2tree(self.api)
        self.assertEqual(self.api.languoid('abaz1241').level, Level.family)
        # Now we test two things:
        # - aaa has been removed as ISO code from abkh1242
        # - aaa has been attached as ISO code to a newly created language
        self.assertEqual(self.api.languoid('aaa').name, 'Dia')
        langs = list(self.api.languoids())
        # 'newg1234' is presumably the Glottocode minted for 'New Group' - confirm.
        self.assertIn('newg1234', self.api.glottocodes)
        self.assertEqual(len([l for l in langs if l.name == 'New Group']), 1)
        self.assertEqual(len([l for l in langs if l.hid == 'NOCODE_New-name']),
                         1)

        # Test ISO code removal:
        self._set_lff(
            """# -*- coding: utf-8 -*-
Kabardian [kaba1278]
    Dia []
""", 'dff.txt')
        lff2tree(self.api)
        self.assertIsNone(self.api.languoid('aaa'))

        # Run the reverse conversion; nothing is asserted on its result here, but the
        # following steps start from whatever it leaves in place.
        tree2lff(self.api)

        # Test hid adding
        self._set_lff(
            """# -*- coding: utf-8 -*-
Ashkaraua [ashk1247]xyz
    Ashkarauax [bezs1238]NOCODE_abc
""", 'dff.txt')
        lff2tree(self.api)
        self.assertEqual(self.api.languoid('bezs1238').hid, 'NOCODE_abc')

        # From here on every fixture is invalid and `lff2tree` is expected to raise
        # ValueError.

        #
        # Nodes must have unique names!
        #
        self._set_lff(
            """# -*- coding: utf-8 -*-
Ashkaraua [ashk1247]xyz
    Ashkaraua [bezs1238]
""", 'dff.txt')
        with self.assertRaisesRegexp(ValueError, 'duplicate'):
            lff2tree(self.api)

        #
        # Nodes must have consistent names!
        #
        self._set_lff(
            """# -*- coding: utf-8 -*-
Ashkxxxaraua [ashk1247]xyz
    Bezshagh [bezs1238]
""", 'dff.txt')
        with self.assertRaisesRegexp(ValueError, 'inconsistent'):
            lff2tree(self.api)

        #
        # Top-level nodes in dff must be languages:
        #
        self._set_lff(
            """# -*- coding: utf-8 -*-
Abaza [abaz1241]
    Bezshagh [bezs1238]
""", 'dff.txt')
        with self.assertRaises(ValueError):
            lff2tree(self.api)

        #
        # Top-level nodes in dff must be languages in lff:
        #
        self._set_lff(
            """# -*- coding: utf-8 -*-
None [xyzz1234]
    Dia []
""", 'dff.txt')
        with self.assertRaises(ValueError):
            lff2tree(self.api)

        #
        # Isolates must not have multiple ancestors:
        #
        self._set_lff(
            """# -*- coding: utf-8 -*-
None [xyzz1234]; Other [-isolate-]
    Dia []
""", 'dff.txt')
        with self.assertRaisesRegexp(ValueError, 'isolate'):
            lff2tree(self.api)

        #
        # Languages must appear after a classification line:
        #
        self._set_lff("""# -*- coding: utf-8 -*-
    Dia []
""", 'dff.txt')
        with self.assertRaisesRegexp(ValueError, 'classification'):
            lff2tree(self.api)
Example #15
0
def test_lff2tree(api_copy):
    """
    Exercise `lff2tree` (and once `tree2lff`) on a sequence of 'lff.txt' / 'dff.txt' fixtures.

    The fixtures are rewritten step by step and the tree is rebuilt after each change. All
    steps operate on the same `api_copy`, so each one starts from the state the previous ones
    left behind; the order of the steps matters.
    """
    # The return value of `_set_lff` is string-like (`.replace` is called on it below) -
    # presumably the text that was written; verify against the helper.

    # Step 1: initial classification. Judging by the level assertions below, lff.txt lists
    # languages and dff.txt lists the dialects below them.
    lfftext = _set_lff(
        api_copy, 'lff.txt', """# -*- coding: utf-8 -*-
Abkhaz-Adyge [abkh1242] aaa
    Ubykh [ubyk1235]uby
Abkhaz-Adyge [abkh1242] aaa; Abkhaz-Abaza [abkh1243]
    Abaza [abaz1241]abq
    Abkhazian [abkh1244]abk
Abkhaz-Adyge [abkh1242] aaa; Circassian [circ1239]
    Adyghe [adyg1241]ady
    Kabardian [kaba1278]kbd
""")

    _set_lff(
        api_copy, 'dff.txt', """# -*- coding: utf-8 -*-
Abaza [abaz1241] abq
    Ashkaraua [ashk1247]
    Bezshagh [bezs1238]
    Tapanta [tapa1256]
Abkhazian [abkh1244] abk
    Abzhui [abzh1238]
    Bzyb [bzyb1238]
    Samurzakan [samu1242]
""")

    lff2tree(api_copy)
    assert api_copy.languoid('abkh1242').iso == 'aaa'
    assert api_copy.languoid(
        'ashk1247').level == api_copy.languoid_levels.dialect
    assert api_copy.languoid(
        'abaz1241').level == api_copy.languoid_levels.language
    assert api_copy.languoid('abaz1241').hid == 'abq'

    # Step 2: rename a node. No directory name may show up twice in the tree afterwards.
    _set_lff(api_copy, 'lff.txt',
             lfftext.replace('Abkhaz-Abaza', 'Abkhaz-Abazzza'))
    lff2tree(api_copy)
    glottocodes = [d.name for d in walk(api_copy.tree, mode='dirs')]
    assert len(glottocodes) == len(set(glottocodes))

    abkh1243 = api_copy.languoid('abkh1243')
    # Make sure the new name is picked up ...
    assert abkh1243.name == 'Abkhaz-Abazzza'
    # ... and the old one retained as alternative name:
    assert 'Abkhaz-Abaza' in abkh1243.names['glottolog']

    # Step 3: restructure the classification. abaz1241 moves into a classification line
    # (asserted to become a family below), ISO code 'aaa' moves to a new dialect entry, and
    # nodes with empty brackets (no Glottocode yet) are introduced.
    lfftext = _set_lff(
        api_copy, 'lff.txt', """# -*- coding: utf-8 -*-
Abkhaz-Adyge [abkh1242]
    Ubykh [ubyk1235]
Abkhaz-Adyge [abkh1242]; Abkhaz-Abaza [abkh1243]; Abaza [abaz1241]
    Ashkaraua [ashk1247]xyz
    Abkhazian [abkh1244]
Abkhaz-Adyge [abkh1242]; Circassian [circ1239]
    Adyghe [adyg1241]ady
    Kabardian [kaba1278]
Abkhaz-Adyge [abkh1242]; Circassian [circ1239]; New Group []
    New name []NOCODE_New-name
    Another one []
""")

    _set_lff(
        api_copy, 'dff.txt', """# -*- coding: utf-8 -*-
Ashkaraua [ashk1247]xyz
    Bezshagh [bezs1238]
    Tapanta [tapa1256]
Abkhazian [abkh1244]
    Abzhui [abzh1238]
    Bzyb [bzyb1238]
    Samurzakan [samu1242]
Kabardian [kaba1278]
    Dia []aaa
""")

    lff2tree(api_copy)
    assert api_copy.languoid(
        'abaz1241').level == api_copy.languoid_levels.family
    # Now we test two things:
    # - aaa has been removed as ISO code from abkh1242
    # - aaa has been attached as ISO code to a newly created language
    assert api_copy.languoid('aaa').name == 'Dia'
    langs = list(api_copy.languoids())
    # 'newg1234' is presumably the Glottocode minted for 'New Group' - confirm.
    assert 'newg1234' in api_copy.glottocodes
    assert sum(1 for l in langs if l.name == 'New Group') == 1
    assert sum(1 for l in langs if l.hid == 'NOCODE_New-name') == 1

    # Test ISO code removal:
    _set_lff(api_copy, 'dff.txt', """# -*- coding: utf-8 -*-
Kabardian [kaba1278]
    Dia []
""")
    lff2tree(api_copy)
    assert api_copy.languoid('aaa') is None

    # Run the reverse conversion; nothing is asserted on its result here, but the following
    # steps start from whatever it leaves in place.
    tree2lff(api_copy)

    # Test hid adding
    _set_lff(
        api_copy, 'dff.txt', """# -*- coding: utf-8 -*-
Ashkaraua [ashk1247]xyz
    Ashkarauax [bezs1238]NOCODE_abc
""")
    lff2tree(api_copy)
    assert api_copy.languoid('bezs1238').hid == 'NOCODE_abc'

    # From here on every fixture is invalid and `lff2tree` is expected to raise a ValueError
    # whose message matches the given pattern.

    #
    # Nodes must have unique names!
    #
    _set_lff(
        api_copy, 'dff.txt', """# -*- coding: utf-8 -*-
Ashkaraua [ashk1247]xyz
    Ashkaraua [bezs1238]
""")
    with pytest.raises(ValueError, match=r'duplicate'):
        lff2tree(api_copy)

    #
    # Nodes must have consistent names!
    #
    _set_lff(
        api_copy, 'dff.txt', """# -*- coding: utf-8 -*-
Ashkxxxaraua [ashk1247]xyz
    Bezshagh [bezs1238]
""")
    with pytest.raises(ValueError, match=r'inconsistent'):
        lff2tree(api_copy)

    #
    # Top-level nodes in dff must be languages:
    #
    _set_lff(
        api_copy, 'dff.txt', """# -*- coding: utf-8 -*-
Abaza [abaz1241]
    Bezshagh [bezs1238]
""")
    with pytest.raises(ValueError, match=r'inconsistent'):
        lff2tree(api_copy)

    #
    # Top-level nodes in dff must be languages in lff:
    #
    _set_lff(api_copy, 'dff.txt', """# -*- coding: utf-8 -*-
None [xyzz1234]
    Dia []
""")
    with pytest.raises(ValueError, match=r'invalid'):
        lff2tree(api_copy)

    #
    # Isolates must not have multiple ancestors:
    #
    _set_lff(
        api_copy, 'dff.txt', """# -*- coding: utf-8 -*-
None [xyzz1234]; Other [-isolate-]
    Dia []
""")
    with pytest.raises(ValueError, match=r'isolate'):
        lff2tree(api_copy)

    #
    # Languages must appear after a classification line:
    #
    _set_lff(api_copy, 'dff.txt', """# -*- coding: utf-8 -*-
    Dia []
""")
    with pytest.raises(ValueError, match=r'classification'):
        lff2tree(api_copy)
Example #16
0
def languoids_from_tree(tree, **kw):
    """
    Yield a `Languoid` for every '.ini' file below `tree`.

    :param tree: Root directory to walk.
    :param kw: Keyword arguments passed on to `Languoid.from_ini`.
    """
    for path in walk(tree, mode='files'):
        if path.suffix != '.ini':
            continue
        yield Languoid.from_ini(path, **kw)
Example #17
0
 def languoids(self, ids=None):
     """
     Yield a languoid for each directory of the tree.

     :param ids: Optional container of directory names; when given, directories whose name
         is not contained are skipped without being instantiated.
     """
     # One dict per iteration, handed to every `from_dir` call as `nodes`.
     shared_nodes = {}
     for directory in walk(self.tree, mode='dirs'):
         if ids is not None and directory.name not in ids:
             continue
         yield languoids.Languoid.from_dir(directory, nodes=shared_nodes)
Example #18
0
 def _tree_dirs(self):
     """Return all directories found by walking `self.tree`, materialized as a list."""
     directories = walk(self.tree, mode='dirs')
     return list(directories)
Example #19
0
def languoids_from_tree(tree, **kw):
    """
    Lazily instantiate a `Languoid` from each '.ini' file found below `tree`.

    :param tree: Root directory to walk.
    :param kw: Keyword arguments passed on to `Languoid.from_ini`.
    """
    ini_files = (path for path in walk(tree, mode='files') if path.suffix == '.ini')
    for ini_file in ini_files:
        yield Languoid.from_ini(ini_file, **kw)
Example #20
0
def walk_tree(tree=TREE, **kw):
    """
    Lazily instantiate a `Languoid` from each '.ini' file below `tree`, following symlinks.

    :param tree: Root directory to walk; defaults to `TREE`.
    :param kw: Keyword arguments passed on to `Languoid.from_ini`.
    """
    ini_files = (
        path for path in walk(tree, mode='files', followlinks=True)
        if path.suffix == '.ini')
    for ini_file in ini_files:
        yield Languoid.from_ini(ini_file, **kw)
Example #21
0
def find_languoid(tree=TREE, glottocode=None, **kw):
    """
    Find the languoid whose directory below `tree` is named `glottocode`.

    :param tree: Root directory to walk (symlinks are followed); defaults to `TREE`.
    :param glottocode: Directory name to look for.
    :param kw: Accepted but not used.
    :return: `Languoid` for the first matching directory, or `None` if there is none.
    """
    for directory in walk(tree, mode='dirs', followlinks=True):
        if directory.name != glottocode:
            continue
        return Languoid.from_dir(directory)
    return None
Example #22
0
def find_languoid(tree=TREE, glottocode=None, **kw):
    """
    Find the languoid whose directory below `tree` is named `glottocode`.

    :param tree: Root directory to walk (symlinks are followed); defaults to `TREE`.
    :param glottocode: Directory name to look for.
    :param kw: Accepted but not used.
    :return: `Languoid` for the first matching directory, or `None` if there is none.
    """
    # Only the matching directory is turned into a languoid.
    matches = (
        Languoid.from_dir(directory)
        for directory in walk(tree, mode='dirs', followlinks=True)
        if directory.name == glottocode)
    return next(matches, None)