def languoid(self, id_: typing.Union[str, lls.Languoid]) -> lls.Languoid:
    """
    Retrieve a languoid specified by language code.

    :param id_: Glottocode or ISO code (a `Languoid` instance is passed through \
    unchanged).
    """
    if isinstance(id_, lls.Languoid):
        return id_  # Already instantiated - nothing to look up.
    if self.cache and id_ in self.cache:
        return self.cache[id_]

    directories = self._tree_dirs if self.cache else walk(self.tree, mode='dirs')
    if ISO_CODE_PATTERN.match(id_):
        # ISO codes are stored inside the INI files, so each traversed directory
        # must be instantiated in order to inspect its iso_code.
        for directory in directories:
            lang = self.cache.add(directory, self) if self.cache \
                else lls.Languoid.from_dir(directory, _api=self)
            if lang.iso_code == id_:
                return lang
    else:
        # Glottocodes double as directory names, so matching works on the path.
        for directory in directories:
            # If we cache Languoids, we might as well instantiate the ones we traverse:
            lang = self.cache.add(directory, self) if self.cache else None
            if directory.name == id_:
                return lang if self.cache else lls.Languoid.from_dir(directory, _api=self)
def test_walk(self):
    from clldutils.path import walk

    # Create one subdirectory and one file inside the temp directory.
    subdir = self.tmp_path('testdir')
    subdir.mkdir()
    self.make_file('testfile')
    # mode='files' must yield files only ...
    file_names = [entry.name for entry in walk(self.tmp_path(), mode='files')]
    self.assertNotIn('testdir', file_names)
    self.assertIn('testfile', file_names)
    # ... while mode='dirs' must yield directories only.
    dir_names = [entry.name for entry in walk(self.tmp_path(), mode='dirs')]
    self.assertIn('testdir', dir_names)
    self.assertNotIn('testfile', dir_names)
def test_walk(tmppath):
    from clldutils.path import walk

    # Populate the temp directory with one subdirectory and one file.
    subdir = tmppath / 'testdir'
    subdir.mkdir()
    make_file(tmppath, name='testfile')
    # Files-only traversal skips directories ...
    names = [entry.name for entry in walk(subdir.parent, mode='files')]
    assert 'testdir' not in names
    assert 'testfile' in names
    # ... and dirs-only traversal skips files.
    names = [entry.name for entry in walk(subdir.parent, mode='dirs')]
    assert 'testdir' in names
    assert 'testfile' not in names
def languoid(self, id_):
    """Return the languoid identified by Glottocode or ISO code `id_`, or None."""
    if isinstance(id_, languoids.Languoid):
        return id_
    if ISO_CODE_PATTERN.match(id_):
        # ISO codes live in the INI files, so every directory must be loaded.
        for directory in walk(self.tree, mode='dirs'):
            candidate = languoids.Languoid.from_dir(directory)
            if candidate.iso_code == id_:
                return candidate
    else:
        # Glottocodes are the directory names themselves - no INI reading needed.
        for directory in walk(self.tree, mode='dirs'):
            if directory.name == id_:
                return languoids.Languoid.from_dir(directory)
def languoids(
        self,
        ids: set = None,
        maxlevel: typing.Union[int, config.LanguoidLevel, str] = None,
        exclude_pseudo_families: bool = False
) -> typing.Generator[lls.Languoid, None, None]:
    """
    Yields languoid objects.

    :param ids: `set` of Glottocodes to limit the result to. This is useful to increase \
    performance, since INI file reading can be skipped for languoids not listed.
    :param maxlevel: Numeric maximal nesting depth of languoids, or Languoid.level.
    :param exclude_pseudo_families: Flag signaling whether to exclude pseudo families, \
    i.e. languoids from non-genealogical trees.
    """
    is_max_level_int = isinstance(maxlevel, int)
    # Non-numeric levels are interpreted as `Languoid.level` descriptors.
    if not is_max_level_int:
        maxlevel = self.languoid_levels.get(maxlevel or 'dialect')
    # Since we traverse the tree topdown, we can cache a mapping of Languoid.id to triples
    # (name, id, level) for populating `Languoid.lineage`.
    nodes = {}
    for d in self._tree_dirs if self.cache else walk(self.tree, mode='dirs'):
        if ids is None or d.name in ids:
            if self.cache:
                lang = self.cache.add(d, self)
            else:
                lang = lls.Languoid.from_dir(d, nodes=nodes, _api=self)
            # An int maxlevel bounds the lineage depth; a level descriptor bounds
            # `Languoid.level` via the level ordering.
            if (is_max_level_int and len(lang.lineage) <= maxlevel) \
                    or ((not is_max_level_int) and lang.level <= maxlevel):
                if (not exclude_pseudo_families
                        ) or not lang.category.startswith('Pseudo'):
                    yield lang
def create(self, path, metadata, filter_=filter_hidden, object_class=None):
    """
    Create objects in CDSTAR and register them in the catalog.

    Note that we guess the mimetype based on the filename extension, using
    `mimetypes.guess_type`. Thus, it is the caller's responsibility to add custom or
    otherwise uncommon types to the list of known types using `mimetypes.add_type`.

    :param path: A file, or a directory whose files are uploaded recursively.
    :param metadata: Metadata passed on to object creation.
    :param filter_: Predicate selecting files to process; a falsy value selects all.
    :return: Generator of (filename, created, object) triples.
    """
    path = Path(path)
    if path.is_dir():
        candidates = list(walk(path, mode='files'))
    elif path.is_file():
        candidates = [path]
    else:
        raise ValueError(
            'path must be a file or directory')  # pragma: no cover
    for candidate in candidates:
        if filter_ and not filter_(candidate):
            continue
        created, obj = self._create(candidate, metadata, object_class=object_class)
        yield candidate, created, obj
def languoid(self, id_):
    """Return the languoid identified by Glottocode or ISO code `id_`, or None."""
    if ISO_CODE_PATTERN.match(id_):
        # ISO lookup requires instantiating each languoid to read its code.
        for lang in languoids.walk_tree(tree=self.tree):
            if lang.iso_code == id_:
                return lang
    else:
        # Glottocode lookup can match on the directory name alone.
        for directory in walk(self.tree, mode='dirs'):
            if directory.name == id_:
                return languoids.Languoid.from_dir(directory)
def iter_datasets(d):
    """
    Discover CLDF datasets - by identifying metadata files - in a directory.

    :param d: directory
    :return: generator of `Dataset` instances.
    """
    metadata_files = (p for p in walk(d, mode='files') if sniff(p))
    yield from map(Dataset.from_metadata, metadata_files)
def get_file_paths(raw_htmls, n=None):
    """
    Build a sorted list of PosixPath() objects for all files in the specified
    directory, e.g. numerals/raw/, skipping files defined in SKIP and as SKIP_RE.

    :param raw_htmls: Path to raw numerals HTML files.
    :param n: How many HTML files to process, useful for debugging.
    :return: A list of PosixPath() objects with path information for the files.
    """
    # The filtering/sorting was previously duplicated verbatim in both branches;
    # compute it once and slice only when a limit is requested.
    paths = sorted(
        f for f in walk(raw_htmls)
        if f.suffix.startswith(".htm")
        and not f.name.startswith("Copy of")
        and f.name not in SKIP
        and not re.search(SKIP_RE, f.name))
    return paths[:n] if n else paths
def run():
    """Report property values in component/module JSON files that are not known terms."""
    about = ns('rdf:about')
    # Collect all term URIs declared via rdf:about attributes.
    terms = [e.attrib[about] for e in read_terms().iter() if about in e.attrib]
    for subdir in ['components', 'modules']:
        for fpath in walk(REPO_DIR.joinpath(subdir)):
            if fpath.suffix != '.json':
                continue
            md = load(fpath)
            for key, value in iterproperties(md):
                if key in ['propertyUrl', 'dc:conformsTo'] and value not in terms:
                    print(fpath)
                    print(value)
def languoids(self, ids=None):
    """Yield a Languoid per tree directory, optionally restricted to Glottocodes in `ids`."""
    selected = (
        d for d in walk(self.tree, mode='dirs') if ids is None or d.name in ids)
    yield from map(languoids.Languoid.from_dir, selected)
def walk_tree(tree=TREE, **kw):
    """Yield a Languoid for each `.ini` file below `tree` (symlinks are followed)."""
    ini_files = (
        p for p in walk(tree, mode='files', followlinks=True) if p.suffix == '.ini')
    for ini in ini_files:
        yield Languoid.from_ini(ini, **kw)
def test_lff2tree(self):
    """Round-trip checks for lff/dff -> tree conversion, incl. error conditions."""
    # NOTE(review): the indentation inside the lff/dff fixture strings is
    # significant for the parser; reconstructed here as 4-space member indents
    # under column-0 classification lines - confirm against the lff format docs.
    lfftext = self._set_lff(
        """# -*- coding: utf-8 -*-
Abkhaz-Adyge [abkh1242] aaa
    Ubykh [ubyk1235]uby
Abkhaz-Adyge [abkh1242] aaa; Abkhaz-Abaza [abkh1243]
    Abaza [abaz1241]abq
    Abkhazian [abkh1244]abk
Abkhaz-Adyge [abkh1242] aaa; Circassian [circ1239]
    Adyghe [adyg1241]ady
    Kabardian [kaba1278]kbd
""", 'lff.txt')
    self._set_lff(
        """# -*- coding: utf-8 -*-
Abaza [abaz1241] abq
    Ashkaraua [ashk1247]
    Bezshagh [bezs1238]
    Tapanta [tapa1256]
Abkhazian [abkh1244] abk
    Abzhui [abzh1238]
    Bzyb [bzyb1238]
    Samurzakan [samu1242]
""", 'dff.txt')
    lff2tree(self.api)
    self.assertEqual(self.api.languoid('abkh1242').iso, 'aaa')
    self.assertEqual(self.api.languoid('ashk1247').level, Level.dialect)
    self.assertEqual(self.api.languoid('abaz1241').level, Level.language)
    self.assertEqual(self.api.languoid('abaz1241').hid, 'abq')

    # Renaming a family must update the tree without duplicating glottocodes.
    self._set_lff(lfftext.replace('Abkhaz-Abaza', 'Abkhaz-Abazzza'), 'lff.txt')
    lff2tree(self.api)
    glottocodes = [d.name for d in walk(self.api.tree, mode='dirs')]
    self.assertEqual(len(glottocodes), len(set(glottocodes)))
    self.assertEqual(self.api.languoid('abkh1243').name, 'Abkhaz-Abazzza')

    lfftext = self._set_lff(
        """# -*- coding: utf-8 -*-
Abkhaz-Adyge [abkh1242]
    Ubykh [ubyk1235]
Abkhaz-Adyge [abkh1242]; Abkhaz-Abaza [abkh1243]; Abaza [abaz1241]
    Ashkaraua [ashk1247]xyz
    Abkhazian [abkh1244]
Abkhaz-Adyge [abkh1242]; Circassian [circ1239]
    Adyghe [adyg1241]ady
    Kabardian [kaba1278]
Abkhaz-Adyge [abkh1242]; Circassian [circ1239]; New Group []
    New name []NOCODE_New-name
    Another one []
""", 'lff.txt')
    self._set_lff(
        """# -*- coding: utf-8 -*-
Ashkaraua [ashk1247]xyz
    Bezshagh [bezs1238]
    Tapanta [tapa1256]
Abkhazian [abkh1244]
    Abzhui [abzh1238]
    Bzyb [bzyb1238]
    Samurzakan [samu1242]
Kabardian [kaba1278]
    Dia []aaa
""", 'dff.txt')
    lff2tree(self.api)
    self.assertEqual(self.api.languoid('abaz1241').level, Level.family)
    # Now we test two things:
    # - aaa has been removed as ISO code from abkh1242
    # - aaa has been attached as ISO code to a newly created language
    self.assertEqual(self.api.languoid('aaa').name, 'Dia')
    langs = list(self.api.languoids())
    self.assertIn('newg1234', self.api.glottocodes)
    self.assertEqual(len([l for l in langs if l.name == 'New Group']), 1)
    self.assertEqual(len([l for l in langs if l.hid == 'NOCODE_New-name']), 1)

    # Test ISO code removal:
    self._set_lff(
        """# -*- coding: utf-8 -*-
Kabardian [kaba1278]
    Dia []
""", 'dff.txt')
    lff2tree(self.api)
    self.assertIsNone(self.api.languoid('aaa'))

    tree2lff(self.api)

    # Test hid adding
    self._set_lff(
        """# -*- coding: utf-8 -*-
Ashkaraua [ashk1247]xyz
    Ashkarauax [bezs1238]NOCODE_abc
""", 'dff.txt')
    lff2tree(self.api)
    self.assertEqual(self.api.languoid('bezs1238').hid, 'NOCODE_abc')

    #
    # Nodes must have unique names!
    #
    self._set_lff(
        """# -*- coding: utf-8 -*-
Ashkaraua [ashk1247]xyz
    Ashkaraua [bezs1238]
""", 'dff.txt')
    with self.assertRaisesRegexp(ValueError, 'duplicate'):
        lff2tree(self.api)

    #
    # Nodes must have consistent names!
    #
    self._set_lff(
        """# -*- coding: utf-8 -*-
Ashkxxxaraua [ashk1247]xyz
    Bezshagh [bezs1238]
""", 'dff.txt')
    with self.assertRaisesRegexp(ValueError, 'inconsistent'):
        lff2tree(self.api)

    #
    # Top-level nodes in dff must be languages:
    #
    self._set_lff(
        """# -*- coding: utf-8 -*-
Abaza [abaz1241]
    Bezshagh [bezs1238]
""", 'dff.txt')
    with self.assertRaises(ValueError):
        lff2tree(self.api)

    #
    # Top-level nodes in dff must be languages in lff:
    #
    self._set_lff(
        """# -*- coding: utf-8 -*-
None [xyzz1234]
    Dia []
""", 'dff.txt')
    with self.assertRaises(ValueError):
        lff2tree(self.api)

    #
    # Isolates must not have multiple ancestors:
    #
    self._set_lff(
        """# -*- coding: utf-8 -*-
None [xyzz1234]; Other [-isolate-]
    Dia []
""", 'dff.txt')
    with self.assertRaisesRegexp(ValueError, 'isolate'):
        lff2tree(self.api)

    #
    # Languages must appear after a classification line:
    #
    self._set_lff("""# -*- coding: utf-8 -*-
    Dia []
""", 'dff.txt')
    with self.assertRaisesRegexp(ValueError, 'classification'):
        lff2tree(self.api)
def test_lff2tree(api_copy):
    """Round-trip checks for lff/dff -> tree conversion, incl. error conditions."""
    # NOTE(review): the indentation inside the lff/dff fixture strings is
    # significant for the parser; reconstructed here as 4-space member indents
    # under column-0 classification lines - confirm against the lff format docs.
    lfftext = _set_lff(
        api_copy, 'lff.txt', """# -*- coding: utf-8 -*-
Abkhaz-Adyge [abkh1242] aaa
    Ubykh [ubyk1235]uby
Abkhaz-Adyge [abkh1242] aaa; Abkhaz-Abaza [abkh1243]
    Abaza [abaz1241]abq
    Abkhazian [abkh1244]abk
Abkhaz-Adyge [abkh1242] aaa; Circassian [circ1239]
    Adyghe [adyg1241]ady
    Kabardian [kaba1278]kbd
""")
    _set_lff(
        api_copy, 'dff.txt', """# -*- coding: utf-8 -*-
Abaza [abaz1241] abq
    Ashkaraua [ashk1247]
    Bezshagh [bezs1238]
    Tapanta [tapa1256]
Abkhazian [abkh1244] abk
    Abzhui [abzh1238]
    Bzyb [bzyb1238]
    Samurzakan [samu1242]
""")
    lff2tree(api_copy)
    assert api_copy.languoid('abkh1242').iso == 'aaa'
    assert api_copy.languoid(
        'ashk1247').level == api_copy.languoid_levels.dialect
    assert api_copy.languoid(
        'abaz1241').level == api_copy.languoid_levels.language
    assert api_copy.languoid('abaz1241').hid == 'abq'

    # Renaming a family must update the tree without duplicating glottocodes.
    _set_lff(api_copy, 'lff.txt', lfftext.replace('Abkhaz-Abaza', 'Abkhaz-Abazzza'))
    lff2tree(api_copy)
    glottocodes = [d.name for d in walk(api_copy.tree, mode='dirs')]
    assert len(glottocodes) == len(set(glottocodes))
    abkh1243 = api_copy.languoid('abkh1243')
    # Make sure the new name is picked up ...
    assert abkh1243.name == 'Abkhaz-Abazzza'
    # ... and the old one retained as alternative name:
    assert 'Abkhaz-Abaza' in abkh1243.names['glottolog']

    lfftext = _set_lff(
        api_copy, 'lff.txt', """# -*- coding: utf-8 -*-
Abkhaz-Adyge [abkh1242]
    Ubykh [ubyk1235]
Abkhaz-Adyge [abkh1242]; Abkhaz-Abaza [abkh1243]; Abaza [abaz1241]
    Ashkaraua [ashk1247]xyz
    Abkhazian [abkh1244]
Abkhaz-Adyge [abkh1242]; Circassian [circ1239]
    Adyghe [adyg1241]ady
    Kabardian [kaba1278]
Abkhaz-Adyge [abkh1242]; Circassian [circ1239]; New Group []
    New name []NOCODE_New-name
    Another one []
""")
    _set_lff(
        api_copy, 'dff.txt', """# -*- coding: utf-8 -*-
Ashkaraua [ashk1247]xyz
    Bezshagh [bezs1238]
    Tapanta [tapa1256]
Abkhazian [abkh1244]
    Abzhui [abzh1238]
    Bzyb [bzyb1238]
    Samurzakan [samu1242]
Kabardian [kaba1278]
    Dia []aaa
""")
    lff2tree(api_copy)
    assert api_copy.languoid(
        'abaz1241').level == api_copy.languoid_levels.family
    # Now we test two things:
    # - aaa has been removed as ISO code from abkh1242
    # - aaa has been attached as ISO code to a newly created language
    assert api_copy.languoid('aaa').name == 'Dia'
    langs = list(api_copy.languoids())
    assert 'newg1234' in api_copy.glottocodes
    assert sum(1 for l in langs if l.name == 'New Group') == 1
    assert sum(1 for l in langs if l.hid == 'NOCODE_New-name') == 1

    # Test ISO code removal:
    _set_lff(api_copy, 'dff.txt', """# -*- coding: utf-8 -*-
Kabardian [kaba1278]
    Dia []
""")
    lff2tree(api_copy)
    assert api_copy.languoid('aaa') is None

    tree2lff(api_copy)

    # Test hid adding
    _set_lff(
        api_copy, 'dff.txt', """# -*- coding: utf-8 -*-
Ashkaraua [ashk1247]xyz
    Ashkarauax [bezs1238]NOCODE_abc
""")
    lff2tree(api_copy)
    assert api_copy.languoid('bezs1238').hid == 'NOCODE_abc'

    #
    # Nodes must have unique names!
    #
    _set_lff(
        api_copy, 'dff.txt', """# -*- coding: utf-8 -*-
Ashkaraua [ashk1247]xyz
    Ashkaraua [bezs1238]
""")
    with pytest.raises(ValueError, match=r'duplicate'):
        lff2tree(api_copy)

    #
    # Nodes must have consistent names!
    #
    _set_lff(
        api_copy, 'dff.txt', """# -*- coding: utf-8 -*-
Ashkxxxaraua [ashk1247]xyz
    Bezshagh [bezs1238]
""")
    with pytest.raises(ValueError, match=r'inconsistent'):
        lff2tree(api_copy)

    #
    # Top-level nodes in dff must be languages:
    #
    _set_lff(
        api_copy, 'dff.txt', """# -*- coding: utf-8 -*-
Abaza [abaz1241]
    Bezshagh [bezs1238]
""")
    with pytest.raises(ValueError, match=r'inconsistent'):
        lff2tree(api_copy)

    #
    # Top-level nodes in dff must be languages in lff:
    #
    _set_lff(api_copy, 'dff.txt', """# -*- coding: utf-8 -*-
None [xyzz1234]
    Dia []
""")
    with pytest.raises(ValueError, match=r'invalid'):
        lff2tree(api_copy)

    #
    # Isolates must not have multiple ancestors:
    #
    _set_lff(
        api_copy, 'dff.txt', """# -*- coding: utf-8 -*-
None [xyzz1234]; Other [-isolate-]
    Dia []
""")
    with pytest.raises(ValueError, match=r'isolate'):
        lff2tree(api_copy)

    #
    # Languages must appear after a classification line:
    #
    _set_lff(api_copy, 'dff.txt', """# -*- coding: utf-8 -*-
    Dia []
""")
    with pytest.raises(ValueError, match=r'classification'):
        lff2tree(api_copy)
def languoids_from_tree(tree, **kw):
    """Yield a Languoid for every `.ini` file found below `tree`."""
    for path in walk(tree, mode='files'):
        if path.suffix != '.ini':
            continue
        yield Languoid.from_ini(path, **kw)
def languoids(self, ids=None):
    """Yield Languoids from the tree, sharing one `nodes` mapping across instantiations."""
    nodes = {}
    for directory in walk(self.tree, mode='dirs'):
        if ids is not None and directory.name not in ids:
            continue
        yield languoids.Languoid.from_dir(directory, nodes=nodes)
def _tree_dirs(self):
    # Materialized list of all directories below the languoid tree.
    # NOTE(review): other blocks reference this as `self._tree_dirs` without
    # calling it, so presumably it is decorated as a (cached) property - the
    # decorator is not visible in this view; confirm.
    return list(walk(self.tree, mode='dirs'))
def find_languoid(tree=TREE, glottocode=None, **kw):
    """Return the Languoid whose directory name equals `glottocode`, or None."""
    # NOTE(review): **kw is accepted but not forwarded anywhere (kept for
    # interface compatibility).
    match = next(
        (d for d in walk(tree, mode='dirs', followlinks=True) if d.name == glottocode),
        None)
    if match is not None:
        return Languoid.from_dir(match)