def test_Sources(tmpdir):
    """Exercise the Sources container: add, iterate, expand/format refs,
    error handling for malformed input, and BibTeX round-tripping."""
    src = Sources()
    src.add(
        BIB,
        Source('book', 'huber2005', author='Herrmann Huber', year='2005', title='y'))

    # The first entry yielded by iteration must carry the BibTeX genre.
    entry = next(iter(src))
    assert entry.genre == 'book'

    assert len(list(src.items())) == 3
    assert len(list(src.keys())) == 3

    # Reference specs survive an expand/format round trip unchanged.
    ref_specs = ['huber2005[1-6]', 'Obrazy', 'Elegie[34]']
    assert src.format_refs(*list(src.expand_refs(ref_specs))) == ref_specs
    assert '%s' % src['huber2005'] == 'Huber, Herrmann. 2005. y.'

    # Invalid additions: wrong type, and a BibTeX key containing a dot.
    for bad_entry in (5, '@misc{a.b,\n author="a.b"\n}'):
        with pytest.raises(ValueError):
            src.add(bad_entry)

    # Lookup of an unknown key must fail.
    with pytest.raises(ValueError):
        src['unknown']

    # Malformed reference specs must be rejected by parse/validate.
    for bad_spec in ('a[x', '[x]'):
        with pytest.raises(ValueError):
            src.parse(bad_spec)
    with pytest.raises(ValueError):
        src.validate(['x'])

    # Round trip through a BibTeX file, then re-write a subset by id.
    bib_path = str(tmpdir / 'test.bib')
    src.write(bib_path)
    roundtrip = Sources()
    roundtrip.read(bib_path)
    roundtrip.write(bib_path, ids=['huber2005'])
    src = Sources.from_file(bib_path)
    assert len(src) == 1
def __init__(self, tablegroup):
    """Initialize the wrapper around a CSVW table group.

    :param tablegroup: the csvw ``TableGroup`` (or similar) holding the \
        CLDF table metadata; stored as-is on the instance.
    """
    self.tablegroup = tablegroup
    # NOTE(review): presumably derives/attaches schema constraints from the
    # tablegroup before sources are loaded — confirm ordering requirement.
    self.auto_constraints()
    # Load the bibliography; `self.bibpath` is expected to be provided by the
    # enclosing class (property or attribute) — not visible in this chunk.
    self.sources = Sources.from_file(self.bibpath)
def cmd_makecldf(self, args):
    """Convert the raw SAPHON data into a CLDF dataset.

    Reads ``raw/sources.bib``, ``etc/languages.csv``, ``raw/references.tsv``
    and ``raw/inventories.tsv``, maps each inventory grapheme to a CLTS/BIPA
    sound, and writes the ValueTable, LanguageTable and ParameterTable via
    ``args.writer``. Graphemes that cannot be mapped are reported on stdout.

    :param args: the pylexibank CLI namespace (provides ``writer`` and logs).
    """
    # Add sources from the raw BibTeX file.
    sources = Sources.from_file(self.raw_dir / "sources.bib")
    args.writer.cldf.add_sources(*sources)

    # CLTS lookup tables: BIPA for normalized sound objects, plus the
    # SAPHON-specific grapheme map.
    clts = CLTS(Config.from_file().get_clone('clts'))
    bipa = clts.bipa
    clts_saphon = clts.transcriptiondata_dict['saphon']

    # Add components / extra columns.
    args.writer.cldf.add_columns(
        "ValueTable", {"name": "Value_in_Source", "datatype": "string"})
    cltstable = Terms()["cltsReference"].to_column().asdict()
    # Widen the CLTS reference format so unmapped sounds may carry "NA".
    cltstable["datatype"]["format"] = "[a-z_-]+|NA"
    args.writer.cldf.add_columns(
        'ParameterTable',
        cltstable,
        {'name': 'CLTS_BIPA', 'datatype': 'string'},
        {'name': 'CLTS_Name', 'datatype': 'string'})
    args.writer.cldf.add_component("LanguageTable", "Family", "Glottolog_Name")

    # Language metadata is taken as-is from etc/languages.csv.
    # (A previous revision enriched rows from Glottolog here; that dead,
    # commented-out code and the unused Glottolog instantiation were removed.)
    languages = []
    for row in progressbar(self.etc_dir.read_csv("languages.csv", dicts=True)):
        languages.append(row)

    # Build source map from language id to bibliography key.
    source_map = {
        k: v
        for k, v in self.raw_dir.read_csv("references.tsv", delimiter="\t")}

    # Parse inventories: one value row per (language, grapheme) pair.
    segments = []
    values = []
    unknowns = defaultdict(list)
    for counter, (lid, segment) in enumerate(
            self.raw_dir.read_csv('inventories.tsv', delimiter="\t"), start=1):
        normalized = normalize_grapheme(segment)
        if normalized in clts_saphon.grapheme_map:
            sound = bipa[clts_saphon.grapheme_map[normalized]]
        else:
            sound = bipa['<NA>']
            # BUG FIX: this previously read `lang_key`, an undefined name,
            # raising NameError for any grapheme missing from the CLTS map.
            unknowns[normalized].append((lid, segment))
        par_id = compute_id(normalized)
        if sound.type == 'unknownsound':
            bipa_grapheme = ''
            desc = ''
        else:
            bipa_grapheme = str(sound)
            desc = sound.name
        segments.append((par_id, normalized, bipa_grapheme, desc))
        values.append({
            "ID": str(counter),
            "Language_ID": lid,
            "Parameter_ID": par_id,
            "Value_in_Source": segment,
            "Value": normalized,
            "Source": [source_map[lid]],
        })

    # Build segment data: one parameter per distinct normalized grapheme.
    parameters = [
        {
            "ID": ID,
            "Name": normalized,
            "Description": '',
            "CLTS_ID": desc.replace(' ', '_') if desc.strip() else "NA",
            "CLTS_BIPA": bipa_grapheme,
            "CLTS_Name": desc,
        }
        for ID, normalized, bipa_grapheme, desc in set(segments)]

    # Write data and validate.
    args.writer.write(
        **{
            "ValueTable": values,
            "LanguageTable": languages,
            "ParameterTable": parameters,
        })

    # Report graphemes that could not be mapped to CLTS, with a count of
    # the (language, raw segment) occurrences for each.
    for g, rest in unknowns.items():
        print('\t'.join([repr(g), str(len(rest)), g]))