Esempio n. 1
0
def test_Sources(tmpdir):
    """Exercise ``Sources``: adding entries, lookup, reference handling and BibTeX I/O."""
    src = Sources()
    huber = Source(
        'book', 'huber2005', author='Herrmann Huber', year='2005', title='y')
    src.add(BIB, huber)

    # Only the first entry produced by iteration is inspected.
    for first_entry in src:
        assert first_entry.genre == 'book'
        break

    assert len(list(src.items())) == 3
    assert len(list(src.keys())) == 3

    # Expanding and re-formatting references must round-trip.
    refs = ['huber2005[1-6]', 'Obrazy', 'Elegie[34]']
    expanded = list(src.expand_refs(refs))
    assert src.format_refs(*expanded) == refs
    assert str(src['huber2005']) == 'Huber, Herrmann. 2005. y.'

    # Each of these arguments must be rejected by ``add``.
    for rejected in (5, '@misc{a.b,\n  author="a.b"\n}'):
        with pytest.raises(ValueError):
            src.add(rejected)

    with pytest.raises(ValueError):
        _ = src['unknown']
        assert _  # pragma: no cover

    # Each of these reference strings must be rejected by ``parse``.
    for malformed in ('a[x', '[x]'):
        with pytest.raises(ValueError):
            src.parse(malformed)

    with pytest.raises(ValueError):
        src.validate(['x'])

    # Write everything, read it back, then re-write restricted to one id.
    bib_path = str(tmpdir / 'test.bib')
    src.write(bib_path)

    reread = Sources()
    reread.read(bib_path)
    reread.write(bib_path, ids=['huber2005'])

    src = Sources.from_file(bib_path)
    assert len(src) == 1
Esempio n. 2
0
def test_Sources(tmpdir):
    """Check the ``Sources`` API end to end: add, iterate, look up, parse, write, read."""
    src = Sources()
    book = Source(
        'book', 'huber2005', author='Herrmann Huber', year='2005', title='y')
    src.add(BIB, book)

    # Look at the first iterated entry only.
    for item in src:
        assert item.genre == 'book'
        break

    assert len(list(src.items())) == 3
    assert len(list(src.keys())) == 3

    # format_refs(expand_refs(refs)) is expected to give back the input list.
    refs = ['huber2005[1-6]', 'Obrazy', 'Elegie[34]']
    round_tripped = src.format_refs(*src.expand_refs(refs))
    assert round_tripped == refs
    assert str(src['huber2005']) == 'Huber, Herrmann. 2005. y.'

    # Arguments that ``add`` must refuse with ValueError.
    bad_additions = (5, '@misc{a.b,\n  author="a.b"\n}')
    for candidate in bad_additions:
        with pytest.raises(ValueError):
            src.add(candidate)

    with pytest.raises(ValueError):
        _ = src['unknown']
        assert _  # pragma: no cover

    # Reference strings that ``parse`` must refuse with ValueError.
    bad_refs = ('a[x', '[x]')
    for candidate in bad_refs:
        with pytest.raises(ValueError):
            src.parse(candidate)

    with pytest.raises(ValueError):
        src.validate(['x'])

    # Full write, read into a fresh container, then write only one id.
    target = str(tmpdir / 'test.bib')
    src.write(target)

    copy = Sources()
    copy.read(target)
    copy.write(target, ids=['huber2005'])

    src = Sources.from_file(target)
    assert len(src) == 1
Esempio n. 3
0
 def __init__(self, tablegroup):
     """Store *tablegroup*, run ``auto_constraints`` and load ``self.bibpath`` as sources."""
     self.tablegroup = tablegroup
     self.auto_constraints()
     # Read the bibliography path only after the steps above, as before.
     bib_location = self.bibpath
     self.sources = Sources.from_file(bib_location)
Esempio n. 4
0
 def __init__(self, tablegroup):
     """Keep *tablegroup*, call ``auto_constraints`` and populate ``self.sources``.

     The sources are loaded with ``Sources.from_file`` from ``self.bibpath``.
     """
     self.tablegroup = tablegroup
     self.auto_constraints()
     bibliography = self.bibpath
     self.sources = Sources.from_file(bibliography)
Esempio n. 5
0
    def cmd_makecldf(self, args):
        """Build the CLDF dataset from the raw inventory files.

        Reads ``sources.bib``, ``references.tsv`` and ``inventories.tsv`` from
        ``self.raw_dir`` and ``languages.csv`` from ``self.etc_dir``, looks up
        every normalized segment in the CLTS ``saphon`` transcription data,
        and writes the Value, Language and Parameter tables via
        ``args.writer``. Graphemes without a CLTS mapping are collected and
        printed as tab-separated lines (repr, occurrence count, grapheme)
        at the end.

        :param args: Command arguments; this method uses ``args.writer`` and
            ``args.glottolog.dir``.
        """
        # Add sources
        sources = Sources.from_file(self.raw_dir / "sources.bib")
        args.writer.cldf.add_sources(*sources)

        # NOTE(review): the Glottolog API object is not used below; it was
        # only referenced by the (disabled) language enrichment. The
        # instantiation is kept so the command's behavior stays unchanged.
        glottolog = Glottolog(args.glottolog.dir)  # noqa: F841
        clts = CLTS(Config.from_file().get_clone('clts'))
        bipa = clts.bipa
        clts_saphon = clts.transcriptiondata_dict['saphon']

        # Add components
        args.writer.cldf.add_columns("ValueTable", {
            "name": "Value_in_Source",
            "datatype": "string"
        })

        cltstable = Terms()["cltsReference"].to_column().asdict()
        # Also accept the literal "NA" used below for unmapped segments.
        cltstable["datatype"]["format"] = "[a-z_-]+|NA"
        args.writer.cldf.add_columns('ParameterTable', cltstable, {
            'name': 'CLTS_BIPA',
            'datatype': 'string'
        }, {
            'name': 'CLTS_Name',
            'datatype': 'string'
        })
        args.writer.cldf.add_component("LanguageTable", "Family",
                                       "Glottolog_Name")

        # Language rows are passed through unchanged; Glottolog-based
        # enrichment (family, coordinates, macroarea, name) is disabled.
        languages = list(progressbar(
            self.etc_dir.read_csv("languages.csv", dicts=True)))

        # Build source map from language (two-column rows: key, value)
        source_map = dict(
            self.raw_dir.read_csv("references.tsv", delimiter="\t"))

        # Parse inventories
        segments = []
        values = []
        unknowns = defaultdict(list)
        inventory = self.raw_dir.read_csv('inventories.tsv', delimiter="\t")
        for counter, (lid, segment) in enumerate(inventory, start=1):
            normalized = normalize_grapheme(segment)
            if normalized in clts_saphon.grapheme_map:
                sound = bipa[clts_saphon.grapheme_map[normalized]]
            else:
                sound = bipa['<NA>']
                # Record where the unmapped grapheme occurred. The original
                # code referenced an undefined name here (`lang_key`).
                unknowns[normalized].append((lid, segment))
            par_id = compute_id(normalized)
            if sound.type == 'unknownsound':
                bipa_grapheme = ''
                desc = ''
            else:
                bipa_grapheme = str(sound)
                desc = sound.name

            segments.append((par_id, normalized, bipa_grapheme, desc))

            values.append({
                "ID": str(counter),
                "Language_ID": lid,
                "Parameter_ID": par_id,
                "Value_in_Source": segment,
                "Value": normalized,
                "Source": [source_map[lid]]
            })

        # Build segment data: one parameter per distinct segment tuple.
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # row order does not depend on string hashing.
        parameters = [{
            "ID": ID,
            "Name": normalized,
            "Description": '',
            "CLTS_ID": desc.replace(' ', '_') if desc.strip() else "NA",
            "CLTS_BIPA": bipa_grapheme,
            "CLTS_Name": desc
        } for ID, normalized, bipa_grapheme, desc in dict.fromkeys(segments)]

        # Write data and validate
        args.writer.write(
            **{
                "ValueTable": values,
                "LanguageTable": languages,
                "ParameterTable": parameters,
            })

        # Report graphemes that have no CLTS mapping.
        for g, rest in unknowns.items():
            print('\t'.join([repr(g), str(len(rest)), g]))