Code example #1
File: test_sources.py Project: LinguList/pycldf
    def test_Sources_with_None_values(self):
        from pycldf.sources import Sources, Source

        src = Sources()
        src.add(Source('book', 'huber2005', title=None))
        bib = self.tmp_path('test.bib')
        src.write(bib.name, bib.parent)
Code example #2
File: db.py Project: glottobank/pycldf
    def to_cldf(self, dest, mdname='cldf-metadata.json'):
        """
        Write the data from the db to a CLDF dataset according to the metadata in `self.dataset`.

        :param dest: destination directory for the CLDF dataset.
        :param mdname: file name of the CLDF metadata file to write.
        :return: path of the metadata file
        """
        dest = Path(dest)
        if not dest.exists():
            dest.mkdir()

        data = self.read()

        if data[self.source_table_name]:
            sources = Sources()
            for src in data[self.source_table_name]:
                sources.add(Source(
                    src['genre'],
                    src['id'],
                    **{k: v for k, v in src.items() if k not in ['id', 'genre']}))
            sources.write(dest / self.dataset.properties.get('dc:source', 'sources.bib'))

        for table_type, items in data.items():
            try:
                table = self.dataset[table_type]
                table.common_props['dc:extent'] = table.write(
                    [self.retranslate(table, item) for item in items],
                    base=dest)
            except KeyError:
                assert table_type == self.source_table_name, table_type
        return self.dataset.write_metadata(dest / mdname)
Code example #3
File: test_sources.py Project: glottobank/pycldf
def test_Source_expand_refs():
    sources = Sources()
    src = Source(
        'book', 'Meier2005', author='Hans Meier', year='2005', title='The Book')
    assert 'Meier2005' in repr(src)
    sources.add(src)
    bib = sources._bibdata.to_string(bib_format='bibtex')
    assert len(bib.split('author')) == 2
    assert len(list(sources.expand_refs('Meier2005'))) == 1
    bib = sources._bibdata.to_string(bib_format='bibtex')
    assert len(bib.split('author')) == 2
    assert len(list(sources.expand_refs('12345'))) == 1
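A small sketch of the source-reference syntax exercised by these tests: a reference is a source ID with an optional page specification in brackets, Sources.parse() splits it into an (id, pages) pair, and expand_refs() yields Reference objects whose .source attribute is the matching Source. Values here are illustrative; only add(), parse() and expand_refs() are taken from the examples above.

from pycldf.sources import Source, Sources

sources = Sources()
sources.add(Source(
    'book', 'Meier2005', author='Hans Meier', year='2005', title='The Book'))

# Split a reference string into its source ID and page specification.
sid, pages = Sources.parse('Meier2005[10-12]')
print(sid, pages)  # expected: Meier2005 10-12

# Expand reference strings into Reference objects pointing at Source entries.
for ref in sources.expand_refs(['Meier2005[10-12]']):
    print(ref.source.id, ref.source['title'])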
Code example #4
File: test_sources.py Project: LinguList/pycldf
    def test_Source_expand_refs(self):
        from pycldf.sources import Sources, Source

        sources = Sources()
        src = Source(
            'book', 'Meier2005', author='Hans Meier', year='2005', title='The Book')
        self.assertIn('Meier2005', repr(src))
        sources.add(src)
        bib = sources._bibdata.to_string(bib_format='bibtex')
        self.assertEqual(len(bib.split('author')), 2)
        self.assertEqual(len(list(sources.expand_refs('Meier2005'))), 1)
        bib = sources._bibdata.to_string(bib_format='bibtex')
        self.assertEqual(len(bib.split('author')), 2)
Code example #5
    def cmd_makecldf(self, args):
        """
        Convert the raw data to a CLDF dataset.
        """
        with self.cldf_writer(args) as writer:
            writer.cldf.add_component('CognatesetTable')
            writer.add_sources(*self.raw_dir.read_bib('cariban_resolved.bib'))
            cmap = writer.add_concepts(lookup_factory=lambda c: c.english)
            cmap['you'] = cmap['thou']
            cmap['grease/fat'] = cmap['grease']
            cmap['breast'] = cmap['breasts']
            cmap['son'] = cmap['person']
            data = pycldf.Dataset.from_metadata(self.raw_dir /
                                                'cariban_data.json')
            for lang in data['LanguageTable']:
                writer.add_language(ID=lang['ID'],
                                    Name=lang['Name'],
                                    Glottocode=lang["Glottocode"])

            cs_seen = set()
            reconstructions = {
                tuple(c['ID'].split('-')): c['Form']
                for c in self.raw_dir.read_csv(
                    'cariban_lexical_reconstructions.csv', dicts=True)
            }
            for lex in self.raw_dir.read_csv('cariban_swadesh_list.csv',
                                             dicts=True):
                #"Language_ID","Swadesh_Nr","Feature_ID","Value","Cognateset_ID","Source","Comment","Full_Form"
                if lex['Feature_ID'] not in cmap:
                    print(lex['Feature_ID'])
                    continue
                for form in writer.add_lexemes(
                        Value=lex['Value'],
                        Parameter_ID=cmap[lex['Feature_ID']],
                        Language_ID=lex['Language_ID'],
                        Source=[
                            Reference(*d) for d in
                            [Sources.parse(lex['Source'].replace(';', ','))]
                        ] if lex['Source']
                        and not lex['Source'].startswith('pc') else [],
                ):
                    cs_key = (lex['Feature_ID'], lex['Cognateset_ID'])
                    cs_id = '{}-{}'.format(cmap[cs_key[0]], cs_key[1])
                    if cs_key not in cs_seen:
                        writer.objects['CognatesetTable'].append(
                            dict(
                                ID=cs_id,
                                Description=reconstructions.get(cs_key),
                            ))
                        cs_seen.add(cs_key)
                    writer.add_cognate(lexeme=form, Cognateset_ID=cs_id)

            # Note: We want to re-use LanguageTable across the two CLDF datasets:
            LanguageTable = writer.cldf['LanguageTable']

        with self.cldf_writer(args, cldf_spec='structure',
                              clean=False) as writer:
            writer.cldf.add_component(
                LanguageTable)  # we reuse the one from above!
Code example #6
    def to_cldf(self, dest, mdname='cldf-metadata.json', coordinate_precision=4):
        """
        Write the data from the db to a CLDF dataset according to the metadata in `self.dataset`.

        :param dest: destination directory for the CLDF dataset.
        :param mdname: file name of the CLDF metadata file to write.
        :param coordinate_precision: number of decimal places kept for geo-coordinates.
        :return: path of the metadata file
        """
        dest = pathlib.Path(dest)
        if not dest.exists():
            dest.mkdir()

        data = self.read()

        if data[self.source_table_name]:
            sources = Sources()
            for src in data[self.source_table_name]:
                sources.add(Source(
                    src['genre'],
                    src['id'],
                    **{k: v for k, v in src.items() if k not in ['id', 'genre']}))
            sources.write(dest / self.dataset.properties.get('dc:source', 'sources.bib'))

        for table_type, items in data.items():
            try:
                table = self.dataset[table_type]
                items = [
                    self.round_geocoordinates(item, precision=coordinate_precision)
                    for item in items]
                table.common_props['dc:extent'] = table.write(
                    [self.retranslate(table, item) for item in items],
                    base=dest)
            except KeyError:
                assert table_type == self.source_table_name, table_type
        return self.dataset.write_metadata(dest / mdname)
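The method above delegates coordinate rounding to a round_geocoordinates() helper that is not shown in this snippet. A hedged sketch of the idea follows; the helper name comes from the call above, but the column matching and implementation are illustrative guesses, not pycldf's actual code.

def round_geocoordinates(item, precision=4):
    # Round any latitude/longitude-like values in a row dict to `precision`
    # decimal places, leaving other columns untouched.
    for key, value in item.items():
        if key.lower().endswith(('latitude', 'longitude')) and value is not None:
            item[key] = round(float(value), precision)
    return item

row = {'ID': 'abcd1234', 'Latitude': '12.3456789', 'Longitude': -71.98765432}
print(round_geocoordinates(row, precision=4))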
Code example #7
File: dataset.py Project: LinguList/pycldf
    def __init__(self, name):
        assert NAME_PATTERN.match(name)
        self.name = name
        self.sources = Sources()
        self.metadata = Metadata()
        self._rows = OrderedDict()

        # We store the fields (a.k.a. header) as tuple because it must be immutable after
        # first assignment (since changing is not well defined when there are already
        # rows).
        self._fields = ()
        self._source_count = None
        self._cited_sources = set()
        self._table = None
Code example #8
File: test_sources.py Project: Anaphory/pycldf
def test_field_order(tmpdir):
    srcs = Sources()
    src = Source('misc', 'x')  # src is an OrderedDict and we add title *after* year.
    src['year'] = '2018'
    src['title'] = 'The Title'
    srcs.add(src)
    bib = tmpdir / 'test.bib'
    srcs.write(str(bib))
    res = bib.read_text(encoding='utf8')
    # Still, title should be printed in the BibTeX before year:
    assert res.index('title =') < res.index('year =')
Code example #9
File: util.py Project: HedvigS/grambank
def import_values(values, lang, features, codes, contributors,
                  sources):  # pragma: no cover
    c = Contribution(
        id=lang['ID'],
        name='Dataset for {0}'.format(lang['Name']),
    )
    for i, cid in enumerate(lang['Coders'], start=1):
        DBSession.add(
            ContributionContributor(
                contribution=c,
                contributor_pk=contributors[cid],
                ord=i,
            ))
    l = GrambankLanguage(
        id=lang['ID'],
        name=lang['Name'],
        macroarea=lang['Macroarea'],
        latitude=lang['Latitude'],
        longitude=lang['Longitude'],
    )
    for value in values:
        vs = ValueSet(
            id=value['ID'],
            parameter_pk=features[value['Parameter_ID']],
            language=l,
            contribution=c,
        )
        Value(id=value['ID'],
              valueset=vs,
              name=value['Value'],
              description=value['Comment'],
              domainelement_pk=codes[value['Code_ID']
                                     or '{}-NA'.format(value['Parameter_ID'])])

        if value['Source']:
            for ref in value['Source']:
                sid, pages = Sources.parse(ref)
                ValueSetReference(valueset=vs,
                                  source_pk=sources[sid],
                                  description=pages)
    DBSession.add(c)
Code example #10
File: dataset.py Project: glottobank/pycldf
    def __init__(self, tablegroup):
        self.tablegroup = tablegroup
        self.auto_constraints()
        self.sources = Sources.from_file(self.bibpath)
Code example #11
File: test_sources.py Project: LinguList/pycldf
    def test_Sources(self):
        from pycldf.sources import Sources, Source

        src = Sources()
        src.add(BIB, Source(
            'book', 'huber2005', author='Herrmann Huber', year='2005', title='y'))
        self.assertEqual(len(list(src.items())), 3)
        self.assertEqual(len(list(src.keys())), 3)
        refs = 'huber2005[1-6];Obrazy;Elegie[34]'
        self.assertEqual(src.format_refs(*list(src.expand_refs(refs))), refs)
        self.assertEqual('%s' % src['huber2005'], 'Huber, Herrmann. 2005. y.')
        with self.assertRaises(ValueError):
            src.add(5)
        with self.assertRaises(ValueError):
            src.add('@misc{a.b,\n  author="a.b"\n}')

        bib = self.tmp_path('test.bib')
        src.write(bib.name, bib.parent)

        src2 = Sources()
        src2.read(bib.name, bib.parent)

        bib = self.tmp_path('test.bib')
        src2.write(bib.name, bib.parent, ids=['huber2005'])
        src = Sources()
        src.read(bib.name, bib.parent)
        self.assertEqual(len(src), 1)
Code example #12
File: test_sources.py Project: glottobank/pycldf
def test_Sources_roundtrip_latex(tmpdir, bibtex, expected):
    src = Sources()
    src.add(bibtex)
    bib = tmpdir / 'test.bib'
    src.write(str(bib))
    assert expected in bib.read_text('utf8')
Code example #13
File: test_sources.py Project: glottobank/pycldf
def test_Sources_with_None_values(tmpdir):
    src = Sources()
    src.add(Source('book', 'huber2005', title=None))
    bib = tmpdir / 'test.bib'
    src.write(str(bib))
Code example #14
    def cmd_makecldf(self, args):

        # Add sources
        sources = Sources.from_file(self.raw_dir / "sources.bib")
        args.writer.cldf.add_sources(*sources)

        glottolog = Glottolog(args.glottolog.dir)
        clts = CLTS(Config.from_file().get_clone('clts'))
        bipa = clts.bipa
        clts_saphon = clts.transcriptiondata_dict['saphon']

        # Add components
        args.writer.cldf.add_columns("ValueTable", {
            "name": "Value_in_Source",
            "datatype": "string"
        })

        cltstable = Terms()["cltsReference"].to_column().asdict()
        cltstable["datatype"]["format"] = "[a-z_-]+|NA"
        args.writer.cldf.add_columns('ParameterTable', cltstable, {
            'name': 'CLTS_BIPA',
            'datatype': 'string'
        }, {
            'name': 'CLTS_Name',
            'datatype': 'string'
        })
        args.writer.cldf.add_component("LanguageTable", "Family",
                                       "Glottolog_Name")

        languages = []
        #all_glottolog = {lng.id: lng for lng in glottolog.languoids()}
        #iso2glot = {lng.iso: lng.glottocode for lng in all_glottolog.values()}
        #args.log.info("loaded glottolog")
        for row in progressbar(
                self.etc_dir.read_csv("languages.csv", dicts=True)):
            #if row["SAPHON_Code"] in iso2glot:
            #    glottocode = iso2glot[row["SAPHON_Code"]]
            #elif row["SAPHON_Code"][:3] in iso2glot:
            #    glottocode = iso2glot[row["SAPHON_Code"][:3]]
            #else:
            #    glottocode = ""

            #if glottocode and glottocode in all_glottolog:
            #    lang = all_glottolog[glottocode]
            #    update = {
            #        "Family": lang.family.name if lang.family else '',
            #        "Glottocode": glottocode,
            #        "Latitude": lang.latitude,
            #        "Longitude": lang.longitude,
            #        "Macroarea": lang.macroareas[0].name if lang.macroareas else None,
            #        "Glottolog_Name": lang.name,
            #    }
            #    row.update(update)
            languages.append(row)

        # Build source map from language
        source_map = {
            k: v
            for k, v in self.raw_dir.read_csv("references.tsv", delimiter="\t")
        }

        # Parse sources
        segments = []
        values = []
        counter = 1
        unknowns = defaultdict(list)
        for lid, segment in self.raw_dir.read_csv('inventories.tsv',
                                                  delimiter="\t"):
            normalized = normalize_grapheme(segment)
            if normalized in clts_saphon.grapheme_map:
                sound = bipa[clts_saphon.grapheme_map[normalized]]
            else:
                sound = bipa['<NA>']
                unknowns[normalized] += [(lid, segment)]
            par_id = compute_id(normalized)
            if sound.type == 'unknownsound':
                bipa_grapheme = ''
                desc = ''
            else:
                bipa_grapheme = str(sound)
                desc = sound.name

            segments.append((par_id, normalized, bipa_grapheme, desc))

            values.append({
                "ID": str(counter),
                "Language_ID": lid,
                "Parameter_ID": par_id,
                "Value_in_Source": segment,
                "Value": normalized,
                "Source": [source_map[lid]]
            })
            counter += 1

        # Build segment data
        parameters = [{
            "ID": ID,
            "Name": normalized,
            "Description": '',
            "CLTS_ID": desc.replace(' ', '_') if desc.strip() else "NA",
            "CLTS_BIPA": bipa_grapheme,
            "CLTS_Name": desc
        } for ID, normalized, bipa_grapheme, desc in set(segments)]

        # Write data and validate
        args.writer.write(
            **{
                "ValueTable": values,
                "LanguageTable": languages,
                "ParameterTable": parameters,
            })
        for g, rest in unknowns.items():
            print('\t'.join([repr(g), str(len(rest)), g]))
Code example #15
    def load(self, ds, args=None, verbose=False):
        """
        Load a CLDF dataset into the database.

        :param ds: dataset wrapper to load (must provide `cldf_reader()`).
        :return:
        """
        print(ds)
        try:
            self.fetchone('select ID from dataset')
        except sqlite3.OperationalError:
            self.create(force=True)
        self.unload(ds)

        dataset = ds.cldf_reader()
        tables, ref_tables = schema(dataset)

        # update the DB schema:
        for t in tables:
            if self._create_table_if_not_exists(t):
                continue
            db_cols = {k.lower(): v for k, v in self.tables[t.name].items()}
            for col in t.columns:
                if col.name.lower() not in db_cols:
                    with self.connection() as conn:
                        conn.execute(
                            "ALTER TABLE {0} ADD COLUMN `{1.name}` {1.db_type}".format(
                                t.name, col))
                else:
                    if db_cols[col.name.lower()] != col.db_type:
                        raise ValueError(
                            'column {0}:{1} {2} redefined with new type {3}'.format(
                                t.name, col.name, db_cols[col.name.lower()], col.db_type))

        for t in ref_tables.values():
            self._create_table_if_not_exists(t)

        self.update_schema()

        # then load the data:
        with self.connection() as db:
            db.execute('PRAGMA foreign_keys = ON;')
            insert(
                db,
                'dataset',
                'ID,name,version,metadata_json',
                (
                    ds.id,
                    '{0}'.format(dataset),
                    ds.repo.hash() if ds.repo else '',
                    json.dumps(dataset.metadata_dict)))
            insert(
                db,
                'datasetmeta',
                'dataset_ID,key,value',
                *[(ds.id, k, '{0}'.format(v)) for k, v in dataset.properties.items()])

            # load sources:
            rows = []
            for src in dataset.sources.items():
                values = [ds.id, src.id, src.genre] + [src.get(k) for k in BIBTEX_FIELDS]
                values.append(
                    json.dumps({k: v for k, v in src.items() if k not in BIBTEX_FIELDS}))
                rows.append(tuple(values))
            insert(
                db,
                'SourceTable',
                ['dataset_ID', 'ID', 'bibtex_type'] + BIBTEX_FIELDS + ['extra'],
                *rows)

            # For regular tables, we extract and keep references to sources.
            refs = collections.defaultdict(list)

            for t in tables:
                # We want to lookup columns by the name used in the CLDF dataset.
                cols = {col.cldf_name: col for col in t.columns}
                # But we also want to look up primary keys by the database column name.
                cols_by_name = {col.name: col for col in t.columns}

                ref_table = ref_tables.get(t.name)
                rows, keys = [], []
                try:
                    for row in dataset[t.name]:
                        keys, values = ['dataset_ID'], [ds.id]
                        for k, v in row.items():
                            if ref_table and k == ref_table.consumes:
                                col = cols_by_name[t.primary_key]
                                refs[ref_table.name].append((row[col.cldf_name], v))
                            else:
                                col = cols[k]
                                if isinstance(v, list):
                                    v = (col.separator or ';').join(
                                        nfilter(col.convert(vv) for vv in v))
                                else:
                                    v = col.convert(v)
                                # FIXME: only if non-local!
                                keys.append("`{0}`".format(col.name))
                                values.append(v)
                        keys, values = self.update_row(t.name, keys, values)
                        rows.append(tuple(values))
                    insert(db, t.name, keys, *rows, **{'verbose': verbose})
                except FileNotFoundError:  # pragma: no cover
                    if t.name != 'CognateTable':  # An empty CognateTable is allowed.
                        raise  # pragma: no cover

            # Now insert the references, i.e. the associations with sources:
            for tname, items in refs.items():
                rows = []
                for oid, sources in items:
                    for source in sources:
                        sid, context = Sources.parse(source)
                        rows.append([ds.id, oid, sid, context])
                oid_col = '{0}_ID'.format(tname.replace('Source', ''))
                insert(db, tname, ['dataset_ID', oid_col, 'Source_ID', 'Context'], *rows)
            db.commit()
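The load() method above (and the variant further below) relies on an insert() helper that is not shown in these snippets. Here is a hedged sketch of what such a helper could look like, matching the call patterns above: keys passed either as a comma-separated string or as a list, rows as tuples, and an optional verbose flag. It is an illustration, not pycldf's actual implementation.

def insert(conn, table, keys, *rows, verbose=False):
    # Build and run a parameterised INSERT for the given rows.
    if not rows:
        return
    if isinstance(keys, str):
        keys = [k.strip() for k in keys.split(',')]
    sql = "INSERT INTO {0} ({1}) VALUES ({2})".format(
        table, ', '.join(keys), ', '.join(['?'] * len(keys)))
    if verbose:
        print(sql)
    conn.executemany(sql, rows)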
Code example #16
File: dataset.py Project: LinguList/pycldf
class Dataset(object):
    """
    API to access a CLDF dataset.
    """
    def __init__(self, name):
        assert NAME_PATTERN.match(name)
        self.name = name
        self.sources = Sources()
        self.metadata = Metadata()
        self._rows = OrderedDict()

        # We store the fields (a.k.a. header) as tuple because it must be immutable after
        # first assignment (since changing is not well defined when there are already
        # rows).
        self._fields = ()
        self._source_count = None
        self._cited_sources = set()
        self._table = None

    def __repr__(self):
        return '<%s %s>' % (self.__class__.__name__, self.name)

    def __len__(self):
        """The length of a dataset is the number of rows in the values file."""
        return len(self.rows)

    def __getitem__(self, item):
        """
        Individual rows can be accessed by integer index or by row ID.

        :param item: `int` to access row by index, `str` to access by row ID
        :return: `OrderedDict`
        """
        if isinstance(item, int):
            return self.rows[item]
        return self._rows[item]

    @property
    def fields(self):
        """
        Read-only property to access the fields (a.k.a. header) defined for the dataset.

        :return: `tuple` of field names
        """
        return self._fields

    @property
    def table(self):
        return self._table

    @fields.setter
    def fields(self, value):
        """
        Fields can be assigned (but only once) for a dataset.

        :param value: `tuple` of field names.
        """
        if self._fields:
            raise ValueError('fields can only be assigned once!')
        assert isinstance(value, tuple)
        assert all(any(field in value for field in variants)
                   for variants in REQUIRED_FIELDS)
        table = self.metadata.get_table()
        if table:
            assert list(value) == list(table.schema.columns.keys())
        else:
            table = self.metadata.add_table(
                'values',
                '',
                [{'name': col, 'datatype': 'string'} for col in value])
            table.schema.primaryKey = 'ID'
        self._table = table
        self._fields = value

    @property
    def rows(self):
        return list(self._rows.values())

    @property
    def stats(self):
        return dict(
            languages=set(row['Language_ID'] for row in self.rows),
            parameters=set(row['Parameter_ID'] for row in self.rows),
            rowcount=(
                len(self),
                sum([1 for row in self.rows
                     if row['Language_ID'] and row['Parameter_ID']])),
            values=Counter(row['Value'] for row in self.rows),
        )

    def add_row(self, row):
        if not row:
            return

        d = ValuesRow.from_list(self, row)
        if d['ID'] in self._rows:
            raise ValueError('duplicate row ID: %s' % d['ID'])
        for ref in self.sources.expand_refs(d.get('Source', '')):
            self._cited_sources.add(ref.source.id)
        self._rows[d['ID']] = d
        return d

    @staticmethod
    def filename(fname, type_):
        """
        Compute the path for optional CLDF files relative to a given values file.

        :param fname: Path of the values file
        :param type_: Type of the optional file
        :return: name of the optional file
        """
        if type_ == 'sources':
            return fname.stem + '.bib'
        if type_ == 'metadata':
            return fname.stem + fname.suffix + MD_SUFFIX
        raise ValueError(type_)  # pragma: no cover

    @staticmethod
    def _existing_file(fname):
        fname = Path(fname)
        assert fname.exists() and fname.is_file()
        return fname

    @classmethod
    def _from(cls, data, container=None, skip_on_error=False):
        container = container or data.parent
        dataset = cls(data.stem)
        dataset.metadata.read(Dataset.filename(data, 'metadata'), container)
        dataset._table = dataset.metadata.get_table()
        dataset.sources.read(Dataset.filename(data, 'sources'), container)
        delimiter = ','
        if dataset.table:
            delimiter = dataset.table.dialect.delimiter
        if data.suffix in TAB_SUFFIXES:
            delimiter = '\t'

        if isinstance(container, Archive):
            rows = container.read_text(data.name).split('\n')
        else:
            rows = data

        for i, row in enumerate(reader(rows, delimiter=delimiter)):
            if i == 0:
                dataset.fields = tuple(row)
            else:
                try:
                    dataset.add_row(row)
                except ValueError as e:
                    if skip_on_error:
                        log.warn('skipping row in line %s: %s' % (i + 1, e))
                    else:
                        raise e
        dataset.table.dialect.delimiter = delimiter
        dataset.table.url = data.name
        return dataset

    @classmethod
    def from_zip(cls, fname, name=None):
        archive = Archive(cls._existing_file(fname))
        return cls._from(
            Path(archive.metadata_name(prefix=name)[:-len(MD_SUFFIX)]), archive)

    @classmethod
    def from_metadata(cls, fname, container=None):
        fname = Path(fname)
        if not fname.name.endswith(MD_SUFFIX):
            raise ValueError('metadata file name must end with %s' % MD_SUFFIX)
        return cls._from(
            fname.parent.joinpath(fname.name[:-len(MD_SUFFIX)]), container=container)

    @classmethod
    def from_file(cls, fname, skip_on_error=False):
        """
        Factory method to create a `Dataset` from a CLDF values file.

        :param fname: Path of the CLDF values file.
        :return: `Dataset` instance.
        """
        return cls._from(cls._existing_file(fname), skip_on_error=skip_on_error)

    def write(self, outdir='.', suffix='.csv', cited_sources_only=False, archive=False):
        outdir = Path(outdir)
        if not outdir.exists():
            raise ValueError(outdir.as_posix())

        close = False
        if archive:
            if isinstance(archive, Archive):
                container = archive
            else:
                container = Archive(outdir.joinpath(self.name + '.zip'), mode='w')
                close = True
        else:
            container = outdir

        fname = Path(outdir).joinpath(self.name + suffix)
        if fname.suffix in TAB_SUFFIXES:
            self.table.dialect.delimiter = '\t'

        with UnicodeWriter(
                None if isinstance(container, Archive) else fname,
                delimiter=self.table.dialect.delimiter) as writer:
            writer.writerow(self.fields)
            for row in self.rows:
                writer.writerow(row.to_list())

        if isinstance(container, Archive):
            container.write_text(writer.read(), fname.name)
        self.table.url = fname.name

        self.metadata.write(Dataset.filename(fname, 'metadata'), container)
        ids = self._cited_sources if cited_sources_only else None
        self.sources.write(Dataset.filename(fname, 'sources'), container, ids=ids)
        if close:
            container.close()
コード例 #22
0
ファイル: db.py プロジェクト: afcarl/pycldf
    def load(self, dataset):
        """
        Load a CLDF dataset into the database.

        :param dataset:
        :return:
        """
        tables, ref_tables = schema(dataset)

        # update the DB schema:
        for t in tables:
            if self._create_table_if_not_exists(t):
                continue
            db_cols = {
                r[1]: r[2]
                for r in self.fetchall("PRAGMA table_info({0})".format(t.name))
            }
            for col in t.columns:
                if col.name not in db_cols:
                    with self.connection() as conn:
                        conn.execute(
                            "ALTER TABLE {0} ADD COLUMN \"{1.name}\" {1.db_type}"
                            .format(t.name, col))
                else:
                    if db_cols[col.name] != col.db_type:
                        raise ValueError(
                            'column {0}:{1} {2} redefined with new type {3}'.
                            format(t.name, col.name, db_cols[col.name],
                                   col.db_type))

        for t in ref_tables.values():
            self._create_table_if_not_exists(t)

        # then load the data:
        with self.connection() as db:
            db.execute('PRAGMA foreign_keys = ON;')
            pk = max([
                r[0] for r in self.fetchall("SELECT ID FROM dataset", conn=db)
            ] or [0]) + 1
            insert(db, 'dataset', 'ID,name,module,metadata_json',
                   (pk, '{0}'.format(dataset), dataset.module,
                    dumps(dataset.metadata_dict)))
            insert(
                db, 'datasetmeta', 'dataset_ID,key,value',
                *[(pk, k, '{0}'.format(v))
                  for k, v in dataset.properties.items()])

            # load sources:
            rows = []
            for src in dataset.sources.items():
                values = [pk, src.id, src.genre
                          ] + [src.get(k) for k in BIBTEX_FIELDS]
                values.append(
                    dumps({
                        k: v
                        for k, v in src.items() if k not in BIBTEX_FIELDS
                    }))
                rows.append(tuple(values))
            insert(db, 'SourceTable', ['dataset_ID', 'ID', 'bibtex_type'] +
                   BIBTEX_FIELDS + ['extra'], *rows)

            # For regular tables, we extract and keep references to sources.
            refs = defaultdict(list)

            for t in tables:
                cols = {col.name: col for col in t.columns}
                ref_table = ref_tables.get(t.name)
                rows, keys = [], []
                for row in dataset[t.name]:
                    keys, values = ['dataset_ID'], [pk]
                    for k, v in row.items():
                        if ref_table and k == ref_table.consumes:
                            refs[ref_table.name].append(
                                (row[t.primary_key], v))
                        else:
                            col = cols[k]
                            if isinstance(v, list):
                                v = (col.separator
                                     or ';').join(col.convert(vv) for vv in v)
                            else:
                                v = col.convert(v)
                            keys.append(k)
                            values.append(v)
                    rows.append(tuple(values))
                insert(db, t.name, keys, *rows)

            # Now insert the references, i.e. the associations with sources:
            for tname, items in refs.items():
                rows = []
                for oid, sources in items:
                    for source in sources:
                        sid, context = Sources.parse(source)
                        rows.append([pk, oid, sid, context])
                oid_col = '{0}_ID'.format(tname.replace('Source', ''))
                insert(db, tname, [
                    'dataset_ID', '{:}'.format(oid_col), 'Source_ID', 'Context'
                ], *rows)
            db.commit()
Code example #18
File: test_sources.py Project: glottobank/pycldf
def test_Sources(tmpdir):
    src = Sources()
    src.add(BIB, Source(
        'book', 'huber2005', author='Herrmann Huber', year='2005', title='y'))
    for entry in src:
        assert entry.genre == 'book'
        break
    assert len(list(src.items())) == 3
    assert len(list(src.keys())) == 3
    refs = ['huber2005[1-6]', 'Obrazy', 'Elegie[34]']
    assert src.format_refs(*list(src.expand_refs(refs))) == refs
    assert '%s' % src['huber2005'] == 'Huber, Herrmann. 2005. y.'
    with pytest.raises(ValueError):
        src.add(5)
    with pytest.raises(ValueError):
        src.add('@misc{a.b,\n  author="a.b"\n}')
    with pytest.raises(ValueError):
        _ = src['unknown']
        assert _  # pragma: no cover
    with pytest.raises(ValueError):
        src.parse('a[x')
    with pytest.raises(ValueError):
        src.parse('[x]')
    with pytest.raises(ValueError):
        src.validate(['x'])

    bib = str(tmpdir / 'test.bib')
    src.write(bib)

    src2 = Sources()
    src2.read(bib)

    src2.write(bib, ids=['huber2005'])
    src = Sources.from_file(bib)
    assert len(src) == 1