Example #1
import pytest

from pycldf.sources import Source, Sources


def test_Sources(tmpdir):
    src = Sources()
    # BIB is a module-level fixture: a BibTeX string that (judging by the
    # assertions below) defines the 'Obrazy' and 'Elegie' entries.
    src.add(BIB, Source(
        'book', 'huber2005', author='Herrmann Huber', year='2005', title='y'))
    for entry in src:
        assert entry.genre == 'book'
        break
    assert len(list(src.items())) == 3
    assert len(list(src.keys())) == 3
    refs = ['huber2005[1-6]', 'Obrazy', 'Elegie[34]']
    assert src.format_refs(*list(src.expand_refs(refs))) == refs
    assert '%s' % src['huber2005'] == 'Huber, Herrmann. 2005. y.'
    with pytest.raises(ValueError):
        src.add(5)
    with pytest.raises(ValueError):
        src.add('@misc{a.b,\n  author="a.b"\n}')
    with pytest.raises(ValueError):
        _ = src['unknown']
        assert _  # pragma: no cover
    with pytest.raises(ValueError):
        src.parse('a[x')
    with pytest.raises(ValueError):
        src.parse('[x]')
    with pytest.raises(ValueError):
        src.validate(['x'])

    bib = str(tmpdir / 'test.bib')
    src.write(bib)

    src2 = Sources()
    src2.read(bib)

    src2.write(bib, ids=['huber2005'])
    src = Sources.from_file(bib)
    assert len(src) == 1
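The bracketed ref syntax exercised above is handled by `Sources.parse()`, which splits a citation key from its optional page context. A minimal sketch of the round trip, assuming pycldf's semantics:

from pycldf.sources import Sources

# A key with pages yields the ID plus the bracketed context;
# a bare key yields no context.
assert Sources.parse('huber2005[1-6]') == ('huber2005', '1-6')
assert Sources.parse('Obrazy') == ('Obrazy', None)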
Example #2
    def cmd_makecldf(self, args):
        """
        Convert the raw data to a CLDF dataset.
        """
        with self.cldf_writer(args) as writer:
            writer.cldf.add_component('CognatesetTable')
            writer.add_sources(*self.raw_dir.read_bib('cariban_resolved.bib'))
            cmap = writer.add_concepts(lookup_factory=lambda c: c.english)
            cmap['you'] = cmap['thou']
            cmap['grease/fat'] = cmap['grease']
            cmap['breast'] = cmap['breasts']
            cmap['son'] = cmap['person']
            data = pycldf.Dataset.from_metadata(self.raw_dir /
                                                'cariban_data.json')
            for lang in data['LanguageTable']:
                writer.add_language(ID=lang['ID'],
                                    Name=lang['Name'],
                                    Glottocode=lang["Glottocode"])

            cs_seen = set()
            reconstructions = {
                tuple(c['ID'].split('-')): c['Form']
                for c in self.raw_dir.read_csv(
                    'cariban_lexical_reconstructions.csv', dicts=True)
            }
            for lex in self.raw_dir.read_csv('cariban_swadesh_list.csv',
                                             dicts=True):
                #"Language_ID","Swadesh_Nr","Feature_ID","Value","Cognateset_ID","Source","Comment","Full_Form"
                if lex['Feature_ID'] not in cmap:
                    print(lex['Feature_ID'])
                    continue
                for form in writer.add_lexemes(
                        Value=lex['Value'],
                        Parameter_ID=cmap[lex['Feature_ID']],
                        Language_ID=lex['Language_ID'],
                        # Personal communications ('pc...') have no
                        # bibliography entry, so no reference is created.
                        Source=[
                            Reference(*d) for d in
                            [Sources.parse(lex['Source'].replace(';', ','))]
                        ] if lex['Source']
                        and not lex['Source'].startswith('pc') else [],
                ):
                    cs_key = (lex['Feature_ID'], lex['Cognateset_ID'])
                    cs_id = '{}-{}'.format(cmap[cs_key[0]], cs_key[1])
                    if cs_key not in cs_seen:
                        writer.objects['CognatesetTable'].append(
                            dict(
                                ID=cs_id,
                                Description=reconstructions.get(cs_key),
                            ))
                        cs_seen.add(cs_key)
                    writer.add_cognate(lexeme=form, Cognateset_ID=cs_id)

            # Note: We want to re-use LanguageTable across the two CLDF datasets:
            LanguageTable = writer.cldf['LanguageTable']

        with self.cldf_writer(args, cldf_spec='structure',
                              clean=False) as writer:
            writer.cldf.add_component(
                LanguageTable)  # we reuse the one from above!
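The inline Source= expression above is dense; pulled out on its own, the pattern is just "parse the citation field, wrap it in a Reference". A sketch with a made-up field value, assuming pycldf's `Reference` and `Sources`:

from pycldf.sources import Reference, Sources

# Hypothetical raw citation field as it might appear in the Swadesh-list CSV:
raw = 'meira2000[12]'
# Personal communications ('pc...') have no bibliography entry and are skipped.
refs = [Reference(*Sources.parse(raw))] if raw and not raw.startswith('pc') else []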
Example #3
def import_values(values, lang, features, codes, contributors,
                  sources):  # pragma: no cover
    c = Contribution(
        id=lang['ID'],
        name='Dataset for {0}'.format(lang['Name']),
    )
    for i, cid in enumerate(lang['Coders'], start=1):
        DBSession.add(
            ContributionContributor(
                contribution=c,
                contributor_pk=contributors[cid],
                ord=i,
            ))
    l = GrambankLanguage(
        id=lang['ID'],
        name=lang['Name'],
        macroarea=lang['Macroarea'],
        latitude=lang['Latitude'],
        longitude=lang['Longitude'],
    )
    for value in values:
        vs = ValueSet(
            id=value['ID'],
            parameter_pk=features[value['Parameter_ID']],
            language=l,
            contribution=c,
        )
        Value(id=value['ID'],
              valueset=vs,
              name=value['Value'],
              description=value['Comment'],
              domainelement_pk=codes[value['Code_ID']
                                     or '{}-NA'.format(value['Parameter_ID'])])

        if value['Source']:
            for ref in value['Source']:
                sid, pages = Sources.parse(ref)
                ValueSetReference(valueset=vs,
                                  source_pk=sources[sid],
                                  description=pages)
    DBSession.add(c)
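One detail worth noting above: when a value has no Code_ID, the lookup falls back to a per-feature 'NA' domain element. A minimal illustration of that key lookup, with hypothetical keys:

# Hypothetical mapping of Code_ID -> domain element primary key:
codes = {'GB020-1': 17, 'GB020-NA': 18}
value = {'Parameter_ID': 'GB020', 'Code_ID': None}
assert codes[value['Code_ID'] or '{}-NA'.format(value['Parameter_ID'])] == 18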
Example #4
    def load(self, dataset):
        """
        Load a CLDF dataset into the database.

        :param dataset: the `pycldf.Dataset` to be loaded into the database.
        """
        tables, ref_tables = schema(dataset)

        # update the DB schema:
        for t in tables:
            if self._create_table_if_not_exists(t):
                continue
            db_cols = {
                r[1]: r[2]
                for r in self.fetchall("PRAGMA table_info({0})".format(t.name))
            }
            for col in t.columns:
                if col.name not in db_cols:
                    with self.connection() as conn:
                        conn.execute(
                            "ALTER TABLE {0} ADD COLUMN \"{1.name}\" {1.db_type}"
                            .format(t.name, col))
                else:
                    if db_cols[col.name] != col.db_type:
                        raise ValueError(
                            'column {0}:{1} {2} redefined with new type {3}'.
                            format(t.name, col.name, db_cols[col.name],
                                   col.db_type))

        for t in ref_tables.values():
            self._create_table_if_not_exists(t)

        # then load the data:
        with self.connection() as db:
            db.execute('PRAGMA foreign_keys = ON;')
            pk = max([
                r[0] for r in self.fetchall("SELECT ID FROM dataset", conn=db)
            ] or [0]) + 1
            insert(db, 'dataset', 'ID,name,module,metadata_json',
                   (pk, '{0}'.format(dataset), dataset.module,
                    dumps(dataset.metadata_dict)))
            insert(
                db, 'datasetmeta', 'dataset_ID,key,value',
                *[(pk, k, '{0}'.format(v))
                  for k, v in dataset.properties.items()])

            # load sources:
            rows = []
            for src in dataset.sources.items():
                values = [pk, src.id, src.genre
                          ] + [src.get(k) for k in BIBTEX_FIELDS]
                values.append(
                    dumps({
                        k: v
                        for k, v in src.items() if k not in BIBTEX_FIELDS
                    }))
                rows.append(tuple(values))
            insert(db, 'SourceTable', ['dataset_ID', 'ID', 'bibtex_type'] +
                   BIBTEX_FIELDS + ['extra'], *rows)

            # For regular tables, we extract and keep references to sources.
            refs = defaultdict(list)

            for t in tables:
                cols = {col.name: col for col in t.columns}
                ref_table = ref_tables.get(t.name)
                rows, keys = [], []
                for row in dataset[t.name]:
                    keys, values = ['dataset_ID'], [pk]
                    for k, v in row.items():
                        if ref_table and k == ref_table.consumes:
                            refs[ref_table.name].append(
                                (row[t.primary_key], v))
                        else:
                            col = cols[k]
                            if isinstance(v, list):
                                v = (col.separator
                                     or ';').join(col.convert(vv) for vv in v)
                            else:
                                v = col.convert(v)
                            keys.append(k)
                            values.append(v)
                    rows.append(tuple(values))
                insert(db, t.name, keys, *rows)

            # Now insert the references, i.e. the associations with sources:
            for tname, items in refs.items():
                rows = []
                for oid, sources in items:
                    for source in sources:
                        sid, context = Sources.parse(source)
                        rows.append([pk, oid, sid, context])
                oid_col = '{0}_ID'.format(tname.replace('Source', ''))
                insert(db, tname,
                       ['dataset_ID', oid_col, 'Source_ID', 'Context'], *rows)
            db.commit()
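The insert() helper used throughout accepts the column names either as a list or as a comma-separated string, followed by any number of row tuples. A minimal sketch against an in-memory SQLite database, assuming pycldf.db's module-level insert helper:

import sqlite3

from pycldf.db import insert

db = sqlite3.connect(':memory:')
db.execute('CREATE TABLE dataset (ID, name)')
insert(db, 'dataset', 'ID,name', (1, 'demo'), (2, 'demo2'))
assert db.execute('SELECT count(*) FROM dataset').fetchone()[0] == 2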
Example #5
    def load(self, ds, args=None, verbose=False):
        """
        Load a CLDF dataset into the database.

        :param ds: dataset object providing `cldf_reader()`, an `id` and,
            optionally, a `repo`.
        """
        print(ds)  # log which dataset is being (re)loaded
        try:
            self.fetchone('select ID from dataset')
        except sqlite3.OperationalError:
            self.create(force=True)
        self.unload(ds)

        dataset = ds.cldf_reader()
        tables, ref_tables = schema(dataset)

        # update the DB schema:
        for t in tables:
            if self._create_table_if_not_exists(t):
                continue
            db_cols = {k.lower(): v for k, v in self.tables[t.name].items()}
            for col in t.columns:
                if col.name.lower() not in db_cols:
                    with self.connection() as conn:
                        conn.execute(
                            "ALTER TABLE {0} ADD COLUMN `{1.name}` {1.db_type}".format(
                                t.name, col))
                else:
                    if db_cols[col.name.lower()] != col.db_type:
                        raise ValueError(
                            'column {0}:{1} {2} redefined with new type {3}'.format(
                                t.name, col.name, db_cols[col.name.lower()], col.db_type))

        for t in ref_tables.values():
            self._create_table_if_not_exists(t)

        self.update_schema()

        # then load the data:
        with self.connection() as db:
            db.execute('PRAGMA foreign_keys = ON;')
            insert(
                db,
                'dataset',
                'ID,name,version,metadata_json',
                (
                    ds.id,
                    '{0}'.format(dataset),
                    ds.repo.hash() if ds.repo else '',
                    json.dumps(dataset.metadata_dict)))
            insert(
                db,
                'datasetmeta',
                'dataset_ID,key,value',
                *[(ds.id, k, '{0}'.format(v)) for k, v in dataset.properties.items()])

            # load sources:
            rows = []
            for src in dataset.sources.items():
                values = [ds.id, src.id, src.genre] + [src.get(k) for k in BIBTEX_FIELDS]
                values.append(
                    json.dumps({k: v for k, v in src.items() if k not in BIBTEX_FIELDS}))
                rows.append(tuple(values))
            insert(
                db,
                'SourceTable',
                ['dataset_ID', 'ID', 'bibtex_type'] + BIBTEX_FIELDS + ['extra'],
                *rows)

            # For regular tables, we extract and keep references to sources.
            refs = collections.defaultdict(list)

            for t in tables:
                # We want to lookup columns by the name used in the CLDF dataset.
                cols = {col.cldf_name: col for col in t.columns}
                # But we also want to look up primary keys by the database column name.
                cols_by_name = {col.name: col for col in t.columns}

                ref_table = ref_tables.get(t.name)
                rows, keys = [], []
                try:
                    for row in dataset[t.name]:
                        keys, values = ['dataset_ID'], [ds.id]
                        for k, v in row.items():
                            if ref_table and k == ref_table.consumes:
                                col = cols_by_name[t.primary_key]
                                refs[ref_table.name].append((row[col.cldf_name], v))
                            else:
                                col = cols[k]
                                if isinstance(v, list):
                                    v = (col.separator or ';').join(
                                        nfilter(col.convert(vv) for vv in v))
                                else:
                                    v = col.convert(v)
                                # FIXME: only if non-local!
                                keys.append("`{0}`".format(col.name))
                                values.append(v)
                        keys, values = self.update_row(t.name, keys, values)
                        rows.append(tuple(values))
                    insert(db, t.name, keys, *rows, **{'verbose': verbose})
                except FileNotFoundError:  # pragma: no cover
                    if t.name != 'CognateTable':  # An empty CognateTable is allowed.
                        raise  # pragma: no cover

            # Now insert the references, i.e. the associations with sources:
            for tname, items in refs.items():
                rows = []
                for oid, sources in items:
                    for source in sources:
                        sid, context = Sources.parse(source)
                        rows.append([ds.id, oid, sid, context])
                oid_col = '{0}_ID'.format(tname.replace('Source', ''))
                insert(db, tname, ['dataset_ID', oid_col, 'Source_ID', 'Context'], *rows)
            db.commit()
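Both load variants collect the source associations first and bulk-insert them at the end; the final loop is just a nested parse from (object ID, refs) pairs to rows. A small self-contained sketch with made-up IDs:

from pycldf.sources import Sources

# Hypothetical collected references: one form citing two sources.
items = [('form-1', ['meira2000[12]', 'gildea1998'])]
rows = [
    ('demo-dataset', oid) + Sources.parse(source)
    for oid, sources in items
    for source in sources
]
assert rows == [
    ('demo-dataset', 'form-1', 'meira2000', '12'),
    ('demo-dataset', 'form-1', 'gildea1998', None),
]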