Example #1

def update_providers(args):
    if not args.data_file(args.version, 'provider.txt').exists():
        return

    with open(args.data_file(args.version, 'provider.txt'), encoding='latin1') as fp:
        content = fp.read()

    if '\r\n' in content:
        content = content.replace('\r\n', '\n')

    provider_map = get_map(Provider)
    for block in content.split('\n\n\n\n'):
        lines = block.split('\n')
        id_, abbr = lines[0].strip().split(':')
        id_ = id_.split('.')[0]
        description = unescape('\n'.join(lines[1:]))
        name = description.split('.')[0]

        if id_ == 'hedvig-tirailleur':
            id_ = u'skirgard'

        if slug(id_) not in provider_map:
            args.log.info('adding provider %s' % slug(id_))
            DBSession.add(
                Provider(id=slug(id_), name=name, description=description, abbr=abbr))
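The get_map helper used above is not shown; a plausible sketch, assuming it simply maps the ids of existing Provider rows to their instances so the loop can skip providers that are already in the database:

def get_map(model):
    # Hypothetical helper: id -> instance for every existing row of `model`.
    return {obj.id: obj for obj in DBSession.query(model)}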
Example #2

def main(args):
    data = Data()

    dataset = common.Dataset(
        id=u'An Crúbadán',
        name=u'An Crúbadán',
        publisher_name="Saint Louis University",
        publisher_place="Saint Louis, USA",
        publisher_url="http://www.slu.edu/",
        description=
        "Linguistic datasets for over 2000 languages created from web-crawled text corpora",
        contact="*****@*****.**",
        license='http://creativecommons.org/licenses/by/4.0/',
        jsondata={
            'license_icon':
            'https://licensebuttons.net/l/by/4.0/88x31.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License',
        },
        domain='crubadan.org',
    )

    DBSession.add(dataset)
    DBSession.flush()

    editor = data.add(common.Contributor,
                      "Kevin Scannell",
                      id="Kevin Scannell",
                      name="Kevin Scannell",
                      email="*****@*****.**")
    common.Editor(dataset=dataset, contributor=editor, ord=0)
    DBSession.flush()

    fillTable(DBSession)
Example #3

def update_lang(lang, **kw):
    """
    store original name in hname

    .. note::

        We don't update the alternative names (for name search) here; instead, the
        script that updates these names in bulk must be run after this function.
    """
    name = kw.pop('name', None)
    if name and name != lang.name:
        if 'hname' not in lang.jsondatadict:
            lang.update_jsondata(hname=lang.name)
        print('renamed', lang.name, 'to', name)
        lang.name = name
        print(lang.jsondata)

    for k, v in kw.items():
        if k not in lang.datadict():
            DBSession.add(Language_data(key=k, value=v, object_pk=lang.pk))
        else:
            for d in lang.data:
                if d.key == k and d.value != v:
                    print('updated', k)
                    d.value = v
                    break
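A hypothetical call of update_lang, assuming lang is a Language instance already in the session: the language is renamed (its old name moves into jsondata as hname) and two key/value data entries are stored or updated:

update_lang(lang, name='New Name', population='1200', region='Andes')  # hypothetical keys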
Example #4

def update(args):
    count = 0
    assert args.json

    iid = int(DBSession.execute(
        "select max(cast(id as integer)) from identifier").fetchone()[0]) + 1
    pk = DBSession.execute(
        "select max(pk) from identifier").fetchone()[0] + 1

    langs = {}
    for gid, name in args.json['wikipedia'].items():
        if gid not in langs:
            langs[gid] = Languoid.get(gid)
        langs[gid].update_jsondata(wikipedia=name.split('/')[-1])

    for gid, codes in args.json['multitree'].items():
        l = langs[gid]
        lcodes = [i.name for i in l.identifiers if i.type == 'multitree']

        for code in set(codes):
            if code not in lcodes:
                identifier = DBSession.query(common.Identifier)\
                    .filter(common.Identifier.type == 'multitree')\
                    .filter(common.Identifier.name == code)\
                    .first()
                if not identifier:
                    identifier = common.Identifier(
                        pk=pk, id=str(iid), name=code, type='multitree')
                    iid += 1
                    pk += 1
                count += 1
                DBSession.add(
                    common.LanguageIdentifier(language=l, identifier=identifier))

    print(count, 'new multitree identifiers')
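The id/pk counters above are fetched once with raw SQL and then incremented in Python; a minimal sketch of the same idea wrapped in a reusable generator (an assumption, not part of the original script):

def next_pks(session, sql="select max(pk) from identifier"):
    # Start one past the current maximum and count upwards.
    n = session.execute(sql).fetchone()[0] or 0
    while True:
        n += 1
        yield n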
Example #5

def main(args):
    user = getpass.getuser()
    data = Data()
    datadir = 'C:\\Python27\\glottobank\\Grambank\\' if user != 'robert' \
        else '/home/robert/venvs/glottobank/Grambank'

    dataset = common.Dataset(
        id=grambank.__name__,
        name="GramBank",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='grambank.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    import_features_collaborative_sheet(datadir, data)
    import_cldf(os.path.join(datadir, 'datasets'), data)
    #print data.keys()
    #print data['Parameter'].keys()
    #parameter = data['Parameter'].get(row['Feature_ID'])

    load_families(data, data['GrambankLanguage'].values(), isolates_icon='tcccccc')
Example #6

def update_lang(lang, **kw):
    """
    store original name in hname

    .. note::

        We don't update the alternative names (for name search) here; instead, the
        script that updates these names in bulk must be run after this function.
    """
    name = kw.pop('name', None)
    if name and name != lang.name:
        if 'hname' not in lang.jsondata:
            lang.update_jsondata(hname=lang.name)
        print('renamed', lang.name, 'to', name)
        lang.name = name
        print(lang.jsondata)

    for k, v in kw.items():
        if k not in lang.datadict():
            DBSession.add(Language_data(key=k, value=v, object_pk=lang.pk))
        else:
            for d in lang.data:
                if d.key == k and d.value != v:
                    print('updated', k)
                    d.value = v
                    break
Example #7

def import_cognatesets(dataset, forms, bibliography, contribution, cognatesets={}):
    cognateset_by_formid = {}
    cognateset_forms = {}

    for row in dataset["CognateTable"].iterdicts():
        # Only incorporate the newest cognate codings, and be robust about that
        try:
            cs = cognateset_forms.setdefault(row["Cognateset_ID"], [])
            cs.append(forms[row["Form_ID"]].name)
            row["CognateForms"] = cs
            cognateset_by_formid[row["Form_ID"]] = row
        except KeyError:
            continue
    for row in cognateset_by_formid.values():
        cognateset_id = row["Cognateset_ID"]
        try:
            cognateset = cognatesets[cognateset_id]
        except KeyError:
            row["CognateForms"].sort()
            cognateset = cognatesets[cognateset_id] = Cognateset(
                id=row["Cognateset_ID"],
                contribution=contribution,
                name=row["CognateForms"][len(row["CognateForms"])//2])
        assoc = CognatesetCounterpart(
            cognateset=cognateset,
            doubt="LexStat" in row["Source"],
            alignment=(None if not row["Alignment"] else " ".join(row["Alignment"])),
            counterpart=forms[row["Form_ID"]])
        # The association must be added to the session and flushed before its
        # pk is available for the reference rows below.
        DBSession.add(assoc)
        DBSession.flush()
        for source in row["Source"]:
            DBSession.add(CognatesetCounterpartReference(
                cognatesetcounterpart_pk=assoc.pk,
                source=bibliography[source]))
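The cognateset is named after the median element of its sorted form list; a standalone illustration of that naming rule (made-up forms):

forms = ["haus", "huis", "hus"]
forms.sort()
print(forms[len(forms) // 2])  # -> huis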
Example #8

def main(args):
    datadir = '/home/robert/venvs/glottobank/lexibank'

    with transaction.manager:
        dataset = common.Dataset(
            id=lexibank.__name__,
            name="LexiBank",
            publisher_name="Max Planck Institute for the Science of Human History",
            publisher_place="Jena",
            publisher_url="http://shh.mpg.de",
            license="http://creativecommons.org/licenses/by/4.0/",
            domain='lexibank.clld.org',
            contact='*****@*****.**',
            jsondata={
                'license_icon': 'cc-by.png',
                'license_name': 'Creative Commons Attribution 4.0 International License'})
        DBSession.add(dataset)

    for provider in [
        'transnewguinea',
        'abvd',
        'ids',
    ]:
        import_cldf(os.path.join(datadir, provider, 'cldf'), provider)

    with transaction.manager:
        load_families(Data(), DBSession.query(LexibankLanguage), isolates_icon='tcccccc')
Example #9

def main(args):
    data = Data()
    dataset = common.Dataset(
        id=culturebank.__name__,
        name="CultureBank",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='culturebank.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'No license yet'}) # Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    import_features_collaborative_sheet(CULTUREBANK_REPOS, data)
    import_cldf(os.path.join(CULTUREBANK_REPOS, 'datasets'), data)
    ##import_cldf("C:\\python27\\dbs\\bwohh\\", data, add_missing_features = True)

    load_families(
        data,
        list(data['CulturebankLanguage'].values()),
        isolates_icon='tcccccc')

Example #10

def update_providers(args, verbose=False):
    filepath = args.data_dir.joinpath('references', 'bibtex', 'BIBFILES.ini')
    p = RawConfigParser()
    with io.open(filepath, encoding='utf-8-sig') as fp:
        p.read_file(fp)

    provider_map = get_map(Provider)
    for section in p.sections():
        sectname = section[:-4] if section.endswith('.bib') else section
        id_ = slug(sectname)
        attrs = {
            'name': p.get(section, 'title'),
            'description': p.get(section, 'description'),
            'abbr': p.get(section, 'abbr'),
        }
        if id_ in provider_map:
            provider = provider_map[id_]
            for a in list(attrs):
                before, after = getattr(provider, a), attrs[a]
                if before == after:
                    del attrs[a]
                else:
                    setattr(provider, a, after)
                    attrs[a] = (before, after)
            if attrs:
                args.log.info('updating provider %s %s' % (slug(id_), sorted(attrs)))
            if verbose:
                for a, (before, after) in attrs.items():
                    before, after = (' '.join(_.split()) for _ in (before, after))
                    if before != after:
                        args.log.info('%s\n%r\n%r' % (a, before, after))
        else:
            args.log.info('adding provider %s' % slug(id_))
            DBSession.add(Provider(id=id_, **attrs))
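A self-contained sketch of the BIBFILES.ini layout the updater above appears to expect: one section per .bib file, with title, description and abbr options (the section content here is invented):

from configparser import RawConfigParser
from io import StringIO

sample = """[hh.bib]
title = Example bibliography
description = A hypothetical provider section
abbr = hh
"""
p = RawConfigParser()
p.read_file(StringIO(sample))
print(p.get('hh.bib', 'abbr'))  # -> hh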
Example #11

def testapp():
    from webtest import TestApp
    from clld.db.meta import DBSession, VersionedDBSession, Base
    from clld.db.models import common
    from clld_cognacy_plugin.models import Cognateset, Cognate

    def main():
        cfg = config.Configurator(settings={
            'sqlalchemy.url': 'sqlite://',
            'mako.directories': [
                'clld:web/templates',
                'clld_cognacy_plugin:templates'
            ]})
        cfg.include('clld.web.app')
        cfg.include('clld_cognacy_plugin')
        return cfg.make_wsgi_app()

    DBSession.remove()
    VersionedDBSession.remove()
    wsgi_app = main()
    Base.metadata.bind = DBSession.bind
    Base.metadata.create_all()
    DBSession.add(common.Dataset(id='1', name='test app', domain='example.org'))
    cs = Cognateset(id='1', name='cs: test')
    lang = common.Language(id='l', latitude=2, longitude=2)
    param = common.Parameter(id='l')
    vs = common.ValueSet(id='vs', language=lang, parameter=param)
    v = common.Value(id='v', name='abc', valueset=vs)
    DBSession.add(Cognate(cognateset=cs, counterpart=v))
    yield TestApp(wsgi_app)
Example #12

def main(args):
    data = Data()

    dataset = common.Dataset(
        id=cdk.__name__,
        name="CDK",
        description="Comprehensive Dictionary of Ket",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='cdk.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    DBSession.add(dataset)

    contrib = common.Contribution(id='ket', name=dataset.name)
    DBSession.add(contrib)
    for i, (id, name) in enumerate([
        ('kotorov', 'E.G. Kotorova'), ('nefedov', 'A.V. Nefedov'),
    ]):
        dataset.editors.append(
            common.Editor(contributor=common.Contributor(id=id, name=name), ord=i))

    ket = data.add(
        common.Language, 'ket',
        id='ket',
        name='Ket',
        latitude=63.76,
        longitude=87.55)
    add_language_codes(data, ket, 'ket', glottocode='kett1243')
    for abbr, name in DIALECTS.items():
        data.add(common.Language, abbr, id=abbr, name=name)

    with args.data_file('sources.txt').open(encoding='utf8') as fp:
        for i, chunk in enumerate(fp.read().split('\n\n\n')):
            try:
                id_, year, author, desc = chunk.split('\n')
            except ValueError:
                print(chunk)
                raise
            data.add(
                common.Source,
                id_,
                id=str(i + 1),
                name=id_,
                author=author,
                year=year,
                description=desc)

    with UnicodeReader(args.data_file('Ket_nouns_and_other_pos_table.docx.csv')) as reader:
        load(data, reader, ket, contrib, verbs=False)

    with UnicodeReader(args.data_file('Ket_verbs_table.docx.csv')) as reader:
        load(data, reader, ket, contrib)

    print('parsing examples problematic in %s cases' % len(PROBLEMS))
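The sources.txt parser above expects records of exactly four lines, separated by runs of blank lines; a made-up record illustrating the assumed layout:

chunk = "werner2002\n2002\nWerner, H.\nAn example source description"
id_, year, author, desc = chunk.split('\n')  # -> 'werner2002', '2002', ...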
Example #13

def import_sources(wordlist, contribution, contributors = {}):
    """Load the bibliography

    """
    contributions = {}
    by_name = {}
    for source in wordlist.sources.items():
        fields = source.entry.fields

        # Generate a citation from the source
        citation_contrib = None
        for role, people in source.entry.persons.items():
            if not people:
                continue
            names = " and ".join(map(str, people))
            fields[role] = names

            if not citation_contrib:
                if len(people) == 1:
                    citation_contrib = " ".join(people[0].last_names)
                elif len(people) == 2:
                    citation_contrib = "{:} & {:}".format(" ".join(people[0].last_names),
                                                          " ".join(people[1].last_names))
                else:
                    citation_contrib = "{:} et al.".format(" ".join(people[0].last_names))

        if citation_contrib:
            if fields.get("year"):
                name = "{:}, {:}".format(citation_contrib, fields["year"])
            else:
                name = "{:}".format(citation_contrib)
        else:
            title_like = fields.get("title") or fields.get("note")
            if fields.get("year"):
                name = "{:}, {:}".format(title_like, fields["year"])
            else:
                name = "{:}".format(title_like)
        if name in by_name:
            name = "{:}a".format(name)
        while name in by_name:
            name = name[:-1]+chr(ord(name[-1]) + 1)

        # create a contribution
        contrib = LexiRumahSource(
            id=source.id,
            name=name,
            bibtex_type=vars(EntryType).get(source.genre) or EntryType.misc,
            provider=contribution)
        for key, value in fields.items():
            if hasattr(contrib, key) and not getattr(contrib, key):
                setattr(contrib, key, value)
            else:
                contrib.jsondata[key] = value

        DBSession.add(contrib)
        contributions[source.id] = contrib
        by_name[name] = contrib

    return contributions
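The disambiguation loop above appends 'a' to a duplicate citation name and then bumps the final letter until the name is free; a standalone illustration with a made-up registry:

by_name = {"Smith, 2001": 1, "Smith, 2001a": 1}
name = "Smith, 2001"
if name in by_name:
    name = "{:}a".format(name)
while name in by_name:
    name = name[:-1] + chr(ord(name[-1]) + 1)
print(name)  # -> Smith, 2001b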
Example #14

    def test_Dataset(self):
        from clld import RESOURCES
        from clld.db.models.common import Dataset, Source

        d = Dataset(id='abc', domain='test')
        DBSession.add(d)
        DBSession.flush()
        d.get_stats(RESOURCES, source=Source.id == None)
Example #15

    def test_CustomModelMixin(self):
        from clld.tests.fixtures import CustomLanguage

        DBSession.add(CustomLanguage(id='abc', name='Name', custom='c'))
        DBSession.flush()
        for lang in DBSession.query(Language).filter(Language.id == 'abc'):
            self.assertEqual(lang.custom, 'c')
            break
Example #18

def main(args):  # pragma: no cover
    global MAX_IDENTIFIER_PK

    with transaction.manager:
        MAX_IDENTIFIER_PK = DBSession.query(
            Identifier.pk).order_by(desc(Identifier.pk)).first()[0]

        gl_name = glottolog_name()
        gl_names = glottolog_names()

        languoids = {l.pk: l for l in DBSession.query(Languoid)}
        for attrs in jsonload(args.data_dir.joinpath('languoids', 'changes.json')):
            replacement = attrs.pop('replacement', None)
            hname = attrs.pop('hname', None)

            for name, enum in [('level', LanguoidLevel), ('status', LanguoidStatus)]:
                if name in attrs:
                    attrs[name] = enum.from_string(attrs[name])

            l = languoids.get(attrs['pk'])
            if l:
                for k, v in attrs.items():
                    setattr(l, k, v)
                #
                # We do not assign ISO codes for existing languages, because it could be
                # that the ISO code is now assigned to a family node, due to a change
                # request, e.g. see https://github.com/clld/glottolog-data/issues/40
                #
                if len(l.hid or '') == 3 and not l.iso_code:
                    args.log.warn('Language with hid %s but no iso code!' % l.hid)
            else:
                l = Languoid(**attrs)
                DBSession.add(l)
                languoids[l.pk] = l

                if len(attrs.get('hid', '')) == 3:
                    create_identifier(
                        None, l, name=attrs['hid'], type=IdentifierType.iso.value)

                create_identifier(
                    gl_names.get(l.name),
                    l,
                    name=l.name,
                    description=gl_name.description,
                    type=gl_name.type)

            if hname:
                l.update_jsondata(hname=hname)

            if replacement:
                DBSession.add(Superseded(
                    languoid_pk=l.pk,
                    replacement_pk=replacement,
                    relation='classification update'))

            DBSession.flush()

        recreate_treeclosure()
Example #19

def test_Base_jsondata(db):
    l = Language(id='abc', name='Name')
    DBSession.add(l)
    DBSession.flush()
    l.update_jsondata(a=1)
    assert 'a' in l.jsondata
    l.update_jsondata(b=1)
    assert 'b' in l.jsondata and 'a' in l.jsondata
    assert 'b' in l.__json__(None)['jsondata']
Example #20

def test_JSONEncodedDict(db):
    l = Language(id='abc', name='Name', jsondata={'i': 2})
    DBSession.add(l)
    DBSession.flush()

    DBSession.expunge(l)
    for lang in DBSession.query(Language).filter(Language.id == 'abc'):
        assert lang.jsondata['i'] == 2
        break
Example #22

def create_identifier(identifier, l, **kw):
    global MAX_IDENTIFIER_PK
    if identifier is None:
        MAX_IDENTIFIER_PK += 1
        DBSession.add(Identifier(pk=MAX_IDENTIFIER_PK, id=str(MAX_IDENTIFIER_PK), **kw))
        pk = MAX_IDENTIFIER_PK
    else:
        pk = identifier.pk
    DBSession.add(LanguageIdentifier(language_pk=l.pk, identifier_pk=pk))
Example #23

def test_Data(db):
    from clld.db.models.common import Language, Language_data

    l = Language(id='abc', name='Name')
    l.data.append(Language_data(key='abstract', value='c'))
    DBSession.add(l)
    DBSession.flush()
    DBSession.refresh(l)
    assert l.datadict()['abstract'] == 'c'
Example #24

    def test_Data(self):
        from clld.db.models.common import Language, Language_data

        l = Language(id='abc', name='Name')
        l.data.append(Language_data(key='abstract', value='c'))
        DBSession.add(l)
        DBSession.flush()
        DBSession.refresh(l)
        self.assertEqual(l.datadict()['abstract'], 'c')
Example #25

def main(args):
    data = Data()

    dataset = common.Dataset(
        id=cognition.__name__,
        name="COSTATOL",
        description="Cognitive Structures across the Tree of Life",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='cognition.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    DBSession.add(dataset)

    #
    # TODO: add editors!
    #

    for rec in Database.from_file(args.data_file('sources.bib')):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    contrib = common.Contribution(id='costatol', name='COSTATOL')
    for datapoint in reader(args.data_file('values.csv'), delimiter=',', dicts=True):
        param = data['Parameter'].get(datapoint['cognitive capacity'])
        if not param:
            name = datapoint['cognitive capacity']
            param = data.add(common.Parameter, name, id=slug(name), name=name)

        species = data['Language'].get(datapoint['species'])
        if not species:
            name = datapoint['species']
            species = data.add(common.Language, name, id=slug(name), name=name)

        vid = '%s-%s' % (species.id, param.id)
        vs = data.add(
            common.ValueSet,
            vid,
            id=vid,
            language=species,
            parameter=param,
            contribution=contrib)
        data.add(common.Value, vid, id=vid, name=datapoint['value'], valueset=vs)
        match = source_pattern.match(datapoint['source'])
        if match:
            DBSession.add(common.ValueSetReference(
                valueset=vs,
                source=data['Source'][match.group('key')],
                description=match.group('pages')))

    for species in reader(args.data_file('species.csv'), delimiter=',', namedtuples=True):
        data['Language'][species.name].longitude = species.longitude
        data['Language'][species.name].latitude = species.latitude
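source_pattern is not shown above; a hypothetical pattern matching references like 'smith2001: 12-15' with the named groups 'key' and 'pages' that the code reads:

import re

source_pattern = re.compile(r'(?P<key>[^:]+):\s*(?P<pages>.+)')
m = source_pattern.match('smith2001: 12-15')
print(m.group('key'), m.group('pages'))  # -> smith2001 12-15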
Example #26

def migrate(from_, to_, converter):  # pragma: no cover
    for row in DB.execute("select * from %s" % from_):
        res = converter(row)
        if not res:
            continue
        if isinstance(res, dict):
            DBSession.add(to_(**res))
        else:
            data.add(to_, res[0], **res[1])
    DBSession.flush()
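A hypothetical use of the migrate helper above (DB, data and the target model are globals it assumes): copy rows from a legacy language table into clld Language objects, skipping rows without a name.

def language_converter(row):
    # Return None to skip a row, or a kwargs dict for the target model.
    if not row['name']:
        return None
    return dict(id=row['id'], name=row['name'])

# migrate('language', common.Language, language_converter)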
Example #27

def test_Base(db):
    l = Language(id='abc', name='Name')
    DBSession.add(l)
    DBSession.flush()
    DBSession.expunge(l)
    l = Language.get('abc', session=DBSession)
    assert l.name == 'Name'

    Language().__str__()
    assert repr(l) == "<Language 'abc'>"
Example #29

    def test_compute_language_sources(self):
        from clld.db.models.common import Source, Sentence, Language, SentenceReference
        from clld.db.meta import DBSession
        from clld.db.util import compute_language_sources

        s = Sentence(id='sentenced', language=Language(id='newlang'))
        sr = SentenceReference(sentence=s, source=Source.first())
        DBSession.add(sr)
        DBSession.flush()
        compute_language_sources()
Example #30

    def add_morpheme_reference(morpheme, source_string):
        bib_key, pages = get_key_and_page(source_string)
        if bib_key in data["Source"]:
            source = data["Source"][bib_key]
            DBSession.add(models.MorphemeReference(
                morpheme=morpheme,
                source=source,
                key=source.id,
                description=pages.replace("--", "–")))
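get_key_and_page is assumed above; a plausible sketch that splits a source string like 'meier2004[13-15]' into a bibliography key and a page range:

def get_key_and_page(source_string):
    # Hypothetical format: optional page range in square brackets.
    if '[' in source_string:
        key, _, pages = source_string.partition('[')
        return key, pages.rstrip(']')
    return source_string, ''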
Example #31

def import_features(cldf, contributors):  # pragma: no cover
    """
    ? = gray cbbbbbb (is ? mapped? if not then don't worry)
    0 = blue c0077bb
    1 = red ccc3311
    2 = teal c009988
    3 = orange cee7733
    """
    features, codes = {}, {}
    icons = [
        'cffffff',  # 'c0077bb'
        'cff0000',  # 'ccc3311'
        'c0000ff',  # 'c009988'
        'cffff00',  # 'cee7733'
    ]
    domains = {}
    for fid, des in itertools.groupby(
            sorted(cldf['CodeTable'], key=lambda c: c['Parameter_ID']),
            lambda c: c['Parameter_ID']):
        domains[fid] = list(des) + [
            dict(ID=fid + '-NA', Name='?', Description='Not known')
        ]

    for feature in tqdm(list(cldf['ParameterTable']), desc='loading features'):
        fid = feature['ID']
        f = Feature(
            id=fid,
            name=feature['Name'],
            description=feature['Description'],
        )
        for ord_, patron in enumerate(feature['Patrons'], start=1):
            DBSession.add(
                FeaturePatron(ord=ord_,
                              feature=f,
                              contributor_pk=contributors[patron]))
        for code in domains[fid]:
            if code['Name'] == '?':
                icon, number, value = 'tcccccc', 999, None
            else:
                icon, number, value = icons[int(code['Name'])], int(
                    code['Name']), code['Name']
            DomainElement(id=code['ID'],
                          parameter=f,
                          name=code['Name'],
                          number=number,
                          description=code['Description'],
                          jsondata=dict(icon=icon))
        DBSession.add(f)
        DBSession.flush()
        features[fid] = f.pk
        for de in f.domain:
            codes[de.id] = de.pk

    return features, codes
Example #32

def _addSource(lp):
    """For a lighter 'main' function."""

    DBSession.add(
        common.Source(id=lp[0],
                      name=lp[0],
                      author=lp[2],
                      year=lp[3],
                      title=lp[4],
                      url=lp[5],
                      note=lp[6]))
    DBSession.flush()
Example #33

def update(args):
    pid, cid = 'vitality', 'unesco'
    count = 0
    notfound = {}
    contrib = common.Contribution.get(cid, default=None)
    if not contrib:
        contrib = common.Contribution(
            id=cid,
            name='Atlas of the World’s Languages in Danger',
            description='Atlas of the World’s Languages in Danger, © UNESCO, http://www.unesco.org/culture/languages-atlas')
    param = common.Parameter.get(pid, default=None)
    if param is None:
        param = common.Parameter(
            id=pid,
            name='Degree of endangerment')
    domain = {de.name: de for de in param.domain}
    for i, spec in enumerate(VITALITY_VALUES):
        name, desc = spec
        if name not in domain:
            number = i + 1
            domain[name] = common.DomainElement(
                id='%s-%s' % (pid, number),
                name=name,
                description=desc,
                number=number,
                parameter=param)
    valuesets = {vs.id: vs for vs in param.valuesets}
    for item in reader(args.data_file(DATA_FILE), dicts=True):
        if item['ISO639-3 codes']:
            for code in item['ISO639-3 codes'].split(','):
                code = code.strip()
                lang = Languoid.get(code, key='hid', default=None)
                if lang:
                    count += 1
                    item['url'] = 'http://www.unesco.org/culture/languages-atlas/en/atlasmap/language-iso-%s.html' % code
                    lang.update_jsondata(unesco=item)
                    de = domain[item['Degree of endangerment']]
                    vsid = '%s-%s' % (pid, lang.id)
                    vs = valuesets.get(vsid)
                    if not vs:
                        vs = common.ValueSet(
                            id=vsid,
                            parameter=param,
                            contribution=contrib,
                            language=lang)
                        DBSession.add(common.Value(valueset=vs, name=de.name, domainelement=de))
                        valuesets[vsid] = vs
                    else:
                        vs.values[0].domainelement = de
                else:
                    notfound[code] = 1
    print('assigned', count, 'unesco urls')
    print('missing iso codes:', notfound)
Example #34

    def add(self, model, key, **kw):
        if list(kw.keys()) == ['_obj']:
            # if a single keyword parameter _obj is passed, we take it to be the object
            # which should be added to the session.
            new = kw['_obj']
        else:
            for k, v in self.defaults.items():
                kw.setdefault(k, v)
            new = model(**kw)
        self[model.mapper_name()][key] = new
        DBSession.add(new)
        return new
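Usage as it appears elsewhere in this collection (Examples #12 and #25): create-and-register an object in one step, or hand over a ready-made object via the single _obj keyword:

ket = data.add(common.Language, 'ket', id='ket', name='Ket')
src = data.add(common.Source, rec.id, _obj=bibtex2source(rec))  # rec: a parsed bibtex record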
Example #35

def main(args):
    glottocodes = {}
    if getuser() == "robert":
        glottocodes = glottocodes_by_isocode("postgresql://robert@/glottolog3")

    data = Data()
    dataset = common.Dataset(id=autotyp.__name__, name="AUTOTYP", description="AUTOTYP", domain="autotyp.clld.org")
    DBSession.add(dataset)

    bib = Database.from_file(args.data_file("LenaBib.bib"), lowercase=True)

    for i, spec in enumerate(
        [
            ("bickel", "Balthasar Bickel", "University of Zurich"),
            ("nichols", "Johanna Nichols", "University of California, Berkeley"),
        ]
    ):
        contributor = data.add(common.Contributor, spec[0], id=spec[0], name=spec[1])
        DBSession.add(common.Editor(dataset=dataset, ord=i + 1, contributor=contributor))

    for l in rows(
        args.data_file("backbone_09Jan2014_directexport.tab"), newline="\r", encoding="macroman", namedtuples=True
    ):
        # LID	language	ISO639.3.2013	stock	continent	area	latitude	longitude
        if l.stock not in data["Stock"]:
            stock = data.add(models.Stock, l.stock, id=slug(l.stock), name=l.stock)
        else:
            stock = data["Stock"][l.stock]

        if l.continent not in data["Continent"]:
            continent = data.add(models.Continent, l.continent, id=slug(l.continent), name=l.continent)
        else:
            continent = data["Continent"][l.continent]

        if l.area not in data["Area"]:
            area = data.add(models.Area, l.area, id=slug(l.area), name=l.area, continent=continent)
        else:
            area = data["Area"][l.area]

        lang = data.add(
            models.Languoid,
            l.LID,
            id=l.LID,
            name=l.language,
            latitude=coord(l.latitude),
            longitude=coord(l.longitude),
            stock=stock,
            area=area,
        )
        add_language_codes(data, lang, l.ISO639_3_2013, glottocodes=glottocodes)

    loader.case_alignment(args, data, bib)
    loader.inclusive_excusive(args, data, bib)
Example #37

    def test_Files(self):
        from clld.db.models.common import Language, Language_files

        if PY3:
            return  # pragma: no cover

        l = Language(id='abc', name='Name')
        assert l.iso_code is None
        l._files.append(Language_files(id='abstract'))
        DBSession.add(l)
        DBSession.flush()
        DBSession.refresh(l)
        f = l.files['abstract']
Example #39

    def add_refs(self, data, table, row, obj):
        if table == 'EntryTable':
            model, kw = models.WordReference, dict(word=obj)
        elif table == 'SenseTable':
            model, kw = models.MeaningReference, dict(meaning=obj)
        else:
            raise ValueError(table)
        refs_col = self.cldf.get((table, 'source'))
        if refs_col:
            for sid, context in map(self.cldf.sources.parse, row.get(refs_col.name, [])):
                if sid in data['DictionarySource']:
                    DBSession.add(model(
                        source=data['DictionarySource'][sid], description=context, **kw))
Example #40

def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    repos = Path(
        os.path.expanduser('~')).joinpath('venvs/lexirumah/lexirumah-data')

    with transaction.manager:
        dataset = common.Dataset(
            id=lexirumah.__name__,
            name="lexirumah",
            publisher_name=
            "Max Planck Institute for the Science of Human History",
            publisher_place="Jena",
            publisher_url="http://shh.mpg.de",
            license="http://creativecommons.org/licenses/by/4.0/",
            domain='lexirumah.model-ling.eu',
            contact='*****@*****.**',
            jsondata={
                'license_icon':
                'cc-by.png',
                'license_name':
                'Creative Commons Attribution 4.0 International License'
            })
        DBSession.add(dataset)

    glottolog_repos = Path(
        lexirumah.__file__).parent.parent.parent.parent.joinpath(
            'glottolog3', 'glottolog')
    languoids = {l.id: l for l in Glottolog(glottolog_repos).languoids()}
    concepticon = Concepticon(
        Path(lexirumah.__file__).parent.parent.parent.parent.joinpath(
            'concepticon', 'concepticon-data'))
    conceptsets = {c.id: c for c in concepticon.conceptsets.values()}

    skip = True
    for dname in sorted(repos.joinpath('datasets').iterdir(),
                        key=lambda p: p.name):
        #if dname.name == 'benuecongo':
        #    skip = False
        #if skip:
        #    continue
        if dname.is_dir() and dname.name != '_template':
            mdpath = dname.joinpath('cldf', 'metadata.json')
            if mdpath.exists():
                print(dname.name)
                import_cldf(dname, load(mdpath), languoids, conceptsets)

    with transaction.manager:
        load_families(Data(),
                      DBSession.query(LexiRumahLanguage),
                      glottolog_repos=glottolog_repos,
                      isolates_icon='tcccccc')
Example #41

    def add(self, model, key, **kw):
        if '.' in kw.get('id', ''):
            raise ValueError('Object id contains illegal character "."')
        if list(kw.keys()) == ['_obj']:
            # if a single keyword parameter _obj is passed, we take it to be the object
            # which should be added to the session.
            new = kw['_obj']
        else:
            for k, v in self.defaults.items():
                kw.setdefault(k, v)
            new = model(**kw)
        self[model.__name__][key] = new
        DBSession.add(new)
        return new
Example #42

def testapp():
    def main():
        cfg = config.Configurator(settings={
            'sqlalchemy.url': 'sqlite://',
            'mako.directories': ['clldmpg:templates', 'clld:web/templates']})
        cfg.include('clldmpg')
        return cfg.make_wsgi_app()

    DBSession.remove()
    wsgi_app = main()
    Base.metadata.bind = DBSession.bind
    Base.metadata.create_all()
    DBSession.add(common.Dataset(id='1', name='test app', domain='example.org'))
    yield ExtendedTestApp(wsgi_app)
Example #43

def prime_cache(args):

    # add number of data points per parameter
    for np in DBSession.query(models.NumberParameter, func.count(common.Parameter.pk)) \
            .join(common.Parameter) \
            .join(common.ValueSet) \
            .join(common.Value) \
            .group_by(models.NumberParameter.pk, common.Parameter.pk):
        np[0].count_of_datapoints = np[1]

    # add number of distinct varieties per parameter based on assigned glottocodes
    for np in DBSession.query(models.NumberParameter, func.count(common.Identifier.name)) \
            .join(common.ValueSet) \
            .join(common.Value) \
            .join(common.Language, common.ValueSet.language_pk == common.Language.pk) \
            .join(common.LanguageIdentifier) \
            .join(common.Identifier) \
            .filter(common.Identifier.type == common.IdentifierType.glottolog.value) \
            .group_by(models.NumberParameter.pk, common.Parameter.pk):
        np[0].count_of_varieties = np[1]

    # add number of data points of parameter "base"
    base_pk, cnt_base = DBSession.query(common.Parameter.pk, func.count(common.ValueSet.pk)) \
        .join(common.Parameter) \
        .filter(common.Parameter.name == 'Base') \
        .group_by(common.Parameter.pk).all()[0]
    for np in DBSession.query(models.Parameter) \
            .join(models.NumberParameter) \
            .filter(common.Parameter.pk == base_pk):
        np.count_of_datapoints = cnt_base
        break

    DBSession.query(LanguageTreeLabel).delete()
    DBSession.query(TreeLabel).delete()
    DBSession.query(Phylogeny).delete()

    langs = [l for l in DBSession.query(common.Language) if l.glottocode]

    newick, _ = tree(
        [l.glottocode for l in langs], gl_repos=gl_repos
    )

    phylo = Phylogeny(id="phy", name="glottolog global tree", newick=newick)

    for l in langs:
        LanguageTreeLabel(
            language=l, treelabel=TreeLabel(id=l.id, name=l.glottocode, phylogeny=phylo)
        )

    DBSession.add(phylo)
Example #44

def add_identifier(languoid, data, name, type, description, lang='en'):
    identifier = data['Identifier'].get((name, type, description, lang))
    if not identifier:
        identifier = data.add(common.Identifier,
                              (name, type, description, lang),
                              id='{0}-{1}-{2}-{3}'.format(
                                  slug(name), slug(type),
                                  slug(description or ''), lang),
                              name=name,
                              type=type,
                              description=description,
                              lang=lang)
    DBSession.add(
        common.LanguageIdentifier(language=languoid, identifier=identifier))
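A hypothetical call, assuming languoid is a Languoid and data is a Data cache: attach an alternative English name to the language.

add_identifier(languoid, data, 'Ket', 'name', 'alternative name', lang='en')  # hypothetical values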
Example #45

    def add(self, model, key, **kw):
        if '.' in kw.get('id', ''):
            raise ValueError('Object id contains illegal character "."')
        if list(kw.keys()) == ['_obj']:
            # if a single keyword parameter _obj is passed, we take it to be the object
            # which should be added to the session.
            new = kw['_obj']
        else:
            for k, v in self.defaults.items():
                kw.setdefault(k, v)
            new = model(**kw)
        self[model.mapper_name()][key] = new
        DBSession.add(new)
        return new
Example #46

def testapp():
    def main():
        cfg = config.Configurator(settings={
            'sqlalchemy.url': 'sqlite://',
            'mako.directories': ['clldlucl:templates', 'clld:web/templates']})
        cfg.include('clldlucl')
        return cfg.make_wsgi_app()

    DBSession.remove()
    wsgi_app = main()
    Base.metadata.bind = DBSession.bind
    Base.metadata.create_all()
    DBSession.add(common.Dataset(id='1', name='test app', domain='example.org'))
    yield ExtendedTestApp(wsgi_app)
Example #47

def test_Files(db, tmppath):
    from clld.db.models.common import Sentence, Sentence_files

    l = Sentence(id='abc', name='Name')
    f = Sentence_files(object=l, id='abstract', mime_type='audio/mpeg')
    p = f.create(tmppath, 'content')
    assert Path(p).exists()

    l._files.append(f)
    DBSession.add(l)
    DBSession.flush()
    DBSession.refresh(l)
    assert l.files
    assert l.audio
Example #48

def test_Files(db, tmppath):
    from clld.db.models.common import Sentence, Sentence_files

    l = Sentence(id='abc', name='Name')
    f = Sentence_files(object=l, id='abstract', mime_type='audio/mpeg')
    p = f.create(Path(tmppath), 'content')
    assert Path(p).exists()

    l._files.append(f)
    DBSession.add(l)
    DBSession.flush()
    DBSession.refresh(l)
    assert l.files
    assert l.audio
Example #49

    def add(self, model, key, **kw):
        if "." in kw.get("id", ""):
            raise ValueError('Object id contains illegal character "."')
        if list(kw.keys()) == ["_obj"]:
            # if a single keyword parameter _obj is passed, we take it to be the object
            # which should be added to the session.
            new = kw["_obj"]
        else:
            for k, v in self.defaults.items():
                kw.setdefault(k, v)
            new = model(**kw)
        self[model.__name__][key] = new
        DBSession.add(new)
        return new
Example #50

def add_language_codes(data, lang, isocode, glottocodes=None):
    def identifier(type_, id_):
        return data.add(
            common.Identifier, '%s:%s' % (type_, id_),
            id='%s:%s' % (type_, id_),
            name=id_,
            type=getattr(common.IdentifierType, type_).value)

    if isocode and len(isocode) == 3:
        DBSession.add(common.LanguageIdentifier(
            language=lang, identifier=identifier('iso', isocode)))

        if glottocodes and isocode in glottocodes:
            DBSession.add(common.LanguageIdentifier(
                language=lang, identifier=identifier('glottolog', glottocodes[isocode])))
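Usage mirroring Example #35 above, with the ket/kett1243 code pair seen in Example #12: attach an ISO 639-3 identifier and, since the mapping is known, a Glottolog identifier to a new language.

add_language_codes(data, lang, 'ket', glottocodes={'ket': 'kett1243'})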
Example #51

def test_CsvMixin(db):
    l1 = Language(id='abc', name='Name', latitude=12.4, jsondata=dict(a=None))
    DBSession.add(l1)
    DBSession.flush()
    l1 = Language.csv_query(DBSession).first()
    cols = l1.csv_head()
    row = l1.to_csv()
    for k, v in zip(cols, row):
        if k == 'jsondata':
            assert 'a' in json.loads(v)
    l2 = Language.from_csv(row)
    assert pytest.approx(l1.latitude) == l2.latitude
    row[cols.index('latitude')] = '3,5'
    l2 = Language.from_csv(row)
    assert l2.latitude < l1.latitude
Example #53

    def test_CsvMixin(self):
        l1 = Language(id='abc', name='Name', latitude=12.4, jsondata=dict(a=None))
        DBSession.add(l1)
        DBSession.flush()
        l1 = Language.csv_query(DBSession).first()
        cols = l1.csv_head()
        row = l1.to_csv()
        for k, v in zip(cols, row):
            if k == 'jsondata':
                self.assertIn('a', json.loads(v))
        l2 = Language.from_csv(row)
        assert_almost_equal(l1.latitude, l2.latitude)
        row[cols.index('latitude')] = '3,5'
        l2 = Language.from_csv(row)
        self.assertLess(l2.latitude, l1.latitude)
Example #54

def main(args):
    data = Data()

    # fetch language data from glottolog:
    glottolog = glottocodes_by_isocode(
        'postgresql://robert@/glottolog3', ['id', 'name', 'latitude', 'longitude'])

    dataset = common.Dataset(
        id=jcld.__name__,
        name="Journal of Cross-Linguistic Databases",
        domain='jcld.clld.org')
    DBSession.add(dataset)

    contribution = data.add(common.Contribution, '1', id='1', name='fb')

    for i, row in enumerate(reader(open(args.data_file('fb_jcld.tab')), namedtuples=True, encoding='latin1')):
        if row.Feature not in data['Parameter']:
            parameter = data.add(common.Parameter, row.Feature, id='1', name=row.Feature)
        else:
            parameter = data['Parameter'][row.Feature]

        if row.Value not in data['DomainElement']:
            de = data.add(
                common.DomainElement, row.Value,
                id='%s-%s' % (parameter.id, slug(row.Value)), parameter=parameter, name=row.Value)
        else:
            de = data['DomainElement'][row.Value]

        if row.Language not in data['Language']:
            if row.Language not in glottolog:
                print('--->', row.Language)
                continue
            glottocode, name, lat, lon = glottolog[row.Language]
            language = data.add(
                common.Language, row.Language,
                id=slug(row.Language), name=name, latitude=lat, longitude=lon)
        else:
            language = data['Language'][row.Language]

        id_ = str(i + 1)  #'%s-%s' % (parameter.id, language.id)
        vs = common.ValueSet(
            id=id_,
            parameter=parameter,
            language=language,
            contribution=contribution,
            description=row.Comment,
            source=row.Source)
        common.Value(valueset=vs, name=row.Value, domainelement=de)
Example #55

    def add_file(self, type_, checksum, file_cls, obj):
        if checksum in self.cdstar:
            jsondata = {k: v for k, v in self.props.get(type_, {}).items()}
            jsondata.update(self.cdstar[checksum])
            f = file_cls(
                id='%s-%s' % (obj.id, checksum),
                name=self.cdstar[checksum]['original'],
                object_pk=obj.pk,
                mime_type=self.cdstar[checksum]['mimetype'],
                jsondata=jsondata)
            DBSession.add(f)
            DBSession.flush()
            DBSession.refresh(f)
            return
        print('{0} file missing: {1}'.format(type_, checksum))
Example #56

def add_values(data, dblang, pid, values, with_de=True, **vskw):
    vs = None
    for i, (vid, vname) in enumerate(values):
        if i == 0:
            vs = common.ValueSet(
                id=idjoin(pid, dblang.id),
                language=dblang,
                parameter=data['Parameter'][pid],
                contribution=data['Contribution']['glottolog'],
                **vskw)
        vkw = dict(id=idjoin(pid, slug(vid), dblang.id),
                   name=vname,
                   valueset=vs)
        if with_de:
            vkw['domainelement'] = data['DomainElement'][pid, vid]
        DBSession.add(common.Value(**vkw))
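A hypothetical call, assuming data['Parameter'], data['Contribution'] and data['DomainElement'] were filled beforehand: one valueset holding two values for a parameter 'aes' (the pid and codes are made up):

add_values(data, dblang, 'aes',
           [('1', 'not endangered'), ('2', 'threatened')])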
Example #57

def _addEditor(dataset, count, lp):
    """For a lighter 'main' function."""
    eds = ['Frank Seifart', 'Ludger Paschen', 'Matthew Stave']
    ed = dorEditor(id=lp[0],
                   name=lp[0],
                   url=lp[1],
                   email=lp[2],
                   address=lp[3],
                   team=lp[4],
                   function=lp[5])
    if lp[0] in eds:
        common.Editor(dataset=dataset, contributor=ed, ord=count + 1)
        count += 1
    DBSession.add(ed)
    DBSession.flush()
    return dataset, count
Example #58

def add_identifier(languoid, data, name, type, description, lang='en'):
    if len(lang) > 3:
        # Weird stuff introduced via hhbib_lgcode names. Roll back language parsing.
        name, lang = '{0} [{1}]'.format(name, lang), 'en'
    identifier = data['Identifier'].get((name, type, description, lang))
    if not identifier:
        identifier = data.add(
            common.Identifier,
            (name, type, description, lang),
            id='{0}-{1}-{2}-{3}'.format(
                slug(name), slug(type), slug(description or ''), lang),
            name=name,
            type=type,
            description=description,
            lang=lang)
    DBSession.add(common.LanguageIdentifier(language=languoid, identifier=identifier))
Example #59

def _addText(lp):
    """For a lighter 'main' function and because of checks."""

    for a in range(1, len(lp)):
        if a == 18:
            if lp[a] == "no":
                lp[a] = False
            else:
                lp[a] = True
        elif a == 17:
            if not lp[a] or str(lp[a]).startswith("check"):
                lp[a] = 0
        elif a == 9:
            genre = lp[9].lower()
            if genre == "personal narrative":
                genre = "pers. narr."
            elif genre == "traditional narrative":
                genre = "trad. narr."
            elif genre == "conversation":
                genre = "convers."
            elif genre == "stimulus-based":
                genre = "stimulus"
            lp[9] = genre
        elif not lp[a]:
            lp[a] = 'na'
    DBSession.add(
        doreContrib(id=lp[1],
                    tname=lp[2],
                    spks=lp[3],
                    spks_age=lp[4],
                    spks_agec=lp[5],
                    spks_sex=lp[6],
                    recdate=lp[7],
                    recdatec=lp[8],
                    genre=lp[9],
                    subgenre=lp[10],
                    gloss=lp[11],
                    transl=lp[12],
                    sound=lp[13],
                    overlap=lp[14],
                    process=lp[15],
                    NAK=lp[16],
                    glottocode=lp[0],
                    words=lp[17],
                    extended=lp[18]))
    DBSession.flush()