def upgrade():
    conn = Connection(op.get_bind())
    example_map = {}

    # insert the new Lingala examples (sentence ids continue from 204)
    sid = 204
    for example in jsonload(data_file('lingala_examples.json')):
        sid += 1
        kw = {
            'id': '60-%s' % sid,
            'language_pk': conn.pk(Language, '60'),
            'name': example['Text'],
            'description': example['Translation'],
            'gloss': '\t'.join(example['Gloss'].split()),
            'analyzed': '\t'.join(example['Text'].split()),
            'type': example['Type'].strip().lower(),
            'jsondata': {'sort': int(example['Order_number']), 'alt_translation': None}
        }
        example_map[example['Example_number']] = conn.insert(Sentence, **kw)

    # link each value to its illustrating sentences
    for ve in jsonload(data_file('lingala_value_examples.json')):
        vspk = conn.pk(ValueSet, '60-%s' % ve['Features::Feature_number'])
        vpk = conn.pk(Value, vspk, attr='valueset_pk')
        conn.insert(
            ValueSentence, value_pk=vpk, sentence_pk=example_map[ve['Example_number']])

    # store each value-assignment comment as the valueset's description
    for comment in reader(
            data_file('lingala_valueset_comments.tab'), delimiter='\t', dicts=True):
        vspk = conn.pk(ValueSet, '60-%s' % comment['Features::Feature_number'])
        # normalize vertical tabs in the export to real newlines
        comment['Comments_on_value_assignment'] = \
            comment['Comments_on_value_assignment'].replace('\x0b', '\n')
        conn.update(
            ValueSet,
            {
                'description': comment['Comments_on_value_assignment'],
                'markup_description': None,
            },
            pk=vspk)
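Note: the snippets on this page assume two small helpers. A minimal sketch, assuming jsonload simply parses a JSON file from a path (as in clldutils.jsonlib) and data_file resolves a name inside the project's data directory (the layout below is hypothetical):

import json
from pathlib import Path

def jsonload(path):
    """Parse a JSON file and return the deserialized object."""
    with open(path, encoding='utf-8') as fp:
        return json.load(fp)

def data_file(*comps):
    # assumption: data files live in a 'data' directory next to this script
    return Path(__file__).parent.joinpath('data', *comps)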
Example #2
def load_ecoregions(data_file, data):
    ecoregions = jsonload(data_file('ecoregions.json'))['features']

    biome_map = {
        1: ('Tropical & Subtropical Moist Broadleaf Forests', '008001'),
        2: ('Tropical & Subtropical Dry Broadleaf Forests', '557715'),
        3: ('Tropical & Subtropical Coniferous Forests', ''),
        4: ('Temperate Broadleaf & Mixed Forests', ''),
        5: ('Temperate Conifer Forests', ''),
        6: ('Boreal Forests/Taiga', ''),
        7: ('Tropical & Subtropical Grasslands, Savannas & Shrublands', '98ff66'),
        8: ('Temperate Grasslands, Savannas & Shrublands', ''),
        9: ('Flooded Grasslands & Savannas', '0265fe'),
        10: ('Montane Grasslands & Shrublands', 'cdffcc'),
        11: ('Tundra', ''),
        12: ('Mediterranean Forests, Woodlands & Scrub', 'cc9900'),
        13: ('Deserts & Xeric Shrublands', 'feff99'),
        14: ('Mangroves', '870083'),
    }

    for eco_code, features in groupby(
            sorted(ecoregions, key=lambda e: e['properties']['eco_code']),
            key=lambda e: e['properties']['eco_code']):
        features = list(features)
        props = features[0]['properties']
        if int(props['BIOME']) not in biome_map:
            continue
        biome = data['Biome'].get(props['BIOME'])
        if not biome:
            name, color = biome_map[int(props['BIOME'])]
            biome = data.add(Biome,
                             props['BIOME'],
                             id=str(int(props['BIOME'])),
                             name=name,
                             description=color or 'ffffff')
        # use the largest feature's polygon to place a representative center
        centroid = (None, None)
        f = sorted(features, key=lambda _f: _f['properties']['AREA'])[-1]
        if f['geometry']:
            coords = f['geometry']['coordinates'][0]
            if f['geometry']['type'] == 'MultiPolygon':
                coords = coords[0]
            centroid = get_center(coords)

        polygons = nfilter([_f['geometry'] for _f in features])
        data.add(Ecoregion,
                 eco_code,
                 id=eco_code,
                 name=props['ECO_NAME'],
                 description=props['G200_REGIO'],
                 latitude=centroid[1],
                 longitude=centroid[0],
                 biome=biome,
                 area=props['area_km2'],
                 gbl_stat=Ecoregion.gbl_stat_map[int(props['GBL_STAT'])],
                 realm=Ecoregion.realm_map[props['REALM']],
                 jsondata=dict(polygons=polygons))
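The ecoregion loader above leans on two helpers. A sketch under assumptions: nfilter keeps only truthy items (matching clldutils.misc.nfilter), while get_center is assumed to average the (lon, lat) pairs of the largest polygon ring:

def nfilter(seq):
    """Return the truthy items of seq as a list."""
    return [item for item in seq if item]

def get_center(coords):
    # coords: a ring of (lon, lat) pairs; a crude centroid by averaging
    lons, lats = zip(*coords)
    return sum(lons) / len(lons), sum(lats) / len(lats)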
Example #3
def stability(req):
    fs = jsonload(abspath_from_asset_spec('grambank:static/stability.json'))
    return {'data': fs}
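abspath_from_asset_spec comes from pyramid.asset and resolves a 'package:relative/path' asset spec to an absolute filesystem path; for example (the JSON file must actually exist in the installed package):

from pyramid.asset import abspath_from_asset_spec

path = abspath_from_asset_spec('grambank:static/stability.json')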
Example #4
def stability(req):
    fs = jsonload(abspath_from_asset_spec('culturebank:static/stability.json'))
    return {'data': fs}
Example #5
def dependency(req):
    deps = jsonload(abspath_from_asset_spec('grambank:static/dependencies.json'))
    return {'data': deps}
Example #6
def coverage(req):
    gl = jsonload(
        abspath_from_asset_spec('grambank:static/stats_by_macroarea.json'))

    stats = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    for ma in gl:
        for dt in gl[ma]:
            ids = gl[ma][dt]

            isolates = select([Language.__table__.c.id]).where(
                Language.__table__.c.id.in_(ids))
            families = select([Family.__table__.c.id]).where(
                Family.__table__.c.id.in_(ids))
            stats[ma][dt] = dict(
                glottolog=len(ids),
                grambank=DBSession.query(isolates.union(families).alias('u')).count())
        stats[ma]['total'] = {}
        for src in ['glottolog', 'grambank']:
            stats[ma]['total'][src] = \
                stats[ma]['grammar'][src] + stats[ma]['grammarsketch'][src]

    gl = jsonload(
        abspath_from_asset_spec(
            'grambank:static/stats_by_classification.json'))
    gb_langs = set([r[0] for r in DBSession.query(Language.id)])

    cstats = OrderedDict()
    for fid, spec in sorted(gl.items(), key=lambda k: k[1]['name']):
        d = dict(macroareas=spec['macroareas'],
                 grammar=Counter(),
                 grammarsketch=Counter(),
                 total=Counter(),
                 covered=gb_langs.intersection(set(spec['extension'])),
                 isolate=not bool(spec.get('subgroups')),
                 subgroups={})
        if not spec.get('subgroups'):
            # an isolate!
            d[spec['doctype']].update(['glottolog'])
            d['total'].update(['glottolog'])
            if gb_langs.intersection(set(spec['extension'])):
                d[spec['doctype']].update(['grambank'])
                d['total'].update(['grambank'])
        for sfid, sub in spec.get('subgroups', {}).items():
            if not sub.get('subgroups'):
                sub['name'] = '%s*' % sub['name']
            d[sub['doctype']].update(['glottolog'])
            d['total'].update(['glottolog'])
            if gb_langs.intersection(set(sub['extension'])):
                d[sub['doctype']].update(['grambank'])
                d['total'].update(['grambank'])
            sub_d = d['subgroups'][(sfid, sub['name'])] = dict(
                macroareas=spec['macroareas'],
                covered=gb_langs.intersection(set(sub['extension'])),
                grammar=Counter(),
                grammarsketch=Counter(),
                total=Counter())
            if not sub.get('subgroups'):
                # a language attached directly to the top-level family
                sub_d[sub['doctype']].update(['glottolog'])
                sub_d['total'].update(['glottolog'])
                if gb_langs.intersection(set(sub['extension'])):
                    sub_d[sub['doctype']].update(['grambank'])
                    sub_d['total'].update(['grambank'])
            for ssfid, ssub in sub.get('subgroups', {}).items():
                if ssub['doctype']:
                    sub_d[ssub['doctype']].update(['glottolog'])
                    sub_d['total'].update(['glottolog'])
                    if gb_langs.intersection(set(ssub['extension'])):
                        sub_d[ssub['doctype']].update(['grambank'])
                        sub_d['total'].update(['grambank'])
        cstats[(fid, spec['name'])] = d

    return dict(
        stats=stats,
        cstats=cstats,
        macroareas=jsonload(
            abspath_from_asset_spec('grambank:static/stats_macroareas.json')))
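The tallying in coverage() relies on a Counter idiom worth spelling out: Counter.update with a one-element list increments that single key, so each .update(['glottolog']) call above counts one more (sub)group:

from collections import Counter

c = Counter()
c.update(['glottolog'])   # {'glottolog': 1}
c.update(['glottolog'])   # {'glottolog': 2}
c.update(['grambank'])
assert c == Counter(glottolog=2, grambank=1)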
Example #7
def import_dataset(path, data, icons, add_missing_features=False):
    # look for metadata
    # look for sources
    # then loop over values
    
    dirpath, fname = os.path.split(path)
    basename, ext = os.path.splitext(fname)
    glottolog = Glottolog()

    try:
        contrib = CulturebankContribution(
            id=basename, name=basename, desc=glottolog.languoid(basename).name)
    except AttributeError:
        # glottolog.languoid() returns None for unknown codes, so .name raises
        print("Basename {:s} did not match a glottolog languoid, skipped.".format(basename))
        return

    md = {}
    mdpath = path + '-metadata.json'
    if os.path.exists(mdpath):
        md = jsonload(mdpath)
    contributor_name = HumanName(md.get('contributed_datapoint', 'Team NTS'))
    contributor_id = slug(contributor_name.last + contributor_name.first)
    contributor = data['Contributor'].get(contributor_id)
    if not contributor:
        contributor = data.add(
            Contributor,
            contributor_id,
            id=contributor_id,
            name='%s' % contributor_name)
    DBSession.add(ContributionContributor(contribution=contrib, contributor=contributor))

    bibpath = os.path.join(dirpath, basename + '.bib')
    if os.path.exists(bibpath):
        for rec in Database.from_file(bibpath):
            if rec['key'] not in data['Source']:
                data.add(Source, rec['key'], _obj=bibtex2source(rec))

    languages = {f['properties']['glottocode']: f for f in md.get('features', [])}

    for i, row in pandas.io.parsers.read_csv(
            path,
            sep=',' if 'c' in ext else '\t',
            encoding='utf-16').iterrows():
        if pandas.isnull(row['Value']) or pandas.isnull(row['Feature_ID']):
            print("Missing Value or Feature_ID in row: ", row)
            continue
        vsid = '%s-%s-%s' % (basename, row['Language_ID'], row['Feature_ID'])
        vid = row.get('ID', '%s-%s' % (basename, i + 1))

        parameter = data['Feature'].get(row['Feature_ID'])
        if parameter is None:
            if add_missing_features:
                parameter = data.add(Feature, row['Feature_ID'], id=row['Feature_ID'], name=row.get('Feature', row['Feature_ID']))
            else:
                print('skip value for invalid feature %s' % row['Feature_ID'])
                continue

        language = data['CulturebankLanguage'].get(row['Language_ID'])
        if language is None:
            # query glottolog!
            try:
                languoid = glottolog.languoid(row['Language_ID'])
            except AttributeError:
                print(('Skipping, no Glottocode found for %s' % row['Language_ID']))
                continue
            
            gl_md = {
                'name': languoid.name,
                'longitude': languoid.longitude,
                'latitude': languoid.latitude}
            lmd = languages.get(row['Language_ID'])
            if lmd:
                if lmd.get('properties', {}).get('name'):
                    gl_md['name'] = lmd['properties']['name']
                if lmd.get('geometry', {}).get('coordinates'):
                    gl_md['longitude'], gl_md['latitude'] = lmd['geometry']['coordinates']

            language = data.add(
                CulturebankLanguage, row['Language_ID'],
                id=row['Language_ID'],
                name=gl_md['name'],
                latitude=gl_md.get('latitude'),
                longitude=gl_md.get('longitude'))

        
        vs = data['ValueSet'].get(vsid)
        if vs is None:
            vs = data.add(
                ValueSet, vsid,
                id=vsid,
                parameter=parameter,
                language=language,
                contribution=contrib,
                source=row['Source'])

        domain = {de.abbr: de for de in parameter.domain}    
        name = row['Value']
        if name in domain:
            name = domain[name].name
        else:
            # pandas may have parsed the value as a number; retry its string form
            name = str(name)
            if name in domain:
                name = domain[name].name
            else:
                raise ValueError(
                    "For feature {:s} in language {:s}: Name {:s} not found among domain values {:}".format(
                        row['Feature_ID'],
                        row['Language_ID'],
                        name,
                        domain))

        data.add(
            Value, vid,
            id=vid,
            valueset=vs,
            name=name,
            description=row['Comment'],
            domainelement=domain.get(row['Value']))

        print(".", end="")
        if vs.source is not None:
            for key, src in list(data['Source'].items()):
                if key in vs.source:
                    ValueSetReference(valueset=vs, source=src, key=key)
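Contributor ids above come from slugging "last name + first name". A quick illustration, assuming nameparser.HumanName and clldutils.misc.slug behave as these projects import them:

from nameparser import HumanName
from clldutils.misc import slug  # lowercases and drops non-alphanumerics

name = HumanName('Team NTS')
print(slug(name.last + name.first))  # -> 'ntsteam'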
Example #8
def main(args):
    if DBSession.bind.dialect.name == 'postgresql':
        Index('ducet', collkey(common.Value.name)).create(DBSession.bind)

    def data_file(*comps):
        return Path(args.data_repos).joinpath('tsammalexdata', 'data', *comps)

    data = Data()
    data.add(
        common.Dataset,
        'tsammalex',
        id="tsammalex",
        name="Tsammalex",
        description="Tsammalex: A lexical database on plants and animals",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        domain='tsammalex.clld.org',
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    data.add(common.Contribution, 'tsammalex', name="Tsammalex", id="tsammalex")

    for rec in Database.from_file(data_file('sources.bib'), lowercase=True):
        data.add(models.Bibrec, rec.id, _obj=bibtex2source(rec, cls=models.Bibrec))

    load_ecoregions(data_file, data)
    load_countries(data)
    second_languages = {}

    def languoid_visitor(lang, row, _):
        add_language_codes(
            data, lang, lang.id.split('-')[0], None, glottocode=row[2] or None)
        second_languages[row[0]] = row[8]

    def habitat_visitor(cat, *_):
        cat.is_habitat = True

    def taxon_visitor(auto, taxon, *_):
        if auto.get(taxon.id):
            update_taxon_data(taxon, auto[taxon.id], data)
        else:
            print('--> missing in taxa.json:', taxon.id, taxon.name)
        taxon.countries_str = ' '.join([e.id for e in taxon.countries])
        taxon.ecoregions_str = ' '.join([e.id for e in taxon.ecoregions])

    auto = {s['id']: s for s in jsonload(data_file('taxa.json'))}
    for model, kw in [
        (models.Lineage, {}),
        (models.Use, {}),
        (models.TsammalexContributor, {}),
        (models.Languoid, dict(visitor=languoid_visitor)),
        (models.Category, dict(name='categories')),
        (models.Category, dict(name='habitats', visitor=habitat_visitor)),
        (models.Taxon, dict(visitor=partial(taxon_visitor, auto))),
        (models.Name, dict(filter_=lambda r: 'xxx' not in r[1])),
    ]:
        from_csv(data_file, model, data, **kw)

    for key, ids in second_languages.items():
        target = data['Languoid'][key]
        for lid in models.split_ids(ids):
            if lid in data['Languoid']:
                # we ignore 2nd languages which are not yet in Tsammalex.
                target.second_languages.append(data['Languoid'][lid])

    def image_url(source_url, type_):
        return re.sub(r'\.[a-zA-Z]+$', '.jpg', source_url).replace(
            '/original/', '/%s/' % type_)

    for fname in data_files(data_file, 'images.csv'):
        for image in reader(fname, namedtuples=True, delimiter=","):
            if image.taxa__id not in data['Taxon']:
                continue

            url = URL(image.source_url)
            if url.host() != 'edmond.mpdl.mpg.de':
                continue

            jsondata = dict(
                url=image.source_url,
                thumbnail=image_url(image.source_url, 'thumbnail'),
                web=image_url(image.source_url, 'web'))

            f = common.Parameter_files(
                object=data['Taxon'][image.taxa__id],
                id=image.id,
                name=image.tags,
                jsondata=jsondata,
                mime_type=image.mime_type)
            for k in 'source creator date place comments permission'.split():
                v = getattr(image, k)
                if v:
                    models.ImageData(key=k, value=v, image=f)
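For reference, the image_url helper above rewrites an "original" rendition URL to a derived one; a standalone check with a hypothetical URL (the helper is copied so the snippet runs on its own):

import re

def image_url(source_url, type_):
    return re.sub(r'\.[a-zA-Z]+$', '.jpg', source_url).replace(
        '/original/', '/%s/' % type_)

assert image_url('https://edmond.mpdl.mpg.de/original/abc.png', 'web') \
    == 'https://edmond.mpdl.mpg.de/web/abc.jpg'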
Example #9
def import_dataset(path,
                   data,
                   languoids,
                   invalid_features,
                   add_missing_features=False):
    # look for metadata
    # look for sources
    # then loop over values

    dirpath, fname = os.path.split(path)
    basename, ext = os.path.splitext(fname)

    contrib = GrambankContribution(id=basename,
                                   name=basename,
                                   desc=languoids[basename].name)

    md = {}
    mdpath = path + '-metadata.json'
    if os.path.exists(mdpath):
        md = jsonload(mdpath)
    contributor_name = HumanName(md.get('contributed_datapoint', 'Team NTS'))
    contributor_id = slug(contributor_name.last + contributor_name.first)
    contributor = data['Contributor'].get(contributor_id)
    if not contributor:
        contributor = data.add(Contributor,
                               contributor_id,
                               id=contributor_id,
                               name='%s' % contributor_name)
    DBSession.add(
        ContributionContributor(contribution=contrib, contributor=contributor))

    bibpath = os.path.join(dirpath, basename + '.bib')
    if os.path.exists(bibpath):
        for rec in Database.from_file(bibpath):
            if rec['key'] not in data['Source']:
                data.add(Source, rec['key'], _obj=bibtex2source(rec))

    languages = {
        f['properties']['glottocode']: f
        for f in md.get('features', [])
    }

    for i, row in enumerate(
            reader(path,
                   dicts=True,
                   quoting=csv.QUOTE_NONE,
                   delimiter=',' if 'c' in ext else '\t')):
        if not row['Value'] or not row['Feature_ID']:
            continue
        vsid = '%s-%s-%s' % (basename, row['Language_ID'], row['Feature_ID'])
        vid = row.get('ID', '%s-%s' % (basename, i + 1))

        parameter = data['Feature'].get(row['Feature_ID'])
        if parameter is None:
            if add_missing_features:
                parameter = data.add(Feature,
                                     row['Feature_ID'],
                                     id=row['Feature_ID'],
                                     name=row.get('Feature',
                                                  row['Feature_ID']))
            else:
                invalid_features.update([row['Feature_ID']])
                continue

        language = data['GrambankLanguage'].get(row['Language_ID'])
        if language is None:
            languoid = languoids.get(row['Language_ID'])
            if languoid is None:
                print('Skipping, no Glottocode found for %s' %
                      row['Language_ID'])
                continue

            gl_md = {
                'name': languoid.name,
                'longitude': languoid.longitude,
                'latitude': languoid.latitude
            }
            lmd = languages.get(row['Language_ID'])
            if lmd:
                if lmd.get('properties', {}).get('name'):
                    gl_md['name'] = lmd['properties']['name']
                if lmd.get('geometry', {}).get('coordinates'):
                    gl_md['longitude'], gl_md['latitude'] = lmd['geometry'][
                        'coordinates']

            language = data.add(GrambankLanguage,
                                row['Language_ID'],
                                id=row['Language_ID'],
                                name=gl_md['name'],
                                latitude=gl_md.get('latitude'),
                                longitude=gl_md.get('longitude'))

        domain = {de.abbr: de for de in parameter.domain}
        if not domain.get(row['Value']):
            # value not in the feature's domain; skip it
            continue

        vs = data['ValueSet'].get(vsid)
        if vs is None:
            vs = data.add(ValueSet,
                          vsid,
                          id=vsid,
                          parameter=parameter,
                          language=language,
                          contribution=contrib,
                          source=row['Source'])

        name = row['Value']
        if name in domain:
            name = domain[name].name

        data.add(Value,
                 vid,
                 id=vid,
                 valueset=vs,
                 name=name,
                 description=row['Comment'],
                 domainelement=domain.get(row['Value']))

        for key, src in data['Source'].items():
            if key in vs.source:
                ValueSetReference(valueset=vs, source=src, key=key)
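reader here is assumed to be clldutils.dsv.reader: with dicts=True it yields one dict per row keyed by the header line, and csv.QUOTE_NONE treats quote characters as ordinary text. A minimal usage sketch (values.csv is hypothetical):

import csv
from clldutils.dsv import reader

for row in reader('values.csv', dicts=True, quoting=csv.QUOTE_NONE, delimiter=','):
    print(row['Language_ID'], row['Feature_ID'], row['Value'])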
Example #10
def coverage(req):
    gl = jsonload(abspath_from_asset_spec('culturebank:static/stats_by_macroarea.json'))

    stats = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    for ma in gl:
        for dt in gl[ma]:
            ids = gl[ma][dt]

            isolates = select(
                [Language.__table__.c.id]).where(Language.__table__.c.id.in_(ids))
            families = select(
                [Family.__table__.c.id]).where(Family.__table__.c.id.in_(ids))
            stats[ma][dt] = dict(
                glottolog=len(ids),
                culturebank=DBSession.query(isolates.union(families).alias('u')).count())
        stats[ma]['total'] = {}
        for src in ['glottolog', 'culturebank']:
            stats[ma]['total'][src] = \
                stats[ma]['grammar'][src] + stats[ma]['grammarsketch'][src]

    gl = jsonload(abspath_from_asset_spec('culturebank:static/stats_by_classification.json'))
    gb_langs = set([r[0] for r in DBSession.query(Language.id)])

    cstats = OrderedDict()
    for fid, spec in sorted(list(gl.items()), key=lambda k: k[1]['name']):
        d = dict(
            macroareas=spec['macroareas'],
            grammar=Counter(),
            grammarsketch=Counter(),
            total=Counter(),
            covered=gb_langs.intersection(set(spec['extension'])),
            isolate=not bool(spec.get('subgroups')),
            subgroups={})
        if not spec.get('subgroups'):
            # an isolate!
            d[spec['doctype']].update(['glottolog'])
            d['total'].update(['glottolog'])
            if gb_langs.intersection(set(spec['extension'])):
                d[spec['doctype']].update(['culturebank'])
                d['total'].update(['culturebank'])
        for sfid, sub in list(spec.get('subgroups', {}).items()):
            if not sub.get('subgroups'):
                sub['name'] = '%s*' % sub['name']
            d[sub['doctype']].update(['glottolog'])
            d['total'].update(['glottolog'])
            if gb_langs.intersection(set(sub['extension'])):
                d[sub['doctype']].update(['culturebank'])
                d['total'].update(['culturebank'])
            d['subgroups'][(sfid, sub['name'])] = dict(
                macroareas=spec['macroareas'],
                covered=gb_langs.intersection(set(sub['extension'])),
                grammar=Counter(),
                grammarsketch=Counter(),
                total=Counter())
            if not sub.get('subgroups'):
                # a language attached directly to the top-level family
                d['subgroups'][(sfid, sub['name'])][sub['doctype']].update(['glottolog'])
                d['subgroups'][(sfid, sub['name'])]['total'].update(['glottolog'])
                if gb_langs.intersection(set(sub['extension'])):
                    d['subgroups'][(sfid, sub['name'])][sub['doctype']].update(['culturebank'])
                    d['subgroups'][(sfid, sub['name'])]['total'].update(['culturebank'])
            for ssfid, ssub in list(sub.get('subgroups', {}).items()):
                if ssub['doctype']:
                    d['subgroups'][(sfid, sub['name'])][ssub['doctype']].update(['glottolog'])
                    d['subgroups'][(sfid, sub['name'])]['total'].update(['glottolog'])
                    if gb_langs.intersection(set(ssub['extension'])):
                        d['subgroups'][(sfid, sub['name'])][ssub['doctype']].update(['culturebank'])
                        d['subgroups'][(sfid, sub['name'])]['total'].update(['culturebank'])
        cstats[(fid, spec['name'])] = d

    return dict(
        stats=stats,
        cstats=cstats,
        macroareas=jsonload(
            abspath_from_asset_spec('culturebank:static/stats_macroareas.json')))
Example #11
def main(args):
    def data_file(*comps):
        return Path(args.data_repos).joinpath('tsammalexdata', 'data', *comps)

    data = Data()
    data.add(
        common.Dataset,
        'tsammalex',
        id="tsammalex",
        name="Tsammalex",
        description="Tsammalex: A lexical database on plants and animals",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        domain='tsammalex.clld.org',
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'
        })
    data.add(common.Contribution,
             'tsammalex',
             name="Tsammalex",
             id="tsammalex")

    for rec in Database.from_file(data_file('sources.bib'), lowercase=True):
        data.add(models.Bibrec,
                 rec.id,
                 _obj=bibtex2source(rec, cls=models.Bibrec))

    load_ecoregions(data_file, data)
    load_countries(data)
    second_languages = {}

    def languoid_visitor(lang, row, _):
        add_language_codes(data,
                           lang,
                           lang.id.split('-')[0],
                           None,
                           glottocode=row[2] or None)
        second_languages[row[0]] = row[8]

    def habitat_visitor(cat, *_):
        cat.is_habitat = True

    def taxon_visitor(auto, taxon, *_):
        if auto.get(taxon.id):
            update_taxon_data(taxon, auto[taxon.id], data)
        else:
            print('--> missing in taxa.json:', taxon.id, taxon.name)
        taxon.countries_str = ' '.join([e.id for e in taxon.countries])
        taxon.ecoregions_str = ' '.join([e.id for e in taxon.ecoregions])

    auto = {s['id']: s for s in jsonload(data_file('taxa.json'))}
    for model, kw in [
        (models.Lineage, {}),
        (models.Use, {}),
        (models.TsammalexContributor, {}),
        (models.Languoid, dict(visitor=languoid_visitor)),
        (models.Category, dict(name='categories')),
        (models.Category, dict(name='habitats', visitor=habitat_visitor)),
        (models.Taxon, dict(visitor=partial(taxon_visitor, auto))),
        (models.Name, dict(filter_=lambda r: 'xxx' not in r[1])),
    ]:
        from_csv(data_file, model, data, **kw)

    for key, ids in second_languages.items():
        target = data['Languoid'][key]
        for lid in models.split_ids(ids):
            if lid in data['Languoid']:
                # we ignore 2nd languages which are not yet in Tsammalex.
                target.second_languages.append(data['Languoid'][lid])

    def image_url(source_url, type_):
        return re.sub(r'\.[a-zA-Z]+$', '.jpg',
                      source_url).replace('/original/', '/%s/' % type_)

    for fname in data_files(data_file, 'images.csv'):
        for image in reader(fname, namedtuples=True, delimiter=","):
            if image.taxa__id not in data['Taxon']:
                continue

            url = URL(image.source_url)
            if url.host() != 'edmond.mpdl.mpg.de':
                continue

            jsondata = dict(url=image.source_url,
                            thumbnail=image_url(image.source_url, 'thumbnail'),
                            web=image_url(image.source_url, 'web'))

            f = common.Parameter_files(object=data['Taxon'][image.taxa__id],
                                       id=image.id,
                                       name=image.tags,
                                       jsondata=jsondata,
                                       mime_type=image.mime_type)
            for k in 'source creator date place comments permission'.split():
                v = getattr(image, k)
                if v:
                    models.ImageData(key=k, value=v, image=f)