def upgrade():
    """Load Lingala (language '60') examples, link them to values, and
    attach valueset comments (alembic migration step)."""
    conn = Connection(op.get_bind())
    examples = {}

    # Sentence ids continue after the already existing ones: '60-205', '60-206', ...
    for sid, example in enumerate(jsonload(data_file('lingala_examples.json')), 205):
        examples[example['Example_number']] = conn.insert(
            Sentence,
            id='60-%s' % sid,
            language_pk=conn.pk(Language, '60'),
            name=example['Text'],
            description=example['Translation'],
            gloss='\t'.join(example['Gloss'].split()),
            analyzed='\t'.join(example['Text'].split()),
            type=example['Type'].strip().lower(),
            jsondata={'sort': int(example['Order_number']), 'alt_translation': None})

    # Associate the freshly inserted sentences with the values they illustrate.
    for ve in jsonload(data_file('lingala_value_examples.json')):
        vs_pk = conn.pk(ValueSet, '60-%s' % ve['Features::Feature_number'])
        conn.insert(
            ValueSentence,
            value_pk=conn.pk(Value, vs_pk, attr='valueset_pk'),
            sentence_pk=examples[ve['Example_number']])

    # Store comments as valueset descriptions; vertical tabs become newlines.
    for row in reader(data_file('lingala_valueset_comments.tab'), dicts=True):
        conn.update(
            ValueSet,
            {
                'description': row['Comments_on_value_assignment'].replace('\x0b', '\n'),
                'markup_description': None,
            },
            pk=conn.pk(ValueSet, '60-%s' % row['Features::Feature_number']))
# Exemple #2
def chapter(request):
    """Assemble the template context for a single Atlas chapter."""
    cid = request.matchdict['id']
    chapter_html = get_html(ppath('Atlas', '%s.html' % cid))

    def insert_value_table(vt):
        # Splice the rendered value table into the chapter's HTML.
        return chapter_html.replace('<p>value-table</p>', vt)

    return {
        'md': jsonload(ppath('Atlas', '%s.json' % cid)),
        'html': insert_value_table,
        'ctx': Feature.get(cid),
    }
def main(args):  # pragma: no cover
    """Apply Glottolog languoid changes from ``changes.json`` to the database.

    Existing languoids are updated in place; unknown ones are created
    (together with ISO and Glottolog name identifiers).  ``replacement``
    entries are recorded as ``Superseded`` relations, and the tree closure
    is rebuilt at the end.
    """
    global MAX_IDENTIFIER_PK

    with transaction.manager:
        # Highest identifier pk currently in use; presumably consumed by
        # create_identifier() when minting new pks -- TODO confirm.
        MAX_IDENTIFIER_PK = DBSession.query(
            Identifier.pk).order_by(desc(Identifier.pk)).first()[0]

        gl_name = glottolog_name()
        gl_names = glottolog_names()

        # Cache all languoids by pk so changes can be matched without
        # issuing one query per change record.
        languoids = {l.pk: l for l in DBSession.query(Languoid)}
        for attrs in jsonload(args.data_dir.joinpath('languoids', 'changes.json')):
            # 'replacement' and 'hname' are meta fields, not Languoid columns.
            replacement = attrs.pop('replacement', None)
            hname = attrs.pop('hname', None)

            # Convert string-valued enum fields to their enum members.
            for name, enum in [('level', LanguoidLevel), ('status', LanguoidStatus)]:
                if name in attrs:
                    attrs[name] = enum.from_string(attrs[name])

            l = languoids.get(attrs['pk'])
            if l:
                # Known languoid: overwrite its attributes with the new values.
                for k, v in attrs.items():
                    setattr(l, k, v)
                #
                # We do not assign ISO codes for existing languages, because it could be
                # that the ISO code is now assigned to a family node, due to a change
                # request, e.g. see https://github.com/clld/glottolog-data/issues/40
                #
                if len(l.hid or '') == 3 and not l.iso_code:
                    args.log.warn('Language with hid %s but no iso code!' % l.hid)
            else:
                # New languoid: create it and register identifiers.
                l = Languoid(**attrs)
                DBSession.add(l)
                languoids[l.pk] = l

                # A three-letter hid is (by convention here) an ISO 639-3 code.
                if len(attrs.get('hid', '')) == 3:
                    create_identifier(
                        None, l, name=attrs['hid'], type=IdentifierType.iso.value)

                create_identifier(
                    gl_names.get(l.name),
                    l,
                    name=l.name,
                    description=gl_name.description,
                    type=gl_name.type)

            if hname:
                l.update_jsondata(hname=hname)

            if replacement:
                # Record that this languoid was superseded by another one.
                DBSession.add(Superseded(
                    languoid_pk=l.pk,
                    replacement_pk=replacement,
                    relation='classification update'))

            DBSession.flush()

        recreate_treeclosure()
# Exemple #4
def load_ecoregions(data_file, data):
    """Load WWF ecoregion GeoJSON features into the database.

    Features are grouped by eco_code; each group becomes one Ecoregion,
    attached to a Biome created on demand from ``biome_map``.
    """
    features = jsonload(data_file('ecoregions.json'))['features']

    # Biome number -> (name, map color); empty color falls back to white.
    biome_map = {
        1: ('Tropical & Subtropical Moist Broadleaf Forests', '008001'),
        2: ('Tropical & Subtropical Dry Broadleaf Forests', '557715'),
        3: ('Tropical & Subtropical Coniferous Forests', ''),
        4: ('Temperate Broadleaf & Mixed Forests', ''),
        5: ('Temperate Conifer Forests', ''),
        6: ('Boreal Forests/Taiga', ''),
        7: ('Tropical & Subtropical Grasslands, Savannas & Shrublands', '98ff66'),
        8: ('Temperate Grasslands, Savannas & Shrublands', ''),
        9: ('Flooded Grasslands & Savannas', '0265fe'),
        10: ('Montane Grasslands & Shrublands', 'cdffcc'),
        11: ('Tundra', ''),
        12: ('Mediterranean Forests, Woodlands & Scrub', 'cc9900'),
        13: ('Deserts & Xeric Shrublands', 'feff99'),
        14: ('Mangroves', '870083'),
    }

    def eco_key(feature):
        return feature['properties']['eco_code']

    # groupby needs its input sorted by the same key.
    for eco_code, group in groupby(sorted(features, key=eco_key), key=eco_key):
        group = list(group)
        props = group[0]['properties']
        if int(props['BIOME']) not in biome_map:
            continue
        biome = data['Biome'].get(props['BIOME'])
        if not biome:
            # First ecoregion of this biome: create the Biome record.
            name, color = biome_map[int(props['BIOME'])]
            biome = data.add(
                Biome, props['BIOME'],
                id=str(int(props['BIOME'])),
                name=name,
                description=color or 'ffffff')

        # Derive a representative point from the largest polygon in the group.
        centroid = (None, None)
        largest = sorted(group, key=lambda g: g['properties']['AREA'])[-1]
        if largest['geometry']:
            coords = largest['geometry']['coordinates'][0]
            if largest['geometry']['type'] == 'MultiPolygon':
                coords = coords[0]
            centroid = get_center(coords)

        data.add(
            Ecoregion, eco_code,
            id=eco_code,
            name=props['ECO_NAME'],
            description=props['G200_REGIO'],
            latitude=centroid[1],
            longitude=centroid[0],
            biome=biome,
            area=props['area_km2'],
            gbl_stat=Ecoregion.gbl_stat_map[int(props['GBL_STAT'])],
            realm=Ecoregion.realm_map[props['REALM']],
            jsondata=dict(polygons=nfilter([g['geometry'] for g in group])))
def main(args):
    """Replace superseded sources per ``monster-replacements.json`` and
    delete the superseded records."""
    replacements = {
        (item['id'], item['replacement'])
        for item in jsonload(
            args.data_dir.joinpath('scripts', 'monster-replacements.json'))}

    with transaction.manager:
        for ref_id, repl_id in replacements:
            ref = Source.get('%s' % ref_id, default=None)
            if not ref:
                continue
            Config.add_replacement(
                ref, '%s' % repl_id, session=DBSession, model=Source)
            # FIXME: "redirect" relations, e.g. from valuesetreference as well!
            DBSession.delete(ref)
    args.log.info('%s replacements' % len(replacements))
# Exemple #6
def main(args):
    # Driver for the document pipeline; the action is selected by args.cmd.
    # NOTE(review): Python 2 only -- uses print statements and ``unicode``.
    if args.cmd == 'convert':
        # Convert Word source documents into the 'lo' staging directory.
        outdir = args.data_file('texts', args.what).joinpath('lo')
        if args.what == 'Atlas':
            for p in args.data_file('texts', args.what).joinpath('in').files():
                if p.ext in ['.doc', '.docx']:
                    convert_chapter(p, outdir)
        elif args.what == 'Surveys':
            pass
    if args.cmd == 'parse':
        # Run the parser class named by args.what over staged documents.
        outdir = args.data_file('texts', args.what).joinpath('processed')
        for p in args.data_file('texts', args.what).joinpath('lo').files():
            if args.in_name in p.namebase:
                globals()[args.what](p)(outdir)
    if args.cmd == 'refs':
        # Collect references from processed json files and report likely duplicates.
        refs = []
        for p in args.data_file(
                'texts', args.what).joinpath('processed').files('*.json'):
            if args.in_name in p.namebase:
                md = jsonload(p)
                refs.extend(md['refs'])
        db = get_bibtex(refs)
        unmatched = 0
        distinct = defaultdict(list)
        for i, rec in enumerate(db):
            if 'all' in rec:
                unmatched += 1
            # Deduplicate by (slugified key, slugified title); random uuid
            # fallbacks keep records without key/title from colliding.
            distinct[(slug(rec.get('key', unicode(uuid4().hex))),
                      slug(unicode(rec.get('title',
                                           uuid4().hex)),
                           remove_whitespace=False))] = 1
        print unmatched, 'of', i, 'distinct', len(distinct)

        c = 0
        # groupby requires sorted input; grouping is by slugified citation key.
        for key, refs in groupby(sorted(distinct.keys()), key=lambda t: t[0]):
            refs = list(refs)
            if len(refs) > 1:
                # Same key, different titles: print pairs that look near-identical.
                for t1, t2 in combinations([t[1] for t in refs], 2):
                    if fuzz.partial_ratio(t1, t2) > 80:
                        print t1
                        print t2
                        print
                        c += 1
        print c
        return
# Exemple #7
def main(args):
    # Near-duplicate of the pipeline driver above (same scrape source);
    # behavior is selected by args.cmd.
    # NOTE(review): Python 2 only -- uses print statements and ``unicode``.
    if args.cmd == 'convert':
        # Convert Word source documents into the 'lo' staging directory.
        outdir = args.data_file('texts', args.what).joinpath('lo')
        if args.what == 'Atlas':
            for p in args.data_file('texts', args.what).joinpath('in').files():
                if p.ext in ['.doc', '.docx']:
                    convert_chapter(p, outdir)
        elif args.what == 'Surveys':
            pass
    if args.cmd == 'parse':
        # Run the parser class named by args.what over staged documents.
        outdir = args.data_file('texts', args.what).joinpath('processed')
        for p in args.data_file('texts', args.what).joinpath('lo').files():
            if args.in_name in p.namebase:
                globals()[args.what](p)(outdir)
    if args.cmd == 'refs':
        # Collect references from processed json files and report likely duplicates.
        refs = []
        for p in args.data_file('texts', args.what).joinpath('processed').files('*.json'):
            if args.in_name in p.namebase:
                md = jsonload(p)
                refs.extend(md['refs'])
        db = get_bibtex(refs)
        unmatched = 0
        distinct = defaultdict(list)
        for i, rec in enumerate(db):
            if 'all' in rec:
                unmatched += 1
            # Deduplicate by (slugified key, slugified title); random uuid
            # fallbacks keep records without key/title from colliding.
            distinct[(
                slug(rec.get('key', unicode(uuid4().hex))),
                slug(unicode(rec.get('title', uuid4().hex)), remove_whitespace=False)
            )] = 1
        print unmatched, 'of', i, 'distinct', len(distinct)

        c = 0
        # groupby requires sorted input; grouping is by slugified citation key.
        for key, refs in groupby(sorted(distinct.keys()), key=lambda t: t[0]):
            refs = list(refs)
            if len(refs) > 1:
                # Same key, different titles: print pairs that look near-identical.
                for t1, t2 in combinations([t[1] for t in refs], 2):
                    if fuzz.partial_ratio(t1, t2) > 80:
                        print t1
                        print t2
                        print
                        c += 1
        print c
        return
# Exemple #8
def read(args, table, sortkey=None):
    """Read APiCS data from a json file created from filemaker's xml export.

    Yields one dict per row, with string values stripped.  For the
    'Features' table, rows are augmented from the 'Featuresp' and
    'Featuresv' companion exports.
    """
    def load(name):
        return jsonload(args.data_file('fm', '%s.json' % name))

    rows = load(table)

    if table == 'Features':
        # merge the data from two other sources:
        extra = [
            {r['Feature_number']: r for r in load(table + suffix)}
            for suffix in ('p', 'v')]
        for row in rows:
            for lookup in extra:
                row.update(lookup[row['Feature_number']])
    if sortkey:
        rows = sorted(rows, key=lambda row: row[sortkey])
    for row in rows:
        # NOTE(review): ``unicode`` makes this Python 2 only.
        for key, value in row.items():
            if isinstance(value, unicode):
                row[key] = value.strip()
        yield row
# Exemple #9
def main(args):
    # Export all sources to ../sources.csv, sorted by name.
    sources = jsonload(args.data_file('sources.json'))
    fields = ['href', 'name', 'author', 'iso', 'source', 'notes', 'wordlist']
    with UnicodeWriter(args.data_file('..', 'sources.csv')) as fp:
        fp.writerow(fields)
        for source in sorted(sources, key=lambda i: i['name']):
            fp.writerow([source.get(f, '') for f in fields])
    return
    # NOTE(review): everything below is unreachable because of the bare
    # ``return`` above -- apparently a deliberately disabled report of
    # doculects without any source; confirm intent before deleting.
    ethnologue_names = {
        r.ISO_639: r.Language_Name for r in reader(args.data_file(
        '..', '..', 'ethnologue-17-data', 'Table_of_Languages.tab'), namedtuples=True)}

    # ASJP name for language, Ethnologue's name, ISO code
    rows = [['ASJP Name', 'Ethnologue name', 'ISO code']]
    subquery = DBSession.query(LanguageSource.language_pk).distinct().subquery()
    for i, l in enumerate(DBSession.query(Doculect).order_by(Doculect.pk).filter(not_(Doculect.pk.in_(subquery)))):
        rows.append([l.id, ethnologue_names.get(l.code_iso, ''), l.code_iso or ''])
    #print i
    with UnicodeWriter(args.data_file('..', 'doculects_without_source.csv')) as fp:
        fp.writerows(rows)
# Exemple #10
def coverage(req):
    """Tally Glottolog vs. Grambank coverage per macroarea and document type.

    Returns ``{'stats': ...}`` where stats[macroarea][doctype] holds the
    count of Glottolog languages and the number of those present in
    Grambank (as isolates or families).
    """
    gl = jsonload(abspath_from_asset_spec('grambank:static/stats_glottolog.json'))

    stats = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    for macroarea in gl:
        for doctype in gl[macroarea]:
            ids = gl[macroarea][doctype]

            # Languages covered in Grambank are either isolates or families.
            isolate_q = select(
                [Language.__table__.c.id]).where(Language.__table__.c.id.in_(ids))
            family_q = select(
                [Family.__table__.c.id]).where(Family.__table__.c.id.in_(ids))
            stats[macroarea][doctype] = dict(
                glottolog=len(ids),
                grambank=DBSession.query(
                    isolate_q.union(family_q).alias('u')).count())
        # Totals combine the 'grammar' and 'grammarsketch' document types.
        stats[macroarea]['total'] = {
            src: (stats[macroarea]['grammar'][src]
                  + stats[macroarea]['grammarsketch'][src])
            for src in ['glottolog', 'grambank']}
    return dict(stats=stats)
# Exemple #11
def read(args, table, sortkey=None):
    """Read APiCS data from a json file created from filemaker's xml export.

    Yields one dict per row with string values stripped; 'Features' rows
    are merged with the 'Featuresp'/'Featuresv' companion exports first.
    """
    def load(name):
        return jsonload(args.data_file('fm', '%s.json' % name))

    records = load(table)

    if table == 'Features':
        # merge the data from two other sources:
        by_number = [
            {r['Feature_number']: r for r in load(table + suffix)}
            for suffix in ('p', 'v')]
        for record in records:
            for mapping in by_number:
                record.update(mapping[record['Feature_number']])
    if sortkey:
        records = sorted(records, key=lambda record: record[sortkey])
    for record in records:
        # NOTE(review): ``unicode`` makes this Python 2 only.
        for key, value in record.items():
            if isinstance(value, unicode):
                record[key] = value.strip()
        yield record
# Exemple #12
def survey(request):
    """Assemble the template context for a language survey chapter.

    Reads the survey's metadata and HTML, base64-encodes the survey's map
    images, splices figure images directly into the HTML, and returns the
    rendering context.
    """
    id_ = request.matchdict['id']
    md = jsonload(ppath('Surveys', '%s.json' % id_))
    html = get_html(ppath('Surveys', '%s.html' % id_))
    maps = []
    for fname in sorted(
            ppath('Surveys', processed='maps').files(
                            '%s*.png' % id_.split('.')[1].replace('-', '_')),
            key=lambda fn: fn.namebase):
        # Fix: the original leaked the file handle; close it deterministically.
        with open(fname, 'rb') as fp:
            img = b64encode(fp.read())
        if 'figure' in fname.namebase:
            # Figures are embedded inline as data URIs at their placeholder.
            html = html.replace('{%s}' % fname.namebase, 'data:image/png;base64,%s' % img)
        else:
            maps.append(img)

    return {
        'maps': maps,
        'md': md,
        'authors': [Contributor.get(a['id']) for a in md['authors']],
        'html': html,
        'ctx': ApicsContribution.get(id_.split('.')[0]),
    }
# Exemple #13
def main(args):
    data = Data()

    editors = OrderedDict()
    editors['Susanne Maria Michaelis'] = None
    editors['Philippe Maurer'] = None
    editors['Martin Haspelmath'] = None
    editors['Magnus Huber'] = None

    for row in read(args, 'People'):
        name = row['First name'] + ' ' if row['First name'] else ''
        name += row['Last name']
        kw = dict(
            name=name,
            id=slug('%(Last name)s%(First name)s' % row),
            url=row['Contact Website'].split()[0]
            if row['Contact Website'] else None,
            address=row['Comments on database'],
        )
        contrib = data.add(common.Contributor, row['Author ID'], **kw)
        if kw['name'] in editors:
            editors[kw['name']] = contrib

    DBSession.flush()

    dataset = common.Dataset(
        id='apics',
        name='APiCS Online',
        description='Atlas of Pidgin and Creole Language Structures Online',
        domain='apics-online.info',
        published=date(2013, 11, 4),
        license='http://creativecommons.org/licenses/by/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 3.0 Unported License'
        })
    DBSession.add(dataset)
    for i, editor in enumerate(editors.values()):
        common.Editor(dataset=dataset, contributor=editor, ord=i + 1)

    colors = dict(
        (row['ID'], row['RGB_code']) for row in read(args, 'Colours'))

    abbrs = {}
    for id_, name in LGR_ABBRS.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))
        abbrs[id_] = 1

    for id_, name in {
            'C**T': 'clitic',
            'IMPF': 'imperfect',
            'INTERM': 'intermediate',
            'NCOMPL': 'noncompletive',
            'NONFUT': 'nonfuture',
            'NPROX': 'nonproximal',
            'NSG': 'nonsingular',
            'PP': 'past participle',
            'PROP': 'proprietive',
            'TMA': 'tense-mood-aspect',
    }.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))
        abbrs[id_] = 1

    for row in reader(args.data_file('non-lgr-gloss-abbrs.csv'),
                      delimiter=',',
                      namedtuples=True):
        for match in GLOSS_ABBR_PATTERN.finditer(row.standard):
            if match.group('abbr') not in abbrs:
                abbrs[match.group('abbr')] = 1
                DBSession.add(
                    common.GlossAbbreviation(id=match.group('abbr'),
                                             name=row.meaning))

    non_bibs = {}
    for row in read(args, 'References', 'Reference_ID'):
        if row['Reference_type'] == 'Non-bib':
            non_bibs[row['Reference_ID']] = row['Reference_name']
            continue

        if isinstance(row['Year'], int):
            year_int = row['Year']
            year = str(row['Year'])
        elif row['Year']:
            year_int = None
            for m in re.finditer('(?P<year>(1|2)[0-9]{3})', row['Year']):
                year_int = int(m.group('year'))
                break
            year = row['Year']
        else:
            year, year_int = None, None

        title = row['Article_title'] or row['Book_title']
        attrs = {}
        jsondata = {}
        for attr, field in {
                'Additional_information': 'note',
                'Article_title': 'title',
                'Book_title': 'booktitle',
                'City': 'address',
                'Editors': 'editor',
                'Full_reference': None,
                'Issue': None,
                'Journal': 'journal',
                'Language_codes': None,
                'LaTeX_cite_key': None,
                'Pages': 'pages',
                'Publisher': 'publisher',
                'Reference_type': 'type',
                'School': 'school',
                'Series_title': 'series',
                'URL': 'url',
                'Volume': 'volume',
        }.items():
            value = row.get(attr)
            if not isinstance(value, int):
                value = (value or '').strip()
            if attr == 'Issue' and value:
                try:
                    value = str(int(value))
                except ValueError:
                    pass
            if value:
                if field:
                    attrs[field] = value
                else:
                    jsondata[attr] = value
        p = data.add(common.Source,
                     row['Reference_ID'],
                     id=str(row['Reference_ID']),
                     name=row['Reference_name'],
                     description=title,
                     author=row['Authors'],
                     year=year,
                     year_int=year_int,
                     bibtex_type=getattr(EntryType, row['BibTeX_type']
                                         or 'misc'),
                     jsondata=jsondata,
                     **attrs)
        if p.bibtex_type.value == 'misc' and not p.description:
            p.description = p.note
        DBSession.flush()

    DBSession.flush()

    infobox = jsonload(args.data_file('infobox.json'))
    glottocodes = jsonload(args.data_file('glottocodes.json'))
    for row in read(args, 'Languages', 'Order_number'):
        lon, lat = [
            float(c.strip()) for c in row['map_coordinates'].split(',')
        ]
        kw = dict(
            name=row['Language_name'],
            id=str(row['Order_number']),
            latitude=lat,
            longitude=lon,
            region=row['Category_region'],
        )
        lect = data.add(models.Lect, row['Language_ID'], **kw)
        DBSession.flush()

        for i, item in enumerate(infobox[lect.id]):
            DBSession.add(
                common.Language_data(object_pk=lect.pk,
                                     ord=i,
                                     key=item[0],
                                     value=item[1]))

        if row["Languages_contribution_documentation::Lect_description_checked_status"] \
                != "Checked":
            print 'unchecked! ---', row['Language_name']

        desc = row.get(
            'Languages_contribution_documentation::Lect description', '')
        markup_desc = normalize_markup(row[
            'Languages_contribution_documentation::z_calc_GetAsCSS_Lect_description']
                                       )

        c = data.add(
            models.ApicsContribution,
            row['Language_ID'],
            id=str(row['Order_number']),
            name=row['Language_name'],
            description=desc,
            markup_description=markup_desc,
            survey_reference=data['Source'][row['Survey_reference_ID']],
            language=lect)

        for ext, label, mtype in [
            ('pdf', 'Glossed text', 'application/pdf'),
            ('mp3', 'Glossed text audio', 'audio/mpeg'),
        ]:
            fid = '%s-gt.%s' % (c.id, ext)
            if args.data_file('files', 'contribution', c.id, fid).exists():
                common.Contribution_files(object=c,
                                          id=fid,
                                          name=label,
                                          mime_type=mtype)
            else:
                print label, 'missing for:', row['Language_name']

        #
        # TODO: for michif, 75, add link http://www.youtube.com/watch?v=f0C4cODsSyE
        #

        iso = None
        if row['ISO_code'] and len(row['ISO_code']) == 3:
            iso = row['ISO_code'].lower()
            if 'iso:%s' % row['ISO_code'] not in data['Identifier']:
                data.add(common.Identifier,
                         'iso:%s' % row['ISO_code'],
                         id=row['ISO_code'].lower(),
                         name=row['ISO_code'].lower(),
                         type=common.IdentifierType.iso.value)

            DBSession.add(
                common.LanguageIdentifier(
                    language=data['Lect'][row['Language_ID']],
                    identifier=data['Identifier']['iso:%s' % row['ISO_code']]))

        if lect.id in glottocodes:
            identifier = data.add(common.Identifier,
                                  'gc:%s' % glottocodes[lect.id],
                                  id=glottocodes[lect.id],
                                  name=glottocodes[lect.id],
                                  type=common.IdentifierType.glottolog.value)

            DBSession.add(
                common.LanguageIdentifier(
                    language=data['Lect'][row['Language_ID']],
                    identifier=identifier))

        if row['Language_name_ethnologue']:
            if row['Language_name_ethnologue'] not in data['Identifier']:
                data.add(common.Identifier,
                         row['Language_name_ethnologue'],
                         id=iso
                         or 'ethnologue:%s' % row['Language_name_ethnologue'],
                         name=row['Language_name_ethnologue'],
                         type='ethnologue')

            DBSession.add(
                common.LanguageIdentifier(
                    language=data['Lect'][row['Language_ID']],
                    identifier=data['Identifier'][
                        row['Language_name_ethnologue']]))

    example_count = {}
    for row in read(args, 'Examples', 'Order_number'):
        assert row['Language_ID']
        lang = data['Lect'][row['Language_ID']]
        id_ = '%(Language_ID)s-%(Example_number)s' % row
        atext, gloss = igt(row)
        example_count[row['Language_ID']] = max(
            [example_count.get(row['Language_ID'], 1), row['Example_number']])
        p = add_sentence(
            args,
            data,
            id_,
            id='%s-%s' % (lang.id, row['Example_number']),
            name=row['Text'] or row['Analyzed_text'],
            description=row['Translation'],
            type=row['Type'].strip().lower() if row['Type'] else None,
            comment=row['Comments'],
            gloss=gloss,
            analyzed=atext,
            markup_text=normalize_markup(row['z_calc_Text_CSS']),
            markup_gloss=normalize_markup(row['z_calc_Gloss_CSS']),
            markup_comment=normalize_markup(row['z_calc_Comments_CSS']),
            markup_analyzed=normalize_markup(row['z_calc_Analyzed_text_CSS']),
            original_script=row['Original_script'],
            jsondata={
                'sort': row['Order_number'],
                'alt_translation': (row['Translation_other'] or '').strip()
                or None
            },
            language=lang)

        if row['Reference_ID']:
            if row['Reference_ID'] in data['Source']:
                source = data['Source'][row['Reference_ID']]
                DBSession.add(
                    common.SentenceReference(
                        sentence=p,
                        source=source,
                        key=source.id,
                        description=row['Reference_pages']))
            else:
                p.source = non_bibs[row['Reference_ID']]

    DBSession.flush()

    for row in read(args, 'Language_references'):
        if row['Reference_ID'] not in data['Source']:
            assert row['Reference_ID'] in non_bibs
            continue
        assert row['Language_ID'] in data['ApicsContribution']
        source = data['Source'][row['Reference_ID']]
        DBSession.add(
            common.ContributionReference(
                contribution=data['ApicsContribution'][row['Language_ID']],
                source=source,
                description=row['Pages'],
                key=source.id))

    #
    # global counter for features - across feature types
    #
    feature_count = 0
    for row in read(args, 'Features', 'Feature_number'):
        id_ = str(row['Feature_number'])
        if int(id_) > feature_count:
            feature_count = int(id_)
        wals_id = None
        desc = row['Feature_annotation_publication']
        if row['WALS_match'] == 'Total':
            if isinstance(row['WALS_No.'], int):
                wals_id = row['WALS_No.']
            else:
                wals_id = int(row['WALS_No.'].split('.')[0].strip())

        p = data.add(models.Feature,
                     row['Feature_code'],
                     name=row['Feature_name'],
                     id=id_,
                     description=desc,
                     markup_description=normalize_markup(
                         row['z_calc_Feature_annotation_publication_CSS']),
                     feature_type='primary',
                     multivalued=row['Value_relation_type'] != 'Single',
                     area=row['Feature_area'],
                     wals_id=wals_id)

        names = {}
        for i in range(1, 10):
            if not row['Value%s_publication' % i] \
                    or not row['Value%s_publication' % i].strip():
                continue
            name = row['Value%s_publication' % i].strip()
            if name in names:
                name += ' (%s)' % i
            names[name] = 1
            de = data.add(
                common.DomainElement,
                '%s-%s' % (row['Feature_code'], i),
                id='%s-%s' % (id_, i),
                name=name,
                parameter=p,
                abbr=row['Value%s_for_book_maps' % i] if p.id != '0' else name,
                number=int(row['Value%s_value_number_for_publication' % i]),
                jsondata={'color': colors[row['Value_%s_colour_ID' % i]]},
            )
            assert de

        if row['Authors_FeatureArticles']:
            authors, _ = row['Authors_FeatureArticles'].split('and the APiCS')
            authors = authors.strip()
            if authors.endswith(','):
                authors = authors[:-1].strip()
            for i, name in enumerate(authors.split(',')):
                assert name.strip() in editors
                p._authors.append(
                    models.FeatureAuthor(ord=i + 1,
                                         contributor=editors[name.strip()]))

        DBSession.flush()

    primary_to_segment = {123: 63, 126: 35, 128: 45, 130: 41}
    segment_to_primary = dict(
        zip(primary_to_segment.values(), primary_to_segment.keys()))
    number_map = {}
    names = {}
    for row in read(args, 'Segment_features', 'Order_number'):
        symbol = row['Segment_symbol']
        if row['Segment_name'] == 'voiceless dental/alveolar sibilant affricate':
            symbol = 't\u0361s'
        truth = lambda s: s and s.strip().lower() == 'yes'
        name = '%s - %s' % (symbol, row['Segment_name'])

        if name in names:
            number_map[row['Segment_feature_number']] = names[name]
            continue

        number_map[
            row['Segment_feature_number']] = row['Segment_feature_number']
        names[name] = row['Segment_feature_number']
        feature_count += 1
        if row['Segment_feature_number'] in segment_to_primary:
            primary_to_segment[segment_to_primary[row['Segment_feature_number']]]\
                = str(feature_count)
        p = data.add(models.Feature,
                     row['Segment_feature_number'],
                     name=name,
                     id=str(feature_count),
                     feature_type='segment',
                     area='Vowels' if truth(row['Vowel']) else
                     ('Obstruent consonants'
                      if truth(row['Obstruent']) else 'Sonorant consonants'),
                     jsondata=dict(
                         number=int(row['Segment_feature_number']),
                         vowel=truth(row['Vowel']),
                         consonant=truth(row['Consonant']),
                         obstruent=truth(row['Obstruent']),
                         core_list=truth(row['Core_list_segment']),
                         symbol=symbol,
                     ))

        for i, spec in SEGMENT_VALUES.items():
            data.add(common.DomainElement,
                     '%s-%s' % (row['Segment_feature_number'], spec[0]),
                     id='%s-%s' % (p.id, i),
                     name=spec[0],
                     parameter=p,
                     jsondata={'color': spec[1]},
                     number=i)

    print '--> remapped:', primary_to_segment
    DBSession.flush()

    for row in read(args, 'Sociolinguistic_features',
                    'Sociolinguistic_feature_number'):
        feature_count += 1
        p = data.add(models.Feature,
                     row['Sociolinguistic_feature_code'],
                     name=row['Sociolinguistic_feature_name'],
                     id='%s' % feature_count,
                     description=row['Sociolinguistic_feature_annotation'],
                     area='Sociolinguistic',
                     feature_type='sociolinguistic')

        names = {}

        for i in range(1, 10):
            id_ = '%s-%s' % (row['Sociolinguistic_feature_code'], i)
            if row.get('Value%s' % i) and row['Value%s' % i].strip():
                name = row['Value%s' % i].strip()
                if name in names:
                    name += ' (%s)' % i
                names[name] = 1
            else:
                continue
            kw = dict(id='%s-%s' % (p.id, i), name=name, parameter=p, number=i)
            data.add(common.DomainElement,
                     id_,
                     id='%s-%s' % (p.id, i),
                     name=name,
                     parameter=p,
                     number=i,
                     jsondata={
                         'color':
                         colors.get(row['Value%s_colour_ID' % i],
                                    colors.values()[i])
                     })

    sd = {}
    for row in read(args, 'Segment_data'):
        if row['Segment_feature_number'] not in number_map:
            continue
        number = number_map[row['Segment_feature_number']]

        if not row['Presence_in_the_language']:
            continue

        lang = data['Lect'][row['Language_ID']]
        param = data['Feature'][number]
        id_ = '%s-%s' % (lang.id, param.id)
        if id_ in sd:
            assert row['c_Record_is_a_duplicate'] == 'Yes'
            continue
        sd[id_] = 1
        valueset = data.add(
            common.ValueSet,
            id_,
            id=id_,
            parameter=param,
            language=lang,
            contribution=data['ApicsContribution'][row['Language_ID']],
            description=row['Comments'],
            markup_description=normalize_markup(row['z_calc_Comments_CSS']),
        )
        v = data.add(
            common.Value,
            id_,
            id=id_,
            frequency=float(100),
            valueset=valueset,
            domainelement=data['DomainElement'][
                '%s-%s' % (number, row['Presence_in_the_language'])],
        )
        if row['Example_word'] and row['Example_word_gloss']:
            example_count[row['Language_ID']] += 1
            p = add_sentence(args,
                             data,
                             '%s-p%s' % (lang.id, data['Feature'][number].id),
                             id='%s-%s' %
                             (lang.id, example_count[row['Language_ID']]),
                             name=row['Example_word'],
                             description=row['Example_word_gloss'],
                             language=lang)
            DBSession.add(common.ValueSentence(value=v, sentence=p))

        source = data['Source'].get(row['Refers_to_references_Reference_ID'])
        if source:
            DBSession.add(
                common.ValueSetReference(valueset=valueset,
                                         source=source,
                                         key=source.id))
        elif row['Refers_to_references_Reference_ID'] in non_bibs:
            valueset.source = non_bibs[
                row['Refers_to_references_Reference_ID']]

    lects = defaultdict(lambda: 1)
    lect_map = {}
    records = {}
    false_values = {}
    no_values = {}
    wals_value_number = {}
    for row in read(args, 'wals'):
        if row['z_calc_WALS_value_number']:
            wals_value_number[
                row['Data_record_id']] = row['z_calc_WALS_value_number']

    def prefix(attr, _prefix):
        """Build the column name for *attr* under the given table prefix.

        An empty prefix yields the plain capitalized attribute name;
        otherwise prefix and attribute are joined with an underscore.
        """
        joined = '%s_%s' % (_prefix, attr)
        return joined if _prefix else attr.capitalize()

    for _prefix, abbr in [('', ''), ('Sociolinguistic', 'sl')]:
        num_values = 10
        for row in read(args, prefix('data', _prefix)):
            if not row[prefix('feature_code', _prefix)]:
                print('no associated feature for', prefix('data', _prefix),
                      row[prefix('data_record_id', _prefix)])
                continue

            lid = row['Language_ID']
            lect_attr = row.get('Lect_attribute', 'my default lect').lower()
            if lect_attr != 'my default lect':
                if (row['Language_ID'], row['Lect_attribute']) in lect_map:
                    lid = lect_map[(row['Language_ID'], row['Lect_attribute'])]
                else:
                    lang = data['Lect'][row['Language_ID']]
                    c = lects[row['Language_ID']]
                    lid = '%s-%s' % (row['Language_ID'], c)
                    kw = dict(
                        name='%s (%s)' % (lang.name, row['Lect_attribute']),
                        id='%s' % (1000 + 10 * int(lang.id) + c),
                        latitude=lang.latitude,
                        longitude=lang.longitude,
                        description=row['Lect_attribute'],
                        language=lang,
                    )
                    data.add(models.Lect, lid, **kw)
                    lects[row['Language_ID']] += 1
                    lect_map[(row['Language_ID'], row['Lect_attribute'])] = lid

            id_ = abbr + str(row[prefix('data_record_id', _prefix)])
            assert id_ not in records
            records[id_] = 1

            assert row[prefix('feature_code', _prefix)] in data['Feature']
            language = data['Lect'][lid]
            parameter = data['Feature'][row[prefix('feature_code', _prefix)]]
            valueset = common.ValueSet(
                id='%s-%s' % (language.id, parameter.id),
                description=row['Comments_on_value_assignment'],
                markup_description=normalize_markup(
                    row.get('z_calc_Comments_on_value_assignment_CSS')),
            )

            values_found = {}
            for i in range(1, num_values):
                if not row['Value%s_true_false' % i]:
                    continue

                if row['Value%s_true_false' % i].strip().lower() != 'true':
                    assert row['Value%s_true_false' %
                               i].strip().lower() == 'false'
                    false_values[row[prefix('data_record_id', _prefix)]] = 1
                    continue

                iid = '%s-%s' % (row[prefix('feature_code', _prefix)], i)
                if iid not in data['DomainElement']:
                    print(iid, row[prefix('data_record_id',
                                          _prefix)], '--> no domainelement!')
                    continue
                values_found['%s-%s' % (id_, i)] = dict(
                    id='%s-%s' % (valueset.id, i),
                    domainelement=data['DomainElement']['%s-%s' % (row[prefix(
                        'feature_code', _prefix)], i)],
                    confidence=row['Value%s_confidence' % i],
                    frequency=float(row['c_V%s_frequency_normalised' %
                                        i]) if _prefix == '' else 100)

            if values_found:
                if row[prefix('data_record_id', _prefix)] in wals_value_number:
                    valueset.jsondata = {
                        'wals_value_number':
                        wals_value_number.pop(row[prefix(
                            'data_record_id', _prefix)])
                    }
                valueset.parameter = parameter
                valueset.language = language
                valueset.contribution = data['ApicsContribution'][
                    row['Language_ID']]
                valueset = data.add(common.ValueSet, id_, _obj=valueset)
                for i, item in enumerate(values_found.items()):
                    if i > 0 and not parameter.multivalued:
                        print 'multiple values for single-valued parameter: %s' % id_
                        break
                    id_, kw = item
                    kw['valueset'] = valueset
                    value = data.add(common.Value, id_, **kw)

                #
                # store references to additional data for segments which should be reused
                # for corresponding primary features!
                #
                if int(parameter.id) in primary_to_segment:
                    assert len(values_found) == 1
                    seg_id = '%s-%s' % (language.id, primary_to_segment[int(
                        parameter.id)])
                    seg_valueset = data['ValueSet'][seg_id]
                    seg_value = data['Value'][seg_id]
                    if not valueset.description and seg_valueset.description:
                        valueset.description = seg_valueset.description

                    for s in seg_value.sentence_assocs:
                        DBSession.add(
                            common.ValueSentence(value=value,
                                                 sentence=s.sentence))

                    for r in seg_valueset.references:
                        DBSession.add(
                            common.ValueSetReference(valueset=valueset,
                                                     source=r.source,
                                                     key=r.key))

                    if not valueset.source and seg_valueset.source:
                        valueset.source = seg_valueset.source

                DBSession.flush()
            else:
                no_values[id_] = 1

    DBSession.flush()

    for prefix, abbr, num_values in [
        ('D', '', 10),
        ('Sociolinguistic_d', 'sl', 7),
    ]:
        for row in read(args, prefix + 'ata_references'):
            assert row['Reference_ID'] in data['Source'] \
                or row['Reference_ID'] in non_bibs
            try:
                vs = data['ValueSet'][abbr +
                                      str(row[prefix + 'ata_record_id'])]
                if row['Reference_ID'] in data['Source']:
                    source = data['Source'][row['Reference_ID']]
                    DBSession.add(
                        common.ValueSetReference(
                            valueset=vs,
                            source=source,
                            key=source.id,
                            description=row['Pages'],
                        ))
                else:
                    if vs.source:
                        vs.source += '; ' + non_bibs[row['Reference_ID']]
                    else:
                        vs.source = non_bibs[row['Reference_ID']]
            except KeyError:
                continue

    DBSession.flush()

    missing = 0
    for row in read(args, 'Value_examples'):
        try:
            DBSession.add(
                common.ValueSentence(
                    value=data['Value']['%(Data_record_id)s-%(Value_number)s' %
                                        row],
                    sentence=data['Sentence'][
                        '%(Language_ID)s-%(Example_number)s' % row],
                    description=row['Notes'],
                ))
        except KeyError:
            missing += 1
    print('%s Value_examples are missing data' % missing)

    print('%s data sets with false values' % len(false_values))
    print('%s data sets without values' % len(no_values))

    for k, v in wals_value_number.items():
        print 'unclaimed wals value number:', k, v

    for i, row in enumerate(read(args, 'Contributors')):
        kw = dict(contribution=data['ApicsContribution'][row['Language ID']],
                  contributor=data['Contributor'][row['Author ID']])
        if row['Order_of_appearance']:
            kw['ord'] = int(float(row['Order_of_appearance']))
        data.add(common.ContributionContributor, i, **kw)

    DBSession.flush()
Exemple #14
0
def import_dataset(path, data, icons):
    """Import one delimited value file as a Contribution with its values.

    Looks for ``<path>-metadata.json`` and ``<basename>.bib`` next to the
    data file, registers the contributor and sources, creates missing
    languages via a Glottolog lookup, and adds one ValueSet/Value per row.

    :param path: path to the CSV/TSV data file.
    :param data: CLLD data cache used to look up / register objects.
    :param icons: unused here -- NOTE(review): confirm it can be dropped.
    """
    # look for metadata
    # look for sources
    # then loop over values
    dirpath, fname = os.path.split(path)
    basename, ext = os.path.splitext(fname)
    glottolog = Glottolog()

    contrib = Contribution(id=basename, name=basename)

    # Optional sidecar metadata: contributor attribution plus per-language
    # GeoJSON-like feature overrides (name, coordinates).
    md = {}
    mdpath = path + '-metadata.json'
    if os.path.exists(mdpath):
        md = jsonload(mdpath)
    contributor_name = HumanName(md.get('contributed_datapoint', 'Team NTS'))
    contributor_id = slug(contributor_name.last + contributor_name.first)
    contributor = data['Contributor'].get(contributor_id)
    if not contributor:
        contributor = data.add(
            Contributor,
            contributor_id,
            id=contributor_id,
            name='%s' % contributor_name)
    DBSession.add(ContributionContributor(contribution=contrib, contributor=contributor))

    # Register any sources from an accompanying BibTeX file.
    bibpath = os.path.join(dirpath, basename + '.bib')
    if os.path.exists(bibpath):
        for rec in Database.from_file(bibpath):
            if rec['key'] not in data['Source']:
                data.add(Source, rec['key'], _obj=bibtex2source(rec))

    # Language metadata overrides keyed by glottocode.
    languages = {f['properties']['glottocode']: f for f in md.get('features', [])}

    # Delimiter is guessed from the file extension: a 'c' (csv) means comma,
    # anything else is treated as tab-separated.
    for i, row in enumerate(reader(path, dicts=True, quoting=csv.QUOTE_NONE, delimiter=',' if 'c' in ext else '\t')):
        if not row['Value'] or not row['Feature_ID']:
            continue
        vsid = '%s-%s-%s' % (basename, row['Language_ID'], row['Feature_ID'])
        vid = row.get('ID', '%s-%s' % (basename, i + 1))

        parameter = data['Feature'].get(row['Feature_ID'])
        if parameter is None:
            print('skip value for invalid feature %s' % row['Feature_ID'])
            continue
            #parameter = data.add(
            #    Feature, row['Feature_ID'], id=row['Feature_ID'], name=row.get('Feature', row['Feature_ID']))

        language = data['GrambankLanguage'].get(row['Language_ID'])
        if language is None:
            # query glottolog!
            languoid = glottolog.languoid(row['Language_ID'])
            gl_md = {
                'name': languoid.name,
                'longitude': languoid.longitude,
                'latitude': languoid.latitude}
            lmd = languages.get(row['Language_ID'])
            if lmd:
                # The metadata file overrides Glottolog's name/coordinates.
                if lmd.get('properties', {}).get('name'):
                    gl_md['name'] = lmd['properties']['name']
                if lmd.get('geometry', {}).get('coordinates'):
                    gl_md['longitude'], gl_md['latitude'] = lmd['geometry']['coordinates']

            language = data.add(
                GrambankLanguage, row['Language_ID'],
                id=row['Language_ID'],
                name=gl_md['name'],
                latitude=gl_md.get('latitude'),
                longitude=gl_md.get('longitude'))

        vs = data['ValueSet'].get(vsid)
        if vs is None:
            vs = data.add(
                ValueSet, vsid,
                id=vsid,
                parameter=parameter,
                language=language,
                contribution=contrib,
                source=row['Source'])

        # Translate an abbreviated value to its domain element name when the
        # parameter declares a matching domain.
        domain = {de.abbr: de for de in parameter.domain}
        name = row['Value']
        if name in domain:
            name = domain[name].name

        Value(
            id=vid,
            valueset=vs,
            name=name,
            description=row['Comment'],
            domainelement=domain.get(row['Value']))

        # Link every known source whose key appears in the valueset's
        # free-text source field.
        for key, src in data['Source'].items():
            if key in vs.source:
                ValueSetReference(valueset=vs, source=src, key=key)
Exemple #15
0
def update_reflang(args):
    """Synchronize Ref<->Languoid relations with the current bib data.

    Reads language codes from the bib records and manual create/delete
    instructions from ``languoid_refs.json`` (the "brugmann" data), then
    updates each Ref's ``languages`` relationship accordingly.  Per-outcome
    counts are accumulated in a Counter and logged at the end.
    """
    stats = Counter()
    brugmann_noderefs = jsonload(args.data_dir.joinpath('languoid_refs.json'))

    # Map every usable identifier of each languoid (hid, else iso code,
    # plus its id) to its primary key.
    languoid_map = {}
    for l in DBSession.query(Languoid).options(joinedload_all(
        Language.languageidentifier, LanguageIdentifier.identifier
    )):
        if l.hid:
            languoid_map[l.hid] = l.pk
        elif l.iso_code:
            languoid_map[l.iso_code] = l.pk
        languoid_map[l.id] = l.pk

    # Extract the first available language-code field from each bib record,
    # keyed by its glottolog ref id.
    lgcodes = {}
    for rec in get_bib(args):
        lgcode = ''
        for f in 'lgcode lcode lgcde lgcoe lgcosw'.split():
            if rec.get(f):
                lgcode = rec[f]
                break
        if len(lgcode) == 3 or lgcode.startswith('NOCODE_'):
            # Normalize bare iso / NOCODE values to the bracketed form.
            lgcode = '[' + lgcode + ']'
        lgcodes[rec.get('glottolog_ref_id', None)] = lgcode

    for ref in page_query(
            DBSession.query(Ref).order_by(desc(Source.pk)),
            n=10000,
            commit=True,
            verbose=True):
        # disregard iso change requests:
        if ref.description and ref.description.startswith('Change Request Number '):
            stats.update(['ignored'])
            continue

        if ref.id not in lgcodes:
            # remove all language relations for refs no longer in bib!
            update_relationship(ref.languages, [])
            stats.update(['obsolete'])
            continue

        language_note = lgcodes[ref.id]
        trigger = ca_trigger(language_note)
        if trigger:
            ref.ca_language_trigger, ref.language_note = trigger
        else:
            ref.language_note = language_note

        # Languoid pks whose relation to this ref must NOT be (re)created.
        remove = brugmann_noderefs['delete'].get(str(ref.pk), [])

        # keep relations to non-language languoids:
        # FIXME: adapt this for bib-entries now referring to glottocodes of
        #        families/dialects (e.g. add a sticky-bit to languagesource)
        langs = [
            l for l in ref.languages if
            (l.level != LanguoidLevel.language or not l.active) and l.pk not in remove]
        langs_pk = [l.pk for l in langs]

        # add relations from filemaker data:
        for lpk in brugmann_noderefs['create'].get(str(ref.pk), []):
            if lpk not in langs_pk:
                l = Languoid.get(lpk, default=None)
                if l:
                    langs.append(l)
                    langs_pk.append(l.pk)
                else:
                    args.log.warn('brugmann relation for non-existing languoid %s' % lpk)

        for code in set(get_codes(ref)):
            if code not in languoid_map:
                stats.update([code])
                continue
            lpk = languoid_map[code]
            if lpk in remove:
                # BUG FIX: this branch previously printed the stale loop
                # variable ``l`` left over from an earlier loop, reporting an
                # unrelated languoid.  Report the one actually being skipped.
                skipped = Languoid.get(lpk, default=None)
                if skipped:
                    print(ref.name, ref.id, '--', skipped.name, skipped.id)
                print('relation removed according to brugmann data')
            else:
                if lpk not in langs_pk:
                    langs.append(DBSession.query(Languoid).get(lpk))
                    langs_pk.append(lpk)

        a, r = update_relationship(ref.languages, langs)
        if a or r:
            stats.update(['changed'])

    args.log.info('%s' % stats)
Exemple #16
0
def update_reflang(args):
    """Synchronize Ref<->Languoid relations with the current bib data.

    Reads language codes from the bib records and manual create/delete
    instructions from ``languoid_refs.json`` (the "brugmann" data), then
    updates each Ref's ``languages`` relationship accordingly.  Per-outcome
    counts are accumulated in a Counter and logged at the end.
    """
    stats = Counter()
    brugmann_noderefs = jsonload(args.data_dir.joinpath('languoid_refs.json'))

    # Map every usable identifier of each languoid (hid, else iso code,
    # plus its id) to its primary key.
    languoid_map = {}
    for l in DBSession.query(Languoid).options(
            joinedload_all(Language.languageidentifier,
                           LanguageIdentifier.identifier)):
        if l.hid:
            languoid_map[l.hid] = l.pk
        elif l.iso_code:
            languoid_map[l.iso_code] = l.pk
        languoid_map[l.id] = l.pk

    # Extract the first available language-code field from each bib record,
    # keyed by its glottolog ref id.
    lgcodes = {}
    for rec in get_bib(args):
        lgcode = ''
        for f in 'lgcode lcode lgcde lgcoe lgcosw'.split():
            if rec.get(f):
                lgcode = rec[f]
                break
        if len(lgcode) == 3 or lgcode.startswith('NOCODE_'):
            # Normalize bare iso / NOCODE values to the bracketed form.
            lgcode = '[' + lgcode + ']'
        lgcodes[rec.get('glottolog_ref_id', None)] = lgcode

    for ref in page_query(DBSession.query(Ref).order_by(desc(Source.pk)),
                          n=10000,
                          commit=True,
                          verbose=True):
        # disregard iso change requests:
        if ref.description and ref.description.startswith(
                'Change Request Number '):
            stats.update(['ignored'])
            continue

        if ref.id not in lgcodes:
            # remove all language relations for refs no longer in bib!
            update_relationship(ref.languages, [])
            stats.update(['obsolete'])
            continue

        language_note = lgcodes[ref.id]
        trigger = ca_trigger(language_note)
        if trigger:
            ref.ca_language_trigger, ref.language_note = trigger
        else:
            ref.language_note = language_note

        # Languoid pks whose relation to this ref must NOT be (re)created.
        remove = brugmann_noderefs['delete'].get(str(ref.pk), [])

        # keep relations to non-language languoids:
        # FIXME: adapt this for bib-entries now referring to glottocodes of
        #        families/dialects (e.g. add a sticky-bit to languagesource)
        langs = [
            l for l in ref.languages
            if (l.level != LanguoidLevel.language or not l.active)
            and l.pk not in remove
        ]
        langs_pk = [l.pk for l in langs]

        # add relations from filemaker data:
        for lpk in brugmann_noderefs['create'].get(str(ref.pk), []):
            if lpk not in langs_pk:
                l = Languoid.get(lpk, default=None)
                if l:
                    langs.append(l)
                    langs_pk.append(l.pk)
                else:
                    args.log.warn(
                        'brugmann relation for non-existing languoid %s' % lpk)

        for code in set(get_codes(ref)):
            if code not in languoid_map:
                stats.update([code])
                continue
            lpk = languoid_map[code]
            if lpk in remove:
                # BUG FIX: this branch previously printed the stale loop
                # variable ``l`` left over from an earlier loop, reporting an
                # unrelated languoid.  Report the one actually being skipped.
                skipped = Languoid.get(lpk, default=None)
                if skipped:
                    print(ref.name, ref.id, '--', skipped.name, skipped.id)
                print('relation removed according to brugmann data')
            else:
                if lpk not in langs_pk:
                    langs.append(DBSession.query(Languoid).get(lpk))
                    langs_pk.append(lpk)

        a, r = update_relationship(ref.languages, langs)
        if a or r:
            stats.update(['changed'])

    args.log.info('%s' % stats)
Exemple #17
0
def prime_cache(args):
    """Precompute derived data after the initial database import.

    Fills in per-feature representation counts, per-language lexifiers and
    pie-chart icon metadata stored in the ``jsondata`` of valuesets, values
    and domain elements.  The icon PNG/SVG files themselves are only
    asserted to exist in ``icons_dir`` -- they are not generated here.
    """
    #
    # TODO: relate survey chapter reference with language!
    #
    icons = {}          # (fracs, colors) combinations already checked
    frequencies = {}    # frequency values already checked

    args.log.info('computing wals representation')
    # representation = number of valuesets per feature; for WALS-linked
    # features also sum the datapoints across the layers of the
    # corresponding WALS geojson file shipped with the app.
    for feature in DBSession.query(common.Parameter).options(
        joinedload(common.Parameter.valuesets)
    ):
        feature.representation = len(feature.valuesets)
        if feature.wals_id:
            data = jsonload(path(apics.__file__).dirname().joinpath(
                'static', 'wals', '%sA.json' % feature.wals_id
            ))
            feature.wals_representation = sum(
                [len(l['features']) for l in data['layers']])

    args.log.info('computing language sources')
    compute_language_sources((common.ContributionReference, 'contribution'))
    compute_number_of_values()

    # Derive each top-level language's lexifier from its value for
    # parameter '0': a single non-'Other' value contributes its domain
    # element name (minus the '-based' suffix); anything else is 'Other'.
    # The lexifier is then propagated to the language's lects.
    for valueset in DBSession.query(common.ValueSet)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == '0')\
            .options(joinedload(common.ValueSet.language)):
        if valueset.language.language_pk:
            continue
        if len(valueset.values) > 1:
            valueset.language.lexifier = 'Other'
        else:
            if valueset.values[0].domainelement.name == 'Other':
                valueset.language.lexifier = 'Other'
            else:
                valueset.language.lexifier \
                    = valueset.values[0].domainelement.name.replace('-based', '')
        for lect in valueset.language.lects:
            lect.lexifier = valueset.language.lexifier

    args.log.info('creating icons')
    for valueset in DBSession.query(common.ValueSet).options(
        joinedload(common.ValueSet.parameter),
        joinedload_all(common.ValueSet.values, common.Value.domainelement)
    ):
        values = sorted(list(valueset.values), key=lambda v: v.domainelement.number)
        # Frequencies of a valueset's values are expected to sum to ~100%.
        assert abs(sum(v.frequency for v in values) - 100) < 1
        fracs = []
        colors = []

        for v in values:
            color = v.domainelement.jsondata['color']
            frequency = round(v.frequency)
            assert frequency

            if frequency not in frequencies:
                # NOTE(review): the figure is drawn but never saved here;
                # the assert only checks that a pre-generated file exists.
                # Confirm savefig is handled elsewhere.
                figure(figsize=(0.4, 0.4))
                axes([0.1, 0.1, 0.8, 0.8])
                coll = pie((int(100 - frequency), frequency), colors=('w', 'k'))
                coll[0][0].set_linewidth(0.5)
                assert icons_dir.joinpath('freq-%s.png' % frequency).exists()
                frequencies[frequency] = True

            v.jsondata = {'frequency_icon': 'freq-%s.png' % frequency}
            fracs.append(frequency)
            colors.append(color)
            v.domainelement.jsondata = {
                'color': color, 'icon': 'pie-100-%s.png' % color}

        # Each domain element of a parameter must have a distinct color.
        assert len(colors) == len(set(colors))
        fracs, colors = tuple(fracs), tuple(colors)

        # The valueset icon name encodes all (frequency, color) slices.
        basename = 'pie-'
        basename += '-'.join('%s-%s' % (f, c) for f, c in zip(fracs, colors))
        valueset.update_jsondata(icon=basename + '.png')
        if (fracs, colors) not in icons:
            # Same review note as above: drawn but not saved here.
            figure(figsize=(0.4, 0.4))
            axes([0.1, 0.1, 0.8, 0.8])
            coll = pie(
                tuple(reversed(fracs)),
                colors=['#' + _color for _color in reversed(colors)])
            for wedge in coll[0]:
                wedge.set_linewidth(0.5)
            assert icons_dir.joinpath('%s.png' % basename).exists()
            icons[(fracs, colors)] = True
            assert icons_dir.joinpath(basename + '.svg').exists()

    # Fall back to a full single-color pie for domain elements that did not
    # get an icon above.
    for de in DBSession.query(common.DomainElement):
        if not de.jsondata.get('icon'):
            de.update_jsondata(icon='pie-100-%s.png' % de.jsondata['color'])

    gbs_func('update', args)
Exemple #18
0
def prime_cache(args):
    """Precompute derived data after the initial database import.

    Fills in per-feature representation counts, per-language lexifiers and
    pie-chart icon metadata stored in the ``jsondata`` of valuesets, values
    and domain elements.  The icon PNG/SVG files themselves are only
    asserted to exist in ``icons_dir`` -- they are not generated here.
    """
    #
    # TODO: relate survey chapter reference with language!
    #
    icons = {}          # (fracs, colors) combinations already checked
    frequencies = {}    # frequency values already checked

    args.log.info('computing wals representation')
    # representation = number of valuesets per feature; for WALS-linked
    # features also sum the datapoints across the layers of the
    # corresponding WALS geojson file shipped with the app.
    for feature in DBSession.query(common.Parameter).options(
            joinedload(common.Parameter.valuesets)):
        feature.representation = len(feature.valuesets)
        if feature.wals_id:
            data = jsonload(
                path(apics.__file__).dirname().joinpath(
                    'static', 'wals', '%sA.json' % feature.wals_id))
            feature.wals_representation = sum(
                [len(l['features']) for l in data['layers']])

    args.log.info('computing language sources')
    compute_language_sources((common.ContributionReference, 'contribution'))
    compute_number_of_values()

    # Derive each top-level language's lexifier from its value for
    # parameter '0': a single non-'Other' value contributes its domain
    # element name (minus the '-based' suffix); anything else is 'Other'.
    # The lexifier is then propagated to the language's lects.
    for valueset in DBSession.query(common.ValueSet)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == '0')\
            .options(joinedload(common.ValueSet.language)):
        if valueset.language.language_pk:
            continue
        if len(valueset.values) > 1:
            valueset.language.lexifier = 'Other'
        else:
            if valueset.values[0].domainelement.name == 'Other':
                valueset.language.lexifier = 'Other'
            else:
                valueset.language.lexifier \
                    = valueset.values[0].domainelement.name.replace('-based', '')
        for lect in valueset.language.lects:
            lect.lexifier = valueset.language.lexifier

    args.log.info('creating icons')
    for valueset in DBSession.query(common.ValueSet).options(
            joinedload(common.ValueSet.parameter),
            joinedload_all(common.ValueSet.values,
                           common.Value.domainelement)):
        values = sorted(list(valueset.values),
                        key=lambda v: v.domainelement.number)
        # Frequencies of a valueset's values are expected to sum to ~100%.
        assert abs(sum(v.frequency for v in values) - 100) < 1
        fracs = []
        colors = []

        for v in values:
            color = v.domainelement.jsondata['color']
            frequency = round(v.frequency)
            assert frequency

            if frequency not in frequencies:
                # NOTE(review): the figure is drawn but never saved here;
                # the assert only checks that a pre-generated file exists.
                # Confirm savefig is handled elsewhere.
                figure(figsize=(0.4, 0.4))
                axes([0.1, 0.1, 0.8, 0.8])
                coll = pie((int(100 - frequency), frequency),
                           colors=('w', 'k'))
                coll[0][0].set_linewidth(0.5)
                assert icons_dir.joinpath('freq-%s.png' % frequency).exists()
                frequencies[frequency] = True

            v.jsondata = {'frequency_icon': 'freq-%s.png' % frequency}
            fracs.append(frequency)
            colors.append(color)
            v.domainelement.jsondata = {
                'color': color,
                'icon': 'pie-100-%s.png' % color
            }

        # Each domain element of a parameter must have a distinct color.
        assert len(colors) == len(set(colors))
        fracs, colors = tuple(fracs), tuple(colors)

        # The valueset icon name encodes all (frequency, color) slices.
        basename = 'pie-'
        basename += '-'.join('%s-%s' % (f, c) for f, c in zip(fracs, colors))
        valueset.update_jsondata(icon=basename + '.png')
        if (fracs, colors) not in icons:
            # Same review note as above: drawn but not saved here.
            figure(figsize=(0.4, 0.4))
            axes([0.1, 0.1, 0.8, 0.8])
            coll = pie(tuple(reversed(fracs)),
                       colors=['#' + _color for _color in reversed(colors)])
            for wedge in coll[0]:
                wedge.set_linewidth(0.5)
            assert icons_dir.joinpath('%s.png' % basename).exists()
            icons[(fracs, colors)] = True
            assert icons_dir.joinpath(basename + '.svg').exists()

    # Fall back to a full single-color pie for domain elements that did not
    # get an icon above.
    for de in DBSession.query(common.DomainElement):
        if not de.jsondata.get('icon'):
            de.update_jsondata(icon='pie-100-%s.png' % de.jsondata['color'])

    gbs_func('update', args)
Exemple #19
0
def main(args):
    """Populate the APiCS database from the FileMaker export.

    Import order matters (later sections look up objects created earlier
    via the ``data`` registry): contributors/editors, dataset record,
    gloss abbreviations, sources, languages (lects) with identifiers,
    examples, primary/segment/sociolinguistic features, segment data,
    feature data (value sets and values), references, value examples,
    and contribution contributors.

    NOTE(review): Python 2 code (bare ``print`` statements); relies on
    module-level helpers (``read``, ``igt``, ``add_sentence``,
    ``normalize_markup``, ``SEGMENT_VALUES``, ``LGR_ABBRS``, ...) defined
    elsewhere in this file.
    """
    data = Data()

    # The four volume editors in publication order; the Contributor
    # objects are filled in while reading the People table below.
    editors = OrderedDict()
    editors['Susanne Maria Michaelis'] = None
    editors['Philippe Maurer'] = None
    editors['Martin Haspelmath'] = None
    editors['Magnus Huber'] = None

    # Import all contributors; remember those who are editors.
    for row in read(args, 'People'):
        name = row['First name'] + ' ' if row['First name'] else ''
        name += row['Last name']
        kw = dict(
            name=name,
            id=slug('%(Last name)s%(First name)s' % row),
            url=row['Contact Website'].split()[0] if row['Contact Website'] else None,
            address=row['Comments on database'],
        )
        contrib = data.add(common.Contributor, row['Author ID'], **kw)
        if kw['name'] in editors:
            editors[kw['name']] = contrib

    DBSession.flush()

    # The dataset record and its editor associations (ord = 1-based rank).
    dataset = common.Dataset(
        id='apics',
        name='APiCS Online',
        description='Atlas of Pidgin and Creole Language Structures Online',
        domain='apics-online.info',
        published=date(2013, 11, 4),
        license='http://creativecommons.org/licenses/by/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 3.0 Unported License'})
    DBSession.add(dataset)
    for i, editor in enumerate(editors.values()):
        common.Editor(dataset=dataset, contributor=editor, ord=i + 1)

    # Colour ID -> RGB code, used for domain-element map icons below.
    colors = dict((row['ID'], row['RGB_code']) for row in read(args, 'Colours'))

    # Gloss abbreviations: standard Leipzig Glossing Rules set first ...
    abbrs = {}
    for id_, name in LGR_ABBRS.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))
        abbrs[id_] = 1

    # ... then APiCS-specific additions.
    # NOTE(review): 'C**T' looks like a mangled key (presumably CLIT) —
    # kept verbatim; confirm against the original data dump.
    for id_, name in {
        'C**T': 'clitic',
        'IMPF': 'imperfect',
        'INTERM': 'intermediate',
        'NCOMPL': 'noncompletive',
        'NONFUT': 'nonfuture',
        'NPROX': 'nonproximal',
        'NSG': 'nonsingular',
        'PP': 'past participle',
        'PROP': 'proprietive',
        'TMA': 'tense-mood-aspect',
    }.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))
        abbrs[id_] = 1

    # Finally, non-LGR abbreviations harvested from a CSV of glosses.
    for row in reader(
            args.data_file('non-lgr-gloss-abbrs.csv'), delimiter=',', namedtuples=True):
        for match in GLOSS_ABBR_PATTERN.finditer(row.standard):
            if match.group('abbr') not in abbrs:
                abbrs[match.group('abbr')] = 1
                DBSession.add(
                    common.GlossAbbreviation(id=match.group('abbr'), name=row.meaning))

    # Sources: 'Non-bib' rows are kept as plain strings (used later as
    # free-text source attributions); everything else becomes a Source.
    non_bibs = {}
    for row in read(args, 'References', 'Reference_ID'):
        if row['Reference_type'] == 'Non-bib':
            non_bibs[row['Reference_ID']] = row['Reference_name']
            continue

        # Derive an integer year for sorting: take the first plausible
        # four-digit year found in a free-text year field.
        if isinstance(row['Year'], int):
            year_int = row['Year']
            year = str(row['Year'])
        elif row['Year']:
            year_int = None
            for m in re.finditer('(?P<year>(1|2)[0-9]{3})', row['Year']):
                year_int = int(m.group('year'))
                break
            year = row['Year']
        else:
            year, year_int = None, None

        title = row['Article_title'] or row['Book_title']
        attrs = {}
        jsondata = {}
        # FileMaker column -> bibtex field; a None field means the value
        # is stashed in jsondata instead of a bibtex attribute.
        for attr, field in {
            'Additional_information': 'note',
            'Article_title': 'title',
            'Book_title': 'booktitle',
            'City': 'address',
            'Editors': 'editor',
            'Full_reference': None,
            'Issue': None,
            'Journal': 'journal',
            'Language_codes': None,
            'LaTeX_cite_key': None,
            'Pages': 'pages',
            'Publisher': 'publisher',
            'Reference_type': 'type',
            'School': 'school',
            'Series_title': 'series',
            'URL': 'url',
            'Volume': 'volume',
        }.items():
            value = row.get(attr)
            if not isinstance(value, int):
                value = (value or '').strip()
            if attr == 'Issue' and value:
                # Normalize numeric issues like '07' to '7'; leave
                # non-numeric issue strings untouched.
                try:
                    value = str(int(value))
                except ValueError:
                    pass
            if value:
                if field:
                    attrs[field] = value
                else:
                    jsondata[attr] = value
        p = data.add(
            common.Source, row['Reference_ID'],
            id=str(row['Reference_ID']),
            name=row['Reference_name'],
            description=title,
            author=row['Authors'],
            year=year,
            year_int=year_int,
            bibtex_type=getattr(EntryType, row['BibTeX_type'] or 'misc'),
            jsondata=jsondata,
            **attrs)
        if p.bibtex_type.value == 'misc' and not p.description:
            p.description = p.note
        DBSession.flush()

    DBSession.flush()

    # Languages (lects): per-language infobox items and glottocodes are
    # read from sidecar JSON files keyed by the lect's numeric id.
    infobox = jsonload(args.data_file('infobox.json'))
    glottocodes = jsonload(args.data_file('glottocodes.json'))
    for row in read(args, 'Languages', 'Order_number'):
        # NOTE: source column stores "lon, lat" (longitude first).
        lon, lat = [float(c.strip()) for c in row['map_coordinates'].split(',')]
        kw = dict(
            name=row['Language_name'],
            id=str(row['Order_number']),
            latitude=lat,
            longitude=lon,
            region=row['Category_region'],
        )
        lect = data.add(models.Lect, row['Language_ID'], **kw)
        DBSession.flush()

        for i, item in enumerate(infobox[lect.id]):
            DBSession.add(common.Language_data(
                object_pk=lect.pk, ord=i, key=item[0], value=item[1]))

        if row["Languages_contribution_documentation::Lect_description_checked_status"] \
                != "Checked":
            print 'unchecked! ---', row['Language_name']

        desc = row.get('Languages_contribution_documentation::Lect description', '')
        markup_desc = normalize_markup(
            row['Languages_contribution_documentation::z_calc_GetAsCSS_Lect_description'])

        # One contribution (survey chapter) per language.
        c = data.add(
            models.ApicsContribution, row['Language_ID'],
            id=str(row['Order_number']),
            name=row['Language_name'],
            description=desc,
            markup_description=markup_desc,
            survey_reference=data['Source'][row['Survey_reference_ID']],
            language=lect)

        # Attach glossed-text PDF/audio files if present on disk.
        for ext, label, mtype in [
            ('pdf', 'Glossed text', 'application/pdf'),
            ('mp3', 'Glossed text audio', 'audio/mpeg'),
        ]:
            fid = '%s-gt.%s' % (c.id, ext)
            if args.data_file('files', 'contribution', c.id, fid).exists():
                common.Contribution_files(object=c, id=fid, name=label, mime_type=mtype)
            else:
                print label, 'missing for:', row['Language_name']

        #
        # TODO: for michif, 75, add link http://www.youtube.com/watch?v=f0C4cODsSyE
        #

        # Language identifiers: ISO 639-3, Glottolog, Ethnologue name.
        iso = None
        if row['ISO_code'] and len(row['ISO_code']) == 3:
            iso = row['ISO_code'].lower()
            if 'iso:%s' % row['ISO_code'] not in data['Identifier']:
                data.add(
                    common.Identifier, 'iso:%s' % row['ISO_code'],
                    id=row['ISO_code'].lower(),
                    name=row['ISO_code'].lower(),
                    type=common.IdentifierType.iso.value)

            DBSession.add(common.LanguageIdentifier(
                language=data['Lect'][row['Language_ID']],
                identifier=data['Identifier']['iso:%s' % row['ISO_code']]))

        if lect.id in glottocodes:
            identifier = data.add(
                common.Identifier, 'gc:%s' % glottocodes[lect.id],
                id=glottocodes[lect.id],
                name=glottocodes[lect.id],
                type=common.IdentifierType.glottolog.value)

            DBSession.add(common.LanguageIdentifier(
                language=data['Lect'][row['Language_ID']],
                identifier=identifier))

        if row['Language_name_ethnologue']:
            if row['Language_name_ethnologue'] not in data['Identifier']:
                data.add(
                    common.Identifier, row['Language_name_ethnologue'],
                    id=iso or 'ethnologue:%s' % row['Language_name_ethnologue'],
                    name=row['Language_name_ethnologue'],
                    type='ethnologue')

            DBSession.add(common.LanguageIdentifier(
                language=data['Lect'][row['Language_ID']],
                identifier=data['Identifier'][row['Language_name_ethnologue']]))

    # Examples: example_count tracks the highest example number per
    # language so synthetic examples (from Segment_data) can continue
    # the numbering later.
    example_count = {}
    for row in read(args, 'Examples', 'Order_number'):
        assert row['Language_ID']
        lang = data['Lect'][row['Language_ID']]
        id_ = '%(Language_ID)s-%(Example_number)s' % row
        atext, gloss = igt(row)
        example_count[row['Language_ID']] = max(
            [example_count.get(row['Language_ID'], 1), row['Example_number']])
        p = add_sentence(
            args, data, id_,
            id='%s-%s' % (lang.id, row['Example_number']),
            name=row['Text'] or row['Analyzed_text'],
            description=row['Translation'],
            type=row['Type'].strip().lower() if row['Type'] else None,
            comment=row['Comments'],
            gloss=gloss,
            analyzed=atext,
            markup_text=normalize_markup(row['z_calc_Text_CSS']),
            markup_gloss=normalize_markup(row['z_calc_Gloss_CSS']),
            markup_comment=normalize_markup(row['z_calc_Comments_CSS']),
            markup_analyzed=normalize_markup(row['z_calc_Analyzed_text_CSS']),
            original_script=row['Original_script'],
            jsondata={
                'sort': row['Order_number'],
                'alt_translation': (row['Translation_other'] or '').strip() or None},
            language=lang)

        if row['Reference_ID']:
            if row['Reference_ID'] in data['Source']:
                source = data['Source'][row['Reference_ID']]
                DBSession.add(common.SentenceReference(
                    sentence=p,
                    source=source,
                    key=source.id,
                    description=row['Reference_pages']))
            else:
                # Non-bib reference: stored as a free-text attribution.
                p.source = non_bibs[row['Reference_ID']]

    DBSession.flush()

    for row in read(args, 'Language_references'):
        if row['Reference_ID'] not in data['Source']:
            assert row['Reference_ID'] in non_bibs
            continue
        assert row['Language_ID'] in data['ApicsContribution']
        source = data['Source'][row['Reference_ID']]
        DBSession.add(common.ContributionReference(
            contribution=data['ApicsContribution'][row['Language_ID']],
            source=source,
            description=row['Pages'],
            key=source.id))

    #
    # global counter for features - across feature types
    #
    feature_count = 0
    for row in read(args, 'Features', 'Feature_number'):
        id_ = str(row['Feature_number'])
        if int(id_) > feature_count:
            feature_count = int(id_)
        wals_id = None
        desc = row['Feature_annotation_publication']
        # Only 'Total' WALS matches carry over a WALS feature number.
        if row['WALS_match'] == 'Total':
            if isinstance(row['WALS_No.'], int):
                wals_id = row['WALS_No.']
            else:
                wals_id = int(row['WALS_No.'].split('.')[0].strip())

        p = data.add(
            models.Feature, row['Feature_code'],
            name=row['Feature_name'],
            id=id_,
            description=desc,
            markup_description=normalize_markup(
                row['z_calc_Feature_annotation_publication_CSS']),
            feature_type='primary',
            multivalued=row['Value_relation_type'] != 'Single',
            area=row['Feature_area'],
            wals_id=wals_id)

        # Up to 9 domain elements per feature; duplicate value names are
        # disambiguated with the value index.
        names = {}
        for i in range(1, 10):
            if not row['Value%s_publication' % i] \
                    or not row['Value%s_publication' % i].strip():
                continue
            name = row['Value%s_publication' % i].strip()
            if name in names:
                name += ' (%s)' % i
            names[name] = 1
            de = data.add(
                common.DomainElement, '%s-%s' % (row['Feature_code'], i),
                id='%s-%s' % (id_, i),
                name=name,
                parameter=p,
                abbr=row['Value%s_for_book_maps' % i] if p.id != '0' else name,
                number=int(row['Value%s_value_number_for_publication' % i]),
                jsondata={'color': colors[row['Value_%s_colour_ID' % i]]},
            )
            assert de

        # Chapter authors: names before "and the APiCS" must be editors.
        if row['Authors_FeatureArticles']:
            authors, _ = row['Authors_FeatureArticles'].split('and the APiCS')
            authors = authors.strip()
            if authors.endswith(','):
                authors = authors[:-1].strip()
            for i, name in enumerate(authors.split(',')):
                assert name.strip() in editors
                p._authors.append(models.FeatureAuthor(
                    ord=i + 1, contributor=editors[name.strip()]))

        DBSession.flush()

    # Four primary features mirror segment features; their value data is
    # shared later. number_map collapses duplicate segment definitions
    # onto the first occurrence of the same (symbol, name).
    primary_to_segment = {123: 63, 126: 35, 128: 45, 130: 41}
    segment_to_primary = dict(zip(
        primary_to_segment.values(), primary_to_segment.keys()))
    number_map = {}
    names = {}
    for row in read(args, 'Segment_features', 'Order_number'):
        symbol = row['Segment_symbol']
        if row['Segment_name'] == 'voiceless dental/alveolar sibilant affricate':
            symbol = 't\u0361s'
        truth = lambda s: s and s.strip().lower() == 'yes'
        name = '%s - %s' % (symbol, row['Segment_name'])

        if name in names:
            number_map[row['Segment_feature_number']] = names[name]
            continue

        number_map[row['Segment_feature_number']] = row['Segment_feature_number']
        names[name] = row['Segment_feature_number']
        feature_count += 1
        if row['Segment_feature_number'] in segment_to_primary:
            # Record the global feature id assigned to this segment so the
            # linked primary feature can find it.
            primary_to_segment[segment_to_primary[row['Segment_feature_number']]]\
                = str(feature_count)
        p = data.add(
            models.Feature, row['Segment_feature_number'],
            name=name,
            id=str(feature_count),
            feature_type='segment',
            area='Vowels' if truth(row['Vowel']) else (
                'Obstruent consonants' if truth(row['Obstruent'])
                else 'Sonorant consonants'),
            jsondata=dict(
                number=int(row['Segment_feature_number']),
                vowel=truth(row['Vowel']),
                consonant=truth(row['Consonant']),
                obstruent=truth(row['Obstruent']),
                core_list=truth(row['Core_list_segment']),
                symbol=symbol,
            ))

        # Segment features share a fixed value domain (SEGMENT_VALUES).
        for i, spec in SEGMENT_VALUES.items():
            data.add(
                common.DomainElement,
                '%s-%s' % (row['Segment_feature_number'], spec[0]),
                id='%s-%s' % (p.id, i),
                name=spec[0],
                parameter=p,
                jsondata={'color': spec[1]},
                number=i)

    print '--> remapped:', primary_to_segment
    DBSession.flush()

    for row in read(args, 'Sociolinguistic_features', 'Sociolinguistic_feature_number'):
        feature_count += 1
        p = data.add(
            models.Feature, row['Sociolinguistic_feature_code'],
            name=row['Sociolinguistic_feature_name'],
            id='%s' % feature_count,
            description=row['Sociolinguistic_feature_annotation'],
            area='Sociolinguistic',
            feature_type='sociolinguistic')

        names = {}

        for i in range(1, 10):
            id_ = '%s-%s' % (row['Sociolinguistic_feature_code'], i)
            if row.get('Value%s' % i) and row['Value%s' % i].strip():
                name = row['Value%s' % i].strip()
                if name in names:
                    name += ' (%s)' % i
                names[name] = 1
            else:
                continue
            kw = dict(id='%s-%s' % (p.id, i), name=name, parameter=p, number=i)
            # Fall back to a positional colour when the colour ID is unknown.
            data.add(
                common.DomainElement,
                id_,
                id='%s-%s' % (p.id, i),
                name=name,
                parameter=p,
                number=i,
                jsondata={'color': colors.get(
                    row['Value%s_colour_ID' % i], colors.values()[i])})

    # Segment data: one valueset+value (frequency 100) per language and
    # segment feature; duplicates must be flagged as such in the export.
    sd = {}
    for row in read(args, 'Segment_data'):
        if row['Segment_feature_number'] not in number_map:
            continue
        number = number_map[row['Segment_feature_number']]

        if not row['Presence_in_the_language']:
            continue

        lang = data['Lect'][row['Language_ID']]
        param = data['Feature'][number]
        id_ = '%s-%s' % (lang.id, param.id)
        if id_ in sd:
            assert row['c_Record_is_a_duplicate'] == 'Yes'
            continue
        sd[id_] = 1
        valueset = data.add(
            common.ValueSet,
            id_,
            id=id_,
            parameter=param,
            language=lang,
            contribution=data['ApicsContribution'][row['Language_ID']],
            description=row['Comments'],
            markup_description=normalize_markup(row['z_calc_Comments_CSS']),
        )
        v = data.add(
            common.Value,
            id_,
            id=id_,
            frequency=float(100),
            valueset=valueset,
            domainelement=data['DomainElement']['%s-%s' % (
                number, row['Presence_in_the_language'])],
        )
        # Example words become synthetic sentences, continuing the
        # per-language example numbering started above.
        if row['Example_word'] and row['Example_word_gloss']:
            example_count[row['Language_ID']] += 1
            p = add_sentence(
                args, data, '%s-p%s' % (lang.id, data['Feature'][number].id),
                id='%s-%s' % (lang.id, example_count[row['Language_ID']]),
                name=row['Example_word'],
                description=row['Example_word_gloss'],
                language=lang)
            DBSession.add(common.ValueSentence(value=v, sentence=p))

        source = data['Source'].get(row['Refers_to_references_Reference_ID'])
        if source:
            DBSession.add(common.ValueSetReference(
                valueset=valueset, source=source, key=source.id))
        elif row['Refers_to_references_Reference_ID'] in non_bibs:
            valueset.source = non_bibs[row['Refers_to_references_Reference_ID']]

    # Main feature data. lects counts extra lect variants per language;
    # the bookkeeping dicts collect diagnostics reported at the end.
    lects = defaultdict(lambda: 1)
    lect_map = {}
    records = {}
    false_values = {}
    no_values = {}
    wals_value_number = {}
    for row in read(args, 'wals'):
        if row['z_calc_WALS_value_number']:
            wals_value_number[row['Data_record_id']] = row['z_calc_WALS_value_number']

    def prefix(attr, _prefix):
        # Build the FileMaker column/table name for the plain ('') or
        # sociolinguistic ('Sociolinguistic') variant of the data table.
        if _prefix:
            return '%s_%s' % (_prefix, attr)
        return attr.capitalize()

    for _prefix, abbr in [('', ''), ('Sociolinguistic', 'sl')]:
        num_values = 10
        for row in read(args, prefix('data', _prefix)):
            if not row[prefix('feature_code', _prefix)]:
                print('no associated feature for',
                      prefix('data', _prefix),
                      row[prefix('data_record_id', _prefix)])
                continue

            # Data records for a non-default lect get their own Lect
            # object, created on first sight and cached in lect_map.
            lid = row['Language_ID']
            lect_attr = row.get('Lect_attribute', 'my default lect').lower()
            if lect_attr != 'my default lect':
                if (row['Language_ID'], row['Lect_attribute']) in lect_map:
                    lid = lect_map[(row['Language_ID'], row['Lect_attribute'])]
                else:
                    lang = data['Lect'][row['Language_ID']]
                    c = lects[row['Language_ID']]
                    lid = '%s-%s' % (row['Language_ID'], c)
                    kw = dict(
                        name='%s (%s)' % (lang.name, row['Lect_attribute']),
                        id='%s' % (1000 + 10 * int(lang.id) + c),
                        latitude=lang.latitude,
                        longitude=lang.longitude,
                        description=row['Lect_attribute'],
                        language=lang,
                    )
                    data.add(models.Lect, lid, **kw)
                    lects[row['Language_ID']] += 1
                    lect_map[(row['Language_ID'], row['Lect_attribute'])] = lid

            id_ = abbr + str(row[prefix('data_record_id', _prefix)])
            assert id_ not in records
            records[id_] = 1

            assert row[prefix('feature_code', _prefix)] in data['Feature']
            language = data['Lect'][lid]
            parameter = data['Feature'][row[prefix('feature_code', _prefix)]]
            valueset = common.ValueSet(
                id='%s-%s' % (language.id, parameter.id),
                description=row['Comments_on_value_assignment'],
                markup_description=normalize_markup(
                    row.get('z_calc_Comments_on_value_assignment_CSS')),
            )

            # Collect all value columns marked 'true'; sociolinguistic
            # values have no frequency and default to 100.
            values_found = {}
            for i in range(1, num_values):
                if not row['Value%s_true_false' % i]:
                    continue

                if row['Value%s_true_false' % i].strip().lower() != 'true':
                    assert row['Value%s_true_false' % i].strip().lower() == 'false'
                    false_values[row[prefix('data_record_id', _prefix)]] = 1
                    continue

                iid = '%s-%s' % (row[prefix('feature_code', _prefix)], i)
                if iid not in data['DomainElement']:
                    print(iid,
                          row[prefix('data_record_id', _prefix)],
                          '--> no domainelement!')
                    continue
                values_found['%s-%s' % (id_, i)] = dict(
                    id='%s-%s' % (valueset.id, i),
                    domainelement=data['DomainElement']['%s-%s' % (
                        row[prefix('feature_code', _prefix)], i)],
                    confidence=row['Value%s_confidence' % i],
                    frequency=float(row['c_V%s_frequency_normalised' % i])
                    if _prefix == '' else 100)

            if values_found:
                if row[prefix('data_record_id', _prefix)] in wals_value_number:
                    # pop() so leftovers can be reported as unclaimed below.
                    valueset.jsondata = {
                        'wals_value_number': wals_value_number.pop(
                            row[prefix('data_record_id', _prefix)])}
                valueset.parameter = parameter
                valueset.language = language
                valueset.contribution = data['ApicsContribution'][row['Language_ID']]
                valueset = data.add(common.ValueSet, id_, _obj=valueset)
                for i, item in enumerate(values_found.items()):
                    if i > 0 and not parameter.multivalued:
                        print 'multiple values for single-valued parameter: %s' % id_
                        break
                    id_, kw = item
                    kw['valueset'] = valueset
                    value = data.add(common.Value, id_, **kw)

                #
                # store references to additional data for segments which should be reused
                # for corresponding primary features!
                #
                if int(parameter.id) in primary_to_segment:
                    assert len(values_found) == 1
                    seg_id = '%s-%s' % (
                        language.id, primary_to_segment[int(parameter.id)])
                    seg_valueset = data['ValueSet'][seg_id]
                    seg_value = data['Value'][seg_id]
                    if not valueset.description and seg_valueset.description:
                        valueset.description = seg_valueset.description

                    for s in seg_value.sentence_assocs:
                        DBSession.add(
                            common.ValueSentence(value=value, sentence=s.sentence))

                    for r in seg_valueset.references:
                        DBSession.add(common.ValueSetReference(
                            valueset=valueset, source=r.source, key=r.key))

                    if not valueset.source and seg_valueset.source:
                        valueset.source = seg_valueset.source

                DBSession.flush()
            else:
                no_values[id_] = 1

    DBSession.flush()

    # References for value sets: bib sources become ValueSetReference
    # rows, non-bib ones are appended to the free-text source string.
    # NOTE(review): this loop variable shadows the prefix() helper above.
    for prefix, abbr, num_values in [
        ('D', '', 10),
        ('Sociolinguistic_d', 'sl', 7),
    ]:
        for row in read(args, prefix + 'ata_references'):
            assert row['Reference_ID'] in data['Source'] \
                or row['Reference_ID'] in non_bibs
            try:
                vs = data['ValueSet'][abbr + str(row[prefix + 'ata_record_id'])]
                if row['Reference_ID'] in data['Source']:
                    source = data['Source'][row['Reference_ID']]
                    DBSession.add(common.ValueSetReference(
                        valueset=vs,
                        source=source,
                        key=source.id,
                        description=row['Pages'],
                    ))
                else:
                    if vs.source:
                        vs.source += '; ' + non_bibs[row['Reference_ID']]
                    else:
                        vs.source = non_bibs[row['Reference_ID']]
            except KeyError:
                # References pointing at records without values are skipped.
                continue

    DBSession.flush()

    # Link examples to values; missing keys are counted, not fatal.
    missing = 0
    for row in read(args, 'Value_examples'):
        try:
            DBSession.add(common.ValueSentence(
                value=data['Value']['%(Data_record_id)s-%(Value_number)s' % row],
                sentence=data['Sentence']['%(Language_ID)s-%(Example_number)s' % row],
                description=row['Notes'],
            ))
        except KeyError:
            missing += 1
    print('%s Value_examples are missing data' % missing)

    print('%s data sets with false values' % len(false_values))
    print('%s data sets without values' % len(no_values))

    for k, v in wals_value_number.items():
        print 'unclaimed wals value number:', k, v

    # Associate contributors with their contributions, preserving the
    # declared order of appearance when given.
    for i, row in enumerate(read(args, 'Contributors')):
        kw = dict(
            contribution=data['ApicsContribution'][row['Language ID']],
            contributor=data['Contributor'][row['Author ID']]
        )
        if row['Order_of_appearance']:
            kw['ord'] = int(float(row['Order_of_appearance']))
        data.add(common.ContributionContributor, i, **kw)

    DBSession.flush()
Exemple #20
0
def import_dataset(path, provider):
    """Import one CSV wordlist dataset located at ``path``.

    Reads the ``<path>-metadata.json`` sidecar to create the Wordlist
    contribution (and, if named, its contributor), then adds one ValueSet
    per (language, concept) pair and one Counterpart per data row. All
    rows of a dataset are expected to belong to a single language.
    """
    # Dataset id comes from the file name; directory and extension are unused.
    _, fname = os.path.split(path)
    basename = os.path.splitext(fname)[0]
    glottolog = Glottolog()

    mdpath = path + "-metadata.json"
    assert os.path.exists(mdpath)
    md = jsonload(mdpath)
    md, parameters = md["properties"], md["parameters"]

    # Contribution name, optionally suffixed with the dataset id.
    if "id" in md:
        cname = "%s [%s]" % (md["name"], md["id"])
    else:
        cname = md["name"]
    contrib = Wordlist(id=basename, name=cname)

    # Prefer 'typedby' over 'contributors' as the contributor field.
    contributors = md.get("typedby", md.get("contributors"))
    if contributors:
        person = HumanName(contributors)
        person_id = slug(person.last + person.first)
        who = Contributor.get(person_id, default=None)
        if not who:
            who = Contributor(id=person_id, name="%s" % person)
        DBSession.add(ContributionContributor(contribution=contrib, contributor=who))

    # NOTE: importing bib sources from a sibling .bib file was disabled
    # in the original implementation.

    data = Data()
    concepts = {c.id: c for c in DBSession.query(Concept)}
    language = None

    for rownum, rec in enumerate(reader(path, dicts=True, delimiter=",")):
        # Rows without a value or a concept reference are skipped.
        if not (rec["Value"] and rec["Feature_ID"]):
            continue

        fid = rec["Feature_ID"].split("/")[-1]
        vsid = "%s-%s-%s" % (basename, rec["Language_ID"], fid)
        vid = "%s-%s-%s" % (provider, basename, rownum + 1)

        if language:
            # Every subsequent row must reference the same language.
            assert language.id == rec["Language_ID"]
        else:
            language = Language.get(rec["Language_ID"], default=None)
            if language is None:
                # Unknown to the DB: fetch name/coordinates from Glottolog.
                languoid = glottolog.languoid(rec["Language_ID"])
                language = LexibankLanguage(
                    id=rec["Language_ID"],
                    name=languoid.name,
                    latitude=languoid.latitude,
                    longitude=languoid.longitude,
                )

        # Create concepts lazily and cache them across rows.
        try:
            parameter = concepts[fid]
        except KeyError:
            parameter = Concept(
                id=fid,
                name=parameters[rec["Feature_ID"]],
                concepticon_url=rec["Feature_ID"],
            )
            concepts[fid] = parameter

        # One ValueSet per (language, concept); reused for repeated rows.
        vs = data["ValueSet"].get(vsid)
        if vs is None:
            vs = data.add(
                ValueSet,
                vsid,
                id=vsid,
                parameter=parameter,
                language=language,
                contribution=contrib,
                source=rec.get("Source"),
            )

        counterpart = Counterpart(
            id=vid,
            valueset=vs,
            name=rec["Value"],
            description=rec.get("Comment"),
            loan=rec.get("Loan") == "yes",
        )

        # Attach the first listed cognate set, creating it by name if new.
        if rec.get("Cognate_Set"):
            cs_name = rec["Cognate_Set"].split(",")[0].strip()
            cogset = Cognateset.get(cs_name, key="name", default=None)
            if cogset is None:
                cogset = Cognateset(name=cs_name)
            counterpart.cognateset = cogset

    contrib.language = language
Exemple #21
0
def main(args):
    """Populate the Tsammalex database: dataset metadata, sources, and CSV data.

    ``args.data_repos`` must point at a checkout containing the
    ``tsammalexdata/data`` directory.
    """
    # Collation index so Value names sort according to the Unicode DUCET.
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)

    def data_file(*comps):
        # Resolve a path inside the tsammalexdata data directory.
        return path(args.data_repos).joinpath('tsammalexdata', 'data', *comps)

    data = Data()
    # Top-level dataset record (publisher, license, contact).
    data.add(common.Dataset, 'tsammalex',
        id="tsammalex",
        name="Tsammalex",
        description="Tsammalex: A lexical database on plants and animals",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        domain='tsammalex.clld.org',
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    data.add(common.Contribution, 'tsammalex', name="Tsammalex", id="tsammalex")
    # ISO code -> glottocode mapping, read from a local glottolog3 database.
    glottolog = glottocodes_by_isocode('postgresql://robert@/glottolog3')

    # Bibliography: one source record per BibTeX entry.
    for rec in Database.from_file(data_file('sources.bib'), lowercase=True):
        data.add(models.Bibrec, rec.id, _obj=bibtex2source(rec, cls=models.Bibrec))

    load_ecoregions(data_file, data)
    load_countries(data)
    # row-id -> ids-string of second languages, filled by languoid_visitor below.
    second_languages = {}
    def languoid_visitor(lang, row, _):
        """Attach ISO-/glottocode identifiers to *lang* and record its 2nd languages.

        Called once per row of the Languoid CSV; ``row[2]`` holds the
        glottocode, ``row[8]`` the second-language ids string.
        """
        try:
            add_language_codes(
                data, lang, lang.id.split('-')[0], glottolog, glottocode=row[2] or None)
        except Exception:
            # Print the offending row for debugging, then propagate.  A bare
            # `except:` here would also intercept KeyboardInterrupt/SystemExit.
            print(row)
            raise
        second_languages[row[0]] = row[8]

    def habitat_visitor(cat, *_):
        # Rows loaded from the 'habitats' CSV are categories flagged as habitats.
        setattr(cat, 'is_habitat', True)

    def taxon_visitor(auto, taxon, *_):
        """Merge scraped data from taxa.json into *taxon* and denormalize region ids.

        *auto* maps taxon id -> dict loaded from taxa.json.
        """
        # Single dict lookup instead of a get() followed by a second [] access.
        info = auto.get(taxon.id)
        if info:
            update_taxon_data(taxon, info, data)
        else:
            print('--> missing in taxa.json:', taxon.id, taxon.name)
        # Space-joined id strings, denormalized for simple text filtering.
        taxon.countries_str = ' '.join(e.id for e in taxon.countries)
        taxon.ecoregions_str = ' '.join(e.id for e in taxon.ecoregions)

    # Pre-scraped taxon info keyed by taxon id, consumed by taxon_visitor.
    auto = {s['id']: s for s in jsonload(data_file('taxa.json'))}
    # Load each CSV into its model; optional visitors post-process rows,
    # `name` overrides the CSV file name, `filter_` drops rows.
    for model, kw in [
        (models.Lineage, {}),
        (models.Use, {}),
        (models.TsammalexContributor, {}),
        (models.Languoid, dict(visitor=languoid_visitor)),
        (models.Category, dict(name='categories')),
        (models.Category, dict(name='habitats', visitor=habitat_visitor)),
        (models.Taxon, dict(visitor=partial(taxon_visitor, auto))),
        (models.Name, dict(filter_=lambda r: 'xxx' not in r[1])),
    ]:
        from_csv(data_file, model, data, **kw)

    # Wire up second-language relations collected while loading Languoids.
    for key, ids in second_languages.items():
        target = data['Languoid'][key]
        for lid in models.split_ids(ids):
            if lid in data['Languoid']:
                # we ignore 2nd languages which are not yet in Tsammalex.
                target.second_languages.append(data['Languoid'][lid])
    def image_url(source_url, type_):
        """Derive the URL of a sized variant (e.g. 'thumbnail', 'web') of an image.

        The trailing file extension is normalized to .jpg and the
        '/original/' path segment is replaced by the requested size.
        """
        # Raw string: '\.' in a plain string literal is an invalid escape
        # (SyntaxWarning on modern Python) and only worked by accident.
        return re.sub(r'\.[a-zA-Z]+$', '.jpg', source_url).replace(
            '/original/', '/%s/' % type_)

    # Attach image files (hosted on Edmond) to their taxa.
    for fname in data_files(data_file, 'images.csv'):
        for image in reader(fname, namedtuples=True, delimiter=","):
            # Skip images that do not reference a known taxon.
            if image.taxa__id not in data['Taxon']:
                continue

            # Only images hosted on MPG's Edmond media server are kept.
            url = URL(image.source_url)
            if url.host() != 'edmond.mpdl.mpg.de':
                continue

            # URLs for the original plus derived thumbnail/web variants.
            jsondata = dict(
                url=image.source_url,
                thumbnail=image_url(image.source_url, 'thumbnail'),
                web=image_url(image.source_url, 'web'))

            f = common.Parameter_files(
                object=data['Taxon'][image.taxa__id],
                id=image.id,
                name=image.tags,
                jsondata=jsondata,
                mime_type=image.mime_type)
            # Preserve selected CSV metadata columns as key/value ImageData rows.
            for k in 'source creator date place comments permission'.split():
                v = getattr(image, k)
                if v:
                    models.ImageData(key=k, value=v, image=f)