Example #1
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=cognition.__name__,
        name="COSTATOL",
        description="Cognitive Structures across the Tree of Life",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='cognition.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    DBSession.add(dataset)

    #
    # TODO: add editors!
    #

    for rec in Database.from_file(args.data_file('sources.bib')):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    contrib = common.Contribution(id='costatol', name='COSTATOL')
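    # One row per datapoint: get-or-create the Parameter (cognitive capacity)
    # and the Language (species), then attach a ValueSet and Value.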
    for datapoint in reader(args.data_file('values.csv'), delimiter=',', dicts=True):
        param = data['Parameter'].get(datapoint['cognitive capacity'])
        if not param:
            name = datapoint['cognitive capacity']
            param = data.add(common.Parameter, name, id=slug(name), name=name)

        species = data['Language'].get(datapoint['species'])
        if not species:
            name = datapoint['species']
            species = data.add(common.Language, name, id=slug(name), name=name)

        vid = '%s-%s' % (species.id, param.id)
        vs = data.add(
            common.ValueSet,
            vid,
            id=vid,
            language=species,
            parameter=param,
            contribution=contrib)
        data.add(common.Value, vid, id=vid, name=datapoint['value'], valueset=vs)
        match = source_pattern.match(datapoint['source'])
        if match:
            DBSession.add(common.ValueSetReference(
                valueset=vs,
                source=data['Source'][match.group('key')],
                description=match.group('pages')))

    for species in reader(args.data_file('species.csv'), delimiter=',', namedtuples=True):
        data['Language'][species.name].longitude = species.longitude
        data['Language'][species.name].latitude = species.latitude
Example #2
File: grambank.py Project: clld/nts
def main(args):
    features = reader(args.data_file('grambank_features.csv'), dicts=True, )
    features = [GBFeature(f) for f in features]
    features = {'%s' % int(f.id[2:]): f for f in features}
    errors = []

    db = create_engine('postgresql://robert@/glottolog3')

    for l in DBSession.query(Language):
        if l.id == 'qgr':
            continue

        gc = l.glottocode
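        # Look up the language's macroarea in the local glottolog3 database.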
        ma = db.execute("""
select
    m.id
from
    macroarea as m, languoidmacroarea as lm, language as l
where
    m.pk = lm.macroarea_pk and lm.languoid_pk = l.pk and l.id = '%s';""" % gc).fetchone()[0]

        if ma == 'pacific':
            ma = 'papunesia'

        errors.extend(export(args, l, features, gc, ma))

    with UnicodeWriter(args.data_file('na_errors.tsv'), delimiter='\t') as writer:
        writer.writerow(['Language', 'Feature', 'Value', 'Source', 'Comment'])
        writer.writerows(errors)
Example #3
def main(args, reload=False):
    species = {}
    db = args.data_file('theplantlist', 'db.json')
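    # Optionally re-scrape the per-family species CSVs from The Plant List site.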
    if reload:
        for a in bs(get('/1.1/browse/-/')).find('ul', id='nametree').find_all('a'):
            with iopen(args.data_file('theplantlist', a.text + '.csv'), 'w', encoding='utf8') as fp:
                fp.write(get(a['href'] + a.text + '.csv'))

    if db.exists():
        with open(db) as fp:
            species = json.load(fp)
    else:
        for p in args.data_file('theplantlist').files('*.csv'):
            for row in reader(p, namedtuples=True, delimiter=','):
                if row.Taxonomic_status_in_TPL == 'Accepted':
                    id_ = slug(row.Genus + row.Species)
                    species[id_] = row.ID
        with open(db, 'w') as fp:
            json.dump(species, fp)

    with transaction.manager:
        found = 0
        for p in DBSession.query(Parameter):
            id_ = slug(p.name)
            if id_ in species:
                found += 1
                p.tpl_id = species[id_]

    print(found)
Example #4
def upgrade():
    conn = Connection(op.get_bind())
    example_map = {}

    sid = 204
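    # Import the Lingala example sentences, numbering them sequentially.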
    for example in jsonload(data_file('lingala_examples.json')):
        sid += 1
        kw = {
            'id': '60-%s' % sid,
            'language_pk': conn.pk(Language, '60'),
            'name': example['Text'],
            'description': example['Translation'],
            'gloss': '\t'.join(example['Gloss'].split()),
            'analyzed': '\t'.join(example['Text'].split()),
            'type': example['Type'].strip().lower(),
            'jsondata': {'sort': int(example['Order_number']), 'alt_translation': None}
        }
        example_map[example['Example_number']] = conn.insert(Sentence, **kw)

    for ve in jsonload(data_file('lingala_value_examples.json')):
        vspk = conn.pk(ValueSet, '60-%s' % ve['Features::Feature_number'])
        vpk = conn.pk(Value, vspk, attr='valueset_pk')
        conn.insert(
            ValueSentence, value_pk=vpk, sentence_pk=example_map[ve['Example_number']])

    for i, comment in enumerate(reader(data_file('lingala_valueset_comments.tab'), dicts=True)):
        vspk = conn.pk(ValueSet, '60-%s' % comment['Features::Feature_number'])
        comment['Comments_on_value_assignment'] = comment['Comments_on_value_assignment'].replace('\x0b', '\n')
        conn.update(
            ValueSet,
            {
                'description': comment['Comments_on_value_assignment'],
                'markup_description': None,
            },
            pk=vspk)
Example #5
def download(args):
    data = dict(wikipedia={}, multitree=defaultdict(list))
    for item in reader(args.data_file(DATA_FILE), namedtuples=True):
        if item.Glottolog and GC_PATTERN.match(item.Glottolog.strip()):
            data['wikipedia'][item.Glottolog.strip()] = item.Wiki.strip()
            for code in ll_codes(item):
                data['multitree'][item.Glottolog.strip()].append(code)
    return data
Example #7
def upgrade():
    csv = path(phoible.__file__).dirname().joinpath("..", "data", "InventoryID-InternetArchive.csv")
    ia_urls = {row[0]: row[1] for row in reader(csv) if row[1] != "NA"}

    conn = Connection(op.get_bind())
    for id_, url in ia_urls.items():
        pk = conn.pk(Contribution, id_)
        conn.update(Inventory, dict(internetarchive_url=url), pk=pk)
Example #8
def get_lginfo(args, filter=None):
    return [
        (r.id, r) for r in
        dsv.reader(
            args.data_dir.joinpath('languoids', 'forkel_lginfo.tab'),
            fieldnames=['id', 'longitude', 'latitude', 'macro_area', 'year'],
            namedtuples=True)
        if filter is None or filter(r)]
Example #9
    def test_reader(self):
        from clld.lib.dsv import reader

        lines = ['first\tline', 'sücond\tläneß']
        encoded_lines = [l.encode('utf8') for l in lines]
        csv_lines = [l.replace('\t', ',') for l in lines]

        def check(r):
            res = list(r)
            assert len(res) == 2
            assert res[1][1] == 'läneß'

        check(reader(lines))
        for lt in ['\n', '\r\n', '\r']:
            check(reader(StringIO(str(lt).join(encoded_lines))))
        check(reader(TESTS_DIR.joinpath('csv.txt'), delimiter=','))

        res = list(reader(TESTS_DIR.joinpath('test.tab'), namedtuples=True))
        assert res[0].a_name == 'b'
        # Missing column values should be set to None:
        assert res[2].a_name is None

        r = list(reader(lines, dicts=True))
        assert len(r) == 1 and r[0]['first'] == 'sücond'
        r = list(reader(lines, namedtuples=True))
        assert len(r) == 1 and r[0].first == 'sücond'
        r = list(reader(csv_lines, namedtuples=True, delimiter=','))
        assert len(r) == 1 and r[0].first == 'sücond'
Example #10
def upgrade():
    csv = path(phoible.__file__).dirname().joinpath(
        '..', 'data', 'InventoryID-InternetArchive.csv')
    ia_urls = {row[0]: row[1] for row in reader(csv) if row[1] != 'NA'}

    conn = Connection(op.get_bind())
    for id_, url in ia_urls.items():
        pk = conn.pk(Contribution, id_)
        conn.update(Inventory, dict(internetarchive_url=url), pk=pk)
Example #11
def main(args):
    mapping = {}
    for row in reader(args.data_file('phoible-phonemes.tsv'), namedtuples=True):
        # Normalize the key to int so the membership check and the assignment agree.
        gid = int(row.GlyphID)
        if gid not in mapping:
            unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme]
            description = ' - '.join([t[1] for t in unicode_desc])
            mapping[gid] = b16encode(md5(description).digest())

    with open(args.data_file('segment_id_mapping.txt'), 'w') as fp:
        for gid in sorted(mapping.keys()):
            fp.write('    ("%s", "%s"),\n' % (gid, mapping[gid]))
Example #12
def update(args):
    pid, cid = 'vitality', 'unesco'
    count = 0
    notfound = {}
    contrib = common.Contribution.get(cid, default=None)
    if not contrib:
        contrib = common.Contribution(
            id=cid,
            name='Atlas of the World’s Languages in Danger',
            description='Atlas of the World’s Languages in Danger, © UNESCO, http://www.unesco.org/culture/languages-atlas')
    param = common.Parameter.get(pid, default=None)
    if param is None:
        param = common.Parameter(
            id=pid,
            name='Degree of endangerment')
    domain = {de.name: de for de in param.domain}
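    # Create any missing domain elements for the degrees of endangerment.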
    for i, spec in enumerate(VITALITY_VALUES):
        name, desc = spec
        if name not in domain:
            number = i + 1
            domain[name] = common.DomainElement(
                id='%s-%s' % (pid, number),
                name=name,
                description=desc,
                number=number,
                parameter=param)
    valuesets = {vs.id: vs for vs in param.valuesets}
    for item in reader(args.data_file(DATA_FILE), dicts=True):
        if item['ISO639-3 codes']:
            for code in item['ISO639-3 codes'].split(','):
                code = code.strip()
                lang = Languoid.get(code, key='hid', default=None)
                if lang:
                    count += 1
                    item['url'] = 'http://www.unesco.org/culture/languages-atlas/en/atlasmap/language-iso-%s.html' % code
                    lang.update_jsondata(unesco=item)
                    de = domain[item['Degree of endangerment']]
                    vsid = '%s-%s' % (pid, lang.id)
                    vs = valuesets.get(vsid)
                    if not vs:
                        vs = common.ValueSet(
                            id='vitality-%s' % lang.id,
                            parameter=param,
                            contribution=contrib,
                            language=lang)
                        DBSession.add(common.Value(valueset=vs, name=de.name, domainelement=de))
                        valuesets[vsid] = vs
                    else:
                        vs.values[0].domainelement = de
                else:
                    notfound[code] = 1
    print('assigned', count, 'unesco urls')
    print('missing iso codes:', notfound)
Example #13
def import_features_collaborative_sheet(datadir, data):
    for feature in reader(os.path.join(datadir, 'features_collaborative_sheet.tsv'), dicts=True):
        feature = FeatureSpec(feature)
        f = data.add(
            Feature, feature.id,
            id=feature.id,
            name=feature.name,
            doc=feature.doc,
            patron=feature.patron,
            std_comments=feature.std_comments,
            name_french=feature.name_french,
            jl_relevant_unit=feature.jl_relevant_unit,
            jl_function=feature.jl_function,
            jl_formal_means=feature.jl_formal_means,
            hard_to_deny=feature.hard_to_deny,
            prone_misunderstanding=feature.prone_misunderstanding,
            requires_extensive_data=feature.requires_extensive_data,
            last_edited=feature.last_edited,
            other_survey=feature.other_survey)
        for i, (deid, desc) in enumerate(feature.domain.items()):
            DomainElement(
                id='%s-%s' % (f.id, deid),
                parameter=f,
                abbr=deid,
                name='%s - %s' % (deid, desc),
                number=int(deid) if deid != '?' else 999,
                description=desc,
                jsondata=dict(icon=ORDERED_ICONS[i].name))
Example #14
def main(args):
    data = Data()

    # fetch language data from glottolog:
    glottolog = glottocodes_by_isocode(
        'postgresql://robert@/glottolog3', ['id', 'name', 'latitude', 'longitude'])

    dataset = common.Dataset(
        id=jcld.__name__,
        name="Journal of Cross-Linguistic Databases",
        domain='jcld.clld.org')
    DBSession.add(dataset)

    contribution = data.add(common.Contribution, '1', id='1', name='fb')

    for i, row in enumerate(reader(open(args.data_file('fb_jcld.tab')), namedtuples=True, encoding='latin1')):
        if row.Feature not in data['Parameter']:
            parameter = data.add(common.Parameter, row.Feature, id='1', name=row.Feature)
        else:
            parameter = data['Parameter'][row.Feature]

        if row.Value not in data['DomainElement']:
            de = data.add(
                common.DomainElement, row.Value,
                id='%s-%s' % (parameter.id, slug(row.Value)), parameter=parameter, name=row.Value)
        else:
            de = data['DomainElement'][row.Value]

        if row.Language not in data['Language']:
            if row.Language not in glottolog:
                print('--->', row.Language)
                continue
            glottocode, name, lat, lon = glottolog[row.Language]
            language = data.add(
                common.Language, row.Language,
                id=slug(row.Language), name=name, latitude=lat, longitude=lon)
        else:
            language = data['Language'][row.Language]

        id_ = str(i + 1)  #'%s-%s' % (parameter.id, language.id)
        vs = common.ValueSet(
            id=id_,
            parameter=parameter,
            language=language,
            contribution=contribution,
            description=row.Comment,
            source=row.Source)
        common.Value(valueset=vs, name=row.Value, domainelement=de)
Example #15
def update(args):
    codes = {}
    for lang in reader(args.data_file(DATA_FILE), namedtuples=True):
        codes[lang.LangID] = 1

    count = 0
    for lang in DBSession.query(Languoid)\
            .filter(Languoid.hid != None)\
            .filter(not_(icontains(Languoid.hid, 'nocode'))):
        if lang.hid in codes:
            lang.update_jsondata(ethnologue=LANGUAGE_URL + lang.hid)
        else:
            lang.update_jsondata(ethnologue=None)
            count += 1

    print(count, 'iso codes have no ethnologue code')

    ethnologue = args.json

    leafsets = defaultdict(list)
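    # Index Ethnologue subgroups by their sorted set of three-letter leaf codes,
    # so that Glottolog families can be matched by identical leaf sets below.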
    for id_, doc in ethnologue['docs'].items():
        for sid, spec in get_classification(id_, doc).items():
            leafs = sorted(set([p[0] for p in spec[2]]))
            if leafs:
                leafsets[tuple(leafs)].append(sid)

    all = 0
    matched = 0
    for family in DBSession.query(Languoid)\
            .filter(Languoid.level == LanguoidLevel.family)\
            .filter(Language.active == True):
        leafs = []
        all += 1
        for row in DBSession.query(TreeClosureTable.child_pk, Languoid.hid)\
                .filter(TreeClosureTable.parent_pk == family.pk)\
                .filter(TreeClosureTable.child_pk == Languoid.pk)\
                .filter(Languoid.hid != None):
            if len(row[1]) == 3:
                leafs.append(row[1])
        leafs = tuple(sorted(set(leafs)))
        for i, subgroup in enumerate(leafsets.get(leafs, [])):
            if i == 0:
                matched += 1
                family.update_jsondata(ethnologue=SUBGROUP_URL + subgroup)
                break
    print(matched, 'of', all, 'families have an exact counterpart in ethnologue!')
Example #16
def coordinates(args, languages):
    diff = lambda x, y: abs(x - y) > 0.001

    for hid, lon, lat in dsv.reader(args.data_file("coordinates.tab")):
        if hid not in languages:
            languages[hid] = Languoid.get(hid, key="hid", default=None)
        if not languages[hid]:
            continue
        language = languages[hid]
        lat, lon = map(float, [lat, lon])

        if not language.latitude or not language.longitude:
            language.longitude, language.latitude = lon, lat
            args.log.info("++ %s" % language.id)
        elif diff(language.longitude, lon) or diff(language.latitude, lat):
            language.longitude, language.latitude = lon, lat
            args.log.info("~~ %s" % language.id)
Example #17
def from_csv(data_file, model, data, name=None, visitor=None, filter_=None):
    if filter_ is None:
        filter_ = lambda r: True
    kw = {'delimiter': ',', 'lineterminator': str('\r\n'), 'quotechar': '"'}
    for fname in data_files(data_file, (name or model.__csv_name__) + '.csv'):
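        # Skip the header row; model.from_csv maps the positional columns.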
        for row in list(reader(fname, **kw))[1:]:
            if row and filter_(row):
                try:
                    obj = model.from_csv(row, data)
                except (KeyError, IndexError):
                    obj = None
                    print(fname)
                    print(row)
                    raise
                if obj:
                    obj = data.add(model, row[0], _obj=obj)
                    if visitor:
                        visitor(obj, row, data)
Example #18
def main(args):
    sources = jsonload(args.data_file('sources.json'))
    fields = ['href', 'name', 'author', 'iso', 'source', 'notes', 'wordlist']
    with UnicodeWriter(args.data_file('..', 'sources.csv')) as fp:
        fp.writerow(fields)
        for source in sorted(sources, key=lambda i: i['name']):
            fp.writerow([source.get(f, '') for f in fields])
    return  # NOTE: the code below is currently unreachable.
    ethnologue_names = {
        r.ISO_639: r.Language_Name for r in reader(args.data_file(
        '..', '..', 'ethnologue-17-data', 'Table_of_Languages.tab'), namedtuples=True)}

    # ASJP name for language, Ethnologue's name, ISO code
    rows = [['ASJP Name', 'Ethnologue name', 'ISO code']]
    subquery = DBSession.query(LanguageSource.language_pk).distinct().subquery()
    for i, l in enumerate(DBSession.query(Doculect).order_by(Doculect.pk).filter(not_(Doculect.pk.in_(subquery)))):
        rows.append([l.id, ethnologue_names.get(l.code_iso, ''), l.code_iso or ''])
    #print i
    with UnicodeWriter(args.data_file('..', 'doculects_without_source.csv')) as fp:
        fp.writerows(rows)
Example #19
def countries(args, languages, stats):
    """update relations between languages and countries they are spoken in.
    """
    cname_map = {
        'Tanzania': 'Tanzania, United Republic of',
        'Russia': 'Russian Federation',
        'South Korea': 'Korea, Republic of',
        'Iran': 'Iran, Islamic Republic of',
        'Syria': 'Syrian Arab Republic',
        'Laos': "Lao People's Democratic Republic",
        r"C\^ote d'Ivoire": "Côte d'Ivoire",
        'British Virgin Islands': 'Virgin Islands, British',
        'Bolivia': 'Bolivia, Plurinational State of',
        'Venezuela': 'Venezuela, Bolivarian Republic of',
        'Democratic Republic of the Congo': 'Congo, The Democratic Republic of the',
        'Micronesia': 'Micronesia, Federated States of',
    }
    countries = {}
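    # Each row holds a language hid followed by the names of countries it is
    # spoken in; Country lookups are cached.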
    for row in dsv.reader(
            args.data_dir.joinpath('languoids', 'forkel_countries.tab'), encoding='latin1'):
        hid, cnames = row[0], row[1:]
        if hid not in languages:
            languages[hid] = Languoid.get(hid, key='hid', default=None)
        if not languages[hid]:
            args.log.warn('unknown hid in countries.tab: %s' % hid)
            continue
        l = languages[hid]
        if l.countries:
            # we only add country relations to new languages or languages which have none.
            continue
        for cname in set(cnames):
            if cname not in countries:
                q = cname if '(' not in cname else cname.split('(')[0].strip()
                countries[cname] = Country.get(cname_map.get(q, q), key='name', default=None)
            if not countries[cname]:
                args.log.warn('unknown country name in countries.tab: %s' % cname)
                continue
            c = countries[cname]
            if c.id not in [_c.id for _c in l.countries]:
                l.countries.append(c)
                stats.update(['countries'])
Example #20
def main(args):  # pragma: no cover
    # we merge information about extinct languages from unesco and Harald.
    extinct = dict(list(dsv.reader(args.data_file('extinct.tab'))))
    with transaction.manager:
        query = language_query().options(
            joinedload_all(Language.valuesets, ValueSet.values))
        # loop over active, established languages with geo-coords
        for l in page_query(query, n=100, verbose=True):
            # let's collect the relevant sources in a way that allows computation of med.
            # Note: we limit refs to the ones without computerized assignments.
            sources = DBSession.query(Ref).join(LanguageSource)\
                .filter(LanguageSource.language_pk == l.pk) \
                .filter(Ref.ca_doctype_trigger == None)\
                .filter(Ref.ca_language_trigger == None)\
                .options(joinedload(Ref.doctypes))
            sources = sorted(map(Source, sources))

            # keep the overall med
            # note: this source may not be included in the potential meds computed below,
            # e.g. because it may not have a year.
            med = sources[0].__json__() if sources else None

            # now we have to compute meds respecting a cut-off year.
            # to do so, we collect eligible sources per year and then
            # take the med of this collection.
            potential_meds = []

            # we only have to loop over publication years within all sources, because
            # only in these years something better might have come along.
            for year in set(s.year for s in sources if s.year):
                # let's see if something better was published!
                eligible = [s for s in sources if s.year and s.year <= year]
                if eligible:
                    potential_meds.append(sorted(eligible)[0])

            # we store the precomputed sources information as jsondata:
            l.update_jsondata(
                endangerment='Extinct' if l.hid in extinct else l.endangerment,
                med=med,
                sources=[s.__json__() for s in
                         sorted(set(potential_meds), key=lambda s: -s.year)])
Example #22
def countries(args, languages):
    count = 0
    countries = {}
    for row in dsv.reader(args.data_file("countries.tab"), encoding="latin1"):
        hid, cnames = row[0], row[1:]
        if hid not in languages:
            languages[hid] = Languoid.get(hid, key="hid", default=None)
        if not languages[hid]:
            continue
        l = languages[hid]
        if l.countries:
            continue
        for cname in set(cnames):
            if cname not in countries:
                countries[cname] = Country.get(cname, key="name", default=None)
            if not countries[cname]:
                continue
            c = countries[cname]
            if c.id not in [_c.id for _c in l.countries]:
                l.countries.append(c)
                count += 1

    print "countries:", count, "relations added"
Example #23
def macroareas(args, languages):
    ma_map = get_map(Macroarea)

    # we store references to languages to make computation of cumulated macroareas for
    # families easier
    lang_map = {}

    for hid, macroarea in dsv.reader(args.data_file("macroareas.tab")):
        if hid not in languages:
            languages[hid] = Languoid.get(hid, key="hid", default=None)
        if not languages[hid]:
            continue
        lang_map[languages[hid].pk] = languages[hid]
        update_relationship(languages[hid].macroareas, [ma_map[macroarea]], log=args.log)

    for family in (
        DBSession.query(Languoid).filter(Languoid.level == LanguoidLevel.family).filter(Language.active == True)
    ):
        mas = []
        for lang in DBSession.query(TreeClosureTable.child_pk).filter(TreeClosureTable.parent_pk == family.pk):
            if lang[0] in lang_map:
                mas.extend(lang_map[lang[0]].macroareas)
        update_relationship(family.macroareas, mas, log=args.log)
    print "macroareas done"
Example #24
def main(args):
    # determine if we run on a machine where other databases are available for lookup
    # locally:
    data = Data()
    genera = get_genera(data) if astroman else {}
    glottocodes, lnames, geocoords = {}, {}, {}
    if astroman:
        for k, v in glottocodes_by_isocode(
                'postgresql://robert@/glottolog3',
                cols=['id', 'name', 'latitude', 'longitude']).items():
            glottocodes[k] = v[0]
            lnames[k] = v[1]
            geocoords[k] = (v[2], v[3])

    refs = defaultdict(list)
    for row in get_rows(args, 'BibtexKey'):
        if row[1] == 'NO SOURCE GIVEN':
            refs[row[0]] = []
        else:
            refs[row[0]].append(row[1])
    add_sources(args, data)

    dataset = data.add(
        common.Dataset,
        'phoible',
        id='phoible',
        name='PHOIBLE Online',
        description='PHOIBLE Online',
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        domain='phoible.org',
        license='http://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name':
            'Creative Commons Attribution-ShareAlike 3.0 Unported License'
        })

    for i, spec in enumerate([
        ('moran', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
        ('wright', "Richard Wright"),
    ]):
        DBSession.add(
            common.Editor(dataset=dataset,
                          ord=i + 1,
                          contributor=common.Contributor(id=spec[0],
                                                         name=spec[1])))

    squibs = defaultdict(list)
    for row in get_rows(args, 'Squib'):
        squibs[row[0]].append(row[1])

    source_urls = dict(get_rows(args, 'URL'))
    ia_urls = dict(get_rows(args, 'InternetArchive'))

    aggregated = list(
        reader(args.data_file('phoible-aggregated.tsv'), namedtuples=True))
    inventory_names = {}
    for key, items in groupby(sorted(aggregated,
                                     key=lambda t: (t.LanguageCode, t.Source)),
                              key=lambda t: (t.LanguageCode, t.Source)):
        items = list(items)

        lname = lnames.get(key[0])
        if not lname:
            lname = items[0].LanguageName
            lnames[key[0]] = lname

        if len(items) == 1:
            inventory_names[items[0].InventoryID] = '%s (%s)' % (lname, key[1])
        else:
            for i, item in enumerate(items):
                inventory_names[item.InventoryID] = '%s %s (%s)' % (lname, i +
                                                                    1, key[1])

    family_map = {
        ("Arawakan", "arwk"): "Arawakan",
        ("Trans-New Guinea", "trng"): "Trans-New Guinea",
        ("Moklen", "anes"): "Austronesian",
        ("Oko", "ncon"): "Niger-Congo",
        ("Muniche", "saso"): "Muniche",
        ("Tinigua", "saso"): "Tinigua",
        ("Vilela", "luvi"): "Vilela",
        ("Ofayé", "macg"): "Kamakanan",
        ("Purian", "macg"): "PurianPrint",
        ("Mixed language", "saml"): "Mixed language",
        ("Tupian", "tupi"): "Tupian",
        ("Yuwana", "saun"): "YuwanaPrint",
    }
    family_code_map = {k[1]: v for k, v in family_map.items()}

    for row in aggregated:
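        # First pass: create the variety (with genus and language codes),
        # contributor and inventory objects for this row.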
        lang = data['Variety'].get(row.LanguageCode)
        if not lang:
            if row.LanguageFamilyGenus == 'UNCLASSIFIED':
                genus = None
            else:
                genus_id = slug(strip_quotes(row.LanguageFamilyGenus))
                genus = genera.get(genus_id)
                if not genus:
                    genus = genera.get(row.LanguageCode)
                    if not genus:
                        #print(row.LanguageFamilyGenus, row.LanguageFamilyRoot)
                        family = family_map.get(
                            (row.LanguageFamilyGenus, row.LanguageFamilyRoot))
                        genus = genera[genus_id] = data.add(
                            models.Genus,
                            genus_id,
                            id=genus_id,
                            name=row.LanguageFamilyGenus,
                            description=family or row.LanguageFamilyRoot,
                            active=False,
                            root=row.LanguageFamilyRoot)

                if not genus.root:
                    genus.root = row.LanguageFamilyRoot

                if genus.description in family_code_map:
                    genus.description = family_code_map[genus.description]

            if row.LanguageCode in geocoords:
                coords = geocoords[row.LanguageCode]
            elif row.Latitude != 'NULL' and row.Longitude != 'NULL':
                coords = (float(row.Latitude), float(row.Longitude))
            else:
                # Guard against rows with no usable coordinates.
                coords = (None, None)
            lang = data.add(models.Variety,
                            row.LanguageCode,
                            id=row.LanguageCode,
                            name=lnames[row.LanguageCode],
                            genus=genus,
                            country=strip_quotes(row.Country),
                            area=strip_quotes(row.Area),
                            latitude=coords[0],
                            longitude=coords[1],
                            jsondata=dict(inventory_id=row.InventoryID))
            add_language_codes(data,
                               lang,
                               row.LanguageCode,
                               glottocodes=glottocodes)

        contributor = data['Contributor'].get(row.Source)
        if not contributor:
            contributor = data.add(common.Contributor,
                                   row.Source,
                                   id=row.Source,
                                   name=SOURCES[row.Source][0],
                                   description=SOURCES[row.Source][2])
            for ref in SOURCES[row.Source][1]:
                DBSession.add(
                    models.ContributorReference(source=data['Source'][ref],
                                                contributor=contributor))

        contrib = data.add(models.Inventory,
                           row.InventoryID,
                           id=row.InventoryID,
                           language=lang,
                           source=row.Source,
                           source_url=source_urls.get(row.InventoryID),
                           internetarchive_url=ia_urls.get(row.InventoryID),
                           name=inventory_names[row.InventoryID],
                           description=row.LanguageName)

        DBSession.add(
            common.ContributionContributor(contribution=contrib,
                                           contributor=contributor))

        for j, squib in enumerate(squibs.get(row.InventoryID, [])):
            f = common.Contribution_files(object=contrib,
                                          id='squib-%s-%s.pdf' %
                                          (contrib.id, j + 1),
                                          name='Phonological squib',
                                          description=squib,
                                          mime_type='application/pdf')
            assert f
            # f.create(files_dir, file(args.data_file('phonological_squibs', src)).read())

    DBSession.flush()
    unknown_refs = {}

    for row in reader(args.data_file('phoible-phonemes.tsv'),
                      namedtuples=True):
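        # Second pass: one ValueSet/Value per phoneme row, creating Segment
        # parameters on first sight.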
        inventory = data['Inventory'][row.InventoryID]
        segment = data['Segment'].get(row.Phoneme)
        if not segment:
            unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme]
            description = ' - '.join([t[1] for t in unicode_desc])
            segment = data.add(
                models.Segment,
                row.Phoneme,
                id=b16encode(md5(description).digest()),
                name=row.Phoneme,
                description=description,
                equivalence_class=''.join([
                    t[0] for t in unicode_desc
                    if t[1].split()[0] not in ['COMBINING', 'MODIFIER']
                ]),
                segment_class=row.Class,
                combined_class=row.CombinedClass)
            DBSession.flush()

        vs = common.ValueSet(id=row.PhonemeID,
                             contribution=inventory,
                             language=inventory.language,
                             parameter=segment)

        for ref in refs.get(row.InventoryID, []):
            if ref not in data['Source']:
                if ref not in unknown_refs:
                    print('-------', ref)
                unknown_refs[ref] = 1
                continue
            DBSession.add(
                common.ValueSetReference(source=data['Source'][ref],
                                         valueset=vs))

        DBSession.add(
            common.Value(
                id=row.PhonemeID,
                name='%s %s' %
                (row.Phoneme, data['Inventory'][row.InventoryID].name),
                valueset=vs))
        DBSession.flush()

    for inventory_id in refs:
        for ref in refs[inventory_id]:
            if ref not in data['Source']:
                continue
            data.add(common.ContributionReference,
                     '%s-%s' % (inventory_id, ref),
                     source=data['Source'][ref],
                     contribution=data['Inventory'][inventory_id])

    for i, row in enumerate(
            reader(args.data_file('phoible-segments-features.tsv'))):
        if i == 0:
            features = list(map(feature_name, row))
            continue

        if row[0] not in data['Segment']:
            # print('skipping feature vector:', row)
            continue
        for j, value in enumerate(row):
            if j and value != '0':
                DBSession.add(
                    common.Parameter_data(
                        key=features[j],
                        value=value,
                        ord=j,
                        object_pk=data['Segment'][row[0]].pk))
    DBSession.flush()
Example #25
def import_dataset(path, data, icons):
    # look for metadata
    # look for sources
    # then loop over values
    dirpath, fname = os.path.split(path)
    basename, ext = os.path.splitext(fname)
    glottolog = Glottolog()

    contrib = Contribution(id=basename, name=basename)

    md = {}
    mdpath = path + '-metadata.json'
    if os.path.exists(mdpath):
        md = jsonload(mdpath)
    contributor_name = HumanName(md.get('contributed_datapoint', 'Team NTS'))
    contributor_id = slug(contributor_name.last + contributor_name.first)
    contributor = data['Contributor'].get(contributor_id)
    if not contributor:
        contributor = data.add(
            Contributor,
            contributor_id,
            id=contributor_id,
            name='%s' % contributor_name)
    DBSession.add(ContributionContributor(contribution=contrib, contributor=contributor))

    bibpath = os.path.join(dirpath, basename + '.bib')
    if os.path.exists(bibpath):
        for rec in Database.from_file(bibpath):
            if rec['key'] not in data['Source']:
                data.add(Source, rec['key'], _obj=bibtex2source(rec))

    languages = {f['properties']['glottocode']: f for f in md.get('features', [])}

    # One row per (language, feature) datapoint; the delimiter is inferred
    # from the file extension.
    for i, row in enumerate(reader(path, dicts=True, quoting=csv.QUOTE_NONE, delimiter=',' if 'c' in ext else '\t')):
        if not row['Value'] or not row['Feature_ID']:
            continue
        vsid = '%s-%s-%s' % (basename, row['Language_ID'], row['Feature_ID'])
        vid = row.get('ID', '%s-%s' % (basename, i + 1))

        parameter = data['Feature'].get(row['Feature_ID'])
        if parameter is None:
            print('skip value for invalid feature %s' % row['Feature_ID'])
            continue
            #parameter = data.add(
            #    Feature, row['Feature_ID'], id=row['Feature_ID'], name=row.get('Feature', row['Feature_ID']))

        language = data['GrambankLanguage'].get(row['Language_ID'])
        if language is None:
            # query glottolog!
            languoid = glottolog.languoid(row['Language_ID'])
            gl_md = {
                'name': languoid.name,
                'longitude': languoid.longitude,
                'latitude': languoid.latitude}
            lmd = languages.get(row['Language_ID'])
            if lmd:
                if lmd.get('properties', {}).get('name'):
                    gl_md['name'] = lmd['properties']['name']
                if lmd.get('geometry', {}).get('coordinates'):
                    gl_md['longitude'], gl_md['latitude'] = lmd['geometry']['coordinates']

            language = data.add(
                GrambankLanguage, row['Language_ID'],
                id=row['Language_ID'],
                name=gl_md['name'],
                latitude=gl_md.get('latitude'),
                longitude=gl_md.get('longitude'))

        vs = data['ValueSet'].get(vsid)
        if vs is None:
            vs = data.add(
                ValueSet, vsid,
                id=vsid,
                parameter=parameter,
                language=language,
                contribution=contrib,
                source=row['Source'])

        domain = {de.abbr: de for de in parameter.domain}
        name = row['Value']
        if name in domain:
            name = domain[name].name

        Value(
            id=vid,
            valueset=vs,
            name=name,
            description=row['Comment'],
            domainelement=domain.get(row['Value']))

        for key, src in data['Source'].items():
            if key in vs.source:
                ValueSetReference(valueset=vs, source=src, key=key)
Example #26
def main(args):
    data = Data()

    editors = OrderedDict()
    editors['Susanne Maria Michaelis'] = None
    editors['Philippe Maurer'] = None
    editors['Martin Haspelmath'] = None
    editors['Magnus Huber'] = None

    for row in read(args, 'People'):
        name = row['First name'] + ' ' if row['First name'] else ''
        name += row['Last name']
        kw = dict(
            name=name,
            id=slug('%(Last name)s%(First name)s' % row),
            url=row['Contact Website'].split()[0]
            if row['Contact Website'] else None,
            address=row['Comments on database'],
        )
        contrib = data.add(common.Contributor, row['Author ID'], **kw)
        if kw['name'] in editors:
            editors[kw['name']] = contrib

    DBSession.flush()

    dataset = common.Dataset(
        id='apics',
        name='APiCS Online',
        description='Atlas of Pidgin and Creole Language Structures Online',
        domain='apics-online.info',
        published=date(2013, 11, 4),
        license='http://creativecommons.org/licenses/by/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 3.0 Unported License'
        })
    DBSession.add(dataset)
    for i, editor in enumerate(editors.values()):
        common.Editor(dataset=dataset, contributor=editor, ord=i + 1)

    colors = dict(
        (row['ID'], row['RGB_code']) for row in read(args, 'Colours'))

    abbrs = {}
    for id_, name in LGR_ABBRS.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))
        abbrs[id_] = 1

    for id_, name in {
            'CLIT': 'clitic',
            'IMPF': 'imperfect',
            'INTERM': 'intermediate',
            'NCOMPL': 'noncompletive',
            'NONFUT': 'nonfuture',
            'NPROX': 'nonproximal',
            'NSG': 'nonsingular',
            'PP': 'past participle',
            'PROP': 'proprietive',
            'TMA': 'tense-mood-aspect',
    }.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))
        abbrs[id_] = 1

    for row in reader(args.data_file('non-lgr-gloss-abbrs.csv'),
                      delimiter=',',
                      namedtuples=True):
        for match in GLOSS_ABBR_PATTERN.finditer(row.standard):
            if match.group('abbr') not in abbrs:
                abbrs[match.group('abbr')] = 1
                DBSession.add(
                    common.GlossAbbreviation(id=match.group('abbr'),
                                             name=row.meaning))

    non_bibs = {}
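    # References of type 'Non-bib' are kept as plain strings, not Source records.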
    for row in read(args, 'References', 'Reference_ID'):
        if row['Reference_type'] == 'Non-bib':
            non_bibs[row['Reference_ID']] = row['Reference_name']
            continue

        if isinstance(row['Year'], int):
            year_int = row['Year']
            year = str(row['Year'])
        elif row['Year']:
            year_int = None
            for m in re.finditer('(?P<year>(1|2)[0-9]{3})', row['Year']):
                year_int = int(m.group('year'))
                break
            year = row['Year']
        else:
            year, year_int = None, None

        title = row['Article_title'] or row['Book_title']
        attrs = {}
        jsondata = {}
        for attr, field in {
                'Additional_information': 'note',
                'Article_title': 'title',
                'Book_title': 'booktitle',
                'City': 'address',
                'Editors': 'editor',
                'Full_reference': None,
                'Issue': None,
                'Journal': 'journal',
                'Language_codes': None,
                'LaTeX_cite_key': None,
                'Pages': 'pages',
                'Publisher': 'publisher',
                'Reference_type': 'type',
                'School': 'school',
                'Series_title': 'series',
                'URL': 'url',
                'Volume': 'volume',
        }.items():
            value = row.get(attr)
            if not isinstance(value, int):
                value = (value or '').strip()
            if attr == 'Issue' and value:
                try:
                    value = str(int(value))
                except ValueError:
                    pass
            if value:
                if field:
                    attrs[field] = value
                else:
                    jsondata[attr] = value
        p = data.add(common.Source,
                     row['Reference_ID'],
                     id=str(row['Reference_ID']),
                     name=row['Reference_name'],
                     description=title,
                     author=row['Authors'],
                     year=year,
                     year_int=year_int,
                     bibtex_type=getattr(EntryType, row['BibTeX_type']
                                         or 'misc'),
                     jsondata=jsondata,
                     **attrs)
        if p.bibtex_type.value == 'misc' and not p.description:
            p.description = p.note
        DBSession.flush()

    DBSession.flush()

    infobox = jsonload(args.data_file('infobox.json'))
    glottocodes = jsonload(args.data_file('glottocodes.json'))
    for row in read(args, 'Languages', 'Order_number'):
        lon, lat = [
            float(c.strip()) for c in row['map_coordinates'].split(',')
        ]
        kw = dict(
            name=row['Language_name'],
            id=str(row['Order_number']),
            latitude=lat,
            longitude=lon,
            region=row['Category_region'],
        )
        lect = data.add(models.Lect, row['Language_ID'], **kw)
        DBSession.flush()

        for i, item in enumerate(infobox[lect.id]):
            DBSession.add(
                common.Language_data(object_pk=lect.pk,
                                     ord=i,
                                     key=item[0],
                                     value=item[1]))

        if row["Languages_contribution_documentation::Lect_description_checked_status"] \
                != "Checked":
            print('unchecked! ---', row['Language_name'])

        desc = row.get(
            'Languages_contribution_documentation::Lect description', '')
        markup_desc = normalize_markup(row[
            'Languages_contribution_documentation::z_calc_GetAsCSS_Lect_description']
                                       )

        c = data.add(
            models.ApicsContribution,
            row['Language_ID'],
            id=str(row['Order_number']),
            name=row['Language_name'],
            description=desc,
            markup_description=markup_desc,
            survey_reference=data['Source'][row['Survey_reference_ID']],
            language=lect)

        for ext, label, mtype in [
            ('pdf', 'Glossed text', 'application/pdf'),
            ('mp3', 'Glossed text audio', 'audio/mpeg'),
        ]:
            fid = '%s-gt.%s' % (c.id, ext)
            if args.data_file('files', 'contribution', c.id, fid).exists():
                common.Contribution_files(object=c,
                                          id=fid,
                                          name=label,
                                          mime_type=mtype)
            else:
                print(label, 'missing for:', row['Language_name'])

        #
        # TODO: for michif, 75, add link http://www.youtube.com/watch?v=f0C4cODsSyE
        #

        iso = None
        if row['ISO_code'] and len(row['ISO_code']) == 3:
            iso = row['ISO_code'].lower()
            if 'iso:%s' % row['ISO_code'] not in data['Identifier']:
                data.add(common.Identifier,
                         'iso:%s' % row['ISO_code'],
                         id=row['ISO_code'].lower(),
                         name=row['ISO_code'].lower(),
                         type=common.IdentifierType.iso.value)

            DBSession.add(
                common.LanguageIdentifier(
                    language=data['Lect'][row['Language_ID']],
                    identifier=data['Identifier']['iso:%s' % row['ISO_code']]))

        if lect.id in glottocodes:
            identifier = data.add(common.Identifier,
                                  'gc:%s' % glottocodes[lect.id],
                                  id=glottocodes[lect.id],
                                  name=glottocodes[lect.id],
                                  type=common.IdentifierType.glottolog.value)

            DBSession.add(
                common.LanguageIdentifier(
                    language=data['Lect'][row['Language_ID']],
                    identifier=identifier))

        if row['Language_name_ethnologue']:
            if row['Language_name_ethnologue'] not in data['Identifier']:
                data.add(common.Identifier,
                         row['Language_name_ethnologue'],
                         id=iso
                         or 'ethnologue:%s' % row['Language_name_ethnologue'],
                         name=row['Language_name_ethnologue'],
                         type='ethnologue')

            DBSession.add(
                common.LanguageIdentifier(
                    language=data['Lect'][row['Language_ID']],
                    identifier=data['Identifier'][
                        row['Language_name_ethnologue']]))

    example_count = {}
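    # Import IGT examples, tracking the highest example number per language.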
    for row in read(args, 'Examples', 'Order_number'):
        assert row['Language_ID']
        lang = data['Lect'][row['Language_ID']]
        id_ = '%(Language_ID)s-%(Example_number)s' % row
        atext, gloss = igt(row)
        example_count[row['Language_ID']] = max(
            [example_count.get(row['Language_ID'], 1), row['Example_number']])
        p = add_sentence(
            args,
            data,
            id_,
            id='%s-%s' % (lang.id, row['Example_number']),
            name=row['Text'] or row['Analyzed_text'],
            description=row['Translation'],
            type=row['Type'].strip().lower() if row['Type'] else None,
            comment=row['Comments'],
            gloss=gloss,
            analyzed=atext,
            markup_text=normalize_markup(row['z_calc_Text_CSS']),
            markup_gloss=normalize_markup(row['z_calc_Gloss_CSS']),
            markup_comment=normalize_markup(row['z_calc_Comments_CSS']),
            markup_analyzed=normalize_markup(row['z_calc_Analyzed_text_CSS']),
            original_script=row['Original_script'],
            jsondata={
                'sort': row['Order_number'],
                'alt_translation': (row['Translation_other'] or '').strip()
                or None
            },
            language=lang)

        if row['Reference_ID']:
            if row['Reference_ID'] in data['Source']:
                source = data['Source'][row['Reference_ID']]
                DBSession.add(
                    common.SentenceReference(
                        sentence=p,
                        source=source,
                        key=source.id,
                        description=row['Reference_pages']))
            else:
                p.source = non_bibs[row['Reference_ID']]

    DBSession.flush()

    for row in read(args, 'Language_references'):
        if row['Reference_ID'] not in data['Source']:
            assert row['Reference_ID'] in non_bibs
            continue
        assert row['Language_ID'] in data['ApicsContribution']
        source = data['Source'][row['Reference_ID']]
        DBSession.add(
            common.ContributionReference(
                contribution=data['ApicsContribution'][row['Language_ID']],
                source=source,
                description=row['Pages'],
                key=source.id))

    #
    # global counter for features - across feature types
    #
    feature_count = 0
    for row in read(args, 'Features', 'Feature_number'):
        id_ = str(row['Feature_number'])
        if int(id_) > feature_count:
            feature_count = int(id_)
        wals_id = None
        desc = row['Feature_annotation_publication']
        if row['WALS_match'] == 'Total':
            if isinstance(row['WALS_No.'], int):
                wals_id = row['WALS_No.']
            else:
                wals_id = int(row['WALS_No.'].split('.')[0].strip())

        p = data.add(models.Feature,
                     row['Feature_code'],
                     name=row['Feature_name'],
                     id=id_,
                     description=desc,
                     markup_description=normalize_markup(
                         row['z_calc_Feature_annotation_publication_CSS']),
                     feature_type='primary',
                     multivalued=row['Value_relation_type'] != 'Single',
                     area=row['Feature_area'],
                     wals_id=wals_id)

        names = {}
        for i in range(1, 10):
            if not row['Value%s_publication' % i] \
                    or not row['Value%s_publication' % i].strip():
                continue
            name = row['Value%s_publication' % i].strip()
            if name in names:
                name += ' (%s)' % i
            names[name] = 1
            de = data.add(
                common.DomainElement,
                '%s-%s' % (row['Feature_code'], i),
                id='%s-%s' % (id_, i),
                name=name,
                parameter=p,
                abbr=row['Value%s_for_book_maps' % i] if p.id != '0' else name,
                number=int(row['Value%s_value_number_for_publication' % i]),
                jsondata={'color': colors[row['Value_%s_colour_ID' % i]]},
            )
            assert de

        if row['Authors_FeatureArticles']:
            authors, _ = row['Authors_FeatureArticles'].split('and the APiCS')
            authors = authors.strip()
            if authors.endswith(','):
                authors = authors[:-1].strip()
            for i, name in enumerate(authors.split(',')):
                assert name.strip() in editors
                p._authors.append(
                    models.FeatureAuthor(ord=i + 1,
                                         contributor=editors[name.strip()]))

        DBSession.flush()

    primary_to_segment = {123: 63, 126: 35, 128: 45, 130: 41}
    segment_to_primary = dict(
        zip(primary_to_segment.values(), primary_to_segment.keys()))
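    # A few primary features double as segment features; map ids both ways.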
    number_map = {}
    names = {}
    for row in read(args, 'Segment_features', 'Order_number'):
        symbol = row['Segment_symbol']
        if row['Segment_name'] == 'voiceless dental/alveolar sibilant affricate':
            symbol = 't\u0361s'
        truth = lambda s: s and s.strip().lower() == 'yes'
        name = '%s - %s' % (symbol, row['Segment_name'])

        if name in names:
            number_map[row['Segment_feature_number']] = names[name]
            continue

        number_map[
            row['Segment_feature_number']] = row['Segment_feature_number']
        names[name] = row['Segment_feature_number']
        feature_count += 1
        if row['Segment_feature_number'] in segment_to_primary:
            primary_to_segment[segment_to_primary[row['Segment_feature_number']]]\
                = str(feature_count)
        p = data.add(models.Feature,
                     row['Segment_feature_number'],
                     name=name,
                     id=str(feature_count),
                     feature_type='segment',
                     area='Vowels' if truth(row['Vowel']) else
                     ('Obstruent consonants'
                      if truth(row['Obstruent']) else 'Sonorant consonants'),
                     jsondata=dict(
                         number=int(row['Segment_feature_number']),
                         vowel=truth(row['Vowel']),
                         consonant=truth(row['Consonant']),
                         obstruent=truth(row['Obstruent']),
                         core_list=truth(row['Core_list_segment']),
                         symbol=symbol,
                     ))

        for i, spec in SEGMENT_VALUES.items():
            data.add(common.DomainElement,
                     '%s-%s' % (row['Segment_feature_number'], spec[0]),
                     id='%s-%s' % (p.id, i),
                     name=spec[0],
                     parameter=p,
                     jsondata={'color': spec[1]},
                     number=i)

    print('--> remapped:', primary_to_segment)
    DBSession.flush()

    for row in read(args, 'Sociolinguistic_features',
                    'Sociolinguistic_feature_number'):
        feature_count += 1
        p = data.add(models.Feature,
                     row['Sociolinguistic_feature_code'],
                     name=row['Sociolinguistic_feature_name'],
                     id='%s' % feature_count,
                     description=row['Sociolinguistic_feature_annotation'],
                     area='Sociolinguistic',
                     feature_type='sociolinguistic')

        names = {}

        for i in range(1, 10):
            id_ = '%s-%s' % (row['Sociolinguistic_feature_code'], i)
            if row.get('Value%s' % i) and row['Value%s' % i].strip():
                name = row['Value%s' % i].strip()
                if name in names:
                    name += ' (%s)' % i
                names[name] = 1
            else:
                continue
            data.add(common.DomainElement,
                     id_,
                     id='%s-%s' % (p.id, i),
                     name=name,
                     parameter=p,
                     number=i,
                     jsondata={
                         'color':
                         colors.get(row['Value%s_colour_ID' % i],
                                    list(colors.values())[i])
                     })

    sd = {}
    for row in read(args, 'Segment_data'):
        if row['Segment_feature_number'] not in number_map:
            continue
        number = number_map[row['Segment_feature_number']]

        if not row['Presence_in_the_language']:
            continue

        lang = data['Lect'][row['Language_ID']]
        param = data['Feature'][number]
        id_ = '%s-%s' % (lang.id, param.id)
        if id_ in sd:
            assert row['c_Record_is_a_duplicate'] == 'Yes'
            continue
        sd[id_] = 1
        valueset = data.add(
            common.ValueSet,
            id_,
            id=id_,
            parameter=param,
            language=lang,
            contribution=data['ApicsContribution'][row['Language_ID']],
            description=row['Comments'],
            markup_description=normalize_markup(row['z_calc_Comments_CSS']),
        )
        v = data.add(
            common.Value,
            id_,
            id=id_,
            frequency=100.0,
            valueset=valueset,
            domainelement=data['DomainElement'][
                '%s-%s' % (number, row['Presence_in_the_language'])],
        )
        if row['Example_word'] and row['Example_word_gloss']:
            example_count[row['Language_ID']] += 1
            p = add_sentence(args,
                             data,
                             '%s-p%s' % (lang.id, data['Feature'][number].id),
                             id='%s-%s' %
                             (lang.id, example_count[row['Language_ID']]),
                             name=row['Example_word'],
                             description=row['Example_word_gloss'],
                             language=lang)
            DBSession.add(common.ValueSentence(value=v, sentence=p))

        source = data['Source'].get(row['Refers_to_references_Reference_ID'])
        if source:
            DBSession.add(
                common.ValueSetReference(valueset=valueset,
                                         source=source,
                                         key=source.id))
        elif row['Refers_to_references_Reference_ID'] in non_bibs:
            valueset.source = non_bibs[
                row['Refers_to_references_Reference_ID']]

    lects = defaultdict(lambda: 1)
    lect_map = {}
    records = {}
    false_values = {}
    no_values = {}
    wals_value_number = {}
    for row in read(args, 'wals'):
        if row['z_calc_WALS_value_number']:
            wals_value_number[
                row['Data_record_id']] = row['z_calc_WALS_value_number']

    def prefix(attr, _prefix):
        if _prefix:
            return '%s_%s' % (_prefix, attr)
        return attr.capitalize()
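    # e.g. prefix('data', '') -> 'Data' (capitalized base name), while
    # prefix('data', 'Sociolinguistic') -> 'Sociolinguistic_data'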

    for _prefix, abbr in [('', ''), ('Sociolinguistic', 'sl')]:
        num_values = 10
        for row in read(args, prefix('data', _prefix)):
            if not row[prefix('feature_code', _prefix)]:
                print('no associated feature for', prefix('data', _prefix),
                      row[prefix('data_record_id', _prefix)])
                continue

            lid = row['Language_ID']
            lect_attr = row.get('Lect_attribute', 'my default lect').lower()
            if lect_attr != 'my default lect':
                if (row['Language_ID'], row['Lect_attribute']) in lect_map:
                    lid = lect_map[(row['Language_ID'], row['Lect_attribute'])]
                else:
                    lang = data['Lect'][row['Language_ID']]
                    c = lects[row['Language_ID']]
                    lid = '%s-%s' % (row['Language_ID'], c)
                    kw = dict(
                        name='%s (%s)' % (lang.name, row['Lect_attribute']),
                        id='%s' % (1000 + 10 * int(lang.id) + c),
                        latitude=lang.latitude,
                        longitude=lang.longitude,
                        description=row['Lect_attribute'],
                        language=lang,
                    )
                    data.add(models.Lect, lid, **kw)
                    lects[row['Language_ID']] += 1
                    lect_map[(row['Language_ID'], row['Lect_attribute'])] = lid

            id_ = abbr + str(row[prefix('data_record_id', _prefix)])
            assert id_ not in records
            records[id_] = 1

            assert row[prefix('feature_code', _prefix)] in data['Feature']
            language = data['Lect'][lid]
            parameter = data['Feature'][row[prefix('feature_code', _prefix)]]
            valueset = common.ValueSet(
                id='%s-%s' % (language.id, parameter.id),
                description=row['Comments_on_value_assignment'],
                markup_description=normalize_markup(
                    row.get('z_calc_Comments_on_value_assignment_CSS')),
            )

            values_found = {}
            for i in range(1, num_values):
                if not row['Value%s_true_false' % i]:
                    continue

                if row['Value%s_true_false' % i].strip().lower() != 'true':
                    assert row['Value%s_true_false' %
                               i].strip().lower() == 'false'
                    false_values[row[prefix('data_record_id', _prefix)]] = 1
                    continue

                iid = '%s-%s' % (row[prefix('feature_code', _prefix)], i)
                if iid not in data['DomainElement']:
                    print(iid, row[prefix('data_record_id',
                                          _prefix)], '--> no domainelement!')
                    continue
                values_found['%s-%s' % (id_, i)] = dict(
                    id='%s-%s' % (valueset.id, i),
                    domainelement=data['DomainElement']['%s-%s' % (row[prefix(
                        'feature_code', _prefix)], i)],
                    confidence=row['Value%s_confidence' % i],
                    frequency=float(row['c_V%s_frequency_normalised' %
                                        i]) if _prefix == '' else 100)

            if values_found:
                if row[prefix('data_record_id', _prefix)] in wals_value_number:
                    valueset.jsondata = {
                        'wals_value_number':
                        wals_value_number.pop(row[prefix(
                            'data_record_id', _prefix)])
                    }
                valueset.parameter = parameter
                valueset.language = language
                valueset.contribution = data['ApicsContribution'][
                    row['Language_ID']]
                valueset = data.add(common.ValueSet, id_, _obj=valueset)
                for i, item in enumerate(values_found.items()):
                    if i > 0 and not parameter.multivalued:
                        print('multiple values for single-valued parameter: %s' % id_)
                        break
                    id_, kw = item
                    kw['valueset'] = valueset
                    value = data.add(common.Value, id_, **kw)

                #
                # store references to additional data for segments which should be reused
                # for corresponding primary features!
                #
                if int(parameter.id) in primary_to_segment:
                    assert len(values_found) == 1
                    seg_id = '%s-%s' % (language.id, primary_to_segment[int(
                        parameter.id)])
                    seg_valueset = data['ValueSet'][seg_id]
                    seg_value = data['Value'][seg_id]
                    if not valueset.description and seg_valueset.description:
                        valueset.description = seg_valueset.description

                    for s in seg_value.sentence_assocs:
                        DBSession.add(
                            common.ValueSentence(value=value,
                                                 sentence=s.sentence))

                    for r in seg_valueset.references:
                        DBSession.add(
                            common.ValueSetReference(valueset=valueset,
                                                     source=r.source,
                                                     key=r.key))

                    if not valueset.source and seg_valueset.source:
                        valueset.source = seg_valueset.source

                DBSession.flush()
            else:
                no_values[id_] = 1

    DBSession.flush()

    for prefix, abbr, num_values in [
        ('D', '', 10),
        ('Sociolinguistic_d', 'sl', 7),
    ]:
        for row in read(args, prefix + 'ata_references'):
            assert row['Reference_ID'] in data['Source'] \
                or row['Reference_ID'] in non_bibs
            try:
                vs = data['ValueSet'][abbr +
                                      str(row[prefix + 'ata_record_id'])]
                if row['Reference_ID'] in data['Source']:
                    source = data['Source'][row['Reference_ID']]
                    DBSession.add(
                        common.ValueSetReference(
                            valueset=vs,
                            source=source,
                            key=source.id,
                            description=row['Pages'],
                        ))
                else:
                    if vs.source:
                        vs.source += '; ' + non_bibs[row['Reference_ID']]
                    else:
                        vs.source = non_bibs[row['Reference_ID']]
            except KeyError:
                continue

    DBSession.flush()

    missing = 0
    for row in read(args, 'Value_examples'):
        try:
            DBSession.add(
                common.ValueSentence(
                    value=data['Value']['%(Data_record_id)s-%(Value_number)s' %
                                        row],
                    sentence=data['Sentence'][
                        '%(Language_ID)s-%(Example_number)s' % row],
                    description=row['Notes'],
                ))
        except KeyError:
            missing += 1
    print('%s Value_examples are missing data' % missing)

    print('%s data sets with false values' % len(false_values))
    print('%s data sets without values' % len(no_values))

    for k, v in wals_value_number.items():
        print('unclaimed wals value number:', k, v)

    for i, row in enumerate(read(args, 'Contributors')):
        kw = dict(contribution=data['ApicsContribution'][row['Language ID']],
                  contributor=data['Contributor'][row['Author ID']])
        if row['Order_of_appearance']:
            kw['ord'] = int(float(row['Order_of_appearance']))
        data.add(common.ContributionContributor, i, **kw)

    DBSession.flush()
Example no. 27
def get_rows(args, name):
    for i, row in enumerate(reader(args.data_file('InventoryID-%s.csv' %
                                                  name))):
        if i and row[1] != 'NA':
            yield row
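
A minimal usage sketch, assuming an args object providing data_file as above:
the generator skips the csv header row (i == 0) and any row whose second
column is 'NA'.

for row in get_rows(args, 'BibtexKey'):
    inventory_id, ref = row[0], row[1]  # first two columns of the csv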
Example no. 28
def justifications(args, languages):
    """
    - text goes into ValueSet.description
    - refs go into ValueSetReference objects
    """

    def normalized_pages(s):
        if PAGES_PATTERN.match(s or ""):
            return s or ""

    #
    # create mappings to look up glottolog languoids matching names in justification files
    #
    langs_by_hid = languages
    langs_by_hname = {}
    langs_by_name = {}
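    # query inactive languoids first, then active ones, so entries for active
    # languoids overwrite those of obsolete ones in the lookup maps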

    for l in DBSession.query(Languoid).filter(Languoid.active == False):
        langs_by_hname[l.jsondatadict.get("hname")] = l
        langs_by_hid[l.hid] = l
        langs_by_name[l.name] = l

    for l in DBSession.query(Languoid).filter(Languoid.active == True):
        langs_by_hname[l.jsondatadict.get("hname")] = l
        langs_by_hid[l.hid] = l
        langs_by_name[l.name] = l

    for id_, type_ in [("fc", "family"), ("sc", "subclassification")]:
        for i, row in enumerate(dsv.reader(args.data_file("%s_justifications.tab" % type_))):
            name = row[0]
            name = name.replace("_", " ") if not name.startswith("NOCODE") else name
            l = langs_by_hname.get(name, langs_by_hid.get(name, langs_by_name.get(name)))
            if not l:
                args.log.warn("ignoring %s" % name)
                continue

            _r = 3 if type_ == "family" else 2
            comment = (row[_r].strip() or None) if len(row) > _r else None
            if comment and not WORD_PATTERN.search(comment):
                comment = None

            #
            # TODO: look for [NOCODE_ppp] patterns as well!?
            #

            refs = [(int(m.group("id")), normalized_pages(m.group("comment"))) for m in REF_PATTERN.finditer(row[2])]

            vs = None
            for _vs in l.valuesets:
                if _vs.parameter.id == id_:
                    vs = _vs
                    break

            if not vs:
                args.log.info("%s %s ++" % (l.id, type_))
                vs = ValueSet(
                    id="%s%s" % (type_, l.id),
                    description=comment,
                    language=l,
                    parameter=Parameter.get(id_),
                    contribution=Contribution.first(),
                )
                DBSession.add(Value(id="%s%s" % (type_, l.id), name="%s - %s" % (l.level, l.status), valueset=vs))
                DBSession.flush()
            else:
                if vs.description != comment:
                    args.log.info("%s %s ~~ description" % (l.id, type_))
                    vs.description = comment

            for r in vs.references:
                DBSession.delete(r)

            for r, pages in refs:
                vs.references.append(ValueSetReference(source=Source.get(str(r)), description=pages))

        args.log.info("%s %s" % (i, type_))
Example no. 29
def get_vs2008(args):  # pragma: no cover
    vs2008 = {}
    for row in reader(args.data_file('datapoints_2008.csv'), delimiter=','):
        vs2008[(row[0], '%sA' % row[1])] = int(row[2])
    return vs2008
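
The mapping is keyed by (WALS language code, feature id) pairs, with the 2008
value number as value; a sketch of a lookup with hypothetical codes:

vs2008 = get_vs2008(args)
# vs2008[('eng', '81A')] -> 2; the 'A' suffix is appended to the numeric
# feature id read from the csv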
Example no. 30
def prime_cache(args):
    """
    we use a versioned session to insert the changes in value assignment
    """
    #
    # compute the changes from 2008 to 2011:
    #
    vs2008 = get_vs2008(args)
    for row in DB.execute("select * from datapoint"):
        key = (row['language_id'], row['feature_id'])
        old_value = vs2008.get(key)
        new_value = row['value_numeric']
        if old_value and old_value != new_value:
            valueset = VersionedDBSession.query(common.ValueSet)\
                .join(common.Language)\
                .join(common.Parameter)\
                .filter(common.Parameter.id == row['feature_id'])\
                .filter(common.Language.id == row['language_id'])\
                .one()
            value = valueset.values[0]
            assert value.domainelement.number == old_value
            for de in valueset.parameter.domain:
                if de.number == new_value:
                    value.domainelement = de
                    break
            assert value.domainelement.number == new_value
            valueset.updated = E2011
            value.updated = E2011
            VersionedDBSession.flush()

    for row in reader(args.data_file('corrections_2013.tab'), namedtuples=True, newline='\r'):
        valueset = VersionedDBSession.query(common.ValueSet)\
            .join(common.Language)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == row.feature)\
            .filter(common.Language.id == row.wals_code)\
            .one()
        value = valueset.values[0]

        if value.domainelement.number == int(row.new):
            print('**** old news', valueset.language.id, valueset.parameter.id)
            continue

        if value.domainelement.number != int(row.old):
            print('--->', valueset.language.id, valueset.parameter.id, value.domainelement.number)
        for de in valueset.parameter.domain:
            if de.number == int(row.new):
                value.domainelement = de
                break
        assert value.domainelement.number == int(row.new)
        valueset.updated = E2013
        value.updated = E2013
        VersionedDBSession.flush()
    print('corrections 2013 done')

    for issue in ['0', '9', '10', '11', '13', '14', '15', '16', '17', '19', '20', '24', '26', '27', '28']:
        issue = getattr(issues, 'issue' + issue)
        issue(VersionedDBSession, E2013)
        VersionedDBSession.flush()
        transaction.commit()
        transaction.begin()

    #
    # TODO: these must be recomputed as well, after migrations!
    #
    # cache number of languages for a parameter:
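    # note: itertools.groupby only groups consecutive items, so ordering the
    # query by parameter_pk is what makes each parameter appear exactly once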
    for parameter, valuesets in groupby(
            DBSession.query(common.ValueSet).order_by(common.ValueSet.parameter_pk),
            lambda vs: vs.parameter):
        parameter.representation = str(len(set(v.language_pk for v in valuesets)))
    print('recomputation of representation done')
    transaction.commit()
    transaction.begin()

    # cache iso codes for languages:
    for language in DBSession.query(common.Language).options(joinedload_all(
        common.Language.languageidentifier, common.LanguageIdentifier.identifier
    )):
        iso_codes = []
        for identifier in language.identifiers:
            if identifier.type == common.IdentifierType.iso.value:
                iso_codes.append(identifier.name)
        language.iso_codes = ', '.join(sorted(set(iso_codes)))
    print('recomputation of iso codes done')
    transaction.commit()
    transaction.begin()

    compute_language_sources()
    transaction.commit()
    transaction.begin()

    gbs_func('update', args)
Example no. 31
def get_vs2008(args):
    vs2008 = {}
    for row in reader(args.data_file('datapoints_2008.csv'), delimiter=','):
        vs2008[(row[0], '%sA' % row[1])] = int(row[2])
    return vs2008
Example no. 32
def main(args):
    # determine if we run on a machine where other databases are available for lookup
    # locally:
    data = Data()
    genera = get_genera(data) if astroman else {}
    glottocodes, lnames, geocoords = {}, {}, {}
    if astroman:
        for k, v in glottocodes_by_isocode(
                'postgresql://robert@/glottolog3',
                cols=['id', 'name', 'latitude', 'longitude']).items():
            glottocodes[k] = v[0]
            lnames[k] = v[1]
            geocoords[k] = (v[2], v[3])

    refs = defaultdict(list)
    for row in get_rows(args, 'BibtexKey'):
        if row[1] == 'NO SOURCE GIVEN':
            refs[row[0]] = []
        else:
            refs[row[0]].append(row[1])
    add_sources(args, data)

    dataset = data.add(
        common.Dataset, 'phoible',
        id='phoible',
        name='PHOIBLE Online',
        description='PHOIBLE Online',
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        domain='phoible.org',
        license='http://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name':
                'Creative Commons Attribution-ShareAlike 3.0 Unported License'})

    for i, spec in enumerate([
        ('moran', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
        ('wright', "Richard Wright"),
    ]):
        DBSession.add(common.Editor(
            dataset=dataset,
            ord=i + 1,
            contributor=common.Contributor(id=spec[0], name=spec[1])))

    squibs = defaultdict(list)
    for row in get_rows(args, 'Squib'):
        squibs[row[0]].append(row[1])

    source_urls = dict(get_rows(args, 'URL'))
    ia_urls = dict(get_rows(args, 'InternetArchive'))

    aggregated = list(reader(args.data_file('phoible-aggregated.tsv'), namedtuples=True))
    inventory_names = {}
    for key, items in groupby(
            sorted(aggregated, key=lambda t: (t.LanguageCode, t.Source)),
            key=lambda t: (t.LanguageCode, t.Source)):
        items = list(items)

        lname = lnames.get(key[0])
        if not lname:
            lname = items[0].LanguageName
            lnames[key[0]] = lname

        if len(items) == 1:
            inventory_names[items[0].InventoryID] = '%s (%s)' % (lname, key[1])
        else:
            for i, item in enumerate(items):
                inventory_names[item.InventoryID] = '%s %s (%s)' % (lname, i + 1, key[1])

    family_map = {
        ("Arawakan", "arwk"): "Arawakan",
        ("Trans-New Guinea", "trng"): "Trans-New Guinea",
        ("Moklen", "anes"): "Austronesian",
        ("Oko", "ncon"): "Niger-Congo",
        ("Muniche", "saso"): "Muniche",
        ("Tinigua", "saso"): "Tinigua",
        ("Vilela", "luvi"): "Vilela",
        ("Ofayé", "macg"): "Kamakanan",
        ("Purian", "macg"): "PurianPrint",
        ("Mixed language", "saml"): "Mixed language",
        ("Tupian", "tupi"): "Tupian",
        ("Yuwana", "saun"): "YuwanaPrint",
    }
    family_code_map = {k[1]: v for k, v in family_map.items()}
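    # keyed by the family root code alone, e.g. family_code_map['arwk'] == 'Arawakan'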

    for row in aggregated:
        lang = data['Variety'].get(row.LanguageCode)
        if not lang:
            if row.LanguageFamilyGenus == 'UNCLASSIFIED':
                genus = None
            else:
                genus_id = slug(strip_quotes(row.LanguageFamilyGenus))
                genus = genera.get(genus_id)
                if not genus:
                    genus = genera.get(row.LanguageCode)
                    if not genus:
                        #print(row.LanguageFamilyGenus, row.LanguageFamilyRoot)
                        family = family_map.get(
                            (row.LanguageFamilyGenus, row.LanguageFamilyRoot))
                        genus = genera[genus_id] = data.add(
                            models.Genus, genus_id,
                            id=genus_id,
                            name=row.LanguageFamilyGenus,
                            description=family or row.LanguageFamilyRoot,
                            active=False,
                            root=row.LanguageFamilyRoot)

                if not genus.root:
                    genus.root = row.LanguageFamilyRoot

                if genus.description in family_code_map:
                    genus.description = family_code_map[genus.description]

            if row.LanguageCode in geocoords:
                coords = geocoords[row.LanguageCode]
            elif row.Latitude != 'NULL' and row.Longitude != 'NULL':
                coords = (float(row.Latitude), float(row.Longitude))
            else:
                # assumption: fall back to missing coordinates instead of
                # hitting a NameError when neither source provides them
                coords = (None, None)
            lang = data.add(
                models.Variety, row.LanguageCode,
                id=row.LanguageCode,
                name=lnames[row.LanguageCode],
                genus=genus,
                country=strip_quotes(row.Country),
                area=strip_quotes(row.Area),
                latitude=coords[0],
                longitude=coords[1],
                jsondata=dict(inventory_id=row.InventoryID))
            add_language_codes(data, lang, row.LanguageCode, glottocodes=glottocodes)

        contributor = data['Contributor'].get(row.Source)
        if not contributor:
            contributor = data.add(
                common.Contributor, row.Source,
                id=row.Source,
                name=SOURCES[row.Source][0],
                description=SOURCES[row.Source][2])
            for ref in SOURCES[row.Source][1]:
                DBSession.add(models.ContributorReference(
                    source=data['Source'][ref], contributor=contributor))

        contrib = data.add(
            models.Inventory, row.InventoryID,
            id=row.InventoryID,
            language=lang,
            source=row.Source,
            source_url=source_urls.get(row.InventoryID),
            internetarchive_url=ia_urls.get(row.InventoryID),
            name=inventory_names[row.InventoryID],
            description=row.LanguageName)

        DBSession.add(common.ContributionContributor(
            contribution=contrib, contributor=contributor))

        for j, squib in enumerate(squibs.get(row.InventoryID, [])):
            f = common.Contribution_files(
                object=contrib,
                id='squib-%s-%s.pdf' % (contrib.id, j + 1),
                name='Phonological squib',
                description=squib,
                mime_type='application/pdf')
            assert f
            # f.create(files_dir, file(args.data_file('phonological_squibs', src)).read())

    DBSession.flush()
    unknown_refs = {}

    for row in reader(args.data_file('phoible-phonemes.tsv'), namedtuples=True):
        inventory = data['Inventory'][row.InventoryID]
        segment = data['Segment'].get(row.Phoneme)
        if not segment:
            unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme]
            description = ' - '.join([t[1] for t in unicode_desc])
            segment = data.add(
                models.Segment, row.Phoneme,
                id=b16encode(md5(description.encode('utf8')).digest()).decode('ascii'),
                name=row.Phoneme,
                description=description,
                equivalence_class=''.join(
                    [t[0] for t in unicode_desc
                     if t[1].split()[0] not in ['COMBINING', 'MODIFIER']]),
                segment_class=row.Class,
                combined_class=row.CombinedClass)
            DBSession.flush()

        vs = common.ValueSet(
            id=row.PhonemeID,
            contribution=inventory,
            language=inventory.language,
            parameter=segment)

        for ref in refs.get(row.InventoryID, []):
            if ref not in data['Source']:
                if ref not in unknown_refs:
                    print('-------', ref)
                unknown_refs[ref] = 1
                continue
            DBSession.add(common.ValueSetReference(
                source=data['Source'][ref],
                valueset=vs))

        DBSession.add(common.Value(
            id=row.PhonemeID,
            name='%s %s' % (row.Phoneme, data['Inventory'][row.InventoryID].name),
            valueset=vs))
        DBSession.flush()

    for inventory_id in refs:
        for ref in refs[inventory_id]:
            if ref not in data['Source']:
                continue
            data.add(
                common.ContributionReference, '%s-%s' % (inventory_id, ref),
                source=data['Source'][ref],
                contribution=data['Inventory'][inventory_id])

    for i, row in enumerate(reader(args.data_file('phoible-segments-features.tsv'))):
        if i == 0:
            features = list(map(feature_name, row))
            continue

        if row[0] not in data['Segment']:
            # print('skipping feature vector:', row)
            continue
        for j, value in enumerate(row):
            if j and value != '0':
                DBSession.add(common.Parameter_data(
                    key=features[j],
                    value=value,
                    ord=j,
                    object_pk=data['Segment'][row[0]].pk))
    DBSession.flush()
Example no. 33
def prime_cache(args):  # pragma: no cover
    """
    we use a versioned session to insert the changes in value assignment
    """
    #
    # compute the changes from 2008 to 2011:
    #
    vs2008 = get_vs2008(args)
    for row in DB.execute("select * from datapoint"):
        key = (row['language_id'], row['feature_id'])
        old_value = vs2008.get(key)
        new_value = row['value_numeric']
        if old_value and old_value != new_value:
            valueset = VersionedDBSession.query(common.ValueSet)\
                .join(common.Language)\
                .join(common.Parameter)\
                .filter(common.Parameter.id == row['feature_id'])\
                .filter(common.Language.id == row['language_id'])\
                .one()
            value = valueset.values[0]
            assert value.domainelement.number == old_value
            for de in valueset.parameter.domain:
                if de.number == new_value:
                    value.domainelement = de
                    break
            assert value.domainelement.number == new_value
            valueset.updated = E2011
            value.updated = E2011
            VersionedDBSession.flush()

    for row in reader(args.data_file('corrections_2013.tab'),
                      namedtuples=True,
                      newline='\r'):
        valueset = VersionedDBSession.query(common.ValueSet)\
            .join(common.Language)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == row.feature)\
            .filter(common.Language.id == row.wals_code)\
            .one()
        value = valueset.values[0]

        if value.domainelement.number == int(row.new):
            print('**** old news', valueset.language.id, valueset.parameter.id)
            continue

        if value.domainelement.number != int(row.old):
            print('--->', valueset.language.id, valueset.parameter.id,
                  value.domainelement.number)
        for de in valueset.parameter.domain:
            if de.number == int(row.new):
                value.domainelement = de
                break
        assert value.domainelement.number == int(row.new)
        valueset.updated = E2013
        value.updated = E2013
        VersionedDBSession.flush()
    print('corrections 2013 done')

    for issue in [
            '0', '9', '10', '11', '13', '14', '15', '16', '17', '19', '20',
            '24', '26', '27', '28'
    ]:
        issue = getattr(issues, 'issue' + issue)
        issue(VersionedDBSession, E2013)
        VersionedDBSession.flush()
        transaction.commit()
        transaction.begin()

    #
    # TODO: these must be recomputed as well, after migrations!
    #
    # cache number of languages for a parameter:
    for parameter, valuesets in groupby(
            DBSession.query(common.ValueSet).order_by(
                common.ValueSet.parameter_pk), lambda vs: vs.parameter):
        parameter.representation = str(
            len(set(v.language_pk for v in valuesets)))
    print('recomputation of representation done')
    transaction.commit()
    transaction.begin()

    # cache iso codes for languages:
    for language in DBSession.query(common.Language).options(
            joinedload_all(common.Language.languageidentifier,
                           common.LanguageIdentifier.identifier)):
        iso_codes = []
        for identifier in language.identifiers:
            if identifier.type == common.IdentifierType.iso.value:
                iso_codes.append(identifier.name)
        language.iso_codes = ', '.join(sorted(set(iso_codes)))
    print('recomputation of iso codes done')
    transaction.commit()
    transaction.begin()

    compute_language_sources()
    transaction.commit()
    transaction.begin()

    gbs_func('update', args)
Example no. 34
def justifications(args, languages, stats):
    """
    - text goes into ValueSet.description
    - refs go into ValueSetReference objects
    """
    hh_bibkey_to_glottolog_id = {}
    for rec in get_bib(args):
        for provider, bibkeys in get_bibkeys(rec).items():
            if provider == 'hh':
                for bibkey in bibkeys:
                    hh_bibkey_to_glottolog_id[bibkey] = rec['glottolog_ref_id']
                break

    def substitute_hh_bibkeys(m):
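        # rewrite a '**<hh bibkey>**' reference as '**<glottolog_ref_id>**',
        # so downstream REF_PATTERN matching sees numeric glottolog ids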
        return '**%s**' % hh_bibkey_to_glottolog_id[m.group('bibkey')]

    #
    # create mappings to look up glottolog languoids matching names in justification files
    #
    langs_by_hid = languages
    langs_by_hname = {}
    langs_by_name = {}

    # order by active to make sure active languoids overwrite the data of obsolete ones.
    for l in DBSession.query(Languoid).order_by(Languoid.active):
        langs_by_hname[l.jsondata.get('hname')] = l
        langs_by_hid[l.hid] = l
        langs_by_name[l.name] = l

    def normalize_pages(s):
        return (s or '').strip().rstrip(',') or None
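        # e.g. normalize_pages(' 12-15, ') -> '12-15'; empty or None input -> None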

    for id_, type_ in [('fc', 'family'), ('sc', 'subclassification')]:
        for i, row in enumerate(dsv.reader(
                args.data_dir.joinpath('languoids', 'forkel_%s_justifications-utf8.tab' % type_))):
            name = row[0]
            name = name.replace('_', ' ') if not name.startswith('NOCODE') else name
            l = langs_by_hname.get(name, langs_by_hid.get(name, langs_by_name.get(name)))
            if not l:
                args.log.warn('ignoring %s' % name)
                continue

            _r = 3 if type_ == 'family' else 2
            comment = (row[_r].strip() or None) if len(row) > _r else None
            if comment and not WORD_PATTERN.search(comment):
                comment = None
            if comment:
                comment = re.sub(r'\*\*(?P<bibkey>[^\*]+)\*\*', substitute_hh_bibkeys, comment)

            #
            # TODO: look for [NOCODE_ppp] patterns as well!?
            #

            refs = [(int(m.group('id')), normalize_pages(m.group('pages')))
                    for m in REF_PATTERN.finditer(
                    re.sub(r'\*\*(?P<bibkey>[^\*]+)\*\*', substitute_hh_bibkeys, row[2]))]

            vs = None
            for _vs in l.valuesets:
                if _vs.parameter.id == id_:
                    vs = _vs
                    break

            if not vs:
                args.log.info('%s %s ++' % (l.id, type_))
                vs = ValueSet(
                    id='%s%s' % (id_, l.pk),
                    description=comment,
                    language=l,
                    parameter=Parameter.get(id_),
                    contribution=Contribution.first())
                DBSession.add(Value(
                    id='%s%s' % (id_, l.pk),
                    name='%s - %s' % (l.level, l.status),
                    valueset=vs))
                DBSession.flush()
            else:
                if vs.description != comment:
                    args.log.info('%s %s ~~ description: %s ---> %s' % (l.id, type_, vs.description, comment))
                    vs.description = comment
                    stats.update(['justifications-%s' % type_])

            for r in vs.references:
                DBSession.delete(r)

            for r, pages in refs:
                # FIXME: we must make sure not to link sources which will subsequently be
                # replaced!
                vs.references.append(ValueSetReference(
                    source=Source.get(str(r)),
                    description=pages))

        args.log.info('%s %s' % (i, type_))
Example no. 35
def get_vs2008(args):  # pragma: no cover
    vs2008 = {}
    for row in reader(args.data_file("datapoints_2008.csv"), delimiter=","):
        vs2008[(row[0], "%sA" % row[1])] = int(row[2])
    return vs2008
Example no. 36
def read(table):
    return list(dsv.reader(
        args.data_file(table + '.csv'), delimiter=',', namedtuples=True))
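
A usage sketch with a hypothetical table and column name; note that args is a
free variable expected from the enclosing scope:

rows = read('languages')
# each row is a namedtuple, so columns are attributes, e.g. rows[0].Name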
Example no. 37
def get_rows(args, name):
    for i, row in enumerate(reader(args.data_file('InventoryID-%s.csv' % name))):
        if i and row[1] != 'NA':
            yield row
Example no. 38
def prime_cache(args):  # pragma: no cover
    """
    we use a versioned session to insert the changes in value assignment
    """
    #
    # compute the changes from 2008 to 2011:
    #
    vs2008 = get_vs2008(args)
    for row in DB.execute("select * from datapoint"):
        key = (row["language_id"], row["feature_id"])
        old_value = vs2008.get(key)
        new_value = row["value_numeric"]
        if old_value and old_value != new_value:
            valueset = (
                VersionedDBSession.query(common.ValueSet)
                .join(common.Language)
                .join(common.Parameter)
                .filter(common.Parameter.id == row["feature_id"])
                .filter(common.Language.id == row["language_id"])
                .one()
            )
            value = valueset.values[0]
            assert value.domainelement.number == old_value
            for de in valueset.parameter.domain:
                if de.number == new_value:
                    value.domainelement = de
                    break
            assert value.domainelement.number == new_value
            valueset.updated = E2011
            value.updated = E2011
            VersionedDBSession.flush()

    for row in reader(args.data_file("corrections_2013.tab"), namedtuples=True, newline="\r"):
        valueset = (
            VersionedDBSession.query(common.ValueSet)
            .join(common.Language)
            .join(common.Parameter)
            .filter(common.Parameter.id == row.feature)
            .filter(common.Language.id == row.wals_code)
            .one()
        )
        value = valueset.values[0]

        if value.domainelement.number == int(row.new):
            print("**** old news", valueset.language.id, valueset.parameter.id)
            continue

        if value.domainelement.number != int(row.old):
            print("--->", valueset.language.id, valueset.parameter.id, value.domainelement.number)
        for de in valueset.parameter.domain:
            if de.number == int(row.new):
                value.domainelement = de
                break
        assert value.domainelement.number == int(row.new)
        valueset.updated = E2013
        value.updated = E2013
        VersionedDBSession.flush()
    print("corrections 2013 done")

    for issue in ["0", "9", "10", "11", "13", "14", "15", "16", "17", "19", "20", "24", "26", "27", "28"]:
        issue = getattr(issues, "issue" + issue)
        issue(VersionedDBSession, E2013)
        VersionedDBSession.flush()
        transaction.commit()
        transaction.begin()

    #
    # TODO: these must be recomputed as well, after migrations!
    #
    # cache number of languages for a parameter:
    for parameter, valuesets in groupby(
        DBSession.query(common.ValueSet).order_by(common.ValueSet.parameter_pk), lambda vs: vs.parameter
    ):
        parameter.representation = str(len(set(v.language_pk for v in valuesets)))
    print("recomputation of representation done")
    transaction.commit()
    transaction.begin()

    # cache iso codes for languages:
    for language in DBSession.query(common.Language).options(
        joinedload_all(common.Language.languageidentifier, common.LanguageIdentifier.identifier)
    ):
        iso_codes = []
        for identifier in language.identifiers:
            if identifier.type == common.IdentifierType.iso.value:
                iso_codes.append(identifier.name)
        language.iso_codes = ", ".join(sorted(set(iso_codes)))
    print("ecomputation of iso codes done")
    transaction.commit()
    transaction.begin()

    compute_language_sources()
    transaction.commit()
    transaction.begin()

    gbs_func("update", args)
Example no. 39
def get_tab(name):
    """generator for entries in a tab file specified by name.
    """
    return dsv.reader(get(get_taburls()[name]).split('\n'), namedtuples=True)
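
A usage sketch with a hypothetical tab name; get, get_taburls, and dsv.reader
are assumed from the surrounding module:

for entry in get_tab('languoids'):
    print(entry)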