Beispiel #1
0
 def load_sources(self, dictionary, data):
     if self.bib:
         for rec in self.bib.records:
             src = bibtex2source(rec, models.DictionarySource)
             src.dictionary = dictionary
             src.id = '%s-%s' % (self.id, src.id)
             data.add(models.DictionarySource, rec.id, _obj=bibtex2source(rec))
def add_sources(sources_file_path, session):
    """
    Creates and adds to the given SQLAlchemy session the common.Source model
    instances that comprise the project's references. Expects the path to a
    bibtex file as its first argument.
    Returns a dict containing the added model instances with the bibtex IDs
    being the keys.
    Helper for the main function.
    """
    d = {}

    bibtex_db = bibtex.Database.from_file(sources_file_path, encoding='utf-8')
    seen = set()

    for record in bibtex_db:

        if record.id in seen:
            continue

        d[record.id] = bibtex2source(record)
        session.add(d[record.id])
        seen.add(record.id)

    session.flush()

    return d
Beispiel #3
0
def add_sources(args, data):
    bib = Database.from_file(args.data_file('phoible-references.bib'),
                             lowercase=True)
    ext = [
        Record.from_string('@' + s, lowercase=True)
        for s in nfilter(BIB.split('@'))
    ]

    for rec in chain(ext, bib):
        if rec.id not in data['Source']:
            data.add(Source, rec.id, _obj=bibtex2source(rec))

    #
    # add aliases to lookup records with bibtex keys with numeric prefixes without
    # specifying the prefix
    #
    for key in list(data['Source'].keys()):
        if '_' in key:
            no, rem = key.split('_', 1)
            try:
                int(no)
                if rem not in data['Source']:
                    data['Source'][rem] = data['Source'][key]
            except (ValueError, TypeError):
                pass
Beispiel #4
0
def test_bibtex2source():
    from clld.scripts.util import bibtex2source

    bibtex2source(Record('book', 'id', author='M, R and G, H and Z, U'))
    bibtex2source(Record('book', 'id', editor='M, R and G, H'))
    bibtex2source(Record('book', 'id', title='tb', customfield='cf', year="1920}"))
    assert bibtex2source(Record('misc', 'Id', title='title')).id == 'Id'
Beispiel #5
0
def test_bibtex2source():
    from clld.scripts.util import bibtex2source

    bibtex2source(Record('book', 'id', author='M, R and G, H and Z, U'))
    bibtex2source(Record('book', 'id', editor='M, R and G, H'))
    bibtex2source(Record('book', 'id', title='tb', customfield='cf', year="1920}"))
    assert bibtex2source(Record('misc', 'Id', title='title')).id == 'Id'
Beispiel #6
0
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=cognition.__name__,
        name="COSTATOL",
        description="Cognitive Structures across the Tree of Life",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='cognition.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    DBSession.add(dataset)

    #
    # TODO: add editors!
    #

    for rec in Database.from_file(args.data_file('sources.bib')):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    contrib = common.Contribution(id='costatol', name='COSTATOL')
    for datapoint in reader(args.data_file('values.csv'), delimiter=',', dicts=True):
        param = data['Parameter'].get(datapoint['cognitive capacity'])
        if not param:
            name = datapoint['cognitive capacity']
            param = data.add(common.Parameter, name, id=slug(name), name=name)

        species = data['Language'].get(datapoint['species'])
        if not species:
            name = datapoint['species']
            species = data.add(common.Language, name, id=slug(name), name=name)

        vid = '%s-%s' % (species.id, param.id)
        vs = data.add(
            common.ValueSet,
            vid,
            id=vid,
            language=species,
            parameter=param,
            contribution=contrib)
        data.add(common.Value, vid, id=vid, name=datapoint['value'], valueset=vs)
        match = source_pattern.match(datapoint['source'])
        if match:
            DBSession.add(common.ValueSetReference(
                valueset=vs,
                source=data['Source'][match.group('key')],
                description=match.group('pages')))

    for species in reader(args.data_file('species.csv'), delimiter=',', namedtuples=True):
        data['Language'][species.name].longitude = species.longitude
        data['Language'][species.name].latitude = species.latitude
Beispiel #7
0
def add_sources(sources_file_path, session):
    """
    Creates and adds to the given SQLAlchemy session the common.Source model
    instances that comprise the project's references. Expects the path to a
    bibtex file as its first argument.

    Helper for the main function.
    """
    bibtex_db = bibtex.Database.from_file(sources_file_path, encoding='utf-8')

    for record in bibtex_db:
        session.add(bibtex2source(record))
Beispiel #8
0
def add_sources(args, data):
    bib = Database.from_file(args.data_file('phoible-references.bib'), lowercase=True)
    ext = [Record.from_string('@' + s, lowercase=True) for s in nfilter(BIB.split('@'))]

    for rec in chain(ext, bib):
        if rec.id not in data['Source']:
            data.add(Source, rec.id, _obj=bibtex2source(rec))

    #
    # add aliases to lookup records with bibtex keys with numeric prefixes without
    # specifying the prefix
    #
    for key in list(data['Source'].keys()):
        if '_' in key:
            no, rem = key.split('_', 1)
            try:
                int(no)
                if rem not in data['Source']:
                    data['Source'][rem] = data['Source'][key]
            except (ValueError, TypeError):
                pass
Beispiel #9
0
def main(args):
    data = Data()
    data_path = lambda *cs: args.data_file('concepticon-data',
                                           'concepticondata', *cs)

    dataset = common.Dataset(
        id=concepticon.__name__,
        name="Concepticon 1.0",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        contact='*****@*****.**',
        domain='concepticon.clld.org',
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        })
    DBSession.add(dataset)
    for i, name in enumerate(
        ['Johann-Mattis List', 'Michael Cysouw', 'Robert Forkel']):
        c = common.Contributor(id=slug(name), name=name)
        dataset.editors.append(common.Editor(contributor=c, ord=i))

    english = data.add(common.Language, 'eng', id='eng', name='English')

    files = {}
    for fname in data_path('sources').iterdir():
        files[fname.stem] = \
            "https://github.com/clld/concepticon-data/blob/master/concepticondata/sources/%s" % fname.name

    for rec in Database.from_file(data_path('references', 'references.bib'),
                                  lowercase=True):
        source = data.add(common.Source, rec.id, _obj=bibtex2source(rec))
        if rec.id in files:
            DBSession.flush()
            DBSession.add(
                common.Source_files(mime_type='application/pdf',
                                    object_pk=source.pk,
                                    jsondata=dict(url=files[rec.id])))

    for concept in reader(data_path('concepticon.tsv'), namedtuples=True):
        data.add(models.ConceptSet,
                 concept.ID,
                 id=concept.ID,
                 name=concept.GLOSS,
                 description=concept.DEFINITION,
                 semanticfield=concept.SEMANTICFIELD,
                 ontological_category=concept.ONTOLOGICAL_CATEGORY)

    for rel in reader(data_path('conceptrelations.tsv'), namedtuples=True):
        DBSession.add(
            models.Relation(source=data['ConceptSet'][rel.SOURCE],
                            target=data['ConceptSet'][rel.TARGET],
                            description=rel.RELATION))

    unmapped = Counter()
    number_pattern = re.compile('(?P<number>[0-9]+)(?P<suffix>.*)')

    for cl in reader(data_path('conceptlists.tsv'), dicts=True):
        concepts = data_path('conceptlists', '%(ID)s.tsv' % cl)
        if not concepts.exists():
            continue
        langs = [l.lower() for l in split(cl['SOURCE_LANGUAGE'])]
        conceptlist = data.add(
            models.Conceptlist,
            cl['ID'],
            id=cl['ID'],
            name=' '.join(cl['ID'].split('-')),
            description=cl['NOTE'],
            target_languages=cl['TARGET_LANGUAGE'],
            source_languages=' '.join(langs),
            year=int(cl['YEAR']) if cl['YEAR'] else None,
        )
        for id_ in split(cl['REFS']):
            common.ContributionReference(source=data['Source'][id_],
                                         contribution=conceptlist)
        for i, name in enumerate(split(cl['AUTHOR'], sep=' and ')):
            name = strip_braces(name)
            contrib = data['Contributor'].get(name)
            if not contrib:
                contrib = data.add(common.Contributor,
                                   name,
                                   id=slug(name),
                                   name=name)
            DBSession.add(
                common.ContributionContributor(ord=i,
                                               contribution=conceptlist,
                                               contributor=contrib))
        for k in 'ID NOTE TARGET_LANGUAGE SOURCE_LANGUAGE YEAR REFS AUTHOR'.split(
        ):
            del cl[k]
        DBSession.flush()
        for k, v in cl.items():
            DBSession.add(
                common.Contribution_data(object_pk=conceptlist.pk,
                                         key=k,
                                         value=v))

        for concept in reader(concepts, namedtuples=True):
            if not concept.ID or not concept.CONCEPTICON_ID or concept.CONCEPTICON_ID == 'NAN':
                #print conceptlist.id, getattr(concept, 'ENGLISH', getattr(concept, 'GLOSS', None))
                unmapped.update([conceptlist.id])
                continue

            lgs = {}
            for lang in langs:
                v = getattr(concept, lang.upper())
                if v:
                    lgs[lang] = v

            match = number_pattern.match(concept.NUMBER)
            if not match:
                print(concept.ID)
                raise ValueError
            vs = common.ValueSet(
                id=concept.ID,
                description=getattr(concept, 'GLOSS',
                                    getattr(concept, 'ENGLISH', None)),
                language=english,
                contribution=conceptlist,
                parameter=data['ConceptSet'][concept.CONCEPTICON_ID])
            d = {}
            for key, value in concept.__dict__.items():
                if not key.startswith('CONCEPTICON_') and \
                        key not in ['NUMBER', 'ID', 'GLOSS'] + [l.upper() for l in langs]:
                    d[key.lower()] = value
            v = models.Concept(
                id=concept.ID,
                valueset=vs,
                description=getattr(concept, 'GLOSS',
                                    None),  # our own gloss, if available
                name='; '.join('%s [%s]' % (lgs[l], l)
                               for l in sorted(lgs.keys())),
                number=int(match.group('number')),
                number_suffix=match.group('suffix'),
                jsondata=d)
            DBSession.flush()
            for key, value in lgs.items():
                DBSession.add(
                    common.Value_data(key='lang_' + key,
                                      value=value,
                                      object_pk=v.pk))

    print('Unmapped concepts:')
    for clid, no in unmapped.most_common():
        print(clid, no)

    for fname in data_path('concept_set_meta').iterdir():
        if fname.suffix == '.tsv':
            md = load(fname.parent.joinpath(fname.name + '-metadata.json'))
            provider = models.MetaProvider(id=fname.stem,
                                           name=md['dc:title'],
                                           description=md['dc:description'],
                                           url=md['dc:source'],
                                           jsondata=md)
            for meta in reader(fname, dicts=True):
                try:
                    for k, v in meta.items():
                        if v and k != 'CONCEPTICON_ID':
                            models.ConceptSetMeta(metaprovider=provider,
                                                  conceptset=data['ConceptSet']
                                                  [meta['CONCEPTICON_ID']],
                                                  key=k,
                                                  value=v)
                except:
                    print(fname)
                    print(meta)
                    raise
Beispiel #10
0
def main(args):
    if DBSession.bind.dialect.name == 'postgresql':
        Index('ducet', collkey(common.Value.name)).create(DBSession.bind)

    def data_file(*comps):
        return Path(args.data_repos).joinpath('tsammalexdata', 'data', *comps)

    data = Data()
    data.add(
        common.Dataset,
        'tsammalex',
        id="tsammalex",
        name="Tsammalex",
        description="Tsammalex: A lexical database on plants and animals",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        domain='tsammalex.clld.org',
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    data.add(common.Contribution, 'tsammalex', name="Tsammalex", id="tsammalex")

    for rec in Database.from_file(data_file('sources.bib'), lowercase=True):
        data.add(models.Bibrec, rec.id, _obj=bibtex2source(rec, cls=models.Bibrec))

    load_ecoregions(data_file, data)
    load_countries(data)
    second_languages = {}

    def languoid_visitor(lang, row, _):
        add_language_codes(
            data, lang, lang.id.split('-')[0], None, glottocode=row[2] or None)
        second_languages[row[0]] = row[8]

    def habitat_visitor(cat, *_):
        cat.is_habitat = True

    def taxon_visitor(auto, taxon, *_):
        if auto.get(taxon.id):
            update_taxon_data(taxon, auto[taxon.id], data)
        else:
            print('--> missing in taxa.json:', taxon.id, taxon.name)
        taxon.countries_str = ' '.join([e.id for e in taxon.countries])
        taxon.ecoregions_str = ' '.join([e.id for e in taxon.ecoregions])

    auto = {s['id']: s for s in jsonload(data_file('taxa.json'))}
    for model, kw in [
        (models.Lineage, {}),
        (models.Use, {}),
        (models.TsammalexContributor, {}),
        (models.Languoid, dict(visitor=languoid_visitor)),
        (models.Category, dict(name='categories')),
        (models.Category, dict(name='habitats', visitor=habitat_visitor)),
        (models.Taxon, dict(visitor=partial(taxon_visitor, auto))),
        (models.Name, dict(filter_=lambda r: 'xxx' not in r[1])),
    ]:
        from_csv(data_file, model, data, **kw)

    for key, ids in second_languages.items():
        target = data['Languoid'][key]
        for lid in models.split_ids(ids):
            if lid in data['Languoid']:
                # we ignore 2nd languages which are not yet in Tsammalex.
                target.second_languages.append(data['Languoid'][lid])

    def image_url(source_url, type_):
        return re.sub('\.[a-zA-Z]+$', '.jpg', source_url).replace(
            '/original/', '/%s/' % type_)

    for fname in data_files(data_file, 'images.csv'):

        for image in reader(fname, namedtuples=True, delimiter=","):
            if image.taxa__id not in data['Taxon']:
                continue

            url = URL(image.source_url)
            if url.host() != 'edmond.mpdl.mpg.de':
                continue

            jsondata = dict(
                url=image.source_url,
                thumbnail=image_url(image.source_url, 'thumbnail'),
                web=image_url(image.source_url, 'web'))

            f = common.Parameter_files(
                object=data['Taxon'][image.taxa__id],
                id=image.id,
                name=image.tags,
                jsondata=jsondata,
                mime_type=image.mime_type)
            for k in 'source creator date place comments permission'.split():
                v = getattr(image, k)
                if v:
                    models.ImageData(key=k, value=v, image=f)
Beispiel #11
0
def import_dataset(path,
                   data,
                   languoids,
                   invalid_features,
                   add_missing_features=False):
    # look for metadata
    # look for sources
    # then loop over values

    dirpath, fname = os.path.split(path)
    basename, ext = os.path.splitext(fname)

    contrib = GrambankContribution(id=basename,
                                   name=basename,
                                   desc=languoids[basename].name)

    md = {}
    mdpath = path + '-metadata.json'
    if os.path.exists(mdpath):
        md = jsonload(mdpath)
    contributor_name = HumanName(md.get('contributed_datapoint', 'Team NTS'))
    contributor_id = slug(contributor_name.last + contributor_name.first)
    contributor = data['Contributor'].get(contributor_id)
    if not contributor:
        contributor = data.add(Contributor,
                               contributor_id,
                               id=contributor_id,
                               name='%s' % contributor_name)
    DBSession.add(
        ContributionContributor(contribution=contrib, contributor=contributor))

    bibpath = os.path.join(dirpath, basename + '.bib')
    if os.path.exists(bibpath):
        for rec in Database.from_file(bibpath):
            if rec['key'] not in data['Source']:
                data.add(Source, rec['key'], _obj=bibtex2source(rec))

    languages = {
        f['properties']['glottocode']: f
        for f in md.get('features', [])
    }

    for i, row in enumerate(
            reader(path,
                   dicts=True,
                   quoting=csv.QUOTE_NONE,
                   delimiter=',' if 'c' in ext else '\t')):
        if not row['Value'] or not row['Feature_ID']:
            continue
        vsid = '%s-%s-%s' % (basename, row['Language_ID'], row['Feature_ID'])
        vid = row.get('ID', '%s-%s' % (basename, i + 1))

        parameter = data['Feature'].get(row['Feature_ID'])
        if parameter is None:
            if add_missing_features:
                parameter = data.add(Feature,
                                     row['Feature_ID'],
                                     id=row['Feature_ID'],
                                     name=row.get('Feature',
                                                  row['Feature_ID']))
            else:
                invalid_features.update([row['Feature_ID']])
                continue

        language = data['GrambankLanguage'].get(row['Language_ID'])
        if language is None:
            languoid = languoids.get(row['Language_ID'])
            if languoid is None:
                print('Skipping, no Glottocode found for %s' %
                      row['Language_ID'])
                continue

            gl_md = {
                'name': languoid.name,
                'longitude': languoid.longitude,
                'latitude': languoid.latitude
            }
            lmd = languages.get(row['Language_ID'])
            if lmd:
                if lmd.get('properties', {}).get('name'):
                    gl_md['name'] = lmd['properties']['name']
                if lmd.get('geometry', {}).get('coordinates'):
                    gl_md['longitude'], gl_md['latitude'] = lmd['geometry'][
                        'coordinates']

            language = data.add(GrambankLanguage,
                                row['Language_ID'],
                                id=row['Language_ID'],
                                name=gl_md['name'],
                                latitude=gl_md.get('latitude'),
                                longitude=gl_md.get('longitude'))

        domain = {de.abbr: de for de in parameter.domain}
        if not domain.get(row['Value']):
            #print "skipped", row, "not in", domain
            continue

        vs = data['ValueSet'].get(vsid)
        if vs is None:
            vs = data.add(ValueSet,
                          vsid,
                          id=vsid,
                          parameter=parameter,
                          language=language,
                          contribution=contrib,
                          source=row['Source'])

        name = row['Value']
        if name in domain:
            name = domain[name].name

        data.add(Value,
                 vid,
                 id=vid,
                 valueset=vs,
                 name=name,
                 description=row['Comment'],
                 domainelement=domain.get(row['Value']))

        for key, src in data['Source'].items():
            if key in vs.source:
                ValueSetReference(valueset=vs, source=src, key=key)
Beispiel #12
0
def main(args):
    """
    The case is we have to codings for two different dialects (called hua and yagaria) of
    the same iso "qgr", both of which we want to keep and keep separately. I had missed
    that when making NTS, rigging everything so that the iso would be the id, which is not
    sufficient. Glottocodes in Grambank would have taken care of it except the dialect
    division for yaga1260 is wrong, having yagaria as overarching and Hua under it
    (reality has it that Hua and Yagaria are two dialects of the same language, which has
    no name). So a solution with glottocodes would have to wait until we fix that or need
    another fix later. So I guess, for now, let's ignore qgr (and its datapoints) and I'll
    fix on my end later.
    """
    data = Data(
        created=utc.localize(datetime(2013, 11, 15)),
        updated=utc.localize(datetime(2013, 12, 12)))
    icons = issues.Icons()

    dtab = partial(_dtab, args.data_file())

    #Languages
    tabfns = ['%s' % fn.name for fn in args.data_file().glob('nts_*.tab')]
    args.log.info("Sheets found: %s" % tabfns)
    ldps = []
    lgs = {}
    nfeatures = Counter()
    nlgs = Counter()

    for fn in tabfns:
        for ld in dtab(fn):
            if ld['language_id'] == 'qgr':
                continue
            if "feature_alphanumid" not in ld:
                args.log.info("NO FEATUREID %s %s" % (len(ld), ld))
            if not ld["feature_alphanumid"].startswith("DRS") \
                    and ld["feature_alphanumid"].find(".") == -1:
                ldps.append(dp_dict(ld))
                lgs[ld['language_id']] = unescape(ld['language_name'])
                if ld["value"] != "?":
                    nfeatures.update([ld['language_id']])
                    nlgs.update([ld['feature_alphanumid']])

    ldps = sorted(ldps, key=lambda d: d['feature_alphanumid'])

    lgs["ygr"] = "Hua"

    for lgid, lgname in lgs.items():
        data.add(
            models.ntsLanguage, lgid,
            id=lgid,
            name=lgname,
            representation=nfeatures.get(lgid, 0))
    DBSession.flush()

    load_families(data, [(NOCODE_TO_GLOTTOCODE.get(l.id, l.id), l) for l in data['ntsLanguage'].values()], isolates_icon='tcccccc')
    #glottolog = Glottolog()
    #for lg in data['ntsLanguage'].values():
    #    print lg.id, NOCODE_TO_GLOTTOCODE.get(lg.id, lg.id)
    #    gl_language = glottolog.languoid(NOCODE_TO_GLOTTOCODE.get(lg.id, lg.id))
    #    if not gl_language.family:
    #        family = data.add(Family, gl_language.id, id = gl_language.id, name = gl_language.name, description=common.Identifier(name=gl_language.id, type=common.IdentifierType.glottolog.value).url(), jsondata={"icon": 'tcccccc'})
    #        lg.family = family

    
    #Domains
    for domain in set(ld['feature_domain'] for ld in ldps):
        data.add(models.FeatureDomain, domain, name=domain)
    DBSession.flush()

    #Designers
    for i, info in enumerate(dtab("ntscontributions.tab") + dtab("ntscontacts.tab")):
        designer_id = str(i + 1)
        data.add(
            models.Designer, info['designer'],
            id=designer_id,
            name=designer_id,
            domain=info["domain"],
            contributor=info['designer'],
            pdflink=info["pdflink"],
            citation=info["citation"])
    DBSession.flush()

    #Sources
    for k, (typ, bibdata) in [
        ktfbib(bibsource) for ld in ldps
        if ld.get(u'bibsources') for bibsource in ld['bibsources'].split(",,,")
    ]:
        if k not in data["Source"]:
            data.add(common.Source, k, _obj=bibtex2source(Record(typ, k, **bibdata)))
    DBSession.flush()

    #Features
    fs = [(fid, mergeds(lds)) for fid, lds in
          groupby(ldps, key=lambda d: d['feature_alphanumid'])]

    fvdesc = [(fid, [(ld.get("feature_possible_values"), ld.get("fromfile")) for ld in lds if ld.get("feature_possible_values")]) for fid, lds in groupby(ldps, key=lambda d: d['feature_alphanumid'])]
    fvdt = [(fid, grp2(vdescs)) for (fid, vdescs) in fvdesc]
    fvmis = [(fid, vdescs) for (fid, vdescs) in fvdt if len(vdescs) > 1]
    for (fid, vdescs) in fvmis:
        print fid, "DIFF VDESC"
        for (vd, fromf) in vdescs:
            print vd, set(fromf)

    for _, dfsids in groupby(
            sorted((f.get('feature_name', fid), fid) for fid, f in fs),
            key=lambda t: t[0]):
        assert len(list(dfsids)) == 1

    for fid, f in fs:
        if not fid.isdigit():
            args.log.info("NO INT FID %s" % f)           
        feature = data.add(
            models.Feature, fid,
            id=fid,
            name=f.get('feature_name', f['feature_alphanumid']),
            doc=f.get('feature_information', ""),
            vdoc=f.get('feature_possible_values', ""),
            representation=nlgs.get(fid, 0),
            designer=data["Designer"][f['designer']],
            dependson=f.get("depends_on", ""),
            abbreviation=f.get("abbreviation", ""),
            featuredomain=data['FeatureDomain'][f["feature_domain"]],
            name_french=f.get('francais', ""),
            clarification=f.get("draft of clarifying comments to outsiders (hedvig + dunn + harald + suzanne)", ""),
            alternative_id=f.get("old feature number", ""),
            jl_relevant_unit=f.get("relevant unit(s)", ""),
            jl_function=f.get("function", ""),
            jl_formal_means=f.get("formal means", ""),
            sortkey_str="",
            sortkey_int=int(fid))

        vdesclist = [veq.split("==") for veq in feature.vdoc.split("||")]
        vdesc = {v.replace(".", "-"): desc for [v, desc] in vdesclist}
        vdesc.setdefault('?', 'Not known')
        if 'N/A' not in vdesc and feature.dependson:
            vdesc["N/A"] = "Not Applicable"
        vi = {v: i for (i, v) in enumerate(sorted(vdesc.keys()))}
        vicons = icons.iconize(vi.keys())
        for v, desc in vdesc.items():
            data.add(
                common.DomainElement, (fid, v),
                id='%s-%s' % (fid, v),
                name=v,
                description=desc,
                jsondata={"icon": vicons[v]},
                number=vi[v],
                parameter=feature)
    DBSession.flush()

    for ((f, lg), ixs) in grp2(
            [((ld['feature_alphanumid'], ld['language_id']), i)
             for i, ld in enumerate(ldps)]):
        ixvs = set([ldps[ix]['value'] for ix in ixs])
        if len(ixvs) == 1:
            continue
        args.log.warn(
            "Dup value %s %s %s" %
            (f, lg, [(ldps[ix]['value'], ldps[ix]['fromfile']) for ix in ixs]))
        print "Dup value %s %s %s" % (f, lg, [(ldps[ix]['value'], ldps[ix]['fromfile'], ldps[ix].get('provenance')) for ix in ixs])
    errors = {}
    done = set()
    for ld in ldps:
        parameter = data['Feature'][ld['feature_alphanumid']]
        language = data['ntsLanguage'][ld['language_id']]
        
        id_ = '%s-%s' % (parameter.id, language.id)
        if id_ in done:
            continue

        if (ld['feature_alphanumid'], ld['value']) not in data['DomainElement']:
            if not ld["value"].strip():
                continue
            info = (
                ld['feature_alphanumid'],
                ld.get('feature_name', "[Feature Name Lacking]"),
                ld['language_id'],
                ld['value'],
                ld['fromfile'])
            msg = u"%s %s %s %s %s not in the set of legal values ({0})" % info
            args.log.error(msg.format(sorted(
                [y for (x, y) in data['DomainElement'].keys()
                 if x == ld['feature_alphanumid']])))
            print msg.format(sorted(
                [y for (x, y) in data['DomainElement'].keys()
                 if x == ld['feature_alphanumid']]))
            errors[(ld['feature_alphanumid'], ld['language_id'])] = info
            continue

        vs = common.ValueSet(
            id=id_,
            language=language,
            parameter=parameter,
            source=ld["source"] or None,
            contribution=parameter.designer)
        models.ntsValue(
            id=id_,
            domainelement=data['DomainElement'][(ld['feature_alphanumid'], ld['value'])],
            jsondata={"icon": data['DomainElement'][(ld['feature_alphanumid'], ld['value'])].jsondata},
            comment=ld["comment"],
            valueset=vs,
            contributed_datapoint=ld["contributor"])
        done.add(id_)

        if not ld.get('bibsources'):
            if 'bibsources' not in ld:
                args.log.warn("no bibsource %s" % ld)
            continue
        for k, _ in [ktfbib(bibsource) for bibsource in ld['bibsources'].split(",,,")]:
            common.ValueSetReference(valueset=vs, source=data['Source'][k])
    DBSession.flush()

    #To CLDF
    cldf = {}
    for ld in ldps:
        parameter = data['Feature'][ld['feature_alphanumid']]
        language = data['ntsLanguage'][ld['language_id']]
        id_ = '%s-%s' % (parameter.id, language.id)
        if not id_ in done:
            continue
        dt = (lgs[ld['language_id']], ld['language_id'], ld['feature_alphanumid'] + ". " + ld['feature_name'], ld["value"]) #, ld["comment"], ld["source"], bibliographical_details(ld.get('bibsources', "").split(",,,"))
        cldf[dt] = None
        
        
    tab = lambda rows: u''.join([u'\t'.join(row) + u"\n" for row in rows])
    savu(tab([("Language", "iso-639-3", "Feature", "Value")] + cldf.keys()), "nts.cldf", encoding = "utf-8") #utf-16 "Comment", "Source", "Bibliographical Details"



    #cldf = {}
    #for ld in ldps:
    #    parameter = data['Feature'][ld['feature_alphanumid']]
    #    language = data['ntsLanguage'][ld['language_id']]
    #    id_ = '%s-%s' % (parameter.id, language.id)
    #    if not id_ in done:
    #        continue
    #    dt = (lgs[ld['language_id']], ld['language_id'], ld['feature_alphanumid'] + ". " + ld['feature_name'], ld["value"], ld["comment"], ld["source"], bibliographical_details(ld.get('bibsources', "").split(",,,")), ld.get("feature_information", ""), ld.get('feature_possible_values', ""), ld["designer"], ld.get("abbreviation", ""), ld["feature_domain"], ld.get('francais', ""), ld.get("dependencies", ""), ld.get("draft of clarifying comments to outsiders (hedvig + dunn + harald + suzanne)", ""))
    #    cldf[dt] = None
    
    #savu(tab([("Language", "iso-639-3", "Feature", "Value", "Comment", "Source", "Bibliographical Details", "Feature Information", "Feature Possible Values", "Feature Designer", "Feature Abbreviation", "Feature Domain", "Feature (French)", "Feature Dependencies", "Feature Clarifying Comments")] + cldf.keys()), "nts-with-metadata.tsv", encoding="utf-16")

    
    args.log.info('%s Errors' % len(errors))

    dataset = common.Dataset(
        id="NTS",
        name='Nijmegen Typological Survey',
        publisher_name="Max Planck Institute for Psycholinguistics",
        publisher_place="Nijmegen",
        publisher_url="http://www.mpi.nl",
        description="""Dataset on Typological Features, collected 2013-2014 in the Language and Cognition Department at the Max Planck Institute for Psycholinguistics, Max-Planck Gesellschaft, and a European Research Council's Advanced Grant (269484 "INTERACT") to Stephen C. Levinson.""",
        domain='http://nts.clld.org',
        published=date(2014, 2, 20),
        contact='*****@*****.**',
        license='http://creativecommons.org/licenses/by-nc-nd/2.0/de/deed.en',
        jsondata={
            'license_icon': 'http://wals.info/static/images/cc_by_nc_nd.png',
            'license_name': 'Creative Commons Attribution-NonCommercial-NoDerivs 2.0 Germany'})

    for i, contributor in enumerate([
        common.Contributor(
            id="Harald Hammarstrom",
            name="Harald Hammarstrom",
            email="*****@*****.**"),
        common.Contributor(
            id="Suzanne van der Meer",
            name="Suzanne van der Meer",
            email="*****@*****.**"),
        common.Contributor(
            id="Hedvig Skirgard",
            name="Hedvig Skirgard",
            email="*****@*****.**")
    ]):
        common.Editor(dataset=dataset, contributor=contributor, ord=i)

    DBSession.add(dataset)
Beispiel #13
0
def import_dataset(path, data, icons):
    # look for metadata
    # look for sources
    # then loop over values
    dirpath, fname = os.path.split(path)
    basename, ext = os.path.splitext(fname)
    glottolog = Glottolog()

    contrib = Contribution(id=basename, name=basename)

    md = {}
    mdpath = path + '-metadata.json'
    if os.path.exists(mdpath):
        md = jsonload(mdpath)
    contributor_name = HumanName(md.get('contributed_datapoint', 'Team NTS'))
    contributor_id = slug(contributor_name.last + contributor_name.first)
    contributor = data['Contributor'].get(contributor_id)
    if not contributor:
        contributor = data.add(
            Contributor,
            contributor_id,
            id=contributor_id,
            name='%s' % contributor_name)
    DBSession.add(ContributionContributor(contribution=contrib, contributor=contributor))

    bibpath = os.path.join(dirpath, basename + '.bib')
    if os.path.exists(bibpath):
        for rec in Database.from_file(bibpath):
            if rec['key'] not in data['Source']:
                data.add(Source, rec['key'], _obj=bibtex2source(rec))

    languages = {f['properties']['glottocode']: f for f in md.get('features', [])}

    for i, row in enumerate(reader(path, dicts=True, quoting=csv.QUOTE_NONE, delimiter=',' if 'c' in ext else '\t')):
        if not row['Value'] or not row['Feature_ID']:
            continue
        vsid = '%s-%s-%s' % (basename, row['Language_ID'], row['Feature_ID'])
        vid = row.get('ID', '%s-%s' % (basename, i + 1))

        parameter = data['Feature'].get(row['Feature_ID'])
        if parameter is None:
            print('skip value for invalid feature %s' % row['Feature_ID'])
            continue
            #parameter = data.add(
            #    Feature, row['Feature_ID'], id=row['Feature_ID'], name=row.get('Feature', row['Feature_ID']))

        language = data['GrambankLanguage'].get(row['Language_ID'])
        if language is None:
            # query glottolog!
            languoid = glottolog.languoid(row['Language_ID'])
            gl_md = {
                'name': languoid.name,
                'longitude': languoid.longitude,
                'latitude': languoid.latitude}
            lmd = languages.get(row['Language_ID'])
            if lmd:
                if lmd.get('properties', {}).get('name'):
                    gl_md['name'] = lmd['properties']['name']
                if lmd.get('geometry', {}).get('coordinates'):
                    gl_md['longitude'], gl_md['latitude'] = lmd['geometry']['coordinates']

            language = data.add(
                GrambankLanguage, row['Language_ID'],
                id=row['Language_ID'],
                name=gl_md['name'],
                latitude=gl_md.get('latitude'),
                longitude=gl_md.get('longitude'))

        vs = data['ValueSet'].get(vsid)
        if vs is None:
            vs = data.add(
                ValueSet, vsid,
                id=vsid,
                parameter=parameter,
                language=language,
                contribution=contrib,
                source=row['Source'])

        domain = {de.abbr: de for de in parameter.domain}
        name = row['Value']
        if name in domain:
            name = domain[name].name

        Value(
            id=vid,
            valueset=vs,
            name=name,
            description=row['Comment'],
            domainelement=domain.get(row['Value']))

        for key, src in data['Source'].items():
            if key in vs.source:
                ValueSetReference(valueset=vs, source=src, key=key)
Beispiel #14
0
def main(args):
    data = Data()

    def maybe_int(c):
        try:
            return int(c.value)
        except Exception:
            return None

    contributors = {}
    xls = xlrd.open_workbook(args.data_file('eWAVE2-Contributors.xlsx'))
    sheet = xls.sheet_by_name('Tabelle1')
    fields = [sheet.cell(0, i).value for i in range(sheet.ncols)]
    for i in range(1, sheet.nrows):
        values = dict(zip(fields, [sheet.cell(i, j).value for j in range(sheet.ncols)]))
        contributors[slug(values['Voller Name'])] = values

    xls = xlrd.open_workbook(args.data_file('ewave.xls'))
    varieties = {}
    values = {}
    matrix = xls.sheet_by_name('matrixRAW-quer')
    features = [maybe_int(matrix.cell(0, i)) for i in range(matrix.ncols)]

    for i in range(3, matrix.nrows):
        values[maybe_int(matrix.cell(i, 1))] = dict(
            (features[j], matrix.cell(i, j).value.upper()) for j in range(6, matrix.ncols) if features[j])

    features = {n: dict(name=matrix.cell(1, i).value) for i, n in enumerate(features)}

    sheet = xls.sheet_by_name('Example sources')
    for i in range(sheet.nrows):
        id = maybe_int(sheet.cell(i, 0))
        if id in features:
            features[id]['example'] = sheet.cell(i, 2).value
            features[id]['example_source'] = sheet.cell(i, 2).value

    sheet = xls.sheet_by_name('var-infrmnts-type-regn-lat-lon')
    for i in range(sheet.nrows):
        if i == 0:
            cols = [sheet.cell(i, j).value.lower() for j in range(sheet.ncols)]
        else:
            varieties[int(sheet.cell(i, 0).value)] = dict(
                (cols[j], sheet.cell(i, j).value) for j in range(sheet.ncols))

    dataset = common.Dataset(
        id=ewave.__name__,
        name='eWAVE',
        description='The Electronic World Atlas of Varieties of English',
        domain='ewave-atlas.org',
        published=date(2013, 11, 15),
        license='http://creativecommons.org/licenses/by/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 3.0 Unported License'})
    DBSession.add(dataset)
    common.Editor(dataset=dataset, contributor=common.Contributor(id='ed1', name='Bernd Kortmann'), ord=1)
    common.Editor(dataset=dataset, contributor=common.Contributor(id='ed2', name='Kerstin Lunkenheimer'), ord=2)

    for id, name, description in [
        ('1', 'Pronouns', 'Pronouns, pronoun exchange, nominal gender'),
        ('2', 'Noun Phrase', 'Noun phrase'),
        ('3', 'Tense & Aspect', 'Verb phrase I: tense and aspect'),
        ('4', 'Modal Verbs', 'Verb phrase II: modal verbs'),
        ('5', 'Verb Morphology', 'Verb phrase III: verb morphology'),
        #('6', 'Voice', 'Verb phrase IV: voice'),
        ('6', 'Negation', 'Negation'),
        ('7', 'Agreement', 'Agreement'),
        ('8', 'Relativization', 'Relativization'),
        ('9', 'Complementation', 'Complementation'),
        ('10', 'Adverbial Subordination', 'Adverbial Subordination'),
        ('11', 'Adverbs & Prepositions', 'Adverbs and prepositions'),
        ('12', 'Discourse & Word Order', 'Discourse organization and word order'),
    ]:
        data.add(
            models.FeatureCategory, name, id=id, name=name, description=description)
    data['FeatureCategory']['Voice'] = data['FeatureCategory']['Verb Morphology']

    icons = {
        'L1t': {'shape': 's', 'color': 'f38847', 'broad': 'L1'},
        'L1c': {'shape': 'd', 'color': 'd22257', 'broad': 'L1'},
        'L2': {'shape': 'c', 'color': 'a0fb75', 'broad': 'L2'},
        'Cr': {'shape': 't', 'color': 'cb9a34', 'broad': 'P/C'},
        'P': {'shape': 'f', 'color': '4d6cee', 'broad': 'P/C'},
    }

    for cat in read(args, 'language_cat'):
        cls = models.VarietyType if cat['name'] == 'cat1' else models.Region
        if cat['name'] == 'cat1' and cat['value'] not in icons:
            raise ValueError(cat['value'])
        data.add(
            cls, cat['value'],
            id=cat['value'],
            name=cat['name1'],
            description=cat['definition'],
            jsondata=icons.get(cat['value']))

    for lang in read(args, 'language'):
        keys = ['id', 'name', 'latitude', 'longitude']
        l = data.add(
            models.Variety, lang['id'],
            region=data['Region'][lang['cat2']],
            type=data['VarietyType'][lang['cat1']],
            **{k: v for k, v in lang.items() if k in keys})
        data.add(
            models.WaveContribution, lang['id'],
            id=str(lang['id']),
            name=lang['name'],
            description=lang['spec1'],
            variety=l)

    for author in read(args, 'o1_author'):
        contributor = contributors[slug(author['first_name'] + author['last_name'])]
        data.add(
            common.Contributor, author['id'],
            id=str(author['id']),
            name=contributor['Voller Name'],
            address=contributor['Affiliation'],
            email=contributor['E-Mail'],
            url=contributor['Website'])

    abbr2lang = {}
    new_langs = []
    desc = {
        75: "Philippine English is one of the very few American-transplanted Englishes. "
        "The language was introduced in the country by American colonization that "
        "started in 1898. From only 300,000 users or 4% of the population at the "
        "beginning of the 20th century, it is estimated that there were around 42 "
        "million or 70% of the population who are able to use English, almost fifty "
        "years after the American colonization ended at the end of the century "
        "(Gonzalez, 1996). In the implementing 1987 Constitution, English is regarded as "
        "one of the two official languages of the Philippines, the other one being the "
        "national language Filipino. It also interacts with 180 other Austronesian-type "
        "languages used in the country, nine of them considered major languages. English "
        "plays a major role in the Philippine society, offering a rightfully unique "
        "rendering of the psycho-sociolinguistic phenomenon of the spread of English: "
        "A sizeable number of Filipinos even learn it as a first language (and sometimes "
        "only language). The language is widely used in government, education, business, "
        "science and technology, and the arts but it has also penetrated the personal "
        "and private lives of Filipinos, where code-switching can be prevalent. "
        "Proficiency in English may also be equated with socio-economic status; those "
        "with higher socio-economic status tend to be more proficient in the language. "
        "Philippine English is presently entering a stage of structural "
        "systematicization (cf. Borlongan & Lim, 2012) and is being codified through "
        "dictionaries and grammars. Consequently, some claims are made that Philippine "
        "English is already at the phase of endonormative stabilization (Borlongan, 2011)."
    }
    for vid, v in varieties.items():
        if vid not in data['Variety']:
            new_langs.append(vid)
            l = data.add(
                models.Variety, vid,
                id=str(vid),
                name=v['variety'],
                latitude=v['latitude'],
                longitude=v['longitude'],
                region=[r for r in data['Region'].values() if r.name == v['world region']][0],
                type=data['VarietyType'][v['variety  type (narrow)']])
            contribution = data.add(
                models.WaveContribution, vid,
                id=str(vid),
                name=l.name,
                description=desc.get(vid, ''),
                variety=l)
            if v['contributor(s)'] == 'Rajend Mesthrie':
                v['contributor(s)'] = 'Rajend Mesthrie and Tracey Toefy and Sean Bowerman'
            for name in v['contributor(s)'].split(' and '):
                contributor = None
                name = name.strip()
                maxid = 0
                for c in data['Contributor'].values():
                    if int(c.id) > maxid:
                        maxid = int(c.id)
                    if c.name == name:
                        contributor = c
                        print '--- already known:', name
                if not contributor:
                    maxid += 1
                    contributor = data.add(
                        common.Contributor, maxid, id=str(maxid), name=name)
                DBSession.add(common.ContributionContributor(
                    contributor=contributor, contribution=contribution))
        else:
            l = data['Variety'][vid]
        l.abbr = v['abbreviation'].strip()
        abbr2lang[l.abbr] = l

    for author in read(args, 'o1_author'):
        for lang in filter(None, [l.strip() for l in author['langIDs'].split(',')]):
            DBSession.add(common.ContributionContributor(
                contributor=data['Contributor'][author['id']],
                contribution=data['WaveContribution'][int(lang)]))

    domain = {
        'A': ('feature is pervasive or obligatory', {'color': 'fe3856'}),
        'B': ('feature is neither pervasive nor extremely rare', {'color': 'ed9c07'}),
        'C': ('feature exists, but is extremely rare', {'color': 'efe305'}),
        'D': ('attested absence of feature', {'color': 'f3ffb0'}),
        'X': ('feature is not applicable (given the structural make-up of the variety/P/C)', {'color': 'e8e8e8'}),
        '?': ('no information on feature is available', {'color': 'ffffff'}),
    }

    for param in read(args, 'lparam'):
        data.add(
            models.Feature, param['id'],
            id=str(param['id']),
            category=data['FeatureCategory'][param['cat1']],
            name=param['name'],
            description=param['name1'],
            jsondata={'example_source': param['spec1']})

    for de in read(args, 'lparamshaping'):
        desc, jsondata = domain[de['name']]
        data.add(
            common.DomainElement, de['id'],
            id=str(de['id']),
            parameter=data['Feature'][de['lparam_id']],
            name=de['name'],
            description=desc,
            jsondata=jsondata,
            number=de['number'])

    # values:
    changes = []
    maxid = 0
    for value in read(args, 'llps'):
        if not int(value['value']):
            continue
        if value['id'] > maxid:
            maxid = value['id']
        de = data['DomainElement'][value['lparamshaping_id']]
        if de.name != values[value['language_id']][int(de.parameter.id)]:
            new_de = None
            for _de in de.parameter.domain:
                if _de.name == values[value['language_id']][int(de.parameter.id)]:
                    new_de = _de
                    break
            if not new_de or new_de == de:
                print values[value['language_id']][int(de.parameter.id)], ' =?= ', de.name
            changes.append((str(value['language_id']), de.parameter.id, de.name, values[value['language_id']][int(de.parameter.id)]))
            de = new_de
        vs = data.add(
            common.ValueSet, value['id'],
            id=str(value['id']),
            contribution=data['WaveContribution'][value['language_id']],
            parameter=de.parameter,
            jsondata=de.jsondata,
            language=data['Variety'][value['language_id']])
        data.add(
            common.Value, value['id'],
            id=str(value['id']),
            domainelement=de,
            valueset=vs)

    dataset.jsondata['changes'] = {'2013': changes}
    print len(changes), 'values changed'

    for new_lang in new_langs:
        for param, value in values[new_lang].items():
            if new_lang == 75 and param == 195 and not value:
                value = '?'
            maxid += 1
            parameter = data['Feature'][param]
            de = None
            for _de in parameter.domain:
                if _de.name == value:
                    de = _de
            assert de
            vs = data.add(
                common.ValueSet, maxid,
                id=str(maxid),
                contribution=data['WaveContribution'][new_lang],
                parameter=parameter,
                jsondata=de.jsondata,
                language=data['Variety'][new_lang])
            data.add(
                common.Value, maxid,
                id=str(maxid),
                domainelement=de,
                valueset=vs)

    DBSession.flush()

    for rec in bibtex.Database.from_file(args.data_file('eWAVE2References.bib')):
        data.add(common.Source, slug(rec.id), _obj=bibtex2source(rec))

    for i, example in enumerate(excel.rows(xlrd.open_workbook(args.data_file('eWAVE2-Examples_tidy-1.xlsx')).sheets()[0], as_dict=True)):
        if example['primary_text'] == 'Cf. Table 1 in section 3.1':
            continue
        lang = abbr2lang[example['language']]
        if isinstance(example['feature number'], basestring):
            fid = re.match('([0-9]+)', example['feature number']).groups()[0]
        else:
            fid = example['feature number']
        fid = str(int(fid))
        s = data.add(
            common.Sentence, i+1,
            id=str(i+1),
            name=example['primary_text'],
            gloss=example['gloss'] or None,
            comment=example['comment'] or None,
            description=example['translation'] or None,
            language=lang)

        for ref in (example['Source'] or '').split(';'):
            if ref:
                ref = ref.strip()
                desc = None
                if ':' in ref:
                    ref, desc = [_s.strip() for _s in ref.split(':', 1)]
                recid = slug(ref)
                recid = {
                    'allsopp996': 'allsopp1996',
                    'orton1962': 'orton19621971',
                    'bbcvoices': 'voices',
                    'cottmann1963': 'cottman1963',
                    'mooreetal1991': 'moore1991',
                }.get(recid, recid)
                if recid not in data['Source']:
                    assert recid == '50'
                    continue
                DBSession.add(common.SentenceReference(
                    sentence=s, source=data['Source'][recid], description=desc, key=ref))

        vs = DBSession.query(common.ValueSet)\
            .join(common.Parameter).join(common.Language)\
            .filter(common.Parameter.id == fid)\
            .filter(common.Language.pk == lang.pk).one()
        DBSession.add(common.ValueSentence(sentence=s, value=vs.values[0]))
Beispiel #15
0
def main(args):
    #
    # order of init:
    # - villages
    # - files
    # - movies
    #
    videos = defaultdict(list)
    for f in util.iter_files(args):
        obj = models.File(**attr.asdict(f))
        if obj.mime_type.startswith('video'):
            videos[slug(obj.name.split('.')[0])].append(obj)
        DBSession.add(obj)

    lexicon = list(util.iter_lexicon(args))
    villages = util.get_villages(args)
    ff_images = list(util.ff_images(args))
    bib = list(util.get_bib(args))
    data = Data()

    dataset = common.Dataset(
        id=dogonlanguages.__name__,
        name="Dogon and Bangime Linguistics",
        contact="*****@*****.**",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='dogonlanguages.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'}
    )
    DBSession.add(dataset)

    if Glottolog:
        if socket.gethostname() == 'dlt5502178l':
            glottolog = Glottolog(
                Path(dogonlanguages.__file__).parent.parent.parent.parent.joinpath(
                    'glottolog3', 'glottolog'))
        else:
            glottolog = Glottolog(
                Path(dogonlanguages.__file__).parent.parent.parent.parent.joinpath(
                    'glottolog'))
        languoids = {l.id: l for l in glottolog.languoids()}
    else:
        languoids = {}
    print('got glottolog')

    for c in util.CONTRIBUTORS:
        id_ = slug(c.name.split()[-1])
        data.add(models.Member, id_, id=id_, **attr.asdict(c))
    data.add(
        models.Member, 'forkel',
        id='forkel',
        name='Robert Forkel',
        email='*****@*****.**',
        in_project=False)

    for i, id_ in enumerate(['moran', 'forkel', 'heath']):
        DBSession.add(common.Editor(
            dataset=dataset, ord=i + 1, contributor=data['Member'][id_]))

    contrib = data.add(common.Contribution, 'd', id='d', name='Dogon Languages')
    for doc in bib:
        obj = data.add(
            models.Document,
            doc.rec.id,
            _obj=bibtex2source(doc.rec, cls=models.Document))
        keywords = nfilter([s.strip() for s in doc.rec.get('keywords', '').split(',')])
        for dt in 'grammar lexicon typology texts'.split():
            if dt in keywords:
                obj.doctype = dt
                break
        obj.project_doc = ('DLP' in keywords) or bool(doc.files)
        if obj.project_doc:
            for i, cid in enumerate(util.get_contributors(doc.rec, data)):
                models.DocumentContributor(
                    document=obj, contributor=data['Member'][cid], ord=i)
        for i, (path, cdstar) in enumerate(doc.files):
            common.Source_files(
                id='%s-%s' % (obj.id, i + 1),
                name=path,
                object=obj,
                mime_type=guess_type(path)[0],
                jsondata=cdstar,
            )

    print('got bib')

    for name, (gc, desc) in LANGUAGES.items():
        gl_lang = languoids[gc]
        lat, lon = gl_lang.latitude, gl_lang.longitude
        lang = data.add(
            models.Languoid, gc,
            id=gc,
            name=name,
            description=desc,
            latitude=lat,
            longitude=lon,
            family=gl_lang.family.name if gl_lang and gl_lang.family else name,
        )
        if name == 'Penange' and lang.longitude > 0:
            lang.longitude = -lang.longitude
        if name == 'Bankan Tey':
            lang.latitude, lang.longitude = 15.07, -2.91
        if name == 'Ben Tey':
            lang.latitude, lang.longitude = 14.85, -2.95
        if name == 'Togo Kan':
            lang.latitude, lang.longitude = 14.00, -3.25
        add_language_codes(data, lang, gl_lang.iso, glottocode=gc)

    villages_by_name = defaultdict(list)
    contrib_by_initial = {c.abbr: c for c in data['Member'].values()}
    for i, village in enumerate(villages):
        lang = None
        if village.glottocode:
            lang = data['Languoid'].get(village.glottocode)
            if not lang:
                gl_lang = languoids[village.glottocode]
                lang = data.add(
                    models.Languoid, gl_lang.id,
                    id=gl_lang.id,
                    name=gl_lang.name,
                    in_project=False,
                    family=gl_lang.family.name if gl_lang.family else gl_lang.name)
        v = data.add(
            models.Village, str(i + 1),
            id=str(i + 1),
            name=village.name,
            description=village.data.pop('social info'),
            surnames=village.data.pop('surnames'),
            major_city=village.data['MajorCity'] == 'Y',
            transcribed_name=village.data.pop('Transcribed Village Name'),
            source_of_coordinates=village.data.pop('sourceOfCoordinates'),
            latitude=village.lat,
            longitude=village.lon,
            languoid=lang,
            jsondata=village.data,
        )
        villages_by_name[village.name].append(v)
        for img in village.images:
            mimetype = guess_type(img.name)[0]
            if mimetype:
                f = models.Village_files(
                    id=img.id,
                    name=img.name,
                    description=img.description,
                    date_created=img.date,
                    latitude=img.coords[0] if img.coords else None,
                    longitude=-img.coords[1] if img.coords else None,
                    object=v,
                    mime_type=mimetype,
                    jsondata=img.cdstar,
                )
                for initial in img.creators:
                    if initial in contrib_by_initial:
                        models.Fotographer(
                            foto=f, contributor=contrib_by_initial[initial])

    for cat, desc, place, name in MOVIES:
        s = slug(name)
        m = models.Movie(
            id=s,
            name=desc,
            description=cat,
            place=place,
        )
        if place in villages_by_name and len(villages_by_name[place]) == 1:
            m.village = villages_by_name[place][0]
            #print('found village: %s' % name)
        for v in videos[s]:
            #print('found video: %s' % name)
            v.movie = m
            m.duration = v.duration

    names = defaultdict(int)
    for concept in lexicon:
        add(concept, data, names, contrib)

    count = set()
    for img in ff_images:
        if img.id in count:
            continue
        count.add(img.id)
        if img.ref:
            if img.ref in data['Concept']:
                concept = data['Concept'][img.ref]
                if img.tsammalex_taxon and not concept.tsammalex_taxon:
                    concept.tsammalex_taxon = img.tsammalex_taxon
                    #print(concept.tsammalex_taxon)
                common.Parameter_files(
                    object=concept,
                    id=img.id,
                    name=img.name.decode('utf8'),
                    mime_type=guess_type(img.name)[0],
                    jsondata=img.cdstar)
            else:
                print('missing ref: %s' % img.ref)
Beispiel #16
0
def main(args):
    data = Data()

    for rec in Database.from_file(
            data_path('references.bib'), lowercase=False):
        source = data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    
    dataset = common.Dataset(
        id=clts.__name__,
        name="CLTS",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        contact='*****@*****.**',
        domain='clts.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)
    for i, name in enumerate(['Johann-Mattis List', 'Cormac Anderson', 'Tiago Tresoldi', 
        'Thiago Chacon', 'Robert Forkel']):
        c = common.Contributor(id=slug(name), name=name)
        dataset.editors.append(common.Editor(contributor=c, ord=i))

    for i, line in enumerate(reader(data_path('sounds.tsv'), delimiter='\t',
            namedtuples=True)):
        if not i % 100:
            print('-', end="")
        key = line.NAME.replace(' ', '_')
        data.add(
                models.SoundSegment,
                key,
                id=key,
                name = line.NAME,
                grapheme=line.GRAPHEME,
                aliases=line.ALIASES,
                representation=len(line.REFLEXES.split(',')),
                reflexes = line.REFLEXES,
                generated = True if line.GENERATED else False,
                unicode = line.UNICODE,
                )
    print('')
    english = data.add(
        common.Language, 'eng',
        id='eng',
        name='English')



    contributions = {}
    for line in reader(data_path('datasets.tsv'),
            delimiter='\t', namedtuples=True):
        contributions[line.NAME] = data.add(
                models.CLTSDataSet,
                line.NAME,
                id=line.NAME,
                name=line.NAME,
                description=line.DESCRIPTION,
                datatype=line.TYPE
                )
        for id_ in line.REFS.split(', '):
            common.ContributionReference(
                    source=data['Source'][id_],
                    contribution=contributions[line.NAME])
    
        
    visited = set()
    for i, line in enumerate(reader(data_path('graphemes.tsv'), delimiter="\t",
            namedtuples=True)):
        if not i % 100: print('-', end='')
        key = line.DATASET + ':' + line.NAME+':'+line.GRAPHEME
        if key not in visited:
            sound_id = line.NAME.replace(' ', '_')
            vs = common.ValueSet(
                    id=key,
                    description=line.NAME,
                    language=english,
                    contribution=contributions[line.DATASET],
                    parameter=data['SoundSegment'][sound_id]
                    )
            data.add(
                    models.Grapheme,
                    key,
                    id=key,
                    grapheme=line.GRAPHEME,
                    bipa_grapheme=line.BIPA,
                    name=line.NAME,
                    dataset=line.DATASET,
                    datatype=line.DATATYPE,
                    frequency=line.FREQUENCY or 0,
                    image=line.IMAGE,
                    url=line.URL,
                    valueset=vs
                    )
            visited.add(key)
    print('-')
Beispiel #17
0
def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    data = Data()
    concept_list = Concepticon(CONCEPTICON_REPOS).conceptlist('Key-2016-1310')

    def concepticon_id(ids_code):
        for item in concept_list:
            if item['IDS_ID'] == ids_code:
                return int(item['CONCEPTICON_ID']) if item['CONCEPTICON_ID'] else None

    def read(table):
        fname = args.data_file(table + '.all.csv')
        if not fname.exists():
            fname = args.data_file(table + '.csv')
        return list(dsv.reader(fname, namedtuples=True))

    dataset = common.Dataset(
        id=ids.__name__,
        name="IDS",
        description="The Intercontinental Dictionary Series",
        published=date(2015, 5, 25),
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name':
                'Creative Commons Attribution 4.0 International License',
        },
        domain='ids.clld.org')

    DBSession.add(dataset)

    for rec in Database.from_file(args.data_file('sources.bib'), lowercase=True):
        if rec.id not in data['Source']:
            data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    DBSession.flush()

    data_desc = defaultdict(dict)
    for l in read('x_lg_data'):
        data_desc[l.lg_id][l.map_ids_data] = l.header

    # language lang
    iso_codes = {l.id: l.sil_code for l in read('sil_lang')}
    iso_codes = {l.lg_id: iso_codes[l.sil_id] for l in read('x_lg_sil')}
    languages = []

    exclude = []
    for l in read('lang'):
        if l.status == '1':
            exclude.append(l.lg_id)
            continue
        lang_changed = LANGS.get(int(l.lg_id), {})
        code = lang_changed.get('glotto') or lang_changed.get('iso') or iso_codes.get(l.lg_id)
        lang = data.add(models.IdsLanguage, l.lg_id, id=l.lg_id, name=lang_changed.get('name', l.lg_name))
        if code:
            languages.append((code, lang))
        data.add(
            models.Dictionary, l.lg_id,
            id=l.lg_id, name=l.lg_name,
            language=lang,
            default_representation=data_desc[l.lg_id].get('1'),
            alt_representation=data_desc[l.lg_id].get('2'),
            jsondata=dict(status=l.status, date=l.date))

    iso2glotto = {}
    for l in walk_tree(tree=languoids_path('tree', GLOTTOLOG_REPOS)):
        if l.iso:
            iso2glotto[l.iso] = l.id

    load_families(
        Data(), [(iso2glotto.get(c, c), l) for c, l in languages], glottolog=Glottolog(GLOTTOLOG_REPOS), isolates_icon='tcccccc')

    contributors = defaultdict(list)
    sources = defaultdict(list)
    for l in read('lang_compilers'):
        if l.lg_id in exclude:
            continue
        if l.name == "BIBIKO":
            continue
        #name	lg_id	what_did_id
        if int(l.what_did_id) in models.ROLES:
            contributors[slug(l.name)].append((l.name, int(l.what_did_id), l.lg_id))
        else:
            assert int(l.what_did_id) in [4, 395]
            sources[l.name].append(l.lg_id)

    for s, roles in contributors.items():
        name = roles[0][0]
        c = data.add(common.Contributor, s, id=s, name=name)
        if name == 'Mary Ritchie Key':
            c.address = 'University of California, Irvine'
        for lg, specs in groupby(sorted(roles, key=lambda r: r[2]), key=lambda r: r[2]):
            sroles = sorted(
                [s[1] for s in specs],
                reverse=True,
                key=lambda what: what + 2 if what == 2 else what)
            what = sroles[0]
            DBSession.add(common.ContributionContributor(
                contribution=data['Dictionary'][lg],
                contributor=c,
                ord=what,
                primary=what == 2))

    data.add(
        common.Contributor, 'bernardcomrie',
        id='bernardcomrie',
        name="Bernard Comrie",
        address="University of California, Santa Barbara")

    for i, editor in enumerate(['maryritchiekey', 'bernardcomrie']):
        common.Editor(dataset=dataset, contributor=data['Contributor'][editor], ord=i + 1)

    #for i, name in enumerate(sorted(sources.keys())):
    #    c = data.add(common.Source, name, id=str(i + 1), name=name, description=name)

    DBSession.flush()
    for name, lgs in sources.items():
        for _src in name.split(';'):
            src = data['Source'].get(_src.strip())
            if not src:
                print('-- missing source --', _src)
                raise ValueError
            for lg in lgs:
                if lg in exclude:
                    continue
                assert lg in data['Dictionary']
                DBSession.add(common.ContributionReference(
                    contribution_pk=data['Dictionary'][lg].pk, source_pk=src.pk))

    altnames = {}
    for i, l in enumerate(read('alt_names')):
        if l.name in altnames:
            identifier = altnames[l.name]
        else:
            identifier = data.add(
                common.Identifier, l.name,
                id='name-%s' % i, type='name', name=l.name, description='IDS')
            altnames[l.name] = identifier
        if l.lg_id not in exclude and l.name != data['IdsLanguage'][l.lg_id].name:
            DBSession.add(common.LanguageIdentifier(
                identifier=identifier,
                language=data['IdsLanguage'][l.lg_id]))

    # parameter chapter/entry
    for l in read('chapter'):
        data.add(models.Chapter, l.chap_id, id=l.chap_id, name=l.chap_title)

    entries = {}
    for l in read('entry'):
        id_ = '%s-%s' % (l.chap_id, l.entry_id)
        name = l.trans_english
        if name in entries:
            entries[name] += 1
            name = name + ' (%s)' % entries[name]
        else:
            entries[name] = 1
        kw = {
            'id': id_,
            'name': name,
            'concepticon_id': concepticon_id(id_),
            'chapter': data['Chapter'][l.chap_id]}
        for ll in 'french russian spanish portugese'.split():
            kw[ll] = getattr(l, 'trans_' + ll)
        data.add(models.Entry, id_, sub_code=l.entry_id, **kw)

    misaligned = []

    DBSession.flush()
    for entity in 'IdsLanguage Entry Chapter Dictionary'.split():
        for k in data[entity].keys()[:]:
            data[entity][k] = data[entity][k].pk

    synsets = set()
    counterparts = set()
    problems = defaultdict(list)

    for lg_id, entries in groupby(
            sorted(read('ids'), key=lambda t: t.lg_id), lambda k: k.lg_id):
        if lg_id in exclude or not lg_id:
            continue

        # keep the memory footprint reasonable
        transaction.commit()
        transaction.begin()

        language = common.Language.get(data['IdsLanguage'][lg_id])
        desc = data_desc.get(lg_id, {})
        words = defaultdict(list)
        for l in entries:
            if empty.match(l.data_1):
                continue

            entry_id = '%s-%s' % (l.chap_id, l.entry_id)
            if entry_id not in data['Entry']:
                continue
                #data.add(
                #    models.Entry, entry_id,
                #    id=entry_id,
                #    name=entry_id,
                #    concepticon_id=concepticon_id(entry_id),
                #    sub_code=l.entry_id,
                #    chapter_pk=data['Chapter'][l.chap_id])
                #DBSession.flush()
                #data['Entry'][entry_id] = data['Entry'][entry_id].pk

            id_ = '%s-%s' % (entry_id, l.lg_id)
            if id_ in synsets:
                vs = models.Synset.get(id_)
            else:
                vs = models.Synset(
                    id=id_,
                    comment=get_string(l.comment or ''),
                    alt_representation=get_string(l.data_2),
                    language=language,
                    contribution_pk=data['Dictionary'][l.lg_id],
                    parameter_pk=data['Entry'][entry_id])
                synsets.add(id_)

            trans1 = list(split_counterparts(l.data_1))
            trans2 = None if empty.match(l.data_2) else list(split_counterparts(l.data_2))

            if trans2:
                if len(trans2) != len(trans1):
                    if language.id != '238':
                        misaligned.append((l.chap_id, l.entry_id, l.lg_id))
                        #print('===', language.id, language.name)
                        #print(l.data_1)
                        #print(l.data_2)
                    # 83 cases of misaligned transcriptions
                    trans2 = None

            for i, word in enumerate(trans1):
                cid = id_ + '-' + str(i + 1 + len(vs.values))
                if cid not in counterparts:
                    v = models.Counterpart(
                        id=cid,
                        name=word,
                        description=desc.get('1'),
                        valueset=vs)
                    words[word].append((v, trans2[i] if trans2 else None))
                    counterparts.add(cid)
                else:
                    print(cid)
                    #12 - 420 - 811 - 3
                    #5 - 390 - 818 - 3
                    #2 - 930 - 819 - 3
                    #2 - 930 - 819 - 3
                    #3 - 120 - 819 - 3
                    #10 - 140 - 822 - 3
                    #9 - 160 - 825 - 3
                    #2 - 430 - 829 - 4

        for i, form in enumerate(words.keys()):
            # Since we identify words based on their string representation, we have to
            # make sure a word has the same alternative transcription for all meanings.
            if language.id == '238':
                alt_names = []
            else:
                alt_names = set(norm(w[1] or '', desc.get('2'), language.id)
                                for w in words[form])
            alt_names = nfilter(alt_names)
            try:
                assert len(alt_names) <= 1
            except AssertionError:
                problems[(language.id, language.name)].append(alt_names)
            word = models.Word(
                id='%s-%s' % (language.id, i + 1),
                name=form,
                description=desc.get('1'),
                language=language,
                alt_name=', '.join(alt_names) if alt_names else None,
                alt_description=desc.get('2')
            )
            for v, _ in words[form]:
                word.counterparts.append(v)
            DBSession.add(word)

        DBSession.flush()

    with dsv.UnicodeWriter(args.data_file('misaligned.csv')) as fp:
        fp.writerows(misaligned)

    # about 250 cases where alternative transcriotions do not covary across meanings.
    for k, v in problems.items():
        print(k, len(v))
Beispiel #18
0
def inclusive_excusive(args, data, bib):
    """
    Incl	Inclusive/exclusive distinction. 1 = present, 0 = absent.
    Belh	Belhare-type inclusive/exclusive distinction. 1 = present, 0 = absent. NA = no information available.
    MinAug	Minimal/augmented system. 1 = present, 0 = absent. 1? = probably present
    """
    value_map = {
        '0': 'absent',
        '1': 'present',
        '1?': 'probably present',
        'NA': 'no information available'}
    name_map = OrderedDict()
    name_map['Incl'] = 'Inclusive/exclusive distinction'
    name_map['Belh'] = 'Belhare-type inclusive/exclusive distinction'
    name_map['MinAug'] = 'Minimal/augmented system'
    varspec = [(name, set()) for name in name_map.values()]
    rev_name_map = dict(zip(name_map.values(), name_map.keys()))

    p, contrib = param_and_contrib(
        data, 'inclusive/exclusive distinction', 'inclusive.exclusive', 2)

    DBSession.add(common.ContributionContributor(
        contribution=contrib, contributor=data['Contributor']['bickel']))
    DBSession.add(common.ContributionContributor(
        contribution=contrib, contributor=data['Contributor']['nichols']))

    allv = rows(
        args.data_file('InclExcl_ISO_bib_stripped.txt'), namedtuples=True, encoding='utf8', newline='\r')

    for lid, values in groupby(sorted(allv, key=lambda j: j.LID), lambda i: i.LID):
        vsid = '%s-%s' % (p.id, lid)
        values = list(values)

        if vsid not in data['ValueSet']:
            vs = data.add(
                common.ValueSet, vsid,
                id=vsid,
                language=data['Languoid'][lid],
                contribution=contrib,
                parameter=p)
        else:
            vs = data['ValueSet'][vsid]

        bibkeys = []
        for v in values:
            bibkeys.extend(filter(None, [v.strip() for v in v.bibkey.split(',')]))

        for key in set(bibkeys):
            if key in data['Source']:
                source = data['Source'][key]
            else:
                if key in bib.keymap:
                    source = data.add(common.Source, key, _obj=bibtex2source(bib[key]))
                else:
                    print key

                    source = None
            if source:
                DBSession.add(common.ValueSetReference(valueset=vs, source=source))

        for i, value in enumerate(values):
            if i > 0:
                print 'multiuple values!'
                raise ValueError
            value_data = OrderedDict()
            for var in name_map.keys():
                val = value_map.get(getattr(value, var))
                if not val:
                    print getattr(value, var)
                    raise ValueError
                value_data[var] = val
            v = data.add(
                common.Value, vsid,
                id=vsid,
                name=' / '.join(value_data.values()),
                #jsondata=value,
                valueset=vs)
            DBSession.flush()
            for j, spec in enumerate(varspec):
                attr, domain = spec
                domain.add(value_data[rev_name_map[attr]])
                DBSession.add(common.Value_data(key=attr, value=value_data[rev_name_map[attr]], ord=j, object_pk=v.pk))

    p.jsondata = {'varspec': [(name, list(domain)) for name, domain in varspec]}
Beispiel #19
0
def main(args):
    data = Data()
    data_path = lambda *cs: args.data_file('concepticon-data', 'concepticondata', *cs)

    dataset = common.Dataset(
        id=concepticon.__name__,
        name="Concepticon 1.0",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        contact='*****@*****.**',
        domain='concepticon.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)
    for i, name in enumerate(['Johann-Mattis List', 'Michael Cysouw', 'Robert Forkel']):
        c = common.Contributor(id=slug(name), name=name)
        dataset.editors.append(common.Editor(contributor=c, ord=i))

    english = data.add(
        common.Language, 'eng',
        id='eng',
        name='English')

    files = {}
    for fname in data_path('sources').iterdir():
        files[fname.stem] = \
            "https://github.com/clld/concepticon-data/blob/master/concepticondata/sources/%s" % fname.name

    for rec in Database.from_file(
            data_path('references', 'references.bib'), lowercase=True):
        source = data.add(common.Source, rec.id, _obj=bibtex2source(rec))
        if rec.id in files:
            DBSession.flush()
            DBSession.add(common.Source_files(
                mime_type='application/pdf',
                object_pk=source.pk,
                jsondata=dict(url=files[rec.id])))

    for concept in reader(data_path('concepticon.tsv'), namedtuples=True):
        data.add(
            models.ConceptSet,
            concept.ID,
            id=concept.ID,
            name=concept.GLOSS,
            description=concept.DEFINITION,
            semanticfield=concept.SEMANTICFIELD,
            ontological_category=concept.ONTOLOGICAL_CATEGORY)

    for rel in reader(data_path('conceptrelations.tsv'), namedtuples=True):
        DBSession.add(models.Relation(
            source=data['ConceptSet'][rel.SOURCE],
            target=data['ConceptSet'][rel.TARGET],
            description=rel.RELATION))

    unmapped = Counter()
    number_pattern = re.compile('(?P<number>[0-9]+)(?P<suffix>.*)')

    for cl in reader(data_path('conceptlists.tsv'), dicts=True):
        concepts = data_path('conceptlists', '%(ID)s.tsv' % cl)
        if not concepts.exists():
            continue
        langs = [l.lower() for l in split(cl['SOURCE_LANGUAGE'])]
        conceptlist = data.add(
            models.Conceptlist,
            cl['ID'],
            id=cl['ID'],
            name=' '.join(cl['ID'].split('-')),
            description=cl['NOTE'],
            target_languages=cl['TARGET_LANGUAGE'],
            source_languages=' '.join(langs),
            year=int(cl['YEAR']) if cl['YEAR'] else None,
        )
        for id_ in split(cl['REFS']):
            common.ContributionReference(
                source=data['Source'][id_], contribution=conceptlist)
        for i, name in enumerate(split(cl['AUTHOR'], sep=' and ')):
            name = strip_braces(name)
            contrib = data['Contributor'].get(name)
            if not contrib:
                contrib = data.add(
                    common.Contributor, name, id=slug(name), name=name)
            DBSession.add(common.ContributionContributor(
                ord=i, contribution=conceptlist, contributor=contrib))
        for k in 'ID NOTE TARGET_LANGUAGE SOURCE_LANGUAGE YEAR REFS AUTHOR'.split():
            del cl[k]
        DBSession.flush()
        for k, v in cl.items():
            DBSession.add(common.Contribution_data(
                object_pk=conceptlist.pk, key=k, value=v))

        for concept in reader(concepts, namedtuples=True):
            if not concept.ID or not concept.CONCEPTICON_ID or concept.CONCEPTICON_ID == 'NAN':
                #print conceptlist.id, getattr(concept, 'ENGLISH', getattr(concept, 'GLOSS', None))
                unmapped.update([conceptlist.id])
                continue

            lgs = {}
            for lang in langs:
                v = getattr(concept, lang.upper())
                if v:
                    lgs[lang] = v

            match = number_pattern.match(concept.NUMBER)
            if not match:
                print(concept.ID)
                raise ValueError
            vs = common.ValueSet(
                id=concept.ID,
                description=getattr(concept, 'GLOSS', getattr(concept, 'ENGLISH', None)),
                language=english,
                contribution=conceptlist,
                parameter=data['ConceptSet'][concept.CONCEPTICON_ID])
            d = {}
            for key, value in concept.__dict__.items():
                if not key.startswith('CONCEPTICON_') and \
                        key not in ['NUMBER', 'ID', 'GLOSS'] + [l.upper() for l in langs]:
                    d[key.lower()] = value
            v = models.Concept(
                id=concept.ID,
                valueset=vs,
                description=getattr(concept, 'GLOSS', None),  # our own gloss, if available
                name='; '.join('%s [%s]' % (lgs[l], l) for l in sorted(lgs.keys())),
                number=int(match.group('number')),
                number_suffix=match.group('suffix'),
                jsondata=d)
            DBSession.flush()
            for key, value in lgs.items():
                DBSession.add(
                    common.Value_data(key='lang_' + key, value=value, object_pk=v.pk))

    print('Unmapped concepts:')
    for clid, no in unmapped.most_common():
        print(clid, no)

    for fname in data_path('concept_set_meta').iterdir():
        if fname.suffix == '.tsv':
            md = load(fname.parent.joinpath(fname.name + '-metadata.json'))
            provider = models.MetaProvider(
                id=fname.stem,
                name=md['dc:title'],
                description=md['dc:description'],
                url=md['dc:source'],
                jsondata=md)
            for meta in reader(fname, dicts=True):
                try:
                    for k, v in meta.items():
                        if v and k != 'CONCEPTICON_ID':
                            models.ConceptSetMeta(
                                metaprovider=provider,
                                conceptset=data['ConceptSet'][meta['CONCEPTICON_ID']],
                                key=k,
                                value=v)
                except:
                    print(fname)
                    print(meta)
                    raise
Beispiel #20
0
def import_dataset(path, data, icons, add_missing_features = False):
    # look for metadata
    # look for sources
    # then loop over values
    
    dirpath, fname = os.path.split(path)
    basename, ext = os.path.splitext(fname)
    glottolog = Glottolog()

    try:
        contrib = CulturebankContribution(id=basename, name=basename, desc=glottolog.languoid(basename).name)
    except:
        print("Basename {:s} did not match a glottolog languoid, skipped.".format(basename))
        return

    md = {}
    mdpath = path + '-metadata.json'
    if os.path.exists(mdpath):
        md = jsonload(mdpath)
    contributor_name = HumanName(md.get('contributed_datapoint', 'Team NTS'))
    contributor_id = slug(contributor_name.last + contributor_name.first)
    contributor = data['Contributor'].get(contributor_id)
    if not contributor:
        contributor = data.add(
            Contributor,
            contributor_id,
            id=contributor_id,
            name='%s' % contributor_name)
    DBSession.add(ContributionContributor(contribution=contrib, contributor=contributor))

    bibpath = os.path.join(dirpath, basename + '.bib')
    if os.path.exists(bibpath):
        for rec in Database.from_file(bibpath):
            if rec['key'] not in data['Source']:
                data.add(Source, rec['key'], _obj=bibtex2source(rec))

    languages = {f['properties']['glottocode']: f for f in md.get('features', [])}

    for i, row in pandas.io.parsers.read_csv(
            path,
            sep=',' if 'c' in ext else '\t',
            encoding='utf-16').iterrows():
        if pandas.isnull(row['Value']) or pandas.isnull(row['Feature_ID']):
            print("Expected columns not found: ", row)
            continue
        vsid = '%s-%s-%s' % (basename, row['Language_ID'], row['Feature_ID'])
        vid = row.get('ID', '%s-%s' % (basename, i + 1))

        parameter = data['Feature'].get(row['Feature_ID'])
        if parameter is None:
            if add_missing_features:
                parameter = data.add(Feature, row['Feature_ID'], id=row['Feature_ID'], name=row.get('Feature', row['Feature_ID']))
            else: 
                print(('skip value for invalid feature %s' % row['Feature_ID']))
                continue

        language = data['CulturebankLanguage'].get(row['Language_ID'])
        if language is None:
            # query glottolog!
            try:
                languoid = glottolog.languoid(row['Language_ID'])
            except AttributeError:
                print(('Skipping, no Glottocode found for %s' % row['Language_ID']))
                continue
            
            gl_md = {
                'name': languoid.name,
                'longitude': languoid.longitude,
                'latitude': languoid.latitude}
            lmd = languages.get(row['Language_ID'])
            if lmd:
                if lmd.get('properties', {}).get('name'):
                    gl_md['name'] = lmd['properties']['name']
                if lmd.get('geometry', {}).get('coordinates'):
                    gl_md['longitude'], gl_md['latitude'] = lmd['geometry']['coordinates']

            language = data.add(
                CulturebankLanguage, row['Language_ID'],
                id=row['Language_ID'],
                name=gl_md['name'],
                latitude=gl_md.get('latitude'),
                longitude=gl_md.get('longitude'))

        
        vs = data['ValueSet'].get(vsid)
        if vs is None:
            vs = data.add(
                ValueSet, vsid,
                id=vsid,
                parameter=parameter,
                language=language,
                contribution=contrib,
                source=row['Source'])

        domain = {de.abbr: de for de in parameter.domain}    
        name = row['Value']
        if name in domain:
            name = domain[name].name
        else:
            name = str(name)
            if name in domain:
                name = domain[name].name
            else:
                raise ValueError("For feature {:s} in language {:s}: Name {:s} not found among domain values {:}".format(
                    row['Language_ID'],
                    row['Feature_ID'],
                    name,
                    {d: de for d, de in domain.items()}))

        data.add(Value,
            vid,
            id=vid,
            valueset=vs,
            name=name,
            description=row['Comment'],
            domainelement=domain.get(row['Value']))

        print(".", end="")
        if vs.source is not None:
            for key, src in list(data['Source'].items()):
                if key in vs.source:
                    ValueSetReference(valueset=vs, source=src, key=key)
Beispiel #21
0
    def test_bibtex2source(self):
        from clld.scripts.util import bibtex2source

        bibtex2source(Record('book', 'id', title='tb', customfield='cf', year="1920}"))
Beispiel #22
0
def case_alignment(args, data, bib):
    varspec = [
        ('alignment', set()),
        ('referential_type', set()),
        ('tense_aspect', set()),
        ('morphological_form.PoS', set()),
        ('A.marked', set()),
        ('P.marked', set())]
    p, contrib = param_and_contrib(
        data, 'case alignment', 'case.alignment', 1)
    alena = data.add(
        common.Contributor, 'witzlack',
        id='witzlack', name='Alena Witzlack-Makarevich')
    DBSession.add(common.ContributionContributor(contribution=contrib, contributor=alena))

    with open(args.data_file('case.alignment.Dec.2013.csv')) as fp:
        allv = list(UnicodeCSVDictReader(fp))

    for lid, values in groupby(sorted(allv, key=lambda j: j['LID']), lambda i: i['LID']):
        vsid = '%s-%s' % (p.id, lid)
        values = list(values)

        if vsid not in data['ValueSet']:
            vs = data.add(
                common.ValueSet, vsid,
                id=vsid,
                language=data['Languoid'][lid],
                contribution=contrib,
                parameter=p)
        else:
            vs = data['ValueSet'][vsid]

        bibkeys = []
        for v in values:
            bibkeys.extend(filter(None, [v.strip() for v in v['bibtex'].split(',')]))

        for key in set(bibkeys):
            if key in data['Source']:
                source = data['Source'][key]
            else:
                if key in bib.keymap:
                    source = data.add(common.Source, key, _obj=bibtex2source(bib[key]))
                else:
                    print key
# Marchese1978Time
# Kibriketal2000Jazyk
# check Mreta1998Analysis
# Nababan1971Grammar
# Werleetal1976Phonologie

                    source = None
            if source:
                DBSession.add(common.ValueSetReference(valueset=vs, source=source))

        for i, value in enumerate(values):
            vid = '%s-%s' % (vsid, i + 1)
            v = data.add(
                common.Value, vid,
                id=vid,
                name=' '.join('%('+spec[0]+')s' for spec in varspec) % value,
                jsondata=value,
                valueset=vs)
            DBSession.flush()
            for j, spec in enumerate(varspec):
                attr, domain = spec
                domain.add(value[attr])
                DBSession.add(common.Value_data(key=attr, value=value[attr], ord=j, object_pk=v.pk))

    p.jsondata = {'varspec': [(name, list(domain)) for name, domain in varspec]}
Beispiel #23
0
def main(args):
    citations.main(args)
    data = Data()

    pairs = {}
    languages = {}

    coords = {}
    for lang in dsv.rows(
        args.data_file('MB_Map_Data_Aug13WLabels'),
        namedtuples=True,
        newline='\n',
        encoding='latin1'
    ):
        coords[slug(lang.Label.split('<')[0].strip())] = (
            float(lang.y), float(lang.x))

    xls = xlrd.open_workbook(args.data_file('MB_BoCatSum_AFBO.xlsx'))
    matrix = xls.sheet_by_name('MB_BoCatSum_AFBO.txt')
    md = "area\trecipient language iso\trecipient language genus\tdonor language iso\tdonor language genus".split('\t')

    fields = []
    params = []
    for i in range(matrix.ncols):
        colname = xlrd.colname(i)
        if len(colname) == 2 and colname > 'BE':
            break
        colval = matrix.cell(0, i).value.strip()
        if (len(colname) == 1 and colname > 'G') or (len(colname) == 2 and colname < 'AY'):
            params.append(colval)
            fields.append(colval)
        else:
            fields.append(colval.lower())

    for f in fields:
        if fields.count(f) > 1:
            print(f)

    assert len(fields) == len(set(fields))

    for j in range(1, matrix.nrows):
        values = dict(zip(fields, [matrix.cell(j, i).value for i in range(matrix.ncols)]))
        try:
            id_ = int(values['perm.id'])
        except:
            continue

        pairs[id_] = values
        for type_ in ['recipient', 'donor']:
            languages[values[type_ + ' language'].strip()] = {
                'macroarea': values['area']}
            for md in ['iso', 'genus']:
                languages[values[type_ + ' language'].strip()][md] \
                    = values['%s language %s' % (type_, md)]

    for name in COORDS:
        assert name in languages

    sources = {}
    with open(args.data_file('MB_Case_List_with_links.html')) as fp:
        worddoc = fp.read()
        for m in re.finditer('\"__(?P<recid>[^_]+)__\"', worddoc):
            sources[m.group('recid').decode('utf8')] = 1
        soup = bs(worddoc)

    doc = {}
    cols = []
    table = soup.find('table')
    for tr in table.children:
        if tr.name != 'tr':
            continue
        tds = filter(lambda n: n.name == 'td', tr.children)
        if not cols:
            cols = map(text, tds)
        else:
            values = dict(zip(cols, tds))
        try:
            id_ = int(text(values['perm.id']))
            doc[id_] = values
            if id_ in pairs:
                assert doc['Recipient lg.'] == pairs[id_][1]['recipient language']
                assert doc['Don'] == pairs[id_][1]['donor language']
        except:
            continue

    dataset = common.Dataset(
        id='afbo',
        name="AfBo: A world-wide survey of affix borrowing",
        contact="*****@*****.**",
        domain="afbo.info",
        license='http://creativecommons.org/licenses/by/3.0/',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 3.0 Unported License'})

    DBSession.add(dataset)
    for i, spec in enumerate([('seifart', "Frank Seifart")]):
        DBSession.add(common.Editor(
            dataset=dataset,
            ord=i + 1,
            contributor=common.Contributor(id=spec[0], name=spec[1])))

    contrib = data.add(common.Contribution, 'afbo', name="AfBo", id="afbo")

    iso_map = {
        ('ron', 'Meglenite Romanian'): ('ruq', None),
        ('fra', 'Norman French'): ('xno', None),
        ('tur', 'Turkic'): (None, 'turk1311'),
        ('xuu', 'Kxoe languages'): (None, 'khoe1241'),
        ('zoc', 'Zoquean languages'): (None, 'zoqu1261'),
        ('tzm', 'Moroccan Berber languages'): (None, 'atla1275'),
        ('cvn', 'Quechua'): ('qvn', None),
        ('rop', 'Gurindji Kriol'): (None, 'guri1249'),
        ('ita', 'Sicilian Italian'): ('scn', None),
        ('srp', 'Croatian'): ('hrv', None),
        ('eme', 'Wayampi‑Emerillon‑Zo’é'): (None, 'waya1271'),
        ('ale', 'Copper Island Aleut'): ('mud', None),
        ('car', 'intermediate Proto‑Carib'): (None, 'cari1283'),
        ('ell', 'Cappadocian Greek'): ('cpg', None),
        ('eng', 'Middle English'): ('enm', None),
        ('als', 'Arvanitic Albanian'): ('aat', None),
        ('nys', 'Northern Nyungic'): (None, 'dese1234'),
        ('ron', 'Istro‑Romanian'): ('ruo', None),
        ('chf', 'Cho’ol'): ('ctu', None),
        ('tuo', 'Eastern Tucanoan languages'): (None, 'east2698'),
        ('ceb', 'Visayan'): (None, 'bisa1268'),
        ('por', 'Sri Lanka Portuguese'): (None, 'mala1544'),
        ('brx', 'Tibeto-Burman languages'): (None, 'brah1260'),
    }

    with open('name_conflicts.tab', 'w') as fp:
        fp.write('iso\tafbo\tglottolog\tproposed iso\n')
        for i, name in enumerate(languages.keys()):
            md = languages[name]
            iso = md.pop('iso')
            if iso == 'cvn' and name == 'Quechua':
                iso = 'qvn'
            kw = dict(name=name, id=str(i+1), jsondata=md)
            if name in COORDS:
                kw['latitude'], kw['longitude'] = COORDS[name]
            elif slug(name) in coords:
                kw['latitude'], kw['longitude'] = coords[slug(name)]
            elif glottocoords.get(iso):
                kw['latitude'], kw['longitude'] = glottocoords[iso]

            if glottonames.get(iso) and slug(glottonames.get(iso)) != slug(name):
                fp.write(('%s\t%s\t%s\t%s\n' % (
                    iso, name, glottonames.get(iso), rglottonames.get(slug(name), ''))).encode('utf8'))

            if name == 'Meglenite Romanian':
                kw['name'] = 'Megleno Romanian'
            if not 'latitude' in kw:
                print(name)
            l = data.add(common.Language, name, **kw)

            iso, gc = iso_map.get((iso, name), (iso, None))

            for code, type_ in [
                (iso, common.IdentifierType.iso),
                (gc or glottocodes.get(iso), common.IdentifierType.glottolog)
            ]:
                if code:
                    identifier = data.add(
                        common.Identifier, code, id=code, name=code, type=type_.value)
                    data.add(
                        common.LanguageIdentifier, '%s-%s' % (code, l.id),
                        identifier=identifier, language=l)

    include = sources.keys() + [
        'myersscottoncontact2002', 'myersscottonlanguage2007',
        'meakinsborrowing2011', 'seifartprinciple2012',
    ]
    refdb = bibtex.Database.from_file(args.data_file('FSeifartZoteroLibrary14Nov2013.bib'))
    for rec in refdb:
        if slug(rec.id) in include:
            data.add(common.Source, slug(rec.id), _obj=bibtex2source(rec))

    for i, name in enumerate(params):
        data.add(models.AffixFunction, name, id=str(i + 1), name=name)

    for id_, vd in pairs.items():
        assert id_ in doc

        donor = data['Language'][vd['donor language'].strip()]
        recipient = data['Language'][vd['recipient language'].strip()]

        p = data.add(
            models.Pair,
            id_,
            id=str(id_),
            name=vd['pairs'].replace('Meglenite', 'Megleno'),
            area=recipient.jsondata['macroarea'],
            description=unicode(doc[id_]['comment']).replace('<h1', '<p').replace('</h1>', '</p>').replace('Meglenite', 'Megleno'),
            reliability=vd['reliability'],
            int_reliability=['high', 'mid', 'low'].index(vd['reliability']),
            count_interrel=int(vd[u'number of interrelated affixes']),
            count_borrowed=int(vd['number of borrowed affixes']),
            donor=donor,
            recipient=recipient)
        DBSession.flush()

        for i, param in enumerate(params):
            param_id = i + 1
            value = vd[param]
            if value != '':
                vsid = '%s-%s' % (recipient.id, param_id)
                if vsid in data['ValueSet']:
                    vs = data['ValueSet'][vsid]
                else:
                    vs = data.add(
                        common.ValueSet, vsid,
                        id=vsid,
                        parameter=data['AffixFunction'][param],
                        language=recipient,
                        contribution=contrib)
                data.add(
                    models.waabValue,
                    '%s-%s' % (id_, param_id),
                    id='%s-%s' % (id_, param_id),
                    pair=p,
                    name='%s' % int(value),
                    numeric=int(value),
                    description='%s' % p,
                    valueset=vs)
Beispiel #24
0
def main(args):
    # http://clld.readthedocs.org/en/latest/extending.html
    data = Data(
        created=utc.localize(datetime(2013, 11, 15)),
        updated=utc.localize(datetime(2013, 12, 12)))
    icons = issues.Icons()

    languoids = list(Glottolog(GLOTTOLOG_REPOS).languoids())
    iso_to_gc = dict([(l.iso, l.id) for l in languoids]) #glottocodes = glottocodes_by_isocode(args.glottolog_dburi)
    iso_to_name = {l.iso: l.name for l in languoids}
    #Languages
    dp = dtab("dp.tab")
    lons = dict([(d['iso-639-3'], d['lon']) for d in dp])
    lats = dict([(d['iso-639-3'], d['lat']) for d in dp])

    tabfns = [fn.name for fn in DATA_DIR.glob('sails_*.tab')]
    print "Sheets found", tabfns
    ldps = [ld for fn in tabfns for ld in dtab(fn)]
    ldps = [dict([(k, v.replace(".", "-") if k in ['feature_alphanumid', 'value'] else v)
                  for (k, v) in ld.iteritems()]) for ld in ldps]
    ldcps = dtab("constructions_data.tab")
    dedup = opv(grp2([((ld['construction_id'], ld['feature_alphanumid'].replace('.', "-")), (ld["value"],) + tuple(ld.items())) for ld in ldcps]), max)
    dldps = [dict(dld[1:]) for dld in dedup.itervalues()]
    lgs = dict([(ld['language_id'], ld['language_name'] if ld.has_key('language_name') else iso_to_name[ld['language_id']]) for ld in ldps + ldcps])
    nfeatures = opv(grp2([(ld['language_id'], ld['feature_alphanumid'])
                          for ld in ldps + ldcps if ld["value"] != "?"]), len)

    # Families
    fp = treetxt(loadunicode('lff.txt') + loadunicode('lof.txt'))
    ps = paths(fp)
    lg_to_fam = dict([(p[-1], p[0]) for p in ps])
    families = grp2([(lg_to_fam[lg], lg) for lg in lgs.keys()])
    ficons = dict(icons.iconizeall([
        f for (f, sailslgs) in families.iteritems() if len(sailslgs) != 1]).items() +
                  [(f, icons.graytriangle) for (f, sailslgs) in families.iteritems()
                   if len(sailslgs) == 1])
    for family in families.iterkeys():
        data.add(
            models.Family, family,
            id=family, name=family, jsondata={"icon": ficons[family]})

    DBSession.flush()

    # Lgs
    for lgid in lgs.iterkeys():
        lang = data.add(
            models.sailsLanguage, lgid,
            id=lgid,
            name=lgs[lgid],
            family=data["Family"][lg_to_fam[lgid]],
            representation=nfeatures[lgid],
            latitude=float(lats[lgid]),
            longitude=float(lons[lgid]))
        if not lgid.startswith('NOCODE'):
            iso = data.add(
                common.Identifier, lgid,
                id=lgid,
                name=lgid,
                type=common.IdentifierType.iso.value,
                description=lgs[lgid])
            data.add(common.LanguageIdentifier, lgid, language=lang, identifier=iso)
        if lgid in iso_to_gc:
            gc = iso_to_gc[lgid]
            gc = data.add(
                common.Identifier, 'gc' + lgid,
                id=gc,
                name=gc,
                type=common.IdentifierType.glottolog.value,
                description=lgs[lgid])
            data.add(common.LanguageIdentifier, lgid, language=lang, identifier=gc)
    DBSession.flush()

    # Domains
    for domain in set(ld['feature_domain'] for ld in ldps):
        data.add(models.FeatureDomain, domain, id=slug(domain), name=domain)
    DBSession.flush()

    designer_to_id = {}
    for dd in dtab("sailscontributions.tab"):
        contributionid = slug("%s-%s" % (dd["designer"], dd["domain"]))
        if dd["domain"].find("Construction-Based") == -1:
            designer_to_id[dd["designer"]] = contributionid
    
    
    contribution_statistics = {}
    contribution_statistics["nfeatures"] = opv(grp2([(designer_to_id[ld["designer"]], ld['feature_alphanumid']) for ld in ldps]), len)
    contribution_statistics["nlanguages"] = opv(grp2([(designer_to_id[ld["designer"]], ld['language_id']) for ld in ldps]), len)
    contribution_statistics["ndatapoints"] = opv(grp2([(designer_to_id[ld["designer"]], (ld['feature_alphanumid'], ld['language_id'])) for ld in ldps if ld["value"] != "?"]), len)

    contributionid = slug("%s-%s" % ("Rik van Gijn", "Construction-Based Subordination Data (SUB)"))
    contribution_statistics["nfeatures"][contributionid] = len(set([(ld['feature_alphanumid']) for ld in dldps]))
    contribution_statistics["nlanguages"][contributionid] = len(set([(ld['language_id']) for ld in dldps]))
    contribution_statistics["ndatapoints"][contributionid] = len(set([(ld['feature_alphanumid'], ld['language_id']) for ld in ldps if ld["value"] != "?"]))

    # Designers
    citation_template = "%s. 2014. %s. In Muysken, Pieter et al. (eds.) "\
    "South American Indian Language Structures (SAILS) Online. Leipzig: Online "\
    "Max Planck Institute of Evolutionary Anthropology. "\
    "(Available at http://sails.clld.org)"
    #for (designer_id, (designer, domain)) in enumerate(designers.iteritems()):
    designer_to_id = {}
    for dd in dtab("sailscontributions.tab"):
        contributionid = slug("%s-%s" % (dd["designer"], dd["domain"]))
        orientation = "Language-Based"
        if dd["domain"].find("Construction-Based") == -1:
            designer_to_id[dd["designer"]] = contributionid
            orientation = "Construction-Based"
        data.add(
            models.Designer, contributionid,
            id=contributionid,
            name=contributionid,
            domain=dd["domain"],
            orientation=orientation,
            contributor=dd["designer"],
            nlanguages=contribution_statistics["nlanguages"][contributionid],
            nfeatures=contribution_statistics["nfeatures"][contributionid],
            ndatapoints=contribution_statistics["ndatapoints"][contributionid],
            citation=citation_template % (dd["designer"], dd["domain"]),
            more_information=dd["citation"],
            pdflink=dd["pdflink"])
    DBSession.flush()

    # Features
    fs = dict([(ld['feature_alphanumid'], ld) for ld in ldps])
    nameclash_fs = grp2([(ld['feature_name'], ld['feature_alphanumid']) for ld in ldps])
    fnamefix = {}
    for (dfeature, dfsids) in nameclash_fs.iteritems():
        if len(dfsids) != 1:
            print "Feature name clash", dfeature, dfsids
            for dfsid in dfsids:
                fnamefix[dfsid] = dfeature + " [%s]" % dfsid

    nlgs = opv(grp2([(ld['feature_alphanumid'], ld['language_id'])
                     for ld in ldps if ld["value"] != "?"]), len)
    
    (fidstr, fidint) = sortinfo(fs.keys())
    for (fid, f) in fs.iteritems():
        if nlgs[fid] == 0:
            continue
        data.add(
            models.Feature, fid,
            id=fid,
            name=fnamefix.get(fid, f['feature_name']),
            description=f['feature_information'],
            jsondata=dict(vdoc=f['feature_possible_values']),
            representation=nlgs[fid],
            designer=data["Designer"][designer_to_id[f['designer']]],
            dependson=f["depends_on"],
            featuredomain=data['FeatureDomain'][f["feature_domain"]],
            sortkey_str=fidstr[fid],
            sortkey_int=fidint[fid])

    DBSession.flush()

    fvs = dict([(ld['feature_alphanumid'], ld['feature_possible_values']) for ld in ldps])
    fvdesc = {}
    for (fid, vs) in fvs.iteritems():
        vdesclist = [veq.split("==") for veq in vs.split("||")]
        try:
            vdesc = dict([(v.replace(".", "-"), desc) for [v, desc] in vdesclist])
        except ValueError:
            print "Faulty value desc", vdesclist, vs
        if not vdesc.has_key("?"):
            vdesc["?"] = "Not known"
        if not vdesc.has_key("N/A") and fs[fid]["depends_on"]:
            vdesc["N/A"] = "Not Applicable"
        vi = dict([(v, i) for (i, v) in enumerate(sorted(vdesc.keys()))])
        vicons = icons.iconize(vi.keys())
        if len(vdesc) == 0:
            print "VDESC missing", vs, fid, v
        for (v, desc) in vdesc.iteritems():
            fvdesc[(fid, v)] = desc
            data.add(
                common.DomainElement, (fid, v),
                id='%s-%s' % (fid, v),
                name=v,
                description=desc,
                jsondata={"icon": vicons[v]},
                number=vi[v],
                parameter=data['Feature'][fid])
    DBSession.flush()

    done = set()
    for ld in ldps:
        parameter = data['Feature'][ld['feature_alphanumid']]
        language = data['sailsLanguage'][ld['language_id']]
        
        id_ = '%s-%s' % (parameter.id, language.id)

        if (ld['feature_alphanumid'], ld['value']) not in data['DomainElement']:
            print ld['feature_alphanumid'], ld['feature_name'], ld['language_id'], ld['value'], "not in the set of legal values"
            continue

        valueset = data.add(
            common.ValueSet,
            id_,
            id=id_,
            language=language,
            parameter=parameter,
            contribution=parameter.designer,
            source=ld["source"].strip() or None,
        )
        data.add(
            models.sailsValue,
            id_,
            id=id_,
            domainelement=data['DomainElement'][(ld['feature_alphanumid'], ld['value'])],
            jsondata={"icon": data['DomainElement'][(ld['feature_alphanumid'], ld['value'])].jsondata},
            description=fvdesc[(ld['feature_alphanumid'], ld['value'])],
            comment=ld["comment"],
            example=ld["example"],
            valueset=valueset,
            contributed_datapoint=ld["contributor"]
        )
        done.add(id_)

    cdatapts = [dict(dld[1:]) for dld in dedup.itervalues() if dld[0].strip() and dld[0] != "?"]
    fccl = grp2([(ld['feature_alphanumid'].replace('.', "-"), (ld['construction_id'], ld['language_id'])) for ld in cdatapts])
    fcstats = opv(fccl, lambda cls: (len(set([c for (c, l) in cls])), len(set([l for (c, l) in cls]))))
    fcstrs = dict([(ld['feature_alphanumid'].replace('.', "-"), ld) for ld in dtab("constructions_features.tab")])

    # Construction Feature Domains
    for domain in set(ld['feature_domain'] for ld in fcstrs.values()):
        data.add(models.ConstructionFeatureDomain, domain, id=slug(domain), name=domain)
    DBSession.flush()

    

    (fidstr, fidint) = sortinfo(fcstrs.keys())
    for (fid, ld) in fcstrs.iteritems():
        (ncs, nlgs) = fcstats[fid]
        data.add(
            models.sailsUnitParameter, fid,
            id=fid,
            name=ld['feature_name'],
            description=ld['feature_information'],
            jsondata=dict(vdoc=ld['feature_possible_values']),
            designer=data["Designer"][slug("%s-%s" % (ld['designer'], "Construction-Based Subordination Data (SUB)"))],
            dependson=f["depends_on"],
            constructionfeaturedomain=data['ConstructionFeatureDomain'][ld["feature_domain"]],
            nconstructions=ncs,
            nlanguages=nlgs,
            sortkey_str=fidstr[fid],
            sortkey_int=fidint[fid])
    DBSession.flush()

    #ldcps = dtab("constructions_data.tab")
    cs = set([(ld['construction_id'], ld['language_id']) for ld in ldcps])
    for (cid, lid) in cs:
        language = data['sailsLanguage'][lid]
        data.add(
            models.sailsConstruction, cid,
            id=cid,
            name=cid,
            language = language) 
    DBSession.flush()

    for ld in dldps: #dld in dedup.itervalues():
        #ld = dict(dld[1:])
        #print fid, ld['language_id'], ld['construction_id'], "HEJ"
        fid = ld['feature_alphanumid'].replace('.', "-")
        language = data['sailsLanguage'][ld['language_id']]
        construction = data['sailsConstruction'][ld['construction_id']]
        construction_feature = data['sailsUnitParameter'][fid]
        id_ = '%s-%s' % (construction.id, construction_feature.id)
        print ld
        data.add(models.sailsUnitValue, id_, id=id_, name=ld['value'], unit=construction, unitparameter=construction_feature, contribution=construction_feature.designer, source = ld["source"].strip(), comment = ld["comment"], provenance = ld["provenance"], contributed_datapoint = "Rik van Gijn")

        
        #1xf vs
        #contribution??!?!
        #TODO fixa unitvalues
	#fs v #lgs #cstrs TODO fixa snippet
        

        #Constrction Features
        #(unit-)values
        
        #done.add(id_)
    DBSession.flush()


        
    # Sources
    sources = [ktfbib(bibsource) for ld in ldps if ld.get('bibsources') for bibsource in ld['bibsources'].split(",,,")] + [ktfbib(bibsource) for dld in dldps if dld.get('bibsources') for bibsource in dld['bibsources'].split(",,,")]
    for (k, (typ, bibdata)) in sources:
        rec = Record(typ, k, **bibdata)
        if not data["Source"].has_key(k):
            data.add(common.Source, k, _obj=bibtex2source(rec))
    DBSession.flush()

    for ld in ldps:
        sources = [ktfbib(bibsource) for bibsource in ld['bibsources'].split(",,,") if ld.get('bibsources')]
        for (k, (typ, bibdata)) in sources:
            parameter = data['Feature'][ld['feature_alphanumid']]
            language = data['sailsLanguage'][ld['language_id']]
            id_ = '%s-%s' % (parameter.id, language.id)
            data.add(
                common.ValueSetReference,
                "%s-%s" % (id_, k),
                valueset=data["ValueSet"][id_],
                source=data['Source'][k])
    DBSession.flush()

    dataset = common.Dataset(
        id="SAILS",
        name='SAILS Online',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_url="http://shh.mpg.de",
        publisher_place="Jena",
        description="Dataset on Typological Features for South American Languages, collected 2009-2013 in the Traces of Contact Project (ERC Advanced Grant 230310) awarded to Pieter Muysken, Radboud Universiteit, Nijmegen, the Netherlands.",
        domain='sails.clld.org',
        published=date(2014, 2, 20),
        contact='*****@*****.**',
        license='http://creativecommons.org/licenses/by-nc-nd/2.0/de/deed.en',
        jsondata={
            'license_icon': 'http://wals.info/static/images/cc_by_nc_nd.png',
            'license_name': 'Creative Commons Attribution-NonCommercial-NoDerivs 2.0 Germany'})
    DBSession.add(dataset)
    DBSession.flush()

    editor = data.add(common.Contributor, "Harald Hammarstrom", id="Harald Hammarstrom", name="Harald Hammarstrom", email = "*****@*****.**")
    common.Editor(dataset=dataset, contributor=editor, ord=0)
    DBSession.flush()

    #To CLDF
    cldf = {}
    for ld in ldps:
        parameter = data['Feature'][ld['feature_alphanumid']]
        language = data['sailsLanguage'][ld['language_id']]
        id_ = '%s-%s' % (parameter.id, language.id)
        if not id_ in done:
            continue
        dt = (lgs[ld['language_id']], ld['language_id'], ld['feature_alphanumid'] + ". " + ld['feature_name'], ld["value"], ld["comment"])
        cldf[dt] = None

    tab = lambda rows: u''.join([u'\t'.join(row) + u"\n" for row in rows])
    savu(tab([("Language", "iso-639-3", "Feature", "Value", "Comment")] + cldf.keys()), "sails.cldf")