Exemple #1
0
def issue24(session, timestamp):  # pragma: no cover
    #- Update language cea (name, coords, alternative names, iso code (and name))
    #Change name of Cree (Eastern) to Cree (Swampy)
    #Change coordinates to 56dN, 90dW
    #Change the Ethnologue name to Cree (Swampy)
    #Remove the Routledge and Other names
    #Change the ISO code to csw. glottocode to swam1239
    cea = common.Language.get('cea', session=session)
    cre = common.Language.get('cre', session=session)

    for i in range(len(cea.languageidentifier)):
        try:
            del cea.languageidentifier[i]
        except IndexError:
            pass

    for values in [
        ('gc-csw', 'swam1239', 'Swampy Cree', 'glottolog'),
        ('csw', 'csw', 'Cree (Swampy)', 'iso639-3'),
        ('ethnologue-csw', 'Cree (Swampy)', 'ethnologue', 'name'),
    ]:
        id = common.Identifier(
            **dict(zip('id name description type'.split(), values)))
        cea.languageidentifier.append(
            common.LanguageIdentifier(language=cea, identifier=id))

    cea.updated = timestamp
    cea.name = 'Cree (Swampy)'
    cea.ascii_name = slug('Cree (Swampy)')
    cea.latitude = 56.0
    cea.longitude = -90.0

    for pid in ['81A', '82A', '83A']:
        vsq = session.query(common.ValueSet)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == pid)
        vs1 = vsq.filter(common.ValueSet.language_pk == cea.pk).one()
        vs2 = vsq.filter(common.ValueSet.language_pk == cre.pk).one()
        vs2.updated = timestamp
        vs1.updated = timestamp

        for ref in vs1.references:
            if ref.source.id == 'Hive-1948':
                ref.valueset = vs2

    session.flush()

    #- Delete valueset 85A-cea
    vs = session.query(common.ValueSet)\
        .join(common.Parameter)\
        .filter(common.Parameter.id == '85A')\
        .filter(common.ValueSet.language_pk == cea.pk).one()

    session.delete(vs.values[0])
    session.delete(vs.references[0])
    session.delete(vs)

    #- delete valueset 131A-cea add 131A-cre
    vs_switch_lang(session, timestamp, '131A-cea', 'cre')
Exemple #2
0
def add_language_codes(data, lang, isocode, glottocodes=None, glottocode=None):
    def identifier(type_, id_):
        return data.add(common.Identifier,
                        '%s:%s' % (type_, id_),
                        id='%s:%s' % (type_, id_),
                        name=id_,
                        type=getattr(common.IdentifierType, type_).value)

    if isocode and len(isocode) == 3:
        DBSession.add(
            common.LanguageIdentifier(language=lang,
                                      identifier=identifier('iso', isocode)))

    if glottocode or (glottocodes and isocode and isocode in glottocodes):
        glottocode = glottocode or glottocodes[isocode]
        DBSession.add(
            common.LanguageIdentifier(language=lang,
                                      identifier=identifier(
                                          'glottolog', glottocode)))
Exemple #3
0
def update_iso(session, timestamp, lang, *obsolete, **new):  # pragma: no cover
    if isinstance(lang, basestring):
        lang = common.Language.get(lang, session=session)
    lang.updated = timestamp

    for code in obsolete:
        for li in lang.languageidentifier:
            if li.identifier.id == code or li.identifier.id == 'ethnologue-' + code:
                session.delete(li)

    for code, name in new.items():
        iso = common.Identifier.get(code, session=session, default=None)
        ethnologue = common.Identifier.get('ethnologue-' + code,
                                           session=session,
                                           default=None)

        if not iso:
            iso = common.Identifier(id=code,
                                    name=code,
                                    description=name,
                                    type=common.IdentifierType.iso.value)

        if not ethnologue:
            ethnologue = common.Identifier(id='ethnologue-' + code,
                                           name=name,
                                           description='ethnologue',
                                           type='name')

        if iso.id not in [li.identifier.id for li in lang.languageidentifier]:
            session.add(
                common.LanguageIdentifier(language=lang, identifier=iso))

        if ethnologue.id not in [
                li.identifier.id for li in lang.languageidentifier
        ]:
            session.add(
                common.LanguageIdentifier(language=lang,
                                          identifier=ethnologue))
    return lang
Exemple #4
0
def add_identifier(languoid, data, name, type, description, lang='en'):
    identifier = data['Identifier'].get((name, type, description, lang))
    if not identifier:
        identifier = data.add(common.Identifier,
                              (name, type, description, lang),
                              id='{0}-{1}-{2}-{3}'.format(
                                  slug(name), slug(type),
                                  slug(description or ''), lang),
                              name=name,
                              type=type,
                              description=description,
                              lang=lang)
    DBSession.add(
        common.LanguageIdentifier(language=languoid, identifier=identifier))
Exemple #5
0
def add_codes(lang):
    for attr, prefix in dict(wals='wals_code_', iso='', glottolog='').items():
        code = getattr(lang, 'code_' + attr)
        if code:
            if attr == 'iso' and not re.match('[a-z]{3}$', code):
                continue
            id_ = prefix + code
            identifier = common.Identifier.get(id_, default=None)
            if not identifier:
                identifier = common.Identifier(id=id_,
                                               name=code,
                                               type=getattr(
                                                   common.IdentifierType,
                                                   attr).value)
            common.LanguageIdentifier(identifier=identifier, language=lang)
Exemple #6
0
def add_identifier(languoid, data, name, type, description, lang='en'):
    if len(lang) > 3:
        # Weird stuff introduced via hhbib_lgcode names. Roll back language parsing.
        name, lang = '{0} [{1}]'.format(name, lang), 'en'
    identifier = data['Identifier'].get((name, type, description, lang))
    if not identifier:
        identifier = data.add(
            common.Identifier,
            (name, type, description, lang),
            id='{0}-{1}-{2}-{3}'.format(
                slug(name), slug(type), slug(description or ''), lang),
            name=name,
            type=type,
            description=description,
            lang=lang)
    DBSession.add(common.LanguageIdentifier(language=languoid, identifier=identifier))
Exemple #7
0
def update_language(session,
                    timestamp,
                    lang,
                    keep_old_name=False,
                    **kw):  # pragma: no cover
    if isinstance(lang, basestring):
        lang = common.Language.get(lang, session=session)

    if 'name' in kw and keep_old_name:
        name = common.Identifier(id=str(uuid1()),
                                 name=lang.name,
                                 description='other',
                                 type='name')
        session.add(common.LanguageIdentifier(language=lang, identifier=name))

    return update_obj(session, timestamp, lang, **kw)
Exemple #8
0
def add_new_name(args):
    # TODO: Check the length of the args array is valid
    gcode, lang, name, type, desc = \
        args.args[0], args.args[1], args.args[2], args.args[3], args.args[4]

    with_session(args)
    with transaction.manager:
        languoid = DBSession.query(common.Language) \
                            .filter_by(id='{0}'.format(gcode)) \
                            .first()
        identifier = common.Identifier(
            (name, type, desc, lang),
            id='{0}-{1}-{2}-{3}'.format(slug(name), slug(type),
                                        slug(desc or ''), lang),
            name=name,
            type=type,
            description=desc,
            lang=lang)
        DBSession.add(identifier)
        DBSession.add(
            common.LanguageIdentifier(language=languoid,
                                      identifier=identifier))
Exemple #9
0
def update(args):
    count = 0
    assert args.json

    iid = int(
        DBSession.execute("select max(cast(id as integer)) from identifier").
        fetchone()[0]) + 1
    pk = DBSession.execute("select max(pk) from identifier").fetchone()[0] + 1

    langs = {}
    for gid, name in args.json['wikipedia'].items():
        if gid not in langs:
            langs[gid] = Languoid.get(gid)
        langs[gid].update_jsondata(wikipedia=name.split('/')[-1])

    for gid, codes in args.json['multitree'].items():
        l = langs[gid]
        lcodes = [i.name for i in l.identifiers if i.type == 'multitree']

        for code in set(codes):
            if code not in lcodes:
                identifier = DBSession.query(common.Identifier)\
                    .filter(common.Identifier.type == 'multitree')\
                    .filter(common.Identifier.name == code)\
                    .first()
                if not identifier:
                    identifier = common.Identifier(pk=pk,
                                                   id=str(iid),
                                                   name=code,
                                                   type='multitree')
                    iid += 1
                    pk += 1
                count += 1
                DBSession.add(
                    common.LanguageIdentifier(language=l,
                                              identifier=identifier))

    print count, 'new multitree identifiers'
Exemple #10
0
def update_glottocode(session, timestamp, lang, gc):  # pragma: no cover
    if isinstance(lang, basestring):
        lang = common.Language.get(lang, session=session)
    lang.updated = timestamp

    for li in lang.languageidentifier:
        if li.identifier.name == gc and li.identifier.type == common.IdentifierType.glottolog.value:
            session.delete(li)

    glottocode = session.query(common.Identifier)\
        .filter(common.Identifier.type == common.IdentifierType.glottolog.value)\
        .filter(common.Identifier.name == gc)\
        .first()

    if not glottocode:
        glottocode = common.Identifier(
            id=gc,
            name=gc,
            description=gc,
            type=common.IdentifierType.glottolog.value)
    session.add(common.LanguageIdentifier(language=lang,
                                          identifier=glottocode))
    return lang
Exemple #11
0
def main(args):
    data = Data()

    editors = OrderedDict()
    editors['Susanne Maria Michaelis'] = None
    editors['Philippe Maurer'] = None
    editors['Martin Haspelmath'] = None
    editors['Magnus Huber'] = None

    for row in read(args, 'People'):
        name = row['First name'] + ' ' if row['First name'] else ''
        name += row['Last name']
        kw = dict(
            name=name,
            id=slug('%(Last name)s%(First name)s' % row),
            url=row['Contact Website'].split()[0]
            if row['Contact Website'] else None,
            address=row['Comments on database'],
        )
        contrib = data.add(common.Contributor, row['Author ID'], **kw)
        if kw['name'] in editors:
            editors[kw['name']] = contrib

    DBSession.flush()

    dataset = common.Dataset(
        id='apics',
        name='APiCS Online',
        description='Atlas of Pidgin and Creole Language Structures Online',
        domain='apics-online.info',
        published=date(2013, 11, 4),
        license='http://creativecommons.org/licenses/by/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 3.0 Unported License'
        })
    DBSession.add(dataset)
    for i, editor in enumerate(editors.values()):
        common.Editor(dataset=dataset, contributor=editor, ord=i + 1)

    colors = dict(
        (row['ID'], row['RGB_code']) for row in read(args, 'Colours'))

    abbrs = {}
    for id_, name in LGR_ABBRS.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))
        abbrs[id_] = 1

    for id_, name in {
            'C**T': 'clitic',
            'IMPF': 'imperfect',
            'INTERM': 'intermediate',
            'NCOMPL': 'noncompletive',
            'NONFUT': 'nonfuture',
            'NPROX': 'nonproximal',
            'NSG': 'nonsingular',
            'PP': 'past participle',
            'PROP': 'proprietive',
            'TMA': 'tense-mood-aspect',
    }.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))
        abbrs[id_] = 1

    for row in reader(args.data_file('non-lgr-gloss-abbrs.csv'),
                      delimiter=',',
                      namedtuples=True):
        for match in GLOSS_ABBR_PATTERN.finditer(row.standard):
            if match.group('abbr') not in abbrs:
                abbrs[match.group('abbr')] = 1
                DBSession.add(
                    common.GlossAbbreviation(id=match.group('abbr'),
                                             name=row.meaning))

    non_bibs = {}
    for row in read(args, 'References', 'Reference_ID'):
        if row['Reference_type'] == 'Non-bib':
            non_bibs[row['Reference_ID']] = row['Reference_name']
            continue

        if isinstance(row['Year'], int):
            year_int = row['Year']
            year = str(row['Year'])
        elif row['Year']:
            year_int = None
            for m in re.finditer('(?P<year>(1|2)[0-9]{3})', row['Year']):
                year_int = int(m.group('year'))
                break
            year = row['Year']
        else:
            year, year_int = None, None

        title = row['Article_title'] or row['Book_title']
        attrs = {}
        jsondata = {}
        for attr, field in {
                'Additional_information': 'note',
                'Article_title': 'title',
                'Book_title': 'booktitle',
                'City': 'address',
                'Editors': 'editor',
                'Full_reference': None,
                'Issue': None,
                'Journal': 'journal',
                'Language_codes': None,
                'LaTeX_cite_key': None,
                'Pages': 'pages',
                'Publisher': 'publisher',
                'Reference_type': 'type',
                'School': 'school',
                'Series_title': 'series',
                'URL': 'url',
                'Volume': 'volume',
        }.items():
            value = row.get(attr)
            if not isinstance(value, int):
                value = (value or '').strip()
            if attr == 'Issue' and value:
                try:
                    value = str(int(value))
                except ValueError:
                    pass
            if value:
                if field:
                    attrs[field] = value
                else:
                    jsondata[attr] = value
        p = data.add(common.Source,
                     row['Reference_ID'],
                     id=str(row['Reference_ID']),
                     name=row['Reference_name'],
                     description=title,
                     author=row['Authors'],
                     year=year,
                     year_int=year_int,
                     bibtex_type=getattr(EntryType, row['BibTeX_type']
                                         or 'misc'),
                     jsondata=jsondata,
                     **attrs)
        if p.bibtex_type.value == 'misc' and not p.description:
            p.description = p.note
        DBSession.flush()

    DBSession.flush()

    infobox = jsonload(args.data_file('infobox.json'))
    glottocodes = jsonload(args.data_file('glottocodes.json'))
    for row in read(args, 'Languages', 'Order_number'):
        lon, lat = [
            float(c.strip()) for c in row['map_coordinates'].split(',')
        ]
        kw = dict(
            name=row['Language_name'],
            id=str(row['Order_number']),
            latitude=lat,
            longitude=lon,
            region=row['Category_region'],
        )
        lect = data.add(models.Lect, row['Language_ID'], **kw)
        DBSession.flush()

        for i, item in enumerate(infobox[lect.id]):
            DBSession.add(
                common.Language_data(object_pk=lect.pk,
                                     ord=i,
                                     key=item[0],
                                     value=item[1]))

        if row["Languages_contribution_documentation::Lect_description_checked_status"] \
                != "Checked":
            print 'unchecked! ---', row['Language_name']

        desc = row.get(
            'Languages_contribution_documentation::Lect description', '')
        markup_desc = normalize_markup(row[
            'Languages_contribution_documentation::z_calc_GetAsCSS_Lect_description']
                                       )

        c = data.add(
            models.ApicsContribution,
            row['Language_ID'],
            id=str(row['Order_number']),
            name=row['Language_name'],
            description=desc,
            markup_description=markup_desc,
            survey_reference=data['Source'][row['Survey_reference_ID']],
            language=lect)

        for ext, label, mtype in [
            ('pdf', 'Glossed text', 'application/pdf'),
            ('mp3', 'Glossed text audio', 'audio/mpeg'),
        ]:
            fid = '%s-gt.%s' % (c.id, ext)
            if args.data_file('files', 'contribution', c.id, fid).exists():
                common.Contribution_files(object=c,
                                          id=fid,
                                          name=label,
                                          mime_type=mtype)
            else:
                print label, 'missing for:', row['Language_name']

        #
        # TODO: for michif, 75, add link http://www.youtube.com/watch?v=f0C4cODsSyE
        #

        iso = None
        if row['ISO_code'] and len(row['ISO_code']) == 3:
            iso = row['ISO_code'].lower()
            if 'iso:%s' % row['ISO_code'] not in data['Identifier']:
                data.add(common.Identifier,
                         'iso:%s' % row['ISO_code'],
                         id=row['ISO_code'].lower(),
                         name=row['ISO_code'].lower(),
                         type=common.IdentifierType.iso.value)

            DBSession.add(
                common.LanguageIdentifier(
                    language=data['Lect'][row['Language_ID']],
                    identifier=data['Identifier']['iso:%s' % row['ISO_code']]))

        if lect.id in glottocodes:
            identifier = data.add(common.Identifier,
                                  'gc:%s' % glottocodes[lect.id],
                                  id=glottocodes[lect.id],
                                  name=glottocodes[lect.id],
                                  type=common.IdentifierType.glottolog.value)

            DBSession.add(
                common.LanguageIdentifier(
                    language=data['Lect'][row['Language_ID']],
                    identifier=identifier))

        if row['Language_name_ethnologue']:
            if row['Language_name_ethnologue'] not in data['Identifier']:
                data.add(common.Identifier,
                         row['Language_name_ethnologue'],
                         id=iso
                         or 'ethnologue:%s' % row['Language_name_ethnologue'],
                         name=row['Language_name_ethnologue'],
                         type='ethnologue')

            DBSession.add(
                common.LanguageIdentifier(
                    language=data['Lect'][row['Language_ID']],
                    identifier=data['Identifier'][
                        row['Language_name_ethnologue']]))

    example_count = {}
    for row in read(args, 'Examples', 'Order_number'):
        assert row['Language_ID']
        lang = data['Lect'][row['Language_ID']]
        id_ = '%(Language_ID)s-%(Example_number)s' % row
        atext, gloss = igt(row)
        example_count[row['Language_ID']] = max(
            [example_count.get(row['Language_ID'], 1), row['Example_number']])
        p = add_sentence(
            args,
            data,
            id_,
            id='%s-%s' % (lang.id, row['Example_number']),
            name=row['Text'] or row['Analyzed_text'],
            description=row['Translation'],
            type=row['Type'].strip().lower() if row['Type'] else None,
            comment=row['Comments'],
            gloss=gloss,
            analyzed=atext,
            markup_text=normalize_markup(row['z_calc_Text_CSS']),
            markup_gloss=normalize_markup(row['z_calc_Gloss_CSS']),
            markup_comment=normalize_markup(row['z_calc_Comments_CSS']),
            markup_analyzed=normalize_markup(row['z_calc_Analyzed_text_CSS']),
            original_script=row['Original_script'],
            jsondata={
                'sort': row['Order_number'],
                'alt_translation': (row['Translation_other'] or '').strip()
                or None
            },
            language=lang)

        if row['Reference_ID']:
            if row['Reference_ID'] in data['Source']:
                source = data['Source'][row['Reference_ID']]
                DBSession.add(
                    common.SentenceReference(
                        sentence=p,
                        source=source,
                        key=source.id,
                        description=row['Reference_pages']))
            else:
                p.source = non_bibs[row['Reference_ID']]

    DBSession.flush()

    for row in read(args, 'Language_references'):
        if row['Reference_ID'] not in data['Source']:
            assert row['Reference_ID'] in non_bibs
            continue
        assert row['Language_ID'] in data['ApicsContribution']
        source = data['Source'][row['Reference_ID']]
        DBSession.add(
            common.ContributionReference(
                contribution=data['ApicsContribution'][row['Language_ID']],
                source=source,
                description=row['Pages'],
                key=source.id))

    #
    # global counter for features - across feature types
    #
    feature_count = 0
    for row in read(args, 'Features', 'Feature_number'):
        id_ = str(row['Feature_number'])
        if int(id_) > feature_count:
            feature_count = int(id_)
        wals_id = None
        desc = row['Feature_annotation_publication']
        if row['WALS_match'] == 'Total':
            if isinstance(row['WALS_No.'], int):
                wals_id = row['WALS_No.']
            else:
                wals_id = int(row['WALS_No.'].split('.')[0].strip())

        p = data.add(models.Feature,
                     row['Feature_code'],
                     name=row['Feature_name'],
                     id=id_,
                     description=desc,
                     markup_description=normalize_markup(
                         row['z_calc_Feature_annotation_publication_CSS']),
                     feature_type='primary',
                     multivalued=row['Value_relation_type'] != 'Single',
                     area=row['Feature_area'],
                     wals_id=wals_id)

        names = {}
        for i in range(1, 10):
            if not row['Value%s_publication' % i] \
                    or not row['Value%s_publication' % i].strip():
                continue
            name = row['Value%s_publication' % i].strip()
            if name in names:
                name += ' (%s)' % i
            names[name] = 1
            de = data.add(
                common.DomainElement,
                '%s-%s' % (row['Feature_code'], i),
                id='%s-%s' % (id_, i),
                name=name,
                parameter=p,
                abbr=row['Value%s_for_book_maps' % i] if p.id != '0' else name,
                number=int(row['Value%s_value_number_for_publication' % i]),
                jsondata={'color': colors[row['Value_%s_colour_ID' % i]]},
            )
            assert de

        if row['Authors_FeatureArticles']:
            authors, _ = row['Authors_FeatureArticles'].split('and the APiCS')
            authors = authors.strip()
            if authors.endswith(','):
                authors = authors[:-1].strip()
            for i, name in enumerate(authors.split(',')):
                assert name.strip() in editors
                p._authors.append(
                    models.FeatureAuthor(ord=i + 1,
                                         contributor=editors[name.strip()]))

        DBSession.flush()

    primary_to_segment = {123: 63, 126: 35, 128: 45, 130: 41}
    segment_to_primary = dict(
        zip(primary_to_segment.values(), primary_to_segment.keys()))
    number_map = {}
    names = {}
    for row in read(args, 'Segment_features', 'Order_number'):
        symbol = row['Segment_symbol']
        if row['Segment_name'] == 'voiceless dental/alveolar sibilant affricate':
            symbol = 't\u0361s'
        truth = lambda s: s and s.strip().lower() == 'yes'
        name = '%s - %s' % (symbol, row['Segment_name'])

        if name in names:
            number_map[row['Segment_feature_number']] = names[name]
            continue

        number_map[
            row['Segment_feature_number']] = row['Segment_feature_number']
        names[name] = row['Segment_feature_number']
        feature_count += 1
        if row['Segment_feature_number'] in segment_to_primary:
            primary_to_segment[segment_to_primary[row['Segment_feature_number']]]\
                = str(feature_count)
        p = data.add(models.Feature,
                     row['Segment_feature_number'],
                     name=name,
                     id=str(feature_count),
                     feature_type='segment',
                     area='Vowels' if truth(row['Vowel']) else
                     ('Obstruent consonants'
                      if truth(row['Obstruent']) else 'Sonorant consonants'),
                     jsondata=dict(
                         number=int(row['Segment_feature_number']),
                         vowel=truth(row['Vowel']),
                         consonant=truth(row['Consonant']),
                         obstruent=truth(row['Obstruent']),
                         core_list=truth(row['Core_list_segment']),
                         symbol=symbol,
                     ))

        for i, spec in SEGMENT_VALUES.items():
            data.add(common.DomainElement,
                     '%s-%s' % (row['Segment_feature_number'], spec[0]),
                     id='%s-%s' % (p.id, i),
                     name=spec[0],
                     parameter=p,
                     jsondata={'color': spec[1]},
                     number=i)

    print '--> remapped:', primary_to_segment
    DBSession.flush()

    for row in read(args, 'Sociolinguistic_features',
                    'Sociolinguistic_feature_number'):
        feature_count += 1
        p = data.add(models.Feature,
                     row['Sociolinguistic_feature_code'],
                     name=row['Sociolinguistic_feature_name'],
                     id='%s' % feature_count,
                     description=row['Sociolinguistic_feature_annotation'],
                     area='Sociolinguistic',
                     feature_type='sociolinguistic')

        names = {}

        for i in range(1, 10):
            id_ = '%s-%s' % (row['Sociolinguistic_feature_code'], i)
            if row.get('Value%s' % i) and row['Value%s' % i].strip():
                name = row['Value%s' % i].strip()
                if name in names:
                    name += ' (%s)' % i
                names[name] = 1
            else:
                continue
            kw = dict(id='%s-%s' % (p.id, i), name=name, parameter=p, number=i)
            data.add(common.DomainElement,
                     id_,
                     id='%s-%s' % (p.id, i),
                     name=name,
                     parameter=p,
                     number=i,
                     jsondata={
                         'color':
                         colors.get(row['Value%s_colour_ID' % i],
                                    colors.values()[i])
                     })

    sd = {}
    for row in read(args, 'Segment_data'):
        if row['Segment_feature_number'] not in number_map:
            continue
        number = number_map[row['Segment_feature_number']]

        if not row['Presence_in_the_language']:
            continue

        lang = data['Lect'][row['Language_ID']]
        param = data['Feature'][number]
        id_ = '%s-%s' % (lang.id, param.id)
        if id_ in sd:
            assert row['c_Record_is_a_duplicate'] == 'Yes'
            continue
        sd[id_] = 1
        valueset = data.add(
            common.ValueSet,
            id_,
            id=id_,
            parameter=param,
            language=lang,
            contribution=data['ApicsContribution'][row['Language_ID']],
            description=row['Comments'],
            markup_description=normalize_markup(row['z_calc_Comments_CSS']),
        )
        v = data.add(
            common.Value,
            id_,
            id=id_,
            frequency=float(100),
            valueset=valueset,
            domainelement=data['DomainElement'][
                '%s-%s' % (number, row['Presence_in_the_language'])],
        )
        if row['Example_word'] and row['Example_word_gloss']:
            example_count[row['Language_ID']] += 1
            p = add_sentence(args,
                             data,
                             '%s-p%s' % (lang.id, data['Feature'][number].id),
                             id='%s-%s' %
                             (lang.id, example_count[row['Language_ID']]),
                             name=row['Example_word'],
                             description=row['Example_word_gloss'],
                             language=lang)
            DBSession.add(common.ValueSentence(value=v, sentence=p))

        source = data['Source'].get(row['Refers_to_references_Reference_ID'])
        if source:
            DBSession.add(
                common.ValueSetReference(valueset=valueset,
                                         source=source,
                                         key=source.id))
        elif row['Refers_to_references_Reference_ID'] in non_bibs:
            valueset.source = non_bibs[
                row['Refers_to_references_Reference_ID']]

    lects = defaultdict(lambda: 1)
    lect_map = {}
    records = {}
    false_values = {}
    no_values = {}
    wals_value_number = {}
    for row in read(args, 'wals'):
        if row['z_calc_WALS_value_number']:
            wals_value_number[
                row['Data_record_id']] = row['z_calc_WALS_value_number']

    def prefix(attr, _prefix):
        if _prefix:
            return '%s_%s' % (_prefix, attr)
        return attr.capitalize()

    for _prefix, abbr in [('', ''), ('Sociolinguistic', 'sl')]:
        num_values = 10
        for row in read(args, prefix('data', _prefix)):
            if not row[prefix('feature_code', _prefix)]:
                print('no associated feature for', prefix('data', _prefix),
                      row[prefix('data_record_id', _prefix)])
                continue

            lid = row['Language_ID']
            lect_attr = row.get('Lect_attribute', 'my default lect').lower()
            if lect_attr != 'my default lect':
                if (row['Language_ID'], row['Lect_attribute']) in lect_map:
                    lid = lect_map[(row['Language_ID'], row['Lect_attribute'])]
                else:
                    lang = data['Lect'][row['Language_ID']]
                    c = lects[row['Language_ID']]
                    lid = '%s-%s' % (row['Language_ID'], c)
                    kw = dict(
                        name='%s (%s)' % (lang.name, row['Lect_attribute']),
                        id='%s' % (1000 + 10 * int(lang.id) + c),
                        latitude=lang.latitude,
                        longitude=lang.longitude,
                        description=row['Lect_attribute'],
                        language=lang,
                    )
                    data.add(models.Lect, lid, **kw)
                    lects[row['Language_ID']] += 1
                    lect_map[(row['Language_ID'], row['Lect_attribute'])] = lid

            id_ = abbr + str(row[prefix('data_record_id', _prefix)])
            assert id_ not in records
            records[id_] = 1

            assert row[prefix('feature_code', _prefix)] in data['Feature']
            language = data['Lect'][lid]
            parameter = data['Feature'][row[prefix('feature_code', _prefix)]]
            valueset = common.ValueSet(
                id='%s-%s' % (language.id, parameter.id),
                description=row['Comments_on_value_assignment'],
                markup_description=normalize_markup(
                    row.get('z_calc_Comments_on_value_assignment_CSS')),
            )

            values_found = {}
            for i in range(1, num_values):
                if not row['Value%s_true_false' % i]:
                    continue

                if row['Value%s_true_false' % i].strip().lower() != 'true':
                    assert row['Value%s_true_false' %
                               i].strip().lower() == 'false'
                    false_values[row[prefix('data_record_id', _prefix)]] = 1
                    continue

                iid = '%s-%s' % (row[prefix('feature_code', _prefix)], i)
                if iid not in data['DomainElement']:
                    print(iid, row[prefix('data_record_id',
                                          _prefix)], '--> no domainelement!')
                    continue
                values_found['%s-%s' % (id_, i)] = dict(
                    id='%s-%s' % (valueset.id, i),
                    domainelement=data['DomainElement']['%s-%s' % (row[prefix(
                        'feature_code', _prefix)], i)],
                    confidence=row['Value%s_confidence' % i],
                    frequency=float(row['c_V%s_frequency_normalised' %
                                        i]) if _prefix == '' else 100)

            if values_found:
                if row[prefix('data_record_id', _prefix)] in wals_value_number:
                    valueset.jsondata = {
                        'wals_value_number':
                        wals_value_number.pop(row[prefix(
                            'data_record_id', _prefix)])
                    }
                valueset.parameter = parameter
                valueset.language = language
                valueset.contribution = data['ApicsContribution'][
                    row['Language_ID']]
                valueset = data.add(common.ValueSet, id_, _obj=valueset)
                for i, item in enumerate(values_found.items()):
                    if i > 0 and not parameter.multivalued:
                        print 'multiple values for single-valued parameter: %s' % id_
                        break
                    id_, kw = item
                    kw['valueset'] = valueset
                    value = data.add(common.Value, id_, **kw)

                #
                # store references to additional data for segments which should be reused
                # for corresponding primary features!
                #
                if int(parameter.id) in primary_to_segment:
                    assert len(values_found) == 1
                    seg_id = '%s-%s' % (language.id, primary_to_segment[int(
                        parameter.id)])
                    seg_valueset = data['ValueSet'][seg_id]
                    seg_value = data['Value'][seg_id]
                    if not valueset.description and seg_valueset.description:
                        valueset.description = seg_valueset.description

                    for s in seg_value.sentence_assocs:
                        DBSession.add(
                            common.ValueSentence(value=value,
                                                 sentence=s.sentence))

                    for r in seg_valueset.references:
                        DBSession.add(
                            common.ValueSetReference(valueset=valueset,
                                                     source=r.source,
                                                     key=r.key))

                    if not valueset.source and seg_valueset.source:
                        valueset.source = seg_valueset.source

                DBSession.flush()
            else:
                no_values[id_] = 1

    DBSession.flush()

    for prefix, abbr, num_values in [
        ('D', '', 10),
        ('Sociolinguistic_d', 'sl', 7),
    ]:
        for row in read(args, prefix + 'ata_references'):
            assert row['Reference_ID'] in data['Source'] \
                or row['Reference_ID'] in non_bibs
            try:
                vs = data['ValueSet'][abbr +
                                      str(row[prefix + 'ata_record_id'])]
                if row['Reference_ID'] in data['Source']:
                    source = data['Source'][row['Reference_ID']]
                    DBSession.add(
                        common.ValueSetReference(
                            valueset=vs,
                            source=source,
                            key=source.id,
                            description=row['Pages'],
                        ))
                else:
                    if vs.source:
                        vs.source += '; ' + non_bibs[row['Reference_ID']]
                    else:
                        vs.source = non_bibs[row['Reference_ID']]
            except KeyError:
                continue

    DBSession.flush()

    missing = 0
    for row in read(args, 'Value_examples'):
        try:
            DBSession.add(
                common.ValueSentence(
                    value=data['Value']['%(Data_record_id)s-%(Value_number)s' %
                                        row],
                    sentence=data['Sentence'][
                        '%(Language_ID)s-%(Example_number)s' % row],
                    description=row['Notes'],
                ))
        except KeyError:
            missing += 1
    print('%s Value_examples are missing data' % missing)

    print('%s data sets with false values' % len(false_values))
    print('%s data sets without values' % len(no_values))

    for k, v in wals_value_number.items():
        print 'unclaimed wals value number:', k, v

    for i, row in enumerate(read(args, 'Contributors')):
        kw = dict(contribution=data['ApicsContribution'][row['Language ID']],
                  contributor=data['Contributor'][row['Author ID']])
        if row['Order_of_appearance']:
            kw['ord'] = int(float(row['Order_of_appearance']))
        data.add(common.ContributionContributor, i, **kw)

    DBSession.flush()
Exemple #12
0
    def setUp(self):
        TestWithDb.setUp(self)

        DBSession.add(
            common.Dataset(id='dataset',
                           name='dataset',
                           description='desc',
                           domain='clld'))

        source = common.Source(id='source')
        contributors = {
            'contributor': 'A Name',
            'b': 'b Name',
            'c': 'c Name',
            'd': 'd Name'
        }
        for id_, name in contributors.items():
            contributors[id_] = common.Contributor(id=id_, name=name)

        contribution = common.Contribution(id='contribution',
                                           name='Contribution')
        cr = common.ContributionReference(contribution=contribution,
                                          source=source)
        assert common.ContributionContributor(
            contribution=contribution,
            primary=True,
            contributor=contributors['contributor'])
        assert common.ContributionContributor(contribution=contribution,
                                              primary=False,
                                              contributor=contributors['b'])
        assert common.ContributionContributor(contribution=contribution,
                                              primary=True,
                                              contributor=contributors['c'])
        assert common.ContributionContributor(contribution=contribution,
                                              primary=False,
                                              contributor=contributors['d'])

        DBSession.add(contribution)

        language = common.Language(id='language',
                                   name='Language 1',
                                   latitude=10.5,
                                   longitude=0.3)
        language.sources.append(source)
        identifier = common.Identifier(type='iso639-3', id='iso')
        li = common.LanguageIdentifier(language=language,
                                       identifier=identifier)

        for i in range(2, 102):
            _l = common.Language(id='l%s' % i, name='Language %s' % i)
            _i = common.Identifier(type='iso639-3',
                                   id='%.3i' % i,
                                   name='%.3i' % i)
            _li = common.LanguageIdentifier(language=_l, identifier=_i)
            DBSession.add(_l)

        param = common.Parameter(id='parameter', name='Parameter')
        de = common.DomainElement(id='de',
                                  name='DomainElement',
                                  parameter=param)
        de2 = common.DomainElement(id='de2',
                                   name='DomainElement2',
                                   parameter=param)
        valueset = common.ValueSet(id='valueset',
                                   language=language,
                                   parameter=param,
                                   contribution=contribution)
        value = common.Value(id='value',
                             domainelement=de,
                             valueset=valueset,
                             frequency=50,
                             confidence='high')
        DBSession.add(value)
        paramnd = common.Parameter(id='no-domain',
                                   name='Parameter without domain')
        valueset = common.ValueSet(id='vs2',
                                   language=language,
                                   parameter=paramnd,
                                   contribution=contribution)
        vr = common.ValueSetReference(valueset=valueset, source=source)
        value = common.Value(id='v2',
                             valueset=valueset,
                             frequency=50,
                             confidence='high')
        DBSession.add(value)

        unit = common.Unit(id='unit', name='Unit', language=language)
        up = common.UnitParameter(id='unitparameter', name='UnitParameter')
        DBSession.add(unit)
        DBSession.add(
            common.UnitValue(id='unitvalue',
                             name='UnitValue',
                             unit=unit,
                             unitparameter=up))

        up2 = common.UnitParameter(id='up2', name='UnitParameter with domain')
        de = common.UnitDomainElement(id='de', name='de', parameter=up2)
        DBSession.add(
            common.UnitValue(id='uv2',
                             name='UnitValue2',
                             unit=unit,
                             unitparameter=up2,
                             unitdomainelement=de))

        DBSession.add(common.Source(id='s'))

        sentence = common.Sentence(id='sentence',
                                   name='sentence name',
                                   description='sentence description',
                                   analyzed='a\tmorpheme\tdoes\tdo',
                                   gloss='a\tmorpheme\t1SG\tdo.SG2',
                                   source='own',
                                   comment='comment',
                                   original_script='a morpheme',
                                   language=language)
        sr = common.SentenceReference(sentence=sentence, source=source)
        DBSession.add(common.Config(key='key', value='value'))
        DBSession.flush()
Exemple #13
0
def add_languoids(data, languoids: dict):
    instances = []
    id_to_instance = dict()

    for slug, fields in languoids.items():
        key_ = slug

        name = fields['name']
        parent = fields.get('parent', None)
        latitude = fields.get('lat', None)
        longitude = fields.get('lng', None)
        rich_description = fields.get('rich_description', None)
        glottocode = fields.get('glottolog', None)
        ethnologue_code = fields.get('ethnologue', None)
        nex_url = fields.get('nex_url', None)
        description_file_url = fields.get('description_file_url', None)

        add_kwargs = {
            'name': name,
            'id': key_,
        }

        if parent is not None:
            add_kwargs['parent'] = id_to_instance[parent]

        if latitude is not None and longitude is not None:
            add_kwargs['latitude'] = latitude
            add_kwargs['longitude'] = longitude

        if rich_description is not None:
            add_kwargs['description'] = rich_description
            add_kwargs['markup_description'] = rich_description

        if glottocode is not None:
            add_kwargs['glottocode_'] = glottocode

        if ethnologue_code is not None:
            add_kwargs['ethnologue_code'] = ethnologue_code

        if nex_url is not None:
            add_kwargs['nex_url'] = nex_url

        if description_file_url is not None:
            add_kwargs['description_file_url'] = description_file_url

        add_kwargs['level'] = fields.get('level',
                                         models.LanguoidLevel.language)

        instance = data.add(models.Languoid, key_, **add_kwargs)
        instances.append(instance)
        id_to_instance[instance.id] = instance

        if glottocode is not None:
            identifier = data.add(models.Identifier,
                                  f'{key_}-glottocode',
                                  type=models.IdentifierType.glottolog.value,
                                  name=glottocode)
            DBSession.add(
                common.LanguageIdentifier(language=instance,
                                          identifier=identifier))

        if ethnologue_code is not None:
            identifier = data.add(models.Identifier,
                                  f'{key_}-ethnologue',
                                  type=models.IdentifierType.ethnologue.value,
                                  name=ethnologue_code)
            DBSession.add(
                common.LanguageIdentifier(language=instance,
                                          identifier=identifier))

    return instances
Exemple #14
0
    def setUp(self):
        TestWithDb.setUp(self)

        DBSession.add(
            common.Dataset(id='dataset',
                           name='dataset',
                           description='desc',
                           domain='clld',
                           jsondata={'license_icon': 'cc-by'}))

        DBSession.add(
            common.Source(id='replaced',
                          active=False,
                          jsondata={'__replacement_id__': 'source'}))
        source = common.Source(id='source')
        contributors = {
            'contributor': 'A Name',
            'b': 'b Name',
            'c': 'c Name',
            'd': 'd Name'
        }
        for id_, name in contributors.items():
            contributors[id_] = common.Contributor(id=id_,
                                                   name=name,
                                                   url='http://example.org')

        contribution = common.Contribution(id='contribution',
                                           name='Contribution')
        common.ContributionReference(contribution=contribution, source=source)
        assert common.ContributionContributor(
            contribution=contribution,
            primary=True,
            contributor=contributors['contributor'])
        assert common.ContributionContributor(contribution=contribution,
                                              primary=False,
                                              contributor=contributors['b'])
        assert common.ContributionContributor(contribution=contribution,
                                              primary=True,
                                              contributor=contributors['c'])
        assert common.ContributionContributor(contribution=contribution,
                                              primary=False,
                                              contributor=contributors['d'])

        DBSession.add(contribution)

        language = common.Language(id='language',
                                   name='Language 1',
                                   latitude=10.5,
                                   longitude=0.3)
        language.sources.append(source)
        for i, type_ in enumerate(common.IdentifierType):
            id_ = common.Identifier(type=type_.value,
                                    id=type_.value + str(i),
                                    name='abc')
            common.LanguageIdentifier(language=language, identifier=id_)

        for i in range(2, 102):
            _l = common.Language(id='l%s' % i, name='Language %s' % i)
            _i = common.Identifier(type='iso639-3',
                                   id='%.3i' % i,
                                   name='%.3i' % i)
            common.LanguageIdentifier(language=_l, identifier=_i)
            DBSession.add(_l)

        param = common.Parameter(id='parameter', name='Parameter')
        de = common.DomainElement(id='de',
                                  name='DomainElement',
                                  parameter=param)
        de2 = common.DomainElement(id='de2',
                                   name='DomainElement2',
                                   parameter=param)
        valueset = common.ValueSet(id='valueset',
                                   language=language,
                                   parameter=param,
                                   contribution=contribution)
        value = common.Value(id='value',
                             domainelement=de,
                             valueset=valueset,
                             frequency=50,
                             confidence='high')
        DBSession.add(value)
        value2 = common.Value(id='value2',
                              domainelement=de2,
                              valueset=valueset,
                              frequency=50,
                              confidence='high')
        DBSession.add(value2)
        paramnd = common.Parameter(id='no-domain',
                                   name='Parameter without domain')
        valueset = common.ValueSet(id='vs2',
                                   language=language,
                                   parameter=paramnd,
                                   contribution=contribution)
        common.ValueSetReference(valueset=valueset, source=source)
        value = common.Value(id='v2',
                             valueset=valueset,
                             frequency=50,
                             confidence='high')
        DBSession.add(value)

        unit = common.Unit(id='unit', name='Unit', language=language)
        up = common.UnitParameter(id='unitparameter', name='UnitParameter')
        DBSession.add(unit)
        DBSession.add(
            common.UnitValue(id='unitvalue',
                             name='UnitValue',
                             unit=unit,
                             unitparameter=up))

        up2 = common.UnitParameter(id='up2', name='UnitParameter with domain')
        de = common.UnitDomainElement(id='de', name='de', parameter=up2)
        DBSession.add(
            common.UnitValue(id='uv2',
                             name='UnitValue2',
                             unit=unit,
                             unitparameter=up2,
                             unitdomainelement=de))

        DBSession.add(common.Source(id='s'))

        sentence = common.Sentence(
            id='sentence',
            name='sentence name',
            description='sentence description',
            analyzed='a\tmorpheme\tdoes\tdo',
            gloss='a\tmorpheme\t1SG\tdo.SG2',
            source='own',
            comment='comment',
            original_script='a morpheme',
            language=language,
            jsondata={'alt_translation': 'Spanish: ...'})
        common.SentenceReference(sentence=sentence, source=source)
        DBSession.add(common.Config(key='key', value='value'))

        common.Config.add_replacement('replaced',
                                      'language',
                                      model=common.Language)
        common.Config.add_replacement('gone', None, model=common.Language)
        DBSession.flush()
Exemple #15
0
def add_languoids(data, languoids: list):
    converted_pk = dict()
    languoids = sorted(languoids,
                       key=lambda x: operator.itemgetter('level')(x['fields']))

    current_pk = 0

    for languoid in languoids:
        fields = languoid['fields']
        old_pk = languoid['pk']

        key_ = get_languoid_key_(old_pk)

        name = fields['name']
        parent = fields.get('parent', None)
        latitude = fields.get('latitude', None)
        longitude = fields.get('longitude', None)
        rich_description = fields.get('rich_description', None)
        glottocode = fields.get('glottocode', None)
        ethnologue_code = fields.get('ethnologue_code', None)
        nex_url = fields.get('nex_url', None)
        description_file_url = fields.get('description_file_url', None)

        add_kwargs = {
            'name': name,
            'id': slugify(name),
        }

        if parent is not None:
            print(parent)
            add_kwargs['parent_pk'] = converted_pk[parent]

        if latitude is not None and longitude is not None:
            add_kwargs['latitude'] = latitude
            add_kwargs['longitude'] = longitude

        if rich_description is not None:
            add_kwargs['description'] = rich_description
            add_kwargs['markup_description'] = rich_description

        if glottocode is not None:
            add_kwargs['glottocode_'] = glottocode

        if ethnologue_code is not None:
            add_kwargs['ethnologue_code'] = ethnologue_code

        if nex_url is not None:
            add_kwargs['nex_url'] = nex_url

        if description_file_url is not None:
            add_kwargs['description_file_url'] = description_file_url

        add_kwargs['level'] = [
            models.LanguoidLevel.superfamily,
            models.LanguoidLevel.family,
            models.LanguoidLevel.group,
            models.LanguoidLevel.language,
        ][fields.get('level', 3)]

        instance = data.add(models.Languoid, key_, **add_kwargs)

        if glottocode is not None:
            identifier = data.add(models.Identifier,
                                  f'{key_}-glottocode',
                                  type=models.IdentifierType.glottolog.value,
                                  name=glottocode)
            DBSession.add(
                common.LanguageIdentifier(language=instance,
                                          identifier=identifier))

        if ethnologue_code is not None:
            identifier = data.add(models.Identifier,
                                  f'{key_}-ethnologue',
                                  type=models.IdentifierType.glottolog.value,
                                  name=ethnologue_code)
            DBSession.add(
                common.LanguageIdentifier(language=instance,
                                          identifier=identifier))

        converted_pk[old_pk] = instance.pk
        current_pk += 1
Exemple #16
0
def populate_test_db(engine):
    set_alembic_version(engine, '58559d4eea0d')

    data = TestData()
    data.add_default(common.Dataset,
                     domain='clld',
                     jsondata={
                         'license_icon': 'cc-by',
                         'license_url': 'http://example.org'
                     })

    data.add_default(common.Contributor, name='A Name', email='*****@*****.**')
    for id_, name in {
            'b': 'b Name',
            'c': 'c Name',
            'd': 'd Name',
    }.items():
        data.add(common.Contributor,
                 id_,
                 id=id_,
                 name=name,
                 url='http://example.org')

    DBSession.add(
        common.Editor(dataset=data[common.Dataset],
                      contributor=data[common.Contributor]))

    data.add_default(common.Source)
    data.add(common.Source,
             'replaced',
             id='replaced',
             active=False,
             jsondata={'__replacement_id__': 'source'})

    data.add_default(common.Contribution)
    common.ContributionReference(contribution=data[common.Contribution],
                                 source=data[common.Source])

    for primary, c in [(True, 'contributor'), (False, 'b'), (True, 'c'),
                       (False, 'd')]:
        common.ContributionContributor(contribution=data[common.Contribution],
                                       primary=primary,
                                       contributor=data['Contributor'][c])

    data.add_default(common.Language, latitude=10.5, longitude=0.3)
    data[common.Language].sources.append(data[common.Source])

    for i, type_ in enumerate(common.IdentifierType):
        common.LanguageIdentifier(
            language=data[common.Language],
            identifier=common.Identifier(
                type=type_.value,
                id=type_.value + str(i),
                name='abc' if type_.name == 'iso' else 'glot1234'))

    common.LanguageIdentifier(language=data[common.Language],
                              identifier=common.Identifier(type='name',
                                                           id='name',
                                                           name='a'))

    for i in range(2, 102):
        _l = common.Language(id='l%s' % i, name='Language %s' % i)
        _i = common.Identifier(type='iso639-3', id='%.3i' % i, name='abc')
        common.LanguageIdentifier(language=_l, identifier=_i)
        DBSession.add(_l)

    param = data.add_default(common.Parameter)
    de = common.DomainElement(id='de', name='DomainElement', parameter=param)
    de2 = common.DomainElement(id='de2',
                               name='DomainElement2',
                               parameter=param)

    valueset = data.add_default(common.ValueSet,
                                language=data[common.Language],
                                parameter=param,
                                contribution=data[common.Contribution])
    common.ValueSetReference(valueset=valueset,
                             source=data[common.Source],
                             description='10-20')

    data.add_default(common.Value,
                     domainelement=de,
                     valueset=valueset,
                     frequency=50,
                     confidence='high')
    data.add(common.Value,
             'value2',
             id='value2',
             domainelement=de2,
             valueset=valueset,
             frequency=50,
             confidence='high')

    paramnd = data.add(common.Parameter,
                       'no-domain',
                       id='no-domain',
                       name='Parameter without domain')
    valueset = common.ValueSet(id='vs2',
                               language=data[common.Language],
                               parameter=paramnd,
                               contribution=data[common.Contribution])

    common.ValueSetReference(valueset=valueset,
                             source=data[common.Source],
                             description='10-20')
    common.Value(id='v2', valueset=valueset, frequency=50, confidence='high')

    unit = data.add_default(common.Unit, language=data[common.Language])
    up = data.add_default(common.UnitParameter)
    common.UnitValue(id='unitvalue',
                     name='UnitValue',
                     unit=unit,
                     unitparameter=up)

    up2 = common.UnitParameter(id='up2', name='UnitParameter with domain')
    de = common.UnitDomainElement(id='de', name='de', parameter=up2)
    DBSession.add(
        common.UnitValue(id='uv2',
                         name='UnitValue2',
                         unit=unit,
                         unitparameter=up2,
                         unitdomainelement=de))

    DBSession.add(common.Source(id='s'))

    sentence = data.add_default(common.Sentence,
                                description='sentence description',
                                analyzed='a\tmorpheme\tdoes\tdo',
                                gloss='a\tmorpheme\t1SG\tdo.SG2',
                                source='own',
                                comment='comment',
                                original_script='a morpheme',
                                language=data[common.Language],
                                jsondata={'alt_translation': 'Spanish: ...'})
    common.SentenceReference(sentence=sentence, source=data[common.Source])
    DBSession.add(common.Config(key='key', value='value'))

    common.Config.add_replacement('replaced',
                                  'language',
                                  model=common.Language)
    common.Config.add_replacement('gone', None, model=common.Language)
    DBSession.flush()
Exemple #17
0
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """

    print('Parsing markdown intros...')
    for contrib in DBSession.query(models.Contribution):
        if contrib.description:
            contrib.markup_description = markdown(contrib.description)
        else:
            contrib.markup_description = None
    print('... done')

    print('Retrieving language data from glottolog...')

    catconf = cldfcatalog.Config.from_file()
    glottolog_path = catconf.get_clone('glottolog')
    glottolog = Glottolog(glottolog_path)

    lang_ids = [lang.id for lang in DBSession.query(common.Language)]
    languoids = {l.id: l for l in glottolog.languoids(lang_ids)}

    glottocodes = [(l.id,
                    common.Identifier(id=l.id, name=l.id, type='glottolog'))
                   for l in languoids.values()]
    glottocodes = OrderedDict(sorted(glottocodes, key=lambda t: t[0]))

    isocodes = [(l.iso, common.Identifier(id=l.iso,
                                          name=l.iso,
                                          type='iso639-3'))
                for l in languoids.values() if l.iso]
    isocodes = OrderedDict(sorted(isocodes, key=lambda t: t[0]))

    DBSession.add_all(glottocodes.values())
    DBSession.add_all(isocodes.values())
    DBSession.flush()

    for lang in DBSession.query(common.Language):
        if lang.id not in languoids:
            continue
        languoid = languoids[lang.id]
        lang.name = languoid.name
        lang.latitude = languoid.latitude
        lang.longitude = languoid.longitude
        lang.macroarea = languoid.macroareas[
            0].name if languoid.macroareas else ''

        DBSession.add(
            common.LanguageIdentifier(
                language=lang, identifier_pk=glottocodes[languoid.id].pk))

        if languoid.iso in isocodes:
            DBSession.add(
                common.LanguageIdentifier(
                    language=lang, identifier_pk=isocodes[languoid.iso].pk))

    DBSession.flush()
    print('... done')

    print('Making pretty colourful dots for parameter values...')
    all_icons = [icon.name for icon in ORDERED_ICONS]

    code_query = DBSession.query(common.DomainElement)\
        .order_by(common.DomainElement.parameter_pk, common.DomainElement.id)
    for _, param_codes in groupby(code_query, lambda c: c.parameter_pk):
        icons = cycle(all_icons)
        for code in param_codes:
            code.update_jsondata(icon=next(icons))

    DBSession.flush()
    print('... done')