Example #1
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    for vs in DBSession.query(common.ValueSet).options(
            joinedload(common.ValueSet.values)):
        d = []
        for generic_term, words in groupby(
            sorted(vs.values, key=lambda v: v.description), key=lambda v: v.description
        ):
            if generic_term:
                generic_term += ': '
            else:
                generic_term = ''
            d.append(generic_term + ', '.join(nfilter([w.name for w in words])))

        vs.description = '; '.join(d)

    for model in [models.Country, models.Ecoregion]:
        for instance in DBSession.query(model).options(
                joinedload(getattr(model, 'taxa'))
        ):
            if not instance.taxa:
                instance.active = False
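# All of these snippets share nfilter (presumably clldutils.misc.nfilter). A
# minimal sketch of its assumed behaviour -- drop falsy items, return a list --
# which is why empty word names vanish from the joined description above:
def nfilter_sketch(seq):
    return [e for e in seq if e]

assert nfilter_sketch(['a', '', None, 'b']) == ['a', 'b']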
Example #2
def add_sources(args, data):
    bib = Database.from_file(args.data_file('phoible-references.bib'),
                             lowercase=True)
    ext = [
        Record.from_string('@' + s, lowercase=True)
        for s in nfilter(BIB.split('@'))
    ]

    for rec in chain(ext, bib):
        if rec.id not in data['Source']:
            data.add(Source, rec.id, _obj=bibtex2source(rec))

    #
    # Add aliases so that records whose BibTeX keys carry a numeric prefix
    # (e.g. '123_key') can also be looked up by the bare key.
    #
    for key in list(data['Source'].keys()):
        if '_' in key:
            no, rem = key.split('_', 1)
            try:
                int(no)
                if rem not in data['Source']:
                    data['Source'][rem] = data['Source'][key]
            except (ValueError, TypeError):
                pass
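# The alias loop above maps a key like '123_smith2001' to 'smith2001'. The same
# key-splitting logic in isolation (hypothetical helper, not part of the source):
def strip_numeric_prefix(key):
    no, _, rem = key.partition('_')
    return rem if no.isdigit() else None

assert strip_numeric_prefix('123_smith2001') == 'smith2001'
assert strip_numeric_prefix('smith_2001') is None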
Example #3
def _get(d, marker):
    _l = set(nfilter(d.get(marker, [])))
    if _l:
        _l = list(_l)
        if marker not in ['oo', 'or']:
            assert len(_l) == 1
            _l = _l[0]
        return _l
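# Assumed behaviour of _get, given the assertion above: markers other than 'oo'
# and 'or' carry at most one distinct non-empty value, which is unwrapped.
#   _get({'me': ['gloss', 'gloss', '']}, 'me')  -> 'gloss'
#   _get({'or': ['alt1', 'alt2']}, 'or')        -> ['alt1', 'alt2'] (set order not guaranteed)
#   _get({}, 'cm')                              -> None (empty set, falls through)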
Example #4
def load_ecoregions(data_file, data):
    ecoregions = jsonload(data_file('ecoregions.json'))['features']

    biome_map = {
        1: ('Tropical & Subtropical Moist Broadleaf Forests', '008001'),
        2: ('Tropical & Subtropical Dry Broadleaf Forests', '557715'),
        3: ('Tropical & Subtropical Coniferous Forests', ''),
        4: ('Temperate Broadleaf & Mixed Forests', ''),
        5: ('Temperate Conifer Forests', ''),
        6: ('Boreal Forests/Taiga', ''),
        7: ('Tropical & Subtropical Grasslands, Savannas & Shrublands', '98ff66'),
        8: ('Temperate Grasslands, Savannas & Shrublands', ''),
        9: ('Flooded Grasslands & Savannas', '0265fe'),
        10: ('Montane Grasslands & Shrublands', 'cdffcc'),
        11: ('Tundra', ''),
        12: ('Mediterranean Forests, Woodlands & Scrub', 'cc9900'),
        13: ('Deserts & Xeric Shrublands', 'feff99'),
        14: ('Mangroves', '870083'),
    }

    for eco_code, features in groupby(
            sorted(ecoregions, key=lambda e: e['properties']['eco_code']),
            key=lambda e: e['properties']['eco_code']):
        features = list(features)
        props = features[0]['properties']
        if int(props['BIOME']) not in biome_map:
            continue
        biome = data['Biome'].get(props['BIOME'])
        if not biome:
            name, color = biome_map[int(props['BIOME'])]
            biome = data.add(
                Biome, props['BIOME'],
                id=str(int(props['BIOME'])),
                name=name,
                description=color or 'ffffff')
        centroid = (None, None)
        f = sorted(features, key=lambda _f: _f['properties']['AREA'])[-1]
        if f['geometry']:
            coords = f['geometry']['coordinates'][0]
            if f['geometry']['type'] == 'MultiPolygon':
                coords = coords[0]
            centroid = get_center(coords)

        polygons = nfilter([_f['geometry'] for _f in features])
        data.add(
            Ecoregion, eco_code,
            id=eco_code,
            name=props['ECO_NAME'],
            description=props['G200_REGIO'],
            latitude=centroid[1],
            longitude=centroid[0],
            biome=biome,
            area=props['area_km2'],
            gbl_stat=Ecoregion.gbl_stat_map[int(props['GBL_STAT'])],
            realm=Ecoregion.realm_map[props['REALM']],
            jsondata=dict(polygons=polygons))
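# get_center is not shown here. A naive stand-in that averages a ring's
# vertices (an assumption -- the real helper may compute a true polygon centroid):
def get_center_sketch(coords):
    lons = [p[0] for p in coords]
    lats = [p[1] for p in coords]
    return sum(lons) / len(lons), sum(lats) / len(lats)

assert get_center_sketch([(0, 0), (2, 0), (2, 2), (0, 2)]) == (1.0, 1.0)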
Example #5
def __init__(self, req, *args, **kw):
    Parameters.__init__(self, req, *args, **kw)
    if kw.get('languages'):
        self.languages = kw['languages']
    elif 'languages' in req.params:
        self.languages = nfilter([
            Language.get(id_, default=None)
            for id_ in req.params['languages'].split(',')])
    else:
        self.languages = []
    self._langs = [
        aliased(ValueSet, name='l%s' % i) for i in range(len(self.languages))]
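# The elif branch resolves a request parameter like ?languages=abc,xyz; ids that
# Language.get cannot resolve come back as None, and nfilter drops them. The
# bare pattern with a hypothetical lookup table:
LOOKUP = {'abc': 'Language abc'}

def resolve(param):
    return [lg for lg in (LOOKUP.get(i) for i in param.split(',')) if lg]

assert resolve('abc,xyz') == ['Language abc']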
Example #6
    def add_counterpart(d, vs, id,
                        phonetic,  # forms
                        cognate,  # oo
                        me, cm, so, org):
        assert phonetic or cognate

        if not cognate:
            if vs.language.proto:
                cognate = phonetic
                phonetic = None
            else:
                cognate = '[%s]' % phonetic
        m = models.Counterpart(
            id=id,
            name=cognate,
            phonetic=phonetic,
            description=me or '[%s]' % vs.parameter.name,
            comment=cm,
            original_entry=org,
            other_reconstructions='; '.join(_get(d, 'or') or []) if vs.language.id == 'psi' else None,
            valueset=vs)
        if so:
            for sid in nfilter([s.strip() for s in SEP_PATTERN.split(so or '')]):
                match = SID_PATTERN.match(sid)
                if not match:
                    continue

                name = sid
                sid = normalize_sid(match.group('key'))
                source = data['Source'].get(sid)
                if not source:
                    if sid in sources:
                        s = sources[sid]
                        source = data.add(
                            common.Source, sid,
                            id=sid,
                            name=s['Name'].upper() if len(s['Name']) <= 3 else s['Name'],
                            description=s.get('Title', s['citation']),
                            author=s.get('Author'),
                            title=s.get('Title'),
                            year=s.get('Year'),
                        )
                    else:
                        source = data.add(
                            common.Source, sid,
                            id=sid,
                            name=name.upper() if len(name) <= 3 else name)
                m.references.append(models.ValueReference(
                    source=source, description=match.group('pages')))
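# SEP_PATTERN and SID_PATTERN are defined elsewhere. Hypothetical stand-ins that
# would fit their usage above (a citation separator, and a source key with
# optional page numbers):
import re

SEP_PATTERN_SKETCH = re.compile(r';\s*')
SID_PATTERN_SKETCH = re.compile(r"(?P<key>[A-Za-z0-9_'-]+)(?::\s*(?P<pages>[\d,\s-]+))?")

m = SID_PATTERN_SKETCH.match('Smith2001: 12-15')
assert m.group('key') == 'Smith2001' and m.group('pages') == '12-15'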
Example #8
def parsed_words(words, fp, lang):
    for s, t in [
        ("nǁá(q)'ám", "nǁáq'ám, nǁá'ám"),
        ("ǀga̋é.b/(s)", "ǀga̋é.b, ǀga̋é.s"),
        ("ǀkhóò.b/(s)", "ǀkhóò.b, ǀkhóò.s"),
        ("nǂúq(y)è", "nǂúqyè, nǂúqè"),
        ("ǀ'hùī (n̏ǀ'hùīn)", "ǀ'hùī, n̏ǀ'hùīn"),
        ("sùr(ù)tsi̋ǃgùűbȅ.s", "sùr(ù)tsi̋ǃgùűbȅ.s, sùrùtsi̋ǃgùűbȅ.s"),
        ("dàqhńn(tê)", "dàqhńn, dàqhńntê"),
        ("ǀàālè (ǀàlé)", "ǀàālè, ǀàlé"),
        ("ǁúq(l)è", "ǁúqlè, ǁúqè"),
        ("nǃhȁè (nǃȁhè)", "nǃhȁè, nǃȁhè"),
        ("(ǀxòo) tsàhnà", "ǀxòo tsàhnà, tsàhnà"),
        ("(kú-)ǃáná", "kúǃáná, ǃáná"),
        ("ǀgài̋o.b(/s)", "ǀgài̋o.b, ǀgài̋o.s"),
        ("dz(h)òhè", "dzhòhè, dzòhè, dzhòè (?)"),
        ("ǀqx'á(y)è", "ǀqx'áyè, ǀqx'áè"),
    ]:
        words = words.replace(s, t)
    words = list(split_words(words))
    return nfilter(chain(*[parsed_word(words, i, fp, lang) for i in range(len(words))]))
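# The replacement table above rewrites ambiguous orthographic variants like
# "x(y)z" into explicit comma-separated alternatives before splitting. The bare
# replace-then-split pattern with hypothetical data:
REPLACEMENTS = [("a(b)c", "abc, ac")]

def expand(words):
    for s, t in REPLACEMENTS:
        words = words.replace(s, t)
    return [w.strip() for w in words.split(',')]

assert expand("a(b)c") == ['abc', 'ac']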
Example #9
def normalized_branch(line):
    """Parse a line specifying a language family as a comma-separated list of
    ancestors.
    """
    name_map = {
        'Unattested',  # keep top-level family as subfamily
        'Unclassifiable',  # keep top-level family as subfamily
        'Pidgin',  # keep top-level family as subfamily
        'Mixed Language',  # keep top-level family as subfamily
        'Artificial Language',  # keep top-level family as subfamily
        'Speech Register',  # keep top-level family as subfamily
        # FIXME: also 'Sign Language'?
        'Spurious',  # bookkeeping 'Preliminary'
    }
    branch = [
        unescape(n.strip().replace('_', ' ')) for n in line.split(',')
    ]
    if branch[0] not in name_map:
        return branch, 'established'
    family = branch.pop(0)
    subfamily = None
    retired = False
    if branch:
        # there's a second level!
        if family == 'Spurious':
            if branch[0] == 'Retired':
                retired = True
                branch.pop(0)
        else:
            subfamily = '%s (%s)' % (branch.pop(0), family)
    status = 'established'
    if family in ['Spurious', 'Unattested']:
        status = family.lower()
        if retired:
            status += ' retired'
    if family == 'Spurious':
        family = BOOKKEEPING
    return nfilter([family, subfamily]), status
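# Expected behaviour of normalized_branch on representative inputs (assuming
# BOOKKEEPING names the pseudo-family this codebase, apparently Glottolog-related,
# uses for bookkeeping languages):
#   'Austronesian, Malayo-Polynesian' -> (['Austronesian', 'Malayo-Polynesian'], 'established')
#   'Pidgin, English-based'           -> (['Pidgin', 'English-based (Pidgin)'], 'established')
#   'Spurious, Retired'               -> ([BOOKKEEPING], 'spurious retired')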
Example #11
def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    data = Data()

    def read(table):
        return list(dsv.reader(
            args.data_file(table + '.csv'), delimiter=',', namedtuples=True))

    dataset = common.Dataset(
        id=ids.__name__,
        name="IDS",
        description="The Intercontinental Dictionary Series",
        #published=date(2009, 8, 15),
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license='http://creativecommons.org/licenses/by-nc-nd/2.0/de/deed.en',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'http://i.creativecommons.org/l/by-nc-nd/2.0/de/88x31.png',
            'license_name':
                'Creative Commons Attribution-NonCommercial-NoDerivs 2.0 Germany License',
        },
        domain='ids.clld.org')

    DBSession.add(dataset)
    data_desc = defaultdict(dict)
    for l in read('x_lg_data'):
        data_desc[l.lg_id][l.map_ids_data] = l.header

    # read languages from the 'lang' table; status == '1' marks excluded languages
    exclude = []
    for l in read('lang'):
        if l.status == '1':
            exclude.append(l.lg_id)
            continue
        lang = data.add(models.IdsLanguage, l.lg_id, id=l.lg_id, name=l.lg_name)
        data.add(
            models.Dictionary, l.lg_id,
            id=l.lg_id, name=l.lg_name,
            language=lang,
            default_representation=data_desc[l.lg_id].get('1'),
            alt_representation=data_desc[l.lg_id].get('2'),
            jsondata=dict(status=l.status, date=l.date))

    iso_codes = {l.id: l.sil_code for l in read('sil_lang')}
    languages = {l.lg_id: iso_codes[l.sil_id]
                 for l in read('x_lg_sil') if l.lg_id not in exclude}
    load_families(Data(), [(v, data['IdsLanguage'][k]) for k, v in languages.items()])

    contributors = defaultdict(list)
    sources = defaultdict(list)
    for l in read('lang_compilers'):
        if l.lg_id in exclude:
            continue
        if l.name == "BIBIKO":
            continue
        # columns: name, lg_id, what_did_id
        if int(l.what_did_id) in models.ROLES:
            contributors[slug(l.name)].append((l.name, int(l.what_did_id), l.lg_id))
        else:
            if int(l.what_did_id) not in [4, 395]:
                print(l.what_did_id)
                raise ValueError
            sources[l.name].append(l.lg_id)

    for s, roles in contributors.items():
        name = roles[0][0]
        c = data.add(common.Contributor, s, id=s, name=name)
        if name == 'Mary Ritchie Key':
            c.address = 'University of California, Irvine'
        for lg, specs in groupby(sorted(roles, key=lambda r: r[2]), key=lambda r: r[2]):
            sroles = sorted(
                [s[1] for s in specs],
                reverse=True,
                key=lambda what: what + 2 if what == 2 else what)
            what = sroles[0]
            DBSession.add(common.ContributionContributor(
                contribution=data['Dictionary'][lg],
                contributor=c,
                ord=what,
                primary=what == 2))

    data.add(
        common.Contributor, 'bernardcomrie',
        id='bernardcomrie',
        name="Bernard Comrie",
        address="Max Planck Institute for Evolutionary Anthropology, Leipzig")

    for i, editor in enumerate(['maryritchiekey', 'bernardcomrie']):
        common.Editor(dataset=dataset, contributor=data['Contributor'][editor], ord=i + 1)

    for i, name in enumerate(sorted(sources.keys())):
        c = data.add(common.Source, name, id=str(i + 1), name=name, description=name)

    DBSession.flush()
    for name, lgs in sources.items():
        for lg in lgs:
            if lg in exclude:
                continue
            try:
                DBSession.add(common.LanguageSource(
                    language_pk=data['IdsLanguage'][lg].pk,
                    source_pk=data['Source'][name].pk))
            except KeyError:
                print(name, lgs)
                continue

    altnames = {}
    for i, l in enumerate(read('alt_names')):
        if l.name in altnames:
            identifier = altnames[l.name]
        else:
            identifier = data.add(
                common.Identifier, l.name,
                id='name-%s' % i, type='name', name=l.name, description='IDS')
            altnames[l.name] = identifier
        if l.lg_id not in exclude and l.name != data['IdsLanguage'][l.lg_id].name:
            DBSession.add(common.LanguageIdentifier(
                identifier=identifier,
                language=data['IdsLanguage'][l.lg_id]))

    # read parameters from the 'chapter' and 'entry' tables
    for l in read('chapter'):
        data.add(models.Chapter, l.chap_id, id=l.chap_id, name=l.chap_title)

    entries = {}
    for l in read('entry'):
        id_ = '%s-%s' % (l.chap_id, l.entry_id)
        name = l.trans_english
        if name in entries:
            entries[name] += 1
            name = name + ' (%s)' % entries[name]
        else:
            entries[name] = 1
        kw = {'id': id_, 'name': name, 'chapter': data['Chapter'][l.chap_id]}
        for ll in 'french russian spanish portugese'.split():  # sic: matches the 'trans_*' column names in the source data
            kw[ll] = getattr(l, 'trans_' + ll)
        data.add(models.Entry, id_, sub_code=l.entry_id, **kw)

    misaligned = []

    DBSession.flush()
    for entity in 'IdsLanguage Entry Chapter Dictionary'.split():
        for k in list(data[entity].keys()):
            data[entity][k] = data[entity][k].pk

    synsets = set()

    for lg_id, entries in groupby(
            sorted(read('ids'), key=lambda t: t.lg_id), lambda k: k.lg_id):
        if lg_id in exclude or not lg_id:
            continue

        # keep the memory footprint reasonable
        transaction.commit()
        transaction.begin()

        try:
            language = common.Language.get(data['IdsLanguage'][lg_id])
        except KeyError:
            print(list(entries))
            raise
        desc = data_desc.get(lg_id, {})
        words = defaultdict(list)
        for l in entries:
            if empty.match(l.data_1):
                continue

            entry_id = '%s-%s' % (l.chap_id, l.entry_id)
            if entry_id not in data['Entry']:
                data.add(
                    models.Entry, entry_id,
                    id=entry_id,
                    name=entry_id,
                    #active=False,
                    sub_code=l.entry_id,
                    chapter_pk=data['Chapter'][l.chap_id])
                DBSession.flush()
                data['Entry'][entry_id] = data['Entry'][entry_id].pk

            id_ = '%s-%s' % (entry_id, l.lg_id)
            if id_ in synsets:
                vs = models.Synset.get(id_)
            else:
                vs = models.Synset(
                    id=id_,
                    comment=get_string(l.comment or ''),
                    alt_representation=get_string(l.data_2),
                    language=language,
                    contribution_pk=data['Dictionary'][l.lg_id],
                    parameter_pk=data['Entry'][entry_id])
                synsets.add(id_)

            trans1 = list(split_counterparts(l.data_1))
            trans2 = None if empty.match(l.data_2) else list(split_counterparts(l.data_2))

            if trans2:
                if len(trans2) != len(trans1):
                    if language.id != '238':
                        misaligned.append((l.chap_id, l.entry_id, l.lg_id))
                        #print('===', language.id, language.name)
                        #print(l.data_1)
                        #print(l.data_2)
                    #assert language.id == '238'  # Rapa Nui has problems!
                    trans2 = None

            for i, word in enumerate(trans1):
                v = models.Counterpart(
                    id=id_ + '-' + str(i + 1 + len(vs.values)),
                    name=word,
                    description=desc.get('1'),
                    valueset=vs)
                words[word].append((v, trans2[i] if trans2 else None))

        for i, form in enumerate(words.keys()):
            # Since we identify words based on their string representation, we have to
            # make sure a word has the same alternative transcription for all meanings.
            if language.id == '238':
                alt_names = []
            else:
                alt_names = set(norm(w[1] or '', desc.get('2'), language.id)
                                for w in words[form])
            alt_names = nfilter(alt_names)
            try:
                assert len(alt_names) <= 1
            except AssertionError:
                print('---', language.id, language.name)
                print(alt_names)
            word = models.Word(
                id='%s-%s' % (language.id, i + 1),
                name=form,
                description=desc.get('1'),
                language=language,
                alt_name=', '.join(alt_names) if alt_names else None,
                alt_description=desc.get('2')
            )
            for v, _ in words[form]:
                word.counterparts.append(v)
            DBSession.add(word)

        DBSession.flush()

    with dsv.UnicodeWriter(args.data_file('misaligned.csv')) as fp:
        fp.writerows(misaligned)
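# The entry loop above disambiguates duplicate English glosses by appending a
# running count. The same pattern in isolation:
def disambiguate(names):
    seen, out = {}, []
    for name in names:
        if name in seen:
            seen[name] += 1
            out.append('%s (%s)' % (name, seen[name]))
        else:
            seen[name] = 1
            out.append(name)
    return out

assert disambiguate(['dog', 'dog', 'cat']) == ['dog', 'dog (2)', 'cat']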
Example #12
def normalize_bib(name):
    return nfilter([slug(n.strip()) for n in name.split(' and ')])
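# A sketch with a stand-in slug (clldutils' slug is assumed to lowercase and
# keep only [a-z0-9]):
import re

def slug_sketch(s):
    return re.sub('[^a-z0-9]', '', s.lower())

def normalize_bib_sketch(name):
    return [slug_sketch(n.strip()) for n in name.split(' and ') if slug_sketch(n.strip())]

assert normalize_bib_sketch('Smith, J. and Doe, A.') == ['smithj', 'doea']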
Example #13
def normalize_comma_separated(s, d, lower=False):
    if not s:
        return
    chunks = nfilter([_s.strip() for _s in s.split(',')])
    return ', '.join(
        d.get(_s.lower(), _s.lower() if lower else _s) for _s in chunks)
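# Hypothetical usage: chunks found in d are mapped through it, the rest is
# passed along (optionally lowercased).
#   normalize_comma_separated('Foo, Bar', {'foo': 'FOO!'}, lower=True) -> 'FOO!, bar'
#   normalize_comma_separated('', {})                                  -> None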
Example #14
    def col_defs(self):
        kw = {}
        if self.language:
            kw['bSearchable'] = False
            kw['bSortable'] = False
        name_col = ValueNameCol(self, 'value', **kw)
        if self.parameter and self.parameter.domain:
            name_col.choices = [de.name for de in self.parameter.domain]

        class ValueLanguageCol(LinkCol):
            def search(self, qs):
                if self.dt.language:
                    return ValueSet.language_pk == int(qs)
                if self.dt.parameter:
                    return icontains(self.dt.vs_lang.name, qs)

            def order(self):
                if self.dt.parameter:
                    return cast(self.dt.vs_lang.id, Integer)
                if self.dt.language:
                    return ValueSet.language_pk

        lang_col = ValueLanguageCol(
            self,
            'language',
            model_col=Language.name,
            get_obj=lambda item: item.valueset.language,
            bSearchable=bool(self.parameter or self.language),
            bSortable=bool(self.parameter or self.language))
        if self.language:
            if self.language.lects:
                lang_col.choices = [
                    (l.pk, l.name) for l in [self.language] + self.language.lects]
                lang_col.js_args['sTitle'] = 'lect'
            else:
                lang_col = None

        get_param = lambda i: i.valueset.parameter
        if self.parameter:
            return nfilter([
                lang_col,
                name_col,
                FrequencyCol(self, '%') if self.parameter.multivalued else None,
                Col(self,
                    'lexifier',
                    format=lambda i: i.valueset.language.lexifier,
                    model_col=self.vs_lect.lexifier,
                    choices=get_distinct_values(
                        Lect.lexifier,
                        key=lambda v: 'z' + v if v == 'Other' else v)),
                LinkToMapCol(
                    self, 'm', get_object=lambda i: None
                    if i.valueset.language.language_pk else i.valueset.language),
                DetailsRowLinkCol(self, 'more')
                if self.parameter.feature_type != 'sociolinguistic' else None,
                RefsCol(self, 'source')
                if self.parameter.feature_type != 'segment' else None,
            ])
        if self.language:
            return nfilter([
                IntegerIdCol(self, 'id', get_obj=get_param, model_col=Parameter.id),
                LinkCol(self, 'parameter', get_obj=get_param, model_col=Parameter.name),
                name_col,
                FrequencyCol(self, '%'),
                lang_col,
                DetailsRowLinkCol(self, 'more'),
                RefsCol(self, 'source'),
            ])
        return [
            LinkCol(self, 'parameter', get_obj=get_param, model_col=Parameter.name),
            name_col,
            FrequencyCol(self, '%'),
            lang_col,
            DetailsRowLinkCol(self, 'more'),
            RefsCol(self, 'source'),
        ]
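# Here nfilter is what makes the conditional column lists work: columns that do
# not apply to the current view are written as None and filtered out, e.g.
#   nfilter(['id', None, 'name', None]) -> ['id', 'name']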