Example #1
def update_providers(args):
    if not args.data_file(args.version, 'provider.txt').exists():
        return

    with open(args.data_file(args.version, 'provider.txt'), 'rb') as fp:
        content = fp.read().decode('latin1')

    if '\r\n' in content:
        content = content.replace('\r\n', '\n')

    provider_map = get_map(Provider)
    for block in content.split('\n\n\n\n'):
        lines = block.split('\n')
        id_, abbr = lines[0].strip().split(':')
        id_ = id_.split('.')[0]
        description = unescape('\n'.join(lines[1:]))
        name = description.split('.')[0]

        if id_ == 'hedvig-tirailleur':
            id_ = u'skirgard'

        if slug(id_) not in provider_map:
            args.log.info('adding provider %s' % slug(id_))
            DBSession.add(
                Provider(id=slug(id_), name=name, description=description, abbr=abbr))
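For orientation, a hedged sketch of the record shape the parsing above implies; the block content ('wals.bib:WALS' and the description text) is hypothetical and unescape() is left out:

block = "wals.bib:WALS\nThe World Atlas of Language Structures. Online database."
lines = block.split('\n')
id_, abbr = lines[0].strip().split(':')  # 'wals.bib', 'WALS'
id_ = id_.split('.')[0]                  # 'wals'
description = '\n'.join(lines[1:])
name = description.split('.')[0]         # 'The World Atlas of Language Structures'
print('%s | %s | %s' % (id_, abbr, name))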
Example #2
def update_providers(args, verbose=False):
    filepath = args.data_dir.joinpath('references', 'bibtex', 'BIBFILES.ini')
    p = RawConfigParser()
    with io.open(filepath, encoding='utf-8-sig') as fp:
        p.readfp(fp)

    provider_map = get_map(Provider)
    for section in p.sections():
        sectname = section[:-4] if section.endswith('.bib') else section
        id_ = slug(sectname)
        attrs = {
            'name': p.get(section, 'title'),
            'description': p.get(section, 'description'),
            'abbr': p.get(section, 'abbr'),
        }
        if id_ in provider_map:
            provider = provider_map[id_]
            for a in list(attrs):
                before, after = getattr(provider, a), attrs[a]
                if before == after:
                    del attrs[a]
                else:
                    setattr(provider, a, after)
                    attrs[a] = (before, after)
            if attrs:
                args.log.info('updating provider %s %s' % (slug(id_), sorted(attrs)))
            if verbose:
                for a, (before, after) in attrs.items():
                    before, after = (' '.join(_.split()) for _ in (before, after))
                    if before != after:
                        args.log.info('%s\n%r\n%r' % (a, before, after))
        else:
            args.log.info('adding provider %s' % slug(id_))
            DBSession.add(Provider(id=id_, **attrs))
Example #3
def main(args, reload=False):
    species = {}
    db = args.data_file('theplantlist', 'db.json')
    if reload:
        for a in bs(get('/1.1/browse/-/')).find('ul', id='nametree').find_all('a'):
            with iopen(args.data_file('theplantlist', a.text + '.csv'), 'w', encoding='utf8') as fp:
                fp.write(get(a['href'] + a.text + '.csv'))

    if db.exists():
        with open(db) as fp:
            species = json.load(fp)
    else:
        for p in args.data_file('theplantlist').files('*.csv'):
            for row in reader(p, namedtuples=True, delimiter=','):
                if row.Taxonomic_status_in_TPL == 'Accepted':
                    id_ = slug(row.Genus + row.Species)
                    species[id_] = row.ID
        with open(db, 'w') as fp:
            json.dump(species, fp)

    with transaction.manager:
        found = 0
        for p in DBSession.query(Parameter):
            id_ = slug(p.name)
            if id_ in species:
                found += 1
                p.tpl_id = species[id_]

    print(found)
Example #4
def reflink(name, mm, bib):
    name = name.replace('&', 'and')
    name = slug(name)
    name = authors_map.get(name, name)
    if name == 'puscariu' and mm.group('year') == '1943':
        name = 'puscariuandkuen'
    if name == 'mohling' and mm.group('year') == '1986':
        name = 'mohlig'
    if name == 'baht' and mm.group('year') == '1987':
        name = 'bhat'
    if (name, mm.group('year') + (mm.group('letter') or '')) not in bib:
        if (name, mm.group('year')) not in bib:
            print('###', (name, mm.group('year') + (mm.group('letter') or '')))
            return '%s%s%s' % (
                mm.group('year'),
                mm.group('letter') or '',
                ': ' + mm.group('pages') if mm.group('pages') else '')
        else:
            recid = bib[(name, mm.group('year'))]
    else:
        recid = bib[(name, mm.group('year') + (mm.group('letter') or ''))]
    global LINKED
    LINKED[recid] = 1
    return '<a href="__%s__">%s%s</a>%s' % (
        slug(recid),
        mm.group('year'),
        mm.group('letter') or '',
        ': ' + mm.group('pages') if mm.group('pages') else '')
Example #5
 def get_ref(self, e, category=None):
     for f in e.find_all('font'):
         f.unwrap()
     t = text(e)
     ref = self.refs.get(slug(t))
     if ref:
         return dict(
             key=ref.name,
             id=slug(t),
             text='%s. %s.' % (ref.name, ref.description),
             html=u'<a href="/sources/{0.id}">{0.name}</a>. {0.description}.'.format(ref),
             category=category)
     match = YEAR.search(t)
     if match:
         authors = t[:match.start()].split('(')[0].strip()
         authors = [HumanName(n.strip()).last for n in authors.split('&')]
         key = '%s %s' % (' & '.join(authors), match.group('year').strip())
     else:
         key = None
     return dict(
         key=key,
         id=slug(key) if key else unicode(md5(t.encode('utf8')).hexdigest()),
         text=t,
         html=unicode(e),
         category=category)
Example #6
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=cognition.__name__,
        name="COSTATOL",
        description="Cognitive Structures across the Tree of Life",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='cognition.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    DBSession.add(dataset)

    #
    # TODO: add editors!
    #

    for rec in Database.from_file(args.data_file('sources.bib')):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    contrib = common.Contribution(id='costatol', name='COSTATOL')
    for datapoint in reader(args.data_file('values.csv'), delimiter=',', dicts=True):
        param = data['Parameter'].get(datapoint['cognitive capacity'])
        if not param:
            name = datapoint['cognitive capacity']
            param = data.add(common.Parameter, name, id=slug(name), name=name)

        species = data['Language'].get(datapoint['species'])
        if not species:
            name = datapoint['species']
            species = data.add(common.Language, name, id=slug(name), name=name)

        vid = '%s-%s' % (species.id, param.id)
        vs = data.add(
            common.ValueSet,
            vid,
            id=vid,
            language=species,
            parameter=param,
            contribution=contrib)
        data.add(common.Value, vid, id=vid, name=datapoint['value'], valueset=vs)
        match = source_pattern.match(datapoint['source'])
        if match:
            DBSession.add(common.ValueSetReference(
                valueset=vs,
                source=data['Source'][match.group('key')],
                description=match.group('pages')))

    for species in reader(args.data_file('species.csv'), delimiter=',', namedtuples=True):
        data['Language'][species.name].longitude = species.longitude
        data['Language'][species.name].latitude = species.latitude
Example #7
def main(args):
    glottocodes = {}
    if getuser() == "robert":
        glottocodes = glottocodes_by_isocode("postgresql://robert@/glottolog3")

    data = Data()
    dataset = common.Dataset(id=autotyp.__name__, name="AUTOTYP", description="AUTOTYP", domain="autotyp.clld.org")
    DBSession.add(dataset)

    bib = Database.from_file(args.data_file("LenaBib.bib"), lowercase=True)

    for i, spec in enumerate(
        [
            ("bickel", "Balthasar Bickel", "University of Zurich"),
            ("nichols", "Johanna Nichols", "University of California, Berkeley"),
        ]
    ):
        contributor = data.add(common.Contributor, spec[0], id=spec[0], name=spec[1])
        DBSession.add(common.Editor(dataset=dataset, ord=i + 1, contributor=contributor))

    for l in rows(
        args.data_file("backbone_09Jan2014_directexport.tab"), newline="\r", encoding="macroman", namedtuples=True
    ):
        # LID	language	ISO639.3.2013	stock	continent	area	latitude	longitude
        if l.stock not in data["Stock"]:
            stock = data.add(models.Stock, l.stock, id=slug(l.stock), name=l.stock)
        else:
            stock = data["Stock"][l.stock]

        if l.continent not in data["Continent"]:
            continent = data.add(models.Continent, l.continent, id=slug(l.continent), name=l.continent)
        else:
            continent = data["Continent"][l.continent]

        if l.area not in data["Area"]:
            area = data.add(models.Area, l.area, id=slug(l.area), name=l.area, continent=continent)
        else:
            area = data["Area"][l.area]

        lang = data.add(
            models.Languoid,
            l.LID,
            id=l.LID,
            name=l.language,
            latitude=coord(l.latitude),
            longitude=coord(l.longitude),
            stock=stock,
            area=area,
        )
        add_language_codes(data, lang, l.ISO639_3_2013, glottocodes=glottocodes)

    loader.case_alignment(args, data, bib)
    loader.inclusive_excusive(args, data, bib)
Example #8
 def __init__(self, fname):
     self.fname = fname
     self.authors = [c.id for c in DBSession.query(Contributor)]
     self.languages = {l.id: l.name for l in DBSession.query(Language)}
     self.id = self.get_id(fname)
     self.refs = {slug(s.name): s for s in DBSession.query(Source) if s.name}
     self.examples = defaultdict(list)
     for row in DBSession.query(Sentence):
         if row.description:
             self.examples[slug(row.description.split('OR:')[0])].append(
                 (row.name, row.id))
     for k in self.examples.keys():
         # map slugged sentence name -> sentence id (distinct variable names avoid
         # shadowing the outer loop variable k)
         self.examples[k] = {slug(name): id_ for name, id_ in self.examples[k]}
Example #9
def get_normalized_name(authors):
    authors = authors.lower()
    if ', and ' in authors:
        afirst, alast = authors.split(', and ')
        parts = afirst.split(',', 2)
        if len(parts) > 2:
            # Janhunen, Juha, Marja Peltomaa, Erika Sandman, and Xiawu Dongzhou
            return slug(parts[1] + parts[0] + parts[2] + alast)
        else:
            # Goswami, G. C., and Jyotiprakash Tamuli
            return slug(parts[1] + parts[0] + alast)
    else:
        # Fuchs, David R
        last, first = authors.split(',')
        return slug(first + last)
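To make the normalization concrete, a small sketch using a simplified stand-in for clld's slug (lowercase ASCII letters only; the real slug also transliterates non-ASCII characters); the author strings are taken from the comments above:

import re

def simple_slug(s):
    # rough stand-in for clld's slug(): keep lowercase ASCII letters only
    return re.sub('[^a-z]', '', s.lower())

# branch without ', and ': 'Fuchs, David R' -> last='fuchs', first=' david r'
print(simple_slug(' david r' + 'fuchs'))                          # davidrfuchs
# branch with ', and ': 'Goswami, G. C., and Jyotiprakash Tamuli'
print(simple_slug(' g. c.' + 'goswami' + 'jyotiprakash tamuli'))  # gcgoswamijyotiprakashtamuli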
Example #10
def main(args):
    data = Data()

    # fetch language data from glottolog:
    glottolog = glottocodes_by_isocode(
        'postgresql://robert@/glottolog3', ['id', 'name', 'latitude', 'longitude'])

    dataset = common.Dataset(
        id=jcld.__name__,
        name="Journal of Cross-Linguistic Databases",
        domain='jcld.clld.org')
    DBSession.add(dataset)

    contribution = data.add(common.Contribution, '1', id='1', name='fb')

    for i, row in enumerate(reader(file(args.data_file('fb_jcld.tab')), namedtuples=True, encoding='latin1')):
        if row.Feature not in data['Parameter']:
            parameter = data.add(common.Parameter, row.Feature, id='1', name=row.Feature)
        else:
            parameter = data['Parameter'][row.Feature]

        if row.Value not in data['DomainElement']:
            de = data.add(
                common.DomainElement, row.Value,
                id='%s-%s' % (parameter.id, slug(row.Value)), parameter=parameter, name=row.Value)
        else:
            de = data['DomainElement'][row.Value]

        if row.Language not in data['Language']:
            if row.Language not in glottolog:
                print '--->', row.Language
                continue
            glottocode, name, lat, lon = glottolog[row.Language]
            language = data.add(
                common.Language, row.Language,
                id=slug(row.Language), name=name, latitude=lat, longitude=lon)
        else:
            language = data['Language'][row.Language]

        id_ = str(i + 1)  #'%s-%s' % (parameter.id, language.id)
        vs = common.ValueSet(
            id=id_,
            parameter=parameter,
            language=language,
            contribution=contribution,
            description=row.Comment,
            source=row.Source)
        common.Value(valueset=vs, name=row.Value, domainelement=de)
Example #11
def get_genera(data):
    """
    Zo'e: tupiguarani
    """
    sql = """select g.id, g.name, f.name
from genus as g, family as f
where g.family_pk = f.pk"""
    walsdb = create_engine('postgresql://robert@/wals3')
    genera = {}
    for row in walsdb.execute(sql):
        genus = data.add(models.Genus,
                         row[0],
                         id=row[0],
                         name=row[1],
                         description=row[2])
        genera[row[0]] = genus
        genera[slug(row[1])] = genus

    sql = """select l.iso_codes, g.id
from walslanguage as l, genus as g
where l.genus_pk = g.pk and l.iso_codes is not null"""
    for row in walsdb.execute(sql):
        for code in row[0].split(', '):
            if code not in genera:
                genera[code] = genera[row[1]]

    for row in walsdb.execute("select key, value from config"):
        if row[0].startswith('__Genus_'):
            gid = row[0].replace('_', '').split('Genus', 1)[1]
            genera[gid] = None if row[1] == '__gone__' else genera[row[1]]
    return genera
Example #12
    def from_csv(cls, row, data=None, description=None):
        obj = cls(**{n: row[i] for i, n in enumerate(cls.__csv_head__) if '__' not in n and n != 'audio'})
        if not slug(row[1]):
            obj.active = False
        row = dict(list(zip(cls.__csv_head__, row)))
        sid = row['taxa__id']
        lid = row['languages__id']
        vsid = '%s-%s' % (sid, lid)
        if vsid in data['ValueSet']:
            obj.valueset = data['ValueSet'][vsid]
        else:
            # Note: source and references are dumped redundantly with each word, so we
            # only have to recreate these if a new ValueSet had to be created.
            obj.valueset = data.add(
                ValueSet, vsid,
                id=vsid,
                parameter=data['Taxon'][sid],
                language=data['Languoid'][lid],
                contribution=data['Contribution']['tsammalex'])

        if row['refs__ids']:
            for i, rid, pages in parse_ref_ids(row['refs__ids']):
                data.add(
                    NameReference, '%s-%s' % (obj.id, i),
                    name=obj,
                    source=data['Bibrec'][rid],
                    description=pages or None)
        for rel, cls in [
            ('categories', 'Category'),
            ('habitats', 'Category'),
            ('uses', 'Use')
        ]:
            for id_ in split_ids(row[rel + '__ids']):
                getattr(obj, rel).append(data[cls][id_.strip()])
        return obj
Example #13
 def language_lookup(self):
     if not self._language_lookup:
         self._language_lookup = {
             slug(v): k
             for (k, v) in self.languages.items()
         }
     return self._language_lookup
Example #14
def upgrade():
    conn = Connection(op.get_bind())

    # The genus for Yanesha’ needs to be renamed Yanesha’.
    conn.update(Genus, dict(name="Yanesha'"), id='westernarawakan')

    # Bahuana
    conn.update_name('bah', 'Xiriana')
    conn.update_glottocode('bah', 'xiri1243')
    conn.update_iso('bah', xir='Xiriâna')
    coords = Coordinates('2d40N', '62d30W')
    conn.update(Language,
                dict(latitude=coords.latitude, longitude=coords.longitude),
                id='bah')

    spk = conn.execute('select max(pk) from source').fetchone()[0] + 1
    lpk = conn.pk(Language, 'bah')
    spk = conn.insert(
        Source,
        pk=spk,
        id='Ramirez-1992',
        name='Ramirez 1992',
        description='Bahuana: une nouvelle langue de la famille Arawak',
        bibtex_type=EntryType.book,
        author='Ramirez, Henri',
        year='1992',
        title='Bahuana: une nouvelle langue de la famille Arawak',
        address='Paris',
        publisher='Amerindia')
    conn.insert(LanguageSource, language_pk=lpk, source_pk=spk)
    vspk = conn.pk(ValueSet, lpk, attr='language_pk')
    conn.insert(ValueSetReference,
                valueset_pk=vspk,
                source_pk=spk,
                description='35')

    # split northern arawakan
    GENERA = {
        'Alto-Orinoco': 'bnw mpr'.split(),
        'Caribbean Arawakan': 'ara grf goa'.split(),
        'Inland Northern Arawakan': 'acg bae cur ppc res tar wrk ycn'.split(),
        'Bahuanic': ['bah'],
        'Wapishanan': ['wps'],
    }
    ICONS = ['cdd0000', 'cffcc00', 'cffff00', 'cff6600', 'cffffcc']

    fpk = conn.pk(Family, 'arawakan')
    for icon, (name, lids) in zip(ICONS, GENERA.items()):
        gpk = conn.insert(Genus,
                          id=slug(name),
                          name=name,
                          icon=icon,
                          family_pk=fpk)
        for lid in lids:
            conn.update_genus(lid, gpk)

    conn.insert(Config,
                key=Config.replacement_key(Genus, 'northernarawakan'),
                value=Config.gone)
    conn.delete(Genus, id='northernarawakan')
Example #15
def get_genera(data):
    """
    Zo'e: tupiguarani
    """
    sql = """select g.id, g.name, f.name
from genus as g, family as f
where g.family_pk = f.pk"""
    walsdb = create_engine('postgresql://robert@/wals3')
    genera = {}
    for row in walsdb.execute(sql):
        genus = data.add(models.Genus, row[0], id=row[0], name=row[1], description=row[2])
        genera[row[0]] = genus
        genera[slug(row[1])] = genus

    sql = """select l.iso_codes, g.id
from walslanguage as l, genus as g
where l.genus_pk = g.pk and l.iso_codes is not null"""
    for row in walsdb.execute(sql):
        for code in row[0].split(', '):
            if code not in genera:
                genera[code] = genera[row[1]]

    for row in walsdb.execute("select key, value from config"):
        if row[0].startswith('__Genus_'):
            gid = row[0].replace('_', '').split('Genus', 1)[1]
            genera[gid] = None if row[1] == '__gone__' else genera[row[1]]
    return genera
Example #16
def issue24(session, timestamp):  # pragma: no cover
    #- Update language cea (name, coords, alternative names, iso code (and name))
    #Change name of Cree (Eastern) to Cree (Swampy)
    #Change coordinates to 56dN, 90dW
    #Change the Ethnologue name to Cree (Swampy)
    #Remove the Routledge and Other names
    #Change the ISO code to csw. glottocode to swam1239
    cea = common.Language.get('cea', session=session)
    cre = common.Language.get('cre', session=session)

    for i in reversed(range(len(cea.languageidentifier))):
        # delete in reverse order so the remaining indices stay valid
        del cea.languageidentifier[i]

    for values in [
        ('gc-csw', 'swam1239', 'Swampy Cree', 'glottolog'),
        ('csw', 'csw', 'Cree (Swampy)', 'iso639-3'),
        ('ethnologue-csw', 'Cree (Swampy)', 'ethnologue', 'name'),
    ]:
        id = common.Identifier(
            **dict(zip('id name description type'.split(), values)))
        cea.languageidentifier.append(
            common.LanguageIdentifier(language=cea, identifier=id))

    cea.updated = timestamp
    cea.name = 'Cree (Swampy)'
    cea.ascii_name = slug('Cree (Swampy)')
    cea.latitude = 56.0
    cea.longitude = -90.0

    for pid in ['81A', '82A', '83A']:
        vsq = session.query(common.ValueSet)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == pid)
        vs1 = vsq.filter(common.ValueSet.language_pk == cea.pk).one()
        vs2 = vsq.filter(common.ValueSet.language_pk == cre.pk).one()
        vs2.updated = timestamp
        vs1.updated = timestamp

        for ref in vs1.references:
            if ref.source.id == 'Hive-1948':
                ref.valueset = vs2

    session.flush()

    #- Delete valueset 85A-cea
    vs = session.query(common.ValueSet)\
        .join(common.Parameter)\
        .filter(common.Parameter.id == '85A')\
        .filter(common.ValueSet.language_pk == cea.pk).one()

    session.delete(vs.values[0])
    session.delete(vs.references[0])
    session.delete(vs)

    #- delete valueset 131A-cea add 131A-cre
    vs_switch_lang(session, timestamp, '131A-cea', 'cre')
Example #17
def match_obsolete_refs(args):
    with open(args.data_file(args.version, 'obsolete_refs.json')) as fp:
        refs = json.load(fp)
    matched = args.data_file(args.version, 'obsolete_refs_matched.json')
    if matched.exists():
        with open(matched) as fp:
            matched = json.load(fp)
    else:
        matched = {}

    #
    # TODO: optionally re-evaluate known-unmatched refs!
    #

    count = 0
    f, m = 0, 0
    for id_ in refs:
        if id_ in matched:
            continue
        count += 1
        if count > 1000:
            print '1000 obsolete refs processed!'
            break
        ref = Ref.get(id_)
        found = False
        if ref.description and len(ref.description) > 5:
            for match in DBSession.query(Ref)\
                    .filter(not_(Source.id.in_(refs)))\
                    .filter(Source.description.contains(ref.description))\
                    .filter(or_(Source.author == ref.author, Source.year == ref.year))\
                    .limit(10):
                print '++', ref.id, '->', match.id, '++', ref.author, '->', match.author, '++', ref.year, '->', match.year
                matched[ref.id] = match.id
                found = True
                break
            if not found and ref.name and len(ref.name) > 5:
                for match in DBSession.query(Ref)\
                        .filter(not_(Source.id.in_(refs)))\
                        .filter(Source.name == ref.name)\
                        .limit(10):
                    try:
                        if match.description and ref.description and slug(match.description) == slug(ref.description):
                            print '++', ref.id, '->', match.id, '++', ref.description, '->', match.description
                            matched[ref.id] = match.id
                            found = True
                            break
                    except AssertionError:
                        continue
        if not found:
            m += 1
            print '--', ref.id, ref.name, ref.description
            matched[ref.id] = None
        else:
            f += 1
    print f, 'found'
    print m, 'missed'

    with open(args.data_file(args.version, 'obsolete_refs_matched.json'), 'w') as fp:
        json.dump(matched, fp)
Example #18
 def get_id(self, fname):
     match = self.fname_pattern.search(fname.name)
     assert match
     lid = self.language_lookup.get(slug(match.group('name')))
     if lid:
         return '%s.%s' % (lid, '%(vol)s-%(no)s' % match.groupdict())
     assert not match.group('no')
     return '%(vol)s-%(name)s' % match.groupdict()
Example #19
 def update_name(self, lid, newname, other=None):
     lpk = self.pk(Language, lid)
     self.update(Language, dict(name=newname), pk=lpk)
     self.update(
         WalsLanguage, dict(ascii_name=slug(newname, remove_whitespace=False)), pk=lpk)
     if other:
         ipk = self.insert(Identifier, name=other, description='other', type='name')
         self.insert(LanguageIdentifier, identifier_pk=ipk, language_pk=lpk)
Example #20
 def get_id(self, fname):
     match = self.fname_pattern.search(fname.name)
     assert match
     lid = self.language_lookup.get(slug(match.group('name')))
     if lid:
         return '%s.%s' % (lid, '%(vol)s-%(no)s' % match.groupdict())
     assert not match.group('no')
     return '%(vol)s-%(name)s' % match.groupdict()
Example #21
 def repl2(match):
     s = match.string[match.start():match.end()]
     id_ = slug(match.group('key').replace('&amp;', '&'))
     ref = self.refs.get(id_)
     if not ref or id_ in ids:
         return s
     return '%s<a href="/sources/%s">%s</a>%s' \
            % (match.group('before'), ref.id, match.group('key'), match.group('after'))
Example #22
def issue24(session, timestamp):
    #- Update language cea (name, coords, alternative names, iso code (and name))
    #Change name of Cree (Eastern) to Cree (Swampy)
    #Change coordinates to 56dN, 90dW
    #Change the Ethnologue name to Cree (Swampy)
    #Remove the Routledge and Other names
    #Change the ISO code to csw. glottocode to swam1239
    cea = common.Language.get('cea', session=session)
    cre = common.Language.get('cre', session=session)

    for i in reversed(range(len(cea.languageidentifier))):
        # delete in reverse order so the remaining indices stay valid
        del cea.languageidentifier[i]

    for values in [
        ('gc-csw', 'swam1239', 'Swampy Cree', 'glottolog'),
        ('csw', 'csw', 'Cree (Swampy)', 'iso639-3'),
        ('ethnologue-csw', 'Cree (Swampy)', 'ethnologue', 'name'),
    ]:
        id = common.Identifier(**dict(zip('id name description type'.split(), values)))
        cea.languageidentifier.append(common.LanguageIdentifier(language=cea, identifier=id))

    cea.updated = timestamp
    cea.name = 'Cree (Swampy)'
    cea.ascii_name = slug('Cree (Swampy)')
    cea.latitude = 56.0
    cea.longitude = -90.0

    for pid in ['81A', '82A', '83A']:
        vsq = session.query(common.ValueSet)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == pid)
        vs1 = vsq.filter(common.ValueSet.language_pk == cea.pk).one()
        vs2 = vsq.filter(common.ValueSet.language_pk == cre.pk).one()
        vs2.updated = timestamp
        vs1.updated = timestamp

        for ref in vs1.references:
            if ref.source.id == 'Hive-1948':
                ref.valueset = vs2

    session.flush()


    #- Delete valueset 85A-cea
    vs = session.query(common.ValueSet)\
        .join(common.Parameter)\
        .filter(common.Parameter.id == '85A')\
        .filter(common.ValueSet.language_pk == cea.pk).one()

    session.delete(vs.values[0])
    session.delete(vs.references[0])
    session.delete(vs)

    #- delete valueset 131A-cea add 131A-cre
    vs_switch_lang(session, timestamp, '131A-cea', 'cre')
Example #23
def main(args):
    if args.cmd == 'convert':
        outdir = args.data_file('texts', args.what).joinpath('lo')
        if args.what == 'Atlas':
            for p in args.data_file('texts', args.what).joinpath('in').files():
                if p.ext in ['.doc', '.docx']:
                    convert_chapter(p, outdir)
        elif args.what == 'Surveys':
            pass
    if args.cmd == 'parse':
        outdir = args.data_file('texts', args.what).joinpath('processed')
        for p in args.data_file('texts', args.what).joinpath('lo').files():
            if args.in_name in p.namebase:
                globals()[args.what](p)(outdir)
    if args.cmd == 'refs':
        refs = []
        for p in args.data_file(
                'texts', args.what).joinpath('processed').files('*.json'):
            if args.in_name in p.namebase:
                md = jsonload(p)
                refs.extend(md['refs'])
        db = get_bibtex(refs)
        unmatched = 0
        distinct = defaultdict(list)
        for i, rec in enumerate(db):
            if 'all' in rec:
                unmatched += 1
            distinct[(slug(rec.get('key', unicode(uuid4().hex))),
                      slug(unicode(rec.get('title',
                                           uuid4().hex)),
                           remove_whitespace=False))] = 1
        print unmatched, 'of', i, 'distinct', len(distinct)

        c = 0
        for key, refs in groupby(sorted(distinct.keys()), key=lambda t: t[0]):
            refs = list(refs)
            if len(refs) > 1:
                for t1, t2 in combinations([t[1] for t in refs], 2):
                    if fuzz.partial_ratio(t1, t2) > 80:
                        print t1
                        print t2
                        print
                        c += 1
        print c
        return
Example #24
def main(args):
    if args.cmd == 'convert':
        outdir = args.data_file('texts', args.what).joinpath('lo')
        if args.what == 'Atlas':
            for p in args.data_file('texts', args.what).joinpath('in').files():
                if p.ext in ['.doc', '.docx']:
                    convert_chapter(p, outdir)
        elif args.what == 'Surveys':
            pass
    if args.cmd == 'parse':
        outdir = args.data_file('texts', args.what).joinpath('processed')
        for p in args.data_file('texts', args.what).joinpath('lo').files():
            if args.in_name in p.namebase:
                globals()[args.what](p)(outdir)
    if args.cmd == 'refs':
        refs = []
        for p in args.data_file('texts', args.what).joinpath('processed').files('*.json'):
            if args.in_name in p.namebase:
                md = jsonload(p)
                refs.extend(md['refs'])
        db = get_bibtex(refs)
        unmatched = 0
        distinct = defaultdict(list)
        for i, rec in enumerate(db):
            if 'all' in rec:
                unmatched += 1
            distinct[(
                slug(rec.get('key', unicode(uuid4().hex))),
                slug(unicode(rec.get('title', uuid4().hex)), remove_whitespace=False)
            )] = 1
        print unmatched, 'of', i, 'distinct', len(distinct)

        c = 0
        for key, refs in groupby(sorted(distinct.keys()), key=lambda t: t[0]):
            refs = list(refs)
            if len(refs) > 1:
                for t1, t2 in combinations([t[1] for t in refs], 2):
                    if fuzz.partial_ratio(t1, t2) > 80:
                        print t1
                        print t2
                        print
                        c += 1
        print c
        return
Example #25
 def refactor(self, soup, md):
     d = BeautifulSoup('<body></body>')
     body = d.find('body')
     linked = 0
     notlinked = 0
     multiple = 0
     for p in self._chunks(soup):
         if not isinstance(p, list):
             p = [p]
         for pp in p:
             if pp.is_header:
                 continue
             elif pp.is_refs:
                 md['refs'] = [self.get_ref(line[0]) for line in pp.lines]
             else:
                 ex = None
                 if pp.is_example:
                     container = d.new_tag(
                         'blockquote',
                         **{
                             'class': 'example',
                             'style': 'font-size:100%;padding-left:1.8em;margin-left:0.3em'})
                     #body.append(Tag(name='hr'))
                 else:
                     container = body
                 for e, line, t in pp.lines:
                     body.append(e)
                     if pp.is_example:
                         if re.match('\([0-9]+\)', line):
                             e.attrs['style'] = 'text-indent:-2em'
                         equo = "’".decode('utf8')
                         if line.startswith("‘".decode('utf8')) and equo in line:
                             line = equo.join(line[1:].split(equo)[:-1]).strip()
                             examples = self.examples.get(slug(line))
                             if examples:
                                 if len(examples) > 1:
                                     #print '~~~', line
                                     multiple += 1
                                 else:
                                     ex = examples.values()[0]
                                     #print '+++'
                                     linked += 1
                             else:
                                 #print '---', line
                                 notlinked += 1
                     container.append(e)
                 if pp.is_example:
                     if ex:
                         container.attrs['id'] = 'ex-' + ex
                         container.append(new_tag(d, 'small', new_tag(
                             d, 'a', 'See example ' + ex, href='/sentences/' + ex)))
                     body.append(container)
     #print 'examples:', linked, 'linked,', notlinked, 'not linked,', multiple, 'multiple choices'
     for e in body.find_all('font'):
         e.unwrap()
     return d
Example #26
def upgrade():
    conn = Connection(op.get_bind())

    # The genus for Yanesha’ needs to be renamed Yanesha’.
    conn.update(Genus, dict(name="Yanesha'"), id='westernarawakan')

    # Bahuana
    conn.update_name('bah', 'Xiriana')
    conn.update_glottocode('bah', 'xiri1243')
    conn.update_iso('bah', xir='Xiriâna')
    coords = Coordinates('2d40N', '62d30W')
    conn.update(
        Language, dict(latitude=coords.latitude, longitude=coords.longitude), id='bah')

    spk = conn.execute('select max(pk) from source').fetchone()[0] + 1
    lpk = conn.pk(Language, 'bah')
    spk = conn.insert(
        Source,
        pk=spk,
        id='Ramirez-1992',
        name='Ramirez 1992',
        description='Bahuana: une nouvelle langue de la famille Arawak',
        bibtex_type=EntryType.book,
        author='Ramirez, Henri',
        year='1992',
        title='Bahuana: une nouvelle langue de la famille Arawak',
        address='Paris',
        publisher='Amerindia')
    conn.insert(LanguageSource, language_pk=lpk, source_pk=spk)
    vspk = conn.pk(ValueSet, lpk, attr='language_pk')
    conn.insert(ValueSetReference, valueset_pk=vspk, source_pk=spk, description='35')

    # split northern arawakan
    GENERA = {
        'Alto-Orinoco': 'bnw mpr'.split(),
        'Caribbean Arawakan': 'ara grf goa'.split(),
        'Inland Northern Arawakan': 'acg bae cur ppc res tar wrk ycn'.split(),
        'Bahuanic': ['bah'],
        'Wapishanan': ['wps'],
    }
    ICONS = ['cdd0000', 'cffcc00', 'cffff00', 'cff6600', 'cffffcc']

    fpk = conn.pk(Family, 'arawakan')
    for icon, (name, lids) in zip(ICONS, GENERA.items()):
        gpk = conn.insert(Genus, id=slug(name), name=name, icon=icon, family_pk=fpk)
        for lid in lids:
            conn.update_genus(lid, gpk)

    conn.insert(
        Config,
        key=Config.replacement_key(Genus, 'northernarawakan'),
        value=Config.gone)
    conn.delete(Genus, id='northernarawakan')
Example #27
def _get_bibtex(refs):
    for ref in refs:
        genre = 'misc'
        id = ref['id']
        attrs = dict(all=ref['text'])
        t = ref['text']
        match = YEAR.search(t)
        if match:
            authors = 'editor' if match.group('ed') else 'author'
            attrs['key'], attrs[authors] = normalized_author(t[:match.start()].strip())
            attrs['title'], rem = [s.strip() for s in re.split('\.|\?', t[match.end():], 1)]
            attrs['year'] = match.group('year')
            attrs['key'] = '%(key)s %(year)s' % attrs
            m = EDS.match(rem)
            if m:
                assert 'editor' not in attrs
                attrs['editor'] = normalized_author(m.group('eds').strip())[1]
                genre = 'incollection'
                rem = rem[m.end():].strip()
                mm = BTITLE_PAGES.match(rem)
                if mm:
                    attrs['booktitle'] = mm.group('btitle').strip()
                    attrs['pages'] = mm.group('pages').strip()
                    rem = rem[mm.end():].strip()
            else:
                mm = JOURNAL.match(rem)
                if mm:
                    genre = 'article'
                    attrs['journal'] = mm.group('journal').strip()
                    attrs['volume'] = mm.group('volume').strip()
                    if mm.group('number'):
                        attrs['number'] = mm.group('number').strip()
                    attrs['pages'] = mm.group('pages').strip()
                    rem = rem[mm.end():].strip()
            m = PUBLISHER.match(rem)
            if m:
                if genre == 'misc':
                    genre = 'book'
                attrs['place'] = m.group('place').strip()
                attrs['publisher'] = m.group('publisher').strip()
                rem = rem[m.end():].strip()
            _rem = []
            for piece in [p.strip() for p in re.split('\.(?:\s+|$)', rem) if p.strip()]:
                if piece.startswith('http') and not re.search('\s+', piece):
                    attrs['url'] = piece
                elif piece.startswith('(') and piece.endswith(')'):
                    attrs['note'] = piece[1:-1].strip()
                else:
                    _rem.append(piece)
            rem = '. '.join(_rem)
            if not slug(unicode(rem)):
                del attrs['all']
        yield Record(genre, id, **attrs)
Example #28
def upgrade():
    conn = Connection(op.get_bind())

    # https://github.com/clld/wals-data/issues/50
    fpk = conn.pk(Family, 'utoaztecan')
    gname = 'California Uto-Aztecan'
    gid = slug(gname)
    gpk = conn.insert(Genus, id=gid, name=gname, icon='fffff00', family_pk=fpk)

    for oid in ['takic', 'tubatulabal']:
        opk = conn.pk(Genus, oid)
        conn.update(WalsLanguage, dict(genus_pk=gpk), genus_pk=opk)
        conn.insert(Config, key=Config.replacement_key(Genus, oid), value=gid)
        conn.delete(Genus, id=oid)

    # https://github.com/clld/wals-data/issues/49
    conn.update_name('aym', 'Aymara (Central)')
    conn.update_glottocode('aym', 'cent2142')
    conn.update_iso('aym', 'ayr', ayc='Southern Aymara')

    # https://github.com/clld/wals-data/issues/48
    # The genus Guaymi should be renamed Guaymiic.
    conn.update(Genus, dict(name='Guaymiic'), id='guaymi')

    # The genus Aruak should be renamed Arhuacic.
    conn.update(Genus, dict(name='Arhuacic'), id='aruak')

    # The language Motilón should be renamed Barí (while keeping Motilón as the name of
    # the genus).
    conn.update_name('mti', 'Barí')

    # The genus Chibchan Proper should be split into two genera, Chibcha-Duit, containing
    # the language Muisca, and Tunebo, containing the language Tunebo.
    conn.update_genus('msc', ('chibchaduit', 'Chibcha-Duit', 'fffff00'),
                      family='chibchan')
    conn.update_genus('tnb', ('tunebo', 'Tunebo', 'fffcc00'),
                      family='chibchan')
    conn.insert(Config,
                key=Config.replacement_key(Genus, 'chibchanproper'),
                value=Config.gone)
    conn.delete(Genus, id='chibchanproper')

    # https://github.com/clld/wals-data/issues/44
    conn.update_name('jlu', 'Luwo', other='Jur Luwo')

    # https://github.com/clld/wals-data/issues/43
    conn.update_genus('ctw', ('catawban', 'Catawban', 'fffcc00'),
                      family='siouan')
    conn.update(Genus, dict(name='Core Siouan'), id='siouan')

    # https://github.com/clld/wals-data/issues/40
    conn.update_source('Sumbuk-2002', year='1999', name='Sumbuk 1999')
Example #29
 def yield_valid_authors(self, authors):
     for name in authors:
         n = HumanName(name)
         res = dict(name=name, id=slug(n.last + n.first + n.middle))
         if name == 'Margot C. van den Berg':
             res['id'] = 'vandenbergmargotc'
         if name == 'Khin Khin Aye':
             res['id'] = 'khinkhinayenone'
         if name == 'Melanie Halpap':
             res['id'] = 'revismelanie'
         if res['id'] not in self.authors:
             raise ValueError(name)
         yield res
Example #30
def upgrade():
    conn = Connection(op.get_bind())

    # https://github.com/clld/wals-data/issues/50
    fpk = conn.pk(Family, 'utoaztecan')
    gname = 'California Uto-Aztecan'
    gid = slug(gname)
    gpk = conn.insert(Genus, id=gid, name=gname, icon='fffff00', family_pk=fpk)

    for oid in ['takic', 'tubatulabal']:
        opk = conn.pk(Genus, oid)
        conn.update(WalsLanguage, dict(genus_pk=gpk), genus_pk=opk)
        conn.insert(Config, key=Config.replacement_key(Genus, oid), value=gid)
        conn.delete(Genus, id=oid)

    # https://github.com/clld/wals-data/issues/49
    conn.update_name('aym', 'Aymara (Central)')
    conn.update_glottocode('aym', 'cent2142')
    conn.update_iso('aym', 'ayr', ayc='Southern Aymara')

    # https://github.com/clld/wals-data/issues/48
    # The genus Guaymi should be renamed Guaymiic.
    conn.update(Genus, dict(name='Guaymiic'), id='guaymi')

    # The genus Aruak should be renamed Arhuacic.
    conn.update(Genus, dict(name='Arhuacic'), id='aruak')

    # The language Motilón should be renamed Barí (while keeping Motilón as the name of
    # the genus).
    conn.update_name('mti', 'Barí')

    # The genus Chibchan Proper should be split into two genera, Chibcha-Duit, containing
    # the language Muisca, and Tunebo, containing the language Tunebo.
    conn.update_genus(
        'msc', ('chibchaduit', 'Chibcha-Duit', 'fffff00'), family='chibchan')
    conn.update_genus(
        'tnb', ('tunebo', 'Tunebo', 'fffcc00'), family='chibchan')
    conn.insert(
        Config, key=Config.replacement_key(Genus, 'chibchanproper'), value=Config.gone)
    conn.delete(Genus, id='chibchanproper')

    # https://github.com/clld/wals-data/issues/44
    conn.update_name('jlu', 'Luwo', other='Jur Luwo')

    # https://github.com/clld/wals-data/issues/43
    conn.update_genus('ctw', ('catawban', 'Catawban', 'fffcc00'), family='siouan')
    conn.update(Genus, dict(name='Core Siouan'), id='siouan')

    # https://github.com/clld/wals-data/issues/40
    conn.update_source('Sumbuk-2002', year='1999', name='Sumbuk 1999')
Example #31
def update_providers(args, verbose=False):
    filepath = args.data_dir.joinpath('references', 'bibtex', 'BIBFILES.ini')
    p = RawConfigParser()
    with io.open(filepath, encoding='utf-8-sig') as fp:
        p.readfp(fp)

    provider_map = get_map(Provider)
    for section in p.sections():
        sectname = section[:-4] if section.endswith('.bib') else section
        id_ = slug(sectname)
        attrs = {
            'name': p.get(section, 'title'),
            'description': p.get(section, 'description'),
            'abbr': p.get(section, 'abbr'),
        }
        if id_ in provider_map:
            provider = provider_map[id_]
            for a in list(attrs):
                before, after = getattr(provider, a), attrs[a]
                if before == after:
                    del attrs[a]
                else:
                    setattr(provider, a, after)
                    attrs[a] = (before, after)
            if attrs:
                args.log.info('updating provider %s %s' %
                              (slug(id_), sorted(attrs)))
            if verbose:
                for a, (before, after) in attrs.items():
                    before, after = (' '.join(_.split())
                                     for _ in (before, after))
                    if before != after:
                        args.log.info('%s\n%r\n%r' % (a, before, after))
        else:
            args.log.info('adding provider %s' % slug(id_))
            DBSession.add(Provider(id=id_, **attrs))
Example #32
    def __call__(self, outdir):
        """
        runs a parser workflow consisting of
        - preprocess
        - refactor
        - postprocess
        writes the results, an html, a css and a json file to disk.
        """
        cssutils_logger = logging.getLogger('CSSUTILS')
        cssutils_logger.setLevel(logging.ERROR)
        print(self.fname.namebase.encode('utf8'))

        with open(self.fname, encoding='utf8') as fp:
            c = fp.read()
        soup = BeautifulSoup(self.preprocess(self._preprocess(c)))

        # extract css from the head section of the HTML doc:
        css = cssutils.parseString('\n')
        for style in soup.find('head').find_all('style'):
            for rule in self.cssrules(style):
                css.add(rule)

        md = dict(outline=[], refs=[], authors=[])
        soup = self.refactor(soup, md)

        # enhance section headings:
        for section, t in tag_and_text(soup.find_all('h3')):
            t = t.split('[Note')[0]
            id_ = 'section-%s' % slug(t)
            md['outline'].append((t, id_))
            section.attrs['id'] = id_
            for s, attrs in [
                (u'\u21eb', {'href': '#top', 'title': 'go to top of the page', 'style': 'vertical-align: bottom'}),
                ('¶', {'class': 'headerlink', 'href': '#' + id_, 'title': 'Permalink to this section'}),
            ]:
                append(section, soup.new_string('\n'), new_tag(soup, 'a', s, **attrs))

        body = self.insert_links(unicode(soup.find('body')), md)

        # write output files:
        with open(outdir.joinpath('%s.html' % self.id), 'w', encoding='utf8') as fp:
            fp.write(self.wrap(self.postprocess(body)))

        with open(outdir.joinpath('%s.css' % self.id), 'wb') as fp:
            fp.write(self.csstext(css))

        md['authors'] = list(self.yield_valid_authors(md['authors']))
        jsondump(md, outdir.joinpath('%s.json' % self.id), indent=4)
Example #33
def bibtex2source(rec):
    year = bibtex.unescape(rec.get('year', 'nd'))
    fields = {}
    jsondata = {}
    for field in bibtex.FIELDS:
        if field in rec:
            value = bibtex.unescape(rec[field])
            container = fields if hasattr(common.Source, field) else jsondata
            container[field] = value

    return common.Source(
        id=slug(rec.id),
        name=('%s %s' % (bibtex.unescape(
            rec.get('author', rec.get('editor', ''))), year)).strip(),
        description=bibtex.unescape(rec.get('title', rec.get('booktitle', ''))),
        jsondata=jsondata,
        bibtex_type=rec.genre,
        **fields)
Example #34
def bibtex2source(rec):
    year = bibtex.unescape(rec.get('year', 'nd'))
    fields = {}
    jsondata = {}
    for field in bibtex.FIELDS:
        if field in rec:
            value = bibtex.unescape(rec[field])
            container = fields if hasattr(common.Source, field) else jsondata
            container[field] = value

    return common.Source(
        id=slug(rec.id),
        name=('%s %s' % (bibtex.unescape(
            rec.get('author', rec.get('editor', ''))), year)).strip(),
        description=bibtex.unescape(rec.get('title', rec.get('booktitle',
                                                             ''))),
        jsondata=jsondata,
        bibtex_type=rec.genre,
        **fields)
Example #35
def glottocode(name, conn, codes=None):
    #
    # TODO: must take legacy glottocodes into account!
    #
    codes = {} if codes is None else codes
    letters = slug(name)[:4].ljust(4, 'a')
    r = conn.execute("select id from language where id like '" + letters + "%%' order by id desc limit 1").fetchone()
    if r:
        number = int(r[0][4:]) + 1
    else:
        number = 1234
    number = str(number)
    assert len(number) == 4
    res = letters + number
    i = 0
    while res in codes:
        i += 1
        res = letters + str(int(number) + i)
    codes[res] = True
    return res
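For illustration, a self-contained sketch of the same id scheme without the database lookup; simple_slug and the example names are assumptions, and the real function derives the next number from existing language ids instead of always starting at 1234:

import re

def simple_slug(s):
    # rough stand-in for clld's slug(): keep lowercase ASCII letters only
    return re.sub('[^a-z]', '', s.lower())

def next_code(name, taken):
    # four letters from the slugged name, padded with 'a', plus a four-digit counter
    letters = simple_slug(name)[:4].ljust(4, 'a')
    number = 1234
    while letters + str(number) in taken:
        number += 1
    code = letters + str(number)
    taken.add(code)
    return code

taken = set()
print(next_code('Oceanic', taken))  # ocea1234
print(next_code('Oceanic', taken))  # ocea1235
print(next_code('Zo', taken))       # zoaa1234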
Example #36
def normalize_name(s):
    """This function is called to convert ASCII strings to something that can pass as
    python attribute name, to be used with namedtuples.

    >>> assert normalize_name('class') == 'class_'
    >>> assert normalize_name('a-name') == 'a_name'
    >>> assert normalize_name('a näme') == 'a_name'
    >>> assert normalize_name('Name') == 'Name'
    >>> assert normalize_name('') == '_'
    >>> assert normalize_name('1') == '_1'
    """
    s = s.replace('-', '_').replace('.', '_').replace(' ', '_')
    if s in keyword.kwlist:
        return s + '_'
    s = '_'.join(slug(ss, lowercase=False) for ss in s.split('_'))
    if not s:
        s = '_'
    if s[0] not in ascii_letters + '_':
        s = '_' + s
    return s
Example #37
def normalize_name(s):
    """This function is called to convert ASCII strings to something that can pass as
    python attribute name, to be used with namedtuples.

    >>> assert normalize_name('class') == 'class_'
    >>> assert normalize_name('a-name') == 'a_name'
    >>> assert normalize_name('a näme') == 'a_name'
    >>> assert normalize_name('Name') == 'Name'
    >>> assert normalize_name('') == '_'
    >>> assert normalize_name('1') == '_1'
    """
    s = s.replace('-', '_').replace('.', '_').replace(' ', '_')
    if s in keyword.kwlist:
        return s + '_'
    s = '_'.join(slug(ss, lowercase=False) for ss in s.split('_'))
    if not s:
        s = '_'
    if s[0] not in ascii_letters + '_':
        s = '_' + s
    return s
Example #38
def get_refs(line):
    line = line.strip()\
        .replace('|Wikipedia', '')\
        .replace(',|', ',')\
        .replace(' (ed.)', ' ed.')\
        .replace(':,77', ':77,')
    if '(' in line and ')' not in line:
        line = line + ')'
    for piece in SEP.findall(line):
        piece = piece.strip()
        if piece.startswith('http://'):
            # TODO: handle URL
            yield piece
            continue
        if not ('(' in piece and ')' in piece):
            if 'dobes' in piece.lower():
                yield 'DOBES'
            elif piece == 'Cunningham ed.':
                yield ('cunningham', None)
            else:
                print(piece)
                raise ValueError
            continue
        assert len(piece.split('(')) == 2

        pages = None
        year_pages = piece.split('(')[1].split(')')[0]
        m = YEAR_PAGES.match(year_pages)
        if not m:
            if year_pages == '?:15':
                pages = '15'
            assert year_pages in ['?:15', '1994:']
        else:
            pages = m.group('pages')
        if ':' in piece:
            r = piece.split(':')[0]
        else:
            r = piece.split(')')[0]
        r = slug(r)
        r = KEY_MAP.get(r, r)
        yield (r, pages)
Example #39
def glottocode(name, conn, codes=None):
    letters = slug(name)[:4].ljust(4, 'a')
    active = select([cast(func.substring(Languoid.id, 5), Integer).label('number')])\
        .where(Languoid.id.startswith(letters))
    legacy = select([cast(func.substring(LegacyCode.id, 5), Integer).label('number')])\
        .where(LegacyCode.id.startswith(letters))
    if not codes:
        known = union_all(active, legacy)
    else:
        dirty = select([cast(func.substring(literal_column('dirty'), 5), Integer).label('number')])\
            .select_from(func.unnest(list(codes)).alias('dirty'))\
            .where(literal_column('dirty').startswith(letters))
        known = union_all(active, legacy, dirty)
    number = conn.execute(select([func.coalesce(func.max(literal_column('number') + 1), 1234)])\
        .select_from(known.alias())).scalar()
    number = str(number)
    assert len(number) == 4
    res = letters + number
    assert GLOTTOCODE_PATTERN.match(res)
    if codes is not None:
        codes[res] = True
    return res
Example #40
def upgrade():
    conn = Connection(op.get_bind())

    for fname, genera in DATA.items():
        fpk = conn.insert(Family, id=slug(fname), name=fname)

        for gspec, lnames in genera.items():
            if isinstance(gspec, tuple):
                if len(gspec) == 3:
                    # new genus
                    gpk = conn.insert(
                        Genus, id=gspec[0], name=gspec[1], icon=gspec[2], family_pk=fpk)
                elif len(gspec) == 2:
                    # rename genus
                    gpk = conn.pk(Genus, gspec[0])
                    conn.update(Genus, dict(family_pk=fpk, name=gspec[1]), pk=gpk)
                else:
                    raise ValueError()
            else:
                # just update the family
                gpk = conn.pk(Genus, gspec)
                conn.update(Genus, dict(family_pk=fpk), pk=gpk)

            for lname in lnames:
                lpk = conn.pk(Language, lname, attr='name')
                conn.update(WalsLanguage, dict(genus_pk=gpk), pk=lpk)

    for gid in GONE:
        conn.insert(Config, key=Config.replacement_key(Genus, gid), value=Config.gone)
        conn.delete(Genus, id=gid)

    conn.insert(
        Config, key=Config.replacement_key(Family, 'australian'), value=Config.gone)
    conn.delete(Family, id='australian')

    conn.update_name('mdl', 'Matngele')
Example #41
def create(args):
    args.log.info('starting migration ...')
    data = Data()
    db = create_engine('postgresql://robert@/glottolog2')

    with transaction.manager:
        sn = data.add(common.Contributor,
                      'sn',
                      id='sn',
                      name='Sebastian Nordhoff')
        hh = data.add(common.Contributor,
                      'hh',
                      id='hh',
                      name='Harald Hammarström')
        rf = data.add(common.Contributor,
                      'rf',
                      id='rf',
                      name='Robert Forkel',
                      url="https://github.com/xrotwang")
        mh = data.add(common.Contributor,
                      'mh',
                      id='mh',
                      name='Martin Haspelmath')
        contrib = data.add(common.Contribution,
                           'c',
                           id='classification',
                           name='Classification')
        data.add(common.ContributionContributor,
                 'hh',
                 contribution=contrib,
                 contributor=hh)
        params = dict(
            fc=data.add(common.Parameter,
                        'fc',
                        id='fc',
                        name='Family classification'),
            sc=data.add(common.Parameter,
                        'sc',
                        id='sc',
                        name='Subclassification'),
        )

        dataset = data.add(
            common.Dataset,
            'd',
            id='glottolog',
            name='Glottolog 2.0',
            description='',
            published=datetime.date(2013, 8, 15),
            domain='glottolog.org',
            contact='*****@*****.**',
            jsondata={
                'license_icon':
                'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
                'license_name':
                'Creative Commons Attribution-ShareAlike 3.0 Unported License'
            })
        for i, ed in enumerate([sn, hh, rf, mh]):
            DBSession.add(
                common.Editor(dataset=dataset, contributor=ed, ord=i + 1))

        valuesets = {}

        def create_languoid(row, father_pk=None):
            glottocode = {
                'akun1242': 'akun1241'
            }.get(row['alnumcode'], row['alnumcode'])
            attrs = dict(
                pk=row['id'],
                id=glottocode,
                name=row['primaryname'],
                description=row['globalclassificationcomment'],
                level=getattr(models2.LanguoidLevel, row['level']),
                status=getattr(models2.LanguoidStatus,
                               (row['status'] or '').replace(' ', '_'), None),
                father_pk=father_pk,
                created=row['updated'],
                jsondata={} if not row['hname'] else {'hname': row['hname']},
            )
            for attr in [
                    'active',
                    'updated',
                    'hid',
                    'latitude',
                    'longitude',
            ]:
                attrs[attr] = row[attr]
            l = data.add(models2.Languoid, row['id'], **attrs)
            for type_ in params:
                id_ = '%s%s' % (type_, row['id'])
                vs = data.add(common.ValueSet,
                              id_,
                              id=id_,
                              description=row['classificationcomment'] if type_
                              == 'fc' else row['subclassificationcomment'],
                              language=l,
                              parameter=params[type_],
                              contribution=contrib)
                data.add(common.Value,
                         id_,
                         id=id_,
                         name='%s - %s' % (row['level'], row['status']),
                         valueset=vs)
                DBSession.flush()
                valuesets[id_] = vs.pk
            return str(row['id'])

        level = 0
        parents = [
            create_languoid(row) for row in db.execute(
                'select * from languoidbase where father_id is null')
        ]
        while parents:
            args.log.info('level: %s' % level)
            level += 1
            parents = [
                create_languoid(
                    row, father_pk=data['Languoid'][row['father_id']].pk)
                for row in db.execute(
                    'select * from languoidbase where father_id in (%s)' %
                    ','.join(parents))
            ]

    def handler(offset, batch):
        svalues = []
        rvalues = []
        for row in batch:
            jsondata = json.loads(row['jsondata'] or "{}")
            jsondata['bibtexkey'] = row['bibtexkey']
            dicts = {
                's':
                dict(pk=row['id'],
                     polymorphic_type='base',
                     id=str(row['id']),
                     name='%(author)s %(year)s' % row,
                     description=row['title'],
                     bibtex_type=getattr(EntryType, row['type']),
                     jsondata=jsondata),
                'r':
                dict(pk=row['id']),
            }
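            # Distribute the remaining columns over the two insert dicts: a
            # mapping value of None keeps the original column name, a string
            # renames it (e.g. 'year' -> 'year_int').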
            for model, map_ in {
                    's': {
                        'author': None,
                        'yearstring': 'year',
                        'year': 'year_int',
                        'startpage': 'startpage_int',
                        'numberofpages': 'pages_int',
                        'pages': None,
                        'edition': None,
                        'school': None,
                        'address': None,
                        'url': None,
                        'note': None,
                        'number': None,
                        'series': None,
                        'editor': None,
                        'booktitle': None,
                        'journal': None,
                        'volume': None,
                        'publisher': None,
                    },
                    'r': {
                        'endpage': 'endpage_int',
                        'inlg': None,
                        'inlg_code': None,
                        'subject': None,
                        'subject_headings': None,
                        'keywords': None,
                        'normalizedauthorstring': None,
                        'normalizededitorstring': None,
                        'ozbib_id': None,
                    }
            }.items():
                for okey, nkey in map_.items():
                    dicts[model][nkey or okey] = row[okey]
            svalues.append(dicts['s'])
            rvalues.append(dicts['r'])
        DBSession.execute(common.Source.__table__.insert(), svalues)
        DBSession.execute(models2.Ref.__table__.insert(), rvalues)

    select(db, 'select * from refbase order by id', handler)
    DBSession.execute('COMMIT')

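    # Generic bulk copy of the remaining lookup tables: each tuple names the
    # source table, the target model, a lambda mapping an old row onto the new
    # schema, and an optional ordering of the rows.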
    for table, model, value, order in [
        ('macroarea', models2.Macroarea,
         lambda i, row: dict(pk=row['id'],
                             id=slug(row['name']),
                             name=row['name'],
                             description=row['description']), None),
        ('country', models2.Country,
         lambda i, row: dict(pk=row['id'], id=row['alpha2'], name=row['name']),
         None),
        ('provider', models2.Provider,
         lambda i, row: dict(pk=row['id'],
                             id=slug(row['name']),
                             name=row['description'],
                             description=row['comment'],
                             abbr=row['abbr'],
                             url=row['url'],
                             refurl=row['refurl'],
                             bibfield=row['bibfield']), None),
        ('doctype', models2.Doctype,
         lambda i, row: dict(pk=row['id'],
                             id=slug(row['name']),
                             abbr=row['abbr'],
                             name=row['name'],
                             description=row['description']), None),
        ('refprovider', models2.Refprovider, lambda i, row: dict(
            pk=i, provider_pk=row['provider_id'], ref_pk=row['refbase_id']),
         ('provider_id', 'refbase_id')),
        ('refdoctype', models2.Refdoctype, lambda i, row: dict(
            pk=i, doctype_pk=row['doctype_id'], ref_pk=row['refbase_id']),
         ('doctype_id', 'refbase_id')),
    ]:
        insert(db, table, model, value, order=order)

    names = dict(
        (int(d['id']), d['pk'])
        for d in insert(db,
                        'namebase',
                        common.Identifier,
                        lambda i, row: dict(pk=i,
                                            id=str(row['id']),
                                            name=row['namestring'],
                                            type='name',
                                            description=row['nameprovider'],
                                            lang=row['inlg'] if row['inlg'] and
                                            len(row['inlg']) <= 3 else 'en'),
                        order='id'))

    codes = dict(
        (int(d['id']), d['pk'])
        for d in insert(db,
                        'codebase',
                        common.Identifier,
                        lambda i, row: dict(pk=i,
                                            id=str(row['id']),
                                            name=row['codestring'],
                                            type=common.IdentifierType.iso.
                                            value if row['codeprovider'] ==
                                            'ISO' else row['codeprovider']),
                        start=len(names),
                        order='id'))

    res = insert(
        db, 'nodecodes', common.LanguageIdentifier,
        lambda i, row: dict(pk=i,
                            language_pk=row['languoidbase_id'],
                            identifier_pk=codes[row['codebase_id']]))

    insert(db,
           'nodenames',
           common.LanguageIdentifier,
           lambda i, row: dict(pk=i,
                               language_pk=row['languoidbase_id'],
                               identifier_pk=names[row['namebase_id']]),
           start=len(res))

    for table, model, value in [
        ('languoidmacroarea', models2.Languoidmacroarea,
         lambda i, row: dict(pk=i,
                             languoid_pk=row['languoidbase_id'],
                             macroarea_pk=row['macroarea_id'])),
        ('languoidcountry', models2.Languoidcountry,
         lambda i, row: dict(pk=i,
                             languoid_pk=row['languoidbase_id'],
                             country_pk=row['country_id'])),
        ('noderefs', common.LanguageSource,
         lambda i, row: dict(pk=i,
                             language_pk=row['languoidbase_id'],
                             source_pk=row['refbase_id'])),
        ('refmacroarea', models2.Refmacroarea, lambda i, row: dict(
            pk=i, macroarea_pk=row['macroarea_id'], ref_pk=row['refbase_id'])),
        ('refcountry', models2.Refcountry, lambda i, row: dict(
            pk=i, country_pk=row['country_id'], ref_pk=row['refbase_id'])),
        ('spuriousreplacements', models2.Superseded,
         lambda i, row: dict(pk=i,
                             languoid_pk=row['languoidbase_id'],
                             replacement_pk=row['replacement_id'],
                             description=row['relation'])),
        ('justification', common.ValueSetReference, lambda i, row: dict(
            pk=i,
            valueset_pk=valuesets['%s%s' % ('fc' if row[
                'type'] == 'family' else 'sc', row['languoidbase_id'])],
            source_pk=row['refbase_id'],
            description=row['pages'])),
    ]:
        insert(db, table, model, value)
Beispiel #42
0
def issue20(session, timestamp):  # pragma: no cover
    #    Datapoint http://wals.info/datapoint/121A/wals_code_bej should be changed to be
    # about Kemant (wals_code_kem). The same applies to the Rossini source for that
    # datapoint. (This is the only datapoint for this source.)
    vs_switch_lang(session, timestamp, '121A-bej', 'kem')

    #    Eastern Ojibwa (wals_code_oji) should link to two ISO codes, ojg (as it is now) but also otw.
    update_iso(session, timestamp, 'oji', otw='Ottawa')

    #    There should be two ISO codes for Campa (Axininca) (wals_code_cax): cni and cpc
    update_iso(session, timestamp, 'cax', cpc='Ajyíninka Apurucayali')

    #    All of the datapoints for Fula (Nigerian) (wals_code_fni) based on Arnott (1970)
    # need to be moved to Fula (Cameroonian) (wals_code_fua). In some cases, this involves
    # merging these datapoints with existing datapoints for wals_code_fua.
    source = common.Source.get('Arnott-1970', session=session)
    for vsr in source.valuesetreferences:
        vs = vsr.valueset
        if vs.language.id == 'fni':
            vs_switch_lang(session, timestamp, vs, 'fua')

    #    The one datapoint for Fulani (Gombe) fgo needs to be moved to Fula (Cameroonian)
    # (wals_code_fua), thus removing Fulani (Gombe) as a language.
    vs_switch_lang(session, timestamp, '27A-fgo', 'fua')

    #    Tlapanec (wals_code_tlp) should link to ISO code tcf rather than tpx.
    update_iso(session, timestamp, 'tlp', 'tpx', tcf="Malinaltepec Me'phaa")

    #    Kongo (wals_code_kon) should link to two ISO codes, kwy and kng.
    update_iso(session, timestamp, 'kon', kwy=None)

    #    One of the sources for Vili (wals_code_vif), namely Carrie (1890) turns out not
    # to be a source for Vili but another source for Kongo (wals_code_kon). This means:
    #    the page numbers given for Vili for 81A and 82A should be added to the corresponding
    #    datapoints for Kongo
    #    the value and source given for Vili for 91A should be transferred to Kongo (which
    #    currently does not have a value for that feature)
    #    all the datapoints for Vili for which Carrie was the source should be removed
    #    the values given for Vili for which Carrie was the source for the features
    #    associated with chapters 112, 143, and 144 are NOT being transferred to Kongo
    #    since they are inconsistent with the existing values for these features for Kongo
    source = common.Source.get('Carrie-1890', session=session)
    for vsr in source.valuesetreferences:
        vs = vsr.valueset
        if vs.language.id == 'vif':
            if vs.parameter.id in ['81A', '82A', '91A']:
                vs_switch_lang(session, timestamp, vs, 'kon')
            else:
                vs_delete(session, timestamp, vs)

    #    One of the sources for Chichewa (wals_code_cic), namely Mateene 1980, turns out
    #    to be a source for Nyanga (wals_code_nng). What this entails is
    #    the values listed for Chichewa for features 81A, 82A, 83A, 86A, 87A, and 88A,
    #    need to be added to Nyanga
    #    Mateene 1980 should be added as a source for Nyanga
    #    the references to Mateene as a source for datapoints for Chichewa need to be removed
    #    there is one datapoint for Chichewa where Mateene is listed as the only source,
    #    namely for 83A, but this is an error: the source for this datapoint should be
    #    Price 1966: passim; Mchombo 2004: 19 (the sources listed for 81A)
    source = common.Source.get('Mateene-1980', session=session)
    for vsr in source.valuesetreferences:
        vs = vsr.valueset
        if vs.language.id == 'cic':
            if vs.parameter.id in ['81A', '82A', '83A', '86A', '87A', '88A']:
                vs_copy_lang(session, timestamp, vs, 'nng')
            else:
                vs_delete(session, timestamp, vs)
            session.delete(vsr)
            if vs.parameter.id == '83A':
                session.add(
                    common.ValueSetReference(
                        valueset=vs,
                        source=common.Source.get('Price-1966',
                                                 session=session),
                        description='passim'))
                session.add(
                    common.ValueSetReference(valueset=vs,
                                             source=common.Source.get(
                                                 'Mchombo-2004',
                                                 session=session),
                                             description='19'))

    #    [gby] should be removed as an ISO code for Gwari (wals_code_gwa); the only one should be [gbr]
    update_iso(session, timestamp, 'gwa', 'gby', gbr=None)

    #    The ISO code for Grebo (wals_code_grb) should be corrected to [grj].
    update_iso(session, timestamp, 'grb', 'gry', grj="Southern Grebo")

    #    The only ISO code for Lega is [lea]; please remove the second one.
    update_iso(session, timestamp, 'leg', 'lgm')

    #    The sources for Ngbaka (wals_code_ngb) are actually for two different, only
    #    distantly related languages. GrandEury is the source for Ngbaka (Minagende), which
    #    has the same ISO code [nga] and location we are currently using for Ngbaka, so we
    #    should keep the WALS code for that Ngbaka (but should change the name to
    #    Ngbaka (Minagende)). Thomas (1963) is a source for what will be a new WALS language,
    #    Ngbaka (Ma’bo). Its ISO code is [nbm]. We could use the same code nbm as the WALS code.
    #    It belongs to the Ubangi genus, as Ngbaka (Minagende) does in the current WALS
    #    classification, but see below where Ngbaka (Minagende) is being moved out of
    #    Ubangi into a new genus. I would use the Glottolog location for it, but I can’t find
    #    that in the new Glottolog. It is also in the Democratic Republic of the Congo.
    #
    #    This means that all the datapoints in the current WALS that use Thomas 1963 as a
    #    source for Ngbaka need to be moved or copied to the new Ngbaka (Ma’bo). Those
    #    datapoints in the current Ngbaka that only use Thomas as a source will need to be
    #    removed (since that language is the new Ngbaka (Minagende)). Those datapoints that
    #    use both sources in the current WALS will now become two datapoints, one for each
    #    of these two languages.
    nbm = models.WalsLanguage(id='nbm',
                              name="Ngbaka (Ma'bo)",
                              ascii_name=slug("Ngbaka (Ma'bo)"),
                              latitude=3.56,
                              longitude=18.36,
                              genus=models.Genus.get('ubangi',
                                                     session=session))
    nbm.countries.append(models.Country.get('CD', session=session))
    session.add(nbm)
    update_iso(session, timestamp, nbm, nbm="Ngbaka Ma'bo")
    update_glottocode(session, timestamp, nbm, 'ngba1284')

    ngb = common.Language.get('ngb', session=session)
    ngb.name = 'Ngbaka (Minagende)'
    ngb.ascii_name = slug(ngb.name)

    for vs in ngb.valuesets:
        if 'Thomas-1963' in [ref.source.id for ref in vs.references]:
            if len(vs.references) > 1:
                vs_copy_lang(session, timestamp, vs, nbm)
            else:
                vs_switch_lang(session, timestamp, vs, nbm)

    #    The ISO code for Sisaala (wals_code_ssa) needs to be changed from [ssl] to [sld].
    update_iso(session, timestamp, 'ssa', 'ssl', sld='Sissala')

    #    The ISO code for Makua (wals_code_mua) should be changed to [mgh] and [xsq].
    update_iso(session,
               timestamp,
               'mua',
               'vmw',
               mgh='Makhuwa-Meetto',
               xsq='Makhuwa-Saka')

    #    A change to the genealogical classification: Four languages need to be taken out
    #    of the Ubangi genus and put into a new genus within Niger-Congo called
    #    Gbaya-Manza-Ngbaka: (first below is WALS code, last is ISO code):
    #
    #gbb Gbeya Bossangoa gbp
    #gbk Gbaya Kara gya
    #mdo Mbodomo gmm
    #ngb Ngbaka nga
    #
    update_classification(session,
                          timestamp, ['gbb', 'gbk', 'mdo', 'ngb'],
                          'gbayamanzangbaka',
                          genus_name='Gbaya-Manza-Ngbaka',
                          family_id='nigercongo')
Beispiel #43
0
 def words(s):
     return set(slug(s.strip(), remove_whitespace=False).split())
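
Below, a minimal usage sketch of this word-set helper as it is applied in the Google Book Search check (Beispiel #44): two titles count as a match when their word sets coincide or one is a sufficiently large subset of the other. The helper titles_match and the example string are illustrative assumptions; slug() is assumed to lowercase and strip punctuation as in clldutils.

def titles_match(a, b):
    # same subset logic as in gbs_func's 'verify' branch
    wa, wb = words(a), words(b)
    return wa == wb \
        or (len(wa) > 2 and wa.issubset(wb)) \
        or (len(wb) > 2 and wb.issubset(wa))

# titles_match('A Grammar of Kabyle.', 'A grammar of Kabyle')  # -> True (assumed slug behaviour)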
Beispiel #44
0
def gbs_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0
    api_url = "https://www.googleapis.com/books/v1/volumes?"

    if not sources:
        sources = DBSession.query(common.Source)\
            .order_by(cast(common.Source.id, Integer))\
            .options(joinedload(common.Source.data))
    if callable(sources):
        sources = sources()

    for i, source in enumerate(sources):
        filepath = args.data_file('gbs', 'source%s.json' % source.id)

        if command == 'update':
            source.google_book_search_id = None
            source.update_jsondata(gbs={})

        if command in ['verify', 'update']:
            if filepath.exists():
                with open(filepath) as fp:
                    try:
                        data = json.load(fp)
                    except ValueError:
                        log.warn('no JSON object found in: %s' % filepath)
                        continue
                if not data['totalItems']:
                    continue
                item = data['items'][0]
            else:
                continue

        if command == 'verify':
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = item['volumeInfo'].get('publishedDate', '').split('-')[0]
            if not year or year != slug(source.year or unicode('')):
                needs_check = True
            twords = words(stitle)
            iwords = words(item['volumeInfo']['title'] + ' ' +
                           item['volumeInfo'].get('subtitle', ''))
            if twords == iwords \
                    or (len(iwords) > 2 and iwords.issubset(twords))\
                    or (len(twords) > 2 and twords.issubset(iwords)):
                needs_check = False
            if int(source.id) == 241:
                log.info('%s' % sorted(list(words(stitle))))
                log.info('%s' % sorted(list(iwords)))
            if needs_check:
                log.info(
                    '------- %s -> %s' %
                    (source.id, item['volumeInfo'].get('industryIdentifiers')))
                log.info('%s %s' % (item['volumeInfo']['title'],
                                    item['volumeInfo'].get('subtitle', '')))
                log.info(stitle)
                log.info(item['volumeInfo'].get('publishedDate'))
                log.info(source.year)
                log.info(item['volumeInfo'].get('authors'))
                log.info(source.author)
                log.info(item['volumeInfo'].get('publisher'))
                log.info(source.publisher)
                if not confirm('Are the records the same?'):
                    log.warn('---- removing ----')
                    with open(filepath, 'w') as fp:
                        json.dump({"totalItems": 0}, fp)
        elif command == 'update':
            source.google_book_search_id = item['id']
            source.update_jsondata(gbs=item)
            count += 1
        elif command == 'download':
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = [
                    'inauthor:' + quote_plus(source.author.encode('utf8')),
                    'intitle:' + quote_plus(title.encode('utf8')),
                ]
                if source.publisher:
                    q.append('inpublisher:' +
                             quote_plus(source.publisher.encode('utf8')))
                url = api_url + 'q=%s&key=%s' % ('+'.join(q), args.api_key)
                count += 1
                r = requests.get(url, headers={'accept': 'application/json'})
                log.info('%s - %s' % (r.status_code, url))
                if r.status_code == 200:
                    with open(filepath, 'w') as fp:
                        fp.write(r.text.encode('utf8'))
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break
    if command == 'update':
        log.info('assigned gbs ids for %s out of %s sources' % (count, i))
    elif command == 'download':
        log.info('queried gbs for %s sources' % count)
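
A hypothetical invocation sequence for gbs_func, using the three command strings the function itself dispatches on (the wrapper calls are an assumption, not part of the original CLI):

gbs_func('download', args)  # query the Google Books API and cache the JSON responses
gbs_func('verify', args)    # interactively confirm doubtful title/author matches
gbs_func('update', args)    # write google_book_search_id and jsondata to the sources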
Beispiel #45
0
def main(args):  # pragma: no cover
    stats = Counter(new=0, updated=0, skipped=0)
    changes = {}

    with transaction.manager:
        update_providers(args)
        DBSession.flush()
        provider_map = get_map(Provider)
        macroarea_map = get_map(Macroarea)
        doctype_map = get_map(Doctype)

        languoid_map = {}
        for l in DBSession.query(Languoid):
            if l.hid:
                languoid_map[l.hid] = l
            languoid_map[l.id] = l

        for i, rec in enumerate(get_bib(args)):
            if i and i % 1000 == 0:
                print i, 'records done', stats['updated'] + stats['new'], 'changed'

            if len(rec.keys()) < 6:
                # not enough information!
                stats.update(['skipped'])
                continue

            changed = False
            assert rec.get('glottolog_ref_id')
            id_ = int(rec.get('glottolog_ref_id'))

            ref = DBSession.query(Source).get(id_)
            update = True if ref else False

            kw = {
                'pk': id_,
                'bibtex_type': rec.genre,
                'id': str(id_),
                'jsondata': {'bibtexkey': rec.id},
            }

            for source, target in FIELD_MAP.items():
                if target is None:
                    continue
                value = rec.get(source)
                if value:
                    value = unescape(value)
                    if target:
                        kw[target] = CONVERTER.get(source, lambda x: x)(value)
                    else:
                        kw['jsondata'][source] = value

            if kw['jsondata'].get('hhtype'):
                trigger = ca_trigger(kw['jsondata']['hhtype'])
                if trigger:
                    kw['ca_doctype_trigger'], kw['jsondata']['hhtype'] = trigger

            # try to extract numeric year, startpage, endpage, numberofpages, ...
            if kw.get('year'):
                # prefer years in brackets over the first 4-digit number.
                match = PREF_YEAR_PATTERN.search(kw.get('year'))
                if match:
                    kw['year_int'] = int(match.group('year'))
                else:
                    match = YEAR_PATTERN.search(kw.get('year'))
                    if match:
                        kw['year_int'] = int(match.group('year'))

            if kw.get('publisher'):
                p = kw.get('publisher')
                if ':' in p:
                    address, publisher = [s.strip() for s in kw['publisher'].split(':', 1)]
                    if 'address' not in kw or kw['address'] == address:
                        kw['address'], kw['publisher'] = address, publisher

            if rec.get('numberofpages'):
                try:
                    kw['pages_int'] = int(rec.get('numberofpages').strip())
                except ValueError:
                    pass

            if kw.get('pages'):
                start, end, number = compute_pages(kw['pages'])
                if start is not None:
                    kw['startpage_int'] = start
                if end is not None:
                    kw['endpage_int'] = end
                if number is not None and 'pages_int' not in kw:
                    kw['pages_int'] = number

            for k in kw.keys():
                v = kw[k]
                if isinstance(v, basestring):
                    v = v.strip() or None
                kw[k] = v

            if update:
                for k in kw.keys():
                    if k == 'pk':
                        continue
                    v = getattr(ref, k)
                    if kw[k] != v:
                        if k == 'jsondata':
                            d = {k: v for k, v in ref.jsondata.items()
                                 if k in NONREF_JSONDATA}
                            d.update(**kw[k])
                            ref.jsondata = d
                        else:
                            #print k, '--', v
                            #print k, '++', kw[k]
                            setattr(ref, k, kw[k])
                            changed = True
                            if ref.id in changes:
                                changes[ref.id][k] = ('%s' % v, '%s' % kw[k])
                            else:
                                changes[ref.id] = {k: ('%s' % v, '%s' % kw[k])}
            else:
                changed = True
                ref = Ref(name='%s %s' % (kw.get('author', 'na'), kw.get('year', 'nd')), **kw)

            ref.description = ref.title or ref.booktitle
            originator = ref.author or ref.editor or 'Anonymous'
            ref.name = '%s %s' % (originator, ref.year or 'n.d.')

            a, r = update_relationship(
                ref.macroareas,
                [macroarea_map[name] for name in
                 set(filter(None, [s.strip() for s in kw['jsondata'].get('macro_area', '').split(',')]))])
            changed = changed or a or r

            src = [s.strip() for s in kw['jsondata'].get('src', '').split(',')]
            prv = {provider_map[slug(s)] for s in src if s}
            if set(ref.providers) != prv:
                ref.providers = list(prv)
                changed = True

            a, r = update_relationship(
                ref.doctypes,
                [doctype_map[m.group('name')] for m in
                 DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', ''))])
            changed = changed or a or r

            if not update:
                stats.update(['new'])
                DBSession.add(ref)
            elif changed:
                stats.update(['updated'])

    args.log.info('%s' % stats)

    DBSession.execute("update source set description = title where description is null and title is not null;")
    DBSession.execute("update source set description = booktitle where description is null and booktitle is not null;")

    for row in list(DBSession.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s" %
                (_number, _start, pk))
            DBSession.execute(
                "update ref set endpage_int = %s where pk = %s" %
                (_end, pk))

    jsondump(changes, args.data_dir.joinpath('references', 'changes.json'))
Beispiel #46
0
def main(args):
    data = Data()

    editors = OrderedDict()
    editors['Susanne Maria Michaelis'] = None
    editors['Philippe Maurer'] = None
    editors['Martin Haspelmath'] = None
    editors['Magnus Huber'] = None

    for row in read(args, 'People'):
        name = row['First name'] + ' ' if row['First name'] else ''
        name += row['Last name']
        kw = dict(
            name=name,
            id=slug('%(Last name)s%(First name)s' % row),
            url=row['Contact Website'].split()[0]
            if row['Contact Website'] else None,
            address=row['Comments on database'],
        )
        contrib = data.add(common.Contributor, row['Author ID'], **kw)
        if kw['name'] in editors:
            editors[kw['name']] = contrib

    DBSession.flush()

    dataset = common.Dataset(
        id='apics',
        name='APiCS Online',
        description='Atlas of Pidgin and Creole Language Structures Online',
        domain='apics-online.info',
        published=date(2013, 11, 4),
        license='http://creativecommons.org/licenses/by/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 3.0 Unported License'
        })
    DBSession.add(dataset)
    for i, editor in enumerate(editors.values()):
        common.Editor(dataset=dataset, contributor=editor, ord=i + 1)

    colors = dict(
        (row['ID'], row['RGB_code']) for row in read(args, 'Colours'))

    abbrs = {}
    for id_, name in LGR_ABBRS.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))
        abbrs[id_] = 1

    for id_, name in {
            'C**T': 'clitic',
            'IMPF': 'imperfect',
            'INTERM': 'intermediate',
            'NCOMPL': 'noncompletive',
            'NONFUT': 'nonfuture',
            'NPROX': 'nonproximal',
            'NSG': 'nonsingular',
            'PP': 'past participle',
            'PROP': 'proprietive',
            'TMA': 'tense-mood-aspect',
    }.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))
        abbrs[id_] = 1

    for row in reader(args.data_file('non-lgr-gloss-abbrs.csv'),
                      delimiter=',',
                      namedtuples=True):
        for match in GLOSS_ABBR_PATTERN.finditer(row.standard):
            if match.group('abbr') not in abbrs:
                abbrs[match.group('abbr')] = 1
                DBSession.add(
                    common.GlossAbbreviation(id=match.group('abbr'),
                                             name=row.meaning))

    non_bibs = {}
    for row in read(args, 'References', 'Reference_ID'):
        if row['Reference_type'] == 'Non-bib':
            non_bibs[row['Reference_ID']] = row['Reference_name']
            continue

        if isinstance(row['Year'], int):
            year_int = row['Year']
            year = str(row['Year'])
        elif row['Year']:
            year_int = None
            for m in re.finditer('(?P<year>(1|2)[0-9]{3})', row['Year']):
                year_int = int(m.group('year'))
                break
            year = row['Year']
        else:
            year, year_int = None, None

        title = row['Article_title'] or row['Book_title']
        attrs = {}
        jsondata = {}
        for attr, field in {
                'Additional_information': 'note',
                'Article_title': 'title',
                'Book_title': 'booktitle',
                'City': 'address',
                'Editors': 'editor',
                'Full_reference': None,
                'Issue': None,
                'Journal': 'journal',
                'Language_codes': None,
                'LaTeX_cite_key': None,
                'Pages': 'pages',
                'Publisher': 'publisher',
                'Reference_type': 'type',
                'School': 'school',
                'Series_title': 'series',
                'URL': 'url',
                'Volume': 'volume',
        }.items():
            value = row.get(attr)
            if not isinstance(value, int):
                value = (value or '').strip()
            if attr == 'Issue' and value:
                try:
                    value = str(int(value))
                except ValueError:
                    pass
            if value:
                if field:
                    attrs[field] = value
                else:
                    jsondata[attr] = value
        p = data.add(common.Source,
                     row['Reference_ID'],
                     id=str(row['Reference_ID']),
                     name=row['Reference_name'],
                     description=title,
                     author=row['Authors'],
                     year=year,
                     year_int=year_int,
                     bibtex_type=getattr(EntryType, row['BibTeX_type']
                                         or 'misc'),
                     jsondata=jsondata,
                     **attrs)
        if p.bibtex_type.value == 'misc' and not p.description:
            p.description = p.note
        DBSession.flush()

    DBSession.flush()

    infobox = jsonload(args.data_file('infobox.json'))
    glottocodes = jsonload(args.data_file('glottocodes.json'))
    for row in read(args, 'Languages', 'Order_number'):
        lon, lat = [
            float(c.strip()) for c in row['map_coordinates'].split(',')
        ]
        kw = dict(
            name=row['Language_name'],
            id=str(row['Order_number']),
            latitude=lat,
            longitude=lon,
            region=row['Category_region'],
        )
        lect = data.add(models.Lect, row['Language_ID'], **kw)
        DBSession.flush()

        for i, item in enumerate(infobox[lect.id]):
            DBSession.add(
                common.Language_data(object_pk=lect.pk,
                                     ord=i,
                                     key=item[0],
                                     value=item[1]))

        if row["Languages_contribution_documentation::Lect_description_checked_status"] \
                != "Checked":
            print('unchecked! ---', row['Language_name'])

        desc = row.get(
            'Languages_contribution_documentation::Lect description', '')
        markup_desc = normalize_markup(row[
            'Languages_contribution_documentation::z_calc_GetAsCSS_Lect_description']
                                       )

        c = data.add(
            models.ApicsContribution,
            row['Language_ID'],
            id=str(row['Order_number']),
            name=row['Language_name'],
            description=desc,
            markup_description=markup_desc,
            survey_reference=data['Source'][row['Survey_reference_ID']],
            language=lect)

        for ext, label, mtype in [
            ('pdf', 'Glossed text', 'application/pdf'),
            ('mp3', 'Glossed text audio', 'audio/mpeg'),
        ]:
            fid = '%s-gt.%s' % (c.id, ext)
            if args.data_file('files', 'contribution', c.id, fid).exists():
                common.Contribution_files(object=c,
                                          id=fid,
                                          name=label,
                                          mime_type=mtype)
            else:
                print(label, 'missing for:', row['Language_name'])

        #
        # TODO: for michif, 75, add link http://www.youtube.com/watch?v=f0C4cODsSyE
        #

        iso = None
        if row['ISO_code'] and len(row['ISO_code']) == 3:
            iso = row['ISO_code'].lower()
            if 'iso:%s' % row['ISO_code'] not in data['Identifier']:
                data.add(common.Identifier,
                         'iso:%s' % row['ISO_code'],
                         id=row['ISO_code'].lower(),
                         name=row['ISO_code'].lower(),
                         type=common.IdentifierType.iso.value)

            DBSession.add(
                common.LanguageIdentifier(
                    language=data['Lect'][row['Language_ID']],
                    identifier=data['Identifier']['iso:%s' % row['ISO_code']]))

        if lect.id in glottocodes:
            identifier = data.add(common.Identifier,
                                  'gc:%s' % glottocodes[lect.id],
                                  id=glottocodes[lect.id],
                                  name=glottocodes[lect.id],
                                  type=common.IdentifierType.glottolog.value)

            DBSession.add(
                common.LanguageIdentifier(
                    language=data['Lect'][row['Language_ID']],
                    identifier=identifier))

        if row['Language_name_ethnologue']:
            if row['Language_name_ethnologue'] not in data['Identifier']:
                data.add(common.Identifier,
                         row['Language_name_ethnologue'],
                         id=iso
                         or 'ethnologue:%s' % row['Language_name_ethnologue'],
                         name=row['Language_name_ethnologue'],
                         type='ethnologue')

            DBSession.add(
                common.LanguageIdentifier(
                    language=data['Lect'][row['Language_ID']],
                    identifier=data['Identifier'][
                        row['Language_name_ethnologue']]))

    example_count = {}
    for row in read(args, 'Examples', 'Order_number'):
        assert row['Language_ID']
        lang = data['Lect'][row['Language_ID']]
        id_ = '%(Language_ID)s-%(Example_number)s' % row
        atext, gloss = igt(row)
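        # track the highest example number per language, so that word examples
        # added later (in the Segment_data pass below) get fresh, non-clashing ids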
        example_count[row['Language_ID']] = max(
            [example_count.get(row['Language_ID'], 1), row['Example_number']])
        p = add_sentence(
            args,
            data,
            id_,
            id='%s-%s' % (lang.id, row['Example_number']),
            name=row['Text'] or row['Analyzed_text'],
            description=row['Translation'],
            type=row['Type'].strip().lower() if row['Type'] else None,
            comment=row['Comments'],
            gloss=gloss,
            analyzed=atext,
            markup_text=normalize_markup(row['z_calc_Text_CSS']),
            markup_gloss=normalize_markup(row['z_calc_Gloss_CSS']),
            markup_comment=normalize_markup(row['z_calc_Comments_CSS']),
            markup_analyzed=normalize_markup(row['z_calc_Analyzed_text_CSS']),
            original_script=row['Original_script'],
            jsondata={
                'sort': row['Order_number'],
                'alt_translation': (row['Translation_other'] or '').strip()
                or None
            },
            language=lang)

        if row['Reference_ID']:
            if row['Reference_ID'] in data['Source']:
                source = data['Source'][row['Reference_ID']]
                DBSession.add(
                    common.SentenceReference(
                        sentence=p,
                        source=source,
                        key=source.id,
                        description=row['Reference_pages']))
            else:
                p.source = non_bibs[row['Reference_ID']]

    DBSession.flush()

    for row in read(args, 'Language_references'):
        if row['Reference_ID'] not in data['Source']:
            assert row['Reference_ID'] in non_bibs
            continue
        assert row['Language_ID'] in data['ApicsContribution']
        source = data['Source'][row['Reference_ID']]
        DBSession.add(
            common.ContributionReference(
                contribution=data['ApicsContribution'][row['Language_ID']],
                source=source,
                description=row['Pages'],
                key=source.id))

    #
    # global counter for features - across feature types
    #
    feature_count = 0
    for row in read(args, 'Features', 'Feature_number'):
        id_ = str(row['Feature_number'])
        if int(id_) > feature_count:
            feature_count = int(id_)
        wals_id = None
        desc = row['Feature_annotation_publication']
        if row['WALS_match'] == 'Total':
            if isinstance(row['WALS_No.'], int):
                wals_id = row['WALS_No.']
            else:
                wals_id = int(row['WALS_No.'].split('.')[0].strip())

        p = data.add(models.Feature,
                     row['Feature_code'],
                     name=row['Feature_name'],
                     id=id_,
                     description=desc,
                     markup_description=normalize_markup(
                         row['z_calc_Feature_annotation_publication_CSS']),
                     feature_type='primary',
                     multivalued=row['Value_relation_type'] != 'Single',
                     area=row['Feature_area'],
                     wals_id=wals_id)

        names = {}
        for i in range(1, 10):
            if not row['Value%s_publication' % i] \
                    or not row['Value%s_publication' % i].strip():
                continue
            name = row['Value%s_publication' % i].strip()
            if name in names:
                name += ' (%s)' % i
            names[name] = 1
            de = data.add(
                common.DomainElement,
                '%s-%s' % (row['Feature_code'], i),
                id='%s-%s' % (id_, i),
                name=name,
                parameter=p,
                abbr=row['Value%s_for_book_maps' % i] if p.id != '0' else name,
                number=int(row['Value%s_value_number_for_publication' % i]),
                jsondata={'color': colors[row['Value_%s_colour_ID' % i]]},
            )
            assert de

        if row['Authors_FeatureArticles']:
            authors, _ = row['Authors_FeatureArticles'].split('and the APiCS')
            authors = authors.strip()
            if authors.endswith(','):
                authors = authors[:-1].strip()
            for i, name in enumerate(authors.split(',')):
                assert name.strip() in editors
                p._authors.append(
                    models.FeatureAuthor(ord=i + 1,
                                         contributor=editors[name.strip()]))

        DBSession.flush()

    primary_to_segment = {123: 63, 126: 35, 128: 45, 130: 41}
    segment_to_primary = dict(
        zip(primary_to_segment.values(), primary_to_segment.keys()))
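    # segment_to_primary is the inverse mapping; primary_to_segment is rewritten
    # below, once the segment features are created, to point at their new ids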
    number_map = {}
    names = {}
    for row in read(args, 'Segment_features', 'Order_number'):
        symbol = row['Segment_symbol']
        if row['Segment_name'] == 'voiceless dental/alveolar sibilant affricate':
            symbol = 't\u0361s'
        truth = lambda s: s and s.strip().lower() == 'yes'
        name = '%s - %s' % (symbol, row['Segment_name'])

        if name in names:
            number_map[row['Segment_feature_number']] = names[name]
            continue

        number_map[
            row['Segment_feature_number']] = row['Segment_feature_number']
        names[name] = row['Segment_feature_number']
        feature_count += 1
        if row['Segment_feature_number'] in segment_to_primary:
            primary_to_segment[segment_to_primary[row['Segment_feature_number']]]\
                = str(feature_count)
        p = data.add(models.Feature,
                     row['Segment_feature_number'],
                     name=name,
                     id=str(feature_count),
                     feature_type='segment',
                     area='Vowels' if truth(row['Vowel']) else
                     ('Obstruent consonants'
                      if truth(row['Obstruent']) else 'Sonorant consonants'),
                     jsondata=dict(
                         number=int(row['Segment_feature_number']),
                         vowel=truth(row['Vowel']),
                         consonant=truth(row['Consonant']),
                         obstruent=truth(row['Obstruent']),
                         core_list=truth(row['Core_list_segment']),
                         symbol=symbol,
                     ))

        for i, spec in SEGMENT_VALUES.items():
            data.add(common.DomainElement,
                     '%s-%s' % (row['Segment_feature_number'], spec[0]),
                     id='%s-%s' % (p.id, i),
                     name=spec[0],
                     parameter=p,
                     jsondata={'color': spec[1]},
                     number=i)

    print('--> remapped:', primary_to_segment)
    DBSession.flush()

    for row in read(args, 'Sociolinguistic_features',
                    'Sociolinguistic_feature_number'):
        feature_count += 1
        p = data.add(models.Feature,
                     row['Sociolinguistic_feature_code'],
                     name=row['Sociolinguistic_feature_name'],
                     id='%s' % feature_count,
                     description=row['Sociolinguistic_feature_annotation'],
                     area='Sociolinguistic',
                     feature_type='sociolinguistic')

        names = {}

        for i in range(1, 10):
            id_ = '%s-%s' % (row['Sociolinguistic_feature_code'], i)
            if row.get('Value%s' % i) and row['Value%s' % i].strip():
                name = row['Value%s' % i].strip()
                if name in names:
                    name += ' (%s)' % i
                names[name] = 1
            else:
                continue
            kw = dict(id='%s-%s' % (p.id, i), name=name, parameter=p, number=i)
            data.add(common.DomainElement,
                     id_,
                     id='%s-%s' % (p.id, i),
                     name=name,
                     parameter=p,
                     number=i,
                     jsondata={
                         'color':
                         colors.get(row['Value%s_colour_ID' % i],
                                    list(colors.values())[i])
                     })

    sd = {}
    for row in read(args, 'Segment_data'):
        if row['Segment_feature_number'] not in number_map:
            continue
        number = number_map[row['Segment_feature_number']]

        if not row['Presence_in_the_language']:
            continue

        lang = data['Lect'][row['Language_ID']]
        param = data['Feature'][number]
        id_ = '%s-%s' % (lang.id, param.id)
        if id_ in sd:
            assert row['c_Record_is_a_duplicate'] == 'Yes'
            continue
        sd[id_] = 1
        valueset = data.add(
            common.ValueSet,
            id_,
            id=id_,
            parameter=param,
            language=lang,
            contribution=data['ApicsContribution'][row['Language_ID']],
            description=row['Comments'],
            markup_description=normalize_markup(row['z_calc_Comments_CSS']),
        )
        v = data.add(
            common.Value,
            id_,
            id=id_,
            frequency=float(100),
            valueset=valueset,
            domainelement=data['DomainElement'][
                '%s-%s' % (number, row['Presence_in_the_language'])],
        )
        if row['Example_word'] and row['Example_word_gloss']:
            example_count[row['Language_ID']] += 1
            p = add_sentence(args,
                             data,
                             '%s-p%s' % (lang.id, data['Feature'][number].id),
                             id='%s-%s' %
                             (lang.id, example_count[row['Language_ID']]),
                             name=row['Example_word'],
                             description=row['Example_word_gloss'],
                             language=lang)
            DBSession.add(common.ValueSentence(value=v, sentence=p))

        source = data['Source'].get(row['Refers_to_references_Reference_ID'])
        if source:
            DBSession.add(
                common.ValueSetReference(valueset=valueset,
                                         source=source,
                                         key=source.id))
        elif row['Refers_to_references_Reference_ID'] in non_bibs:
            valueset.source = non_bibs[
                row['Refers_to_references_Reference_ID']]

    lects = defaultdict(lambda: 1)
    lect_map = {}
    records = {}
    false_values = {}
    no_values = {}
    wals_value_number = {}
    for row in read(args, 'wals'):
        if row['z_calc_WALS_value_number']:
            wals_value_number[
                row['Data_record_id']] = row['z_calc_WALS_value_number']

    def prefix(attr, _prefix):
        if _prefix:
            return '%s_%s' % (_prefix, attr)
        return attr.capitalize()
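    # e.g. prefix('feature_code', 'Sociolinguistic') -> 'Sociolinguistic_feature_code'
    #      prefix('feature_code', '')                -> 'Feature_code'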

    for _prefix, abbr in [('', ''), ('Sociolinguistic', 'sl')]:
        num_values = 10
        for row in read(args, prefix('data', _prefix)):
            if not row[prefix('feature_code', _prefix)]:
                print('no associated feature for', prefix('data', _prefix),
                      row[prefix('data_record_id', _prefix)])
                continue

            lid = row['Language_ID']
            lect_attr = row.get('Lect_attribute', 'my default lect').lower()
            if lect_attr != 'my default lect':
                if (row['Language_ID'], row['Lect_attribute']) in lect_map:
                    lid = lect_map[(row['Language_ID'], row['Lect_attribute'])]
                else:
                    lang = data['Lect'][row['Language_ID']]
                    c = lects[row['Language_ID']]
                    lid = '%s-%s' % (row['Language_ID'], c)
                    kw = dict(
                        name='%s (%s)' % (lang.name, row['Lect_attribute']),
                        id='%s' % (1000 + 10 * int(lang.id) + c),
                        latitude=lang.latitude,
                        longitude=lang.longitude,
                        description=row['Lect_attribute'],
                        language=lang,
                    )
                    data.add(models.Lect, lid, **kw)
                    lects[row['Language_ID']] += 1
                    lect_map[(row['Language_ID'], row['Lect_attribute'])] = lid

            id_ = abbr + str(row[prefix('data_record_id', _prefix)])
            assert id_ not in records
            records[id_] = 1

            assert row[prefix('feature_code', _prefix)] in data['Feature']
            language = data['Lect'][lid]
            parameter = data['Feature'][row[prefix('feature_code', _prefix)]]
            valueset = common.ValueSet(
                id='%s-%s' % (language.id, parameter.id),
                description=row['Comments_on_value_assignment'],
                markup_description=normalize_markup(
                    row.get('z_calc_Comments_on_value_assignment_CSS')),
            )

            values_found = {}
            for i in range(1, num_values):
                if not row['Value%s_true_false' % i]:
                    continue

                if row['Value%s_true_false' % i].strip().lower() != 'true':
                    assert row['Value%s_true_false' %
                               i].strip().lower() == 'false'
                    false_values[row[prefix('data_record_id', _prefix)]] = 1
                    continue

                iid = '%s-%s' % (row[prefix('feature_code', _prefix)], i)
                if iid not in data['DomainElement']:
                    print(iid, row[prefix('data_record_id',
                                          _prefix)], '--> no domainelement!')
                    continue
                values_found['%s-%s' % (id_, i)] = dict(
                    id='%s-%s' % (valueset.id, i),
                    domainelement=data['DomainElement']['%s-%s' % (row[prefix(
                        'feature_code', _prefix)], i)],
                    confidence=row['Value%s_confidence' % i],
                    frequency=float(row['c_V%s_frequency_normalised' %
                                        i]) if _prefix == '' else 100)

            if values_found:
                if row[prefix('data_record_id', _prefix)] in wals_value_number:
                    valueset.jsondata = {
                        'wals_value_number':
                        wals_value_number.pop(row[prefix(
                            'data_record_id', _prefix)])
                    }
                valueset.parameter = parameter
                valueset.language = language
                valueset.contribution = data['ApicsContribution'][
                    row['Language_ID']]
                valueset = data.add(common.ValueSet, id_, _obj=valueset)
                for i, item in enumerate(values_found.items()):
                    if i > 0 and not parameter.multivalued:
                        print('multiple values for single-valued parameter: %s' % id_)
                        break
                    id_, kw = item
                    kw['valueset'] = valueset
                    value = data.add(common.Value, id_, **kw)

                #
                # store references to additional data for segments which should be reused
                # for corresponding primary features!
                #
                if int(parameter.id) in primary_to_segment:
                    assert len(values_found) == 1
                    seg_id = '%s-%s' % (language.id, primary_to_segment[int(
                        parameter.id)])
                    seg_valueset = data['ValueSet'][seg_id]
                    seg_value = data['Value'][seg_id]
                    if not valueset.description and seg_valueset.description:
                        valueset.description = seg_valueset.description

                    for s in seg_value.sentence_assocs:
                        DBSession.add(
                            common.ValueSentence(value=value,
                                                 sentence=s.sentence))

                    for r in seg_valueset.references:
                        DBSession.add(
                            common.ValueSetReference(valueset=valueset,
                                                     source=r.source,
                                                     key=r.key))

                    if not valueset.source and seg_valueset.source:
                        valueset.source = seg_valueset.source

                DBSession.flush()
            else:
                no_values[id_] = 1

    DBSession.flush()

    for prefix, abbr, num_values in [
        ('D', '', 10),
        ('Sociolinguistic_d', 'sl', 7),
    ]:
        for row in read(args, prefix + 'ata_references'):
            assert row['Reference_ID'] in data['Source'] \
                or row['Reference_ID'] in non_bibs
            try:
                vs = data['ValueSet'][abbr +
                                      str(row[prefix + 'ata_record_id'])]
                if row['Reference_ID'] in data['Source']:
                    source = data['Source'][row['Reference_ID']]
                    DBSession.add(
                        common.ValueSetReference(
                            valueset=vs,
                            source=source,
                            key=source.id,
                            description=row['Pages'],
                        ))
                else:
                    if vs.source:
                        vs.source += '; ' + non_bibs[row['Reference_ID']]
                    else:
                        vs.source = non_bibs[row['Reference_ID']]
            except KeyError:
                continue

    DBSession.flush()

    missing = 0
    for row in read(args, 'Value_examples'):
        try:
            DBSession.add(
                common.ValueSentence(
                    value=data['Value']['%(Data_record_id)s-%(Value_number)s' %
                                        row],
                    sentence=data['Sentence'][
                        '%(Language_ID)s-%(Example_number)s' % row],
                    description=row['Notes'],
                ))
        except KeyError:
            missing += 1
    print('%s Value_examples are missing data' % missing)

    print('%s data sets with false values' % len(false_values))
    print('%s data sets without values' % len(no_values))

    for k, v in wals_value_number.items():
        print('unclaimed wals value number:', k, v)

    for i, row in enumerate(read(args, 'Contributors')):
        kw = dict(contribution=data['ApicsContribution'][row['Language ID']],
                  contributor=data['Contributor'][row['Author ID']])
        if row['Order_of_appearance']:
            kw['ord'] = int(float(row['Order_of_appearance']))
        data.add(common.ContributionContributor, i, **kw)

    DBSession.flush()
Beispiel #47
0
 def refactor(self, soup, md):
     d = BeautifulSoup('<body></body>')
     body = d.find('body')
     linked = 0
     notlinked = 0
     multiple = 0
     for p in self._chunks(soup):
         if not isinstance(p, list):
             p = [p]
         for pp in p:
             if pp.is_header:
                 continue
             elif pp.is_refs:
                 md['refs'] = [self.get_ref(line[0]) for line in pp.lines]
             else:
                 ex = None
                 if pp.is_example:
                     container = d.new_tag(
                         'blockquote', **{
                             'class':
                             'example',
                             'style':
                             'font-size:100%;padding-left:1.8em;margin-left:0.3em'
                         })
                     #body.append(Tag(name='hr'))
                 else:
                     container = body
                 for e, line, t in pp.lines:
                     body.append(e)
                     if pp.is_example:
                         if re.match(r'\([0-9]+\)', line):
                             e.attrs['style'] = 'text-indent:-2em'
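                         # Python 2 idiom: decode the UTF-8 byte literals to unicode curly quotes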
                         equo = "’".decode('utf8')
                         if line.startswith(
                                 "‘".decode('utf8')) and equo in line:
                             line = equo.join(
                                 line[1:].split(equo)[:-1]).strip()
                             examples = self.examples.get(slug(line))
                             if examples:
                                 if len(examples) > 1:
                                     #print '~~~', line
                                     multiple += 1
                                 else:
                                     ex = examples.values()[0]
                                     #print '+++'
                                     linked += 1
                             else:
                                 #print '---', line
                                 notlinked += 1
                     container.append(e)
                 if pp.is_example:
                     if ex:
                         container.attrs['id'] = 'ex-' + ex
                         container.append(
                             new_tag(
                                 d, 'small',
                                 new_tag(d,
                                         'a',
                                         'See example ' + ex,
                                         href='/sentences/' + ex)))
                     body.append(container)
     #print 'examples:', linked, 'linked,', notlinked, 'not linked,', multiple, 'multiple choices'
     for e in body.find_all('font'):
         e.unwrap()
     return d
Beispiel #48
0
 def repl(match):
     if end_tag.match(match.string[match.end():]):
         # if the next tag is the end tag of a link, then don't link again!
         return match.string[match.start():match.end()]
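     # otherwise wrap the matched key in a link whose data-content attribute carries its slug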
     return '<a class="ref-link" style="cursor: pointer;" data-content="%s">%s</a>' \
            % (slug(match.group('key').replace('&amp;', '&')), match.group('key'))
Beispiel #49
0
     'condition': lambda l: l.iso_code,
     'rdf': "rdfs:seeAlso",
     'logo': "odin.png"
 },
 {
     'name': 'WALS',
     'href': lambda l: "http://wals.info/languoid/lect/wals_code_"
     + l.get_identifier('WALS'),
     'condition': lambda l: l.get_identifier('WALS'),
     'rdf': "owl:sameAs",
     'logo': "wals.png"
 },
 {
     'name': 'WALSgenus',
     'href': lambda l: "http://wals.info/languoid/genus/"
     + slug(l.get_identifier('WALSgenus')),
     'condition': lambda l: l.get_identifier('WALSgenus'),
     'rdf': "owl:sameAs",
     'logo': "wals.png"
 },
 {
     'name': 'WALSfamily',
     'href': lambda l: "http://wals.info/languoid/family/"
     + slug(l.get_identifier('WALSfamily')),
     'condition': lambda l: l.get_identifier('WALSfamily'),
     'rdf': "owl:sameAs",
     'logo': "wals.png"
 },
 {
     'name': 'Endangered Languages',
     'href': lambda l: "http://www.endangeredlanguages.com/lang/"
Beispiel #50
0
def main(args):
    # determine if we run on a machine where other databases are available for lookup
    # locally:
    data = Data()
    genera = get_genera(data) if astroman else {}
    glottocodes, lnames, geocoords = {}, {}, {}
    if astroman:
        for k, v in glottocodes_by_isocode(
                'postgresql://robert@/glottolog3',
                cols=['id', 'name', 'latitude', 'longitude']).items():
            glottocodes[k] = v[0]
            lnames[k] = v[1]
            geocoords[k] = (v[2], v[3])

    refs = defaultdict(list)
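    # collect the BibTeX keys cited by each inventory; 'NO SOURCE GIVEN' leaves the list empty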
    for row in get_rows(args, 'BibtexKey'):
        if row[1] == 'NO SOURCE GIVEN':
            refs[row[0]] = []
        else:
            refs[row[0]].append(row[1])
    add_sources(args, data)

    dataset = data.add(
        common.Dataset,
        'phoible',
        id='phoible',
        name='PHOIBLE Online',
        description='PHOIBLE Online',
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        domain='phoible.org',
        license='http://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name': 'Creative Commons Attribution-ShareAlike 3.0 Unported License'
        })

    for i, spec in enumerate([
        ('moran', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
        ('wright', "Richard Wright"),
    ]):
        DBSession.add(
            common.Editor(dataset=dataset,
                          ord=i + 1,
                          contributor=common.Contributor(id=spec[0],
                                                         name=spec[1])))

    squibs = defaultdict(list)
    for row in get_rows(args, 'Squib'):
        squibs[row[0]].append(row[1])

    source_urls = dict(get_rows(args, 'URL'))
    ia_urls = dict(get_rows(args, 'InternetArchive'))

    aggregated = list(
        reader(args.data_file('phoible-aggregated.tsv'), namedtuples=True))
    inventory_names = {}
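    # build display names for inventories, numbering them when a language has several from the same source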
    for key, items in groupby(sorted(aggregated,
                                     key=lambda t: (t.LanguageCode, t.Source)),
                              key=lambda t: (t.LanguageCode, t.Source)):
        items = list(items)

        lname = lnames.get(key[0])
        if not lname:
            lname = items[0].LanguageName
            lnames[key[0]] = lname

        if len(items) == 1:
            inventory_names[items[0].InventoryID] = '%s (%s)' % (lname, key[1])
        else:
            for i, item in enumerate(items):
                inventory_names[item.InventoryID] = '%s %s (%s)' % (
                    lname, i + 1, key[1])

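    # map (genus, family-code) pairs to family names; family_code_map below keys the bare codes as well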
    family_map = {
        ("Arawakan", "arwk"): "Arawakan",
        ("Trans-New Guinea", "trng"): "Trans-New Guinea",
        ("Moklen", "anes"): "Austronesian",
        ("Oko", "ncon"): "Niger-Congo",
        ("Muniche", "saso"): "Muniche",
        ("Tinigua", "saso"): "Tinigua",
        ("Vilela", "luvi"): "Vilela",
        ("Ofayé", "macg"): "Kamakanan",
        ("Purian", "macg"): "PurianPrint",
        ("Mixed language", "saml"): "Mixed language",
        ("Tupian", "tupi"): "Tupian",
        ("Yuwana", "saun"): "YuwanaPrint",
    }
    family_code_map = {k[1]: v for k, v in family_map.items()}

    for row in aggregated:
        lang = data['Variety'].get(row.LanguageCode)
        if not lang:
            if row.LanguageFamilyGenus == 'UNCLASSIFIED':
                genus = None
            else:
                genus_id = slug(strip_quotes(row.LanguageFamilyGenus))
                genus = genera.get(genus_id)
                if not genus:
                    genus = genera.get(row.LanguageCode)
                    if not genus:
                        #print(row.LanguageFamilyGenus, row.LanguageFamilyRoot)
                        family = family_map.get(
                            (row.LanguageFamilyGenus, row.LanguageFamilyRoot))
                        genus = genera[genus_id] = data.add(
                            models.Genus,
                            genus_id,
                            id=genus_id,
                            name=row.LanguageFamilyGenus,
                            description=family or row.LanguageFamilyRoot,
                            active=False,
                            root=row.LanguageFamilyRoot)

                if not genus.root:
                    genus.root = row.LanguageFamilyRoot

                if genus.description in family_code_map:
                    genus.description = family_code_map[genus.description]

            if row.LanguageCode in geocoords:
                coords = geocoords[row.LanguageCode]
            elif row.Latitude != 'NULL' and row.Longitude != 'NULL':
                coords = (float(row.Latitude), float(row.Longitude))
            lang = data.add(models.Variety,
                            row.LanguageCode,
                            id=row.LanguageCode,
                            name=lnames[row.LanguageCode],
                            genus=genus,
                            country=strip_quotes(row.Country),
                            area=strip_quotes(row.Area),
                            latitude=coords[0],
                            longitude=coords[1],
                            jsondata=dict(inventory_id=row.InventoryID))
            add_language_codes(data,
                               lang,
                               row.LanguageCode,
                               glottocodes=glottocodes)

        contributor = data['Contributor'].get(row.Source)
        if not contributor:
            contributor = data.add(common.Contributor,
                                   row.Source,
                                   id=row.Source,
                                   name=SOURCES[row.Source][0],
                                   description=SOURCES[row.Source][2])
            for ref in SOURCES[row.Source][1]:
                DBSession.add(
                    models.ContributorReference(source=data['Source'][ref],
                                                contributor=contributor))

        contrib = data.add(models.Inventory,
                           row.InventoryID,
                           id=row.InventoryID,
                           language=lang,
                           source=row.Source,
                           source_url=source_urls.get(row.InventoryID),
                           internetarchive_url=ia_urls.get(row.InventoryID),
                           name=inventory_names[row.InventoryID],
                           description=row.LanguageName)

        DBSession.add(
            common.ContributionContributor(contribution=contrib,
                                           contributor=contributor))

        for j, squib in enumerate(squibs.get(row.InventoryID, [])):
            f = common.Contribution_files(object=contrib,
                                          id='squib-%s-%s.pdf' %
                                          (contrib.id, j + 1),
                                          name='Phonological squib',
                                          description=squib,
                                          mime_type='application/pdf')
            assert f
            # f.create(files_dir, file(args.data_file('phonological_squibs', src)).read())

    DBSession.flush()
    unknown_refs = {}

    for row in reader(args.data_file('phoible-phonemes.tsv'),
                      namedtuples=True):
        inventory = data['Inventory'][row.InventoryID]
        segment = data['Segment'].get(row.Phoneme)
        if not segment:
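            # describe a new segment by the Unicode names of its codepoints
            # and derive a stable id from a hash of that description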
            unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme]
            description = ' - '.join([t[1] for t in unicode_desc])
            segment = data.add(
                models.Segment,
                row.Phoneme,
                id=b16encode(md5(description).digest()),
                name=row.Phoneme,
                description=description,
                equivalence_class=''.join([
                    t[0] for t in unicode_desc
                    if t[1].split()[0] not in ['COMBINING', 'MODIFIER']
                ]),
                segment_class=row.Class,
                combined_class=row.CombinedClass)
            DBSession.flush()

        vs = common.ValueSet(id=row.PhonemeID,
                             contribution=inventory,
                             language=inventory.language,
                             parameter=segment)

        for ref in refs.get(row.InventoryID, []):
            if ref not in data['Source']:
                if ref not in unknown_refs:
                    print('-------', ref)
                unknown_refs[ref] = 1
                continue
            DBSession.add(
                common.ValueSetReference(source=data['Source'][ref],
                                         valueset=vs))

        DBSession.add(
            common.Value(
                id=row.PhonemeID,
                name='%s %s' %
                (row.Phoneme, data['Inventory'][row.InventoryID].name),
                valueset=vs))
        DBSession.flush()

    for inventory_id in refs:
        for ref in refs[inventory_id]:
            if ref not in data['Source']:
                continue
            data.add(common.ContributionReference,
                     '%s-%s' % (inventory_id, ref),
                     source=data['Source'][ref],
                     contribution=data['Inventory'][inventory_id])

    for i, row in enumerate(
            reader(args.data_file('phoible-segments-features.tsv'))):
        if i == 0:
            features = list(map(feature_name, row))
            continue

        if row[0] not in data['Segment']:
            # print('skipping feature vector:', row)
            continue
        for j, value in enumerate(row):
            if j and value != '0':
                DBSession.add(
                    common.Parameter_data(
                        key=features[j],
                        value=value,
                        ord=j,
                        object_pk=data['Segment'][row[0]].pk))
    DBSession.flush()
Beispiel #51
0
def test_slug():
    from clld.util import slug

    assert slug('A B. \xe4C') == 'abac'
Beispiel #52
0
def main(bib, mode):  # pragma: no cover
    count = 0
    skipped = 0

    with transaction.manager:
        provider_map = get_map(Provider)
        macroarea_map = get_map(Macroarea)
        doctype_map = get_map(Doctype)

        known_ids = set(r[0] for r in DBSession.query(Ref.pk))

        languoid_map = {}
        for l in DBSession.query(Languoid):
            if l.hid:
                languoid_map[l.hid] = l
            languoid_map[l.id] = l

        for i, rec in enumerate(bib):
            if len(rec.keys()) < 6:
                skipped += 1
                #print '---> skip', rec.id
                #print rec
                continue

            changed = False
            assert rec.get('glottolog_ref_id')
            id_ = int(rec.get('glottolog_ref_id'))
            if mode != 'update' and id_ in known_ids:
                continue
            ref = DBSession.query(Source).get(id_)
            update = True if ref else False

            kw = {
                'pk': id_,
                'bibtex_type': getattr(EntryType, rec.genre),
                'id': str(id_),
                'jsondata': {
                    'bibtexkey': rec.id
                },
            }

            for source, target in FIELD_MAP.items():
                value = rec.get(source)
                if value:
                    value = unescape(value)
                    if target:
                        kw[target] = CONVERTER.get(source, lambda x: x)(value)
                    else:
                        kw['jsondata'][source] = value

            # try to extract numeric year, startpage, endpage, numberofpages, ...
            if rec.get('numberofpages'):
                try:
                    kw['pages_int'] = int(rec.get('numberofpages').strip())
                except ValueError:
                    pass

            if kw.get('year'):
                match = YEAR_PATTERN.search(kw.get('year'))
                if match:
                    kw['year_int'] = int(match.group('year'))

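            # split 'Address: Publisher' style entries into separate address and publisher fields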
            if kw.get('publisher'):
                p = kw.get('publisher')
                if ':' in p:
                    address, publisher = [
                        s.strip() for s in kw['publisher'].split(':', 1)
                    ]
                    if 'address' not in kw or kw['address'] == address:
                        kw['address'], kw['publisher'] = address, publisher

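            # derive numeric page information from roman/arabic page counts or from page ranges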
            if kw.get('pages'):
                pages = kw.get('pages')
                match = ROMANPAGESPATTERNra.search(pages)
                if not match:
                    match = ROMANPAGESPATTERNar.search(pages)
                if match:
                    if 'pages_int' not in kw:
                        kw['pages_int'] = roman_to_int(match.group('roman')) \
                            + int(match.group('arabic'))
                else:
                    start = None
                    number = None
                    match = None

                    for match in PAGES_PATTERN.finditer(pages):
                        if start is None:
                            start = int(match.group('start'))
                        number = (number or 0) \
                            + (int(match.group('end')) - int(match.group('start')) + 1)

                    if match:
                        kw['endpage_int'] = int(match.group('end'))
                        kw['startpage_int'] = start
                        kw.setdefault('pages_int', number)
                    else:
                        try:
                            kw['startpage_int'] = int(pages)
                        except ValueError:
                            pass

            if update:
                for k in kw.keys():
                    if k == 'pk':
                        continue
                    #if k == 'title':
                    #    v = ref.title or ref.description
                    #else:
                    if 1:
                        v = getattr(ref, k)
                    if kw[k] != v:
                        #
                        # TODO!
                        #
                        setattr(ref, k, kw[k])
                        #if k not in ['jsondata', 'publisher']:
                        #    print k, ref.pk
                        #    print kw[k]
                        #    print v
                        #    print '--------------'
                        changed = True
                    if ref.title:
                        ref.description = ref.title
            else:
                changed = True
                ref = Ref(**kw)

            def append(attr, obj):
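                # note: assigning to changed here only binds a local variable;
                # the enclosing changed flag is not updated (cf. the TODO markers)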
                if obj and obj not in attr:
                    changed = True
                    #
                    # TODO!
                    #
                    attr.append(obj)

            for name in set(
                    filter(None, [
                        s.strip() for s in kw['jsondata'].get(
                            'macro_area', '').split(',')
                    ])):
                append(ref.macroareas, macroarea_map[name])

            for name in set(
                    filter(None, [
                        s.strip()
                        for s in kw['jsondata'].get('src', '').split(',')
                    ])):
                append(ref.providers, provider_map[slug(name)])

            for m in DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype',
                                                                 '')):
                append(ref.doctypes, doctype_map[m.group('name')])

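            # a bare three-letter code is wrapped in brackets so the CODE_PATTERN loop below picks it up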
            if len(kw['jsondata'].get('lgcode', '')) == 3:
                kw['jsondata']['lgcode'] = '[%s]' % kw['jsondata']['lgcode']

            for m in CODE_PATTERN.finditer(kw['jsondata'].get('lgcode', '')):
                for code in set(m.group('code').split(',')):
                    if code not in languoid_map:
                        if code not in ['NOCODE_Payagua', 'emx']:
                            print('--> unknown code:', code.encode('utf8'))
                    else:
                        append(ref.languages, languoid_map[code])

            for glottocode in filter(
                    None, kw['jsondata'].get('alnumcodes', '').split(';')):
                if glottocode not in languoid_map:
                    print('--> unknown glottocode:', glottocode.encode('utf8'))
                else:
                    append(ref.languages, languoid_map[glottocode])

            if not update:
                #pass
                #
                # TODO!
                #
                DBSession.add(ref)

            if i % 100 == 0:
                print(i, 'records done')

            if changed:
                count += 1

        print(count, 'records updated or imported')
        print(skipped, 'records skipped because of lack of information')