Example #1
 def merge(self, ex1, ex2):
     for prop in 'rf tx mb gl ft'.split():
         p1 = ex1.get(prop)
         p2 = ex2.get(prop)
         if p1:
             if p2:
                 try:
                     assert slug(p1) == slug(p2)
                 except AssertionError:
                     self.log.write(
                         '# cannot merge \\%s:\n%s\n# and\n%s\n\n' % (prop, ex1, ex2))
                     raise
         else:
             if p2:
                 ex1.set(prop, p2)
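The assertion only fires when the two records disagree after normalization: slug reduces each value to a bare ASCII token, so differences in case, whitespace or punctuation are not treated as conflicts. A minimal sketch of that comparison, assuming slug here is clldutils.misc.slug (which lowercases and keeps only ASCII letters and digits by default):

# Sketch only, assuming clldutils.misc.slug semantics.
from clldutils.misc import slug

# Both spellings normalize to 'riffberber', so merge() would accept them
# as the same value; genuinely different values would still raise.
assert slug('Riff-Berber') == slug('riff berber') == 'riffberber'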
Example #2
def bibtex2source(rec, cls=common.Source):
    year = bibtex.unescape(rec.get("year", "nd"))
    fields = {}
    jsondata = {}
    for field in bibtex.FIELDS:
        if field in rec:
            value = bibtex.unescape(rec[field])
            container = fields if hasattr(cls, field) else jsondata
            container[field] = value

    etal = ""
    eds = ""
    authors = rec.get("author")
    if not authors:
        authors = rec.get("editor", "")
        if authors:
            eds = " (eds.)"
    if authors:
        authors = bibtex.unescape(authors).split(" and ")
        if len(authors) > 2:
            authors = authors[:1]
            etal = " et al."

        authors = [HumanName(a) for a in authors]
        authors = [n.last or n.first for n in authors]
        authors = "%s%s%s" % (" and ".join(authors), etal, eds)

    return cls(
        id=slug(rec.id),
        name=("%s %s" % (authors, year)).strip(),
        description=bibtex.unescape(rec.get("title", rec.get("booktitle", ""))),
        jsondata=jsondata,
        bibtex_type=rec.genre,
        **fields
    )
Example #3
def add_identifier(languoid, data, name, type, description, lang='en'):
    if len(lang) > 3:
        # Weird stuff introduced via hhbib_lgcode names. Roll back language parsing.
        name, lang = '{0} [{1}]'.format(name, lang), 'en'
    identifier = data['Identifier'].get((name, type, description, lang))
    if not identifier:
        identifier = data.add(
            common.Identifier,
            (name, type, description, lang),
            id='{0}-{1}-{2}-{3}'.format(
                slug(name), slug(type), slug(description or ''), lang),
            name=name,
            type=type,
            description=description,
            lang=lang)
    DBSession.add(common.LanguageIdentifier(language=languoid, identifier=identifier))
Example #4
def make_index(level, repos=None):
    fname = dict(
        language='languages', family='families', dialect='dialects')[level.name]
    links = defaultdict(dict)
    for lang in walk_tree(tree=languoids_path('tree', repos=repos)):
        if lang.level == level:
            label = '{0.name} [{0.id}]'.format(lang)
            if lang.iso:
                label += '[%s]' % lang.iso
            links[slug(lang.name)[0]][label] = \
                lang.dir.joinpath(lang.fname('.ini'))\
                    .relative_to(languoids_path(repos=repos))

    res = [languoids_path(fname + '.md', repos=repos)]
    with res[0].open('w', encoding='utf8') as fp:
        fp.write('## %s\n\n' % fname.capitalize())
        fp.write(' '.join(
            '[-%s-](%s_%s.md)' % (i.upper(), fname, i) for i in sorted(links.keys())))
        fp.write('\n')

    for i, langs in links.items():
        res.append(languoids_path('%s_%s.md' % (fname, i), repos=repos))
        with res[-1].open('w', encoding='utf8') as fp:
            for label in sorted(langs.keys()):
                fp.write('- [%s](%s)\n' % (label, langs[label]))
    return res
Example #5
    def from_csv(cls, row, data=None, description=None):
        obj = cls(**{n: row[i] for i, n in enumerate(cls.__csv_head__) if '__' not in n and n != 'audio'})
        if not slug(row[1]):
            obj.active = False
        row = dict(list(zip(cls.__csv_head__, row)))
        sid = row['taxa__id']
        lid = row['languages__id']
        vsid = '%s-%s' % (sid, lid)
        if vsid in data['ValueSet']:
            obj.valueset = data['ValueSet'][vsid]
        else:
            # Note: source and references are dumped redundantly with each word, so we
            # only have to recreate these if a new ValueSet had to be created.
            obj.valueset = data.add(
                ValueSet, vsid,
                id=vsid,
                parameter=data['Taxon'][sid],
                language=data['Languoid'][lid],
                contribution=data['Contribution']['tsammalex'])

        if row['refs__ids']:
            for i, rid, pages in parse_ref_ids(row['refs__ids']):
                data.add(
                    NameReference, '%s-%s' % (obj.id, i),
                    name=obj,
                    source=data['Bibrec'][rid],
                    description=pages or None)
        for rel, cls in [
            ('categories', 'Category'),
            ('habitats', 'Category'),
            ('uses', 'Use')
        ]:
            for id_ in split_ids(row[rel + '__ids']):
                getattr(obj, rel).append(data[cls][id_.strip()])
        return obj
Example #6
File: util.py Project: clld/clld
def bibtex2source(rec, cls=common.Source, lowercase_id=False):
    year = bibtex.unescape(rec.get('year', 'nd'))
    fields = {}
    jsondata = {}
    for field in bibtex.FIELDS:
        if field in rec:
            value = bibtex.unescape(rec[field])
            container = fields if hasattr(cls, field) else jsondata
            container[field] = value

    etal = ''
    eds = ''
    authors = rec.get('author')
    if not authors:
        authors = rec.get('editor', '')
        if authors:
            eds = ' (eds.)'
    if authors:
        authors = bibtex.unescape(authors).split(' and ')
        if len(authors) > 2:
            authors = authors[:1]
            etal = ' et al.'

        authors = [HumanName(a) for a in authors]
        authors = [n.last or n.first for n in authors]
        authors = '%s%s%s' % (' and '.join(authors), etal, eds)

    return cls(
        id=slug(rec.id, lowercase=lowercase_id),
        name=('%s %s' % (authors, year)).strip(),
        description=bibtex.unescape(rec.get('title', rec.get('booktitle', ''))),
        jsondata=jsondata,
        bibtex_type=rec.genre,
        **fields)
Example #7
 def update_name(self, lid, newname, other=None):
     lpk = self.pk(Language, lid)
     self.update(Language, dict(name=newname), pk=lpk)
     self.update(
         WalsLanguage, dict(ascii_name=slug(newname, remove_whitespace=False)), pk=lpk)
     if other:
         ipk = self.insert(Identifier, name=other, description='other', type='name')
         self.insert(LanguageIdentifier, identifier_pk=ipk, language_pk=lpk)
Example #8
def adapter_factory(*args, **kw):
    # for backwards compatibility we interpret the first positional argument as template:
    if args:
        kw['template'] = args[0]
    assert 'template' in kw
    kw.setdefault('mimetype', 'text/html')
    kw.setdefault('extension', 'html')
    base = kw.pop('base', Representation)
    return type(str('AdapterFromFactory%s' % slug(text_type(uuid4()))), (base,), kw)
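adapter_factory builds a Representation subclass on the fly, using slug on a fresh UUID to give each generated class a unique name. A hedged usage sketch of the factory defined above (the template path is a made-up placeholder):

# Hypothetical usage; the template path is illustrative only.
cls = adapter_factory('parameter/detail_html.mako')
assert issubclass(cls, Representation)
assert cls.__name__.startswith('AdapterFromFactory')
assert cls.mimetype == 'text/html' and cls.extension == 'html'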
Example #9
def florafauna(req):
    note_ids = [
        'fish_notes_Mali_JH',
        'flora_notes_Mali_JH',
        'insect_arthropod_mollusc_notes_Mali_JH',
        'mammal_notes_Mali_JH',
        'reptile_notes_Mali_JH',
        'bird_notes_Mali_JH',
    ]
    return {
        'notes': [Source.get(slug(sid)) for sid in note_ids]
    }
Example #10
def other(req):
    jenaama = 'Heath2016-Jenaama-lexicon Heath2016-JenaamaBozo'.split()
    rows = [
        ["Tieyaxo", "Tigemaxo", "boz", "tiey1235"],
        ["Tiema Cewe", "Tiema Ce", "boo", "tiem1235"],
        ["Kelenga", "Hainyaxo", "bsx", "hain1253"],
        ["Jenaama", "Sorogaana", "bze", "jena1242"],
    ]
    return {
        'rows': rows,
        'jenaama': [Source.get(slug(sid)) for sid in jenaama]
    }
Example #11
def nexus_slug(s):
    """
    Converts a string to a nexus "safe" representation (i.e. removes
    many unicode characters and some punctuation characters).

    Parameters
    ----------
    s : str
        A string to convert to a nexus safe format.

    Returns
    -------
    s : str
        A string containing a nexus safe label.
    """
    return slug(s, lowercase=False, remove_whitespace=False).replace(" ", "_")
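Assuming slug is clldutils.misc.slug, diacritics are stripped via Unicode normalization, punctuation is dropped, case is preserved, and the remaining spaces are then turned into underscores. A sketch with a sample label:

# Sketch; the expected output assumes clldutils.misc.slug semantics.
print(nexus_slug('Tiéfo (Daramandougou)'))  # -> 'Tiefo_Daramandougou'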
Example #12
def to_cldf(wordlist, path='cldf', source_path=None, ref="cogid",
        segments="tokens", form="ipa", note='note', form_in_source="value",
        source=None, alignment=None):
    """Convert a wordlist in LingPy to CLDF.
    
    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A regular Wordlist object (or similar).
    path : str (default='cldf')
        The name of the directory to which the files will be written.
    source_path : str (default=None)
        If available, specify the path of your BibTeX file with the sources.
    ref : str (default="cogid")
        The column in which the cognate sets are stored.
    segments : str (default="tokens")
        The column in which the segmented phonetic strings are stored.
    form : str (default="ipa")
        The column in which the unsegmented phonetic strings are stored.
    note : str (default="note")
        The column in which you store your comments.
    form_in_source : str (default="value")
        The column in which you store the original form in the source.
    source : str (default=None)
        The column in which you store your source information.
    alignment : str (default=None)
        The column in which you store the alignments.
    """
    if not cldf:
        raise ValueError('The package pycldf needs to be installed')

    # create cldf-dataset
    ds = CLDF_Wordlist.in_dir(path)
    # add sources if they are available
    ds.add_sources(
            read_text(source_path) if source_path else '')
    # add components
    ds.add_component('LanguageTable')
    ds.add_component('ParameterTable', 'Concepticon_ID')
    ds.add_component('CognateTable')
    ds.add_columns('FormTable', 'form_in_source')

    languages, parameters, forms, cognates = {}, {}, [], []
    for idx in wordlist:
        lid = slug(wordlist[idx, 'doculect'])
        if lid not in languages:
            languages[lid] = dict(
                    ID=lid,
                    Name=wordlist[idx, 'doculect'],
                    Glottocode = wordlist[idx, 'glottocode'])

        pid = wordlist[idx, 'concepticon_id'] or slug(wordlist[idx, 'concept'])
        if pid not in parameters:
            parameters[pid] = dict(
                ID=pid,
                Name=wordlist[idx, 'concept'],
                Concepticon_ID=wordlist[idx, 'concepticon_id'])

        forms.append(dict(
            ID=str(idx),
            Language_ID=lid,
            Parameter_ID=pid,
            form_in_source=wordlist[idx, form_in_source] or '' if form_in_source else '',
            Form=wordlist[idx, form] or '' if form else '',
            Segments=wordlist[idx, segments] or '' if segments else '',
            Source=[wordlist[idx, source]] or [] if source else [],
            Comment=wordlist[idx, note] or '' if note else ''))

        if ref:
            cognates.append(dict(ID=str(idx), Form_ID=str(idx),
                Cognateset_ID=wordlist[idx, ref], Alignment=wordlist[idx,
                    alignment] or [''] if alignment else ['']))

    ds.write(
        FormTable=forms,
        LanguageTable=languages.values(),
        ParameterTable=parameters.values(),
        CognateTable=cognates)
Example #13
def lff2tree(lff, tree=None, outdir='fromlff'):
    out = Path(outdir)
    out.mkdir()
    old_tree = {l.id: l for l in languoids_from_tree(tree)} if tree else {}

    nodes = set()
    languages = {}
    for lang in read_lff(lff, 'language'):
        groupdir = out
        languages[lang.id] = lang

        for name, id_, level in lang.lineage:
            groupdir = groupdir.joinpath('%s.%s' % (slug(name), id_))
            if not groupdir.exists():
                groupdir.mkdir()
                if id_ in old_tree:
                    group = old_tree[id_]
                    assert group.level == level
                    if name != group.name:
                        # rename a subgroup!
                        group.name = name
                    group.write_info(groupdir)
                else:
                    # TODO: create Languoid, write info file!
                    pass

            assert id_ in old_tree
            nodes.add(id_)

        assert lang.id in old_tree
        nodes.add(lang.id)
        old_lang = old_tree[lang.id]
        assert old_lang.level == lang.level
        if old_lang.name != lang.name:
            old_lang.name = lang.name
        langdir = groupdir.joinpath(lang.fname())
        langdir.mkdir()
        old_lang.write_info(langdir)

    for lang in read_lff(lff.replace('lff', 'dff'), 'dialect'):
        groupdir = out

        if not lang.lineage:
            # TODO: handle error of un-attached dialects!
            continue

        for name, id_, level in languages[lang.lineage[0][1]].lineage + lang.lineage:
            groupdir = groupdir.joinpath('%s.%s' % (slug(name), id_))
            if not groupdir.exists():
                groupdir.mkdir()
                if id_ in old_tree:
                    group = old_tree[id_]
                    assert group.level == level
                    if name != group.name:
                        # rename a subgroup!
                        group.name = name
                    group.write_info(groupdir)
                else:
                    # TODO: create Languoid, write info file!
                    pass

            assert id_ in old_tree
            nodes.add(id_)

        assert lang.id in old_tree
        nodes.add(lang.id)
        old_lang = old_tree[lang.id]
        assert old_lang.level == lang.level
        if old_lang.name != lang.name:
            old_lang.name = lang.name
        langdir = groupdir.joinpath(lang.fname())
        langdir.mkdir()
        old_lang.write_info(langdir)

    print(len(nodes))
Example #14
 def contribution_id(self):
     return '{0}-{1}'.format(self.contributor_id, slug(self.review_title))
Example #15
File: util.py Project: clld/csd
def normalize_sid(sid):
    return slug(sid.replace('+', 'and').replace('&', 'and'))
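A hedged example of the normalization, assuming the default slug behaviour (lowercase, punctuation and whitespace removed): '+' and '&' are spelled out as 'and' first, so variant spellings of the same source id collapse to a single key.

# Sketch: both spellings end up as the same normalized source id.
assert normalize_sid('Smith+Jones') == normalize_sid('Smith & Jones') == 'smithandjones'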
Example #16
 def words(s):
     return set(slug(s.strip(), remove_whitespace=False).split())
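This helper turns a title into a set of slugged, lowercased words; the gbs_func and ia_func examples further below (#28 and #29) use such word sets for fuzzy subset matching of titles. A sketch of the expected result, assuming the default slug behaviour:

# Sketch: punctuation is dropped and words are lowercased before set-building.
assert words('A Grammar of Supyire.') == {'a', 'grammar', 'of', 'supyire'}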
Example #17
 def parameter_id(self):
     return slug(self.parameter)
Example #18
def main(args):
    #
    # order of init:
    # - villages
    # - files
    # - movies
    #
    videos = defaultdict(list)
    for f in util.iter_files(args):
        obj = models.File(**attr.asdict(f))
        if obj.mime_type.startswith('video'):
            videos[slug(obj.name.split('.')[0])].append(obj)
        DBSession.add(obj)

    lexicon = list(util.iter_lexicon(args))
    villages = util.get_villages(args)
    ff_images = list(util.ff_images(args))
    bib = list(util.get_bib(args))
    data = Data()

    dataset = common.Dataset(
        id=dogonlanguages.__name__,
        name="Dogon and Bangime Linguistics",
        contact="*****@*****.**",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='dogonlanguages.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'}
    )
    DBSession.add(dataset)

    if Glottolog:
        if socket.gethostname() == 'dlt5502178l':
            glottolog = Glottolog(
                Path(dogonlanguages.__file__).parent.parent.parent.parent.joinpath(
                    'glottolog3', 'glottolog'))
        else:
            glottolog = Glottolog(
                Path(dogonlanguages.__file__).parent.parent.parent.parent.joinpath(
                    'glottolog'))
        languoids = {l.id: l for l in glottolog.languoids()}
    else:
        languoids = {}
    print('got glottolog')

    for c in util.CONTRIBUTORS:
        id_ = slug(c.name.split()[-1])
        data.add(models.Member, id_, id=id_, **attr.asdict(c))
    data.add(
        models.Member, 'forkel',
        id='forkel',
        name='Robert Forkel',
        email='*****@*****.**',
        in_project=False)

    for i, id_ in enumerate(['moran', 'forkel', 'heath']):
        DBSession.add(common.Editor(
            dataset=dataset, ord=i + 1, contributor=data['Member'][id_]))

    contrib = data.add(common.Contribution, 'd', id='d', name='Dogon Languages')
    for doc in bib:
        obj = data.add(
            models.Document,
            doc.rec.id,
            _obj=bibtex2source(doc.rec, cls=models.Document))
        keywords = nfilter([s.strip() for s in doc.rec.get('keywords', '').split(',')])
        for dt in 'grammar lexicon typology texts'.split():
            if dt in keywords:
                obj.doctype = dt
                break
        obj.project_doc = ('DLP' in keywords) or bool(doc.files)
        if obj.project_doc:
            for i, cid in enumerate(util.get_contributors(doc.rec, data)):
                models.DocumentContributor(
                    document=obj, contributor=data['Member'][cid], ord=i)
        for i, (path, cdstar) in enumerate(doc.files):
            common.Source_files(
                id='%s-%s' % (obj.id, i + 1),
                name=path,
                object=obj,
                mime_type=guess_type(path)[0],
                jsondata=cdstar,
            )

    print('got bib')

    for name, (gc, desc) in LANGUAGES.items():
        gl_lang = languoids[gc]
        lat, lon = gl_lang.latitude, gl_lang.longitude
        lang = data.add(
            models.Languoid, gc,
            id=gc,
            name=name,
            description=desc,
            latitude=lat,
            longitude=lon,
            family=gl_lang.family.name if gl_lang and gl_lang.family else name,
        )
        if name == 'Penange' and lang.longitude > 0:
            lang.longitude = -lang.longitude
        if name == 'Bankan Tey':
            lang.latitude, lang.longitude = 15.07, -2.91
        if name == 'Ben Tey':
            lang.latitude, lang.longitude = 14.85, -2.95
        if name == 'Togo Kan':
            lang.latitude, lang.longitude = 14.00, -3.25
        add_language_codes(data, lang, gl_lang.iso, glottocode=gc)

    villages_by_name = defaultdict(list)
    contrib_by_initial = {c.abbr: c for c in data['Member'].values()}
    for i, village in enumerate(villages):
        lang = None
        if village.glottocode:
            lang = data['Languoid'].get(village.glottocode)
            if not lang:
                gl_lang = languoids[village.glottocode]
                lang = data.add(
                    models.Languoid, gl_lang.id,
                    id=gl_lang.id,
                    name=gl_lang.name,
                    in_project=False,
                    family=gl_lang.family.name if gl_lang.family else gl_lang.name)
        v = data.add(
            models.Village, str(i + 1),
            id=str(i + 1),
            name=village.name,
            description=village.data.pop('social info'),
            surnames=village.data.pop('surnames'),
            major_city=village.data['MajorCity'] == 'Y',
            transcribed_name=village.data.pop('Transcribed Village Name'),
            source_of_coordinates=village.data.pop('sourceOfCoordinates'),
            latitude=village.lat,
            longitude=village.lon,
            languoid=lang,
            jsondata=village.data,
        )
        villages_by_name[village.name].append(v)
        for img in village.images:
            mimetype = guess_type(img.name)[0]
            if mimetype:
                f = models.Village_files(
                    id=img.id,
                    name=img.name,
                    description=img.description,
                    date_created=img.date,
                    latitude=img.coords[0] if img.coords else None,
                    longitude=-img.coords[1] if img.coords else None,
                    object=v,
                    mime_type=mimetype,
                    jsondata=img.cdstar,
                )
                for initial in img.creators:
                    if initial in contrib_by_initial:
                        models.Fotographer(
                            foto=f, contributor=contrib_by_initial[initial])

    for cat, desc, place, name in MOVIES:
        s = slug(name)
        m = models.Movie(
            id=s,
            name=desc,
            description=cat,
            place=place,
        )
        if place in villages_by_name and len(villages_by_name[place]) == 1:
            m.village = villages_by_name[place][0]
            #print('found village: %s' % name)
        for v in videos[s]:
            #print('found video: %s' % name)
            v.movie = m
            m.duration = v.duration

    names = defaultdict(int)
    for concept in lexicon:
        add(concept, data, names, contrib)

    count = set()
    for img in ff_images:
        if img.id in count:
            continue
        count.add(img.id)
        if img.ref:
            if img.ref in data['Concept']:
                concept = data['Concept'][img.ref]
                if img.tsammalex_taxon and not concept.tsammalex_taxon:
                    concept.tsammalex_taxon = img.tsammalex_taxon
                    #print(concept.tsammalex_taxon)
                common.Parameter_files(
                    object=concept,
                    id=img.id,
                    name=img.name.decode('utf8'),
                    mime_type=guess_type(img.name)[0],
                    jsondata=img.cdstar)
            else:
                print('missing ref: %s' % img.ref)
Example #19
 def id(self):
     res = self.get('ref')
     if not res:
         res = md5(slug(self.text + self.translation).encode('utf')).hexdigest()
         self.insert(0, ('ref', res))
     return res
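When a record carries no explicit ref, the id is derived deterministically: text plus translation are slugged and hashed, so the same example always maps to the same 32-character identifier. A minimal sketch of just that fallback step, assuming clldutils.misc.slug:

# Sketch: reproduces only the fallback-id computation used above.
from hashlib import md5
from clldutils.misc import slug

def fallback_ref(text, translation):
    return md5(slug(text + translation).encode('utf8')).hexdigest()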
Example #20
def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    data = Data()
    concept_list = Concepticon(CONCEPTICON_REPOS).conceptlist('Key-2016-1310')

    def concepticon_id(ids_code):
        for item in concept_list:
            if item['IDS_ID'] == ids_code:
                return int(item['CONCEPTICON_ID']) if item['CONCEPTICON_ID'] else None

    def read(table):
        fname = args.data_file(table + '.all.csv')
        if not fname.exists():
            fname = args.data_file(table + '.csv')
        return list(dsv.reader(fname, namedtuples=True))

    dataset = common.Dataset(
        id=ids.__name__,
        name="IDS",
        description="The Intercontinental Dictionary Series",
        published=date(2015, 5, 25),
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name':
                'Creative Commons Attribution 4.0 International License',
        },
        domain='ids.clld.org')

    DBSession.add(dataset)

    for rec in Database.from_file(args.data_file('sources.bib'), lowercase=True):
        if rec.id not in data['Source']:
            data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    DBSession.flush()

    data_desc = defaultdict(dict)
    for l in read('x_lg_data'):
        data_desc[l.lg_id][l.map_ids_data] = l.header

    # language lang
    iso_codes = {l.id: l.sil_code for l in read('sil_lang')}
    iso_codes = {l.lg_id: iso_codes[l.sil_id] for l in read('x_lg_sil')}
    languages = []

    exclude = []
    for l in read('lang'):
        if l.status == '1':
            exclude.append(l.lg_id)
            continue
        lang_changed = LANGS.get(int(l.lg_id), {})
        code = lang_changed.get('glotto') or lang_changed.get('iso') or iso_codes.get(l.lg_id)
        lang = data.add(models.IdsLanguage, l.lg_id, id=l.lg_id, name=lang_changed.get('name', l.lg_name))
        if code:
            languages.append((code, lang))
        data.add(
            models.Dictionary, l.lg_id,
            id=l.lg_id, name=l.lg_name,
            language=lang,
            default_representation=data_desc[l.lg_id].get('1'),
            alt_representation=data_desc[l.lg_id].get('2'),
            jsondata=dict(status=l.status, date=l.date))

    iso2glotto = {}
    for l in walk_tree(tree=languoids_path('tree', GLOTTOLOG_REPOS)):
        if l.iso:
            iso2glotto[l.iso] = l.id

    load_families(
        Data(), [(iso2glotto.get(c, c), l) for c, l in languages], glottolog=Glottolog(GLOTTOLOG_REPOS), isolates_icon='tcccccc')

    contributors = defaultdict(list)
    sources = defaultdict(list)
    for l in read('lang_compilers'):
        if l.lg_id in exclude:
            continue
        if l.name == "BIBIKO":
            continue
        #name	lg_id	what_did_id
        if int(l.what_did_id) in models.ROLES:
            contributors[slug(l.name)].append((l.name, int(l.what_did_id), l.lg_id))
        else:
            assert int(l.what_did_id) in [4, 395]
            sources[l.name].append(l.lg_id)

    for s, roles in contributors.items():
        name = roles[0][0]
        c = data.add(common.Contributor, s, id=s, name=name)
        if name == 'Mary Ritchie Key':
            c.address = 'University of California, Irvine'
        for lg, specs in groupby(sorted(roles, key=lambda r: r[2]), key=lambda r: r[2]):
            sroles = sorted(
                [s[1] for s in specs],
                reverse=True,
                key=lambda what: what + 2 if what == 2 else what)
            what = sroles[0]
            DBSession.add(common.ContributionContributor(
                contribution=data['Dictionary'][lg],
                contributor=c,
                ord=what,
                primary=what == 2))

    data.add(
        common.Contributor, 'bernardcomrie',
        id='bernardcomrie',
        name="Bernard Comrie",
        address="University of California, Santa Barbara")

    for i, editor in enumerate(['maryritchiekey', 'bernardcomrie']):
        common.Editor(dataset=dataset, contributor=data['Contributor'][editor], ord=i + 1)

    #for i, name in enumerate(sorted(sources.keys())):
    #    c = data.add(common.Source, name, id=str(i + 1), name=name, description=name)

    DBSession.flush()
    for name, lgs in sources.items():
        for _src in name.split(';'):
            src = data['Source'].get(_src.strip())
            if not src:
                print('-- missing source --', _src)
                raise ValueError
            for lg in lgs:
                if lg in exclude:
                    continue
                assert lg in data['Dictionary']
                DBSession.add(common.ContributionReference(
                    contribution_pk=data['Dictionary'][lg].pk, source_pk=src.pk))

    altnames = {}
    for i, l in enumerate(read('alt_names')):
        if l.name in altnames:
            identifier = altnames[l.name]
        else:
            identifier = data.add(
                common.Identifier, l.name,
                id='name-%s' % i, type='name', name=l.name, description='IDS')
            altnames[l.name] = identifier
        if l.lg_id not in exclude and l.name != data['IdsLanguage'][l.lg_id].name:
            DBSession.add(common.LanguageIdentifier(
                identifier=identifier,
                language=data['IdsLanguage'][l.lg_id]))

    # parameter chapter/entry
    for l in read('chapter'):
        data.add(models.Chapter, l.chap_id, id=l.chap_id, name=l.chap_title)

    entries = {}
    for l in read('entry'):
        id_ = '%s-%s' % (l.chap_id, l.entry_id)
        name = l.trans_english
        if name in entries:
            entries[name] += 1
            name = name + ' (%s)' % entries[name]
        else:
            entries[name] = 1
        kw = {
            'id': id_,
            'name': name,
            'concepticon_id': concepticon_id(id_),
            'chapter': data['Chapter'][l.chap_id]}
        for ll in 'french russian spanish portugese'.split():
            kw[ll] = getattr(l, 'trans_' + ll)
        data.add(models.Entry, id_, sub_code=l.entry_id, **kw)

    misaligned = []

    DBSession.flush()
    for entity in 'IdsLanguage Entry Chapter Dictionary'.split():
        for k in list(data[entity].keys()):
            data[entity][k] = data[entity][k].pk

    synsets = set()
    counterparts = set()
    problems = defaultdict(list)

    for lg_id, entries in groupby(
            sorted(read('ids'), key=lambda t: t.lg_id), lambda k: k.lg_id):
        if lg_id in exclude or not lg_id:
            continue

        # keep the memory footprint reasonable
        transaction.commit()
        transaction.begin()

        language = common.Language.get(data['IdsLanguage'][lg_id])
        desc = data_desc.get(lg_id, {})
        words = defaultdict(list)
        for l in entries:
            if empty.match(l.data_1):
                continue

            entry_id = '%s-%s' % (l.chap_id, l.entry_id)
            if entry_id not in data['Entry']:
                continue
                #data.add(
                #    models.Entry, entry_id,
                #    id=entry_id,
                #    name=entry_id,
                #    concepticon_id=concepticon_id(entry_id),
                #    sub_code=l.entry_id,
                #    chapter_pk=data['Chapter'][l.chap_id])
                #DBSession.flush()
                #data['Entry'][entry_id] = data['Entry'][entry_id].pk

            id_ = '%s-%s' % (entry_id, l.lg_id)
            if id_ in synsets:
                vs = models.Synset.get(id_)
            else:
                vs = models.Synset(
                    id=id_,
                    comment=get_string(l.comment or ''),
                    alt_representation=get_string(l.data_2),
                    language=language,
                    contribution_pk=data['Dictionary'][l.lg_id],
                    parameter_pk=data['Entry'][entry_id])
                synsets.add(id_)

            trans1 = list(split_counterparts(l.data_1))
            trans2 = None if empty.match(l.data_2) else list(split_counterparts(l.data_2))

            if trans2:
                if len(trans2) != len(trans1):
                    if language.id != '238':
                        misaligned.append((l.chap_id, l.entry_id, l.lg_id))
                        #print('===', language.id, language.name)
                        #print(l.data_1)
                        #print(l.data_2)
                    # 83 cases of misaligned transcriptions
                    trans2 = None

            for i, word in enumerate(trans1):
                cid = id_ + '-' + str(i + 1 + len(vs.values))
                if cid not in counterparts:
                    v = models.Counterpart(
                        id=cid,
                        name=word,
                        description=desc.get('1'),
                        valueset=vs)
                    words[word].append((v, trans2[i] if trans2 else None))
                    counterparts.add(cid)
                else:
                    print(cid)
                    #12 - 420 - 811 - 3
                    #5 - 390 - 818 - 3
                    #2 - 930 - 819 - 3
                    #2 - 930 - 819 - 3
                    #3 - 120 - 819 - 3
                    #10 - 140 - 822 - 3
                    #9 - 160 - 825 - 3
                    #2 - 430 - 829 - 4

        for i, form in enumerate(words.keys()):
            # Since we identify words based on their string representation, we have to
            # make sure a word has the same alternative transcription for all meanings.
            if language.id == '238':
                alt_names = []
            else:
                alt_names = set(norm(w[1] or '', desc.get('2'), language.id)
                                for w in words[form])
            alt_names = nfilter(alt_names)
            try:
                assert len(alt_names) <= 1
            except AssertionError:
                problems[(language.id, language.name)].append(alt_names)
            word = models.Word(
                id='%s-%s' % (language.id, i + 1),
                name=form,
                description=desc.get('1'),
                language=language,
                alt_name=', '.join(alt_names) if alt_names else None,
                alt_description=desc.get('2')
            )
            for v, _ in words[form]:
                word.counterparts.append(v)
            DBSession.add(word)

        DBSession.flush()

    with dsv.UnicodeWriter(args.data_file('misaligned.csv')) as fp:
        fp.writerows(misaligned)

    # about 250 cases where alternative transcriptions do not covary across meanings.
    for k, v in problems.items():
        print(k, len(v))
Example #21
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=dictionaria.__name__,
        name="Dictionaria",
        description="The Dictionary Journal",
        published=date(2015, 10, 1),
        contact='*****@*****.**',
        domain='dictionaria.clld.org',
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    ed = data.add(
        common.Contributor, 'hartmanniren', id='hartmanniren', name='Iren Hartmann')
    common.Editor(dataset=dataset, contributor=ed)
    DBSession.add(dataset)

    for id_, name in LGR_ABBRS.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))

    comparison_meanings = {}
    comparison_meanings_alt_labels = {}

    print('loading concepts ...')

    concepticon = Concepticon()
    for i, concept_set in enumerate(concepticon.resources('parameter').members):
        concept_set = concepticon.resource(concept_set)
        cm = ComparisonMeaning(
            id=concept_set.id,
            name=concept_set.name.lower(),
            description=concept_set.description,
            concepticon_url='%s' % concept_set.uriref)
        DBSession.add(cm)
        comparison_meanings[cm.name] = cm
        for label in concept_set.alt_labels:
            comparison_meanings_alt_labels.setdefault(label.lower(), cm)

    DBSession.flush()

    print('... done')

    comparison_meanings = {k: v.pk for k, v in comparison_meanings.items()}
    comparison_meanings_alt_labels = {
        k: v.pk for k, v in comparison_meanings_alt_labels.items()}

    submissions = []

    for submission in REPOS.joinpath('submissions').glob('*'):
        if not submission.is_dir():
            continue

        try:
            submission = Submission(submission)
        except ValueError:
            continue

        md = submission.md
        id_ = submission.id
        lmd = md['language']

        language = data['Variety'].get(lmd['glottocode'])
        if not language:
            language = data.add(
                Variety, lmd['glottocode'], id=lmd['glottocode'], name=lmd['name'])

        dictionary = data.add(
            Dictionary,
            id_,
            id=id_,
            name=lmd['name'] + ' Dictionary',
            language=language,
            published=date(*map(int, md['published'].split('-'))))

        for i, cname in enumerate(md['authors']):
            name = HumanName(cname)
            cid = slug('%s%s' % (name.last, name.first))
            contrib = data['Contributor'].get(cid)
            if not contrib:
                contrib = data.add(common.Contributor, cid, id=cid, name=cname)
            DBSession.add(common.ContributionContributor(
                ord=i + 1,
                primary=True,
                contributor=contrib,
                contribution=dictionary))

        submissions.append((dictionary.id, language.id, submission))
    transaction.commit()

    for did, lid, submission in submissions:
        try:
            mod = __import__(
                'dictionaria.loader.' + submission.id, fromlist=['MARKER_MAP'])
            marker_map = mod.MARKER_MAP
        except ImportError:
            marker_map = {}

        transaction.begin()
        print('loading %s ...' % submission.id)
        submission.load(
            did,
            lid,
            comparison_meanings,
            comparison_meanings_alt_labels,
            marker_map)
        transaction.commit()
        print('... done')

        #('hoocak', 'Hooca\u0328k', 43.5, -88.5, [('hartmanniren', 'Iren Hartmann')]),
        #('yakkha', 'Yakkha', 27.37, 87.93, [('schackowdiana', 'Diana Schackow')]),
        #('palula', 'Palula', 35.51, 71.84, [('liljegrenhenrik', 'Henrik Liljegren')], {}),
        #('daakaka', 'Daakaka', -16.27, 168.01, [('vonprincekilu', 'Kilu von Prince')],
        # {'published': date(2015, 9, 30), 'iso': 'bpa', 'glottocode': 'daka1243'}),
        #('teop', 'Teop', -5.67, 154.97, [('moselulrike', 'Ulrike Mosel')],
        # {'published': date(2015, 9, 30), 'iso': 'tio', 'glottocode': 'teop1238', 'encoding': 'latin1'}),

    transaction.begin()
    load_families(Data(), DBSession.query(Variety))
Example #22
def add_cultural_data(questionnaire_file_name, parameters, language):
    """ Parse the typological questionnaire into the database """
    contribution_text, parameter_descriptions, answers = parse_culture_questionnaire(
        os.path.join(DBPATH, questionnaire_file_name))

    # All ValueSets must be related to a contribution, so generate one from the metadata.
    contrib = common.Contribution(id='contrib'+newid(), name=contribution_text+newid())

    for p, parameter in parameter_descriptions.iterrows():
        # First, make sure that this parameter exists – either look it up or create it.
        pid = p.replace(".", "-")
        try:
            param, domain = parameters[pid]
        except KeyError:
            param = common.Parameter(
                id='culture'+pid,
                name=p,
                description=parameter['Question_text_English'],
                markup_description=parameter['Question_text_English'])
            domain = {}
            parameters[pid] = (param, domain)

        # Secondly, check whether we are aware that this answer is
        # valid already – otherwise we add its value to the domain,
        # and use that.
        # Note: Once we have a database, we can do better filtering
        # and constraining, and don't need to rely on reasonable data.
        answer = str(answers["Answer"][p])
        try:
            domain_element = domain[slug(answer)]
        except KeyError:
            try:
                numerical_value = int(answer)
            except ValueError:
                numerical_value = (1 if answer == "Y" or answer == 'True' else
                                   0 if answer == "N" or answer == 'False' else
                                   None)
            domain_element = common.DomainElement(
                id=param.id+slug(answer),
                description=answer,
                number=numerical_value,
                name=answer,
                parameter=param,
                abbr=answer,
                jsondata={'color': color(numerical_value)})
            DBSession.add(domain_element)
            try:
                DBSession.flush()
            except:
                print(domain, domain_element, language.name, pid, param.name)
            domain[slug(answer)] = domain_element

        # Now create the ValueSet, representing all values the
        # language has for this parameter
        vs = common.ValueSet(id='vs'+newid(),
                             language=language,
                             parameter=param,
                             jsondata=domain_element.jsondata,
                             contribution=contrib)

        # and fill in the actual values, which in this case is only
        # one. This object, and all objects it depends on, are then
        # scheduled for writing into the database.
        DBSession.add(common.Value(
            id='v'+newid(),
            valueset=vs,
            frequency=float(100),
            jsondata=domain_element.jsondata,
            domainelement=domain_element))
        # Execute all scheduled database updates.
        DBSession.flush()
Example #23
def parse_culture_questionnaire(filename):
    questionnaire = pandas.ExcelFile(filename)

    metadata_sheet_name = 'Metadata'
    if metadata_sheet_name in questionnaire.sheet_names:
        metadata_sheet = questionnaire.parse(metadata_sheet_name, header=None)
        try:
            metadata_blob = "; ".join(map(str, metadata_sheet.values))
        except KeyError:
            metadata_blob = ""
    else:
        metadata_blob = ""
    metadata = metadata_blob

    weird_rows = set()
    answers = pandas.DataFrame(columns = ["Answer", "Original Answer", "Notes"])
    features = pandas.DataFrame(columns = ["Domain", "Question_text_English", "Question_text_Indonesian", "Question_notes"])
    try:
        data_sheet = questionnaire.parse(
            0,
            skiprows=[0, 1, 2, 3, 4]).dropna('columns', 'all')
    except XLRDError:
        raise UnexpectedCultureFormatError(
            "Culture sheet did not have a 'Questionnaire' sheet")
    if "answer" in data_sheet.columns[5].lower():
        cols = list(data_sheet.columns)
        cols[5]="Original Answer"
        data_sheet.columns = cols
    else:
        raise UnexpectedCultureFormatError(
            "Expected Excel column F to have 'answer' in its header.")
        
    for i, row in data_sheet.iterrows():
        if pandas.isnull(row["Q_ID"]):
            # print(row)
            continue
        id = str(row["Q_ID"]).lower()
        if pandas.isnull(row[5]):
            # print(row)
            continue
        else:
            question = row[features.columns]
            question.name = id
            features = features.append(question)

            answer = row[answers.columns]
            if pandas.isnull(answer['Original Answer']):
                answer['Answer'] = None
            elif type(answer['Original Answer']) == int:
                answer['Answer'] = answer['Original Answer']
            elif slug(answer['Original Answer']) == 'yes':
                answer['Answer'] = True
            elif slug(answer['Original Answer']) == 'no':
                answer['Answer'] = False
            elif slug(answer['Original Answer']) == 'na':
                answer['Answer'] = None
            else:
                answer['Answer'] = answer['Original Answer']

            answer.name = id
            answers = answers.append(answer)

            assert(len(features) == len(answers))
    return metadata, features, answers
Example #24
def main(args):
    data = Data()
    data_path = lambda *cs: args.data_file('concepticon-data', 'concepticondata', *cs)

    dataset = common.Dataset(
        id=concepticon.__name__,
        name="Concepticon 1.0",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        contact='*****@*****.**',
        domain='concepticon.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)
    for i, name in enumerate(['Johann-Mattis List', 'Michael Cysouw', 'Robert Forkel']):
        c = common.Contributor(id=slug(name), name=name)
        dataset.editors.append(common.Editor(contributor=c, ord=i))

    english = data.add(
        common.Language, 'eng',
        id='eng',
        name='English')

    files = {}
    for fname in data_path('sources').iterdir():
        files[fname.stem] = \
            "https://github.com/clld/concepticon-data/blob/master/concepticondata/sources/%s" % fname.name

    for rec in Database.from_file(
            data_path('references', 'references.bib'), lowercase=True):
        source = data.add(common.Source, rec.id, _obj=bibtex2source(rec))
        if rec.id in files:
            DBSession.flush()
            DBSession.add(common.Source_files(
                mime_type='application/pdf',
                object_pk=source.pk,
                jsondata=dict(url=files[rec.id])))

    for concept in reader(data_path('concepticon.tsv'), namedtuples=True):
        data.add(
            models.ConceptSet,
            concept.ID,
            id=concept.ID,
            name=concept.GLOSS,
            description=concept.DEFINITION,
            semanticfield=concept.SEMANTICFIELD,
            ontological_category=concept.ONTOLOGICAL_CATEGORY)

    for rel in reader(data_path('conceptrelations.tsv'), namedtuples=True):
        DBSession.add(models.Relation(
            source=data['ConceptSet'][rel.SOURCE],
            target=data['ConceptSet'][rel.TARGET],
            description=rel.RELATION))

    unmapped = Counter()
    number_pattern = re.compile('(?P<number>[0-9]+)(?P<suffix>.*)')

    for cl in reader(data_path('conceptlists.tsv'), dicts=True):
        concepts = data_path('conceptlists', '%(ID)s.tsv' % cl)
        if not concepts.exists():
            continue
        langs = [l.lower() for l in split(cl['SOURCE_LANGUAGE'])]
        conceptlist = data.add(
            models.Conceptlist,
            cl['ID'],
            id=cl['ID'],
            name=' '.join(cl['ID'].split('-')),
            description=cl['NOTE'],
            target_languages=cl['TARGET_LANGUAGE'],
            source_languages=' '.join(langs),
            year=int(cl['YEAR']) if cl['YEAR'] else None,
        )
        for id_ in split(cl['REFS']):
            common.ContributionReference(
                source=data['Source'][id_], contribution=conceptlist)
        for i, name in enumerate(split(cl['AUTHOR'], sep=' and ')):
            name = strip_braces(name)
            contrib = data['Contributor'].get(name)
            if not contrib:
                contrib = data.add(
                    common.Contributor, name, id=slug(name), name=name)
            DBSession.add(common.ContributionContributor(
                ord=i, contribution=conceptlist, contributor=contrib))
        for k in 'ID NOTE TARGET_LANGUAGE SOURCE_LANGUAGE YEAR REFS AUTHOR'.split():
            del cl[k]
        DBSession.flush()
        for k, v in cl.items():
            DBSession.add(common.Contribution_data(
                object_pk=conceptlist.pk, key=k, value=v))

        for concept in reader(concepts, namedtuples=True):
            if not concept.ID or not concept.CONCEPTICON_ID or concept.CONCEPTICON_ID == 'NAN':
                #print conceptlist.id, getattr(concept, 'ENGLISH', getattr(concept, 'GLOSS', None))
                unmapped.update([conceptlist.id])
                continue

            lgs = {}
            for lang in langs:
                v = getattr(concept, lang.upper())
                if v:
                    lgs[lang] = v

            match = number_pattern.match(concept.NUMBER)
            if not match:
                print(concept.ID)
                raise ValueError
            vs = common.ValueSet(
                id=concept.ID,
                description=getattr(concept, 'GLOSS', getattr(concept, 'ENGLISH', None)),
                language=english,
                contribution=conceptlist,
                parameter=data['ConceptSet'][concept.CONCEPTICON_ID])
            d = {}
            for key, value in concept.__dict__.items():
                if not key.startswith('CONCEPTICON_') and \
                        key not in ['NUMBER', 'ID', 'GLOSS'] + [l.upper() for l in langs]:
                    d[key.lower()] = value
            v = models.Concept(
                id=concept.ID,
                valueset=vs,
                description=getattr(concept, 'GLOSS', None),  # our own gloss, if available
                name='; '.join('%s [%s]' % (lgs[l], l) for l in sorted(lgs.keys())),
                number=int(match.group('number')),
                number_suffix=match.group('suffix'),
                jsondata=d)
            DBSession.flush()
            for key, value in lgs.items():
                DBSession.add(
                    common.Value_data(key='lang_' + key, value=value, object_pk=v.pk))

    print('Unmapped concepts:')
    for clid, no in unmapped.most_common():
        print(clid, no)

    for fname in data_path('concept_set_meta').iterdir():
        if fname.suffix == '.tsv':
            md = load(fname.parent.joinpath(fname.name + '-metadata.json'))
            provider = models.MetaProvider(
                id=fname.stem,
                name=md['dc:title'],
                description=md['dc:description'],
                url=md['dc:source'],
                jsondata=md)
            for meta in reader(fname, dicts=True):
                try:
                    for k, v in meta.items():
                        if v and k != 'CONCEPTICON_ID':
                            models.ConceptSetMeta(
                                metaprovider=provider,
                                conceptset=data['ConceptSet'][meta['CONCEPTICON_ID']],
                                key=k,
                                value=v)
                except:
                    print(fname)
                    print(meta)
                    raise
Example #25
    def cmd_makecldf(self, args):

        wl = lingpy.Wordlist(
            self.raw_dir.joinpath("bodt-khobwa-cleaned.tsv").as_posix(),
            conf=self.raw_dir.joinpath("wordlist.rc").as_posix(),
        )
        args.writer.add_sources()

        concept_lookup = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = concept.id.split("-")[-1] + "_" + slug(concept.english)
            args.writer.add_concept(
                ID=idx,
                Name=concept.english,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            concept_lookup[concept.english] = idx

        args.writer.add_languages(lookup_factory="Name")

        mapper = {
            "pʰl": "pʰ l",
            "pɕ": "p ɕ",
            "pɕ~ɕ/pɕ": "p ɕ",
            "aw": "au",
            "ɛj": "ɛi",
            "ɔw": "ɔu",
            "bl": "b l",
            "aj": "ai",
            "ɔj": "ɔi",
            "(ŋ)": "ŋ",
            "kʰl": "kʰ l",
            "ej": "ei",
            "uj": "ui",
            "bɹ": "b ɹ",
            "ɐʰ": "ɐʰ/ɐ",
            "hw": "h w",
            "ɔːʰ": "ɔːʰ/ɔː",
            "dʑr": "dʑ r",
            "ow": "ou",
            "pl": "p l",
            "lj": "l j",
            "tʰj": "tʰ j",
            "aːʰ": "aːʰ/aː",
            "bj": "b j",
            "mp": "m p",
            "pɹ": "p ɹ",
            "ɐ̃ʰ": "ɐ̃ʰ/ɐ̃",
            "ɔ̃ʰ": "ɔ̃ʰ/ɔ̃",
            "aj~e/aj": "aj~e/ai",
            "aj~ej/ej": "aj~ej/ei",
            "kl": "k l",
            "kʰɹ": "kʰ ɹ",
            "ɛːʰ": "ɛːʰ/ɛː",
            "ɔʰ": "ɔʰ/ɔ",
            "tɹ": "t ɹ",
            "ɐːʰ": "ɐːʰ/ɐ",
            "br": "b r",
            "kɹ": "k ɹ",
            "kʰj": "kʰ j",
            "kʰr": "kʰ r",
            "gɹ": "g ɹ",
            "hj": "h j",
            "bl~gl/bl": "bl~gl/b l",
            "dj": "d j",
            "ej~i/ej": "ej~i/ei",
            "e~a/ej": "e~a/ei",
            "fl": "f l",
            "kʰw": "kʰ w",
            "mj": "m j",
            "pr": "p r",
            "pʰl~bl/pʰl": "pʰl~bl/pʰ l",
            "pʰr": "pʰ r",
            "pʰr~pʰl/pʰr": "pʰr~pʰl/pʰ r",
            "pʰw": "pʰ w",
            "pʰɹ": "pʰ ɹ",
            "tr": "t r",
            "tɕʰɹ": "tɕʰ ɹ",
            "tʰr": "tʰ r",
            "tʰw": "tʰ w",
            "dɾ": "d ɾ",
            "tɾ": "t ɾ",
            "zj": "z j",
            "ɔj~uj/uj": "ɔj~uj/ui",
        }

        for idx in pylexibank.progressbar(wl, desc="cldfify"):
            segments = " ".join([mapper.get(x, x) for x in wl[idx, "tokens"]])
            concept = concept_lookup.get(wl[idx, "concept"], "")
            lex = args.writer.add_form_with_segments(
                Language_ID=wl[idx, "doculect"],
                Parameter_ID=concept,
                Value=wl[idx, "form"],
                Form=wl[idx, "form"],
                Segments=segments.split(),
                Partial_Cognacy=" ".join([str(x) for x in wl[idx, "crossids"]]),
                Source=["Bodt2019"],
            )
            for morpheme_index, cogid in enumerate(wl[idx, "crossids"]):
                alignment = wl[idx, "alignment"].split(" + ")[morpheme_index].split()
                alignment = " ".join([mapper.get(x, x) for x in alignment]).split()
                if int(cogid):
                    args.writer.add_cognate(
                        lexeme=lex,
                        Cognateset_ID=cogid,
                        Morpheme_Index=morpheme_index + 1,
                        Alignment=alignment,
                    )
Example #26
 def contributor_id(self):
     return slug(self.reviewer.last + self.reviewer.first)
Example #27
 def id(self):
     return '{0}-{1}-{2}-{3}'.format(self.contributor_id, slug(self.doi),
                                     self.experiment_number,
                                     self.species_id)
Example #28
def gbs_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0
    api_url = "https://www.googleapis.com/books/v1/volumes?"

    if command == "cleanup":
        for fname in args.data_file("gbs").glob("*.json"):
            try:
                data = jsonlib.load(fname)
                if data.get("totalItems") == 0:
                    remove(fname)
            except ValueError:
                remove(fname)
        return

    if not sources:
        sources = DBSession.query(common.Source).order_by(common.Source.id).options(joinedload(common.Source.data))
    if callable(sources):
        sources = sources()

    for i, source in enumerate(page_query(sources, verbose=True, commit=True)):
        filepath = args.data_file("gbs", "source%s.json" % source.id)

        if command == "update":
            source.google_book_search_id = None
            source.update_jsondata(gbs={})

        if command in ["verify", "update"]:
            if filepath.exists():
                try:
                    data = jsonlib.load(filepath)
                except ValueError:
                    log.warn("no JSON object found in: %s" % filepath)
                    continue
                if not data["totalItems"]:
                    continue
                item = data["items"][0]
            else:
                continue

        if command == "verify":
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = item["volumeInfo"].get("publishedDate", "").split("-")[0]
            if not year or year != slug(source.year or ""):
                needs_check = True
            twords = words(stitle)
            iwords = words(item["volumeInfo"]["title"] + " " + item["volumeInfo"].get("subtitle", ""))
            if (
                twords == iwords
                or (len(iwords) > 2 and iwords.issubset(twords))
                or (len(twords) > 2 and twords.issubset(iwords))
            ):
                needs_check = False
            if int(source.id) == 241:
                log.info("%s" % sorted(words(stitle)))
                log.info("%s" % sorted(iwords))
            if needs_check:
                log.info("------- %s -> %s" % (source.id, item["volumeInfo"].get("industryIdentifiers")))
                log.info("%s %s" % (item["volumeInfo"]["title"], item["volumeInfo"].get("subtitle", "")))
                log.info(stitle)
                log.info(item["volumeInfo"].get("publishedDate"))
                log.info(source.year)
                log.info(item["volumeInfo"].get("authors"))
                log.info(source.author)
                log.info(item["volumeInfo"].get("publisher"))
                log.info(source.publisher)
                if not confirm("Are the records the same?"):
                    log.warn("---- removing ----")
                    jsonlib.dump({"totalItems": 0}, filepath)
        elif command == "update":
            source.google_book_search_id = item["id"]
            source.update_jsondata(gbs=item)
            count += 1
        elif command == "download":
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = [
                    "inauthor:" + quote_plus(source.author.encode("utf8")),
                    "intitle:" + quote_plus(title.encode("utf8")),
                ]
                if source.publisher:
                    q.append("inpublisher:" + quote_plus(source.publisher.encode("utf8")))
                url = api_url + "q=%s&key=%s" % ("+".join(q), args.api_key)
                count += 1
                r = requests.get(url, headers={"accept": "application/json"})
                log.info("%s - %s" % (r.status_code, url))
                if r.status_code == 200:
                    with open(as_posix(filepath), "w", encoding="utf8") as fp:
                        fp.write(r.text)
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break
    if command == "update":
        log.info("assigned gbs ids for %s out of %s sources" % (count, i))
    elif command == "download":
        log.info("queried gbs for %s sources" % count)
Example #29
0
def ia_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0

    if not sources:
        sources = DBSession.query(common.Source)\
            .order_by(common.Source.id)\
            .options(joinedload(common.Source.data))
    else:
        if callable(sources):
            sources = sources()

    i = 0
    for i, source in enumerate(page_query(sources, verbose=True, commit=True)):
        filepath = args.data_file('ia', 'source%s.json' % source.id)

        if command in ['verify', 'update']:
            if filepath.exists():
                with open(filepath) as fp:
                    try:
                        data = json.load(fp)
                    except ValueError:
                        continue
                if not data['response']['numFound']:
                    continue
                item = data['response']['docs'][0]
            else:
                continue

        if command == 'verify':
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = text_type(item.get('year', ''))
            if not year or year != slug(source.year or ''):
                needs_check = True
            twords = words(stitle)
            iwords = words(item['title'])
            if twords == iwords \
                    or (len(iwords) > 2 and iwords.issubset(twords))\
                    or (len(twords) > 2 and twords.issubset(iwords)):
                needs_check = False
            if needs_check:
                log.info('------- %s -> %s' % (source.id, item['identifier']))
                log.info(item['title'])
                log.info(stitle)
                log.info(item.get('year'))
                log.info(source.year)
                log.info(item['creator'])
                log.info(source.author)
                if not confirm('Are the records the same?'):
                    log.warn('---- removing ----')
                    with open(filepath, 'w') as fp:
                        json.dump({"response": {'numFound': 0}}, fp)
        elif command == 'update':
            source.update_jsondata(internetarchive_id=item['identifier'])
            count += 1
        elif command == 'download':
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = quote_plus(b'creator:"%s" AND title:"%s"' % (
                    source.author.split(',')[0].encode('utf8'), title.encode('utf8')))

                count += 1
                r = requests.get(API_URL + q, headers={'accept': 'application/json'})
                log.info('%s - %s' % (r.status_code, r.url))
                if r.status_code == 200:
                    with open(filepath, 'w', encoding='utf8') as fp:
                        fp.write(r.text)
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break
    if command == 'update':
        log.info('assigned internet archive identifiers for %s out of %s sources'
                 % (count, i))
    elif command == 'download':
        log.info('queried internet archive for %s sources' % count)
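
The `download` branch builds an Internet Archive search query from the first part of the author name and the title; the `API_URL` constant itself is not shown in the snippet. A hedged sketch of just the query construction (Python 3, with invented author and title values):

from urllib.parse import quote_plus

author = 'Foley, William A.'                  # hypothetical source.author
title = 'The Papuan Languages of New Guinea'  # hypothetical title
q = quote_plus('creator:"%s" AND title:"%s"' % (author.split(',')[0], title))
print(q)   # creator%3A%22Foley%22+AND+title%3A%22The+Papuan+Languages+of+New+Guinea%22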
Example #30
0
def test_slug():
    from clldutils.misc import slug

    assert slug('A B. \xe4C') == 'abac'
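
Only the 'abac' case is documented by this test. A quick, hedged exploration of a couple of related calls (results are printed rather than asserted, since they are not covered by the test above):

from clldutils.misc import slug

print(slug('A B. \xe4C'))                           # 'abac', as asserted above
print(slug('A B. \xe4C', remove_whitespace=False))  # whitespace kept, as used by words() above
print(slug('Haspelmath, Martin'))                   # lower-cased, punctuation stripped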
Example #31
0
def main(args):
    fts.index('fts_index', Word.fts, DBSession.bind)
    DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;")

    if DBSession.bind.dialect.name == 'postgresql':
        Index('ducet', collkey(common.Unit.name)).create(DBSession.bind)
    data = Data()

    dataset = common.Dataset(
        id=dictionaria.__name__,
        name="Dictionaria",
        description="The Dictionary Journal",
        published=date(2017, 3, 30),
        contact='*****@*****.**',
        domain='dictionaria.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    for i, (id_, name) in enumerate([
        ('haspelmathmartin', 'Martin Haspelmath'),
        ('stiebelsbarbara', 'Barbara Stiebels')
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)
    DBSession.add(dataset)

    for id_, name in LGR_ABBRS.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))

    comparison_meanings = {}

    print('loading concepts ...')

    glosses = set()
    concepticon = Concepticon(
        REPOS.joinpath('..', '..', 'concepticon', 'concepticon-data'))
    if not args.no_concepts:
        for conceptset in concepticon.conceptsets.values():
            if conceptset.gloss in glosses:
                continue
            glosses.add(conceptset.gloss)
            cm = data.add(
                ComparisonMeaning,
                conceptset.id,
                id=conceptset.id,
                name=conceptset.gloss.lower(),
                description=conceptset.definition,
                concepticon_url='http://concepticon.clld.org/parameters/%s' % conceptset.id)
            comparison_meanings[cm.id] = cm

    DBSession.flush()

    print('... done')

    comparison_meanings = {k: v.pk for k, v in comparison_meanings.items()}
    submissions = []

    for submission in REPOS.joinpath(
            'submissions-internal' if args.internal else 'submissions').glob('*'):
        if not submission.is_dir():
            continue

        try:
            submission = Submission(submission)
        except ValueError:
            continue

        md = submission.md
        if md is None:
            print('no md', submission.id)
            continue

        if not md['date_published']:
            print('no date', submission.id)
            continue

        id_ = submission.id
        if args.dict and args.dict != id_ and args.dict != 'all':
            print('not selected', submission.id)
            continue
        lmd = md['language']
        props = md.get('properties', {})
        props.setdefault('custom_fields', [])
        props['metalanguage_styles'] = {}
        for v, s in zip(props.get('metalanguages', {}).values(),
                        ['success', 'info', 'warning', 'important']):
            props['metalanguage_styles'][v] = s
        props['custom_fields'] = ['lang-' + f if f in props['metalanguage_styles'] else f
                                  for f in props['custom_fields']]
        props.setdefault('choices', {})

        language = data['Variety'].get(lmd['glottocode'])
        if not language:
            language = data.add(
                Variety, lmd['glottocode'], id=lmd['glottocode'], name=lmd['name'])

        md['date_published'] = md['date_published'] or date.today().isoformat()
        if '-' not in md['date_published']:
            md['date_published'] = md['date_published'] + '-01-01'
        dictionary = data.add(
            Dictionary,
            id_,
            id=id_,
            number=md.get('number'),
            name=props.get('title', lmd['name'] + ' dictionary'),
            description=submission.description,
            language=language,
            published=date(*map(int, md['date_published'].split('-'))),
            doi=md.get('doi'),
            jsondata=props)

        for i, spec in enumerate(md['authors']):
            if not isinstance(spec, dict):
                cname, address = spec, None
                spec = {}
            else:
                cname, address = spec['name'], spec.get('affiliation')
            name = HumanName(cname)
            cid = slug('%s%s' % (name.last, name.first))
            contrib = data['Contributor'].get(cid)
            if not contrib:
                contrib = data.add(
                    common.Contributor,
                    cid,
                    id=cid,
                    name=cname,
                    address=address,
                    url=spec.get('url'),
                    email=spec.get('email'))
            DBSession.add(common.ContributionContributor(
                ord=i + 1,
                primary=spec.get('primary', True),
                contributor=contrib,
                contribution=dictionary))

        submissions.append((dictionary.id, language.id, submission))
    transaction.commit()

    for did, lid, submission in submissions:
        transaction.begin()
        print('loading %s ...' % submission.id)
        dictdata = Data()
        lang = Variety.get(lid)
        submission.load_sources(Dictionary.get(did), dictdata)
        submission.load_examples(Dictionary.get(did), dictdata, lang)
        submission.dictionary.load(
            submission,
            dictdata,
            Dictionary.get(did),
            lang,
            comparison_meanings,
            OrderedDict(submission.md.get('properties', {}).get('labels', [])))
        transaction.commit()
        print('... done')

    transaction.begin()
    load_families(
        Data(),
        [v for v in DBSession.query(Variety) if re.match('[a-z]{4}[0-9]{4}', v.id)],
        glottolog_repos='../../glottolog/glottolog')
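
One detail of the loader above worth isolating: a bare year in `date_published` is padded to a full ISO date before being turned into a `datetime.date`. A minimal sketch of that normalization with an invented metadata value:

from datetime import date

date_published = '2017'          # hypothetical md['date_published']
if '-' not in date_published:
    date_published += '-01-01'
published = date(*map(int, date_published.split('-')))
print(published)                 # 2017-01-01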
Example #32
0
def slug(s, escape=False, **kw):
    # That's some weird stuff coming in with ElCat alternative names ...
    return misc.slug(
        ''.join(hex(ord(c)) if escape else c for c in s if ord(c) != 2), **kw)
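
A quick, hedged illustration of this wrapper: the control character U+0002 is always dropped, and with `escape=True` every remaining character is replaced by its hex code point before being slugged (results are printed, not asserted):

from clldutils import misc

def slug(s, escape=False, **kw):
    return misc.slug(
        ''.join(hex(ord(c)) if escape else c for c in s if ord(c) != 2), **kw)

print(slug('Abau\x02'))            # control character stripped, then slugged as usual
print(slug('Abau', escape=True))   # built from '0x41', '0x62', '0x61', '0x75'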
Example #33
0
 def species_id(self):
     return slug(self.species_latin)
Example #34
0
 def from_name(cls, name, dry_run=False, repos=None):
     alpha = slug(text_type(name))[:4]
     assert alpha
     while len(alpha) < 4:
         alpha += alpha[-1]
     return Glottocodes(repos=repos).new(alpha, dry_run=dry_run)
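
The interesting bit here is how the four-letter alpha prefix for a new Glottocode is derived: the slugged name is truncated to four characters and, if shorter, padded by repeating its last character. A standalone sketch of just that step (`Glottocodes.new` itself is not called; the names are examples):

from clldutils.misc import slug

def alpha_prefix(name):
    alpha = slug(str(name))[:4]
    assert alpha
    while len(alpha) < 4:
        alpha += alpha[-1]
    return alpha

print(alpha_prefix('Wu Chinese'))  # 'wuch'
print(alpha_prefix('Ewe'))         # 'ewee'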
Example #35
0
def import_dataset(path, data, icons, add_missing_features = False):
    # look for metadata
    # look for sources
    # then loop over values
    
    dirpath, fname = os.path.split(path)
    basename, ext = os.path.splitext(fname)
    glottolog = Glottolog()

    try:
        contrib = CulturebankContribution(id=basename, name=basename, desc=glottolog.languoid(basename).name)
    except Exception:
        print("Basename {:s} did not match a glottolog languoid, skipped.".format(basename))
        return

    md = {}
    mdpath = path + '-metadata.json'
    if os.path.exists(mdpath):
        md = jsonload(mdpath)
    contributor_name = HumanName(md.get('contributed_datapoint', 'Team NTS'))
    contributor_id = slug(contributor_name.last + contributor_name.first)
    contributor = data['Contributor'].get(contributor_id)
    if not contributor:
        contributor = data.add(
            Contributor,
            contributor_id,
            id=contributor_id,
            name='%s' % contributor_name)
    DBSession.add(ContributionContributor(contribution=contrib, contributor=contributor))

    bibpath = os.path.join(dirpath, basename + '.bib')
    if os.path.exists(bibpath):
        for rec in Database.from_file(bibpath):
            if rec['key'] not in data['Source']:
                data.add(Source, rec['key'], _obj=bibtex2source(rec))

    languages = {f['properties']['glottocode']: f for f in md.get('features', [])}

    for i, row in pandas.io.parsers.read_csv(
            path,
            sep=',' if 'c' in ext else '\t',
            encoding='utf-16').iterrows():
        if pandas.isnull(row['Value']) or pandas.isnull(row['Feature_ID']):
            print("Expected columns not found: ", row)
            continue
        vsid = '%s-%s-%s' % (basename, row['Language_ID'], row['Feature_ID'])
        vid = row.get('ID', '%s-%s' % (basename, i + 1))

        parameter = data['Feature'].get(row['Feature_ID'])
        if parameter is None:
            if add_missing_features:
                parameter = data.add(Feature, row['Feature_ID'], id=row['Feature_ID'], name=row.get('Feature', row['Feature_ID']))
            else: 
                print(('skip value for invalid feature %s' % row['Feature_ID']))
                continue

        language = data['CulturebankLanguage'].get(row['Language_ID'])
        if language is None:
            # query glottolog!
            try:
                languoid = glottolog.languoid(row['Language_ID'])
            except AttributeError:
                print(('Skipping, no Glottocode found for %s' % row['Language_ID']))
                continue
            
            gl_md = {
                'name': languoid.name,
                'longitude': languoid.longitude,
                'latitude': languoid.latitude}
            lmd = languages.get(row['Language_ID'])
            if lmd:
                if lmd.get('properties', {}).get('name'):
                    gl_md['name'] = lmd['properties']['name']
                if lmd.get('geometry', {}).get('coordinates'):
                    gl_md['longitude'], gl_md['latitude'] = lmd['geometry']['coordinates']

            language = data.add(
                CulturebankLanguage, row['Language_ID'],
                id=row['Language_ID'],
                name=gl_md['name'],
                latitude=gl_md.get('latitude'),
                longitude=gl_md.get('longitude'))

        
        vs = data['ValueSet'].get(vsid)
        if vs is None:
            vs = data.add(
                ValueSet, vsid,
                id=vsid,
                parameter=parameter,
                language=language,
                contribution=contrib,
                source=row['Source'])

        domain = {de.abbr: de for de in parameter.domain}    
        name = row['Value']
        if name in domain:
            name = domain[name].name
        else:
            name = str(name)
            if name in domain:
                name = domain[name].name
            else:
                raise ValueError("For feature {:s} in language {:s}: Name {:s} not found among domain values {:}".format(
                    row['Feature_ID'],
                    row['Language_ID'],
                    name,
                    {d: de for d, de in domain.items()}))

        data.add(Value,
            vid,
            id=vid,
            valueset=vs,
            name=name,
            description=row['Comment'],
            domainelement=domain.get(row['Value']))

        print(".", end="")
        if vs.source is not None:
            for key, src in list(data['Source'].items()):
                if key in vs.source:
                    ValueSetReference(valueset=vs, source=src, key=key)
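
The value handling above first tries the raw CSV value as a domain abbreviation and falls back to its string form before giving up. A compact, hedged sketch of that lookup against a made-up domain (plain strings stand in for the DomainElement objects of the real model):

domain = {'1': 'present', '0': 'absent'}   # hypothetical abbr -> name mapping

def value_name(raw):
    if raw in domain:
        return domain[raw]
    raw = str(raw)                         # the CSV parser may have produced a number
    if raw in domain:
        return domain[raw]
    raise ValueError('value %r not found among domain values %r' % (raw, sorted(domain)))

print(value_name(1))     # 'present', via the str() fallback
print(value_name('0'))   # 'absent'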
Example #36
0
 def fname(self, suffix=''):
     return '%s.%s%s' % (slug(self.name), self.id, suffix)
Example #37
0
    def cmd_makecldf(self, args):
        args.writer.add_sources()

        languages = args.writer.add_languages(lookup_factory="Token")

        concepts = args.writer.add_concepts(
            id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english),
            lookup_factory="Name")

        seen = []
        for f in ("AP_lexicon_coded.txt", "AP_lexicon.txt"):
            for row in self.raw_dir.read_csv(f, dicts=True, delimiter="\t"):
                concept = row["English"].lower().strip().replace(", ", "/")
                # skip rows in AP_lexicon.txt that we've already seen
                # in AP_lexicon_coded.txt
                # Note that there are duplicate rows in across both files, and *within*
                # the same files, so this handles that too.
                if concept in seen:
                    continue

                # manually catch "chase away". There are two glosses for this:
                #   367   "chase away"
                #   368   "chase away, expel"
                # ...-> 367 looks only partially complete and 368 contains all
                # forms in 367, so ignore 367.
                if row["English"] == "chase away":
                    continue

                if row["English"]:
                    # store lexicon IDs for the cognate row.
                    lexicon_ids = {}

                    for lang in languages:
                        assert concept in concepts, "bad concept %s" % concept
                        value = row[lang]

                        # preprocess value
                        # remove the reconstruction mark (for proto-AP)
                        value = value.lstrip("*")

                        # remove leading & trailing spaces
                        value = value.strip()

                        # if the stripped form starts and ends with a slash,
                        # it is a leftover from a transcription, let's clean
                        # it (it could be done with the orthographic profile,
                        # but this could hide errors in parsing multiple
                        # forms, and in any case this is more adequate as we
                        # get the correct value)
                        if value.startswith("/") and value.endswith("/"):
                            value = value[1:-1]

                        lex = args.writer.add_forms_from_value(
                            Language_ID=languages[lang],
                            Parameter_ID=concepts[concept],
                            Value=value,
                            Source=["Robinson2012"],
                        )
                        if len(lex) >= 1:
                            # it looks like only the first lexemes of combined
                            # forms have cognates, so only add the first one.
                            lexicon_ids[lang] = lex[0]

                    seen.append(concept)

                else:  # cognates...
                    lastword = seen[-1]  # find the last word..
                    for lang in languages:
                        # find lexical ids belonging to this language & gloss.
                        lex = lexicon_ids.get(lang)
                        if lex and row[lang]:
                            if int(row[lang]) not in range(0, 12):
                                raise ValueError("Invalid cognate id: %s" %
                                                 row[lang])
                            args.writer.add_cognate(
                                lexeme=lex,
                                Cognateset_ID="%s-%s" %
                                (concepts[lastword], row[lang]),
                                Source=["Robinson2012"],
                            )
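
The per-form preprocessing inside the loop above (dropping the reconstruction asterisk, trimming whitespace, and unwrapping leftover /.../ transcriptions) is easy to exercise in isolation. A minimal sketch with invented raw values:

def clean_form(value):
    value = value.lstrip('*')    # drop the proto-language reconstruction mark
    value = value.strip()        # remove leading and trailing whitespace
    if value.startswith('/') and value.endswith('/'):
        value = value[1:-1]      # unwrap leftover transcription slashes
    return value

for raw in ('*tama', '  /anin/ ', 'wata'):
    print(repr(clean_form(raw)))   # 'tama', 'anin', 'wata'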