Example #1
0
def bibtex2source(rec, cls=common.Source):
    """Create an instance of ``cls`` (default: ``common.Source``) from a BibTeX record.

    Known BibTeX fields that exist as attributes on ``cls`` go into keyword
    arguments; the rest are stashed in ``jsondata``. The display name is built
    from author (or editor) surnames plus the year.

    :param rec: BibTeX record object with ``id`` and ``genre`` attributes.
    :param cls: model class to instantiate.
    :return: unsaved ``cls`` instance.
    """
    year = bibtex.unescape(rec.get("year", "nd"))
    fields = {}
    jsondata = {}
    for field in bibtex.FIELDS:
        if field not in rec:
            continue
        # fields the model knows about become columns; everything else is JSON
        target = fields if hasattr(cls, field) else jsondata
        target[field] = bibtex.unescape(rec[field])

    suffix_etal = ""
    suffix_eds = ""
    names = rec.get("author")
    if not names:
        # fall back to editors, marking them as such
        names = rec.get("editor", "")
        if names:
            suffix_eds = " (eds.)"
    if names:
        parts = bibtex.unescape(names).split(" and ")
        if len(parts) > 2:
            # more than two creators: keep only the first, add "et al."
            parts = parts[:1]
            suffix_etal = " et al."

        surnames = []
        for part in parts:
            parsed = HumanName(part)
            surnames.append(parsed.last or parsed.first)
        names = "%s%s%s" % (" and ".join(surnames), suffix_etal, suffix_eds)

    return cls(
        id=slug(rec.id),
        name=("%s %s" % (names, year)).strip(),
        description=bibtex.unescape(rec.get("title", rec.get("booktitle", ""))),
        jsondata=jsondata,
        bibtex_type=rec.genre,
        **fields
    )
Example #2
0
def bibtex2source(rec, cls=common.Source):
    """Create an instance of ``cls`` (default: ``common.Source``) from a BibTeX record.

    BibTeX fields that exist as attributes on ``cls`` become keyword arguments;
    all others are collected in ``jsondata``. The display name is assembled from
    author (or, failing that, editor) surnames and the year.

    :param rec: BibTeX record object with ``id`` and ``genre`` attributes.
    :param cls: model class to instantiate.
    :return: unsaved ``cls`` instance.
    """
    year = bibtex.unescape(rec.get('year', 'nd'))
    fields = {}
    jsondata = {}
    for field in bibtex.FIELDS:
        if field in rec:
            value = bibtex.unescape(rec[field])
            # known model attributes become columns; the rest goes to JSON
            container = fields if hasattr(cls, field) else jsondata
            container[field] = value

    etal = ''
    eds = ''
    authors = rec.get('author')
    if not authors:
        # fall back to editors, flagged with "(eds.)"
        authors = rec.get('editor', '')
        if authors:
            eds = ' (eds.)'
    if authors:
        authors = bibtex.unescape(authors).split(' and ')
        if len(authors) > 2:
            # more than two creators: keep the first only, append "et al."
            authors = authors[:1]
            etal = ' et al.'

        authors = [HumanName(a) for a in authors]
        authors = [n.last or n.first for n in authors]
        authors = '%s%s%s' % (' and '.join(authors), etal, eds)

    return cls(id=slug(rec.id),
               name=('%s %s' % (authors, year)).strip(),
               description=bibtex.unescape(
                   rec.get('title', rec.get('booktitle', ''))),
               jsondata=jsondata,
               bibtex_type=rec.genre,
               **fields)
Example #3
0
File: util.py Project: clld/clld
def bibtex2source(rec, cls=common.Source, lowercase_id=False):
    """Create an instance of ``cls`` (default: ``common.Source``) from a BibTeX record.

    BibTeX fields that exist as attributes on ``cls`` become keyword arguments;
    all others are collected in ``jsondata``.

    :param rec: BibTeX record object with ``id`` and ``genre`` attributes.
    :param cls: model class to instantiate.
    :param lowercase_id: if ``True``, the slugified id is lowercased.
    :return: unsaved ``cls`` instance.
    """
    year = bibtex.unescape(rec.get('year', 'nd'))
    fields = {}
    jsondata = {}
    for field in bibtex.FIELDS:
        if field in rec:
            value = bibtex.unescape(rec[field])
            # known model attributes become columns; the rest goes to JSON
            container = fields if hasattr(cls, field) else jsondata
            container[field] = value

    etal = ''
    eds = ''
    authors = rec.get('author')
    if not authors:
        # fall back to editors, flagged with "(eds.)"
        authors = rec.get('editor', '')
        if authors:
            eds = ' (eds.)'
    if authors:
        authors = bibtex.unescape(authors).split(' and ')
        if len(authors) > 2:
            # more than two creators: keep the first only, append "et al."
            authors = authors[:1]
            etal = ' et al.'

        authors = [HumanName(a) for a in authors]
        authors = [n.last or n.first for n in authors]
        authors = '%s%s%s' % (' and '.join(authors), etal, eds)

    return cls(
        id=slug(rec.id, lowercase=lowercase_id),
        name=('%s %s' % (authors, year)).strip(),
        description=bibtex.unescape(rec.get('title', rec.get('booktitle', ''))),
        jsondata=jsondata,
        bibtex_type=rec.genre,
        **fields)
Example #4
0
    def test_unescape(self):
        """Check LaTeX- and ``?[\\uNNNN]``-style unescaping of BibTeX strings."""
        from clld.lib.bibtex import unescape, u_unescape

        # bytes input is decoded (presumably latin-1) before unescaping
        self.assertEqual(unescape(binary_type("\\ss \xef".encode('latin1'))), 'ß\xef')
        self.assertEqual(unescape("\\ss "), 'ß')
        # ?[\uNNN] escapes are decimal code points
        self.assertEqual(u_unescape('?[\\u123] ?[\\u1234]'), '{ \u04d2')
        # already-unescaped text passes through unchanged
        s = '\u2013'
        self.assertEqual(s, unescape(s))
        self.assertEqual(unescape('?[\\u65533]'), '\ufffd')
Example #5
0
    def test_unescape(self):
        """Check LaTeX- and ``?[\\uNNNN]``-style unescaping of BibTeX strings."""
        from clld.lib.bibtex import unescape, u_unescape

        # bytes input is decoded (presumably latin-1) before unescaping
        self.assertEqual(unescape(binary_type("\\ss \xef".encode('latin1'))),
                         'ß\xef')
        self.assertEqual(unescape("\\ss "), 'ß')
        # ?[\uNNN] escapes are decimal code points
        self.assertEqual(u_unescape('?[\\u123] ?[\\u1234]'), '{ \u04d2')
        # already-unescaped text passes through unchanged
        s = '\u2013'
        self.assertEqual(s, unescape(s))
        self.assertEqual(unescape('?[\\u65533]'), '\ufffd')
Example #6
0
def bibtex2source(rec):  # pragma: no cover
    """Create a ``common.Source`` instance from a BibTeX record.

    BibTeX fields known to ``common.Source`` become keyword arguments; all
    others are collected in ``jsondata``. Thesis records are mapped to the
    ``phdthesis``/``mastersthesis`` entry types; unrecognized genres fall back
    to ``misc``.

    :param rec: BibTeX record object with ``id`` and ``genre`` attributes.
    :return: unsaved ``common.Source`` instance.
    """
    year = rec.get('year', 'nd')
    fields = {}
    jsondata = {}
    for field in FIELDS:
        if field in rec:
            value = unescape(rec[field])
            container = fields if hasattr(common.Source, field) else jsondata
            container[field] = value
            # remove \\ from url fields!
            if field == 'url':
                container[field] = container[field].replace('\\', '')

    etal = ''
    eds = ''
    authors = rec.get('author')
    if not authors:
        # fall back to editors, flagged with "(eds.)"
        authors = rec.get('editor', '')
        if authors:
            eds = ' (eds.)'
    if authors:
        authors = unescape(authors).split(' and ')
        if len(authors) > 2:
            # more than two creators: keep the first only, append "et al."
            authors = authors[:1]
            etal = ' et al.'

        authors = [HumanName(a) for a in authors]
        authors = [n.last or n.first for n in authors]
        authors = '%s%s%s' % (' and '.join(authors), etal, eds)

    if rec.genre == 'thesis':
        # use .get(): a thesis record without a 'type' field previously raised
        # KeyError here; treat it as a masters thesis instead.
        if rec.get('type') == 'phdthesis':
            rec.genre = 'phdthesis'
        else:
            rec.genre = 'mastersthesis'

    try:
        bibtex_type = EntryType.from_string(rec.genre)
    except Exception:
        # was a bare "except:", which also swallowed SystemExit and
        # KeyboardInterrupt; unknown genres fall back to 'misc'.
        bibtex_type = EntryType.from_string('misc')

    return common.Source(id=rec.id,
                         name=('%s %s' % (authors, year)).strip(),
                         description=unescape(
                             rec.get('title', rec.get('booktitle', ''))),
                         jsondata=jsondata,
                         bibtex_type=bibtex_type,
                         **fields)
Example #7
0
File: util.py Project: FieldDB/clld
def bibtex2source(rec):
    """Create a ``common.Source`` instance from a BibTeX record.

    BibTeX fields known to ``common.Source`` become keyword arguments; all
    others are collected in ``jsondata``. The name is the raw (unescaped)
    author — or editor — string plus the year; no surname extraction is done.

    :param rec: BibTeX record object with ``id`` and ``genre`` attributes.
    :return: unsaved ``common.Source`` instance.
    """
    year = bibtex.unescape(rec.get('year', 'nd'))
    fields = {}
    jsondata = {}
    for field in bibtex.FIELDS:
        if field in rec:
            value = bibtex.unescape(rec[field])
            # known model attributes become columns; the rest goes to JSON
            container = fields if hasattr(common.Source, field) else jsondata
            container[field] = value

    return common.Source(
        id=slug(rec.id),
        name=('%s %s' % (bibtex.unescape(
            rec.get('author', rec.get('editor', ''))), year)).strip(),
        description=bibtex.unescape(rec.get('title', rec.get('booktitle', ''))),
        jsondata=jsondata,
        bibtex_type=rec.genre,
        **fields)
Example #8
0
def bibtex2source(rec):
    """Create a ``common.Source`` instance from a BibTeX record.

    Fields known to ``common.Source`` become keyword arguments; everything
    else is kept in ``jsondata``. The name is the unescaped author (or editor)
    string plus the year.

    :param rec: BibTeX record object with ``id`` and ``genre`` attributes.
    :return: unsaved ``common.Source`` instance.
    """
    year = bibtex.unescape(rec.get('year', 'nd'))
    fields = {}
    jsondata = {}
    for key in bibtex.FIELDS:
        if key not in rec:
            continue
        val = bibtex.unescape(rec[key])
        # known model attributes become columns; the rest goes to JSON
        if hasattr(common.Source, key):
            fields[key] = val
        else:
            jsondata[key] = val

    creator = bibtex.unescape(rec.get('author', rec.get('editor', '')))
    return common.Source(
        id=slug(rec.id),
        name=('%s %s' % (creator, year)).strip(),
        description=bibtex.unescape(rec.get('title', rec.get('booktitle',
                                                             ''))),
        jsondata=jsondata,
        bibtex_type=rec.genre,
        **fields)
Example #9
0
def unescape_dict(d):
    """Recursively unescape LaTeX escapes in the string values of ``d``, in place.

    Two hard-coded place names with LaTeX escapes in their *keys* are renamed
    to their proper Unicode spellings; string values are unescaped (with an
    extra fix-up for "Barthélemy").

    NOTE(review): this is Python 2 code — ``"...".decode('utf8')`` on a str
    literal and ``StringTypes`` do not exist in Python 3. Confirm before reuse.

    :param d: dict possibly containing nested dicts and escaped strings.
    :return: the same dict ``d``, mutated.
    """
    # rename known LaTeX-escaped keys to their Unicode equivalents
    for o, n in [(r'Cura\c{c}ao', "Curaçao"), ("Saint Barth\\'elemy", "Saint-Barthélemy")]:
        if o in d:
            d[n.decode('utf8')] = d[o]
            del d[o]
    for k, v in d.items():
        if isinstance(v, dict):
            # recurse into nested dicts
            d[k] = unescape_dict(v)
        elif isinstance(v, StringTypes):
            d[k] = unescape(v).replace("Barth\\'elemy", "Barthélemy".decode('utf8'))
    return d
Example #10
0
def convert(key, value):
    """Convert one raw field ``value`` into a structured form, keyed by field name.

    Coordinates become a lat/lon dict, counts become ints, list-like fields are
    split on their separator, and everything else is just unescaped.

    SECURITY NOTE(review): ``eval`` is used to parse 'coordinates' and
    'also_spoken_in' — only safe if the input data is fully trusted; consider
    ``ast.literal_eval`` instead.

    :param key: field name.
    :param value: raw string value for that field.
    :return: converted value (dict, int, list, or unescaped string).
    """
    #if key == 'code+name':
    #    return value.replace('\\', "\\\\")
    if key == 'coordinates':
        # value is expected to be a "(lon, lat)" tuple literal
        lon, lat = eval(value)
        return dict(latitude=lat, longitude=lon)
    if key == 'population_numeric':
        return int(value)
    if key in [
        'alternate_names',
        'classification',
        'classification-hh',
        'country',
        'dialects',
    ]:
        # list-valued fields: split on the default separator
        return split(unescape(value))
    if key == 'typology':
        return split(unescape(value), sep=';')
    if key == 'writing':
        return split(unescape(value), sep='.')
    if key == 'also_spoken_in':
        # value is expected to be a dict literal
        return unescape_dict(eval(value))
    return unescape(value)
Example #11
0
def main(args):
    """Populate the NTS database from the ``nts_*.tab`` sheets in ``args.data_file()``.

    Loads languages, feature domains, designers, sources, features, domain
    elements and value sets, then writes a CLDF-style TSV and the dataset
    metadata. NOTE(review): Python 2 code (print statements); relies on
    project helpers (``_dtab``, ``dp_dict``, ``ktfbib``, ``mergeds``, ``grp2``,
    ``savu``) not visible here.

    The case is we have two codings for two different dialects (called hua and yagaria) of
    the same iso "qgr", both of which we want to keep and keep separately. I had missed
    that when making NTS, rigging everything so that the iso would be the id, which is not
    sufficient. Glottocodes in Grambank would have taken care of it except the dialect
    division for yaga1260 is wrong, having yagaria as overarching and Hua under it
    (reality has it that Hua and Yagaria are two dialects of the same language, which has
    no name). So a solution with glottocodes would have to wait until we fix that or need
    another fix later. So I guess, for now, let's ignore qgr (and its datapoints) and I'll
    fix on my end later.
    """
    data = Data(
        created=utc.localize(datetime(2013, 11, 15)),
        updated=utc.localize(datetime(2013, 12, 12)))
    icons = issues.Icons()

    dtab = partial(_dtab, args.data_file())

    #Languages
    tabfns = ['%s' % fn.name for fn in args.data_file().glob('nts_*.tab')]
    args.log.info("Sheets found: %s" % tabfns)
    ldps = []
    lgs = {}
    # nfeatures: non-'?' datapoints per language; nlgs: languages per feature
    nfeatures = Counter()
    nlgs = Counter()

    for fn in tabfns:
        for ld in dtab(fn):
            # skip 'qgr' entirely — see docstring
            if ld['language_id'] == 'qgr':
                continue
            if "feature_alphanumid" not in ld:
                args.log.info("NO FEATUREID %s %s" % (len(ld), ld))
            # keep only rows whose feature id is not "DRS*" and has no "."
            if not ld["feature_alphanumid"].startswith("DRS") \
                    and ld["feature_alphanumid"].find(".") == -1:
                ldps.append(dp_dict(ld))
                lgs[ld['language_id']] = unescape(ld['language_name'])
                if ld["value"] != "?":
                    nfeatures.update([ld['language_id']])
                    nlgs.update([ld['feature_alphanumid']])

    # sort once by feature id so the later groupby calls see contiguous groups
    ldps = sorted(ldps, key=lambda d: d['feature_alphanumid'])

    # manual fix-up: the 'ygr' language is named "Hua" (see docstring)
    lgs["ygr"] = "Hua"

    for lgid, lgname in lgs.items():
        data.add(
            models.ntsLanguage, lgid,
            id=lgid,
            name=lgname,
            representation=nfeatures.get(lgid, 0))
    DBSession.flush()

    load_families(data, [(NOCODE_TO_GLOTTOCODE.get(l.id, l.id), l) for l in data['ntsLanguage'].values()], isolates_icon='tcccccc')
    #glottolog = Glottolog()
    #for lg in data['ntsLanguage'].values():
    #    print lg.id, NOCODE_TO_GLOTTOCODE.get(lg.id, lg.id)
    #    gl_language = glottolog.languoid(NOCODE_TO_GLOTTOCODE.get(lg.id, lg.id))
    #    if not gl_language.family:
    #        family = data.add(Family, gl_language.id, id = gl_language.id, name = gl_language.name, description=common.Identifier(name=gl_language.id, type=common.IdentifierType.glottolog.value).url(), jsondata={"icon": 'tcccccc'})
    #        lg.family = family

    
    #Domains
    for domain in set(ld['feature_domain'] for ld in ldps):
        data.add(models.FeatureDomain, domain, name=domain)
    DBSession.flush()

    #Designers
    for i, info in enumerate(dtab("ntscontributions.tab") + dtab("ntscontacts.tab")):
        designer_id = str(i + 1)
        data.add(
            models.Designer, info['designer'],
            id=designer_id,
            name=designer_id,
            domain=info["domain"],
            contributor=info['designer'],
            pdflink=info["pdflink"],
            citation=info["citation"])
    DBSession.flush()

    #Sources
    # bibsources cells hold multiple records separated by ",,,"
    for k, (typ, bibdata) in [
        ktfbib(bibsource) for ld in ldps
        if ld.get(u'bibsources') for bibsource in ld['bibsources'].split(",,,")
    ]:
        if k not in data["Source"]:
            data.add(common.Source, k, _obj=bibtex2source(Record(typ, k, **bibdata)))
    DBSession.flush()

    #Features
    fs = [(fid, mergeds(lds)) for fid, lds in
          groupby(ldps, key=lambda d: d['feature_alphanumid'])]

    # detect features whose possible-values description differs across files
    fvdesc = [(fid, [(ld.get("feature_possible_values"), ld.get("fromfile")) for ld in lds if ld.get("feature_possible_values")]) for fid, lds in groupby(ldps, key=lambda d: d['feature_alphanumid'])]
    fvdt = [(fid, grp2(vdescs)) for (fid, vdescs) in fvdesc]
    fvmis = [(fid, vdescs) for (fid, vdescs) in fvdt if len(vdescs) > 1]
    for (fid, vdescs) in fvmis:
        print fid, "DIFF VDESC"
        for (vd, fromf) in vdescs:
            print vd, set(fromf)

    # sanity check: feature names must be unique
    for _, dfsids in groupby(
            sorted((f.get('feature_name', fid), fid) for fid, f in fs),
            key=lambda t: t[0]):
        assert len(list(dfsids)) == 1

    for fid, f in fs:
        if not fid.isdigit():
            args.log.info("NO INT FID %s" % f)           
        feature = data.add(
            models.Feature, fid,
            id=fid,
            name=f.get('feature_name', f['feature_alphanumid']),
            doc=f.get('feature_information', ""),
            vdoc=f.get('feature_possible_values', ""),
            representation=nlgs.get(fid, 0),
            designer=data["Designer"][f['designer']],
            dependson=f.get("depends_on", ""),
            abbreviation=f.get("abbreviation", ""),
            featuredomain=data['FeatureDomain'][f["feature_domain"]],
            name_french=f.get('francais', ""),
            clarification=f.get("draft of clarifying comments to outsiders (hedvig + dunn + harald + suzanne)", ""),
            alternative_id=f.get("old feature number", ""),
            jl_relevant_unit=f.get("relevant unit(s)", ""),
            jl_function=f.get("function", ""),
            jl_formal_means=f.get("formal means", ""),
            sortkey_str="",
            sortkey_int=int(fid))

        # vdoc format: "v1==desc1||v2==desc2||..."; '.' in values becomes '-'
        vdesclist = [veq.split("==") for veq in feature.vdoc.split("||")]
        vdesc = {v.replace(".", "-"): desc for [v, desc] in vdesclist}
        vdesc.setdefault('?', 'Not known')
        if 'N/A' not in vdesc and feature.dependson:
            vdesc["N/A"] = "Not Applicable"
        vi = {v: i for (i, v) in enumerate(sorted(vdesc.keys()))}
        vicons = icons.iconize(vi.keys())
        for v, desc in vdesc.items():
            data.add(
                common.DomainElement, (fid, v),
                id='%s-%s' % (fid, v),
                name=v,
                description=desc,
                jsondata={"icon": vicons[v]},
                number=vi[v],
                parameter=feature)
    DBSession.flush()

    # warn about (feature, language) pairs coded with conflicting values
    for ((f, lg), ixs) in grp2(
            [((ld['feature_alphanumid'], ld['language_id']), i)
             for i, ld in enumerate(ldps)]):
        ixvs = set([ldps[ix]['value'] for ix in ixs])
        if len(ixvs) == 1:
            continue
        args.log.warn(
            "Dup value %s %s %s" %
            (f, lg, [(ldps[ix]['value'], ldps[ix]['fromfile']) for ix in ixs]))
        print "Dup value %s %s %s" % (f, lg, [(ldps[ix]['value'], ldps[ix]['fromfile'], ldps[ix].get('provenance')) for ix in ixs])
    errors = {}
    done = set()
    for ld in ldps:
        parameter = data['Feature'][ld['feature_alphanumid']]
        language = data['ntsLanguage'][ld['language_id']]
        
        id_ = '%s-%s' % (parameter.id, language.id)
        # first coding of a (feature, language) pair wins; duplicates skipped
        if id_ in done:
            continue

        if (ld['feature_alphanumid'], ld['value']) not in data['DomainElement']:
            if not ld["value"].strip():
                continue
            # value not among the declared possible values: record as error
            info = (
                ld['feature_alphanumid'],
                ld.get('feature_name', "[Feature Name Lacking]"),
                ld['language_id'],
                ld['value'],
                ld['fromfile'])
            msg = u"%s %s %s %s %s not in the set of legal values ({0})" % info
            args.log.error(msg.format(sorted(
                [y for (x, y) in data['DomainElement'].keys()
                 if x == ld['feature_alphanumid']])))
            print msg.format(sorted(
                [y for (x, y) in data['DomainElement'].keys()
                 if x == ld['feature_alphanumid']]))
            errors[(ld['feature_alphanumid'], ld['language_id'])] = info
            continue

        vs = common.ValueSet(
            id=id_,
            language=language,
            parameter=parameter,
            source=ld["source"] or None,
            contribution=parameter.designer)
        models.ntsValue(
            id=id_,
            domainelement=data['DomainElement'][(ld['feature_alphanumid'], ld['value'])],
            jsondata={"icon": data['DomainElement'][(ld['feature_alphanumid'], ld['value'])].jsondata},
            comment=ld["comment"],
            valueset=vs,
            contributed_datapoint=ld["contributor"])
        done.add(id_)

        if not ld.get('bibsources'):
            if 'bibsources' not in ld:
                args.log.warn("no bibsource %s" % ld)
            continue
        for k, _ in [ktfbib(bibsource) for bibsource in ld['bibsources'].split(",,,")]:
            common.ValueSetReference(valueset=vs, source=data['Source'][k])
    DBSession.flush()

    #To CLDF
    # dict used as an ordered de-duplicating set of rows
    cldf = {}
    for ld in ldps:
        parameter = data['Feature'][ld['feature_alphanumid']]
        language = data['ntsLanguage'][ld['language_id']]
        id_ = '%s-%s' % (parameter.id, language.id)
        if not id_ in done:
            continue
        dt = (lgs[ld['language_id']], ld['language_id'], ld['feature_alphanumid'] + ". " + ld['feature_name'], ld["value"]) #, ld["comment"], ld["source"], bibliographical_details(ld.get('bibsources', "").split(",,,"))
        cldf[dt] = None
        
        
    tab = lambda rows: u''.join([u'\t'.join(row) + u"\n" for row in rows])
    savu(tab([("Language", "iso-639-3", "Feature", "Value")] + cldf.keys()), "nts.cldf", encoding = "utf-8") #utf-16 "Comment", "Source", "Bibliographical Details"



    #cldf = {}
    #for ld in ldps:
    #    parameter = data['Feature'][ld['feature_alphanumid']]
    #    language = data['ntsLanguage'][ld['language_id']]
    #    id_ = '%s-%s' % (parameter.id, language.id)
    #    if not id_ in done:
    #        continue
    #    dt = (lgs[ld['language_id']], ld['language_id'], ld['feature_alphanumid'] + ". " + ld['feature_name'], ld["value"], ld["comment"], ld["source"], bibliographical_details(ld.get('bibsources', "").split(",,,")), ld.get("feature_information", ""), ld.get('feature_possible_values', ""), ld["designer"], ld.get("abbreviation", ""), ld["feature_domain"], ld.get('francais', ""), ld.get("dependencies", ""), ld.get("draft of clarifying comments to outsiders (hedvig + dunn + harald + suzanne)", ""))
    #    cldf[dt] = None
    
    #savu(tab([("Language", "iso-639-3", "Feature", "Value", "Comment", "Source", "Bibliographical Details", "Feature Information", "Feature Possible Values", "Feature Designer", "Feature Abbreviation", "Feature Domain", "Feature (French)", "Feature Dependencies", "Feature Clarifying Comments")] + cldf.keys()), "nts-with-metadata.tsv", encoding="utf-16")

    
    args.log.info('%s Errors' % len(errors))

    dataset = common.Dataset(
        id="NTS",
        name='Nijmegen Typological Survey',
        publisher_name="Max Planck Institute for Psycholinguistics",
        publisher_place="Nijmegen",
        publisher_url="http://www.mpi.nl",
        description="""Dataset on Typological Features, collected 2013-2014 in the Language and Cognition Department at the Max Planck Institute for Psycholinguistics, Max-Planck Gesellschaft, and a European Research Council's Advanced Grant (269484 "INTERACT") to Stephen C. Levinson.""",
        domain='http://nts.clld.org',
        published=date(2014, 2, 20),
        contact='*****@*****.**',
        license='http://creativecommons.org/licenses/by-nc-nd/2.0/de/deed.en',
        jsondata={
            'license_icon': 'http://wals.info/static/images/cc_by_nc_nd.png',
            'license_name': 'Creative Commons Attribution-NonCommercial-NoDerivs 2.0 Germany'})

    for i, contributor in enumerate([
        common.Contributor(
            id="Harald Hammarstrom",
            name="Harald Hammarstrom",
            email="*****@*****.**"),
        common.Contributor(
            id="Suzanne van der Meer",
            name="Suzanne van der Meer",
            email="*****@*****.**"),
        common.Contributor(
            id="Hedvig Skirgard",
            name="Hedvig Skirgard",
            email="*****@*****.**")
    ]):
        common.Editor(dataset=dataset, contributor=contributor, ord=i)

    DBSession.add(dataset)
Example #12
0
def bibliographical_details(bibsources):
    """Render a list of raw bibsource strings as one "; "-joined citation text.

    Each non-blank entry is parsed with ``ktfbib`` into ``(key, (type, fields))``,
    its field values are unescaped, and the resulting BibTeX ``Record`` is
    rendered via ``.text()``.

    NOTE(review): Python 2 code — uses ``dict.iteritems()``.

    :param bibsources: iterable of raw bibsource strings.
    :return: unicode string of rendered citations separated by "; ".
    """
    ktfs = [ktfbib(bibsource) for bibsource in bibsources if bibsource.strip()]
    return u"; ".join([Record(t, k, **{k: bibtex.unescape(v) for (k, v) in f.iteritems()}).text() for (k, (t, f)) in ktfs])
Example #13
0
def get_contributors(rec, data):
    """Yield ids of members whose name fuzzily matches an author of ``rec``.

    The record's ``author`` field is split on "and"; each author is compared
    against every object in ``data['Member']`` using a token-sort ratio, and
    the member id is yielded on a match of 92 or better.

    :param rec: BibTeX record with an ``author`` field.
    :param data: mapping with a ``'Member'`` dict of id -> object with ``.name``.
    :return: generator of matching member ids (may yield duplicates if several
        authors match the same member).
    """
    # raw string: '\s' in a plain literal is an invalid escape sequence
    # (DeprecationWarning since Python 3.6, error in future versions)
    for author in re.split(r'\s+and\s+', unescape(rec['author'])):
        for cid, obj in data['Member'].items():
            if fuzz.token_sort_ratio(author, obj.name) >= 92:
                yield cid
Example #14
0
def main(args):
    """Populate the LSI database from the tab sheets in a hard-coded directory.

    Wipes the existing tables, then loads languages, families, feature
    domains, designers, features, domain elements and value sets, writes a
    CLDF-style file and the dataset metadata.

    NOTE(review): Python 2 code — ``date(2016, 05, 16)`` uses a leading-zero
    literal that is a syntax error in Python 3; data is read from an absolute
    local path, so this only runs on the original author's machine.
    """
    data = Data(created=utc.localize(datetime(2013, 11, 15)),
                updated=utc.localize(datetime(2013, 12, 12)))
    icons = issues.Icons()
    #print icons

    # start from a clean slate: delete all rows from every touched table
    DBSession.execute("delete from Language")
    DBSession.execute("delete from Unit")
    DBSession.execute("delete from featuredomain")
    DBSession.execute("delete from family")
    DBSession.execute("delete from source")
    DBSession.execute("delete from parameter")
    DBSession.execute("delete from feature")
    DBSession.execute("delete from domainelement")
    DBSession.execute("delete from valueset")
    DBSession.execute("delete from value")
    DBSession.execute("delete from lsivalue")
    DBSession.execute("delete from dataset")
    DBSession.execute("delete from contributor")
    DBSession.execute("delete from lsilanguage")
    DBSession.execute("delete from contribution")
    DBSession.execute("delete from designer")

    DBSession.flush()

    dtab = partial(_dtab, args.data_file())

    #Languages

    #print args.data_file()
    #tabfns = ['%s' % fn.basename() for fn in args.data_file().files('nts_*.tab')]
    #tabfns = ['nts_18.tab']
    ##tabfns = os.listdir('/Users/virk/shafqat/postDoc-Swe/project/clld/clld/lsi/lsi/data')[1:]
    # hard-coded absolute path; [1:] presumably skips a dot-file — TODO confirm
    tabfns = os.listdir(
        '/Users/virk/shafqat/postDoc-Swe/project/clld/clld/lsi/lsi/lsi_data'
    )[1:]
    #print tabfns
    args.log.info("Sheets found: %s" % tabfns)
    ldps = []
    lgs = {}
    # nfeatures: non-'?' datapoints per language; nlgs: languages per feature
    nfeatures = Counter()
    nlgs = Counter()

    for fn in tabfns:
        for ld in dtab(fn):

            if ld['language_id'] == 'qgr' or ld['language_id'] == '---' or ld[
                    'language_id'] == '':  # to exclude languages which do not have an iso-code
                continue
            if "feature_alphanumid" not in ld:
                args.log.info("NO FEATUREID %s %s" % (len(ld), ld))
            # keep only rows whose feature id is not "DRS*" and has no "."
            if not ld["feature_alphanumid"].startswith("DRS") \
                    and ld["feature_alphanumid"].find(".") == -1:
                ldps.append(dp_dict(ld))
                ##print ld
                lgs[ld['language_id']] = unescape(ld['language_name'])
                if ld["value"] != "?":
                    nfeatures.update([ld['language_id']])
                    nlgs.update([ld['feature_alphanumid']])

    # sort once by feature id so the later groupby calls see contiguous groups
    ldps = sorted(ldps, key=lambda d: d['feature_alphanumid'])

    #lgs["ygr"] = "Hua"

    for lgid, lgname in lgs.items():
        data.add(models.lsiLanguage,
                 lgid,
                 id=lgid,
                 name=lgname,
                 representation=nfeatures.get(lgid, 0))
    DBSession.flush()
    #print "I am here"
    #print data['ntsLanguage'].values()[1].id
    load_families(
        data,
        ##[(NOCODE_TO_GLOTTOCODE.get(l.id, l.id), l) for l in data['lsiLanguage'].values()],
        [(NOCODE_TO_GLOTTOCODE.get(l.id, l.id), l)
         for l in data['lsiLanguage'].values()
         if l.id != '---' and l.id != ''],
        isolates_icon='tcccccc')
    #print 'family'
    #print data['Family'].get('sino1245').jsondata
    #Domains
    for domain in set(ld['feature_domain'] for ld in ldps):
        data.add(models.FeatureDomain, domain, name=domain)
    DBSession.flush()

    #Designers
    # hard-coded placeholder designers instead of reading contribution sheets
    #for i, info in enumerate(dtab("ntscontributions.tab") + dtab("ntscontacts.tab")):
    for i, info in enumerate([{
            'designer': 'shafqat',
            'domain': '',
            'pdflink': '',
            'citation': ''
    }, {
            'designer': '-',
            'domain': '',
            'pdflink': '',
            'citation': ''
    }]):
        designer_id = str(i + 1)
        data.add(models.Designer,
                 info['designer'],
                 id=designer_id,
                 name=designer_id,
                 domain=info["domain"],
                 contributor=info['designer'],
                 pdflink=info["pdflink"],
                 citation=info["citation"])
    DBSession.flush()

    #Sources
    '''for k, (typ, bibdata) in [
        ktfbib(bibsource) for ld in ldps
        if ld.get(u'bibsources') for bibsource in ld['bibsources'].split(",,,")
    ]:
        if k not in data["Source"]:
            data.add(common.Source, k, _obj=bibtex2source(Record(typ, k, **bibdata)))
    DBSession.flush()'''

    #Features
    fs = [(fid, mergeds(lds))
          for fid, lds in groupby(ldps, key=lambda d: d['feature_alphanumid'])]

    # detect features whose possible-values description differs across files
    fvdesc = [(fid, [
        (ld.get("feature_possible_values"), ld.get("fromfile")) for ld in lds
        if ld.get("feature_possible_values")
    ]) for fid, lds in groupby(ldps, key=lambda d: d['feature_alphanumid'])]
    fvdt = [(fid, grp2(vdescs)) for (fid, vdescs) in fvdesc]
    fvmis = [(fid, vdescs) for (fid, vdescs) in fvdt if len(vdescs) > 1]

    # sanity check: feature names must be unique
    for _, dfsids in groupby(sorted(
        (f.get('feature_name', fid), fid) for fid, f in fs),
                             key=lambda t: t[0]):
        ##print [(k,v) for (k,v) in list(dfsids)],len(list(dfsids))
        assert len(list(dfsids)) == 1
    #print 'here is nlgs'

    for fid, f in fs:
        #print "lang name"
        #print ldps
        #print f.get('feature_possible_values', ""),
        if not fid.isdigit():
            args.log.info("NO INT FID %s" % f)
        feature = data.add(
            models.Feature,
            fid,
            id=fid,
            name=f.get('feature_name', f['feature_alphanumid']),
            doc=f.get('feature_information', ""),
            vdoc=f.get('feature_possible_values', ""),
            representation=nlgs.get(fid, 0),
            designer=data["Designer"][f['designer']],
            dependson=f.get("depends_on", ""),
            abbreviation=f.get("abbreviation", ""),
            featuredomain=data['FeatureDomain'][f["feature_domain"]],
            name_french=f.get('francais', ""),
            clarification=f.get(
                "draft of clarifying comments to outsiders (hedvig + dunn + harald + suzanne)",
                ""),
            alternative_id=f.get("old feature number", ""),
            jl_relevant_unit=f.get("relevant unit(s)", ""),
            jl_function=f.get("function", ""),
            jl_formal_means=f.get("formal means", ""),
            sortkey_str="",
            sortkey_int=int(fid))

        # vdoc format: "v1==desc1||v2==desc2||..."; '.' in values becomes '-'
        vdesclist = [veq.split("==") for veq in feature.vdoc.split("||")]
        vdesc = {v.replace(".", "-"): desc for [v, desc] in vdesclist}
        ##vdesc = {fmly+val2icon(v): desc for ((v,desc),fmly) in itertools.product([(vv,desc) for [vv, desc] in vdesclist],['c','d','f','t'])}

        vdesc.setdefault('?', 'Not known')
        if 'N/A' not in vdesc and feature.dependson:
            vdesc["N/A"] = "Not Applicable"
        vi = {v: i for (i, v) in enumerate(sorted(vdesc.keys()))}
        ##vicons = {f+val2icon(v):f+val2icon(v) for (v,f) in itertools.product(['0','1','2','3'],['c','d','f','t'])}
        ##vicons['?'] = 'c00ffff'
        ##vicons['N/A'] = 'c00ffff'
        ##vicons = icons.iconize(vi.keys())
        for (v, desc) in vdesc.items():
            #print v,vicons[v]
            data.add(common.DomainElement, (fid, v),
                     id='%s-%s' % (fid, v),
                     name=v,
                     description=desc,
                     jsondata={"icon": Colors[v]},
                     number=vi[v],
                     parameter=feature)
    DBSession.flush()

    # warn about (feature, language) pairs coded with conflicting values
    for ((f, lg), ixs) in grp2([((ld['feature_alphanumid'], ld['language_id']),
                                 i) for i, ld in enumerate(ldps)]):
        ixvs = set([ldps[ix]['value'] for ix in ixs])
        if len(ixvs) == 1:
            continue
        args.log.warn("Dup value %s %s %s" %
                      (f, lg, [(ldps[ix]['value'], ldps[ix]['fromfile'])
                               for ix in ixs]))
        ##print "Dup value %s %s %s" % (f, lg, [(ldps[ix]['value'], ldps[ix]['fromfile'], ldps[ix].get('provenance')) for ix in ixs])
    errors = {}
    done = set()
    glottolog = Glottolog()
    for ld in ldps:

        ############################### for printing different map markers for different familys for features:shafqat
        #print data['Family']

        language = data['lsiLanguage'][ld['language_id']]
        if isinstance(language, (tuple, list)) and len(language) == 2:
            code, language = language
        else:
            code = language.id
        # look up the glottolog family for the map-marker icon
        # NOTE(review): 'family' may be unbound/stale if code == '-' or the
        # lookups fail — carried over from the previous iteration. Verify.
        if code != '-':
            gl_language = glottolog.languoid(code)
            if gl_language:
                gl_family = gl_language.family
                if gl_family:
                    family = data['Family'].get(gl_family.id)

        ##ld['value'] = ld['value']+'-'+str(family)
        ##ld['value'] = combineValueFamily(ld['value'],str(family))
        #print family
        #####################################
        parameter = data['Feature'][ld['feature_alphanumid']]
        language = data['lsiLanguage'][ld['language_id']]

        id_ = '%s-%s' % (parameter.id, language.id)
        # first coding of a (feature, language) pair wins; duplicates skipped
        if id_ in done:
            continue

        if (ld['feature_alphanumid'],
                ld['value']) not in data['DomainElement']:
            if not ld["value"].strip():
                continue
            # value not among the declared possible values: record as error
            info = (ld['feature_alphanumid'],
                    ld.get('feature_name', "[Feature Name Lacking]"),
                    ld['language_id'], ld['value'], ld['fromfile'])
            msg = u"%s %s %s %s %s not in the set of legal values ({0})" % info
            args.log.error(
                msg.format(
                    sorted([
                        y for (x, y) in data['DomainElement'].keys()
                        if x == ld['feature_alphanumid']
                    ])))
            ##print msg.format(sorted(
            ##  [y for (x, y) in data['DomainElement'].keys()
            ## if x == ld['feature_alphanumid']]))
            errors[(ld['feature_alphanumid'], ld['language_id'])] = info
            continue

        vs = common.ValueSet(
            id=id_,
            language=language,
            parameter=parameter,
            source=ld["source"] or None,
            ##contribution=parameter.designer
        )
        #print
        #print "this one"
        #print ld['value'],family
        models.lsiValue(
            id=id_,
            domainelement=data['DomainElement'][(ld['feature_alphanumid'],
                                                 ld['value'])],
            jsondata={
                "icon":
                data['DomainElement'][(ld['feature_alphanumid'],
                                       ld['value'])].jsondata,
                "family":
                FamilyCodes[str(family)]
            },
            comment=ld["comment"],
            valueset=vs,
            contributed_datapoint=ld["contributor"])
        done.add(id_)
        '''if not ld.get('bibsources'):
            if 'bibsources' not in ld:
                args.log.warn("no bibsource %s" % ld)
            continue
        for k, _ in [ktfbib(bibsource) for bibsource in ld['bibsources'].split(",,,")]:
            common.ValueSetReference(valueset=vs, source=data['Source'][k])'''
    DBSession.flush()

    #To CLDF
    # dict used as an ordered de-duplicating set of rows
    cldf = {}
    for ld in ldps:
        parameter = data['Feature'][ld['feature_alphanumid']]
        language = data['lsiLanguage'][ld['language_id']]
        id_ = '%s-%s' % (parameter.id, language.id)
        if not id_ in done:
            continue
        dt = (lgs[ld['language_id']], ld['language_id'],
              ld['feature_alphanumid'] + ". " + ld['feature_name'],
              ld["value"])
        cldf[dt] = None

    tab = lambda rows: u''.join([u'\t'.join(row) + u"\n" for row in rows])
    savu(tab([("Language", "iso-639-3", "Feature", "Value")] + cldf.keys()),
         "lsi.cldf")

    args.log.info('%s Errors' % len(errors))

    dataset = common.Dataset(
        id="LSI",
        name='Linguistic Survey of India',
        publisher_name="Sprakbanken",
        publisher_place="Gothenburg",
        publisher_url="to be given",
        description="this is to be followed",
        domain='http://lsi.clld.org',
        published=date(2016, 05, 16),
        contact='*****@*****.**',
        license='http://creativecommons.org/licenses/by-nc-nd/2.0/de/deed.en',
        jsondata={
            'license_icon':
            'http://wals.info/static/images/cc_by_nc_nd.png',
            'license_name':
            'Creative Commons Attribution-NonCommercial-NoDerivs 2.0 Germany'
        })

    # disabled for experimental purposes, names were appearing multiple times
    for i, contributor in enumerate([
            common.Contributor(id="Lars Borin",
                               name="Lars Borin",
                               email="*****@*****.**"),
            common.Contributor(id="Shafqat Mumtaz Virk",
                               name="Shafqat Mumtaz Virk",
                               email="*****@*****.**"),
            common.Contributor(id="Anju Saxena",
                               name="Anju Saxena",
                               email="*****@*****.**"),
            common.Contributor(id="Harald Hammarstrom",
                               name="Harald Hammarstrom",
                               email="*****@*****.**")
    ]):
        #print i
        common.Editor(dataset=dataset, contributor=contributor, ord=i)
    '''cont1 = common.Contributor(
            id="Harald Hammarstrom",
            name="Harald Hammarstrom",
            email="*****@*****.**")
    cont2= common.Contributor(
            id="Shafqat Mumtaz Virk",
            name="Shafqat Mumtaz Virk",
            email="*****@*****.**")
    cont3 = common.Contributor(
            id="Lars Borin",
            name="Lars Borin",
            email="*****@*****.**")
    for contributor in [cont1,cont2,cont3]:
        common.Editor(dataset=dataset, contributor=contributor,ord=1)'''

    DBSession.add(dataset)
    DBSession.flush()
Example #15
0
    def test_unescape(self):
        """Check that the LaTeX escape ``\\ss`` unescapes to the Unicode sharp s."""
        from clld.lib.bibtex import unescape

        assert unescape(r"\ss ") == 'ß'