Example 1
def get_datasets(spec, ep=ENTRY_POINT, glob=False):
    if spec == '*':
        return list(iter_datasets(ep))
    if glob:
        return nfilter(
            dataset_from_module(p) for p in pathlib.Path('.').glob(spec))
    return nfilter([get_dataset(spec, ep=ep)])
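For context: nfilter, as used throughout these examples, comes from clldutils.misc and simply drops falsy items from an iterable, returning a list. A minimal sketch of that behavior:

from clldutils.misc import nfilter

# Only truthy items survive, and the result is always a list.
assert nfilter(['a', '', None, 'b']) == ['a', 'b']
assert nfilter(i for i in range(3)) == [1, 2]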
Example 2
def get_datasets(spec,
                 ep=ENTRY_POINT,
                 glob: bool = False) -> typing.List[Dataset]:
    """
    :param spec: Either `'*'` to get all datasets for a specific entry point, or a glob pattern \
    matching dataset modules in the current directory (if `glob == True`), or a `str` as accepted \
    by :func:`get_dataset`.
    """
    if spec == '*':
        return list(iter_datasets(ep))
    if glob:
        return nfilter(
            dataset_from_module(p) for p in pathlib.Path('.').glob(spec))
    return nfilter([get_dataset(spec, ep=ep)])
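Hypothetical calls illustrating the three modes of spec described in the docstring (the names and the glob pattern below are made up):

# get_datasets('*')                         -> every dataset registered for the entry point
# get_datasets('lexibank_*.py', glob=True)  -> datasets built from the modules matching the
#                                              glob pattern in the current directory
# get_datasets('mydataset')                 -> a single dataset resolved via get_dataset()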
Example 3
 def col_defs(self):
     get_param = lambda v: v.valueset.parameter
     get_lang = lambda v: v.valueset.language
     core = nfilter([
         CognateCol(self, 'name'),
         PhoneticCol(self, 'phonetic')
         if not (self.language and self.language.proto) else None,
         Col(self, 'description', sTitle='Meaning'),
         Col(self, 'comment',
             model_col=Counterpart.comment,
             format=lambda i: markup_italic(i.comment)),
     ])
     if self.language:
         if self.language.id == 'psi':
             return [
                 LinkCol(self, 'lemma', get_object=get_param, model_col=Parameter.name)] + \
                 core + [
                 RootExtCol(
                     self,
                     'reconstruction_with_root_extension_code',
                     model_col=Entry.psi_reconstruction_with_root_extension_code),
                 RefsCol(self, 'sources')]
         return [
             LinkCol(self, 'lemma', get_object=get_param, model_col=Parameter.name)] +\
             core + [RefsCol(self, 'sources')]
     if self.parameter:
         return [
             LanguageCol(
                 self, 'language', model_col=Language.name, get_object=get_lang)] +\
             core + [RefsCol(self, 'sources')]
     return [
         LinkCol(self, 'lemma', get_object=get_param, model_col=Parameter.name),
         LanguageCol(self, 'language', model_col=Language.name, get_object=get_lang)] +\
         core
Example 4
def refs(api, glottolog, sheet):
    glottolog = Glottolog(glottolog)
    languoid, lang = glottolog.api.languoid(sheet.glottocode), None

    # Determine the associated language-level languoid:
    if languoid.level.name == 'dialect':  # pragma: no cover
        for _, gc, _ in reversed(languoid.lineage):
            lang = glottolog.api.languoid(gc)
            if lang.level.name == 'language':
                break
    else:
        lang = languoid

    ids = set(
        nfilter([
            languoid.id, languoid.hid, languoid.iso, lang.id, lang.hid,
            lang.iso
        ]))
    bibs = Bibs(glottolog, api)

    lgks = collections.defaultdict(set)
    for key, code in bibs.iter_codes():
        if code in ids:
            lgks[languoid.id].add(key)

    def source(key):
        type_, fields = bibs[key]
        return key, type_, fields.get('author', fields.get('editor',
                                                           '-')), fields.get(
                                                               'year', '-')

    unresolved = collections.Counter()
    res = bibdata(sheet, list(sheet.iter_row_objects(api)), bibs, lgks,
                  unresolved)
    return list(res), unresolved, [source(k) for k in lgks[languoid.id]]
Example 5
def add(concept, data, names, contrib):
    domain = data['Domain'].get(concept.code)
    if domain is None:
        domain = data.add(
            models.Domain, concept.code,
            id=concept.code,
            name=concept.code_eng,
            description=concept.code_fr)
    scid = '-'.join([concept.code, concept.subcode.replace('.', '_')])
    subdomain = data['Subdomain'].get(scid)
    if subdomain is None:
        subdomain = data.add(
            models.Subdomain, scid,
            id=scid,
            name=concept.subcode_eng,
            description=concept.sous_code_fr,
            domain=domain)

    cid = '%05d' % int(concept.ref)
    if concept.English in names:
        name = '%s (%s)' % (concept.English, names[concept.English] + 1)
    else:
        name = concept.English
    names[concept.English] += 1

    c = data['Concept'].get(cid)
    if c is None:
        c = data.add(
            models.Concept, cid,
            core=concept.core == '1',
            id=cid,
            name=name,
            description=concept.Francais,
            subdomain=subdomain,
            jsondata=dict(ref='%s-%s-%s' % (concept.code, concept.subcode, concept.subsubcode)))
        if concept.species:
            c.species = concept.species
        if concept.family:
            c.family = concept.family
    else:
        assert cid == '50325'

    for gc, forms in concept.forms.items():
        lang = data['Languoid'].get(gc)
        assert lang
        if not forms:
            continue

        vs = common.ValueSet(
            id='-'.join([cid, gc]),
            language=lang,
            contribution=contrib,
            parameter=c)
        for j, form in enumerate(nfilter(util.split_words(forms))):
            attrs = util.parse_form(form)
            if attrs['name'] and attrs['name'] != 'xxx':
                models.Counterpart(
                    id='-'.join([vs.id, str(j + 1)]),
                    valueset=vs,
                    **attrs)
Example 6
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    for vs in DBSession.query(common.ValueSet).options(
            joinedload(common.ValueSet.values)):
        d = []
        for generic_term, words in groupby(
            sorted(vs.values, key=lambda v: v.description), key=lambda v: v.description
        ):
            if generic_term:
                generic_term += ': '
            else:
                generic_term = ''
            d.append(generic_term + ', '.join(nfilter([w.name for w in words])))

        vs.description = '; '.join(d)
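        # e.g. a ValueSet description built here could end up as
        # 'shrub: hazel; tree: oak, beech' (word names grouped under their generic term,
        # groups sorted by that term).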

    for model in [models.Country, models.Ecoregion]:
        for instance in DBSession.query(model).options(
                joinedload(getattr(model, 'taxa'))
        ):
            if not instance.taxa:
                instance.active = False
Example 7
def run(args):
    dataset = get_dataset(args)
    with update(dataset.dir / '.zenodo.json', indent=4, default=collections.OrderedDict()) as md:
        modules = ['cldf:' + spec.module for spec in dataset.cldf_specs_dict.values()]
        contribs = dataset.dir / 'CONTRIBUTORS.md'
        creators, contributors = get_creators_and_contributors(
            contribs.read_text(encoding='utf8') if contribs.exists() else '', strict=False)
        if creators:
            md['creators'] = [contrib(p) for p in creators]
        if contributors:
            md["contributors"] = [contrib(p) for p in contributors]
        communities = [r["identifier"] for r in md.get("communities", [])] + \
                      [c.strip() for c in nfilter(args.communities.split(','))]
        if communities:
            md['communities'] = [
                {"identifier": community_id} for community_id in sorted(set(communities))]
        md.update(
            {
                "title": dataset.metadata.title,
                "access_right": "open",
                "keywords": sorted(set(md.get("keywords", []) + ["linguistics"] + modules)),
                "upload_type": "dataset",
            }
        )
        if dataset.metadata.citation:
            md['description'] = "<p>Cite the source of the dataset as:</p>\n\n" \
                                "<blockquote>\n<p>{}</p>\n</blockquote>".format(
                html.escape(dataset.metadata.citation))
        if dataset.metadata.zenodo_license:
            md['license'] = {'id': dataset.metadata.zenodo_license}
Example 8
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    for vs in DBSession.query(common.ValueSet).options(
            joinedload(common.ValueSet.values)):
        d = []
        for generic_term, words in groupby(sorted(vs.values,
                                                  key=lambda v: v.description),
                                           key=lambda v: v.description):
            if generic_term:
                generic_term += ': '
            else:
                generic_term = ''
            d.append(generic_term + ', '.join(nfilter([w.name
                                                       for w in words])))

        vs.description = '; '.join(d)

    for model in [models.Country, models.Ecoregion]:
        for instance in DBSession.query(model).options(
                joinedload(getattr(model, 'taxa'))):
            if not instance.taxa:
                instance.active = False
Example 9
 def format(self, item):
     vs = self.get_obj(item)
     return ', '.join(
         nfilter([
             getattr(vs, 'source', None),
             linked_references(self.dt.req, vs)
         ]))
Example 10
def source_to_refs(src, lgid, e, lgks, unresolved, fixrefs=None):
    fixrefs = fixrefs or REFS
    ays = list(iter_ayps(src))
    refs = sorted(set(ref for s in ays
                      for ref in iter_key_pages(lgid, s, e, lgks)),
                  key=lambda r: (r[0], r[1] or ''))
    src_comment = None
    if not refs:
        if repageonly.match(src):
            src = "[%s] default source:%s" % (lgid, src)
            print("PAGEONLY:", src, lgid)
        elif not (src.find("p.c") == -1 and src.find("personal communication")
                  == -1 and src.find("pers comm") == -1 and
                  src.find("pers. comm") == -1 and src.find("ieldnotes") == -1
                  and src.find("ield notes") == -1 and src.find("forth") == -1
                  and src.find("Forth") == -1 and src.find("ubmitted") == -1
                  and src.find("o appear") == -1 and src.find("in press") == -1
                  and src.find("in prep") == -1 and src.find("in prog") == -1
                  and not src.startswith("http")):
            src_comment = src
        else:
            if ays:
                for author, year, pages, word_from_title in ays:
                    if (author, year, lgid) in fixrefs:
                        refs.append((fixrefs[(author, year, lgid)], pages))
                    else:
                        unresolved.update([(author, year, lgid)])
            else:
                unresolved.update([(src, lgid)])
    return [(k, nfilter(r[1] for r in rs))
            for k, rs in groupby(refs, lambda r: r[0])], src_comment
Example 11
def add_sources(args, data):
    bib = Database.from_file(args.data_file('phoible-references.bib'),
                             lowercase=True)
    ext = [
        Record.from_string('@' + s, lowercase=True)
        for s in nfilter(BIB.split('@'))
    ]

    for rec in chain(ext, bib):
        if rec.id not in data['Source']:
            data.add(Source, rec.id, _obj=bibtex2source(rec))

    #
    # Add aliases so that records whose bibtex keys carry a numeric prefix can also be
    # looked up without specifying the prefix.
    #
    for key in list(data['Source'].keys()):
        if '_' in key:
            no, rem = key.split('_', 1)
            try:
                int(no)
                if rem not in data['Source']:
                    data['Source'][rem] = data['Source'][key]
            except (ValueError, TypeError):
                pass
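To illustrate the aliasing loop above with an invented key:

# '12_meier2004' is split into the prefix '12' and the remainder 'meier2004';
# since the prefix is numeric, the same Source record also becomes reachable
# under 'meier2004' (unless that key already exists).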
Example 12
def get_ts_search_string(s_):
    """Converts a search string into a ts_query conform syntax
    - a " " will be replaced by " & "
    - a :* will be append to each search term for partial matching ("starts with")
    """
    # if any special character appear return None to let handle plainto_tsquery() the search
    if any(e in s_ for e in ["'", '*', ':', '&', '|', '(', ')', '!']):
        return None

    # As when the tsvector is created:
    # - "_" and "-" are replaced by "." to avoid tokenizing
    # - ",", "\t", "\r" and "\n" are replaced by a space so they act as search separators
    s = re.sub(r'[,\t\r\n]', ' ', re.sub(r'[_\-]', '.', s_))

    search_items = set(nfilter(re.split(' +', s.replace('"', ''))))
    search_items = nfilter([a.strip() for a in search_items])
    return ' & '.join(['%s:*' % (a) for a in search_items])
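A sketch of what the conversion produces (term order may vary, since a set is used):

# get_ts_search_string('foo-bar baz')  -> 'foo.bar:* & baz:*'  (or 'baz:* & foo.bar:*')
# get_ts_search_string('a & b')        -> None  (special character: fall back to plainto_tsquery)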
Example 13
 def feature_properties(self, ctx, req, valueset):
     return {
         'values':
         list(valueset.values),
         'label':
         ', '.join(nfilter(v.name for v in valueset.values))
         or self.get_language(ctx, req, valueset).name
     }
Example 14
class DocumentType(ConfigObject):
    rank = attr.ib(converter=int)
    id = attr.ib()
    name = attr.ib()
    description = attr.ib()
    abbv = attr.ib()
    bibabbv = attr.ib()
    webabbr = attr.ib()
    triggers = attr.ib(converter=lambda s: nfilter(s.split('\n')))
Example 15
def load_ecoregions(data_file, data):
    ecoregions = jsonload(data_file('ecoregions.json'))['features']

    biome_map = {
        1: ('Tropical & Subtropical Moist Broadleaf Forests', '008001'),
        2: ('Tropical & Subtropical Dry Broadleaf Forests', '557715'),
        3: ('Tropical & Subtropical Coniferous Forests', ''),
        4: ('Temperate Broadleaf & Mixed Forests', ''),
        5: ('Temperate Conifer Forests', ''),
        6: ('Boreal Forests/Taiga', ''),
        7: ('Tropical & Subtropical Grasslands, Savannas & Shrublands', '98ff66'),
        8: ('Temperate Grasslands, Savannas & Shrublands', ''),
        9: ('Flooded Grasslands & Savannas', '0265fe'),
        10: ('Montane Grasslands & Shrublands', 'cdffcc'),
        11: ('Tundra', ''),
        12: ('Mediterranean Forests, Woodlands & Scrub', 'cc9900'),
        13: ('Deserts & Xeric Shrublands', 'feff99'),
        14: ('Mangroves', '870083'),
    }

    for eco_code, features in groupby(
            sorted(ecoregions, key=lambda e: e['properties']['eco_code']),
            key=lambda e: e['properties']['eco_code']):
        features = list(features)
        props = features[0]['properties']
        if int(props['BIOME']) not in biome_map:
            continue
        biome = data['Biome'].get(props['BIOME'])
        if not biome:
            name, color = biome_map[int(props['BIOME'])]
            biome = data.add(Biome,
                             props['BIOME'],
                             id=str(int(props['BIOME'])),
                             name=name,
                             description=color or 'ffffff')
        centroid = (None, None)
        f = sorted(features, key=lambda _f: _f['properties']['AREA'])[-1]
        if f['geometry']:
            coords = f['geometry']['coordinates'][0]
            if f['geometry']['type'] == 'MultiPolygon':
                coords = coords[0]
            centroid = get_center(coords)

        polygons = nfilter([_f['geometry'] for _f in features])
        data.add(Ecoregion,
                 eco_code,
                 id=eco_code,
                 name=props['ECO_NAME'],
                 description=props['G200_REGIO'],
                 latitude=centroid[1],
                 longitude=centroid[0],
                 biome=biome,
                 area=props['area_km2'],
                 gbl_stat=Ecoregion.gbl_stat_map[int(props['GBL_STAT'])],
                 realm=Ecoregion.realm_map[props['REALM']],
                 jsondata=dict(polygons=polygons))
Example 16
 def feature_properties(self, ctx, req, valueset):
     return {
         'values':
         list(valueset.values),
         'label':
         ', '.join(nfilter(v.name for v in valueset.values))
         if valueset.parameter.contribution.id == 'Wordlist' else
         self.get_language(ctx, req, valueset).name
     }
Example 17
def parse_coords(s):
    cc = nfilter(ss.strip().replace(' ', '') for ss in re.split('[,;]', s))
    res = []
    for i in range(0, len(cc), 2):
        try:
            res.append(Coordinate(cc[i], cc[i + 1]))
        except ValueError:
            continue
    return res
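A sketch of the intended parsing, assuming Coordinate accepts the two string components (the coordinate strings below are made up):

# parse_coords('12N30, 5E15; 13N00, 6E00')
#   -> [Coordinate('12N30', '5E15'), Coordinate('13N00', '6E00')]
# Pairs that Coordinate rejects with a ValueError are silently skipped.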
Example 18
def load_ecoregions(data_file, data):
    ecoregions = jsonload(data_file('ecoregions.json'))['features']

    biome_map = {
        1: ('Tropical & Subtropical Moist Broadleaf Forests', '008001'),
        2: ('Tropical & Subtropical Dry Broadleaf Forests', '557715'),
        3: ('Tropical & Subtropical Coniferous Forests', ''),
        4: ('Temperate Broadleaf & Mixed Forests', ''),
        5: ('Temperate Conifer Forests', ''),
        6: ('Boreal Forests/Taiga', ''),
        7: ('Tropical & Subtropical Grasslands, Savannas & Shrublands', '98ff66'),
        8: ('Temperate Grasslands, Savannas & Shrublands', ''),
        9: ('Flooded Grasslands & Savannas', '0265fe'),
        10: ('Montane Grasslands & Shrublands', 'cdffcc'),
        11: ('Tundra', ''),
        12: ('Mediterranean Forests, Woodlands & Scrub', 'cc9900'),
        13: ('Deserts & Xeric Shrublands', 'feff99'),
        14: ('Mangroves', '870083'),
    }

    for eco_code, features in groupby(
            sorted(ecoregions, key=lambda e: e['properties']['eco_code']),
            key=lambda e: e['properties']['eco_code']):
        features = list(features)
        props = features[0]['properties']
        if int(props['BIOME']) not in biome_map:
            continue
        biome = data['Biome'].get(props['BIOME'])
        if not biome:
            name, color = biome_map[int(props['BIOME'])]
            biome = data.add(
                Biome, props['BIOME'],
                id=str(int(props['BIOME'])),
                name=name,
                description=color or 'ffffff')
        centroid = (None, None)
        f = sorted(features, key=lambda _f: _f['properties']['AREA'])[-1]
        if f['geometry']:
            coords = f['geometry']['coordinates'][0]
            if f['geometry']['type'] == 'MultiPolygon':
                coords = coords[0]
            centroid = get_center(coords)

        polygons = nfilter([_f['geometry'] for _f in features])
        data.add(
            Ecoregion, eco_code,
            id=eco_code,
            name=props['ECO_NAME'],
            description=props['G200_REGIO'],
            latitude=centroid[1],
            longitude=centroid[0],
            biome=biome,
            area=props['area_km2'],
            gbl_stat=Ecoregion.gbl_stat_map[int(props['GBL_STAT'])],
            realm=Ecoregion.realm_map[props['REALM']],
            jsondata=dict(polygons=polygons))
Example 19
 def merged_refs(self, type):
     assert type in ['sub', 'family']
     res = defaultdict(set)
     for m in Reference.pattern.finditer(getattr(self, type) or ''):
         res[m.group('key')].add(m.group('pages'))
     for ref in getattr(self, type + 'refs'):
         res[ref.key].add(ref.pages)
     return [
         Reference(key=key, pages=';'.join(sorted(nfilter(pages))) or None)
         for key, pages in res.items()
     ]
Example 20
 def __init__(self, req, *args, **kw):
     Parameters.__init__(self, req, *args, **kw)
     if kw.get('languages'):
         self.languages = kw['languages']
     elif 'languages' in req.params:
         self.languages = nfilter([
             Language.get(id_, default=None)
             for id_ in req.params['languages'].split(',')])
     else:
         self.languages = []
     self._langs = [
         aliased(ValueSet, name='l%s' % i) for i in range(len(self.languages))]
Example 21
class DocumentType(ConfigObject):
    """
    Document types categorize Glottolog references
    """
    rank = attr.ib(converter=int)  #:
    id = attr.ib()  #:
    name = attr.ib()  #:
    description = attr.ib()  #:
    abbv = attr.ib()
    bibabbv = attr.ib()
    webabbr = attr.ib()
    triggers = attr.ib(converter=lambda s: nfilter(s.split('\n')))
Example 22
File: text.py Project: cldf/cldfviz
def get_env(template_dir=None, fallback_template_dir=None):
    loader = jinja2.FileSystemLoader(searchpath=[
        str(d)
        for d in nfilter([template_dir, fallback_template_dir, TEMPLATE_DIR])
    ])
    env = jinja2.Environment(loader=loader,
                             trim_blocks=True,
                             lstrip_blocks=True)

    def paragraphs(s):
        return '\n\n'.join(s.split('\n'))

    env.filters['paragraphs'] = paragraphs
    return env
Example 23
 def serialize(obj):
     if obj is None:
         return ''
     if isinstance(obj, string_types):
         return obj
     if isinstance(obj, list):
         return ';'.join(list(sorted(nfilter(obj))))
     if isinstance(obj, tuple):
         if obj[0] is None:
             return ''
         return '{0:.6f} {1:.6f}'.format(*obj)
     if isinstance(obj, date):
         return obj.isoformat()
     raise ValueError(obj)  # pragma: no cover
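Roughly what the branches above produce:

# serialize(None)              -> ''
# serialize('abc')             -> 'abc'
# serialize(['b', '', 'a'])    -> 'a;b'  (falsy items dropped, the rest sorted)
# serialize((1.0, 2.5))        -> '1.000000 2.500000'
# serialize(date(2020, 1, 2))  -> '2020-01-02'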
Example 24
 def serialize(obj):
     if obj is None:
         return ''
     if isinstance(obj, string_types):
         return obj
     if isinstance(obj, list):
         return ';'.join(list(sorted(nfilter(obj))))
     if isinstance(obj, tuple):
         if obj[0] is None:
             return ''
         return '{0:.6f} {1:.6f}'.format(*obj)
     if isinstance(obj, date):
         return obj.isoformat()
     raise ValueError(obj)  # pragma: no cover
Example 25
 def __init__(self, req, *args, **kw):
     Parameters.__init__(self, req, *args, **kw)
     if kw.get('languages'):
         self.languages = kw['languages']
     elif 'languages' in req.params:
         self.languages = nfilter([
             Language.get(id_, default=None)
             for id_ in req.params['languages'].split(',')
         ])
     else:
         self.languages = []
     self._langs = [
         aliased(ValueSet, name='l%s' % i)
         for i in range(len(self.languages))
     ]
Example 26
 def split(self, item, value, lexemes=None):
     lexemes = lexemes or {}
     if value in lexemes:
         log.debug('overriding via lexemes.csv: %r -> %r' %
                   (value, lexemes[value]))
         value = lexemes[value]
     if self.normalize_unicode:
         value = unicodedata.normalize(self.normalize_unicode, value)
     res = misc.nfilter(
         self.clean(form, item=item)
         for form in text.split_text_with_context(
             value, separators=self.separators, brackets=self.brackets))
     if self.first_form_only:
         return res[:1]
     return res
Example 27
def split_text(text, separators=re.compile(r'\s'), brackets=None, strip=False):
    """Split text along the separators unless they appear within brackets.

    :param separators: An iterable of single characters or a compiled regex pattern.
    :param brackets: `dict` mapping start tokens to end tokens of what is to be \
    recognized as brackets.

    .. note:: This function will also strip content within brackets.
    """
    if not isinstance(separators, PATTERN_TYPE):
        separators = re.compile(r'[{0}]'.format(''.join(r'\{0}'.format(c)
                                                        for c in separators)))

    return nfilter(
        s.strip() if strip else s
        for s in separators.split(strip_brackets(text, brackets=brackets)))
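A sketch of the splitting behavior, assuming strip_brackets leaves bracket-free text unchanged:

# split_text('a, b,, c', separators=',')              -> ['a', ' b', ' c']
# split_text('a, b,, c', separators=',', strip=True)  -> ['a', 'b', 'c']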
Example 28
def get_creators_and_contributors(fname, strict=True):
    ctypes = {c.lower(): c for c in CONTRIBUTOR_TYPES}
    creators, contributors = [], []
    for row in iter_rows(fname):
        row = {k.lower(): v for k, v in row.items()}
        for role in nfilter([r.strip().lower() for r in row.get('role', '').split(',')]):
            c = {k: v for k, v in row.items() if k != 'role'}
            if role in {'author', 'creator', 'maintainer'}:
                creators.append(c)
            else:
                if strict:
                    c['type'] = ctypes[role]
                else:
                    c['type'] = ctypes.get(role, 'Other')
                contributors.append(c)
    return creators, contributors
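For instance, a hypothetical row with role 'Author, Editor' ends up once in creators and once in contributors:

# row = {'name': 'Jane Doe', 'role': 'Author, Editor'}
#   -> creators gets     {'name': 'Jane Doe'}                    (via the 'author' role)
#   -> contributors gets {'name': 'Jane Doe', 'type': 'Editor'}  (assuming 'Editor' is among
#      CONTRIBUTOR_TYPES; with strict=False an unknown role is mapped to 'Other')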
Example 29
    def from_txt(cls, txt, session=None, **kw):
        session = session or DBSession

        lines = nfilter(txt.split('\n'))
        m = LANGUAGE_LINE_PATTERN.match(lines[0])
        assert m
        kw['id'] = m.group('name')
        kw['name'] = ' '.join(s.capitalize() for s in kw['id'].split('_'))
        for cname in ['wals', 'ethnologue', 'glottolog']:
            if m.group(cname[0]):
                kw['classification_' + cname] = m.group(cname[0])

        kw.update(parse_metadata(lines[1]))
        doculect = cls(**kw)
        if doculect.classification_ethnologue:
            doculect.ethnologue_family = doculect.classification_ethnologue.split(
                ',')[0]

        if doculect.classification_glottolog:
            doculect.glottolog_family = doculect.classification_glottolog.split(
                ',')[0]

        doculect.wordlist = Contribution(id=kw['id'],
                                         language=doculect,
                                         name=doculect.id)

        parameters = {p.id: p for p in session.query(Parameter)}

        for line in lines[2:]:
            if '\t' in line:
                wid, words, comment = parse_word(line)
                # if int(wid) not in MEANINGS_ALL:
                #    # drop non-core meanings
                #    continue
                vsid = '%s-%s' % (doculect.id, wid)
                vs = Synset(id=vsid,
                            description=comment,
                            language=doculect,
                            contribution=doculect.wordlist,
                            parameter=parameters[wid])

                for i, word in enumerate(words):
                    id_ = '%s-%s' % (vsid, i + 1)
                    word, loan = word
                    word = Word(id=id_, name=word, valueset=vs, loan=loan)

        return doculect
Example 30
    def iterupdated(self, languoids):  # pragma: no cover
        res = reader(io.StringIO(
            requests.get(MD_URL).content.decode('utf-8-sig')),
                     dialect=Dialect(skipBlankRows=True, commentPrefix='<'),
                     dicts=True)
        md = {d['language_code']: d for d in res}
        lmap = collections.defaultdict(set)
        return
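        # NOTE: the bare return above short-circuits this method, so the scraping and
        # link-updating code below never runs.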
        for line in requests.get(URL).text.splitlines():
            if line.startswith('var curItem'):
                line = line.split('=', maxsplit=1)[1]
                d = json.loads(line)
                if d['AIATSIS_Code'] and d['Glottolog_ID']:
                    codes = [
                        c.strip().replace('*', '')
                        for c in d['AIATSIS_Code'].split(',')
                    ]
                    for code in codes:
                        if code:
                            if code not in md:
                                print(d['AIATSIS_Code'], list(md.keys())[:10])
                                continue
                            lmap[d['Glottolog_ID']].add(code)
        with pathlib.Path(__file__).parent.joinpath('aiatsis.json').open(
                encoding='utf8') as fp:
            for code, gc in json.load(fp).items():
                if code not in md:
                    print(code, list(md.keys())[:10])
                    continue
                lmap[gc].add(code)

        for lang in languoids:
            links, names = [], []
            for c in sorted(lmap.get(lang.id, [])):
                links.append((md[c]['uri'], md[c]['language_name']))
                if md[c]['language_name']:
                    names.append(md[c]['language_name'])
                names.extend(
                    nfilter([
                        n.strip() for n in md[c]['language_synonym'].split('|')
                    ]))
            if any([
                    lang.update_links(DOMAIN, links),
                    lang.update_names(names, type_='aiatsis'),
            ]):
                yield lang
Example 31
def details(path):
    soup = get_soup(path)
    if not soup.find('h2'):
        return
    res = dict(id=path.split('/')[-1], name=soup.find('h2').get_text())
    data = OrderedDict()
    for tr in soup.find_all('tr'):
        tds = list(tr.find_all('td'))
        if len(tds) == 3:
            data[tds[0].get_text().strip()] = tds[2].get_text().strip()

    names = data.get('ALSO KNOWN AS')
    if names:
        res['alternative_names'] = nfilter([n.strip() for n in names.split(',')])
    if data.get('CODE AUTHORITY') == 'ISO 639-3':
        res['iso-639-3'] = data.get('LANGUAGE CODE')
    return res
Example 32
def details(path):  # pragma: no cover
    soup = get_soup(path)
    if not soup.find('h2'):
        return
    res = dict(id=path.split('/')[-1], name=soup.find('h2').get_text())
    data = OrderedDict()
    for tr in soup.find_all('tr'):
        tds = list(tr.find_all('td'))
        if len(tds) == 3:
            data[tds[0].get_text().strip()] = tds[2].get_text().strip()

    names = data.get('ALSO KNOWN AS')
    if names:
        res['alternative_names'] = nfilter(
            [n.strip() for n in names.split(',')])
    if data.get('CODE AUTHORITY') == 'ISO 639-3':
        res['iso-639-3'] = data.get('LANGUAGE CODE')
    return res
Example 33
def merged_rows(rows, active):
    if all(r['Conflict'].lower().startswith('true') for r in rows):
        for row in rows:
            if row['Select'].lower().strip() == 'true':
                return row
        assert rows[0]['Feature_ID'] not in active, str(rows)
        return None
    elif all(r['Conflict'].lower().strip() == 'false' for r in rows):
        row = rows[0]
        for k in ['Sheet', 'Comment', 'Source']:
            row[k] = [row[k]]
        for r in rows[1:]:
            for k in ['Sheet', 'Comment', 'Source']:
                row[k].append(r[k])
        for k, sep in [('Sheet', ' '), ('Comment', '. '), ('Source', '; ')]:
            row[k] = sep.join(nfilter(row[k]))
        return row
    raise ValueError(rows)
Example 34
def split_text_with_context(text, separators=WHITESPACE, brackets=None):
    """Splits text at separators outside of brackets.

    :param text:
    :param separators: An iterable of single character tokens.
    :param brackets:
    :return: A `list` of non-empty chunks.

    .. note:: This function leaves content in brackets in the chunks.
    """
    res, chunk = [], []
    for c, type_ in _tokens(text, brackets=brackets):
        if type_ == TextType.text and c in separators:
            res.append(''.join(chunk).strip())
            chunk = []
        else:
            chunk.append(c)
    res.append(''.join(chunk).strip())
    return nfilter(res)
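A sketch of the difference from split_text, assuming the default bracket set includes '()':

# split_text_with_context('foo (a b) bar')  -> ['foo', '(a b)', 'bar']
# The space inside the brackets does not act as a separator, and the bracketed
# content is kept in the chunk.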
Example 35
def report(dataset, tr_analysis=None, glottolog=None, log=None):
    #
    # FIXME: in case of multiple cldf datasets:
    # - write only summary into README.md
    # - separate lexemes.md and transcriptions.md
    #
    lines = []

    # add NOTES.md
    if dataset.dir.joinpath('NOTES.md').exists():
        lines.append('## Notes\n')
        lines.append(dataset.dir.joinpath('NOTES.md').read_text() + '\n\n')

    badges = nfilter([build_status_badge(dataset)])

    for cldf_spec in dataset.cldf_specs_dict.values():
        lines.extend(
            cldf_report(cldf_spec, tr_analysis, badges, log, glottolog))
        break
    return '\n'.join(lines)
Example 36
 def get_personnel(self, args, contributors_path=None):
     if contributors_path is None:
         contributors_path = self.dir / "CONTRIBUTORS.md"
     personnel = {'author': [], 'data entry': [], 'consultant': []}
     try:
         for d in itertools.chain.from_iterable(
                 itertools.chain(
                     pylexibank.get_creators_and_contributors(
                         contributors_path))):
             if 'name' in d and d['name']:
                 for desc in nfilter([
                         r.strip().lower()
                         for r in d.get('description', '').split(',')
                 ]):
                     if desc in personnel and d['name'] not in personnel[
                             desc]:
                         personnel[desc].append(d['name'])
             else:
                 args.log.warn("No 'name' found in file 'CONTRIBUTORS.md'")
     except FileNotFoundError:  # pragma: no cover
         args.log.warn("File '{}' not found".format(contributors_path))
     return personnel
Example 37
    def cmd_makecldf(self, args):
        args.writer.add_sources()
        langs = {lang["Name"]: lang["Glottocode"] for lang in self.languages}

        concepticon = {
            c.number: c.concepticon_id
            for c in self.conceptlists[0].concepts.values()
        }
        varieties, meanings, allforms, rels = parse(self)

        for mn, cognatesets in sorted(allforms.items()):
            args.writer.add_concept(ID=mn,
                                    Name=meanings[mn],
                                    Concepticon_ID=concepticon[mn])
            for ccn, forms in sorted(cognatesets.items()):
                for ln, form in forms:
                    args.writer.add_language(
                        ID=ln,
                        Name=varieties[ln]["name"].strip(),
                        Glottocode=langs[varieties[ln]["name"].strip()],
                    )
                    ffs = [ff.strip().lower() for ff in form.split(",")]
                    for i, f in enumerate(nfilter(ffs)):
                        for row in args.writer.add_lexemes(
                                Language_ID=ln,
                                Parameter_ID=mn,
                                Value=f,
                                Source=["Dyen1992"],
                                Cognacy="%s-%s" % (mn, ccn),
                        ):
                            if len(ffs) == 1 and (2 <= int(ccn) <= 99
                                                  or 200 <= int(ccn) <= 399):
                                # most conservative cognacy judgements only
                                args.writer.add_cognate(lexeme=row,
                                                        Cognateset_ID="%s-%s" %
                                                        (mn, ccn),
                                                        Source="Dyen1992")
Example 38
 def getlist(self, section, option):
     return nfilter(self.get(section, option, fallback='').strip().split('\n'))
Example 39
    def load(self,
             submission,
             data,
             vocab,
             lang,
             comparison_meanings,
             labels):
        def id_(oid):
            return '%s-%s' % (submission.id, oid)

        print('######\n a CLDF dict! \n####')

        try:
            media = {d['ID']: d for d in self.cldf['media.csv']}
        except KeyError:
            media = {}

        metalanguages = submission.props.get('metalanguages', {})

        entries = self.cldf['EntryTable']
        colmap = {k: self.cldf['EntryTable', k].name
                  for k in ['id', 'headword', 'partOfSpeech', 'languageReference', 'source']
                  if self.cldf.get(('EntryTable', k))}
        fks = get_foreign_keys(self.cldf, entries)
        elabels = get_labels('entry', entries, colmap, submission, exclude=fks['EntryTable'][:])

        for lemma in entries:
            if not lemma[colmap['headword']]:
                continue
            oid = lemma.pop(colmap['id'])
            word = data.add(
                models.Word,
                oid,
                id=id_(oid),
                name=lemma.pop(colmap['headword']),
                pos=lemma.pop(colmap['partOfSpeech']),
                dictionary=vocab,
                language=lang)
            DBSession.flush()

            files = [(md5, media[md5]) for md5 in set(lemma.get('Media_IDs', [])) if md5 in media]
            for md5, spec in sorted(
                files,
                key=lambda i: i[1].get(submission.props.get('media_order', 'Description')) or i[1]['ID']
            ):
                submission.add_file(None, md5, common.Unit_files, word, spec)

            self.add_refs(data, 'EntryTable', lemma, word)

            for index, (key, label) in enumerate(elabels.items()):
                label, with_links = label
                if lemma.get(key):
                    DBSession.add(common.Unit_data(
                        object_pk=word.pk,
                        key=label,
                        value=lemma[key],
                        ord=index,
                        jsondata=dict(with_links=with_links)))

        DBSession.flush()

        #
        # Now that all entries are in the DB and have primary keys, we can create the
        # self-referential links:
        #
        fullentries = defaultdict(list)
        for lemma in entries:
            fullentries[lemma[colmap['id']]].extend(list(lemma.items()))
            word = data['Word'][lemma[colmap['id']]]
            for col in fks['EntryTable']:
                col = self.cldf['EntryTable', col]
                label = col.titles.getfirst() if col.titles else col.name
                if label == 'Entry_IDs':
                    label = 'See also'
                label = label.replace('_', ' ')
                for lid in lemma[col.name] or []:
                    if lid not in data['Word']:
                        print('missing entry ID: {0}'.format(lid))
                    else:
                        DBSession.add(models.SeeAlso(
                            source_pk=word.pk, target_pk=data['Word'][lid].pk, description=label))

        sense2word = {}
        colmap = {k: self.cldf['SenseTable', k].name
                  for k in ['id', 'entryReference', 'description', 'source']
                  if self.cldf.get(('SenseTable', k))}
        fks = get_foreign_keys(self.cldf, self.cldf['SenseTable'])

        slabels = get_labels(
            'sense',
            self.cldf['SenseTable'],
            colmap,
            submission,
            exclude=['alt_translation1', 'alt_translation2'] + fks['EntryTable'][:])

        for sense in self.cldf['SenseTable']:
            fullentries[sense[colmap['entryReference']]].extend(list(sense.items()))
            sense2word[sense[colmap['id']]] = sense[colmap['entryReference']]
            try:
                w = data['Word'][sense[colmap['entryReference']]]
            except KeyError:
                print('missing entry: {0}'.format(sense[colmap['entryReference']]))
                continue
            dsc = sense[colmap['description']]
            if not isinstance(dsc, list):
                dsc = [dsc]
            kw = dict(
                id=id_(sense[colmap['id']]),
                name='; '.join(nfilter(dsc)),
                semantic_domain=sense.pop('Semantic_Domain', None),
                word=w)
            if 'alt_translation1' in sense and metalanguages.get('gxx'):
                kw['alt_translation1'] = sense['alt_translation1']
                kw['alt_translation_language1'] = metalanguages.get('gxx')
            if 'alt_translation2' in sense and metalanguages.get('gxy'):
                kw['alt_translation2'] = sense['alt_translation2']
                kw['alt_translation_language2'] = metalanguages.get('gxy')
            m = data.add(models.Meaning, sense[colmap['id']], **kw)
            DBSession.flush()

            self.add_refs(data, 'SenseTable', sense, m)

            for index, (key, label) in enumerate(slabels.items()):
                label, with_links = label
                if sense.get(key):
                    DBSession.add(models.Meaning_data(
                        object_pk=m.pk,
                        key=label,
                        value=sense[key],
                        ord=index,
                        jsondata=dict(with_links=with_links)))

            for i, md in enumerate(nfilter(sense[colmap['description']]), start=1):
                key = md.lower()
                if key in comparison_meanings:
                    concept = comparison_meanings[key]
                else:
                    continue

                vsid = '%s-%s' % (lang.id, concept)
                vs = data['ValueSet'].get(vsid)
                if not vs:
                    vs = data.add(
                        common.ValueSet, vsid,
                        id=vsid,
                        language=lang,
                        contribution=vocab,
                        parameter_pk=concept)

                DBSession.add(models.Counterpart(
                    id='{0}-{1}'.format(m.id, i), name=w.name, valueset=vs, word=w))

            DBSession.flush()
            files = [(md5, media[md5]) for md5 in set(sense.get('Media_IDs', [])) if md5 in media]
            for md5, spec in sorted(
                files,
                key=lambda i: i[1].get(submission.props.get('media_order', 'Description')) or i[1]['ID']
            ):
                submission.add_file(None, md5, models.Meaning_files, m, spec)

            for col in fks['EntryTable']:
                col = self.cldf['SenseTable', col]
                if col.name == colmap['entryReference']:
                    continue
                label = col.titles.getfirst() if col.titles else col.name
                label = label.replace('_', ' ')
                entry_ids = sense[col.name]
                if entry_ids:
                    if not isinstance(entry_ids, list):
                        entry_ids = [entry_ids]
                    for eid in entry_ids:
                        if eid not in data['Word']:
                            print('missing entry ID: {0}'.format(eid))
                        else:
                            DBSession.add(models.Nym(
                                source_pk=m.pk, target_pk=data['Word'][eid].pk, description=label))

        colmap = {k: self.cldf['ExampleTable', k].name
                  for k in ['id', 'primaryText', 'translatedText']}
        for ex in self.cldf['ExampleTable']:
            #
            # FIXME: Detect the column with sense IDs by looking at the foreign keys!
            #
            mids = ex.get('Senses') or ex.get('Sense_IDs', [])
            if not isinstance(mids, list):
                mids = mids.split(' ; ')
            for mid in mids:
                if mid not in data['Meaning']:
                    continue
                if mid in sense2word:
                    fullentries[sense2word[mid]].extend(list(ex.items()))
                    models.MeaningSentence(
                        meaning=data['Meaning'][mid],
                        sentence=data['Example'][ex[colmap['id']]])
                else:
                    print('missing sense: {0}'.format(mid))

        for wid, d in fullentries.items():
            if wid in data['Word']:
                data['Word'][wid].fts = tsvector(
                    '; '.join('{0}: {1}'.format(k, v) for k, v in d if v))
Example 40
 def nattr(p, attr):
     return len(nfilter([getattr(i, attr, None) for i in read_all(p)]))
Example 41
def tests_path(*comps):
    return Path(__file__).parent.joinpath(*nfilter(comps))
Example 42
def test_nfilter():
    from clldutils.misc import nfilter

    assert nfilter(range(5)) == list(range(1, 5))
Example 43
 def feature_properties(self, ctx, req, valueset):
     return {
         "values": list(valueset.values),
         "label": ", ".join(nfilter(v.name for v in valueset.values)) or self.get_language(ctx, req, valueset).name,
     }
Example 44
def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    data = Data()
    concept_list = Concepticon(CONCEPTICON_REPOS).conceptlist('Key-2016-1310')

    def concepticon_id(ids_code):
        for item in concept_list:
            if item['IDS_ID'] == ids_code:
                return int(item['CONCEPTICON_ID']) if item['CONCEPTICON_ID'] else None

    def read(table):
        fname = args.data_file(table + '.all.csv')
        if not fname.exists():
            fname = args.data_file(table + '.csv')
        return list(dsv.reader(fname, namedtuples=True))

    dataset = common.Dataset(
        id=ids.__name__,
        name="IDS",
        description="The Intercontinental Dictionary Series",
        published=date(2015, 5, 25),
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name':
                'Creative Commons Attribution 4.0 International License',
        },
        domain='ids.clld.org')

    DBSession.add(dataset)

    for rec in Database.from_file(args.data_file('sources.bib'), lowercase=True):
        if rec.id not in data['Source']:
            data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    DBSession.flush()

    data_desc = defaultdict(dict)
    for l in read('x_lg_data'):
        data_desc[l.lg_id][l.map_ids_data] = l.header

    # language lang
    iso_codes = {l.id: l.sil_code for l in read('sil_lang')}
    iso_codes = {l.lg_id: iso_codes[l.sil_id] for l in read('x_lg_sil')}
    languages = []

    exclude = []
    for l in read('lang'):
        if l.status == '1':
            exclude.append(l.lg_id)
            continue
        lang_changed = LANGS.get(int(l.lg_id), {})
        code = lang_changed.get('glotto') or lang_changed.get('iso') or iso_codes.get(l.lg_id)
        lang = data.add(models.IdsLanguage, l.lg_id, id=l.lg_id, name=lang_changed.get('name', l.lg_name))
        if code:
            languages.append((code, lang))
        data.add(
            models.Dictionary, l.lg_id,
            id=l.lg_id, name=l.lg_name,
            language=lang,
            default_representation=data_desc[l.lg_id].get('1'),
            alt_representation=data_desc[l.lg_id].get('2'),
            jsondata=dict(status=l.status, date=l.date))

    iso2glotto = {}
    for l in walk_tree(tree=languoids_path('tree', GLOTTOLOG_REPOS)):
        if l.iso:
            iso2glotto[l.iso] = l.id

    load_families(
        Data(), [(iso2glotto.get(c, c), l) for c, l in languages], glottolog=Glottolog(GLOTTOLOG_REPOS), isolates_icon='tcccccc')

    contributors = defaultdict(list)
    sources = defaultdict(list)
    for l in read('lang_compilers'):
        if l.lg_id in exclude:
            continue
        if l.name == "BIBIKO":
            continue
        #name	lg_id	what_did_id
        if int(l.what_did_id) in models.ROLES:
            contributors[slug(l.name)].append((l.name, int(l.what_did_id), l.lg_id))
        else:
            assert int(l.what_did_id) in [4, 395]
            sources[l.name].append(l.lg_id)

    for s, roles in contributors.items():
        name = roles[0][0]
        c = data.add(common.Contributor, s, id=s, name=name)
        if name == 'Mary Ritchie Key':
            c.address = 'University of California, Irvine'
        for lg, specs in groupby(sorted(roles, key=lambda r: r[2]), key=lambda r: r[2]):
            sroles = sorted(
                [s[1] for s in specs],
                reverse=True,
                key=lambda what: what + 2 if what == 2 else what)
            what = sroles[0]
            DBSession.add(common.ContributionContributor(
                contribution=data['Dictionary'][lg],
                contributor=c,
                ord=what,
                primary=what == 2))

    data.add(
        common.Contributor, 'bernardcomrie',
        id='bernardcomrie',
        name="Bernard Comrie",
        address="University of California, Santa Barbara")

    for i, editor in enumerate(['maryritchiekey', 'bernardcomrie']):
        common.Editor(dataset=dataset, contributor=data['Contributor'][editor], ord=i + 1)

    #for i, name in enumerate(sorted(sources.keys())):
    #    c = data.add(common.Source, name, id=str(i + 1), name=name, description=name)

    DBSession.flush()
    for name, lgs in sources.items():
        for _src in name.split(';'):
            src = data['Source'].get(_src.strip())
            if not src:
                print('-- missing source --', _src)
                raise ValueError
            for lg in lgs:
                if lg in exclude:
                    continue
                assert lg in data['Dictionary']
                DBSession.add(common.ContributionReference(
                    contribution_pk=data['Dictionary'][lg].pk, source_pk=src.pk))

    altnames = {}
    for i, l in enumerate(read('alt_names')):
        if l.name in altnames:
            identifier = altnames[l.name]
        else:
            identifier = data.add(
                common.Identifier, l.name,
                id='name-%s' % i, type='name', name=l.name, description='IDS')
            altnames[l.name] = identifier
        if l.lg_id not in exclude and l.name != data['IdsLanguage'][l.lg_id].name:
            DBSession.add(common.LanguageIdentifier(
                identifier=identifier,
                language=data['IdsLanguage'][l.lg_id]))

    # parameter chapter/entry
    for l in read('chapter'):
        data.add(models.Chapter, l.chap_id, id=l.chap_id, name=l.chap_title)

    entries = {}
    for l in read('entry'):
        id_ = '%s-%s' % (l.chap_id, l.entry_id)
        name = l.trans_english
        if name in entries:
            entries[name] += 1
            name = name + ' (%s)' % entries[name]
        else:
            entries[name] = 1
        kw = {
            'id': id_,
            'name': name,
            'concepticon_id': concepticon_id(id_),
            'chapter': data['Chapter'][l.chap_id]}
        for ll in 'french russian spanish portugese'.split():
            kw[ll] = getattr(l, 'trans_' + ll)
        data.add(models.Entry, id_, sub_code=l.entry_id, **kw)

    misaligned = []

    DBSession.flush()
    for entity in 'IdsLanguage Entry Chapter Dictionary'.split():
        for k in data[entity].keys()[:]:
            data[entity][k] = data[entity][k].pk

    synsets = set()
    counterparts = set()
    problems = defaultdict(list)

    for lg_id, entries in groupby(
            sorted(read('ids'), key=lambda t: t.lg_id), lambda k: k.lg_id):
        if lg_id in exclude or not lg_id:
            continue

        # keep the memory footprint reasonable
        transaction.commit()
        transaction.begin()

        language = common.Language.get(data['IdsLanguage'][lg_id])
        desc = data_desc.get(lg_id, {})
        words = defaultdict(list)
        for l in entries:
            if empty.match(l.data_1):
                continue

            entry_id = '%s-%s' % (l.chap_id, l.entry_id)
            if entry_id not in data['Entry']:
                continue
                #data.add(
                #    models.Entry, entry_id,
                #    id=entry_id,
                #    name=entry_id,
                #    concepticon_id=concepticon_id(entry_id),
                #    sub_code=l.entry_id,
                #    chapter_pk=data['Chapter'][l.chap_id])
                #DBSession.flush()
                #data['Entry'][entry_id] = data['Entry'][entry_id].pk

            id_ = '%s-%s' % (entry_id, l.lg_id)
            if id_ in synsets:
                vs = models.Synset.get(id_)
            else:
                vs = models.Synset(
                    id=id_,
                    comment=get_string(l.comment or ''),
                    alt_representation=get_string(l.data_2),
                    language=language,
                    contribution_pk=data['Dictionary'][l.lg_id],
                    parameter_pk=data['Entry'][entry_id])
                synsets.add(id_)

            trans1 = list(split_counterparts(l.data_1))
            trans2 = None if empty.match(l.data_2) else list(split_counterparts(l.data_2))

            if trans2:
                if len(trans2) != len(trans1):
                    if language.id != '238':
                        misaligned.append((l.chap_id, l.entry_id, l.lg_id))
                        #print('===', language.id, language.name)
                        #print(l.data_1)
                        #print(l.data_2)
                    # 83 cases of misaligned transcriptions
                    trans2 = None

            for i, word in enumerate(trans1):
                cid = id_ + '-' + str(i + 1 + len(vs.values))
                if cid not in counterparts:
                    v = models.Counterpart(
                        id=cid,
                        name=word,
                        description=desc.get('1'),
                        valueset=vs)
                    words[word].append((v, trans2[i] if trans2 else None))
                    counterparts.add(cid)
                else:
                    print(cid)
                    #12 - 420 - 811 - 3
                    #5 - 390 - 818 - 3
                    #2 - 930 - 819 - 3
                    #2 - 930 - 819 - 3
                    #3 - 120 - 819 - 3
                    #10 - 140 - 822 - 3
                    #9 - 160 - 825 - 3
                    #2 - 430 - 829 - 4

        for i, form in enumerate(words.keys()):
            # Since we identify words based on their string representation, we have to
            # make sure a word has the same alternative transcription for all meanings.
            if language.id == '238':
                alt_names = []
            else:
                alt_names = set(norm(w[1] or '', desc.get('2'), language.id)
                                for w in words[form])
            alt_names = nfilter(alt_names)
            try:
                assert len(alt_names) <= 1
            except AssertionError:
                problems[(language.id, language.name)].append(alt_names)
            word = models.Word(
                id='%s-%s' % (language.id, i + 1),
                name=form,
                description=desc.get('1'),
                language=language,
                alt_name=', '.join(alt_names) if alt_names else None,
                alt_description=desc.get('2')
            )
            for v, _ in words[form]:
                word.counterparts.append(v)
            DBSession.add(word)

        DBSession.flush()

    with dsv.UnicodeWriter(args.data_file('misaligned.csv')) as fp:
        fp.writerows(misaligned)

    # about 250 cases where alternative transcriptions do not covary across meanings.
    for k, v in problems.items():
        print(k, len(v))
Example 45
def main(args):
    #
    # order of init:
    # - villages
    # - files
    # - movies
    #
    videos = defaultdict(list)
    for f in util.iter_files(args):
        obj = models.File(**attr.asdict(f))
        if obj.mime_type.startswith('video'):
            videos[slug(obj.name.split('.')[0])].append(obj)
        DBSession.add(obj)

    lexicon = list(util.iter_lexicon(args))
    villages = util.get_villages(args)
    ff_images = list(util.ff_images(args))
    bib = list(util.get_bib(args))
    data = Data()

    dataset = common.Dataset(
        id=dogonlanguages.__name__,
        name="Dogon and Bangime Linguistics",
        contact="*****@*****.**",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='dogonlanguages.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'}
    )
    DBSession.add(dataset)

    if Glottolog:
        if socket.gethostname() == 'dlt5502178l':
            glottolog = Glottolog(
                Path(dogonlanguages.__file__).parent.parent.parent.parent.joinpath(
                    'glottolog3', 'glottolog'))
        else:
            glottolog = Glottolog(
                Path(dogonlanguages.__file__).parent.parent.parent.parent.joinpath(
                    'glottolog'))
        languoids = {l.id: l for l in glottolog.languoids()}
    else:
        languoids = {}
    print('got glottolog')

    for c in util.CONTRIBUTORS:
        id_ = slug(c.name.split()[-1])
        data.add(models.Member, id_, id=id_, **attr.asdict(c))
    data.add(
        models.Member, 'forkel',
        id='forkel',
        name='Robert Forkel',
        email='*****@*****.**',
        in_project=False)

    for i, id_ in enumerate(['moran', 'forkel', 'heath']):
        DBSession.add(common.Editor(
            dataset=dataset, ord=i + 1, contributor=data['Member'][id_]))

    contrib = data.add(common.Contribution, 'd', id='d', name='Dogon Languages')
    for doc in bib:
        obj = data.add(
            models.Document,
            doc.rec.id,
            _obj=bibtex2source(doc.rec, cls=models.Document))
        keywords = nfilter([s.strip() for s in doc.rec.get('keywords', '').split(',')])
        for dt in 'grammar lexicon typology texts'.split():
            if dt in keywords:
                obj.doctype = dt
                break
        obj.project_doc = ('DLP' in keywords) or bool(doc.files)
        if obj.project_doc:
            for i, cid in enumerate(util.get_contributors(doc.rec, data)):
                models.DocumentContributor(
                    document=obj, contributor=data['Member'][cid], ord=i)
        for i, (path, cdstar) in enumerate(doc.files):
            common.Source_files(
                id='%s-%s' % (obj.id, i + 1),
                name=path,
                object=obj,
                mime_type=guess_type(path)[0],
                jsondata=cdstar,
            )

    print('got bib')

    for name, (gc, desc) in LANGUAGES.items():
        gl_lang = languoids[gc]
        lat, lon = gl_lang.latitude, gl_lang.longitude
        lang = data.add(
            models.Languoid, gc,
            id=gc,
            name=name,
            description=desc,
            latitude=lat,
            longitude=lon,
            family=gl_lang.family.name if gl_lang and gl_lang.family else name,
        )
        if name == 'Penange' and lang.longitude > 0:
            lang.longitude = -lang.longitude
        if name == 'Bankan Tey':
            lang.latitude, lang.longitude = 15.07, -2.91
        if name == 'Ben Tey':
            lang.latitude, lang.longitude = 14.85, -2.95
        if name == 'Togo Kan':
            lang.latitude, lang.longitude = 14.00, -3.25
        add_language_codes(data, lang, gl_lang.iso, glottocode=gc)

    villages_by_name = defaultdict(list)
    contrib_by_initial = {c.abbr: c for c in data['Member'].values()}
    for i, village in enumerate(villages):
        lang = None
        if village.glottocode:
            lang = data['Languoid'].get(village.glottocode)
            if not lang:
                gl_lang = languoids[village.glottocode]
                lang = data.add(
                    models.Languoid, gl_lang.id,
                    id=gl_lang.id,
                    name=gl_lang.name,
                    in_project=False,
                    family=gl_lang.family.name if gl_lang.family else gl_lang.name)
        v = data.add(
            models.Village, str(i + 1),
            id=str(i + 1),
            name=village.name,
            description=village.data.pop('social info'),
            surnames=village.data.pop('surnames'),
            major_city=village.data['MajorCity'] == 'Y',
            transcribed_name=village.data.pop('Transcribed Village Name'),
            source_of_coordinates=village.data.pop('sourceOfCoordinates'),
            latitude=village.lat,
            longitude=village.lon,
            languoid=lang,
            jsondata=village.data,
        )
        villages_by_name[village.name].append(v)
        for img in village.images:
            mimetype = guess_type(img.name)[0]
            if mimetype:
                f = models.Village_files(
                    id=img.id,
                    name=img.name,
                    description=img.description,
                    date_created=img.date,
                    latitude=img.coords[0] if img.coords else None,
                    longitude=-img.coords[1] if img.coords else None,
                    object=v,
                    mime_type=mimetype,
                    jsondata=img.cdstar,
                )
                for initial in img.creators:
                    if initial in contrib_by_initial:
                        models.Fotographer(
                            foto=f, contributor=contrib_by_initial[initial])

    for cat, desc, place, name in MOVIES:
        s = slug(name)
        m = models.Movie(
            id=s,
            name=desc,
            description=cat,
            place=place,
        )
        if place in villages_by_name and len(villages_by_name[place]) == 1:
            m.village = villages_by_name[place][0]
            #print('found village: %s' % name)
        for v in videos[s]:
            #print('found video: %s' % name)
            v.movie = m
            m.duration = v.duration

    names = defaultdict(int)
    for concept in lexicon:
        add(concept, data, names, contrib)

    count = set()
    for img in ff_images:
        if img.id in count:
            continue
        count.add(img.id)
        if img.ref:
            if img.ref in data['Concept']:
                concept = data['Concept'][img.ref]
                if img.tsammalex_taxon and not concept.tsammalex_taxon:
                    concept.tsammalex_taxon = img.tsammalex_taxon
                    #print(concept.tsammalex_taxon)
                common.Parameter_files(
                    object=concept,
                    id=img.id,
                    name=img.name.decode('utf8'),
                    mime_type=guess_type(img.name)[0],
                    jsondata=img.cdstar)
            else:
                print('missing ref: %s' % img.ref)
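
In the loader above, video files and MOVIES entries are matched purely by slugified names: each file is keyed by the slug of its basename, and each movie looks its videos up under the slug of its own name. A minimal sketch of that matching (slug from clldutils.misc; the file and movie names are invented):

from collections import defaultdict

from clldutils.misc import slug

video_files = ['Bankan_Tey greeting.mp4', 'Pottery firing.webm']
movie_names = ['Bankan Tey Greeting', 'Pottery Firing']

videos = defaultdict(list)
for fname in video_files:
    # Key each video by the slug of its basename, e.g. 'bankanteygreeting'.
    videos[slug(fname.split('.')[0])].append(fname)

for name in movie_names:
    # Applying the same normalization to the movie name makes the two line up.
    print(name, '->', videos[slug(name)])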
Esempio n. 46
0
    def load_examples(self, dictionary, data, lang):
        abbr_p = re.compile(r'\$(?P<abbr>[a-z1-3][a-z]*(\.[a-z]+)?)')
        if hasattr(self.dictionary, 'cldf'):
            #ID,Language_ID,Primary_Text,Analyzed_Word,Gloss,Translated_Text,Meta_Language_ID,Comment,Sense_IDs,Analyzed,Media_IDs
            #XV000001,tzh,lek a lok',,,salió bien,,,SN000001,,
            colmap = {}
            for k in [
                'id',
                'primaryText',
                'analyzedWord',
                'gloss',
                'translatedText',
                'languageReference',
                'metaLanguageReference',
                'comment',
            ]:
                try:
                    colmap[k] = self.dictionary.cldf['ExampleTable', k].name
                except KeyError:
                    pass

            for i, ex in enumerate(self.dictionary.cldf['ExampleTable']):
                obj = data.add(
                    models.Example,
                    ex[colmap['id']],
                    id='%s-%s' % (self.id, ex.pop(colmap['id']).replace('.', '_')),
                    name=ex.pop(colmap['primaryText']),
                    number='{0}'.format(i + 1),
                    source=ex.pop('Corpus_Reference', None),
                    comment=ex.pop(colmap['comment'], None) if 'comment' in colmap else None,
                    original_script=ex.pop('original_script', None),
                    language=lang,
                    serialized='{0}'.format(ex),
                    dictionary=dictionary,
                    analyzed='\t'.join(
                        nfilter(ex.pop(colmap['analyzedWord'], []) or []))
                    if 'analyzedWord' in colmap else None,
                    gloss='\t'.join(
                        [abbr_p.sub(lambda m: m.group('abbr').upper(), g or '') for g in ex[colmap['gloss']]])
                    if 'gloss' in colmap and ex[colmap['gloss']] \
                    else ((ex[colmap['gloss']] or None) if 'gloss' in colmap else None),
                    description=ex.pop(colmap['translatedText'], None),
                    alt_translation1=ex.pop('alt_translation1', None),
                    alt_translation_language1=self.props.get('metalanguages', {}).get('gxx'),
                    alt_translation2=ex.pop('alt_translation2', None),
                    alt_translation_language2=self.props.get('metalanguages', {}).get('gxy'),
                )
                for col in ['languageReference', 'metaLanguageReference', 'gloss']:
                    if col in colmap:
                        del ex[colmap[col]]
                DBSession.flush()
                for md5 in sorted(set(ex.pop('Media_IDs', []))):
                    self.add_file(None, md5, common.Sentence_files, obj)

                for k, v in ex.items():
                    if v and (k not in ['Sense_IDs']):
                        DBSession.add(common.Sentence_data(
                            object_pk=obj.pk,
                            key=k,
                            value=ex[k],
                        ))
        elif self.dir.joinpath('processed', 'examples.sfm').exists():
            for i, ex in enumerate(
                    Examples.from_file(self.dir.joinpath('processed', 'examples.sfm'))):
                obj = data.add(
                    models.Example,
                    ex.id,
                    id='%s-%s' % (self.id, ex.id.replace('.', '_')),
                    name=ex.text,
                    number='{0}'.format(i + 1),
                    source=ex.corpus_ref,
                    language=lang,
                    serialized='{0}'.format(ex),
                    dictionary=dictionary,
                    analyzed=ex.morphemes,
                    gloss=abbr_p.sub(lambda m: m.group('abbr').upper(), ex.gloss) if ex.gloss else ex.gloss,
                    description=ex.translation,
                    alt_translation1=ex.alt_translation,
                    alt_translation_language1=self.props.get('metalanguages', {}).get('gxx'),
                    alt_translation2=ex.alt_translation2,
                    alt_translation_language2=self.props.get('metalanguages', {}).get('gxy'))
                DBSession.flush()

                if ex.soundfile:
                    self.add_file('audio', ex.soundfile, common.Sentence_files, obj)
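
The gloss post-processing above upper-cases abbreviation tokens written as $abbr (e.g. $pl, $3sg). A self-contained sketch with a made-up gloss string:

import re

abbr_p = re.compile(r'\$(?P<abbr>[a-z1-3][a-z]*(\.[a-z]+)?)')

gloss = 'dog-$pl $3sg see-$pst'
# Each $abbr match is replaced by its upper-cased abbreviation.
print(abbr_p.sub(lambda m: m.group('abbr').upper(), gloss))
# dog-PL 3SG see-PST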
Esempio n. 47
0
    def col_defs(self):
        kw = {}
        if self.language:
            kw['bSearchable'] = False
            kw['bSortable'] = False
        name_col = ApicsValueNameCol(self, 'value', **kw)
        if self.parameter and self.parameter.domain:
            name_col.choices = [de.name for de in self.parameter.domain]

        class ValueLanguageCol(LinkCol):
            def search(self, qs):
                if self.dt.language:
                    return ValueSet.language_pk == int(qs)
                if self.dt.parameter:
                    return icontains(self.dt.vs_lang.name, qs)

            def order(self):
                if self.dt.parameter:
                    return cast(self.dt.vs_lang.id, Integer)
                if self.dt.language:
                    return ValueSet.language_pk

        lang_col = ValueLanguageCol(
            self,
            'language',
            model_col=Language.name,
            get_obj=lambda item: item.valueset.language,
            bSearchable=bool(self.parameter or self.language),
            bSortable=bool(self.parameter or self.language))
        if self.language:
            if self.language.lects:
                lang_col.choices = [
                    (l.pk, l.name) for l in [self.language] + self.language.lects]
                lang_col.js_args['sTitle'] = 'lect'
            else:
                lang_col = None

        get_param = lambda i: i.valueset.parameter
        if self.parameter:
            return nfilter([
                lang_col,
                name_col,
                Col(self,
                    'lexifier',
                    format=lambda i: i.valueset.language.lexifier,
                    model_col=self.vs_lect.lexifier,
                    choices=get_distinct_values(
                        Lect.lexifier,
                        key=lambda v: 'z' + v if v == 'Other' else v)),
                LinkToMapCol(
                    self, 'm', get_object=lambda i: None
                    if i.valueset.language.language_pk else i.valueset.language),
                DetailsRowLinkCol(self, 'more')
                if self.parameter.feature_type != 'sociolinguistic' else None,
                RefsCol(self, 'source')
                if self.parameter.feature_type != 'segment' else None,
            ])
        if self.language:
            return nfilter([
                IntegerIdCol(self, 'id', get_obj=get_param, model_col=Parameter.id),
                LinkCol(self, 'parameter', get_obj=get_param, model_col=Parameter.name),
                name_col,
                lang_col,
                DetailsRowLinkCol(self, 'more'),
                RefsCol(self, 'source'),
            ])
        return [
            LinkCol(self, 'parameter', get_obj=get_param, model_col=Parameter.name),
            name_col,
            lang_col,
            DetailsRowLinkCol(self, 'more'),
            RefsCol(self, 'source'),
        ]
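
The choices for the lexifier column above are built with a sort key that pushes 'Other' behind the real lexifier names; a plain-Python illustration of that key (assuming get_distinct_values sorts the distinct values with the given key; the values here are made up):

lexifiers = ['English', 'Other', 'Portuguese', 'French']
# Prefixing 'Other' with 'z' makes it sort after the alphabetical names.
print(sorted(lexifiers, key=lambda v: 'z' + v if v == 'Other' else v))
# ['English', 'French', 'Portuguese', 'Other']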
Esempio n. 48
0
File: base.py Project: clld/clld
 def format(self, item):
     vs = self.get_obj(item)
     return ', '.join(
         nfilter([getattr(vs, 'source', None), linked_references(self.dt.req, vs)]))
Esempio n. 49
0
 def joined(iterable):
     return ' / '.join(sorted(nfilter(set(iterable))))
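
A usage sketch of the joined helper above, restated self-contained (nfilter from clldutils.misc; the language names are made up): duplicates and empty values are dropped, the rest sorted and joined.

from clldutils.misc import nfilter

def joined(iterable):
    return ' / '.join(sorted(nfilter(set(iterable))))

print(joined(['Toro So', None, 'Jamsay', '', 'Jamsay']))  # Jamsay / Toro So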