def get_datasets(spec, ep=ENTRY_POINT, glob: bool = False) -> typing.List[Dataset]:
    """
    :param spec: Either `'*'` to get all datasets for a specific entry point, or a glob pattern \
    matching dataset modules in the current directory (if `glob == True`), or a `str` as accepted \
    by :func:`get_dataset`.
    """
    if spec == '*':
        return list(iter_datasets(ep))
    if glob:
        return nfilter(dataset_from_module(p) for p in pathlib.Path('.').glob(spec))
    return nfilter([get_dataset(spec, ep=ep)])
def col_defs(self):
    get_param = lambda v: v.valueset.parameter
    get_lang = lambda v: v.valueset.language
    core = nfilter([
        CognateCol(self, 'name'),
        PhoneticCol(self, 'phonetic')
        if not (self.language and self.language.proto) else None,
        Col(self, 'description', sTitle='Meaning'),
        Col(self, 'comment',
            model_col=Counterpart.comment,
            format=lambda i: markup_italic(i.comment)),
    ])
    if self.language:
        if self.language.id == 'psi':
            return [
                LinkCol(self, 'lemma', get_object=get_param, model_col=Parameter.name)] + \
                core + [
                RootExtCol(
                    self,
                    'reconstruction_with_root_extension_code',
                    model_col=Entry.psi_reconstruction_with_root_extension_code),
                RefsCol(self, 'sources')]
        return [
            LinkCol(self, 'lemma', get_object=get_param, model_col=Parameter.name)] + \
            core + [RefsCol(self, 'sources')]
    if self.parameter:
        return [
            LanguageCol(self, 'language', model_col=Language.name, get_object=get_lang)] + \
            core + [RefsCol(self, 'sources')]
    return [
        LinkCol(self, 'lemma', get_object=get_param, model_col=Parameter.name),
        LanguageCol(self, 'language', model_col=Language.name, get_object=get_lang)] + \
        core
def refs(api, glottolog, sheet):
    glottolog = Glottolog(glottolog)
    languoid, lang = glottolog.api.languoid(sheet.glottocode), None
    # Determine the associated language-level languoid:
    if languoid.level.name == 'dialect':  # pragma: no cover
        for _, gc, _ in reversed(languoid.lineage):
            lang = glottolog.api.languoid(gc)
            if lang.level.name == 'language':
                break
    else:
        lang = languoid
    ids = set(nfilter([languoid.id, languoid.hid, languoid.iso, lang.id, lang.hid, lang.iso]))
    bibs = Bibs(glottolog, api)
    lgks = collections.defaultdict(set)
    for key, code in bibs.iter_codes():
        if code in ids:
            lgks[languoid.id].add(key)

    def source(key):
        type_, fields = bibs[key]
        return key, type_, fields.get('author', fields.get('editor', '-')), fields.get('year', '-')

    unresolved = collections.Counter()
    res = bibdata(sheet, list(sheet.iter_row_objects(api)), bibs, lgks, unresolved)
    return list(res), unresolved, [source(k) for k in lgks[languoid.id]]
def add(concept, data, names, contrib):
    domain = data['Domain'].get(concept.code)
    if domain is None:
        domain = data.add(
            models.Domain, concept.code,
            id=concept.code,
            name=concept.code_eng,
            description=concept.code_fr)
    scid = '-'.join([concept.code, concept.subcode.replace('.', '_')])
    subdomain = data['Subdomain'].get(scid)
    if subdomain is None:
        subdomain = data.add(
            models.Subdomain, scid,
            id=scid,
            name=concept.subcode_eng,
            description=concept.sous_code_fr,
            domain=domain)
    cid = '%05d' % int(concept.ref)
    if concept.English in names:
        name = '%s (%s)' % (concept.English, names[concept.English] + 1)
    else:
        name = concept.English
    names[concept.English] += 1
    c = data['Concept'].get(cid)
    if c is None:
        c = data.add(
            models.Concept, cid,
            core=concept.core == '1',
            id=cid,
            name=name,
            description=concept.Francais,
            subdomain=subdomain,
            jsondata=dict(ref='%s-%s-%s' % (concept.code, concept.subcode, concept.subsubcode)))
        if concept.species:
            c.species = concept.species
        if concept.family:
            c.family = concept.family
    else:
        assert cid == '50325'
    for gc, forms in concept.forms.items():
        lang = data['Languoid'].get(gc)
        assert lang
        if not forms:
            continue
        vs = common.ValueSet(
            id='-'.join([cid, gc]),
            language=lang,
            contribution=contrib,
            parameter=c)
        for j, form in enumerate(nfilter(util.split_words(forms))):
            attrs = util.parse_form(form)
            if attrs['name'] and attrs['name'] != 'xxx':
                models.Counterpart(id='-'.join([vs.id, str(j + 1)]), valueset=vs, **attrs)
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.

    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    for vs in DBSession.query(common.ValueSet).options(joinedload(common.ValueSet.values)):
        d = []
        for generic_term, words in groupby(
                sorted(vs.values, key=lambda v: v.description),
                key=lambda v: v.description):
            if generic_term:
                generic_term += ': '
            else:
                generic_term = ''
            d.append(generic_term + ', '.join(nfilter([w.name for w in words])))
        vs.description = '; '.join(d)

    for model in [models.Country, models.Ecoregion]:
        for instance in DBSession.query(model).options(joinedload(getattr(model, 'taxa'))):
            if not instance.taxa:
                instance.active = False
def run(args):
    dataset = get_dataset(args)
    with update(dataset.dir / '.zenodo.json', indent=4, default=collections.OrderedDict()) as md:
        modules = ['cldf:' + spec.module for spec in dataset.cldf_specs_dict.values()]
        contribs = dataset.dir / 'CONTRIBUTORS.md'
        creators, contributors = get_creators_and_contributors(
            contribs.read_text(encoding='utf8') if contribs.exists() else '', strict=False)
        if creators:
            md['creators'] = [contrib(p) for p in creators]
        if contributors:
            md["contributors"] = [contrib(p) for p in contributors]
        communities = [r["identifier"] for r in md.get("communities", [])] + \
                      [c.strip() for c in nfilter(args.communities.split(','))]
        if communities:
            md['communities'] = [
                {"identifier": community_id} for community_id in sorted(set(communities))]
        md.update(
            {
                "title": dataset.metadata.title,
                "access_right": "open",
                "keywords": sorted(set(md.get("keywords", []) + ["linguistics"] + modules)),
                "upload_type": "dataset",
            }
        )
        if dataset.metadata.citation:
            md['description'] = "<p>Cite the source of the dataset as:</p>\n\n" \
                "<blockquote>\n<p>{}</p>\n</blockquote>".format(
                    html.escape(dataset.metadata.citation))
        if dataset.metadata.zenodo_license:
            md['license'] = {'id': dataset.metadata.zenodo_license}
def format(self, item):
    vs = self.get_obj(item)
    return ', '.join(nfilter([getattr(vs, 'source', None), linked_references(self.dt.req, vs)]))
def source_to_refs(src, lgid, e, lgks, unresolved, fixrefs=None):
    fixrefs = fixrefs or REFS
    ays = list(iter_ayps(src))
    refs = sorted(
        set(ref for s in ays for ref in iter_key_pages(lgid, s, e, lgks)),
        key=lambda r: (r[0], r[1] or ''))
    src_comment = None
    if not refs:
        if repageonly.match(src):
            src = "[%s] default source:%s" % (lgid, src)
            print("PAGEONLY:", src, lgid)
        elif not (src.find("p.c") == -1
                  and src.find("personal communication") == -1
                  and src.find("pers comm") == -1
                  and src.find("pers. comm") == -1
                  and src.find("ieldnotes") == -1
                  and src.find("ield notes") == -1
                  and src.find("forth") == -1
                  and src.find("Forth") == -1
                  and src.find("ubmitted") == -1
                  and src.find("o appear") == -1
                  and src.find("in press") == -1
                  and src.find("in prep") == -1
                  and src.find("in prog") == -1
                  and not src.startswith("http")):
            src_comment = src
        else:
            if ays:
                for author, year, pages, word_from_title in ays:
                    if (author, year, lgid) in fixrefs:
                        refs.append((fixrefs[(author, year, lgid)], pages))
                    else:
                        unresolved.update([(author, year, lgid)])
            else:
                unresolved.update([(src, lgid)])
    return [(k, nfilter(r[1] for r in rs)) for k, rs in groupby(refs, lambda r: r[0])], src_comment
def add_sources(args, data):
    bib = Database.from_file(args.data_file('phoible-references.bib'), lowercase=True)
    ext = [Record.from_string('@' + s, lowercase=True) for s in nfilter(BIB.split('@'))]

    for rec in chain(ext, bib):
        if rec.id not in data['Source']:
            data.add(Source, rec.id, _obj=bibtex2source(rec))

    #
    # add aliases to lookup records with bibtex keys with numeric prefixes without
    # specifying the prefix
    #
    for key in list(data['Source'].keys()):
        if '_' in key:
            no, rem = key.split('_', 1)
            try:
                int(no)
                if rem not in data['Source']:
                    data['Source'][rem] = data['Source'][key]
            except (ValueError, TypeError):
                pass
def get_ts_search_string(s_):
    """Converts a search string into ts_query-conforming syntax:

    - a " " will be replaced by " & "
    - ":*" will be appended to each search term for partial matching ("starts with")
    """
    # If any special character appears, return None and let plainto_tsquery() handle the search.
    if any(e in s_ for e in ["'", '*', ':', '&', '|', '(', ')', '!']):
        return None
    # While creating the tsvector:
    # _ and - will be replaced by . to avoid tokenizing;
    # ,\t\r\n will be replaced by a space to treat them as search separators.
    s = re.sub(r'[,\t\r\n]', ' ', re.sub(r'[_\-]', '.', s_))
    search_items = set(nfilter(re.split(' +', s.replace('"', ''))))
    search_items = nfilter([a.strip() for a in search_items])
    return ' & '.join(['%s:*' % a for a in search_items])
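# A minimal usage sketch for get_ts_search_string() above (assumes the function and its
# imports -- re, nfilter -- are in scope). The expected outputs are inferred from the code:
# each term gets a ':*' suffix, '-'/'_' become '.', and strings containing ts_query
# metacharacters fall back to None so plainto_tsquery() can handle them.
print(get_ts_search_string('kiri-mama'))  # -> 'kiri.mama:*'
print(get_ts_search_string('foo bar'))    # -> e.g. 'foo:* & bar:*' (term order may vary, set-based)
print(get_ts_search_string('foo & bar'))  # -> None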
def feature_properties(self, ctx, req, valueset):
    return {
        'values': list(valueset.values),
        'label': ', '.join(nfilter(v.name for v in valueset.values))
        or self.get_language(ctx, req, valueset).name,
    }
def load_ecoregions(data_file, data):
    ecoregions = jsonload(data_file('ecoregions.json'))['features']
    biome_map = {
        1: ('Tropical & Subtropical Moist Broadleaf Forests', '008001'),
        2: ('Tropical & Subtropical Dry Broadleaf Forests', '557715'),
        3: ('Tropical & Subtropical Coniferous Forests', ''),
        4: ('Temperate Broadleaf & Mixed Forests', ''),
        5: ('Temperate Conifer Forests', ''),
        6: ('Boreal Forests/Taiga', ''),
        7: ('Tropical & Subtropical Grasslands, Savannas & Shrublands', '98ff66'),
        8: ('Temperate Grasslands, Savannas & Shrublands', ''),
        9: ('Flooded Grasslands & Savannas', '0265fe'),
        10: ('Montane Grasslands & Shrublands', 'cdffcc'),
        11: ('Tundra', ''),
        12: ('Mediterranean Forests, Woodlands & Scrub', 'cc9900'),
        13: ('Deserts & Xeric Shrublands', 'feff99'),
        14: ('Mangroves', '870083'),
    }

    for eco_code, features in groupby(
            sorted(ecoregions, key=lambda e: e['properties']['eco_code']),
            key=lambda e: e['properties']['eco_code']):
        features = list(features)
        props = features[0]['properties']
        if int(props['BIOME']) not in biome_map:
            continue
        biome = data['Biome'].get(props['BIOME'])
        if not biome:
            name, color = biome_map[int(props['BIOME'])]
            biome = data.add(
                Biome, props['BIOME'],
                id=str(int(props['BIOME'])),
                name=name,
                description=color or 'ffffff')
        centroid = (None, None)
        f = sorted(features, key=lambda _f: _f['properties']['AREA'])[-1]
        if f['geometry']:
            coords = f['geometry']['coordinates'][0]
            if f['geometry']['type'] == 'MultiPolygon':
                coords = coords[0]
            centroid = get_center(coords)
        polygons = nfilter([_f['geometry'] for _f in features])
        data.add(
            Ecoregion, eco_code,
            id=eco_code,
            name=props['ECO_NAME'],
            description=props['G200_REGIO'],
            latitude=centroid[1],
            longitude=centroid[0],
            biome=biome,
            area=props['area_km2'],
            gbl_stat=Ecoregion.gbl_stat_map[int(props['GBL_STAT'])],
            realm=Ecoregion.realm_map[props['REALM']],
            jsondata=dict(polygons=polygons))
def feature_properties(self, ctx, req, valueset):
    return {
        'values': list(valueset.values),
        'label': ', '.join(nfilter(v.name for v in valueset.values))
        if valueset.parameter.contribution.id == 'Wordlist'
        else self.get_language(ctx, req, valueset).name,
    }
def parse_coords(s):
    cc = nfilter(ss.strip().replace(' ', '') for ss in re.split('[,;]', s))
    res = []
    for i in range(0, len(cc), 2):
        try:
            res.append(Coordinate(cc[i], cc[i + 1]))
        except ValueError:
            continue
    return res
def merged_refs(self, type):
    assert type in ['sub', 'family']
    res = defaultdict(set)
    for m in Reference.pattern.finditer(getattr(self, type) or ''):
        res[m.group('key')].add(m.group('pages'))
    for ref in getattr(self, type + 'refs'):
        res[ref.key].add(ref.pages)
    return [
        Reference(key=key, pages=';'.join(sorted(nfilter(pages))) or None)
        for key, pages in res.items()
    ]
def __init__(self, req, *args, **kw):
    Parameters.__init__(self, req, *args, **kw)
    if kw.get('languages'):
        self.languages = kw['languages']
    elif 'languages' in req.params:
        self.languages = nfilter([
            Language.get(id_, default=None)
            for id_ in req.params['languages'].split(',')])
    else:
        self.languages = []
    self._langs = [aliased(ValueSet, name='l%s' % i) for i in range(len(self.languages))]
class DocumentType(ConfigObject):
    """
    Document types categorize Glottolog references.
    """
    rank = attr.ib(converter=int)
    #:
    id = attr.ib()
    #:
    name = attr.ib()
    #:
    description = attr.ib()
    #:
    abbv = attr.ib()
    bibabbv = attr.ib()
    webabbr = attr.ib()
    triggers = attr.ib(converter=lambda s: nfilter(s.split('\n')))
def get_env(template_dir=None, fallback_template_dir=None):
    loader = jinja2.FileSystemLoader(
        searchpath=[str(d) for d in nfilter([template_dir, fallback_template_dir, TEMPLATE_DIR])])
    env = jinja2.Environment(loader=loader, trim_blocks=True, lstrip_blocks=True)

    def paragraphs(s):
        return '\n\n'.join(s.split('\n'))

    env.filters['paragraphs'] = paragraphs
    return env
def serialize(obj):
    if obj is None:
        return ''
    if isinstance(obj, string_types):
        return obj
    if isinstance(obj, list):
        return ';'.join(list(sorted(nfilter(obj))))
    if isinstance(obj, tuple):
        if obj[0] is None:
            return ''
        return '{0:.6f} {1:.6f}'.format(*obj)
    if isinstance(obj, date):
        return obj.isoformat()
    raise ValueError(obj)  # pragma: no cover
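# A small usage sketch for serialize() above (assumes the function, `string_types`, and
# `from datetime import date` are in scope); the expected values follow directly from the
# branches of the function.
print(serialize(None))                  # -> ''
print(serialize(['b', '', 'a', None]))  # -> 'a;b' (falsy items dropped by nfilter, rest sorted)
print(serialize((48.85, 2.35)))         # -> '48.850000 2.350000'
print(serialize(date(2020, 1, 31)))     # -> '2020-01-31'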
def __init__(self, req, *args, **kw): Parameters.__init__(self, req, *args, **kw) if kw.get('languages'): self.languages = kw['languages'] elif 'languages' in req.params: self.languages = nfilter([ Language.get(id_, default=None) for id_ in req.params['languages'].split(',') ]) else: self.languages = [] self._langs = [ aliased(ValueSet, name='l%s' % i) for i in range(len(self.languages)) ]
def split(self, item, value, lexemes=None):
    lexemes = lexemes or {}
    if value in lexemes:
        log.debug('overriding via lexemes.csv: %r -> %r' % (value, lexemes[value]))
        value = lexemes[value]
    if self.normalize_unicode:
        value = unicodedata.normalize(self.normalize_unicode, value)
    res = misc.nfilter(
        self.clean(form, item=item)
        for form in text.split_text_with_context(
            value, separators=self.separators, brackets=self.brackets))
    if self.first_form_only:
        return res[:1]
    return res
def split_text(text, separators=re.compile(r'\s'), brackets=None, strip=False):
    """Split text along the separators unless they appear within brackets.

    :param separators: An iterable of single characters or a compiled regex pattern.
    :param brackets: `dict` mapping start tokens to end tokens of what is to be \
    recognized as brackets.

    .. note:: This function will also strip content within brackets.
    """
    if not isinstance(separators, PATTERN_TYPE):
        separators = re.compile(
            r'[{0}]'.format(''.join(r'\{0}'.format(c) for c in separators)))
    return nfilter(
        s.strip() if strip else s
        for s in separators.split(strip_brackets(text, brackets=brackets)))
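# A hedged usage sketch for split_text() above, assuming it is clldutils.text.split_text and
# that the default bracket set covers parentheses; the expected output follows from the code.
from clldutils.text import split_text

print(split_text('green, greenish (of colour), blue', separators=',', strip=True))
# expected: ['green', 'greenish', 'blue'] -- bracketed content is stripped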
def get_creators_and_contributors(fname, strict=True):
    ctypes = {c.lower(): c for c in CONTRIBUTOR_TYPES}
    creators, contributors = [], []
    for row in iter_rows(fname):
        row = {k.lower(): v for k, v in row.items()}
        for role in nfilter([r.strip().lower() for r in row.get('role', '').split(',')]):
            c = {k: v for k, v in row.items() if k != 'role'}
            if role in {'author', 'creator', 'maintainer'}:
                creators.append(c)
            else:
                if strict:
                    c['type'] = ctypes[role]
                else:
                    c['type'] = ctypes.get(role, 'Other')
                contributors.append(c)
    return creators, contributors
def from_txt(cls, txt, session=None, **kw):
    session = session or DBSession
    lines = nfilter(txt.split('\n'))
    m = LANGUAGE_LINE_PATTERN.match(lines[0])
    assert m
    kw['id'] = m.group('name')
    kw['name'] = ' '.join(s.capitalize() for s in kw['id'].split('_'))
    for cname in ['wals', 'ethnologue', 'glottolog']:
        if m.group(cname[0]):
            kw['classification_' + cname] = m.group(cname[0])
    kw.update(parse_metadata(lines[1]))
    doculect = cls(**kw)
    if doculect.classification_ethnologue:
        doculect.ethnologue_family = doculect.classification_ethnologue.split(',')[0]
    if doculect.classification_glottolog:
        doculect.glottolog_family = doculect.classification_glottolog.split(',')[0]
    doculect.wordlist = Contribution(id=kw['id'], language=doculect, name=doculect.id)
    parameters = {p.id: p for p in session.query(Parameter)}
    for line in lines[2:]:
        if '\t' in line:
            wid, words, comment = parse_word(line)
            # if int(wid) not in MEANINGS_ALL:
            #     # drop non-core meanings
            #     continue
            vsid = '%s-%s' % (doculect.id, wid)
            vs = Synset(
                id=vsid,
                description=comment,
                language=doculect,
                contribution=doculect.wordlist,
                parameter=parameters[wid])
            for i, word in enumerate(words):
                id_ = '%s-%s' % (vsid, i + 1)
                word, loan = word
                word = Word(id=id_, name=word, valueset=vs, loan=loan)
    return doculect
def iterupdated(self, languoids):  # pragma: no cover
    res = reader(
        io.StringIO(requests.get(MD_URL).content.decode('utf-8-sig')),
        dialect=Dialect(skipBlankRows=True, commentPrefix='<'),
        dicts=True)
    md = {d['language_code']: d for d in res}
    lmap = collections.defaultdict(set)
    for line in requests.get(URL).text.splitlines():
        if line.startswith('var curItem'):
            line = line.split('=', maxsplit=1)[1]
            d = json.loads(line)
            if d['AIATSIS_Code'] and d['Glottolog_ID']:
                codes = [c.strip().replace('*', '') for c in d['AIATSIS_Code'].split(',')]
                for code in codes:
                    if code:
                        if code not in md:
                            print(d['AIATSIS_Code'], list(md.keys())[:10])
                            continue
                        lmap[d['Glottolog_ID']].add(code)
    with pathlib.Path(__file__).parent.joinpath('aiatsis.json').open(encoding='utf8') as fp:
        for code, gc in json.load(fp).items():
            if code not in md:
                print(code, list(md.keys())[:10])
                continue
            lmap[gc].add(code)
    for lang in languoids:
        links, names = [], []
        for c in sorted(lmap.get(lang.id, [])):
            links.append((md[c]['uri'], md[c]['language_name']))
            if md[c]['language_name']:
                names.append(md[c]['language_name'])
            names.extend(nfilter([n.strip() for n in md[c]['language_synonym'].split('|')]))
        if any([
            lang.update_links(DOMAIN, links),
            lang.update_names(names, type_='aiatsis'),
        ]):
            yield lang
def details(path):
    soup = get_soup(path)
    if not soup.find('h2'):
        return
    res = dict(id=path.split('/')[-1], name=soup.find('h2').get_text())
    data = OrderedDict()
    for tr in soup.find_all('tr'):
        tds = list(tr.find_all('td'))
        if len(tds) == 3:
            data[tds[0].get_text().strip()] = tds[2].get_text().strip()
    names = data.get('ALSO KNOWN AS')
    if names:
        res['alternative_names'] = nfilter([n.strip() for n in names.split(',')])
    if data.get('CODE AUTHORITY') == 'ISO 639-3':
        res['iso-639-3'] = data.get('LANGUAGE CODE')
    return res
def merged_rows(rows, active):
    if all(r['Conflict'].lower().startswith('true') for r in rows):
        for row in rows:
            if row['Select'].lower().strip() == 'true':
                return row
        assert rows[0]['Feature_ID'] not in active, str(rows)
        return None
    elif all(r['Conflict'].lower().strip() == 'false' for r in rows):
        row = rows[0]
        for k in ['Sheet', 'Comment', 'Source']:
            row[k] = [row[k]]
        for r in rows[1:]:
            for k in ['Sheet', 'Comment', 'Source']:
                row[k].append(r[k])
        for k, sep in [('Sheet', ' '), ('Comment', '. '), ('Source', '; ')]:
            row[k] = sep.join(nfilter(row[k]))
        return row
    raise ValueError(rows)
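# A hedged usage sketch for merged_rows() above (the function and nfilter are assumed to be
# in scope; the row dicts below are made up for illustration). It exercises the non-conflicting
# branch, where Sheet/Comment/Source are concatenated across rows.
rows = [
    {'Conflict': 'false', 'Select': '', 'Feature_ID': 'GB020',
     'Sheet': 'A', 'Comment': 'first', 'Source': 'Smith 2001'},
    {'Conflict': 'false', 'Select': '', 'Feature_ID': 'GB020',
     'Sheet': 'B', 'Comment': '', 'Source': 'Jones 2010'},
]
merged = merged_rows(rows, active=set())
print(merged['Sheet'])    # -> 'A B'
print(merged['Comment'])  # -> 'first'
print(merged['Source'])   # -> 'Smith 2001; Jones 2010'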
def split_text_with_context(text, separators=WHITESPACE, brackets=None):
    """Splits text at separators outside of brackets.

    :param text:
    :param separators: An iterable of single character tokens.
    :param brackets:
    :return: A `list` of non-empty chunks.

    .. note:: This function leaves content in brackets in the chunks.
    """
    res, chunk = [], []
    for c, type_ in _tokens(text, brackets=brackets):
        if type_ == TextType.text and c in separators:
            res.append(''.join(chunk).strip())
            chunk = []
        else:
            chunk.append(c)
    res.append(''.join(chunk).strip())
    return nfilter(res)
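# A hedged sketch contrasting split_text_with_context() with split_text() above, assuming
# both come from clldutils.text: separators inside brackets do not split, and bracketed
# material is kept rather than stripped.
from clldutils.text import split_text_with_context

print(split_text_with_context('foo, bar (and, maybe, baz), qux', separators=','))
# expected: ['foo', 'bar (and, maybe, baz)', 'qux']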
def report(dataset, tr_analysis=None, glottolog=None, log=None):
    #
    # FIXME: in case of multiple cldf datasets:
    # - write only summary into README.md
    # - separate lexemes.md and transcriptions.md
    #
    lines = []

    # add NOTES.md
    if dataset.dir.joinpath('NOTES.md').exists():
        lines.append('## Notes\n')
        lines.append(dataset.dir.joinpath('NOTES.md').read_text() + '\n\n')

    badges = nfilter([build_status_badge(dataset)])
    for cldf_spec in dataset.cldf_specs_dict.values():
        lines.extend(cldf_report(cldf_spec, tr_analysis, badges, log, glottolog))
        break
    return '\n'.join(lines)
def get_personnel(self, args, contributors_path=None):
    if contributors_path is None:
        contributors_path = self.dir / "CONTRIBUTORS.md"
    personnel = {'author': [], 'data entry': [], 'consultant': []}
    try:
        for d in itertools.chain.from_iterable(
                itertools.chain(pylexibank.get_creators_and_contributors(contributors_path))):
            if 'name' in d and d['name']:
                for desc in nfilter([
                        r.strip().lower() for r in d.get('description', '').split(',')]):
                    if desc in personnel and d['name'] not in personnel[desc]:
                        personnel[desc].append(d['name'])
            else:
                args.log.warn("No 'name' found in file 'CONTRIBUTORS.md'")
    except FileNotFoundError:  # pragma: no cover
        args.log.warn("File '{}' not found".format(contributors_path))
    return personnel
def cmd_makecldf(self, args):
    args.writer.add_sources()
    langs = {lang["Name"]: lang["Glottocode"] for lang in self.languages}
    concepticon = {
        c.number: c.concepticon_id for c in self.conceptlists[0].concepts.values()}
    varieties, meanings, allforms, rels = parse(self)

    for mn, cognatesets in sorted(allforms.items()):
        args.writer.add_concept(ID=mn, Name=meanings[mn], Concepticon_ID=concepticon[mn])
        for ccn, forms in sorted(cognatesets.items()):
            for ln, form in forms:
                args.writer.add_language(
                    ID=ln,
                    Name=varieties[ln]["name"].strip(),
                    Glottocode=langs[varieties[ln]["name"].strip()],
                )
                ffs = [ff.strip().lower() for ff in form.split(",")]
                for i, f in enumerate(nfilter(ffs)):
                    for row in args.writer.add_lexemes(
                            Language_ID=ln,
                            Parameter_ID=mn,
                            Value=f,
                            Source=["Dyen1992"],
                            Cognacy="%s-%s" % (mn, ccn),
                    ):
                        if len(ffs) == 1 and (2 <= int(ccn) <= 99 or 200 <= int(ccn) <= 399):
                            # most conservative cognacy judgements only
                            args.writer.add_cognate(
                                lexeme=row,
                                Cognateset_ID="%s-%s" % (mn, ccn),
                                Source="Dyen1992")
def getlist(self, section, option):
    return nfilter(self.get(section, option, fallback='').strip().split('\n'))
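# A self-contained sketch of what getlist() above does, assuming it is a method on a
# configparser.ConfigParser subclass (the INI content below is made up); nfilter is inlined
# as a list comprehension so the sketch does not depend on clldutils being installed.
import configparser

class ListConfig(configparser.ConfigParser):
    def getlist(self, section, option):
        # same logic as above: split a multi-line value and drop empty lines
        return [s for s in self.get(section, option, fallback='').strip().split('\n') if s]

cfg = ListConfig()
cfg.read_string("[paths]\ndirs =\n    /data\n    /backup\n")
print(cfg.getlist('paths', 'dirs'))  # expected: ['/data', '/backup']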
def load(self, submission, data, vocab, lang, comparison_meanings, labels):

    def id_(oid):
        return '%s-%s' % (submission.id, oid)

    print('######\n a CLDF dict! \n####')
    try:
        media = {d['ID']: d for d in self.cldf['media.csv']}
    except KeyError:
        media = {}
    metalanguages = submission.props.get('metalanguages', {})

    entries = self.cldf['EntryTable']
    colmap = {k: self.cldf['EntryTable', k].name
              for k in ['id', 'headword', 'partOfSpeech', 'languageReference', 'source']
              if self.cldf.get(('EntryTable', k))}
    fks = get_foreign_keys(self.cldf, entries)
    elabels = get_labels('entry', entries, colmap, submission, exclude=fks['EntryTable'][:])

    for lemma in entries:
        if not lemma[colmap['headword']]:
            continue
        oid = lemma.pop(colmap['id'])
        word = data.add(
            models.Word, oid,
            id=id_(oid),
            name=lemma.pop(colmap['headword']),
            pos=lemma.pop(colmap['partOfSpeech']),
            dictionary=vocab,
            language=lang)
        DBSession.flush()

        files = [(md5, media[md5]) for md5 in set(lemma.get('Media_IDs', [])) if md5 in media]
        for md5, spec in sorted(
                files,
                key=lambda i: i[1].get(
                    submission.props.get('media_order', 'Description')) or i[1]['ID']):
            submission.add_file(None, md5, common.Unit_files, word, spec)

        self.add_refs(data, 'EntryTable', lemma, word)

        for index, (key, label) in enumerate(elabels.items()):
            label, with_links = label
            if lemma.get(key):
                DBSession.add(common.Unit_data(
                    object_pk=word.pk,
                    key=label,
                    value=lemma[key],
                    ord=index,
                    jsondata=dict(with_links=with_links)))

    DBSession.flush()

    #
    # Now that all entries are in the DB and have primary keys, we can create the
    # self-referential links:
    #
    fullentries = defaultdict(list)
    for lemma in entries:
        fullentries[lemma[colmap['id']]].extend(list(lemma.items()))
        word = data['Word'][lemma[colmap['id']]]
        for col in fks['EntryTable']:
            col = self.cldf['EntryTable', col]
            label = col.titles.getfirst() if col.titles else col.name
            if label == 'Entry_IDs':
                label = 'See also'
            label = label.replace('_', ' ')
            for lid in lemma[col.name] or []:
                if lid not in data['Word']:
                    print('missing entry ID: {0}'.format(lid))
                else:
                    DBSession.add(models.SeeAlso(
                        source_pk=word.pk,
                        target_pk=data['Word'][lid].pk,
                        description=label))

    sense2word = {}
    colmap = {k: self.cldf['SenseTable', k].name
              for k in ['id', 'entryReference', 'description', 'source']
              if self.cldf.get(('SenseTable', k))}
    fks = get_foreign_keys(self.cldf, self.cldf['SenseTable'])
    slabels = get_labels(
        'sense',
        self.cldf['SenseTable'],
        colmap,
        submission,
        exclude=['alt_translation1', 'alt_translation2'] + fks['EntryTable'][:])

    for sense in self.cldf['SenseTable']:
        fullentries[sense[colmap['entryReference']]].extend(list(sense.items()))
        sense2word[sense[colmap['id']]] = sense[colmap['entryReference']]
        try:
            w = data['Word'][sense[colmap['entryReference']]]
        except KeyError:
            print('missing entry: {0}'.format(sense[colmap['entryReference']]))
            continue
        dsc = sense[colmap['description']]
        if not isinstance(dsc, list):
            dsc = [dsc]
        kw = dict(
            id=id_(sense[colmap['id']]),
            name='; '.join(nfilter(dsc)),
            semantic_domain=sense.pop('Semantic_Domain', None),
            word=w)
        if 'alt_translation1' in sense and metalanguages.get('gxx'):
            kw['alt_translation1'] = sense['alt_translation1']
            kw['alt_translation_language1'] = metalanguages.get('gxx')
        if 'alt_translation2' in sense and metalanguages.get('gxy'):
            kw['alt_translation2'] = sense['alt_translation2']
            kw['alt_translation_language2'] = metalanguages.get('gxy')
        m = data.add(models.Meaning, sense[colmap['id']], **kw)
        DBSession.flush()
        self.add_refs(data, 'SenseTable', sense, m)

        for index, (key, label) in enumerate(slabels.items()):
            label, with_links = label
            if sense.get(key):
                DBSession.add(models.Meaning_data(
                    object_pk=m.pk,
                    key=label,
                    value=sense[key],
                    ord=index,
                    jsondata=dict(with_links=with_links)))

        for i, md in enumerate(nfilter(sense[colmap['description']]), start=1):
            key = md.lower()
            if key in comparison_meanings:
                concept = comparison_meanings[key]
            else:
                continue
            vsid = '%s-%s' % (lang.id, concept)
            vs = data['ValueSet'].get(vsid)
            if not vs:
                vs = data.add(
                    common.ValueSet, vsid,
                    id=vsid,
                    language=lang,
                    contribution=vocab,
                    parameter_pk=concept)
            DBSession.add(models.Counterpart(
                id='{0}-{1}'.format(m.id, i), name=w.name, valueset=vs, word=w))

        DBSession.flush()

        files = [(md5, media[md5]) for md5 in set(sense.get('Media_IDs', [])) if md5 in media]
        for md5, spec in sorted(
                files,
                key=lambda i: i[1].get(
                    submission.props.get('media_order', 'Description')) or i[1]['ID']):
            submission.add_file(None, md5, models.Meaning_files, m, spec)

        for col in fks['EntryTable']:
            col = self.cldf['SenseTable', col]
            if col.name == colmap['entryReference']:
                continue
            label = col.titles.getfirst() if col.titles else col.name
            label = label.replace('_', ' ')
            entry_ids = sense[col.name]
            if entry_ids:
                if not isinstance(entry_ids, list):
                    entry_ids = [entry_ids]
                for eid in entry_ids:
                    if eid not in data['Word']:
                        print('missing entry ID: {0}'.format(eid))
                    else:
                        DBSession.add(models.Nym(
                            source_pk=m.pk,
                            target_pk=data['Word'][eid].pk,
                            description=label))

    colmap = {k: self.cldf['ExampleTable', k].name
              for k in ['id', 'primaryText', 'translatedText']}
    for ex in self.cldf['ExampleTable']:
        #
        # FIXME: Detect the column with sense IDs by looking at the foreign keys!
        #
        mids = ex.get('Senses') or ex.get('Sense_IDs', [])
        if not isinstance(mids, list):
            mids = mids.split(' ; ')
        for mid in mids:
            if mid not in data['Meaning']:
                continue
            if mid in sense2word:
                fullentries[sense2word[mid]].extend(list(ex.items()))
                models.MeaningSentence(
                    meaning=data['Meaning'][mid],
                    sentence=data['Example'][ex[colmap['id']]])
            else:
                print('missing sense: {0}'.format(mid))

    for wid, d in fullentries.items():
        if wid in data['Word']:
            data['Word'][wid].fts = tsvector(
                '; '.join('{0}: {1}'.format(k, v) for k, v in d if v))
def nattr(p, attr):
    return len(nfilter([getattr(i, attr, None) for i in read_all(p)]))
def tests_path(*comps):
    return Path(__file__).parent.joinpath(*nfilter(comps))
def test_nfilter():
    from clldutils.misc import nfilter
    assert nfilter(range(5)) == list(range(1, 5))
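# A short, self-contained sketch of what nfilter does: it drops falsy items from an
# iterable and returns a list (equivalent to list(filter(None, seq))).
from clldutils.misc import nfilter

assert nfilter(range(5)) == [1, 2, 3, 4]            # 0 is falsy and gets dropped
assert nfilter(['a', '', None, 'b']) == ['a', 'b']  # empty strings and None are dropped too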
def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    data = Data()

    concept_list = Concepticon(CONCEPTICON_REPOS).conceptlist('Key-2016-1310')

    def concepticon_id(ids_code):
        for item in concept_list:
            if item['IDS_ID'] == ids_code:
                return int(item['CONCEPTICON_ID']) if item['CONCEPTICON_ID'] else None

    def read(table):
        fname = args.data_file(table + '.all.csv')
        if not fname.exists():
            fname = args.data_file(table + '.csv')
        return list(dsv.reader(fname, namedtuples=True))

    dataset = common.Dataset(
        id=ids.__name__,
        name="IDS",
        description="The Intercontinental Dictionary Series",
        published=date(2015, 5, 25),
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License',
        },
        domain='ids.clld.org')
    DBSession.add(dataset)

    for rec in Database.from_file(args.data_file('sources.bib'), lowercase=True):
        if rec.id not in data['Source']:
            data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    DBSession.flush()

    data_desc = defaultdict(dict)
    for l in read('x_lg_data'):
        data_desc[l.lg_id][l.map_ids_data] = l.header

    # language lang
    iso_codes = {l.id: l.sil_code for l in read('sil_lang')}
    iso_codes = {l.lg_id: iso_codes[l.sil_id] for l in read('x_lg_sil')}
    languages = []

    exclude = []
    for l in read('lang'):
        if l.status == '1':
            exclude.append(l.lg_id)
            continue
        lang_changed = LANGS.get(int(l.lg_id), {})
        code = lang_changed.get('glotto') or lang_changed.get('iso') or iso_codes.get(l.lg_id)
        lang = data.add(
            models.IdsLanguage, l.lg_id,
            id=l.lg_id,
            name=lang_changed.get('name', l.lg_name))
        if code:
            languages.append((code, lang))
        data.add(
            models.Dictionary, l.lg_id,
            id=l.lg_id,
            name=l.lg_name,
            language=lang,
            default_representation=data_desc[l.lg_id].get('1'),
            alt_representation=data_desc[l.lg_id].get('2'),
            jsondata=dict(status=l.status, date=l.date))

    iso2glotto = {}
    for l in walk_tree(tree=languoids_path('tree', GLOTTOLOG_REPOS)):
        if l.iso:
            iso2glotto[l.iso] = l.id

    load_families(
        Data(),
        [(iso2glotto.get(c, c), l) for c, l in languages],
        glottolog=Glottolog(GLOTTOLOG_REPOS),
        isolates_icon='tcccccc')

    contributors = defaultdict(list)
    sources = defaultdict(list)
    for l in read('lang_compilers'):
        if l.lg_id in exclude:
            continue
        if l.name == "BIBIKO":
            continue
        # name lg_id what_did_id
        if int(l.what_did_id) in models.ROLES:
            contributors[slug(l.name)].append((l.name, int(l.what_did_id), l.lg_id))
        else:
            assert int(l.what_did_id) in [4, 395]
            sources[l.name].append(l.lg_id)

    for s, roles in contributors.items():
        name = roles[0][0]
        c = data.add(common.Contributor, s, id=s, name=name)
        if name == 'Mary Ritchie Key':
            c.address = 'University of California, Irvine'
        for lg, specs in groupby(sorted(roles, key=lambda r: r[2]), key=lambda r: r[2]):
            sroles = sorted(
                [s[1] for s in specs],
                reverse=True,
                key=lambda what: what + 2 if what == 2 else what)
            what = sroles[0]
            DBSession.add(common.ContributionContributor(
                contribution=data['Dictionary'][lg],
                contributor=c,
                ord=what,
                primary=what == 2))

    data.add(
        common.Contributor, 'bernardcomrie',
        id='bernardcomrie',
        name="Bernard Comrie",
        address="University of California, Santa Barbara")

    for i, editor in enumerate(['maryritchiekey', 'bernardcomrie']):
        common.Editor(dataset=dataset, contributor=data['Contributor'][editor], ord=i + 1)

    #for i, name in enumerate(sorted(sources.keys())):
    #    c = data.add(common.Source, name, id=str(i + 1), name=name, description=name)

    DBSession.flush()
    for name, lgs in sources.items():
        for _src in name.split(';'):
            src = data['Source'].get(_src.strip())
            if not src:
                print('-- missing source --', _src)
                raise ValueError
            for lg in lgs:
                if lg in exclude:
                    continue
                assert lg in data['Dictionary']
                DBSession.add(common.ContributionReference(
                    contribution_pk=data['Dictionary'][lg].pk, source_pk=src.pk))

    altnames = {}
    for i, l in enumerate(read('alt_names')):
        if l.name in altnames:
            identifier = altnames[l.name]
        else:
            identifier = data.add(
                common.Identifier, l.name,
                id='name-%s' % i,
                type='name',
                name=l.name,
                description='IDS')
            altnames[l.name] = identifier
        if l.lg_id not in exclude and l.name != data['IdsLanguage'][l.lg_id].name:
            DBSession.add(common.LanguageIdentifier(
                identifier=identifier,
                language=data['IdsLanguage'][l.lg_id]))

    # parameter chapter/entry
    for l in read('chapter'):
        data.add(models.Chapter, l.chap_id, id=l.chap_id, name=l.chap_title)

    entries = {}
    for l in read('entry'):
        id_ = '%s-%s' % (l.chap_id, l.entry_id)
        name = l.trans_english
        if name in entries:
            entries[name] += 1
            name = name + ' (%s)' % entries[name]
        else:
            entries[name] = 1
        kw = {
            'id': id_,
            'name': name,
            'concepticon_id': concepticon_id(id_),
            'chapter': data['Chapter'][l.chap_id]}
        for ll in 'french russian spanish portugese'.split():
            kw[ll] = getattr(l, 'trans_' + ll)
        data.add(models.Entry, id_, sub_code=l.entry_id, **kw)

    misaligned = []

    DBSession.flush()
    for entity in 'IdsLanguage Entry Chapter Dictionary'.split():
        for k in data[entity].keys()[:]:
            data[entity][k] = data[entity][k].pk

    synsets = set()
    counterparts = set()
    problems = defaultdict(list)

    for lg_id, entries in groupby(
            sorted(read('ids'), key=lambda t: t.lg_id), lambda k: k.lg_id):
        if lg_id in exclude or not lg_id:
            continue

        # keep the memory footprint reasonable
        transaction.commit()
        transaction.begin()

        language = common.Language.get(data['IdsLanguage'][lg_id])
        desc = data_desc.get(lg_id, {})
        words = defaultdict(list)
        for l in entries:
            if empty.match(l.data_1):
                continue

            entry_id = '%s-%s' % (l.chap_id, l.entry_id)
            if entry_id not in data['Entry']:
                continue
                #data.add(
                #    models.Entry, entry_id,
                #    id=entry_id,
                #    name=entry_id,
                #    concepticon_id=concepticon_id(entry_id),
                #    sub_code=l.entry_id,
                #    chapter_pk=data['Chapter'][l.chap_id])
                #DBSession.flush()
                #data['Entry'][entry_id] = data['Entry'][entry_id].pk

            id_ = '%s-%s' % (entry_id, l.lg_id)
            if id_ in synsets:
                vs = models.Synset.get(id_)
            else:
                vs = models.Synset(
                    id=id_,
                    comment=get_string(l.comment or ''),
                    alt_representation=get_string(l.data_2),
                    language=language,
                    contribution_pk=data['Dictionary'][l.lg_id],
                    parameter_pk=data['Entry'][entry_id])
                synsets.add(id_)

            trans1 = list(split_counterparts(l.data_1))
            trans2 = None if empty.match(l.data_2) else list(split_counterparts(l.data_2))

            if trans2:
                if len(trans2) != len(trans1):
                    if language.id != '238':
                        misaligned.append((l.chap_id, l.entry_id, l.lg_id))
                        #print('===', language.id, language.name)
                        #print(l.data_1)
                        #print(l.data_2)
                    # 83 cases of misaligned transcriptions
                    trans2 = None

            for i, word in enumerate(trans1):
                cid = id_ + '-' + str(i + 1 + len(vs.values))
                if cid not in counterparts:
                    v = models.Counterpart(
                        id=cid,
                        name=word,
                        description=desc.get('1'),
                        valueset=vs)
                    words[word].append((v, trans2[i] if trans2 else None))
                    counterparts.add(cid)
                else:
                    print(cid)
                    #12 - 420 - 811 - 3
                    #5 - 390 - 818 - 3
                    #2 - 930 - 819 - 3
                    #2 - 930 - 819 - 3
                    #3 - 120 - 819 - 3
                    #10 - 140 - 822 - 3
                    #9 - 160 - 825 - 3
                    #2 - 430 - 829 - 4

        for i, form in enumerate(words.keys()):
            # Since we identify words based on their string representation, we have to
            # make sure a word has the same alternative transcription for all meanings.
            if language.id == '238':
                alt_names = []
            else:
                alt_names = set(
                    norm(w[1] or '', desc.get('2'), language.id) for w in words[form])
            alt_names = nfilter(alt_names)
            try:
                assert len(alt_names) <= 1
            except AssertionError:
                problems[(language.id, language.name)].append(alt_names)
            word = models.Word(
                id='%s-%s' % (language.id, i + 1),
                name=form,
                description=desc.get('1'),
                language=language,
                alt_name=', '.join(alt_names) if alt_names else None,
                alt_description=desc.get('2'))
            for v, _ in words[form]:
                word.counterparts.append(v)
            DBSession.add(word)
        DBSession.flush()

    with dsv.UnicodeWriter(args.data_file('misaligned.csv')) as fp:
        fp.writerows(misaligned)

    # about 250 cases where alternative transcriptions do not covary across meanings.
    for k, v in problems.items():
        print(k, len(v))
def main(args):
    #
    # order of init:
    # - villages
    # - files
    # - movies
    #
    videos = defaultdict(list)
    for f in util.iter_files(args):
        obj = models.File(**attr.asdict(f))
        if obj.mime_type.startswith('video'):
            videos[slug(obj.name.split('.')[0])].append(obj)
        DBSession.add(obj)

    lexicon = list(util.iter_lexicon(args))
    villages = util.get_villages(args)
    ff_images = list(util.ff_images(args))
    bib = list(util.get_bib(args))
    data = Data()

    dataset = common.Dataset(
        id=dogonlanguages.__name__,
        name="Dogon and Bangime Linguistics",
        contact="*****@*****.**",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='dogonlanguages.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'}
    )
    DBSession.add(dataset)

    if Glottolog:
        if socket.gethostname() == 'dlt5502178l':
            glottolog = Glottolog(
                Path(dogonlanguages.__file__).parent.parent.parent.parent.joinpath(
                    'glottolog3', 'glottolog'))
        else:
            glottolog = Glottolog(
                Path(dogonlanguages.__file__).parent.parent.parent.parent.joinpath('glottolog'))
        languoids = {l.id: l for l in glottolog.languoids()}
    else:
        languoids = {}
    print('got glottolog')

    for c in util.CONTRIBUTORS:
        id_ = slug(c.name.split()[-1])
        data.add(models.Member, id_, id=id_, **attr.asdict(c))
    data.add(
        models.Member, 'forkel',
        id='forkel',
        name='Robert Forkel',
        email='*****@*****.**',
        in_project=False)

    for i, id_ in enumerate(['moran', 'forkel', 'heath']):
        DBSession.add(common.Editor(
            dataset=dataset, ord=i + 1, contributor=data['Member'][id_]))

    contrib = data.add(common.Contribution, 'd', id='d', name='Dogon Languages')
    for doc in bib:
        obj = data.add(
            models.Document, doc.rec.id,
            _obj=bibtex2source(doc.rec, cls=models.Document))
        keywords = nfilter([s.strip() for s in doc.rec.get('keywords', '').split(',')])
        for dt in 'grammar lexicon typology texts'.split():
            if dt in keywords:
                obj.doctype = dt
                break
        obj.project_doc = ('DLP' in keywords) or bool(doc.files)
        if obj.project_doc:
            for i, cid in enumerate(util.get_contributors(doc.rec, data)):
                models.DocumentContributor(
                    document=obj, contributor=data['Member'][cid], ord=i)
        for i, (path, cdstar) in enumerate(doc.files):
            common.Source_files(
                id='%s-%s' % (obj.id, i + 1),
                name=path,
                object=obj,
                mime_type=guess_type(path)[0],
                jsondata=cdstar,
            )
    print('got bib')

    for name, (gc, desc) in LANGUAGES.items():
        gl_lang = languoids[gc]
        lat, lon = gl_lang.latitude, gl_lang.longitude
        lang = data.add(
            models.Languoid, gc,
            id=gc,
            name=name,
            description=desc,
            latitude=lat,
            longitude=lon,
            family=gl_lang.family.name if gl_lang and gl_lang.family else name,
        )
        if name == 'Penange' and lang.longitude > 0:
            lang.longitude = -lang.longitude
        if name == 'Bankan Tey':
            lang.latitude, lang.longitude = 15.07, -2.91
        if name == 'Ben Tey':
            lang.latitude, lang.longitude = 14.85, -2.95
        if name == 'Togo Kan':
            lang.latitude, lang.longitude = 14.00, -3.25
        add_language_codes(data, lang, gl_lang.iso, glottocode=gc)

    villages_by_name = defaultdict(list)
    contrib_by_initial = {c.abbr: c for c in data['Member'].values()}
    for i, village in enumerate(villages):
        lang = None
        if village.glottocode:
            lang = data['Languoid'].get(village.glottocode)
            if not lang:
                gl_lang = languoids[village.glottocode]
                lang = data.add(
                    models.Languoid, gl_lang.id,
                    id=gl_lang.id,
                    name=gl_lang.name,
                    in_project=False,
                    family=gl_lang.family.name if gl_lang.family else gl_lang.name)
        v = data.add(
            models.Village, str(i + 1),
            id=str(i + 1),
            name=village.name,
            description=village.data.pop('social info'),
            surnames=village.data.pop('surnames'),
            major_city=village.data['MajorCity'] == 'Y',
            transcribed_name=village.data.pop('Transcribed Village Name'),
            source_of_coordinates=village.data.pop('sourceOfCoordinates'),
            latitude=village.lat,
            longitude=village.lon,
            languoid=lang,
            jsondata=village.data,
        )
        villages_by_name[village.name].append(v)
        for img in village.images:
            mimetype = guess_type(img.name)[0]
            if mimetype:
                f = models.Village_files(
                    id=img.id,
                    name=img.name,
                    description=img.description,
                    date_created=img.date,
                    latitude=img.coords[0] if img.coords else None,
                    longitude=-img.coords[1] if img.coords else None,
                    object=v,
                    mime_type=mimetype,
                    jsondata=img.cdstar,
                )
                for initial in img.creators:
                    if initial in contrib_by_initial:
                        models.Fotographer(
                            foto=f, contributor=contrib_by_initial[initial])

    for cat, desc, place, name in MOVIES:
        s = slug(name)
        m = models.Movie(
            id=s,
            name=desc,
            description=cat,
            place=place,
        )
        if place in villages_by_name and len(villages_by_name[place]) == 1:
            m.village = villages_by_name[place][0]
            #print('found village: %s' % name)
        for v in videos[s]:
            #print('found video: %s' % name)
            v.movie = m
            m.duration = v.duration

    names = defaultdict(int)
    for concept in lexicon:
        add(concept, data, names, contrib)

    count = set()
    for img in ff_images:
        if img.id in count:
            continue
        count.add(img.id)
        if img.ref:
            if img.ref in data['Concept']:
                concept = data['Concept'][img.ref]
                if img.tsammalex_taxon and not concept.tsammalex_taxon:
                    concept.tsammalex_taxon = img.tsammalex_taxon
                    #print(concept.tsammalex_taxon)
                common.Parameter_files(
                    object=concept,
                    id=img.id,
                    name=img.name.decode('utf8'),
                    mime_type=guess_type(img.name)[0],
                    jsondata=img.cdstar)
            else:
                print('missing ref: %s' % img.ref)
def load_examples(self, dictionary, data, lang):
    abbr_p = re.compile(r'\$(?P<abbr>[a-z1-3][a-z]*(\.[a-z]+)?)')
    if hasattr(self.dictionary, 'cldf'):
        # ID,Language_ID,Primary_Text,Analyzed_Word,Gloss,Translated_Text,Meta_Language_ID,Comment,Sense_IDs,Analyzed,Media_IDs
        # XV000001,tzh,lek a lok',,,salió bien,,,SN000001,,
        colmap = {}
        for k in [
            'id',
            'primaryText',
            'analyzedWord',
            'gloss',
            'translatedText',
            'languageReference',
            'metaLanguageReference',
            'comment',
        ]:
            try:
                colmap[k] = self.dictionary.cldf['ExampleTable', k].name
            except KeyError:
                pass
        for i, ex in enumerate(self.dictionary.cldf['ExampleTable']):
            obj = data.add(
                models.Example, ex[colmap['id']],
                id='%s-%s' % (self.id, ex.pop(colmap['id']).replace('.', '_')),
                name=ex.pop(colmap['primaryText']),
                number='{0}'.format(i + 1),
                source=ex.pop('Corpus_Reference', None),
                comment=ex.pop(colmap['comment'], None) if 'comment' in colmap else None,
                original_script=ex.pop('original_script', None),
                language=lang,
                serialized='{0}'.format(ex),
                dictionary=dictionary,
                analyzed='\t'.join(nfilter(ex.pop(colmap['analyzedWord'], []) or []))
                if 'analyzedWord' in colmap else None,
                gloss='\t'.join(
                    [abbr_p.sub(lambda m: m.group('abbr').upper(), g or '')
                     for g in ex[colmap['gloss']]])
                if 'gloss' in colmap and ex[colmap['gloss']]
                else ((ex[colmap['gloss']] or None) if 'gloss' in colmap else None),
                description=ex.pop(colmap['translatedText'], None),
                alt_translation1=ex.pop('alt_translation1', None),
                alt_translation_language1=self.props.get('metalanguages', {}).get('gxx'),
                alt_translation2=ex.pop('alt_translation2', None),
                alt_translation_language2=self.props.get('metalanguages', {}).get('gxy'),
            )
            for col in ['languageReference', 'metaLanguageReference', 'gloss']:
                if col in colmap:
                    del ex[colmap[col]]
            DBSession.flush()
            for md5 in sorted(set(ex.pop('Media_IDs', []))):
                self.add_file(None, md5, common.Sentence_files, obj)
            for k, v in ex.items():
                if v and (k not in ['Sense_IDs']):
                    DBSession.add(common.Sentence_data(
                        object_pk=obj.pk,
                        key=k,
                        value=ex[k],
                    ))
    elif self.dir.joinpath('processed', 'examples.sfm').exists():
        for i, ex in enumerate(
                Examples.from_file(self.dir.joinpath('processed', 'examples.sfm'))):
            obj = data.add(
                models.Example, ex.id,
                id='%s-%s' % (self.id, ex.id.replace('.', '_')),
                name=ex.text,
                number='{0}'.format(i + 1),
                source=ex.corpus_ref,
                language=lang,
                serialized='{0}'.format(ex),
                dictionary=dictionary,
                analyzed=ex.morphemes,
                gloss=abbr_p.sub(lambda m: m.group('abbr').upper(), ex.gloss)
                if ex.gloss else ex.gloss,
                description=ex.translation,
                alt_translation1=ex.alt_translation,
                alt_translation_language1=self.props.get('metalanguages', {}).get('gxx'),
                alt_translation2=ex.alt_translation2,
                alt_translation_language2=self.props.get('metalanguages', {}).get('gxy'))
            DBSession.flush()
            if ex.soundfile:
                self.add_file('audio', ex.soundfile, common.Sentence_files, obj)
def col_defs(self):
    kw = {}
    if self.language:
        kw['bSearchable'] = False
        kw['bSortable'] = False
    name_col = ApicsValueNameCol(self, 'value', **kw)
    if self.parameter and self.parameter.domain:
        name_col.choices = [de.name for de in self.parameter.domain]

    class ValueLanguageCol(LinkCol):
        def search(self, qs):
            if self.dt.language:
                return ValueSet.language_pk == int(qs)
            if self.dt.parameter:
                return icontains(self.dt.vs_lang.name, qs)

        def order(self):
            if self.dt.parameter:
                return cast(self.dt.vs_lang.id, Integer)
            if self.dt.language:
                return ValueSet.language_pk

    lang_col = ValueLanguageCol(
        self, 'language',
        model_col=Language.name,
        get_obj=lambda item: item.valueset.language,
        bSearchable=bool(self.parameter or self.language),
        bSortable=bool(self.parameter or self.language))
    if self.language:
        if self.language.lects:
            lang_col.choices = [
                (l.pk, l.name) for l in [self.language] + self.language.lects]
            lang_col.js_args['sTitle'] = 'lect'
        else:
            lang_col = None

    get_param = lambda i: i.valueset.parameter
    if self.parameter:
        return nfilter([
            lang_col,
            name_col,
            Col(self,
                'lexifier',
                format=lambda i: i.valueset.language.lexifier,
                model_col=self.vs_lect.lexifier,
                choices=get_distinct_values(
                    Lect.lexifier,
                    key=lambda v: 'z' + v if v == 'Other' else v)),
            LinkToMapCol(
                self, 'm',
                get_object=lambda i: None
                if i.valueset.language.language_pk else i.valueset.language),
            DetailsRowLinkCol(self, 'more')
            if self.parameter.feature_type != 'sociolinguistic' else None,
            RefsCol(self, 'source')
            if self.parameter.feature_type != 'segment' else None,
        ])
    if self.language:
        return nfilter([
            IntegerIdCol(self, 'id', get_obj=get_param, model_col=Parameter.id),
            LinkCol(self, 'parameter', get_obj=get_param, model_col=Parameter.name),
            name_col,
            lang_col,
            DetailsRowLinkCol(self, 'more'),
            RefsCol(self, 'source'),
        ])
    return [
        LinkCol(self, 'parameter', get_obj=get_param, model_col=Parameter.name),
        name_col,
        lang_col,
        DetailsRowLinkCol(self, 'more'),
        RefsCol(self, 'source'),
    ]
def joined(iterable):
    return ' / '.join(sorted(nfilter(set(iterable))))
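# A tiny usage sketch for joined() above (assumes the function and nfilter are in scope):
# duplicates and falsy items are dropped, the rest is sorted and joined with ' / '.
assert joined(['b', 'a', '', None, 'b']) == 'a / b'
assert joined([]) == ''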