def cldf(dataset, concepticon, **kw):
    """Convert a WOLD-style dataset into one CLDF dataset per contribution.

    Iterates over the sub-datasets produced by ``clld.itercldf`` and writes
    each one out as a ``CldfDataset`` with extra columns appended
    (``Language_local_ID``, ``Parameter_local_ID``, ``Loan``, ``Context``).

    :param dataset: the dataset object being converted (project type).
    :param concepticon: accepted but not used in this function body.
    :param kw: additional keyword arguments, ignored here.
    :raises AssertionError: if any rows had an unmapped language
        (see NOTE at the bottom).
    """
    unmapped = set()
    for ods in clld.itercldf(dataset, __name__):
        # The contribution/language id is encoded as the last '-'-separated
        # part of the source dataset's name.
        lid = ods.name.split('-')[-1]
        fields = list(ods.fields) + [
            'Language_local_ID', 'Parameter_local_ID', 'Loan', 'Context'
        ]
        with CldfDataset(fields, dataset, subset=lid) as ds:
            ds.table.schema.columns['Loan'].datatype = 'boolean'
            # Link local ids back to their pages on the original web app.
            ds.table.schema.columns['Parameter_local_ID'].valueUrl = \
                clld.url(__name__, path='/meaning/{Parameter_local_ID}')
            ds.table.schema.columns['Language_local_ID'].valueUrl = \
                clld.url(__name__, path='/language/{Language_local_ID}')
            ds.table.schema.columns['Word_ID'].valueUrl = \
                clld.url(__name__, path='/word/{Word_ID}')
            # Carry over only the Dublin Core metadata entries.
            ds.metadata.update(
                {k: v for k, v in ods.metadata.items() if k.startswith('dc:')})
            ds.sources.add(*ods.sources.items())
            for row in ods.rows:
                # NOTE(review): the string 'None' (not the None object) is
                # used as the missing-language sentinel in the source data.
                if row['Language_ID'] == 'None':
                    row['Language_ID'] = None
                    unmapped.add((row['Language_name'], lid))
                keys = list(row.keys())
                # A single Value cell may contain several forms; split() is
                # expected to yield (form, context) pairs — each becomes its
                # own row with a '-<n>' suffix on the original ID.
                for i, (form, context) in enumerate(split(row['Value'])):
                    _row = row.to_list()
                    _row[keys.index('Value')] = form
                    _row[keys.index('ID')] = '%s-%s' % (row['ID'], i + 1)
                    # Note: We count words marked as "probably borrowed" as loans.
                    _row.extend([
                        lid,
                        row['WOLD_Meaning_ID'],
                        float(row['Borrowed_score']) > 0.6,
                        context
                    ])
                    ds.add_row(_row)
    # NOTE(review): input validation via assert — this is stripped when
    # Python runs with -O; an explicit raise would be more robust.
    assert not unmapped
def cognates(self):
    """Yield one cognate-judgement row per (form, cognate set) pair.

    Each entry's name may encode several forms; ``util.split`` breaks it
    into ``(form, context)`` pairs. For every such form and every cognate
    set the entry belongs to, a 10-tuple is yielded:
    (row id, word-list id, form, cognate set id, doubt flag, 'expert',
    followed by four empty placeholder fields). The ``context`` part is
    not included in the output.
    """
    for entry in self.entries:
        # Number the split-off forms starting at 1 to build unique row ids.
        for index, (form, _context) in enumerate(util.split(entry.name), 1):
            row_id = '{0}-{1}'.format(entry.id, index)
            for set_id, doubt in entry.cognates:
                yield (
                    row_id,
                    self.id,
                    form,
                    set_id,
                    doubt,
                    'expert',
                    '',
                    '',
                    '',
                    '',
                )
def check_split(string, expected):
    """Assert that ``util.split(string)`` yields exactly ``expected``.

    Replaces the deprecated nose ``assert_equal`` helper with a plain
    ``assert`` statement: nose is unmaintained, and pytest rewrites bare
    asserts to report both operands on failure, so no helper is needed.

    :param string: raw value string to split.
    :param expected: list of (form, context) pairs expected from the split.
    """
    assert list(util.split(string)) == expected
def cldf(dataset, concepticon, **kw):
    """Convert the raw huntergatherer JSON dumps into a single CLDF dataset.

    Builds concept and language lookup maps, loads the per-language and
    per-feature JSON files from ``dataset.raw``, and writes one row per
    (language, concept, split form) into a ``CldfDataset``.

    :param dataset: the dataset object being converted (project type).
    :param concepticon: accepted but not used in this function body.
    :param kw: additional keyword arguments, ignored here.
    """
    # Map English gloss -> Concepticon id, stripping a leading '*' or '$'
    # marker from the gloss.
    # NOTE(review): non-raw regex string; r'^(\*|\$)' would be cleaner.
    concept_map = {
        re.sub('^(\*|\$)', '', c.english): c.concepticon_id
        for c in dataset.conceptlist.concepts.values()}
    # Fallback mapping keyed by (local id, gloss) for concepts not matched
    # by gloss alone; empty CONCEPTICON_IDs are normalized to None.
    for c in dataset.concepts:
        concept_map[(c['ID'], c['GLOSS'])] = c['CONCEPTICON_ID'] or None
    language_map = {l['ID']: l['GLOTTOCODE'] or None for l in dataset.languages}
    concepts = []
    languages = {}
    # Load per-language JSON dumps and attach the Glottocode.
    for path in dataset.raw.glob('languages-language-*.json'):
        data = jsonlib.load(path)
        data['glottocode'] = language_map[data['id']]
        languages[data['id']] = data
    # Load feature (concept) dumps in numeric order of the trailing id.
    for path in sorted(
            dataset.raw.glob('lexical-feature-*.json'),
            key=lambda p: int(p.stem.split('-')[-1])):
        data = jsonlib.load(path)
        data['concepticon'] = concept_map.get(data['concept'])
        if not data['concepticon']:
            # Fall back to the (id, gloss) keyed map; a KeyError here means
            # the concept is missing from both maps.
            data['concepticon'] = concept_map[(data['id'], data['concept'])]
        concepts.append(data)
    # fields collects per-column value frequencies (side-channel statistics).
    fields = defaultdict(lambda: Counter())
    sources = {}
    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_iso',
            'Language_name',
            'Language_local_ID',
            'Parameter_ID',
            'Parameter_name',
            'Parameter_local_ID',
            'Semantic_field',
            'Value',
            'Context',
            'Loan',
            'Phonemic',
            'Source',
            'Creator',
            'Comment',
    ), dataset) as ds:
        ds.table.schema.columns['Loan'].datatype = 'boolean'
        # Link local ids back to the huntergatherer web application.
        ds.table.schema.columns['Parameter_local_ID'].valueUrl = \
            'https://huntergatherer.la.utexas.edu/lexical/feature/{Parameter_local_ID}'
        ds.table.schema.columns['Language_local_ID'].valueUrl = \
            'https://huntergatherer.la.utexas.edu/languages/language/{Language_local_ID}'
        for param in concepts:
            # groupby requires its input sorted by the same key.
            for lid, items in groupby(
                    sorted(param['items'], key=lambda i: i['Language']),
                    lambda i: i['Language']):
                # The language id is the last path segment of the URL-ish key.
                lid = lid.split('/')[-1]
                if lid in missing_languages:
                    continue
                lang = languages[lid]
                # Running counter across all items of this (language, param)
                # group — used to make row IDs unique per split form.
                i = 0
                for item in items:
                    form = item['Orthographic Form'].strip()
                    # Resolve and deduplicate bibliographic references;
                    # itersources may yield falsy placeholders, dropped here.
                    refs = [ref for ref in itersources(item, lang, sources) if ref]
                    ds.sources.add(*[ref.source for ref in refs])
                    # Record value frequencies for every non-empty field.
                    for k, v in item.items():
                        if v:
                            fields[k].update([v])
                    # One output row per split-off variant of the form.
                    for fform, context in split(form):
                        i += 1
                        ds.add_row([
                            '%s-%s-%s' % (lid, param['id'], i),
                            lang['glottocode'],
                            lang['ISO 639-3'],
                            lang['name'],
                            lang['id'],
                            param['concepticon'],
                            param['concept'],
                            param['id'],
                            param['Semantic Field'],
                            fform,
                            context,
                            # Loan: flagged if either a loan source or a
                            # Wanderwort status is recorded.
                            bool(item['Loan Source'] or item['Wanderwort Status']),
                            item['Phonemicized Form'] or None,
                            ';'.join('%s' % ref for ref in refs),
                            item.get('Created By'),
                            item.get('General Notes'),
                        ])
def to_cldf(self, concept_map, unmapped, citekey=None, source=None,
            concept_key=None):
    """Write this word list's entries into a CLDF dataset and return it.

    :param concept_map: mapping from concept keys (see ``concept_key``) to
        Concepticon ids; misses are recorded in ``unmapped.concepts``.
    :param unmapped: collector object with ``languages`` and ``concepts``
        set attributes for ids that could not be mapped.
    :param citekey: optional fixed citation key; used together with
        ``source`` to register a single source for all rows.
    :param source: optional source title for ``citekey``.
    :param concept_key: callable extracting the lookup key from an entry;
        defaults to ``entry.word_id``.
    :return: the populated ``CldfDataset`` (after segmentation).
    """
    if concept_key is None:
        concept_key = lambda entry: entry.word_id
    # Record languages without a Glottocode so the caller can report them.
    if not self.language.glottocode:
        unmapped.languages.add(
            (self.language.id, self.language.name, self.language.iso))
    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_iso',
            'Language_name',
            'Language_local_ID',
            'Parameter_ID',
            'Parameter_name',
            'Parameter_local_ID',
            'Value',
            'Value_in_source',
            'Segments',
            'Context',
            'Source',
            'Cognate_Set',
            'Comment',
            'Loan',
    ), self.dataset, subset=self.language.id) as ds:
        ds.metadata['dc:creator'] = self.language.author
        ds.metadata['dc:identifier'] = self.url(
            'language.php?id=%s' % self.language.id)
        # NOTE(review): if both typedby and checkedby are set, the second
        # assignment overwrites the first — only one dc:contributor survives.
        if self.language.typedby:
            ds.metadata['dc:contributor'] = self.language.typedby
        if self.language.checkedby:
            ds.metadata['dc:contributor'] = self.language.checkedby
        if self.language.notes:
            ds.metadata['dc:description'] = self.language.notes
        ds.table.schema.aboutUrl = '%s.csv#{ID}' % ds.name
        ds.table.schema.columns['Loan'].datatype = 'boolean'
        # Link local ids back to their pages on the source web application.
        ds.table.schema.columns['Parameter_local_ID'].valueUrl = \
            self.url('word.php?v=1{Parameter_local_ID}')
        ds.table.schema.columns['Language_local_ID'].valueUrl = \
            self.url('language.php?id={Language_local_ID}')
        ref = None
        if citekey and source:
            ref = citekey
            ds.sources.add(Source('misc', citekey, title=source))
        for entry in self.entries:
            # '?' marks an unknown/empty form; skip it entirely.
            if entry.name == '?':
                continue
            if not (citekey and source):
                src = entry.e.find('source')
                # NOTE(review): getattr(src, 'text') has no default, so a
                # src object lacking .text would raise AttributeError;
                # also, element truthiness may not mean "found" for all
                # XML libraries — confirm against the parser used.
                # If src is falsy, ref keeps its value from the previous
                # entry — presumably intentional "carry forward" behavior.
                if src and getattr(src, 'text'):
                    ref = slug(text_type(src.text))
                    ds.sources.add(Source('misc', ref, title=src.text))
            cid = concept_map.get(concept_key(entry))
            if not cid:
                unmapped.concepts.add((entry.word_id, entry.word))
            # One row per split-off variant of the entry's name.
            for i, (form, context) in enumerate(util.split(entry.name)):
                ds.add_row([
                    '{0}-{1}'.format(entry.id, i + 1),
                    self.language.glottocode,
                    self.language.iso,
                    self.language.name,
                    self.language.id,
                    cid,
                    entry.word,
                    entry.word_id,
                    util.clean_form(form),
                    form,
                    '',
                    context,
                    ref,
                    entry.cognacy,
                    entry.comment or '',
                    entry.loan == 'L',
                ])
    # Populate the 'Segments' column after all rows were written.
    segmentize(ds)
    return ds