Ejemplo n.º 1
0
def cldf(dataset, concepticon, **kw):
    """Re-export every upstream CLDF subset of *dataset*, one row per split form.

    For each source dataset yielded by ``clld.itercldf(dataset, __name__)`` a
    ``CldfDataset`` subset is written, named after the last ``-``-separated
    part of the source dataset's name.  The source fields are kept and four
    columns are appended: ``Language_local_ID``, ``Parameter_local_ID``,
    ``Loan`` and ``Context``.  ``dc:`` metadata and all sources are copied.

    Each source row is expanded into one output row per ``(form, context)``
    pair produced by ``split(row['Value'])``; the row ID gets a 1-based
    ``-<n>`` suffix.

    ``concepticon`` and ``**kw`` are accepted but not used here.

    Raises:
        AssertionError: if any row carried the literal string ``'None'`` as
            its ``Language_ID`` (i.e. a language could not be mapped).
    """
    extra_fields = ['Language_local_ID', 'Parameter_local_ID', 'Loan', 'Context']
    # Column name -> URL path template, in the order the schema is configured.
    url_paths = (
        ('Parameter_local_ID', '/meaning/{Parameter_local_ID}'),
        ('Language_local_ID', '/language/{Language_local_ID}'),
        ('Word_ID', '/word/{Word_ID}'),
    )

    unmapped = set()
    for ods in clld.itercldf(dataset, __name__):
        lid = ods.name.split('-')[-1]
        with CldfDataset(list(ods.fields) + extra_fields, dataset, subset=lid) as ds:
            ds.table.schema.columns['Loan'].datatype = 'boolean'
            for column_name, path in url_paths:
                ds.table.schema.columns[column_name].valueUrl = \
                    clld.url(__name__, path=path)

            # Only Dublin Core metadata is carried over from the source dataset.
            dc_metadata = {}
            for key, value in ods.metadata.items():
                if key.startswith('dc:'):
                    dc_metadata[key] = value
            ds.metadata.update(dc_metadata)
            ds.sources.add(*ods.sources.items())

            for row in ods.rows:
                if row['Language_ID'] == 'None':
                    # The source stores a missing language ID as the string 'None'.
                    row['Language_ID'] = None
                    unmapped.add((row['Language_name'], lid))
                keys = list(row.keys())
                for number, (form, context) in enumerate(split(row['Value']), 1):
                    cells = row.to_list()
                    cells[keys.index('Value')] = form
                    cells[keys.index('ID')] = '%s-%s' % (row['ID'], number)
                    meaning_id = row['WOLD_Meaning_ID']
                    # Note: We count words marked as "probably borrowed" as loans.
                    is_loan = float(row['Borrowed_score']) > 0.6
                    cells.extend([lid, meaning_id, is_loan, context])
                    ds.add_row(cells)
    assert not unmapped
Ejemplo n.º 2
0
 def cognates(self):
     """Yield one 10-tuple per (split form, cognate set) pair of every entry.

     Each entry's ``name`` is split with ``util.split``; the resulting forms
     are numbered from 1 and combined with every ``(cognate_set_id, doubt)``
     pair in ``entry.cognates``.  The tuple holds, in order: the form ID
     (``<entry.id>-<n>``), ``self.id``, the form, the cognate set ID, the
     doubt flag, the literal ``'expert'`` and four empty strings.
     """
     # Constant trailing cells shared by every yielded tuple.
     # NOTE(review): presumably 'expert' names the cognate detection method and
     # the blanks are unused optional columns -- confirm against the consumer.
     trailer = ('expert', '', '', '', '')
     for entry in self.entries:
         for number, (form, _context) in enumerate(util.split(entry.name), 1):
             for cognate_set_id, doubt in entry.cognates:
                 form_id = '{0}-{1}'.format(entry.id, number)
                 yield (form_id, self.id, form, cognate_set_id, doubt) + trailer
Ejemplo n.º 3
0
def check_split(string, expected):
    """Assert that ``util.split(string)``, materialised as a list, equals *expected*."""
    actual = list(util.split(string))
    assert_equal(actual, expected)
Ejemplo n.º 4
0
def cldf(dataset, concepticon, **kw):
    """Write all lexical items of *dataset* into a single CLDF dataset.

    Steps:

    1. Build a concept lookup from the dataset's concept list (English gloss
       with a leading ``*`` or ``$`` stripped -> Concepticon ID), extended by
       ``(ID, GLOSS)`` keys taken from ``dataset.concepts``.
    2. Load every ``languages-language-*.json`` file from ``dataset.raw`` and
       attach the Glottocode from ``dataset.languages``.
    3. Load every ``lexical-feature-*.json`` file (ordered by the numeric
       suffix of the file name) and attach its Concepticon ID.
    4. For each concept and language, split every orthographic form with
       ``split`` and emit one row per ``(form, context)`` pair.  Row IDs have
       the shape ``<language id>-<concept id>-<n>``, where ``n`` counts all
       forms of that language for that concept, starting at 1.

    ``concepticon`` and ``**kw`` are accepted but not used here.

    Raises:
        KeyError: if a language file's ID is missing from
            ``dataset.languages``, or a concept can be resolved neither by
            gloss nor by its ``(id, concept)`` pair.
    """
    # Raw string: '\*' and '\$' are not valid Python string escapes, so the
    # non-raw spelling triggers an invalid-escape-sequence warning.
    concept_map = {
        re.sub(r'^(\*|\$)', '', c.english): c.concepticon_id
        for c in dataset.conceptlist.concepts.values()}
    for c in dataset.concepts:
        concept_map[(c['ID'], c['GLOSS'])] = c['CONCEPTICON_ID'] or None
    language_map = {l['ID']: l['GLOTTOCODE'] or None for l in dataset.languages}

    concepts = []
    languages = {}
    for path in dataset.raw.glob('languages-language-*.json'):
        data = jsonlib.load(path)
        data['glottocode'] = language_map[data['id']]
        languages[data['id']] = data

    for path in sorted(
            dataset.raw.glob('lexical-feature-*.json'),
            key=lambda p: int(p.stem.split('-')[-1])):
        data = jsonlib.load(path)
        # Try the plain gloss first, then the dataset-specific (id, gloss) key.
        data['concepticon'] = concept_map.get(data['concept'])
        if not data['concepticon']:
            data['concepticon'] = concept_map[(data['id'], data['concept'])]
        concepts.append(data)

    # Per-field value frequencies; filled below but not read in this function.
    fields = defaultdict(Counter)
    # Passed to ``itersources`` for every item.
    # NOTE(review): presumably a cache of already-seen sources -- confirm.
    sources = {}
    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_iso',
            'Language_name',
            'Language_local_ID',
            'Parameter_ID',
            'Parameter_name',
            'Parameter_local_ID',
            'Semantic_field',
            'Value',
            'Context',
            'Loan',
            'Phonemic',
            'Source',
            'Creator',
            'Comment',
            ),
            dataset) as ds:
        ds.table.schema.columns['Loan'].datatype = 'boolean'
        ds.table.schema.columns['Parameter_local_ID'].valueUrl = \
            'https://huntergatherer.la.utexas.edu/lexical/feature/{Parameter_local_ID}'
        ds.table.schema.columns['Language_local_ID'].valueUrl = \
            'https://huntergatherer.la.utexas.edu/languages/language/{Language_local_ID}'

        for param in concepts:
            # groupby only merges adjacent items, hence the sort on the same key.
            for lid, items in groupby(
                    sorted(param['items'], key=lambda i: i['Language']),
                    lambda i: i['Language']):
                lid = lid.split('/')[-1]
                # ``missing_languages`` is not defined in this function; it is
                # expected to be provided at module level.
                if lid in missing_languages:
                    continue
                lang = languages[lid]
                i = 0
                for item in items:
                    form = item['Orthographic Form'].strip()
                    refs = [ref for ref in itersources(item, lang, sources) if ref]
                    ds.sources.add(*[ref.source for ref in refs])
                    for k, v in item.items():
                        if v:
                            fields[k].update([v])
                    for fform, context in split(form):
                        i += 1
                        ds.add_row([
                            '%s-%s-%s' % (lid, param['id'], i),
                            lang['glottocode'],
                            lang['ISO 639-3'],
                            lang['name'],
                            lang['id'],
                            param['concepticon'],
                            param['concept'],
                            param['id'],
                            param['Semantic Field'],
                            fform,
                            context,
                            # A form is a loan if it has a loan source or a
                            # Wanderwort status.
                            bool(item['Loan Source'] or item['Wanderwort Status']),
                            item['Phonemicized Form'] or None,
                            ';'.join('%s' % ref for ref in refs),
                            item.get('Created By'),
                            item.get('General Notes'),
                        ])
Ejemplo n.º 5
0
    def to_cldf(self,
                concept_map,
                unmapped,
                citekey=None,
                source=None,
                concept_key=None):
        """Write this word list as a CLDF subset and return the dataset object.

        Args:
            concept_map: mapping from concept key to the value stored in the
                ``Parameter_ID`` column; looked up with ``.get``.
            unmapped: collector with ``languages`` and ``concepts`` sets.  The
                language is recorded if it has no Glottocode; a concept is
                recorded if ``concept_map`` has no (truthy) entry for it.
            citekey: citation key used as the ``Source`` of every row.  Only
                honoured when ``source`` is given as well.
            source: title of the source registered under ``citekey``.
            concept_key: callable mapping an entry to its ``concept_map`` key;
                defaults to the entry's ``word_id``.

        Returns:
            The ``CldfDataset`` created for ``self.language.id``, after the
            ``with`` block has exited and ``segmentize`` has been applied.

        Entries whose name is ``'?'`` are skipped.  Every other entry yields
        one row per ``(form, context)`` pair from ``util.split(entry.name)``,
        with the ID ``<entry.id>-<n>`` (``n`` starting at 1).
        """
        if concept_key is None:
            def concept_key(entry):
                return entry.word_id

        if not self.language.glottocode:
            unmapped.languages.add(
                (self.language.id, self.language.name, self.language.iso))

        with CldfDataset((
                'ID',
                'Language_ID',
                'Language_iso',
                'Language_name',
                'Language_local_ID',
                'Parameter_ID',
                'Parameter_name',
                'Parameter_local_ID',
                'Value',
                'Value_in_source',
                'Segments',
                'Context',
                'Source',
                'Cognate_Set',
                'Comment',
                'Loan',
        ),
                         self.dataset,
                         subset=self.language.id) as ds:
            ds.metadata['dc:creator'] = self.language.author
            ds.metadata['dc:identifier'] = self.url('language.php?id=%s' %
                                                    self.language.id)
            # NOTE(review): when both ``typedby`` and ``checkedby`` are set, the
            # second assignment replaces the first -- confirm this is intended.
            if self.language.typedby:
                ds.metadata['dc:contributor'] = self.language.typedby
            if self.language.checkedby:
                ds.metadata['dc:contributor'] = self.language.checkedby
            if self.language.notes:
                ds.metadata['dc:description'] = self.language.notes

            ds.table.schema.aboutUrl = '%s.csv#{ID}' % ds.name
            ds.table.schema.columns['Loan'].datatype = 'boolean'
            ds.table.schema.columns['Parameter_local_ID'].valueUrl = \
                self.url('word.php?v=1{Parameter_local_ID}')
            ds.table.schema.columns['Language_local_ID'].valueUrl = \
                self.url('language.php?id={Language_local_ID}')

            ref = None
            if citekey and source:
                ref = citekey
                ds.sources.add(Source('misc', citekey, title=source))

            for entry in self.entries:
                if entry.name == '?':
                    continue
                if not (citekey and source):
                    src = entry.e.find('source')
                    # Compare against None explicitly.  ``entry.e`` is presumably
                    # an ElementTree-style element, whose truth value reflects
                    # its number of children; a text-only <source> element would
                    # then be falsy and its source would be dropped.
                    if src is not None and src.text:
                        ref = slug(text_type(src.text))
                        ds.sources.add(Source('misc', ref, title=src.text))
                    # NOTE(review): ``ref`` is not reset per entry, so an entry
                    # without a source inherits the previous entry's reference.
                cid = concept_map.get(concept_key(entry))
                if not cid:
                    unmapped.concepts.add((entry.word_id, entry.word))
                for i, (form, context) in enumerate(util.split(entry.name)):
                    ds.add_row([
                        '{0}-{1}'.format(entry.id, i + 1),
                        self.language.glottocode,
                        self.language.iso,
                        self.language.name,
                        self.language.id,
                        cid,
                        entry.word,
                        entry.word_id,
                        util.clean_form(form),  # Value: cleaned form
                        form,                   # Value_in_source: form as split
                        '',                     # Segments: written as empty here
                        context,
                        ref,
                        entry.cognacy,
                        entry.comment or '',
                        entry.loan == 'L',
                    ])
            segmentize(ds)
        return ds