Example #1
@classmethod
def from_md(cls, fname):
    header, rows = next(iter_markdown_tables(fname.read_text(encoding='utf8')))
    rows = [Contributor(**dict(zip([norm_header(c) for c in header], row))) for row in rows]
    byid = collections.Counter([r.id for r in rows])
    if byid.most_common(1)[0][1] > 1:  # pragma: no cover
        raise ValueError(
            'duplicate ids: {0}'.format([k for k, v in byid.most_common() if v > 1]))
    return cls(rows)
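All examples on this page use `iter_markdown_tables` from `clldutils.markup`, which yields one `(header, rows)` pair per markdown table in the given text. A minimal sketch of that assumed behaviour, using a made-up inline table:

# Minimal sketch, assuming clldutils.markup.iter_markdown_tables yields
# (header, rows) pairs, one per markdown table in the input text.
from clldutils.markup import iter_markdown_tables

MD = """
id | Name | Role
--- | --- | ---
1 | Ada Lovelace | author
"""
for header, rows in iter_markdown_tables(MD):
    print(header)  # e.g. ['id', 'Name', 'Role']
    print(rows)    # e.g. [['1', 'Ada Lovelace', 'author']]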
Example #2
def run(args):
    bios = {r[0]: (r[1], r[2]) for r in iter_html_data(args.html)}

    # id | Last name | First name | Node | Status | Language competence | GitHub-username | email
    header, rows = next(
        iter_markdown_tables(
            args.repos.path('CONTRIBUTORS_details.md').read_text(
                encoding='utf8')))
    rows = [dict(zip(header, row)) for row in rows]
    rows = collections.OrderedDict([(r['id'], r) for r in rows])

    contribs = args.repos.contributors
    contribs = collections.OrderedDict([(r.id, r) for r in contribs])

    assert not set(rows) - set(contribs)
    lnames = {c.last_name: c.id for c in contribs.values()}
    fnames = {
        '{0.first_name} {0.last_name}'.format(c): c.id
        for c in contribs.values()
    }

    bios_by_id = {}
    for name in bios:
        if name in NAME2ID:
            bios_by_id[NAME2ID[name]] = bios[name]
            continue
        hname = HumanName(name)
        if hname.last in lnames:
            bios_by_id[lnames[hname.last]] = bios[name]
            continue
        full = '{0.first} {0.last}'.format(hname)
        if full in fnames:
            bios_by_id[fnames[full]] = bios[name]
            continue
        # otherwise: the name could not be matched to any contributor id

    for i, (cid, c) in enumerate(contribs.items()):
        bio, photo = bios_by_id.get(cid, (None, None))
        if bio:
            assert '\n' not in bio and ('|' not in bio)
        md = rows.get(cid, {})
        md['bio'] = bio
        md['photo'] = photo
        contrib = get_row(c, md)
        if i == 0:
            print(' | '.join(contrib.keys()))
            print(' | '.join([' --- ' for _ in contrib]))
        print(' | '.join(contrib.values()))
Example #3
def editors(self) -> typing.List[Editor]:
    res = []
    header, rows = next(
        iter_markdown_tables(
            self.path('CONTRIBUTORS.md').read_text(encoding='utf8')))
    for (period, name) in rows:
        start, to_, end = period.strip().partition('-')
        start, end = start.strip(), end.strip()
        res.append(
            Editor(name.strip(), start, start if not to_ else end or None))
    return res
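The period column holds values like `2010-2015`, `2010-` or `2010`; the `partition('-')` logic above maps them to a start year and an end year (or `None` for an open-ended editorship). Plain `str.partition` semantics make this easy to verify:

# How the editorship periods parse (standard str.partition semantics):
for period in ['2010-2015', '2010-', '2010']:
    start, to_, end = period.partition('-')
    print(start, start if not to_ else (end or None))
# 2010 2015  <- closed period
# 2010 None  <- open-ended period
# 2010 2010  <- single year, no dash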
Example #4
def read_editions(repos):
    head, rows = next(
        iter_markdown_tables(
            repos.path('CONTRIBUTORS.md').read_text(encoding='utf8')))
    res = []
    for row in rows:
        row = dict(zip([c.lower() for c in head], row))
        row['editors'] = [n.strip() for n in row['editors'].split('&')]
        res.append(row)

    return sorted(res,
                  key=lambda d: pkg_resources.parse_version(d['version']),
                  reverse=True)
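The sort key matters here: `pkg_resources.parse_version` (shipped with setuptools) compares versions numerically, whereas plain string sorting would rank `'v2.9'` above `'v2.10'`:

# Why parse_version is used as the sort key:
import pkg_resources
print(sorted(['v2.9', 'v2.10'], key=pkg_resources.parse_version, reverse=True))
# ['v2.10', 'v2.9']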
Example #5
def cmd_makecldf(self, args):
    repos = Grambank(self.raw_dir / 'Grambank',
                     wiki=self.raw_dir / 'grambank.wiki')
    create(args.writer.cldf, repos, args.glottolog.api)

    self.cldf_reader().validate(log=args.log)

    header, contribs = next(
        iter_markdown_tables(
            self.raw_dir.joinpath(
                'Grambank', 'CONTRIBUTORS.md').read_text(encoding='utf8')))
    self.dir.joinpath('CONTRIBUTORS.md').write_text(
        CONTRIBUTORS_TMPL.format('\n'.join([
            '{First name} {Last name} | | author'.format(
                **dict(zip(header, row))) for row in contribs
        ])))
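`CONTRIBUTORS_TMPL` is defined elsewhere in the source module; for the `.format()` call above it only needs a single positional `{}` slot for the joined rows. A hypothetical stand-in:

# Hypothetical stand-in for CONTRIBUTORS_TMPL; the real template lives in
# the source module. One positional {} slot receives the joined rows.
CONTRIBUTORS_TMPL = """# Contributors

Name | Affiliation | Role
--- | --- | ---
{}
"""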
Example #6
def get_creators_and_contributors(text, strict=True):
    ctypes = {c.lower(): c for c in CONTRIBUTOR_TYPES}
    creators, contributors = [], []
    # Read first table in CONTRIBUTORS.md
    try:
        header, rows = next(iter_markdown_tables(text))
    except StopIteration:  # pragma: no cover
        return creators, contributors
    for row in rows:
        row = {k.lower(): v for k, v in zip(header, row)}
        for role in nfilter(
            [r.strip().lower() for r in row.get('role', '').split(',')]):
            c = {k: v for k, v in row.items() if k != 'role'}
            if role in {'author', 'creator', 'maintainer'}:
                if c not in creators:
                    creators.append(c)
            else:
                if strict:
                    c['type'] = ctypes[role]
                else:
                    c['type'] = ctypes.get(role, 'Other')
                if c not in contributors:
                    contributors.append(c)
    return creators, contributors
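A hypothetical round trip (assuming `'DataCurator'` is one of the `CONTRIBUTOR_TYPES`): rows whose role includes author/creator/maintainer end up in `creators`, every other row lands in `contributors` with a `type` key:

MD = """
Name | Affiliation | Role
--- | --- | ---
Ada Lovelace | London | author, maintainer
Alan Turing | Bletchley | DataCurator
"""
creators, contributors = get_creators_and_contributors(MD)
# creators     -> [{'name': 'Ada Lovelace', 'affiliation': 'London'}]
# contributors -> [{'name': 'Alan Turing', 'affiliation': 'Bletchley',
#                   'type': 'DataCurator'}]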
Example #7
def main(args):
    for (org, repos), recs in itertools.groupby(
            sorted(oai.Records('tular'),
                   key=lambda r: (r.repos.org, r.repos.repos, r.version),
                   reverse=True),
            lambda r: (r.repos.org, r.repos.repos),
    ):
        if org == 'tupian-language-resources' and repos in DATASETS:
            DATASETS[repos] = next(recs)

    data = Data()
    dataset = data.add(
        common.Dataset,
        'tular',
        id=tular.__name__,
        domain="tular.clld.org",
        name="TuLaR",
        description="Tupían Language Resources",
        publisher_name="Max-Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        license='https://creativecommons.org/licenses/by-sa/4.0/',
        contact="*****@*****.**",
        jsondata={
            'license_icon':
            'cc-by-sa.png',
            'license_name':
            'Creative Commons Attribution-ShareAlike 4.0 International License'
        },
    )

    rd = pathlib.Path(tular.__file__).parent.parent.parent.resolve()
    root = input('Project dir [{}]: '.format(str(rd)))
    root = pathlib.Path(root) if root else rd
    clts = clts_from_input(rd / '..' / 'cldf-clts' / 'clts-data')

    for db, rec in DATASETS.items():
        print(db, rec.doi, rec.tag)
        dbdir = root.joinpath(db)
        assert dbdir.exists()
        md = jsonlib.load(dbdir / 'metadata.json')
        name = md['title']
        if md['description']:
            name += ': {}'.format(md['description'])
        contribution = data.add(
            Database,
            db,
            id=db,
            name=name,
            description=rec.citation if rec else None,
            doi=rec.doi if rec else None,
        )
        header, contribs = next(
            iter_markdown_tables(
                dbdir.joinpath('CONTRIBUTORS.md').read_text(encoding='utf8')))
        for i, contrib in enumerate(contribs):
            contrib = dict(zip(header, contrib))
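            # nameparser's HumanName parses the free-text name; slug (from
            # clldutils.misc) normalizes the last name into an id, e.g.
            # slug(HumanName('Ada Lovelace').last) == 'lovelace'.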
            cid = slug(HumanName(contrib['Name']).last)
            contributor = data['Contributor'].get(cid)
            if not contributor:
                contributor = data.add(
                    common.Contributor,
                    cid,
                    id=cid,
                    name=contrib['Name'],
                    description=contrib.get('Affiliation'),
                )
            DBSession.add(
                common.ContributionContributor(
                    contribution=contribution,
                    contributor=contributor,
                    primary='author' in contrib['Role'].lower(),
                    ord=i,
                ))

    for i, cid in enumerate(
        ['gerardi', 'reichert', 'aragon', 'list', 'forkel']):
        DBSession.add(
            common.Editor(contributor=data['Contributor'][cid],
                          dataset=dataset,
                          ord=i))

    source_ids = list(add_sources(args.cldf.bibpath, DBSession))
    sources = {s.id: s.pk for s in DBSession.query(common.Source)}
    subgroups = []

    for row in args.cldf['LanguageTable']:
        if row['SubGroup'] not in subgroups:
            subgroups.append(row['SubGroup'])
        family = data['Family'].get(row['Family'])
        if (not family) and row['Family']:
            family = data.add(Family,
                              row['Family'],
                              id=slug(row['Family']),
                              name=row['Family'])
        data.add(
            Doculect,
            row['ID'],
            id=row['ID'],
            name=row['Name'].replace('_', ' '),
            family=family,
            subfamily=row['SubGroup'],
            iso_code=row['ISO639P3code'],
            glotto_code=row['Glottocode'],
            longitude=row['Longitude'],
            latitude=row['Latitude'],
            jsondata=dict(icon=SUBGROUPS[row['SubGroup']]),
        )

    tudet = Dataset.from_metadata(root / 'tudet' / 'cldf' /
                                  'Generic-metadata.json')
    seen = set()
    for row in tudet['ExampleTable']:
        if row['ID'] in seen:
            print('skipping duplicate sentence ID {}'.format(row['ID']))
            continue
        seen.add(row['ID'])
        DBSession.add(
            Example(id=row['ID'],
                    name=row['Primary_Text'],
                    description=row['Translated_Text'],
                    language=data['Doculect'][row['Language_ID']],
                    conllu=row['conllu']))

    contrib = data['Database']['tuled']
    for row in args.cldf['ParameterTable']:
        data.add(
            Concept,
            row['ID'],
            id=row['ID'].split('_')[0],
            name=row['Name'],
            portuguese=row['Portuguese_Gloss'],
            semantic_field=row['Semantic_Field'],
            concepticon_class=row['Concepticon_ID'],
            eol=row['EOL_ID'],
        )
    for (lid, pid), rows in itertools.groupby(
            sorted(args.cldf.iter_rows('FormTable', 'languageReference',
                                       'parameterReference'),
                   key=lambda r: (r['Language_ID'], r['Parameter_ID'])),
            lambda r: (r['Language_ID'], r['Parameter_ID']),
    ):
        vsid = '{}-{}'.format(lid, pid)
        vs = data.add(
            common.ValueSet,
            vsid,
            id=vsid,
            language=data['Doculect'][lid],
            parameter=data['Concept'][pid],
            contribution=contrib,
        )
        refs = set()
        for row in rows:
            data.add(
                Word,
                row['ID'],
                id=row['ID'],
                valueset=vs,
                name=row['Form'],
                tokens=' '.join(row['Segments']),
                simple_cognate=int(row['SimpleCognate']),
                notes=row['Comment'],
                morphemes=' '.join(row['Morphemes']),
                partial_cognate=' '.join(row['PartialCognates'])
                if row['PartialCognates'] else None,
            )
            refs = refs.union(row['Source'])

        for ref in refs:
            if ref in source_ids:
                DBSession.add(
                    common.ValueSetReference(valueset=vs,
                                             source_pk=sources[slug(
                                                 ref, lowercase=False)]))

    load_inventories(args.cldf, clts, data['Doculect'])

    for row in args.cldf['CognateTable']:
        cc = data['Cognateset'].get(row['Cognateset_ID'])
        if not cc:
            cc = data.add(
                Cognateset,
                row['Cognateset_ID'],
                id=row['Cognateset_ID'],
                name=row['Cognateset_ID'],
                contribution=contrib,
            )
        data.add(
            Cognate,
            row['ID'],
            cognateset=cc,
            counterpart=data['Word'][row['Form_ID']],
            alignment=' '.join(row['Alignment'] or []),
        )
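
Both `itertools.groupby` loops in this example follow the sort-then-group idiom, since `groupby` only merges adjacent items:

# Sort-then-groupby, as used twice above: itertools.groupby only merges
# adjacent items, so the input must be pre-sorted by the same key.
import itertools

pairs = [('tup', 'a'), ('awt', 'b'), ('tup', 'c')]
for key, grp in itertools.groupby(sorted(pairs, key=lambda r: r[0]),
                                  lambda r: r[0]):
    print(key, [v for _, v in grp])
# awt ['b']
# tup ['a', 'c']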