Ejemplo n.º 1
0
Archivo: data.py Proyecto: kpu/mtdata
 def resolve_entries(cls, langs, names):
     """Fetch the entries for ``names`` and insist that every name resolved.

     Args:
         langs: forwarded unchanged to ``get_entries`` as ``langs``.
         names: sized collection of hashable dataset names; must not
             contain duplicates.

     Returns:
         Whatever ``get_entries`` returned, once the set of returned
         ``name`` attributes equals the set of requested names.

     Raises:
         AssertionError: if ``names`` contains duplicates, or if the lookup
             result differs from the request without any requested name
             actually missing (i.e. only unrequested names came back).
         Exception: if one or more requested names were not found.
     """
     requested = set(names)
     assert len(requested) == len(names), f'{names} are not unique.'
     entries = get_entries(langs=langs, names=requested)
     found = {entry.name for entry in entries}
     # Intersection equals union exactly when the two sets are equal, so a
     # plain equality check is the success condition.
     if requested == found:
         return entries
     missed = requested - found
     assert missed
     raise Exception(f'Could not find: {missed} for languages: {langs}')
Ejemplo n.º 2
0
def generate_report(langs, names, not_names=None, format='plain'):
    """Print per-language, per-name and per-group entry counts to stdout.

    Entries are obtained from ``mtdata.index.get_entries(langs, names,
    not_names)``. Each entry is tallied under its ``lang_str``, its
    ``did.name`` and its ``did.group``; the three tallies are then printed
    as tab-separated ``key<TAB>count`` lines (counts use thousands
    separators) under the headings "Languages:", "Names:" and "Groups:".

    Args:
        langs: passed through to ``get_entries``.
        names: passed through to ``get_entries``.
        not_names: passed through to ``get_entries``.
        format: accepted but not consulted by this function.
    """
    from mtdata.index import get_entries

    by_lang = defaultdict(int)
    by_name = defaultdict(int)
    by_group = defaultdict(int)
    for entry in get_entries(langs, names, not_names):
        by_lang[entry.lang_str] += 1
        by_name[entry.did.name] += 1
        by_group[entry.did.group] += 1

    # The leading newline on the later headings yields the blank separator
    # line between sections.
    sections = (
        ("Languages:", by_lang),
        ("\nNames:", by_name),
        ("\nGroups:", by_group),
    )
    for heading, counts in sections:
        print(heading)
        for label, count in counts.items():
            print(f'{label}\t{count:,}')
Ejemplo n.º 3
0
def list_data(langs,
              names,
              not_names=None,
              full=False,
              groups=None,
              not_groups=None,
              id_only=False):
    """Print the entries matching the given filters, one per line.

    The filters are forwarded to ``mtdata.index.get_entries`` with
    ``fuzzy_match=True``. After all entries are printed, the total count is
    logged at INFO level.

    Args:
        langs: passed through to ``get_entries``.
        names: passed through to ``get_entries``.
        not_names: passed through to ``get_entries``.
        full: when truthy, also print each entry's ``cite`` value (or the
            placeholder ``CITATION_NOT_LISTED`` when it is falsy), followed
            by a blank line.
        groups: passed through to ``get_entries`` as ``groups``.
        not_groups: passed through to ``get_entries`` as ``not_groups``.
        id_only: when truthy, print only ``ent.did``; otherwise print the
            tab-delimited ``ent.format(...)`` output.
    """
    from mtdata.index import get_entries

    matches = get_entries(langs, names, not_names, groups=groups,
                          not_groups=not_groups, fuzzy_match=True)
    for entry in matches:
        line = entry.did if id_only else entry.format(delim='\t')
        print(line)
        if full:
            citation = entry.cite or "CITATION_NOT_LISTED"
            print(citation, end='\n\n')
    log.info(f"Total {len(matches)} entries")
Ejemplo n.º 4
0
    ).json()
    names = [f'opus_{d["corpus"]}/{d["version"]}' for d in datasets['corpora']]
elif type == 'sacrebleu':
    import sacrebleu
    names = [
        f'sacrebleu_{name}' for name, meta in sacrebleu.DATASETS.items()
        if f'{source}-{target}' in meta or f'{target}-{source}' in meta
    ]
elif type == 'mtdata':
    from mtdata.entry import LangPair, lang_pair
    from mtdata.index import get_entries
    from mtdata.iso import iso3_code
    source_tricode = iso3_code(source, fail_error=True)
    target_tricode = iso3_code(target, fail_error=True)
    exclude += ['opus', 'newstest', 'UNv1']
    entries = sorted(get_entries(
        lang_pair(source_tricode + '-' + target_tricode), None, None, True),
                     key=lambda entry: entry.did.group)
    names = [
        f'mtdata_{entry.did.group}-{entry.did.name}-{entry.did.version}-{entry.did.lang_str}'
        for entry in entries
    ]
else:
    print(
        f'Importer type {type} is unsupported. Supported importers: opus, mtdata, sacrebleu'
    )

cleaned = set()
for name in names:
    filter = False
    for ex in exclude:
        if ex.lower() in name.lower():