Exemple #1
0
def _build_config_and_filter_files(config_settings,
                                   wiki_name,
                                   dialect_suffix=""):
    """Scrapes phonemic and phonetic TSVs, then deletes the small ones.

    Both transcription levels are scraped to
    ``../tsv/<key>_<dialect_suffix><level>.tsv``. Afterwards, every TSV
    whose count (the value returned by `_call_scrape`) is below 100 is
    reported through `logger` and removed from disk.

    Args:
        config_settings: Keyword arguments for `wikipron.Config`; must
            contain a "key" entry, which is used in the file names.
        wiki_name: Language name shown in the log messages.
        dialect_suffix: Optional text placed between the key and the
            transcription level in the file names.
    """
    min_entries = 100
    path_affix = f'../tsv/{config_settings["key"]}_{dialect_suffix}'

    # Scrape both levels before filtering either of them.
    scraped = []
    for level, level_settings in (
        ("phonemic", {}),
        ("phonetic", {"phonetic": True}),
    ):
        config = wikipron.Config(**level_settings, **config_settings)
        path = f"{path_affix}{level}.tsv"
        count = _call_scrape(config_settings, config, path)
        scraped.append((level, path, count))

    # Removes TSVs with fewer than `min_entries` entries.
    # Log language name and count to check whether Wikipron scraped any data.
    for level, path, count in scraped:
        if count < min_entries:
            # Log before deleting so the diagnostic is emitted even when
            # the removal itself fails.
            logger.info(
                '"%s" (count: %s) has less than %s entries '
                "in %s transcription.",
                wiki_name,
                count,
                min_entries,
                level,
            )
            os.remove(path)
Exemple #2
0
def _build_scraping_config(config_settings, wiki_name, dialect_suffix=""):
    """Scrapes the phonemic and then the phonetic TSV for one language.

    Output goes to ``../tsv/<key>_<dialect_suffix><level>.tsv``, where
    ``<key>`` is ``config_settings["key"]``. `wiki_name` is accepted but
    not used by this function.
    """
    tsv_prefix = "../tsv/{}_{}".format(config_settings["key"], dialect_suffix)
    # The phonemic pass adds no extra settings; the phonetic pass sets
    # `phonetic=True` on top of the shared ones.
    passes = (("phonemic", {}), ("phonetic", {"phonetic": True}))
    for level, overrides in passes:
        level_config = wikipron.Config(**overrides, **config_settings)
        _call_scrape(config_settings, level_config, f"{tsv_prefix}{level}.tsv")
Exemple #3
0
def _build_scraping_config(config_settings: Dict[str, Any],
                           wiki_name: str,
                           dialect_suffix: str = "") -> None:
    """Scrapes phonemic and phonetic TSVs, filtering when a whitelist exists.

    For each transcription level the unfiltered TSV path is
    ``../tsv/<key>_<dialect_suffix><level>.tsv``. When
    ``../whitelist/<key>_<dialect_suffix><level>.whitelist`` exists, its
    contents (read with `_whitelist_reader`) and the path
    ``../tsv/<key>_<dialect_suffix><level>_filtered.tsv`` are additionally
    passed to `_call_scrape`.

    Args:
        config_settings: Keyword arguments for `wikipron.Config`; must
            contain a "key" entry, which is used in the file names.
        wiki_name: Accepted for interface compatibility; not used here.
        dialect_suffix: Optional text placed between the key and the
            transcription level in the file names.
    """
    key = config_settings["key"]
    path_affix = f"../tsv/{key}_{dialect_suffix}"
    whitelist_path_affix = f"../whitelist/{key}_{dialect_suffix}"

    # One loop for both levels keeps the two code paths from drifting apart.
    for level, level_settings in (
        ("phonemic", {}),
        ("phonetic", {"phonetic": True}),
    ):
        config = wikipron.Config(**level_settings, **config_settings)
        tsv_path = f"{path_affix}{level}.tsv"
        whitelist_path = f"{whitelist_path_affix}{level}.whitelist"

        if not os.path.exists(whitelist_path):
            _call_scrape(config_settings, config, tsv_path)
            continue

        logging.info(
            "%s whitelist found for '%s' at '%s'",
            level.capitalize(),
            key,
            whitelist_path,
        )
        # The filtered output is a TSV beside the unfiltered one. It must
        # not be the whitelist path, which is an input to this step.
        filtered_path = f"{path_affix}{level}_filtered.tsv"
        allowed_symbols = frozenset(_whitelist_reader(whitelist_path))
        _call_scrape(
            config_settings,
            config,
            tsv_path,
            allowed_symbols,
            filtered_path,
        )
Exemple #4
0
def _build_scraping_config(config_settings: Dict[str, Any],
                           dialect_suffix: str = "") -> None:
    """Scrapes the phonemic and phonetic TSVs, filtering when phones exist.

    Each level is scraped to ``../tsv/<key>_<dialect_suffix><level>.tsv``.
    If ``../phones/<key>_<dialect_suffix><level>.phones`` exists, the set
    read from it with `_phones_reader` and the path of a
    ``<level>_filtered.tsv`` file are also handed to `_call_scrape`.
    """
    path_affix = f'../tsv/{config_settings["key"]}_{dialect_suffix}'
    phones_path_affix = f"../phones/{config_settings['key']}_{dialect_suffix}"

    # (level name, extra Config settings, log format) for each pass.
    passes = (
        ("phonemic", {}, "Phonemic phones found for %r at %r"),
        (
            "phonetic",
            {"phonetic": True},
            "Phonetic phones found for %r at %r",
        ),
    )
    for level, overrides, found_message in passes:
        level_config = wikipron.Config(**overrides, **config_settings)
        tsv_path = f"{path_affix}{level}.tsv"
        phones_path = f"{phones_path_affix}{level}.phones"

        # Without a phones file there is nothing to filter against.
        if not os.path.exists(phones_path):
            _call_scrape(config_settings, level_config, tsv_path)
            continue

        logging.info(found_message, config_settings["key"], phones_path)
        inventory = frozenset(_phones_reader(phones_path))
        _call_scrape(
            config_settings,
            level_config,
            tsv_path,
            inventory,
            f"{path_affix}{level}_filtered.tsv",
        )
Exemple #5
0
def test_language_coverage():
    """Check that WikiPron covers every language with enough data.

    Languages whose size is at least `_MIN_LANGUAGE_SIZE` are resolved
    through WikiPron; a warning is issued when a language cannot be
    handled or resolves to a different name. Such warnings should be
    suppressed by expanding the LANGUAGE_CODES dict to handle the
    relevant languages.
    """
    # "mon" is the ISO 639 code for Mongolian, but there is also the Mon
    # language (ISO 639 code: "mnw"), so "Mon" is skipped along with
    # "Translingual".
    skipped_languages = ("Mon", "Translingual")
    sizes = _get_language_sizes(_get_language_categories())
    for language, size in sizes.items():
        if size < _MIN_LANGUAGE_SIZE or language in skipped_languages:
            continue
        try:
            language_code = iso639.to_iso639_2(language)
        except iso639.NonExistentLanguageError:
            # Check if WikiPron can handle `language` directly.
            language_code = language
        try:
            language_inferred = wikipron.Config(key=language_code).language
        except iso639.NonExistentLanguageError:
            warnings.warn(f'WikiPron cannot handle "{language}".')
        else:
            if language_inferred != language:
                warnings.warn(
                    f'WikiPron resolves the key "{language_code}" to '
                    f'"{language_inferred}", '
                    f'which is not "{language}" on Wiktionary.'
                )
Exemple #6
0
def _build_scraping_config(
    config_settings: Dict[str, Any], path_affix: str, phones_path_affix: str
) -> None:
    """Scrapes the broad and then the narrow TSV, filtering when possible.

    Each level is scraped to ``<path_affix><level>.tsv``. If
    ``<phones_path_affix><level>.phones`` exists, the set read from it
    with `_phones_reader` and the path ``<path_affix><level>_filtered.tsv``
    are also handed to `_call_scrape`.
    """

    def _scrape_level(level: str, config: Any, found_message: str) -> None:
        # Runs one transcription level, with filtering if a phones file exists.
        tsv_path = f"{path_affix}{level}.tsv"
        phones_path = f"{phones_path_affix}{level}.phones"
        if not os.path.exists(phones_path):
            _call_scrape(config_settings, config, tsv_path)
            return
        logging.info(found_message, config_settings["key"], phones_path)
        inventory = frozenset(_phones_reader(phones_path))
        _call_scrape(
            config_settings,
            config,
            tsv_path,
            inventory,
            f"{path_affix}{level}_filtered.tsv",
        )

    # Broad pass first; the narrow config is only built once it is done.
    _scrape_level(
        "broad",
        wikipron.Config(**config_settings),
        "Broad transcription phones found for %r at %r",
    )
    _scrape_level(
        "narrow",
        wikipron.Config(narrow=True, **config_settings),
        "Narrow phones found for %r at %r",
    )
Exemple #7
0
def _check_language_code_against_wiki(language_code: str,
                                      language: str) -> None:
    """Warns when WikiPron does not resolve `language_code` to `language`.

    A warning is logged either when building a `wikipron.Config` for the
    code raises `iso639.NonExistentLanguageError`, or when the config's
    `language` attribute differs from `language`.
    """
    try:
        resolved_name = wikipron.Config(key=language_code).language
    except iso639.NonExistentLanguageError:
        logging.warning("WikiPron cannot handle %r", language)
        return

    if resolved_name != language:
        logging.warning(
            "WikiPron resolves the key %r to %r "
            "listed as %r on Wiktionary",
            language_code,
            resolved_name,
            language,
        )
Exemple #8
0
# -*- coding: utf-8 -*-
"""
Created on Sat May 30 14:54:44 2020

@author: qtckp
"""

import wikipron
import os
import json

# Collects word -> pronunciation for every scraped word longer than one
# character, then dumps the mapping to "english.json".
dic = {}

config = wikipron.Config(key="en")

# `t` counts scraped entries; it stays 0 if the scrape yields nothing.
t = 0
for t, (word, pron) in enumerate(wikipron.scrape(config), start=1):
    # Progress report on every 100th entry.
    if not t % 100:
        print(f'{t} {word} {pron}')
    if len(word) > 1:
        dic[word] = pron

with open("english.json", "w") as write_file:
    json.dump(dic, write_file, indent=4)