Example #1
 def _cache_IETF(cls):
     cls.precache(include=['script_expr_txt', 'region_expr_txt', 'region_expr_uid'], all_lv=True)
     for uid in cls._cache:
         cls._cache[uid]['IETF'] = set()
     result = panlex.query_all('/expr', {'trans_uid': 'art-420', 'uid': 'art-274', 'include': 'trans_txt'})['result']
     regions = {cls._cache[r['txt']]['region_expr'] for r in result}
     regions_result = panlex.query_all('/expr', {'trans_expr': list(regions), 'uid': 'art-006', 'include': 'trans_txt'})['result']
     regions_dict = {r['trans_txt']: r['txt'] for r in regions_result if len(r['txt']) == 2}
     for r in result:
         uid = r['txt']
         lang = cls._cache[uid]
         given_tag = Language.get(r['trans_txt'], normalize=False)
         normalized_tag = Language.get(r['trans_txt'], normalize=True)
         language_set = {lang['lang_code'], given_tag.language, normalized_tag.language}
         script_set = {lang['script_expr_txt'], given_tag.script, normalized_tag.script}
         region_set = {given_tag.region, normalized_tag.region}
         if lang['region_expr_uid'] == 'art-279' and lang['region_expr_txt'] != '001':
             region_set.add(lang['region_expr_txt'])
             try:
                 region_set.add(regions_dict[lang['region_expr_txt']])
             except KeyError:
                 pass
         if {'GB', 'UK'} & region_set: region_set |= {'GB', 'UK'}
         if {'001', None} & region_set: region_set |= {'001', None}
         for language, script, region in product(language_set, script_set, region_set):
             new_tag = normalized_tag.update_dict({'language': language, 'script': script, 'region': region})
             cls._cache[uid]['IETF'].add(str(new_tag))
         if lang['region_expr_uid'] == 'art-279' and lang['region_expr_txt'] == '001':
             for language, script, region in product({lang['lang_code']}, script_set, {'001', None}):
                 new_tag = normalized_tag.update_dict({'language': language, 'script': script, 'region': region})
                 cls._cache[uid]['IETF'].add(str(new_tag))
 def _set_lang_codes(self):
     with open(os.path.join(os.path.dirname(__file__), 'locale.json')) as f:
         data = json.load(f)
     lang = Language.get(self.lang).language
     terr = Language.get(self.lang).territory
     if terr is None:
         self.lang_codes = self._get_lang_codes(lang, data)
     else:
         self.lang_codes = [self.lang.replace('-', '_')]
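Note: a minimal, hedged sketch of the attribute access that _set_lang_codes above relies on; langcodes exposes the parsed subtags as attributes, and unset subtags come back as None.

from langcodes import Language

tag = Language.get("pt-BR")
print(tag.language)   # "pt"
print(tag.territory)  # "BR"; for a bare tag like "pt" this would be None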
Example #3
    def _create_user(self, user_id=None):
        lang_code = random.choice(['zh-hans', 'jp', 'ko', 'en', 'es'])
        language = Language.get(lang_code)
        user_id = utils.random_string()[0:10] if user_id is None else user_id

        param = {
            'user_id': user_id,
            'domain_id': self.domain.domain_id,
            'password': '******',
            'name': 'Steven' + utils.random_string()[0:5],
            'language': language.__str__(),
            'timezone': 'Asia/Seoul',
            'tags': {
                'aa': 'bb'
            },
            'email': 'Steven' + utils.random_string()[0:5] + '@mz.co.kr',
            'mobile': '+821026671234',
            'group': 'group-id',
        }

        user = self.identity_v1.User.create(param,
                                            metadata=(('token', self.token), ))
        self.user = user
        self.users.append(user)
        self.assertEqual(self.user.name, param['name'])
Example #4
    def test_create_user(self,
                         user_id=None,
                         name=None,
                         user_type=None,
                         backend=None):
        lang_code = random.choice(['zh-hans', 'jp', 'ko', 'en', 'es'])
        language = Language.get(lang_code)
        if user_id is None:
            user_id = utils.random_string() + '@mz.co.kr'

        if name is None:
            name = 'Steven' + utils.random_string()

        params = {
            'user_id': user_id,
            'password': utils.generate_password(),
            'name': name,
            'email': user_id,
            'timezone': 'Asia/Seoul',
            'language': language.__str__(),
            'tags': {
                'tag_key': 'tag_value'
            },
            'domain_id': self.domain.domain_id
        }

        user = self.identity_v1.User.create(params,
                                            metadata=(('token',
                                                       self.owner_token), ))
        self.user = user
        self.users.append(user)
        self._print_data(self.user, 'test_create_user')
        self.assertEqual(self.user.name, params['name'])
Example #5
def format_language(value: Optional[str]):
    lang = Language.get(value)

    if not lang.is_valid():
        raise ValueError('language_invalid')

    return lang.simplify_script().to_tag()
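For reference, a small hedged sketch of what the two langcodes calls above do, assuming the library's standard validity and likely-script data:

from langcodes import Language

tag = Language.get("en-Latn-US")
print(tag.is_valid())                  # True: every subtag is registered
print(tag.simplify_script().to_tag())  # "en-US": Latn is the default script for English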
Example #6
    async def prepareDataSet(self, data_set: NLUDataSet) -> None:
        last_exception = None

        # Try all language tag derivations, from specific to broad
        for language in Language.get(
                data_set.language).simplify_script().broaden():
            language = language.to_tag()
            try:
                if not self.__skip_language_installations:
                    self._logger.info(
                        "Installing language resources for \"%s\"...",
                        language)

                    subprocess.run([
                        self.__python, "-m", "snips_nlu", "download", language
                    ],
                                   check=True)

                self.__language = language

                last_exception = None
                break
            except BaseException as e:  # pylint: disable=broad-except
                last_exception = e

        if last_exception is not None:
            raise last_exception
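To see which tags the loop above would try, here is a quick hedged sketch of simplify_script().broaden(); the exact list depends on langcodes' data, but it runs from specific to broad and ends with "und":

from langcodes import Language

for candidate in Language.get("pt-BR").simplify_script().broaden():
    print(candidate.to_tag())  # expected: "pt-BR", then "pt", then "und"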
Example #7
    async def prepareDataSet(self, data_set: NLUDataSet) -> None:
        self.__language = Language.get(data_set.language).simplify_script().to_tag()

        agent_parent_path = self.__agents_client.project_path(self.__project)

        # The default language code doesn't really matter as this code always explicitly passes the
        # exact language on each step. Still, the default language code HAS to be set and it MUST
        # be set to the code that already is the default.
        # The following code attempts to retrieve the current agent and to extract the current
        # default language code from it.
        try:
            default_language_code = self.__agents_client.get_agent(
                agent_parent_path
            ).default_language_code
        except: # pylint: disable=bare-except
            # TODO: Unable to figure out which exact error is raised in case the agent doesn't
            # exist, which is why this code catches any exception that might be raised by the call
            # to get_agent.
            default_language_code = "en"

        self.__agents_client.set_agent(dialogflow_v2.types.Agent(
            parent       = agent_parent_path,
            display_name = self.__agent,
            time_zone    = self.__time_zone,
            default_language_code    = default_language_code,
            supported_language_codes = [ self.__language ]
        ))
Example #8
    def test_create_user(self, user_id=None, name=None):
        lang_code = random.choice(['zh-hans', 'jp', 'ko', 'en', 'es'])
        language = Language.get(lang_code)
        if user_id is None:
            user_id = utils.random_string()[0:10]

        if name is None:
            name = 'Steven' + utils.random_string()[0:5]

        params = {
            'user_id': user_id,
            'password': '******',
            'name': name,
            'language': language.__str__(),
            'tags': {
                'key': 'value'
            },
            'email': 'Steven' + utils.random_string()[0:5] + '@mz.co.kr',
            'mobile': '+821026671234',
            'group': 'group-id',
            'domain_id': self.domain.domain_id
        }

        user = self.identity_v1.User.create(params,
                                            metadata=(('token', self.token), ))
        self.user = user
        self.users.append(user)
        self._print_data(self.user, 'test_create_user')
        self.assertEqual(self.user.name, params['name'])
Example #9
def from_IETF(tag, normalize=False):
    cache = cache_IETF()
    tag = str(Language.get(tag, normalize))
    output = []
    for uid in cache:
        if tag in cache[uid]['IETF']:
            output.append(uid)
    return output
Example #10
    async def _prepareDataSet(self, data_set: NLUDataSet) -> None:
        language = Language.get(data_set.language).language

        if self.__pipeline == "supervised":
            pipeline_config = "supervised_embeddings"
            image = "rasa/rasa:{}".format(self.__VERSION)
        if self.__pipeline == "pretrained":
            pipeline_config = "pretrained_embeddings_spacy"
            # In theory it should be enough to install rasa/rasa:latest-spacy-{language}, but in
            # practice the training fails in these images due to the spaCy models not being found.
            # This bug is reported in the Rasa repo: https://github.com/RasaHQ/rasa/issues/4789
            image = "rasa/rasa:{}-spacy-{}".format(self.__VERSION, language)

        # Create the Rasa config
        self.__rasa_config_yml = yaml.dump({ "language": language, "pipeline": pipeline_config })

        # Connect to the Docker daemon and pull the Rasa container
        self._logger.info("Preparing the docker container for Rasa...")
        self._logger.debug("Pulling Rasa image \"%s\"...", image)
        self.__docker.images.pull(image)

        self._logger.debug("Starting the Rasa HTTP server...")
        self.__container = self.__docker.containers.run(
            image,

            # Run the Rasa server and enable the HTTP API
            [ "run", "--enable-api" ],

            # Automatically remove the container after the server shuts down
            auto_remove=True,

            # Don't wait for the command to finish
            detach=True,

            # Expose port 5005 (used for HTTP by Rasa) for TCP traffic to a random port
            ports={ "5005/tcp": None }
        )

        # Update the container information from the Docker daemon
        self.__container.reload()

        # Extract the port mapping and build the base url for the HTTP API
        port_mapping = self.__container.attrs["NetworkSettings"]["Ports"]["5005/tcp"][0]
        self.__url = "http://{}:{}/".format(port_mapping["HostIp"], port_mapping["HostPort"])

        self._logger.debug("Waiting for the health endpoint to come alive...")
        for _ in range(self.__timeout):
            try:
                success = requests.get(self.__url).status_code == 200
            except requests.exceptions.ConnectionError:
                success = False

            if success:
                break

            await asyncio.sleep(1)

        self._logger.info("Container running.")
Example #11
 async def _prepareDataSet(self, data_set: NLUDataSet) -> None:
     self.__app_id = self.__authoring_client.apps.add({
         "name": "NLUTestFramework",
         "culture": Language.get(data_set.language).simplify_script().to_tag(),
         "initial_version_id": self.__class__.FAKE_VERSION
     })
Example #12
def get_languages() -> List[Language]:
    langs = getattr(settings, "GARNETT_TRANSLATABLE_LANGUAGES",
                    [get_default_language()])
    if callable(langs):
        langs = langs()
    if type(langs) == list:
        return [Language.get(lang) for lang in langs]
    raise ImproperlyConfigured(
        "GARNETT_TRANSLATABLE_LANGUAGES must be a list or a callable that returns a list"
    )
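For context, a hypothetical Django settings sketch for the two garnett settings involved here (GARNETT_TRANSLATABLE_LANGUAGES read above, GARNETT_DEFAULT_TRANSLATABLE_LANGUAGE read by get_default_language); the values are illustrative only, and either a plain value or a callable returning one is accepted:

# settings.py (illustrative values)
GARNETT_DEFAULT_TRANSLATABLE_LANGUAGE = "en-AU"

# Either a literal list of BCP 47 tags ...
GARNETT_TRANSLATABLE_LANGUAGES = ["en-AU", "de", "zh-Hans"]

# ... or a callable returning such a list also satisfies get_languages() above:
# GARNETT_TRANSLATABLE_LANGUAGES = lambda: ["en-AU", "de"]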
Example #13
 def get_video_print(self, videos: List[Track]) -> List[List[str]]:
     if not videos:
         return [["--"]]
     data = []
     for video in videos:
         codec = {
             "MPEG Video":
             f"MPEG-{(video.format_version or '').replace('Version ', '')}"
         }.get(video.format, video.format)
         scan_overview = video.scan_type
         vst = False
         if codec in ["MPEG-1", "MPEG-2"]:
             # parse d2v file with pyd2v, generates D2V if needed
             d2v = D2V.load(Path(self.file))
             self.file = d2v.path
             # Get every frame's flag data; this contains information on how frames are displayed.
             # Also add the VOB and cell number to each frame's flag data.
             flags = [
                 f for line in [[
                     dict(**y, vob=x["vob"], cell=x["cell"])
                     for y in x["flags"]
                 ] for x in d2v.data] for f in line
             ]
             interlaced_percent = (
                 sum(1 for f in flags if not f["progressive_frame"]) /
                 len(flags)) * 100
             if interlaced_percent == 100:
                 scan_overview = "Interlaced (CST)"
             else:
                 scan_overview = f"{round(interlaced_percent, 2)}% Interlaced (VST)"
                 vst = True
             for ext in ["log", "d2v", "mpg", "mpeg"]:
                 fp = os.path.splitext(self.file)[0] + "." + ext
                 if os.path.exists(fp):
                     os.unlink(fp)
         line_1 = "- {language}, {codec} ({profile}) {width}x{height} ({aspect}) @ {bitrate}".format(
             language=Language.get(video.language).display_name(),
             codec=codec,
             profile=video.format_profile,
             width=video.width,
             height=video.height,
             aspect=video.other_display_aspect_ratio[0],
             bitrate=
             f"{video.other_bit_rate[0]}{f' ({video.bit_rate_mode})' if video.bit_rate_mode else ''}"
         )
         line_2 = "  {fps} FPS ({fps_mode}), {color_space}{subsampling}P{bit_depth}, {scan}".format(
             fps=f"{video.framerate_num}/{video.framerate_den}"
             if video.framerate_num else video.frame_rate,
             fps_mode="VFR" if vst else video.frame_rate_mode,
             color_space=video.color_space,
             subsampling=video.chroma_subsampling.replace(":", ""),
             bit_depth=video.bit_depth,
             scan=scan_overview)
         data.append([line_1, line_2])
     return data
Example #14
 def from_IETF(cls, tag, normalize=True):
     if cls._cache:
         try:
             [cls._cache[uid]['IETF'] for uid in cls._cache]
         except KeyError:
             cls._cache_IETF()
     else:
         cls._cache_IETF()
     tag = str(Language.get(tag, normalize))
     output = []
     for uid in cls._cache:
         if tag in cls._cache[uid]['IETF']:
             output.append(uid)
     return output
Example #15
def get_language_from_request(request) -> Language:
    opt_order = getattr(
        settings,
        "GARNETT_REQUEST_LANGUAGE_SELECTORS",
        [
            "garnett.selectors.query",
            "garnett.selectors.cookie",
            "garnett.selectors.header",
        ],
    )
    for opt in opt_order:
        func = import_string(opt)
        if lang := func(request):
            return Language.get(lang)
Example #16
    def _setLanguage(self, language: str) -> None:
        """
        Args:
            language: The language of this data set. Use this method to set the language, if the
                language was dynamically loaded from the data set itself. The language is
                represented by its ISO 639-1 code (e.g. "en").

        Raises:
            :exc:`ValueError`: if the language was already set.
        """

        if self.__language is not None:
            raise ValueError("The language for this data set was already set.")

        self.__language = Language.get(language).maximize().to_tag()
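The maximize().to_tag() step above fills in likely script and territory subtags; a minimal sketch assuming langcodes' standard likely-subtags data:

from langcodes import Language

print(Language.get("en").maximize().to_tag())  # "en-Latn-US"
print(Language.get("ja").maximize().to_tag())  # "ja-Jpan-JP"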
Example #17
def get_default_language():
    setting = getattr(settings, "GARNETT_DEFAULT_TRANSLATABLE_LANGUAGE",
                      "en-AU")
    if callable(setting):
        default = setting()
    else:
        default = setting

    if isinstance(default, Language):
        return default
    elif isinstance(default, str):
        return Language.get(default)
    else:
        raise ImproperlyConfigured(
            "GARNETT_DEFAULT_TRANSLATABLE_LANGUAGE must be a string or callable that returns a string or `Language` object"
        )
 def _create_user(self):
     self.user_param = {
         'user_id': (utils.random_string()[0:10]),
         'password': '******',
         'name': 'Steven' + utils.random_string()[0:5],
         'language': Language.get('jp').__str__(),
         'timezone': 'utc+9',
         'tags': {
             'aa': 'bb'
         },
         'domain_id': self.domain.domain_id,
         'email': 'Steven' + utils.random_string()[0:5] + '@mz.co.kr',
         'mobile': '+821026671234',
         'group': 'group-id',
     }
     self.user = self.identity_v1.User.create(
         self.user_param, metadata=(('token', self.owner_token), ))
Example #19
    def get_subtitle_print(subs: List[Track]) -> List[str]:
        """
        Return a list of a brief subtitle overview per-subtitle.

        e.g.
        - English, Forced, SubRip (SRT)
        - English, SubRip (SRT)
        - English, SDH, SubRip (SRT)
        - Spanish, Latin American (SDH), SubRip (SRT)

        The bit of text between the Language and the Subtitle format is the Track Title.
        It can be of any format, but it is recommended to be used as shown above.

        It will be returned as a list of strings with the `- ` already prepended to each entry.
        """
        data = []
        if not subs:
            data.append("--")
        for sub in subs:
            line_items = []

            # The following sub.title checks support three different language and title scenarios.
            # The second scenario is the recommended option if you are open to choosing any of them.
            # The third scenario should be used if you have nothing unique to state about the track.
            # | Language     | Track Title                   | Output                                        |
            # | ------------ | ----------------------------- | --------------------------------------------- |
            # | es / Spanish | Spanish (Latin American, SDH) | - Spanish (Latin American, SDH), SubRip (SRT) |
            # | es / Spanish | Latin American (SDH)          | - Spanish, Latin American (SDH), SubRip (SRT) |
            # | es / Spanish | None                          | - Spanish, SubRip (SRT)                       |
            language = Language.get(sub.language).display_name()
            if sub.title:
                if language.lower() in sub.title.lower():
                    line_items.append(sub.title)
                else:
                    line_items.append(f"{language}, {sub.title}")
            else:
                line_items.append(language)

            line_items.append(sub.format.replace("UTF-8", "SubRip (SRT)"))

            line = "- " + ", ".join(line_items)
            data += [("  " + x if i > 0 else x)
                     for i, x in enumerate(textwrap.wrap(line, 64))]
        return data
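The language part of each line comes from langcodes' display_name(); a brief hedged sketch of its behaviour (English is assumed to be the default display language):

from langcodes import Language

print(Language.get("es").display_name())      # "Spanish"
print(Language.get("es").display_name("es"))  # "español"
print(Language.get("es-419").display_name())  # "Spanish (Latin America)"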
Example #20
def runSimpleJSONDataSetTests(path, title, constructor_language, expected_language, size):
    expected_language = Language.get(expected_language).maximize().to_tag()

    # Run the tests twice, ignoring existing caches on the first run
    for ignore_cache in [ True, False ]:
        # Construct the data set
        data_set = SimpleJSONDataSet(title, path, 50, constructor_language, ignore_cache)

        assert data_set.title == title
        assert data_set.language == expected_language

        # Verify that the training data does not contain any None-intent sentences
        assert len(list(filter(lambda x: x.intent is None, data_set.training_data))) == 0

        # Get the number of None-intent sentences in the validation data
        num_none_intent_sentences = len(list(filter(
            lambda x: x.intent is None,
            data_set.validation_data
        )))

        # Verify that the training and validation data (without None-intent sentences) was split
        # correctly at about 50%
        validation_size_without_none = len(data_set.validation_data) - num_none_intent_sentences
        assert abs(len(data_set.training_data) - validation_size_without_none) <= 1

        # Verify that all entries were loaded
        assert len(data_set.training_data) + len(data_set.validation_data) == size

        # Make sure that the data returned on subsequent calls is the same
        assert data_set.training_data == data_set.training_data
        assert data_set.validation_data == data_set.validation_data

        # Verify that the data is sorted and split differently after reshuffling the data
        training_data = data_set.training_data
        validation_data = data_set.validation_data
        data_set.reshuffle()
        assert training_data != data_set.training_data
        assert validation_data != data_set.validation_data

        # Make sure that a copy of the data is returned and not a reference
        data_set.training_data.pop()
        data_set.validation_data.pop()
        assert len(data_set.training_data) + len(data_set.validation_data) == size
Example #21
def langcodes_score(language, segment, score):
    '''Use langcodes on selected URL segments and integrate
       them into a score.'''
    # see also: https://babel.pocoo.org/en/latest/locale.html
    # test if the code looks like a country or a language
    if segment[:2] not in COUNTRY_CODES and segment[:2] not in LANGUAGE_CODES:
        return score
    # test if the tag is valid (caution: private-use codes are valid too)
    if tag_is_valid(segment):
        # try to identify language code
        identified = Language.get(segment).language
        # see if it matches
        if identified is not None:
            LOGGER.debug('langcode %s found in URL segment %s', identified,
                         segment)
            if identified != language:
                score -= 1
            else:
                score += 1
    return score
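A quick hedged sketch of the langcodes helpers used above, tag_is_valid and the .language attribute:

from langcodes import Language, tag_is_valid

print(tag_is_valid("de-AT"))            # True
print(Language.get("de-AT").language)   # "de"
print(tag_is_valid("notalanguagetag"))  # False: not a well-formed BCP 47 tag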
Example #22
def _prepare_user_data(scenario_user):
    lang_code = random.choice(['ko', 'en'])
    language = Language.get(lang_code)
    user_id = random_string()[0:10]

    default_user = {
        'user_id': user_id,
        'password': user_id,
        'name': 'Steven' + random_string()[0:5],
        'language': language.__str__(),
        'timezone': 'Asia/Seoul',
        'tags': {
            'aa': 'bb'
        },
        'email': 'Steven' + random_string()[0:5] + '@mz.co.kr',
        'mobile': '+821026671234'
    }
    # Overwrite param, if needed
    default_user.update(scenario_user)
    return default_user
Example #23
 def get_audio_print(self, audio: List[Track]) -> List[str]:
     if not audio:
         return ["--"]
     data = []
     for t in audio:
         if t.title and "Commentary" in t.title:
             title = t.title
         else:
             title = Language.get(t.language).display_name()
         if t.channel_layout:
             channels = float(
                 sum(
                     self.AUDIO_CHANNEL_LAYOUT_WEIGHT.get(x, 1)
                     for x in t.channel_layout.split(" ")))
         else:
             channels = float(t.channel_s)
         bit_rate_mode = f" ({t.bit_rate_mode})" if t.bit_rate_mode else ""
         l1 = f"- {title}, {t.format} {channels} @ {t.other_bit_rate[0]}{bit_rate_mode}"
         data += [("  " + x if i > 0 else x)
                  for i, x in enumerate(textwrap.wrap(l1, 64))]
     return data
Example #24
    def test_create_owner(self):
        lang_code = random.choice(['zh-hans', 'jp', 'ko', 'en', 'es'])
        language = Language.get(lang_code)
        owner_id = utils.random_string()

        params = {
            'owner_id': owner_id,
            'password': utils.generate_password(),
            'name': 'Steven' + utils.random_string(),
            'language': language.__str__(),
            'timezone': 'Asia/Seoul',
            'email': 'Steven' + utils.random_string() + '@mz.co.kr',
            'domain_id': self.domain.domain_id
        }

        owner = self.identity_v1.DomainOwner.create(params)
        self.domain_owner = owner
        self.params = params
        self.assertEqual(params['name'], self.domain_owner.name)

        self._issue_owner_token(params['owner_id'], params['password'])
Example #25
    def _test_create_user(self, name='test', user_id=None):
        if self.role is None:
            self._test_create_role()

        if user_id is None:
            user_id = utils.random_string()[0:10]

        lang_code = random.choice(['zh-hans', 'jp', 'ko', 'en', 'es'])
        language = Language.get(lang_code)

        params = {
            'user_id': user_id,
            'domain_id': self.domain.domain_id,
            'password': '******',
            'name': name + utils.random_string()[0:5],
            'language': language.__str__(),
            'timezone': 'utc+9',
            'tags': {
                'aa': 'bb'
            },
            'email': name + utils.random_string()[0:5] + '@mz.co.kr',
            'mobile': '+821026671234',
            'group': 'group-id'
        }

        self.user = self.identity_v1.User.create(params,
                                                 metadata=(('token',
                                                            self.token), ))

        self.user = self.identity_v1.User.update_role(
            {
                'user_id': self.user.user_id,
                'domain_id': self.domain.domain_id,
                'roles': [self.role.role_id]
            },
            metadata=(('token', self.token), ))

        self.users.append(self.user)

        return self.user
Example #26
    def test_create_owner(self):
        lang_code = random.choice(['zh-hans', 'jp', 'ko', 'en', 'es'])
        language = Language.get(lang_code)
        owner_id = utils.random_string()[0:10]

        param = {
            'owner_id': owner_id,
            'password': '******',
            'name': 'Steven' + utils.random_string()[0:5],
            'language': language.__str__(),
            'timezone': 'utc+9',
            'email': 'Steven' + utils.random_string()[0:5] + '@mz.co.kr',
            'mobile': '+821026671234',
            'domain_id': self.domain.domain_id
        }

        owner = self.identity_v1.DomainOwner.create(
            param
        )
        self.domain_owner = owner
        self.param = param
        self.assertEqual(param['name'], self.domain_owner.name)
 def __init__(self, language):
     if isinstance(language, Language):
         self.language = language
     else:
         self.language = Language.get(language)
     self.token = None
def language_display(language, display_language=None):
    if type(language) is str:
        language = Language.get(language)
    if display_language is None:
        return language.display_name()
    return language.display_name(display_language)
Example #30
def get_language_info(language):
    """
    Looks up the things we need to know about how to handle text in a given
    language. This will return a dictionary with the following fields:

    'script': a BCP 47 script code such as 'Latn', 'Cyrl', 'Hans'...

        Indicates the script that tokens in this language should be in,
        _after_ our preprocessing. The script for 'zh' is 'Hans', for example,
        because even if the input is in Traditional Chinese ('Hant'), we
        convert it to Simplified.

    'tokenizer': 'regex', 'jieba', 'mecab', or None

        Indicates the best way we know to separate tokens in the language.

        'regex' is what will be used for most languages, meaning that we can
        segment the text with a Unicode-aware regular expression. If a language
        generally uses spaces to separate words, the regex will work well.

        'jieba' and 'mecab' are tokenizers for specific languages written
        without spaces.

        A tokenizer of None means we don't have a good way to segment the
        language. We'll use the regex anyway, but the results will be pretty
        bad.

    'normal_form': 'NFC' or 'NFKC'

        How "should" Unicode be normalized when comparing text in this
        language? This is not a standard, it's just based on experience.
        Many languages need NFKC normalization for text comparisons to work
        properly, but in many European languages, NFKC normalization is
        excessive and loses information.

    'remove_marks': True or False

        Determines whether marks and decorations, such as vowel points and
        tatweels, should be removed. True for languages in abjad scripts.

    'dotless_i': True or False

        Is "ı" the lowercase of "I" in this language, as in Turkish?

    'diacritics_under': 'cedillas', 'commas', or None

        Should we convert any diacritics that are under the letters "s" and
        "t" in this language? 'cedillas' means we should convert commas to
        cedillas, and 'commas' means we should convert cedillas to commas.

    'transliteration': 'sr-Latn', 'az-Latn', or None

        Indicates a type of transliteration that we should use for normalizing
        a multi-script language. 'sr-Latn' means to use Serbian romanization,
        and 'az-Latn' means to use Azerbaijani romanization.

    'lookup_transliteration': 'zh-Hans' or None

        Indicates a lossy transliteration that should not be used for output,
        but should be applied when looking up words in a list. 'zh-Hans' means
        that we should convert Traditional Chinese characters to Simplified.
    """
    # The input is probably a string, so parse it into a Language. If it's
    # already a Language, it will pass through.
    language = Language.get(language)

    # Assume additional things about the language, such as what script it's in,
    # using the "likely subtags" table
    language_full = language.maximize()

    # Start the `info` dictionary with default values, including the 'script'
    # value that we now know from `language_full`.
    info = {
        'script': language_full.script,
        'tokenizer': 'regex',
        'normal_form': 'NFKC',
        'remove_marks': False,
        'dotless_i': False,
        'diacritics_under': None,
        'transliteration': None,
        'lookup_transliteration': None
    }

    if _language_in_list(language, ['ja', 'ko']):
        info['tokenizer'] = 'mecab'
    elif _language_in_list(language, ['zh', 'yue']):
        info['tokenizer'] = 'jieba'
    elif info['script'] in SPACELESS_SCRIPTS:
        info['tokenizer'] = None

    # Cased alphabetic scripts get NFC normal form
    if info['script'] in ['Latn', 'Grek', 'Cyrl']:
        info['normal_form'] = 'NFC'

    if info['script'] in ['Arab', 'Hebr']:
        info['remove_marks'] = True

    if _language_in_list(language, ['tr', 'az', 'kk']):
        info['dotless_i'] = True
        info['diacritics_under'] = 'cedillas'
    elif _language_in_list(language, ['ro']):
        info['diacritics_under'] = 'commas'

    if _language_in_list(language, ['sr']):
        info['transliteration'] = 'sr-Latn'
    elif _language_in_list(language, ['az']):
        info['transliteration'] = 'az-Latn'

    if language.language == 'zh' and language.script != 'Hant':
        info['lookup_transliteration'] = 'zh-Hans'

    return info
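A small sketch of the likely-subtags step described in the docstring above, showing why the 'script' entry for 'zh' comes out as 'Hans':

from langcodes import Language

print(Language.get("zh").maximize())  # zh-Hans-CN: likely script and region filled in
print(Language.get("sr").maximize())  # sr-Cyrl-RS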
Example #31
for lang in SOUP.find_all('code'):
    curlang = str(lang.getText())
    if curlang == 'en' or curlang == 'la' or curlang == 'ceb':
        continue
    if curlang == 'zh-CN' or curlang == 'zh-TW':
        continue

    filename = Path(curlang + ".po")
    if filename.is_file():
        print(filename, " already exists, skipping...")
        continue
    else:
        print(filename, " doesn't exist, processing...")

    language = Language.get(curlang).language_name('en')

    POT_DATA.metadata['Language-Team'] = language + ' <' + curlang + '@li.org>'
    POT_DATA.metadata['Language'] = curlang

    for i in range(len(POT_DATA)):
        tmpstr = POT_DATA[i].msgid.replace('%1', '__')
        tmpTrans = TRANS.translate(tmpstr,
                                   source_language='en',
                                   target_language=curlang)['translatedText']
        POT_DATA[i].msgstr = tmpTrans
        POT_DATA[i].msgstr = POT_DATA[i].msgstr.replace('__', '%1')

    F2 = open(filename, 'wt')
    F2.write(str(POT_DATA))
    F2.close()
Example #32
def get_language_name(chars):
    """

    """
    name = Language.get(chars).language_name()
    return name
Example #33
def cache_IETF():
    cache = {}
    result = query(
        """
        select expr.txt, exprsrc.txt as trans_txt 
        from expr
        inner join denotationx as denotation on denotation.expr = expr.id
        inner join denotationx as denotationsrc on denotationsrc.meaning = denotation.meaning and denotationsrc.expr != denotation.expr
        inner join expr as exprsrc on exprsrc.id = denotationsrc.expr
        where expr.langvar = uid_langvar('art-274') and denotationsrc.langvar = uid_langvar('art-420')
        """)
    for r in result:
        cache[r['txt']] = {}
        cache[r['txt']]['IETF'] = set()
    for r in query(
        """
        select 
            langvar.lang_code, 
            langvar.region_expr,
            uid(langvar.lang_code,langvar.var_code), 
            script_expr.txt as script_expr_txt, 
            uid(region_expr_langvar.lang_code,region_expr_langvar.var_code) as region_expr_uid, 
            region_expr.txt as region_expr_txt 
        from langvar 
        inner join expr on expr.id = langvar.name_expr 
        inner join expr as script_expr on script_expr.id = langvar.script_expr 
        inner join expr as region_expr on region_expr.id = langvar.region_expr 
        inner join langvar as region_expr_langvar on region_expr_langvar.id = region_expr.langvar 
        where uid(langvar.lang_code,langvar.var_code) = any(%s)
        """, (list(cache.keys()),)):
        cache[r['uid']].update(r)
    regions_dict = {}
    for r in query(
        """
        select expr.txt, exprsrc.txt as trans_txt
        from expr
        inner join denotationx as denotation on denotation.expr = expr.id
        inner join denotationx as denotationsrc on denotationsrc.meaning = denotation.meaning and denotationsrc.expr != denotation.expr
        inner join expr as exprsrc on exprsrc.id = denotationsrc.expr
        where expr.langvar = uid_langvar('art-006') and denotationsrc.expr = any(%s)
        """, ([l['region_expr'] for l in cache.values()],)):
        if len(r['txt']) == 2:
            regions_dict[r['trans_txt']] = r['txt']
    for r in result:
        uid = r['txt']
        lang = cache[uid]
        given_tag = Language.get(r['trans_txt'], normalize=False)
        normalized_tag = Language.get(r['trans_txt'], normalize=True)
        language_set = {lang['lang_code'], given_tag.language, normalized_tag.language}
        script_set = {lang['script_expr_txt'], given_tag.script, normalized_tag.script}
        region_set = {given_tag.region, normalized_tag.region}
        if lang['region_expr_uid'] == 'art-279' and lang['region_expr_txt'] != '001':
            region_set.add(lang['region_expr_txt'])
            try:
                region_set.add(regions_dict[lang['region_expr_txt']])
            except KeyError:
                pass
        if {'GB', 'UK'} & region_set: region_set |= {'GB', 'UK'}
        if {'001', None} & region_set: region_set |= {'001', None}
        for language, script, region in product(language_set, script_set, region_set):
            new_tag = normalized_tag.update_dict({'language': language, 'script': script, 'region': region})
            cache[uid]['IETF'].add(str(new_tag))
        if lang['region_expr_uid'] == 'art-279' and lang['region_expr_txt'] == '001':
            for language, script, region in product({lang['lang_code']}, script_set, {'001', None}):
                new_tag = normalized_tag.update_dict({'language': language, 'script': script, 'region': region})
                cache[uid]['IETF'].add(str(new_tag))
    return cache
Example #34
def resolve_language(context, language_tag):
    lang = Language.get(language_tag)
    request_language = get_language_from_request(context['request'])
    return lang.language_name(request_language)
Example #35
def json_export(out: Export = Export("sbx_metadata/[metadata.id].json"),
                corpus_id: Corpus = Corpus(),
                lang: Language = Language(),
                metadata: dict = Config("metadata"),
                sentences: AnnotationCommonData = AnnotationCommonData(
                    "misc.<sentence>_count"),
                tokens: AnnotationCommonData = AnnotationCommonData(
                    "misc.<token>_count"),
                korp_protected: bool = Config("korp.protected"),
                korp_mode: bool = Config("korp.mode"),
                md_trainingdata: bool = Config("sbx_metadata.trainingdata"),
                md_xml_export: str = Config("sbx_metadata.xml_export"),
                md_stats_export: bool = Config("sbx_metadata.stats_export"),
                md_korp: bool = Config("sbx_metadata.korp"),
                md_downloads: list = Config("sbx_metadata.downloads"),
                md_interface: list = Config("sbx_metadata.interface"),
                md_contact: dict = Config("sbx_metadata.contact_info")):
    """Export corpus metadata to JSON format."""
    md_obj = {}
    md_obj["id"] = corpus_id
    md_obj["type"] = "corpus"
    md_obj["trainingdata"] = md_trainingdata

    # Set language info
    md_obj["lang"] = [{
        "code":
        lang,
        "name_en":
        languages.get(part3=lang).name if lang in languages.part3 else lang,
        "name_sv":
        Language.get(lang).display_name("swe"),
    }]

    # Set name and description
    md_obj["name_en"] = metadata.get("name", {}).get("eng")
    md_obj["name_sv"] = metadata.get("name", {}).get("swe")
    md_obj["description_en"] = metadata.get("description", {}).get("eng")
    md_obj["description_sv"] = metadata.get("description", {}).get("swe")

    # Set downloads
    downloads = []
    downloads.append(
        metadata_utils.make_standard_xml_export(md_xml_export, corpus_id))
    downloads.append(
        metadata_utils.make_standard_stats_export(md_stats_export, corpus_id))
    downloads.append(metadata_utils.make_metashare(corpus_id))
    downloads.extend(md_downloads)
    md_obj["downloads"] = [d for d in downloads if d]

    # Set interface
    interface = []
    interface.append(metadata_utils.make_korp(md_korp, corpus_id, korp_mode))
    interface.extend(md_interface)
    md_obj["interface"] = [d for d in interface if d]

    # Set contact info
    if md_contact == "sbx-default":
        md_obj["contact_info"] = metadata_utils.SBX_DEFAULT_CONTACT
    else:
        md_obj["contact_info"] = md_contact

    # Set size
    md_obj["size"] = {"tokens": tokens.read(), "sentences": sentences.read()}

    # Write JSON to file
    os.makedirs(os.path.dirname(out), exist_ok=True)
    json_str = json.dumps(md_obj, ensure_ascii=False, indent=4)
    with open(out, "w") as f:
        f.write(json_str)
    logger.info("Exported: %s", out)
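The "name_sv" field above relies on display_name("swe"); a hedged sanity check (the exact string depends on the CLDR data bundled with langcodes):

from langcodes import Language

print(Language.get("swe").display_name("swe"))  # Swedish name of Swedish, e.g. "svenska"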