Example #1
def __set_dynamic_defaults(config: dict) -> dict:
    """Fill configuration dictionary with some preset values."""
    if 'mediawords' not in config or config['mediawords'] is None:
        raise McConfigException('Configuration does not have "mediawords" key')

    if 'data_dir' not in config['mediawords'] or config['mediawords']['data_dir'] is None:
        # FIXME create a helper in 'paths'
        config['mediawords']['data_dir'] = os.path.join(mc_root_path(), 'data')

    # FIXME probably not needed
    if 'session' not in config or config['session'] is None:
        config['session'] = {}
    if 'storage' not in config['session'] or config['session']['storage'] is None:
        config['session']['storage'] = os.path.join(os.path.expanduser('~'), "tmp", "mediacloud-session")

    # MC_REWRITE_TO_PYTHON: probably not needed after Python rewrite
    if 'Plugin::Authentication' not in config or config['Plugin::Authentication'] is None:
        config['Plugin::Authentication'] = {
            "default_realm": 'users',
            "users": {
                "credential": {
                    "class": 'MediaWords'
                },
                "store": {
                    "class": 'MediaWords'
                }
            }
        }

    return config
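
A minimal usage sketch for the helper above, with stand-in stubs for mc_root_path() and McConfigException (both defined elsewhere in the codebase):

import os


class McConfigException(Exception):
    """Stand-in stub for the project's configuration exception."""
    pass


def mc_root_path() -> str:
    """Stand-in stub for the project's root-path helper."""
    return os.getcwd()


filled = __set_dynamic_defaults({'mediawords': {}})
print(filled['mediawords']['data_dir'])  # <current working directory>/data
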
Example #2
def __templates_path() -> str:
    """Return path to Jinja2 email templates."""
    root_path = mc_root_path()
    email_templates_path = os.path.join(
        root_path, 'lib', 'MediaWords', 'Util', 'Mail', 'Message', 'Templates', 'email-templates'
    )
    if not os.path.isdir(email_templates_path):
        raise McMailTemplatesNotFound('Templates directory was not found at "%s".' % email_templates_path)
    return email_templates_path
Example #3
def __templates_path() -> str:
    """Return path to Jinja2 email templates."""
    root_path = mc_root_path()
    email_templates_path = os.path.join(root_path, 'lib', 'MediaWords',
                                        'Util', 'Mail', 'Message',
                                        'Templates', 'email-templates')
    if not os.path.isdir(email_templates_path):
        raise McMailTemplatesNotFound(
            'Templates directory was not found at "%s".' %
            email_templates_path)
    return email_templates_path
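
The docstring mentions Jinja2; a sketch of how the returned path would typically be wired into a Jinja2 environment (an illustration, not taken from the original code; the template name is hypothetical):

import jinja2

env = jinja2.Environment(loader=jinja2.FileSystemLoader(__templates_path()))
template = env.get_template('activation_needed.tpl')  # hypothetical template name
message_body = template.render()  # template variables omitted in this sketch
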
Example #4
def get_config() -> dict:
    """Get configuration dictionary."""
    global __CONFIG

    if __CONFIG is not None:
        return __CONFIG

    # FIXME: This should be standardized
    set_config_file(os.path.join(mc_root_path(), "mediawords.yml"))

    # noinspection PyTypeChecker
    # FIXME inspection could still be enabled here
    return __CONFIG
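
set_config_file() is assumed to parse the given file and populate the module-level __CONFIG global that get_config() returns; a minimal sketch of that caching pattern, reusing the __parse_yaml() helper that appears in Examples #15 and #17:

__CONFIG = None


def set_config_file(config_file: str) -> None:
    """Parse the configuration file and cache it in the module-level
    __CONFIG global (minimal sketch; the real helper is assumed to also
    validate the file and merge in defaults)."""
    global __CONFIG
    __CONFIG = __parse_yaml(config_file)
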
Example #5
def get_config() -> dict:
    """Get configuration dictionary."""
    global __CONFIG

    if __CONFIG is not None:
        return __CONFIG

    # FIXME: This should be standardized
    set_config_file(os.path.join(mc_root_path(), "mediawords.yml"))

    # noinspection PyTypeChecker
    # FIXME inspection could still be enabled here
    return __CONFIG
Example #6
def rotate_supervisor_logs():
    root_path = mc_root_path()
    l.debug('Media Cloud root path: %s' % root_path)

    config = py_get_config()
    child_log_dir = config['supervisor']['childlogdir']
    l.debug('Child log directory: %s' % child_log_dir)

    supervisor_logs_dir = os.path.join(root_path, child_log_dir)
    l.info('Supervisor logs path: %s' % supervisor_logs_dir)

    logrotate_state_file = os.path.join(supervisor_logs_dir, 'logrotate.state')
    l.debug('logrotate state file: %s' % logrotate_state_file)

    if not os.path.isdir(supervisor_logs_dir):
        raise Exception(
            'Supervisor logs directory does not exist at path: %s' %
            supervisor_logs_dir)

    logrotate_config = '''
%(supervisor_logs_dir)s/*.log {
    size %(log_max_size)d
    rotate %(old_log_count)d
    copytruncate
    compress
    missingok
    notifempty
}
''' % {
        'supervisor_logs_dir': supervisor_logs_dir,
        'log_max_size': __LOG_MAX_SIZE,
        'old_log_count': __OLD_LOG_COUNT,
    }

    logrotate_temp_fd, logrotate_temp_config_path = tempfile.mkstemp(
        suffix='.conf', prefix='logrotate')
    l.debug('Temporary logrotate config path: %s' % logrotate_temp_config_path)

    with os.fdopen(logrotate_temp_fd, 'w') as tmp:
        tmp.write(logrotate_config)

    l.info('Running logrotate...')
    subprocess.check_call([
        'logrotate', '--verbose', '--state', logrotate_state_file,
        logrotate_temp_config_path
    ])

    l.debug('Cleaning up temporary logrotate config...')
    os.unlink(logrotate_temp_config_path)
Example #7
def rotate_supervisor_logs():
    root_path = mc_root_path()
    l.debug('Media Cloud root path: %s' % root_path)

    config = get_config()
    child_log_dir = config['supervisor']['childlogdir']
    l.debug('Child log directory: %s' % child_log_dir)

    supervisor_logs_dir = os.path.join(root_path, child_log_dir)
    l.info('Supervisor logs path: %s' % supervisor_logs_dir)

    logrotate_state_file = os.path.join(supervisor_logs_dir, 'logrotate.state')
    l.debug('logrotate state file: %s' % logrotate_state_file)

    if not os.path.isdir(supervisor_logs_dir):
        raise Exception('Supervisor logs directory does not exist at path: %s' % supervisor_logs_dir)

    logrotate_config = '''
%(supervisor_logs_dir)s/*.log {
    size %(log_max_size)d
    rotate %(old_log_count)d
    copytruncate
    compress
    missingok
    notifempty
}
''' % {
        'supervisor_logs_dir': supervisor_logs_dir,
        'log_max_size': __LOG_MAX_SIZE,
        'old_log_count': __OLD_LOG_COUNT,
    }

    logrotate_temp_fd, logrotate_temp_config_path = tempfile.mkstemp(suffix='.conf', prefix='logrotate')
    l.debug('Temporary logrotate config path: %s' % logrotate_temp_config_path)

    with os.fdopen(logrotate_temp_fd, 'w') as tmp:
        tmp.write(logrotate_config)

    l.info('Running logrotate...')
    subprocess.check_call([
        'logrotate',
        '--verbose',
        '--state', logrotate_state_file,
        logrotate_temp_config_path
    ])

    l.debug('Cleaning up temporary logrotate config...')
    os.unlink(logrotate_temp_config_path)
Example #8
def get_path_to_data_files(subdirectory: str = '') -> str:
    """Get path to where data file(s) should be stored."""

    subdirectory = decode_object_from_bytes_if_needed(subdirectory)

    path = os.path.join(mc_root_path(), 't', 'data', subdirectory)

    # Try to create just the base directory
    if not os.path.isdir(path):
        log.warning("Creating test data directory '{}'...".format(path))
        os.mkdir(path)

    if not os.path.isdir(path):
        raise McGetPathToDataFilesException(
            "Test data file path '{}' is not a directory (or doesn't exist at all).".format(path)
        )

    return path
Example #9
def get_path_to_data_files(subdirectory: str = '') -> str:
    """Get path to where data file(s) should be stored."""

    subdirectory = decode_object_from_bytes_if_needed(subdirectory)

    path = os.path.join(mc_root_path(), 't', 'data', subdirectory)

    # Try to create just the base directory
    if not os.path.isdir(path):
        log.warning("Creating test data directory '{}'...".format(path))
        os.mkdir(path)

    if not os.path.isdir(path):
        raise McGetPathToDataFilesException(
            "Test data file path '{}' is not a directory (or doesn't exist at all)."
            .format(path))

    return path
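
One caveat with both variants above: os.mkdir() creates only the final path component, so a nested subdirectory argument (e.g. 'a/b') would raise FileNotFoundError. If nested test-data directories are ever expected (an assumption, not confirmed by the original), os.makedirs() is the more defensive choice:

import os
import tempfile

# os.mkdir(nested) would raise FileNotFoundError here;
# os.makedirs() creates the intermediate directories as well
nested = os.path.join(tempfile.mkdtemp(), 'a', 'b')
os.makedirs(nested, exist_ok=True)
assert os.path.isdir(nested)
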
Example #10
    def schema_is_up_to_date(self) -> bool:
        """Checks if the database schema is up-to-date"""
        root_dir = mc_root_path()

        # Check if the database is empty
        db_vars_table_exists = len(
            self.query("""
            -- noinspection SqlResolve
            SELECT *
            FROM information_schema.tables
            WHERE table_name = 'database_variables'
        """).flat()) > 0
        if not db_vars_table_exists:
            l.info(
                "Database table 'database_variables' does not exist, probably the database is empty at this point."
            )
            return True

        # Current schema version
        (current_schema_version, ) = self.query("""
            SELECT value AS schema_version
            FROM database_variables
            WHERE name = 'database-schema-version'
            LIMIT 1
        """).flat()
        current_schema_version = int(current_schema_version)
        if current_schema_version == 0:
            raise McSchemaIsUpToDateException("Current schema version is 0")

        # Target schema version
        with open(os.path.join(root_dir, 'schema', 'mediawords.sql'),
                  'r') as sql_file:
            sql = sql_file.read()
        target_schema_version = schema_version_from_lines(sql)
        if not target_schema_version:
            raise McSchemaIsUpToDateException("Invalid target schema version.")

        # Check if the current schema is up-to-date
        if current_schema_version != target_schema_version:
            return self.__should_continue_with_outdated_schema(
                current_schema_version, target_schema_version)
        else:
            # Things are fine at this point.
            return True
Example #11
    def schema_is_up_to_date(self) -> bool:
        """Checks if the database schema is up-to-date"""
        root_dir = mc_root_path()

        # Check if the database is empty
        db_vars_table_exists = len(self.query("""
            -- noinspection SqlResolve
            SELECT *
            FROM information_schema.tables
            WHERE table_name = 'database_variables'
        """).flat()) > 0
        if not db_vars_table_exists:
            l.info("Database table 'database_variables' does not exist, probably the database is empty at this point.")
            return True

        # Current schema version
        (current_schema_version,) = self.query("""
            SELECT value AS schema_version
            FROM database_variables
            WHERE name = 'database-schema-version'
            LIMIT 1
        """).flat()
        current_schema_version = int(current_schema_version)
        if current_schema_version == 0:
            raise Exception("Current schema version is 0")

        # Target schema version
        with open(os.path.join(root_dir, 'schema', 'mediawords.sql'), 'r') as sql_file:
            sql = sql_file.read()
        target_schema_version = schema_version_from_lines(sql)
        if not target_schema_version:
            raise Exception("Invalid target schema version.")

        # Check if the current schema is up-to-date
        if current_schema_version != target_schema_version:
            return self.__should_continue_with_outdated_schema(current_schema_version, target_schema_version)
        else:
            # Things are fine at this point.
            return True
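
schema_version_from_lines() is not shown in either variant; a minimal sketch, assuming the schema file declares its version on a line such as "MEDIACLOUD_DATABASE_SCHEMA_VERSION CONSTANT INT := 4552;" (the exact marker is an assumption, not taken from these examples):

import re
from typing import Optional


def schema_version_from_lines(sql: str) -> Optional[int]:
    """Extract the target schema version from the schema file contents
    (minimal sketch; returns None if no version declaration is found)."""
    match = re.search(
        r'MEDIACLOUD_DATABASE_SCHEMA_VERSION\s+CONSTANT\s+INT\s*:=\s*(\d+)',
        sql)
    if match:
        return int(match.group(1))
    return None
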
Example #12
def __set_dynamic_defaults(config: dict) -> dict:
    """Fill configuration dictionary with some preset values."""
    if 'mediawords' not in config or config['mediawords'] is None:
        raise McConfigException('Configuration does not have "mediawords" key')

    if 'data_dir' not in config['mediawords'] or config['mediawords']['data_dir'] is None:
        # FIXME create a helper in 'paths'
        config['mediawords']['data_dir'] = os.path.join(mc_root_path(), 'data')

    # MC_REWRITE_TO_PYTHON: probably not needed after Python rewrite
    if 'Plugin::Authentication' not in config or config['Plugin::Authentication'] is None:
        config['Plugin::Authentication'] = {
            "default_realm": 'users',
            "users": {
                "credential": {
                    "class": 'MediaWords'
                },
                "store": {
                    "class": 'MediaWords'
                }
            }
        }

    return config
Example #13
class McChineseTokenizer(object):
    """Chinese language tokenizer that uses jieba."""

    # Path to jieba dictionary(ies)
    __dict_path = os.path.join(mc_root_path(),
                               'lib/MediaWords/Languages/resources/zh/')
    __jieba_dict_path = os.path.join(__dict_path, 'dict.txt.big')
    __jieba_userdict_path = os.path.join(__dict_path, 'userdict.txt')

    # jieba instance
    __jieba = None

    # Text -> sentence tokenizer for Chinese text
    __chinese_sentence_tokenizer = RegexpTokenizer(
        r'([^!?。]*[!?。])',
        gaps=True,  # don't discard non-Chinese text
        discard_empty=True,
    )

    # Text -> sentence tokenizer for non-Chinese (e.g. English) text
    __non_chinese_sentence_tokenizer = PunktSentenceTokenizer()

    def __init__(self):
        """Initialize jieba tokenizer."""

        self.__jieba = JiebaTokenizer()

        if not os.path.isdir(self.__dict_path):
            raise McChineseTokenizerException("""
                jieba dictionary directory was not found: %s
                Maybe you forgot to initialize Git submodules?
                """ % self.__dict_path)

        if not os.path.isfile(self.__jieba_dict_path):
            raise McChineseTokenizerException("""
                Default dictionary not found in jieba dictionary directory: %s
                Maybe you forgot to run jieba installation script?
                """ % self.__dict_path)
        if not os.path.isfile(self.__jieba_userdict_path):
            raise McChineseTokenizerException("""
                User dictionary not found in jieba dictionary directory: %s
                Maybe you forgot to run jieba installation script?
                """ % self.__dict_path)
        try:
            # loading dictionary is part of the init process
            self.__jieba.set_dictionary(self.__jieba_dict_path)
            self.__jieba.load_userdict(self.__jieba_userdict_path)
        except Exception as ex:
            raise McChineseTokenizerException(
                "Unable to initialize jieba: %s" % str(ex))

    def tokenize_text_to_sentences(self, text: str) -> list:
        """Tokenize Chinese text into sentences."""

        text = decode_object_from_bytes_if_needed(text)

        if text is None:
            log.warning("Text to tokenize into sentences is None.")
            return []

        text = text.strip()

        if len(text) == 0:
            return []

        # First split Chinese text
        chinese_sentences = self.__chinese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in chinese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split(r"\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split(r"\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Chinese text
                    non_chinese_sentences = self.__non_chinese_sentence_tokenizer.tokenize(
                        list_item)

                    sentences += non_chinese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]

        return sentences

    def tokenize_sentence_to_words(self, sentence: str) -> list:
        """Tokenize Chinese sentence into words.
        
        Removes punctuation."""

        sentence = decode_object_from_bytes_if_needed(sentence)

        if sentence is None:
            log.warning("Sentence to tokenize into words is None.")
            return []

        sentence = sentence.strip()

        if len(sentence) == 0:
            return []

        parsed_text = self.__jieba.lcut(sentence, cut_all=False)
        parsed_tokens = [x for x in parsed_text if x.strip()]
        words = []
        for parsed_token in parsed_tokens:
            # Keep only tokens that contain at least one word character,
            # i.e. skip punctuation-only tokens
            if re.search(r'\w+', parsed_token, flags=re.UNICODE) is not None:
                words.append(parsed_token)
        return words
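
A usage sketch for the tokenizer above (assumes the jieba dictionaries checked for in __init__ are installed):

tokenizer = McChineseTokenizer()
sentences = tokenizer.tokenize_text_to_sentences('你好。这是一个测试。')
for sentence in sentences:
    print(tokenizer.tokenize_sentence_to_words(sentence))
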
Example #14
def test_mc_root_path():
    root_path = mc_paths.mc_root_path()
    assert os.path.exists(root_path)
    assert os.path.isdir(root_path)
    assert os.path.isfile(os.path.join(root_path, 'mediawords.yml.dist'))
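
mc_root_path() itself is not shown in any of these examples; a minimal sketch of what such a helper typically looks like, assuming the module sits two directory levels below the repository root (the real implementation may differ):

import os


def mc_root_path() -> str:
    """Return the absolute path to the Media Cloud repository root
    (minimal sketch; assumes this file lives two levels below the root)."""
    return os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
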
Example #15
def __read_static_defaults() -> dict:
    """Return configuration defaults dictionary."""
    defaults_file_yml = os.path.join(mc_root_path(), "mediawords.yml.dist")
    static_defaults = __parse_yaml(defaults_file_yml)
    return static_defaults
Example #16
def rotate_http_request_log():
    root_path = mc_root_path()
    l.debug("Media Cloud root path: %s" % root_path)

    logs_dir = os.path.join(root_path, "data", "logs")
    if not os.path.isdir(logs_dir):
        raise Exception("Logs directory does not exist at path: %s" % logs_dir)
    l.debug("Logs path: %s" % logs_dir)

    try:
        path_to_xz = subprocess.check_output(["/bin/bash", "-c", "command -v xz"]).decode("utf-8").strip()
    except subprocess.CalledProcessError as ex:
        raise Exception('"xz" not found on the system: %s' % str(ex))
    l.info('Path to "xz": %s' % path_to_xz)

    try:
        path_to_unxz = subprocess.check_output(["/bin/bash", "-c", "command -v unxz"]).decode("utf-8").strip()
    except subprocess.CalledProcessError as ex:
        raise Exception('"unxz" not found on the system: %s' % str(ex))
    l.info('Path to "unxz": %s' % path_to_unxz)

    http_request_log_path = os.path.join(logs_dir, "http_request.log")
    if not os.path.isfile(http_request_log_path):
        raise Exception("HTTP request log does not exist at path: %s" % http_request_log_path)
    l.info("HTTP request log path: %s" % http_request_log_path)

    logrotate_state_file = os.path.join(logs_dir, "http_request-logrotate.state")
    l.debug("logrotate state file: %s" % logrotate_state_file)

    logrotate_config = """
%(http_request_log_path)s {
    daily
    size %(log_max_size)d
    rotate %(old_log_count)d
    copytruncate
    compress
    compresscmd %(path_to_xz)s
    compressext .xz
    compressoptions -9
    uncompresscmd %(path_to_unxz)s
    missingok
    notifempty
}
""" % {
        "http_request_log_path": http_request_log_path,
        "log_max_size": __LOG_MAX_SIZE,
        "old_log_count": __OLD_LOG_COUNT,
        "path_to_xz": path_to_xz,
        "path_to_unxz": path_to_unxz,
    }

    logrotate_temp_fd, logrotate_temp_config_path = tempfile.mkstemp(suffix=".conf", prefix="logrotate")
    l.debug("Temporary logtorate config path: %s" % logrotate_temp_config_path)

    with os.fdopen(logrotate_temp_fd, "w") as tmp:
        tmp.write(logrotate_config)

    l.info("Running logrotate...")
    subprocess.check_call(["logrotate", "--verbose", "--state", logrotate_state_file, logrotate_temp_config_path])

    l.debug("Cleaning up temporary logrotate config...")
    os.unlink(logrotate_temp_config_path)
Example #17
def __read_static_defaults() -> dict:
    """Return configuration defaults dictionary."""
    defaults_file_yml = os.path.join(mc_root_path(), "mediawords.yml.dist")
    static_defaults = __parse_yaml(defaults_file_yml)
    return static_defaults
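
__parse_yaml() is not shown here; a minimal sketch, assuming PyYAML and no project-specific error wrapping:

import yaml


def __parse_yaml(yaml_file: str) -> dict:
    """Parse a YAML file into a dictionary (minimal sketch; the real
    helper is assumed to wrap parse errors in a custom exception)."""
    with open(yaml_file, mode='r', encoding='utf-8') as f:
        return yaml.safe_load(f)
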
Example #18
def _word2vec_test_data_dir() -> str:
    """Return path to word2vec testing data directory."""
    return os.path.join(mc_root_path(), 'mediacloud', 'test-data', 'word2vec')
Example #19
def rotate_http_request_log():
    root_path = mc_root_path()
    log.debug('Media Cloud root path: %s' % root_path)

    logs_dir = os.path.join(root_path, 'data', 'logs')
    if not os.path.isdir(logs_dir):
        raise Exception('Logs directory does not exist at path: %s' % logs_dir)
    log.debug('Logs path: %s' % logs_dir)

    try:
        path_to_xz = subprocess.check_output(['/bin/bash', '-c', 'command -v xz']).decode('utf-8').strip()
    except subprocess.CalledProcessError as ex:
        raise Exception('"xz" not found on the system: %s' % str(ex))
    log.info('Path to "xz": %s' % path_to_xz)

    try:
        path_to_unxz = subprocess.check_output(['/bin/bash', '-c', 'command -v unxz']).decode('utf-8').strip()
    except subprocess.CalledProcessError as ex:
        raise Exception('"unxz" not found on the system: %s' % str(ex))
    log.info('Path to "unxz": %s' % path_to_unxz)

    http_request_log_path = os.path.join(logs_dir, 'http_request.log')
    if not os.path.isfile(http_request_log_path):
        raise Exception('HTTP request log does not exist at path: %s' % http_request_log_path)
    log.info('HTTP request log path: %s' % http_request_log_path)

    logrotate_state_file = os.path.join(logs_dir, 'http_request-logrotate.state')
    log.debug('logrotate state file: %s' % logrotate_state_file)

    logrotate_config = '''
%(http_request_log_path)s {
    daily
    size %(log_max_size)d
    rotate %(old_log_count)d
    copytruncate
    compress
    compresscmd %(path_to_xz)s
    compressext .xz
    compressoptions -9
    uncompresscmd %(path_to_unxz)s
    missingok
    notifempty
}
''' % {
        'http_request_log_path': http_request_log_path,
        'log_max_size': __LOG_MAX_SIZE,
        'old_log_count': __OLD_LOG_COUNT,
        'path_to_xz': path_to_xz,
        'path_to_unxz': path_to_unxz,
    }

    logrotate_temp_fd, logrotate_temp_config_path = tempfile.mkstemp(suffix='.conf', prefix='logrotate')
    log.debug('Temporary logrotate config path: %s' % logrotate_temp_config_path)

    with os.fdopen(logrotate_temp_fd, 'w') as tmp:
        tmp.write(logrotate_config)

    log.info('Running logrotate...')
    subprocess.check_call([
        'logrotate',
        '--verbose',
        '--state', logrotate_state_file,
        logrotate_temp_config_path
    ])

    log.debug('Cleaning up temporary logrotate config...')
    os.unlink(logrotate_temp_config_path)
Example #20
def _word2vec_test_data_dir() -> str:
    """Return path to word2vec testing data directory."""
    return os.path.join(mc_root_path(), 'mediacloud', 'test-data', 'word2vec')
Example #21
class McJapaneseTokenizer(object):
    """Japanese language tokenizer that uses MeCab."""

    # Path to MeCab dictionary
    # (protected and not private because used by the unit test)
    _MECAB_DICTIONARY_PATH = os.path.join(
        mc_root_path(),
        'lib/MediaWords/Languages/resources/ja/mecab-ipadic-neologd/')

    # MeCab instance
    __mecab = None

    # Text -> sentence tokenizer for Japanese text
    __japanese_sentence_tokenizer = RegexpTokenizer(
        r'([^!?。]*[!?。])',
        gaps=True,  # don't discard non-Japanese text
        discard_empty=True,
    )

    # Text -> sentence tokenizer for non-Japanese (e.g. English) text
    __non_japanese_sentence_tokenizer = PunktSentenceTokenizer()

    __MECAB_TOKEN_POS_SEPARATOR = random_string(
        length=16)  # for whatever reason tab doesn't work
    __MECAB_EOS_MARK = 'EOS'

    def __init__(self):
        """Initialize MeCab tokenizer."""

        if not os.path.isdir(self._MECAB_DICTIONARY_PATH):
            raise McJapaneseTokenizerException("""
                MeCab dictionary directory was not found: %s
                Maybe you forgot to initialize Git submodules?
                """ % self._MECAB_DICTIONARY_PATH)

        if not os.path.isfile(
                os.path.join(self._MECAB_DICTIONARY_PATH, 'sys.dic')):
            raise McJapaneseTokenizerException("""
                MeCab dictionary directory does not contain a dictionary: %s
                Maybe you forgot to run ./install/install_mecab-ipadic-neologd.sh?
                """ % self._MECAB_DICTIONARY_PATH)

        try:
            self.__mecab = MeCab.Tagger(
                '--dicdir=%(dictionary_path)s '
                '--node-format=%%m%(token_pos_separator)s%%h\\n '
                '--eos-format=%(eos_mark)s\\n' % {
                    'token_pos_separator': self.__MECAB_TOKEN_POS_SEPARATOR,
                    'eos_mark': self.__MECAB_EOS_MARK,
                    'dictionary_path': self._MECAB_DICTIONARY_PATH,
                })
        except Exception as ex:
            raise McJapaneseTokenizerException(
                "Unable to initialize MeCab: %s" % str(ex))

    def tokenize_text_to_sentences(self, text: str) -> list:
        """Tokenize Japanese text into sentences."""

        text = decode_object_from_bytes_if_needed(text)

        if text is None:
            l.warning("Text to tokenize into sentences is None.")
            return []

        text = text.strip()

        if len(text) == 0:
            return []

        # First split Japanese text
        japanese_sentences = self.__japanese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in japanese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split(r"\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split(r"\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Japanese text
                    non_japanese_sentences = self.__non_japanese_sentence_tokenizer.tokenize(
                        list_item)

                    sentences += non_japanese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]

        return sentences

    @staticmethod
    def _mecab_allowed_pos_ids() -> Dict[int, str]:
        """Return allowed MeCab part-of-speech IDs and their definitions from pos-id.def.
        
        Definitions don't do much in the language module itself, they're used by unit tests to verify that pos-id.def
        didn't change in some unexpected way and we're not missing out on newly defined POSes.
        """
        return {
            36: '名詞,サ変接続,*,*',  # noun-verbal
            38: '名詞,一般,*,*',  # noun
            40: '名詞,形容動詞語幹,*,*',  # adjectival nouns or quasi-adjectives
            41: '名詞,固有名詞,一般,*',  # proper nouns
            42: '名詞,固有名詞,人名,一般',  # proper noun, names of people
            43: '名詞,固有名詞,人名,姓',  # proper noun, surname
            44: '名詞,固有名詞,人名,名',  # proper noun, given name
            45: '名詞,固有名詞,組織,*',  # proper noun, organization
            46: '名詞,固有名詞,地域,一般',  # proper noun, region (general)
            47: '名詞,固有名詞,地域,国',  # proper noun, country name
        }

    def tokenize_sentence_to_words(self, sentence: str) -> list:
        """Tokenize Japanese sentence into words.
        
        Removes punctuation and words that don't belong to part-of-speech whitelist."""

        sentence = decode_object_from_bytes_if_needed(sentence)

        if sentence is None:
            l.warning("Sentence to tokenize into words is None.")
            return []

        sentence = sentence.strip()

        if len(sentence) == 0:
            return []

        parsed_text = self.__mecab.parse(sentence).strip()
        parsed_tokens = parsed_text.split("\n")

        allowed_pos_ids = self._mecab_allowed_pos_ids()

        words = []
        for parsed_token_line in parsed_tokens:
            if self.__MECAB_TOKEN_POS_SEPARATOR in parsed_token_line:

                primary_form_and_pos_number = parsed_token_line.split(
                    self.__MECAB_TOKEN_POS_SEPARATOR)

                primary_form = primary_form_and_pos_number[0]
                pos_number = primary_form_and_pos_number[1]

                if pos_number.isdigit():
                    pos_number = int(pos_number)

                    if pos_number in allowed_pos_ids:
                        words.append(primary_form)

            else:
                # Ignore all the "EOS" stuff
                pass

        return words
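
A usage sketch for the tokenizer above (assumes the mecab-ipadic-neologd dictionary checked for in __init__ is installed):

tokenizer = McJapaneseTokenizer()
sentences = tokenizer.tokenize_text_to_sentences('これはペンです。東京に行きました。')
for sentence in sentences:
    print(tokenizer.tokenize_sentence_to_words(sentence))
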