def __set_dynamic_defaults(config: dict) -> dict:
    """Fill configuration dictionary with some preset values."""
    if 'mediawords' not in config or config['mediawords'] is None:
        raise McConfigException('Configuration does not have "mediawords" key')

    if 'data_dir' not in config['mediawords'] or config['mediawords']['data_dir'] is None:
        # FIXME create a helper in 'paths'
        config['mediawords']['data_dir'] = os.path.join(mc_root_path(), 'data')

    # FIXME probably not needed
    if 'session' not in config or config['session'] is None:
        config['session'] = {}
    if 'storage' not in config['session'] or config['session']['storage'] is None:
        config['session']['storage'] = os.path.join(os.path.expanduser('~'), "tmp", "mediacloud-session")

    # MC_REWRITE_TO_PYTHON: probably not needed after Python rewrite
    if 'Plugin::Authentication' not in config or config['Plugin::Authentication'] is None:
        config['Plugin::Authentication'] = {
            "default_realm": 'users',
            "users": {
                "credential": {
                    "class": 'MediaWords'
                },
                "store": {
                    "class": 'MediaWords'
                }
            }
        }

    return config
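
# A minimal usage sketch (assumed, not from the source): starting from a bare
# config that has only the required 'mediawords' key, the helper fills in the
# data directory, session storage and authentication defaults in place and
# returns the same dict. The demo function name is hypothetical.
def _demo_set_dynamic_defaults():
    config = {'mediawords': {}}
    config = __set_dynamic_defaults(config)
    assert config['mediawords']['data_dir'] == os.path.join(mc_root_path(), 'data')
    assert 'Plugin::Authentication' in config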
def __templates_path() -> str:
    """Return path to Jinja2 email templates."""
    root_path = mc_root_path()
    email_templates_path = os.path.join(
        root_path, 'lib', 'MediaWords', 'Util', 'Mail', 'Message', 'Templates', 'email-templates'
    )
    if not os.path.isdir(email_templates_path):
        raise McMailTemplatesNotFound('Templates directory was not found at "%s".' % email_templates_path)
    return email_templates_path
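
# A minimal sketch (assumed usage, not from the source) of loading a message
# template from the directory returned above with Jinja2's FileSystemLoader.
# The template name 'activation_needed.tpl' and the 'full_name' variable are
# hypothetical, chosen only for illustration.
from jinja2 import Environment, FileSystemLoader

def _demo_render_email_template() -> str:
    env = Environment(loader=FileSystemLoader(__templates_path()))
    template = env.get_template('activation_needed.tpl')  # hypothetical name
    return template.render(full_name='John Doe')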
def get_config() -> dict:
    """Get configuration dictionary."""
    global __CONFIG

    if __CONFIG is not None:
        return __CONFIG

    # FIXME: This should be standardized
    set_config_file(os.path.join(mc_root_path(), "mediawords.yml"))

    # noinspection PyTypeChecker
    # FIXME inspection could still be enabled here
    return __CONFIG
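
# Usage sketch (assumed): the first call lazily loads mediawords.yml via
# set_config_file() and caches it in the module-level __CONFIG; later calls
# return the cached dict. The 'database' key is a hypothetical example key.
def _demo_read_config_value():
    config = get_config()                  # loads and caches on first call
    db_settings = config.get('database')   # hypothetical key, for illustration
    assert get_config() is config          # subsequent calls hit the cache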
def rotate_supervisor_logs():
    root_path = mc_root_path()
    l.debug('Media Cloud root path: %s' % root_path)

    config = get_config()
    child_log_dir = config['supervisor']['childlogdir']
    l.debug('Child log directory: %s' % child_log_dir)

    supervisor_logs_dir = os.path.join(root_path, child_log_dir)
    l.info('Supervisor logs path: %s' % supervisor_logs_dir)

    logrotate_state_file = os.path.join(supervisor_logs_dir, 'logrotate.state')
    l.debug('logrotate state file: %s' % logrotate_state_file)

    if not os.path.isdir(supervisor_logs_dir):
        raise Exception('Supervisor logs directory does not exist at path: %s' % supervisor_logs_dir)

    logrotate_config = '''
%(supervisor_logs_dir)s/*.log {
    size %(log_max_size)d
    rotate %(old_log_count)d
    copytruncate
    compress
    missingok
    notifempty
}
''' % {
        'supervisor_logs_dir': supervisor_logs_dir,
        'log_max_size': __LOG_MAX_SIZE,
        'old_log_count': __OLD_LOG_COUNT,
    }

    logrotate_temp_fd, logrotate_temp_config_path = tempfile.mkstemp(suffix='.conf', prefix='logrotate')
    l.debug('Temporary logrotate config path: %s' % logrotate_temp_config_path)

    with os.fdopen(logrotate_temp_fd, 'w') as tmp:
        tmp.write(logrotate_config)

    l.info('Running logrotate...')
    subprocess.check_call([
        'logrotate',
        '--verbose',
        '--state', logrotate_state_file,
        logrotate_temp_config_path,
    ])

    l.debug('Cleaning up temporary logrotate config...')
    os.unlink(logrotate_temp_config_path)
def get_path_to_data_files(subdirectory: str = '') -> str:
    """Get path to where data file(s) should be stored."""
    subdirectory = decode_object_from_bytes_if_needed(subdirectory)

    path = os.path.join(mc_root_path(), 't', 'data', subdirectory)

    # Try to create just the base directory
    if not os.path.isdir(path):
        log.warning("Creating test data directory '{}'...".format(path))
        os.mkdir(path)

    if not os.path.isdir(path):
        raise McGetPathToDataFilesException(
            "Test data file path '{}' is not a directory (or doesn't exist at all).".format(path)
        )

    return path
def schema_is_up_to_date(self) -> bool:
    """Checks if the database schema is up-to-date."""
    root_dir = mc_root_path()

    # Check if the database is empty
    db_vars_table_exists = len(self.query("""
        -- noinspection SqlResolve
        SELECT *
        FROM information_schema.tables
        WHERE table_name = 'database_variables'
    """).flat()) > 0
    if not db_vars_table_exists:
        l.info("Database table 'database_variables' does not exist, probably the database is empty at this point.")
        return True

    # Current schema version
    (current_schema_version,) = self.query("""
        SELECT value AS schema_version
        FROM database_variables
        WHERE name = 'database-schema-version'
        LIMIT 1
    """).flat()
    current_schema_version = int(current_schema_version)
    if current_schema_version == 0:
        raise McSchemaIsUpToDateException("Current schema version is 0")

    # Target schema version
    with open(os.path.join(root_dir, 'schema', 'mediawords.sql'), 'r') as f:
        sql = f.read()
    target_schema_version = schema_version_from_lines(sql)
    if not target_schema_version:
        raise McSchemaIsUpToDateException("Invalid target schema version.")

    # Check if the current schema is up-to-date
    if current_schema_version != target_schema_version:
        return self.__should_continue_with_outdated_schema(current_schema_version, target_schema_version)
    else:
        # Things are fine at this point.
        return True
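
# schema_version_from_lines() is referenced above but not shown here. A minimal
# sketch of what it might look like, assuming mediawords.sql declares its
# version in a line such as
#     MEDIACLOUD_DATABASE_SCHEMA_VERSION CONSTANT INT := 4552;
# (the exact marker and its format are assumptions, not confirmed by this code):
import re

def schema_version_from_lines(sql: str) -> int:
    """Extract the target schema version from the schema file's contents."""
    match = re.search(r'MEDIACLOUD_DATABASE_SCHEMA_VERSION CONSTANT INT := (\d+);', sql)
    if match is None:
        return 0  # treated as invalid by the caller
    return int(match.group(1))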
class McChineseTokenizer(object):
    """Chinese language tokenizer that uses jieba."""

    # Path to jieba dictionary(ies)
    __dict_path = os.path.join(mc_root_path(), 'lib/MediaWords/Languages/resources/zh/')
    __jieba_dict_path = os.path.join(__dict_path, 'dict.txt.big')
    __jieba_userdict_path = os.path.join(__dict_path, 'userdict.txt')

    # jieba instance
    __jieba = None

    # Text -> sentence tokenizer for Chinese text
    __chinese_sentence_tokenizer = RegexpTokenizer(
        r'([^!?。]*[!?。])',
        gaps=True,  # don't discard non-Chinese text
        discard_empty=True,
    )

    # Text -> sentence tokenizer for non-Chinese (e.g. English) text
    __non_chinese_sentence_tokenizer = PunktSentenceTokenizer()

    def __init__(self):
        """Initialize jieba tokenizer."""

        self.__jieba = JiebaTokenizer()

        if not os.path.isdir(self.__dict_path):
            raise McChineseTokenizerException("""
                jieba dictionary directory was not found: %s
                Maybe you forgot to initialize Git submodules?
            """ % self.__dict_path)

        if not os.path.isfile(self.__jieba_dict_path):
            raise McChineseTokenizerException("""
                Default dictionary not found in jieba dictionary directory: %s
                Maybe you forgot to run jieba installation script?
            """ % self.__dict_path)

        if not os.path.isfile(self.__jieba_userdict_path):
            raise McChineseTokenizerException("""
                User dictionary not found in jieba dictionary directory: %s
                Maybe you forgot to run jieba installation script?
            """ % self.__dict_path)

        try:
            # Loading the dictionaries is part of the init process
            self.__jieba.set_dictionary(self.__jieba_dict_path)
            self.__jieba.load_userdict(self.__jieba_userdict_path)
        except Exception as ex:
            raise McChineseTokenizerException("Unable to initialize jieba: %s" % str(ex))

    def tokenize_text_to_sentences(self, text: str) -> list:
        """Tokenize Chinese text into sentences."""
        text = decode_object_from_bytes_if_needed(text)
        if text is None:
            log.warning("Text to tokenize into sentences is None.")
            return []

        text = text.strip()
        if len(text) == 0:
            return []

        # First split Chinese text
        chinese_sentences = self.__chinese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in chinese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split(r"\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split(r"\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Chinese text
                    non_chinese_sentences = self.__non_chinese_sentence_tokenizer.tokenize(list_item)
                    sentences += non_chinese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]

        return sentences

    def tokenize_sentence_to_words(self, sentence: str) -> list:
        """Tokenize Chinese sentence into words. Removes punctuation."""
        sentence = decode_object_from_bytes_if_needed(sentence)
        if sentence is None:
            log.warning("Sentence to tokenize into words is None.")
            return []

        sentence = sentence.strip()
        if len(sentence) == 0:
            return []

        parsed_text = self.__jieba.lcut(sentence, cut_all=False)
        parsed_tokens = [x for x in parsed_text if x.strip()]

        # Keep only tokens that contain at least one word character,
        # which drops punctuation-only tokens
        words = []
        for parsed_token in parsed_tokens:
            if re.search(r'\w+', parsed_token, flags=re.UNICODE) is not None:
                words.append(parsed_token)

        return words
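
# A minimal usage sketch (assumed; the sample text and its segmentation are
# illustrative only, actual jieba output depends on the loaded dictionaries):
def _demo_chinese_tokenizer():
    tokenizer = McChineseTokenizer()
    sentences = tokenizer.tokenize_text_to_sentences('媒体云是一个开源平台。它用于研究媒体生态系统。')
    for sentence in sentences:
        words = tokenizer.tokenize_sentence_to_words(sentence)
        print(words)  # e.g. ['媒体', '云', '是', '一个', '开源', '平台'] -- punctuation dropped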
def test_mc_root_path():
    root_path = mc_paths.mc_root_path()
    assert os.path.exists(root_path)
    assert os.path.isdir(root_path)
    assert os.path.isfile(os.path.join(root_path, 'mediawords.yml.dist'))
def __read_static_defaults() -> dict:
    """Return configuration defaults dictionary."""
    defaults_file_yml = os.path.join(mc_root_path(), "mediawords.yml.dist")
    static_defaults = __parse_yaml(defaults_file_yml)
    return static_defaults
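
# __parse_yaml() is referenced above but not shown here. A minimal sketch of
# what it might look like, assuming a plain PyYAML safe_load of the file (the
# exact loader and error handling in the real helper are assumptions):
import yaml

def __parse_yaml(config_file: str) -> dict:
    """Parse a YAML configuration file into a dictionary."""
    if not os.path.isfile(config_file):
        raise McConfigException("Configuration file '%s' was not found." % config_file)
    with open(config_file, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)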
def _word2vec_test_data_dir() -> str:
    """Return path to word2vec testing data directory."""
    return os.path.join(mc_root_path(), 'mediacloud', 'test-data', 'word2vec')
def rotate_http_request_log():
    root_path = mc_root_path()
    log.debug('Media Cloud root path: %s' % root_path)

    logs_dir = os.path.join(root_path, 'data', 'logs')
    if not os.path.isdir(logs_dir):
        raise Exception('Logs directory does not exist at path: %s' % logs_dir)
    log.debug('Logs path: %s' % logs_dir)

    try:
        path_to_xz = subprocess.check_output(['/bin/bash', '-c', 'command -v xz']).decode('utf-8').strip()
    except subprocess.CalledProcessError as ex:
        raise Exception('"xz" not found on the system: %s' % str(ex))
    log.info('Path to "xz": %s' % path_to_xz)

    try:
        path_to_unxz = subprocess.check_output(['/bin/bash', '-c', 'command -v unxz']).decode('utf-8').strip()
    except subprocess.CalledProcessError as ex:
        raise Exception('"unxz" not found on the system: %s' % str(ex))
    log.info('Path to "unxz": %s' % path_to_unxz)

    http_request_log_path = os.path.join(logs_dir, 'http_request.log')
    if not os.path.isfile(http_request_log_path):
        raise Exception('HTTP request log does not exist at path: %s' % http_request_log_path)
    log.info('HTTP request log path: %s' % http_request_log_path)

    logrotate_state_file = os.path.join(logs_dir, 'http_request-logrotate.state')
    log.debug('logrotate state file: %s' % logrotate_state_file)

    logrotate_config = '''
%(http_request_log_path)s {
    daily
    size %(log_max_size)d
    rotate %(old_log_count)d
    copytruncate
    compress
    compresscmd %(path_to_xz)s
    compressext .xz
    compressoptions -9
    uncompresscmd %(path_to_unxz)s
    missingok
    notifempty
}
''' % {
        'http_request_log_path': http_request_log_path,
        'log_max_size': __LOG_MAX_SIZE,
        'old_log_count': __OLD_LOG_COUNT,
        'path_to_xz': path_to_xz,
        'path_to_unxz': path_to_unxz,
    }

    logrotate_temp_fd, logrotate_temp_config_path = tempfile.mkstemp(suffix='.conf', prefix='logrotate')
    log.debug('Temporary logrotate config path: %s' % logrotate_temp_config_path)

    with os.fdopen(logrotate_temp_fd, 'w') as tmp:
        tmp.write(logrotate_config)

    log.info('Running logrotate...')
    subprocess.check_call([
        'logrotate',
        '--verbose',
        '--state', logrotate_state_file,
        logrotate_temp_config_path,
    ])

    log.debug('Cleaning up temporary logrotate config...')
    os.unlink(logrotate_temp_config_path)
class McJapaneseTokenizer(object):
    """Japanese language tokenizer that uses MeCab."""

    # Path to MeCab dictionary
    # (protected and not private because used by the unit test)
    _MECAB_DICTIONARY_PATH = os.path.join(
        mc_root_path(),
        'lib/MediaWords/Languages/resources/ja/mecab-ipadic-neologd/'
    )

    # MeCab instance
    __mecab = None

    # Text -> sentence tokenizer for Japanese text
    __japanese_sentence_tokenizer = RegexpTokenizer(
        r'([^!?。]*[!?。])',
        gaps=True,  # don't discard non-Japanese text
        discard_empty=True,
    )

    # Text -> sentence tokenizer for non-Japanese (e.g. English) text
    __non_japanese_sentence_tokenizer = PunktSentenceTokenizer()

    __MECAB_TOKEN_POS_SEPARATOR = random_string(length=16)  # for whatever reason tab doesn't work
    __MECAB_EOS_MARK = 'EOS'

    def __init__(self):
        """Initialize MeCab tokenizer."""

        if not os.path.isdir(self._MECAB_DICTIONARY_PATH):
            raise McJapaneseTokenizerException("""
                MeCab dictionary directory was not found: %s
                Maybe you forgot to initialize Git submodules?
            """ % self._MECAB_DICTIONARY_PATH)

        if not os.path.isfile(os.path.join(self._MECAB_DICTIONARY_PATH, 'sys.dic')):
            raise McJapaneseTokenizerException("""
                MeCab dictionary directory does not contain a dictionary: %s
                Maybe you forgot to run ./install/install_mecab-ipadic-neologd.sh?
            """ % self._MECAB_DICTIONARY_PATH)

        try:
            self.__mecab = MeCab.Tagger(
                '--dicdir=%(dictionary_path)s '
                '--node-format=%%m%(token_pos_separator)s%%h\\n '
                '--eos-format=%(eos_mark)s\\n' % {
                    'token_pos_separator': self.__MECAB_TOKEN_POS_SEPARATOR,
                    'eos_mark': self.__MECAB_EOS_MARK,
                    'dictionary_path': self._MECAB_DICTIONARY_PATH,
                })
        except Exception as ex:
            raise McJapaneseTokenizerException("Unable to initialize MeCab: %s" % str(ex))

    def tokenize_text_to_sentences(self, text: str) -> list:
        """Tokenize Japanese text into sentences."""
        text = decode_object_from_bytes_if_needed(text)
        if text is None:
            l.warning("Text to tokenize into sentences is None.")
            return []

        text = text.strip()
        if len(text) == 0:
            return []

        # First split Japanese text
        japanese_sentences = self.__japanese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in japanese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split(r"\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split(r"\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Japanese text
                    non_japanese_sentences = self.__non_japanese_sentence_tokenizer.tokenize(list_item)
                    sentences += non_japanese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]

        return sentences

    @staticmethod
    def _mecab_allowed_pos_ids() -> Dict[int, str]:
        """Return allowed MeCab part-of-speech IDs and their definitions from pos-id.def.

        Definitions don't do much in the language module itself, they're used by unit tests to verify that pos-id.def
        didn't change in some unexpected way and we're not missing out on newly defined POSes.
        """
        return {
            36: '名詞,サ変接続,*,*',  # noun-verbal
            38: '名詞,一般,*,*',  # noun
            40: '名詞,形容動詞語幹,*,*',  # adjectival nouns or quasi-adjectives
            41: '名詞,固有名詞,一般,*',  # proper nouns
            42: '名詞,固有名詞,人名,一般',  # proper noun, names of people
            43: '名詞,固有名詞,人名,姓',  # proper noun, surname
            44: '名詞,固有名詞,人名,名',  # proper noun, given name
            45: '名詞,固有名詞,組織,*',  # proper noun, organization
            46: '名詞,固有名詞,地域,一般',  # proper noun in general
            47: '名詞,固有名詞,地域,国',  # proper noun, country name
        }

    def tokenize_sentence_to_words(self, sentence: str) -> list:
        """Tokenize Japanese sentence into words.

        Removes punctuation and words that don't belong to part-of-speech whitelist."""
        sentence = decode_object_from_bytes_if_needed(sentence)
        if sentence is None:
            l.warning("Sentence to tokenize into words is None.")
            return []

        sentence = sentence.strip()
        if len(sentence) == 0:
            return []

        parsed_text = self.__mecab.parse(sentence).strip()
        parsed_tokens = parsed_text.split("\n")

        allowed_pos_ids = self._mecab_allowed_pos_ids()

        words = []
        for parsed_token_line in parsed_tokens:
            if self.__MECAB_TOKEN_POS_SEPARATOR in parsed_token_line:
                primary_form_and_pos_number = parsed_token_line.split(self.__MECAB_TOKEN_POS_SEPARATOR)

                primary_form = primary_form_and_pos_number[0]
                pos_number = primary_form_and_pos_number[1]

                if pos_number.isdigit():
                    pos_number = int(pos_number)
                    if pos_number in allowed_pos_ids:
                        words.append(primary_form)
            else:
                # Ignore all the "EOS" stuff
                pass

        return words
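
# A minimal usage sketch (assumed; the sample text and the token output are
# illustrative only, actual results depend on the mecab-ipadic-neologd
# dictionary that is loaded):
def _demo_japanese_tokenizer():
    tokenizer = McJapaneseTokenizer()
    sentences = tokenizer.tokenize_text_to_sentences('メディアクラウドはオープンソースです。研究に使われます。')
    for sentence in sentences:
        words = tokenizer.tokenize_sentence_to_words(sentence)
        print(words)  # e.g. ['メディアクラウド', 'オープンソース'] -- whitelisted nouns only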