Ejemplo n.º 1
0
 def __init__(self, language="en", punkt_data_path=None):
     """
     Set up the sentence splitter by loading NLTK punkt data.

         language:        Key into the built-in language -> data path
                          table (only "en" is registered here).
         punkt_data_path: Explicit resource path passed to
                          nltk.data.load. When given, it takes
                          precedence over the language lookup.

     Problems are logged, not raised. If loading fails,
     self.sent_detector is not assigned.

     """
     self.lang2datapath = {"en": "tokenizers/punkt/english.pickle"}
     self.log = log.get_global_console_logger()
     try:
         import nltk.data
     except ImportError:
         self.log.error(
             "Cannot import NLTK data for the sentence splitter. Please "
             "check if the 'punkt' NLTK-package is installed correctly.")
         # Without nltk nothing below can succeed; stop here instead of
         # tripping over the unbound name and logging a misleading error.
         return
     if not punkt_data_path:
         try:
             punkt_data_path = self.lang2datapath[language]
         except KeyError:
             self.log.error(
                 "No sentence splitter data for language {}.".format(
                     language))
             return
     try:
         self.sent_detector = nltk.data.load(punkt_data_path)
     except Exception:
         # Report the path that was actually tried. Looking the language
         # up again here could itself raise KeyError and would name the
         # wrong file when the caller supplied punkt_data_path.
         self.log.error("Could not load sentence splitter data: {}".format(
             punkt_data_path))
Ejemplo n.º 2
0
    def __init__(self, rouge_dir=None, rouge_args=None):
        """
        Initialise a Rouge155 instance.

            rouge_dir:  Directory containing Rouge-1.5.5.pl
            rouge_args: Arguments handed through to ROUGE in place of
                        the default pyrouge arguments.

        """
        self.log = log.get_global_console_logger()
        self.__set_dir_properties()
        self._config_file = None
        # settings.ini is looked up in the directory of this module.
        package_dir = os.path.dirname(__file__)
        self._settings_file = os.path.join(package_dir, 'settings.ini')
        self.__set_rouge_dir(rouge_dir)
        cleaned_args = self.__clean_rouge_args(rouge_args)
        self.args = cleaned_args
        # Filename patterns start out unset.
        self._system_filename_pattern = None
        self._model_filename_pattern = None
Ejemplo n.º 3
0
    def __init__(self, rouge_dir=None, rouge_args=None):
        """
        Build a new Rouge155 object.

            rouge_dir:  Directory containing Rouge-1.5.5.pl
            rouge_args: Custom arguments to pass to ROUGE; use this
                        when the default pyrouge arguments are not
                        wanted.

        """
        self.log = log.get_global_console_logger()
        self.__set_dir_properties()
        self._config_file = None
        # The settings file lives alongside this source file.
        here = os.path.dirname(__file__)
        settings_path = os.path.join(here, 'settings.ini')
        self._settings_file = settings_path
        self.__set_rouge_dir(rouge_dir)
        self.args = self.__clean_rouge_args(rouge_args)
        # Neither filename pattern is configured at construction time.
        self._system_filename_pattern = self._model_filename_pattern = None
Ejemplo n.º 4
0
 def __init__(self, language="en", punkt_data_path=None):
     """
     Create the sentence splitter and load its NLTK punkt model.

         language:        Language code looked up in the built-in
                          language -> data path table (only "en" is
                          registered here).
         punkt_data_path: Resource path given to nltk.data.load. If
                          provided, the language table is not
                          consulted.

     Errors are written to the console logger instead of being raised.
     If loading fails, self.sent_detector is not assigned.

     """
     self.lang2datapath = {"en": "tokenizers/punkt/english.pickle"}
     self.log = log.get_global_console_logger()
     try:
         import nltk.data
     except ImportError:
         self.log.error(
             "Cannot import NLTK data for the sentence splitter. Please "
             "check if the 'punkt' NLTK-package is installed correctly.")
         # nltk is unavailable, so loading cannot work; bail out rather
         # than fall through to a guaranteed failure on the unbound name.
         return
     if not punkt_data_path:
         try:
             punkt_data_path = self.lang2datapath[language]
         except KeyError:
             self.log.error(
                 "No sentence splitter data for language {}.".format(
                     language))
             return
     try:
         self.sent_detector = nltk.data.load(punkt_data_path)
     except Exception:
         # Log the path that was really attempted. Indexing
         # lang2datapath again here could raise KeyError from within the
         # handler and would misreport a caller-supplied path.
         self.log.error(
             "Could not load sentence splitter data: {}".format(
                 punkt_data_path))
Ejemplo n.º 5
0
    def process(input_dir, output_dir, function):
        """
        Run function over the text of every entry in input_dir and write
        each result to a file of the same name in output_dir.

        output_dir is created if it does not exist. Files are read and
        written as UTF-8.

        """
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        console = log.get_global_console_logger()
        console.info("Processing files in {}.".format(input_dir))
        for name in os.listdir(input_dir):
            console.info("Processing {}.".format(name))
            source_path = os.path.join(input_dir, name)
            with codecs.open(source_path, "r", encoding="UTF-8") as reader:
                original_text = reader.read()
            # The input file is closed again before function runs.
            transformed_text = function(original_text)
            target_path = os.path.join(output_dir, name)
            with codecs.open(target_path, "w", encoding="UTF-8") as writer:
                writer.write(transformed_text)
        console.info("Saved processed files to {}.".format(output_dir))
Ejemplo n.º 6
0
    def process(input_dir, output_dir, function):
        """
        Transform every file found in input_dir with function and store
        the resulting output files, under the same names, in output_dir.

        A missing output_dir is created first. All file I/O uses UTF-8.

        """
        output_dir_missing = not os.path.exists(output_dir)
        if output_dir_missing:
            os.makedirs(output_dir)
        logger = log.get_global_console_logger()
        logger.info("Processing files in {}.".format(input_dir))
        entries = os.listdir(input_dir)
        for entry in entries:
            logger.info("Processing {}.".format(entry))
            with codecs.open(
                    os.path.join(input_dir, entry), "r",
                    encoding="UTF-8") as src:
                contents = src.read()
            # function maps the file's full text to the text to be written.
            result = function(contents)
            with codecs.open(
                    os.path.join(output_dir, entry), "w",
                    encoding="UTF-8") as dst:
                dst.write(result)
        logger.info("Saved processed files to {}.".format(output_dir))