Beispiel #1
0
    def save_xml(self, input_file, output_file):
        """Unpack the compressed Wikipedia dump into ``output_file``.

        The step is announced on the instance logger, the output path is
        prepared with ``io_utils.create_path`` and the extraction itself is
        delegated to ``io_utils.decompress``.
        """
        self.logger.info('Decompress wikipedia dump')

        archive, target = input_file, output_file
        io_utils.create_path(target)
        io_utils.decompress(archive, target)
Beispiel #2
0
    def save_archive(self, input_file, output_file):
        """Fetch the Wikipedia dump archive and keep a local copy.

        The step is announced on the instance logger, the output path is
        prepared with ``io_utils.create_path`` and the transfer itself is
        delegated to ``io_utils.download``.
        """
        self.logger.info('Download wikipedia dump')

        source, target = input_file, output_file
        io_utils.create_path(target)
        io_utils.download(source, target)
Beispiel #3
0
def store_csv(data, output, quoting=1):
    """Write ``data`` to a UTF-8 encoded csv file at ``output``.

    The header row is written and the index is omitted.

    Args:
        data: object exposing a pandas-style ``to_csv`` method
            (presumably a ``pandas.DataFrame``).
        output: path of the csv file to write; it is passed to
            ``io_utils.create_path`` before the file is opened.
        quoting: quoting mode forwarded to ``to_csv``; the default ``1``
            has the same value as ``csv.QUOTE_ALL``.
    """

    LOGGER.info("Store data to '{}' csv file".format(output))

    io_utils.create_path(output)
    # newline='' disables newline translation on the handle. The csv writer
    # emits its own line terminators, so without it each row would end in
    # '\r\r\n' on Windows.
    with open(output, 'w', encoding='utf-8', newline='') as ostream:
        data.to_csv(ostream, index=False, header=True, quoting=quoting)
Beispiel #4
0
def store_jsonlines(data, output):
    """Write ``data`` to ``output`` as one JSON document per line.

    Each row yielded by ``data.iterrows()`` is serialised with
    ``to_json(force_ascii=False)`` and terminated by a newline; the file
    is written with UTF-8 encoding.
    """
    LOGGER.info("Store data to '{}' json file".format(output))

    io_utils.create_path(output)
    with open(output, 'w', encoding='utf-8') as ostream:
        ostream.writelines(
            record.to_json(force_ascii=False) + '\n'
            for _, record in data.iterrows())
Beispiel #5
0
    def save_sentences(self, input_file, output_file, field, **clean_kwargs):
        """Write cleaned sentences from a Wikipedia jsonl file, one per line.

        Every value streamed by ``stream_field(input_file, field)`` is split
        with ``str_utils.sentences``; each sentence is passed through
        ``str_utils.clean_text`` together with ``clean_kwargs``. Sentences
        that come back empty are skipped, the rest are written to
        ``output_file`` with UTF-8 encoding.
        """
        self.logger.info('Extract clean sentences')

        io_utils.create_path(output_file)
        with open(output_file, 'w', encoding='utf-8') as ostream:
            for document in stream_field(input_file, field):
                for raw_sentence in str_utils.sentences(document):
                    cleaned = str_utils.clean_text(
                        raw_sentence, **clean_kwargs)
                    if not cleaned:
                        # nothing left after cleaning
                        continue
                    ostream.write(cleaned + '\n')
Beispiel #6
0
def store_text(data, output):
    """Write each entry of ``data`` on its own line of a UTF-8 text file.

    Args:
        data: list or generator of strings.
        output: path of the text file to write.

    Raises:
        ConfigError: if ``data`` is neither a list nor a generator.
    """
    accepted_types = (list, types.GeneratorType)
    if not isinstance(data, accepted_types):
        raise ConfigError(
            "Method expects list / generator object instead of {}".format(
                data.__class__))

    LOGGER.info("Store data to '{}' text file".format(output))
    io_utils.create_path(output)
    with open(output, 'w', encoding='utf-8') as ostream:
        ostream.writelines(entry + '\n' for entry in data)
Beispiel #7
0
    def save_content(self, input_file, output_file, args):
        """Extract plain text from Wikipedia xml file

        Builds a shell command that runs ``EXTRACTSCRIPT`` on ``input_file``
        with the options rendered by ``cfg_utils.expand_to_string(args)``
        plus ``-o -``, and redirects the command's standard output into
        ``output_file``.

        Args:
            input_file: path handed to the extractor script.
            output_file: path receiving the redirected output; it is passed
                to ``io_utils.create_path`` before the command runs.
            args: extractor configuration, converted to a command line
                fragment by ``cfg_utils.expand_to_string``.

        A non-zero status returned by the shell is reported through the
        logger; no exception is raised, so callers see the same control
        flow as before.
        """

        io_utils.create_path(output_file)
        command = "{} {} {} -o - > {}".format(EXTRACTSCRIPT, input_file,
                                              cfg_utils.expand_to_string(args),
                                              output_file)

        self.logger.info(
            "Extract plain text from Wikipedia "\
            "by executing the command:\n{}".format(command))

        # launch extractor script with given configuration
        status = os.system(command)
        if status != 0:
            # the status used to be discarded, hiding failed extractions
            self.logger.error(
                "Wikipedia extraction command exited with status {}".format(
                    status))
Beispiel #8
0
    def save_tokens(self, output):
        """Persist ``self.tokens`` to ``output``.

        When ``output`` ends with ``.json`` the whole mapping is dumped as
        indented JSON with sorted keys. Otherwise only the keys are written,
        sorted, one per line of a text file. Both variants use UTF-8.
        """
        io_utils.create_path(output)

        if not output.endswith('.json'):
            # plain text: sorted token list only, counts are dropped
            self.logger.info("Save {}s in text file".format(self.token))
            with open(output, 'w', encoding='utf-8') as ostream:
                ostream.writelines(
                    token + '\n' for token in sorted(self.tokens))
            return

        # json: tokens together with their counts
        self.logger.info("Save {} counts in json file".format(self.token))
        dump_options = dict(ensure_ascii=False, indent=4, sort_keys=True)
        with open(output, 'w', encoding='utf-8') as ostream:
            json.dump(self.tokens, ostream, **dump_options)
Beispiel #9
0
    def test_create_delete(self):
        """Exercise the create / check / delete helpers of ``io_utils``.

        Covers ``create_path``, ``create_folder``, ``delete_file`` and
        ``delete_folder`` and verifies the outcome through
        ``check_file_readable`` / ``check_folder_readable``.
        """
        # self.empty_file and self.cur_dir are fixtures defined outside this
        # method (presumably in setUp -- not visible here).
        io_utils.create_path(self.empty_file)
        io_utils.create_folder(self.cur_dir)

        # the check helpers return None when the target is readable
        self.assertEqual(None, io_utils.check_file_readable(self.empty_file))
        self.assertEqual(None, io_utils.check_folder_readable(self.cur_dir))

        # relative paths: the folder is created under the current working
        # directory of the test run
        cfolder = 'test_folder'
        cfile = os.path.join(cfolder, 'test_file.txt')

        # create_path is called before the file itself is opened for writing
        io_utils.create_path(cfile)
        with open(cfile, 'w') as ostream:
            ostream.write('')
        self.assertEqual(None, io_utils.check_file_readable(cfile))

        # after deletion the readability check must raise with this message
        io_utils.delete_file(cfile)
        with self.assertRaises(Exception) as context:
            io_utils.check_file_readable(cfile)
        self.assertTrue('missing or not readable' in str(context.exception))

        # delete and re-create the folder, then confirm it is readable again
        io_utils.delete_folder(cfolder)
        io_utils.create_folder(cfolder)
        self.assertEqual(None, io_utils.check_folder_readable(cfolder))

        # a deleted folder must be reported as missing
        io_utils.delete_folder(cfolder)
        with self.assertRaises(Exception) as context:
            io_utils.check_folder_readable(cfolder)
        self.assertTrue('missing' in str(context.exception))

        # NOTE(review): no assertion follows in the visible code; presumably
        # this checks that create_path copes with a bare file name that has
        # no directory component -- confirm.
        io_utils.create_path('nofile.txt')