def process(files):
    corpus_store = corpus_utils.CorpusStore(func_validate=func_validate)
    for idx, file in enumerate(files):
        f_abst = file[0]  # absolute path
        f_rel = file[1]   # relative path
        print("(", idx, "of", len(files), ") file", f_rel)
        file_content = file_utils.read_file_any_encoding(f_abst)
        if len(file_content) == 0:
            continue
        # 1st, process the data with Atomic Parser
        parser = pat.Parser(corpus_store)
        parser.parse(file_content)
        # Process the same data with Atomic HeaderBody Parser
        # parser = pah.Parser(corpus_store)
        # parser.parse(file_content)
        # Process the same data with HeaderBody Parser
        parser = phb.Parser(corpus_store, target_tag)
        parser.parse(file_content)
    # Export the parsed data into file
    print("Exporting the result...")
    return corpus_store.export_corpus(output_dir, size_limit_KB=size_limit_KB)
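# Context sketch (assumption): process() above closes over func_validate, target_tag,
# output_dir and size_limit_KB, so it is presumably nested inside a driver similar to
# the hypothetical parse_files() below. The driver's name, its defaults and the "html"
# extension filter are placeholders for illustration, not part of this module.
def parse_files(input_dir, output_dir, target_tag='h1', func_validate=None, size_limit_KB=None):
    def process(files):
        ...  # body as defined above
    # Collect the files under input_dir (with relative paths), then parse and export them.
    files = file_utils.get_filelist_in_path("html", input_dir, True)
    return process(files)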
def test_split(self):
    corpus_store = corpus_utils.CorpusStore()
    corpus_store.store_data(
        ['source a', 'source b', 'source c', 'source d'],
        ['target a', 'target b', 'target c', 'target d'])

    ratio = (0.5, 0.5)
    actual = corpus_store.split(ratio)
    self.assertEqual(len(actual), 2)
    expected = [['source a', 'target a'], ['source b', 'target b']]
    self.assertListEqual(expected, actual[0].data)
    expected = [['source c', 'target c'], ['source d', 'target d']]
    self.assertListEqual(expected, actual[1].data)

    ratio = (0.5, 0.25, 0.25)
    actual = corpus_store.split(ratio)
    self.assertEqual(len(actual), 3)
    expected = [['source a', 'target a'], ['source b', 'target b']]
    self.assertListEqual(expected, actual[0].data)
    expected = [['source c', 'target c']]
    self.assertListEqual(expected, actual[1].data)
    expected = [['source d', 'target d']]
    self.assertListEqual(expected, actual[2].data)

    corpus_store.clear()
def test_store_data(self):
    corpus_store = corpus_utils.CorpusStore()

    # If the input to the corpus store is either
    # - a line of text or
    # - a list of text
    # the result will be the same shaped list of text.
    corpus_store.store_data('source text', 'target text')
    self.assertTrue(corpus_store.data, ['source text', 'target text'])
    corpus_store.clear()
    corpus_store.store_data(['source text'], ['target text'])
    self.assertTrue(corpus_store.data, ['source text', 'target text'])
    corpus_store.clear()

    # The result is broken into multiple lines if multiple inputs are stored.
    corpus_store.store_data(['source text1', 'source text2'],
                            ['target text1', 'target text2'])
    self.assertTrue(corpus_store.data,
                    [['source text1', 'target text1'], ['source text2', 'target text2']])
    corpus_store.clear()

    # The result is broken into space (' ') separated words.
    # If the input text contains multi-byte characters, each character is separated.
    corpus_store.store_data(['source text', 'ソース'], ['target text', 'ターゲット'])
    self.assertTrue(corpus_store.data,
                    [['source text', 'target text'], ['ソ ー ス', 'ターゲット']])
    corpus_store.clear()

    # The input is a single line of source and multiple lines of target.
    corpus_store.store_data('source line', 'target line1\ntarget line2')
    self.assertTrue(corpus_store.data, ['source line', 'target line1 <br> target line2'])
    corpus_store.clear()
def generate(output_dir, myname, yourname, pair_loaders, func_validate=None, size_limit_KB=None):
    """ Generate the corpus files from template resources of data.
        The template resources include salute, nodding, themed conversation etc.
        myname replaces the '{myname}' tag appearing in the sentences. {myname} is a phrase
        used to refer to yourself, for example, "Hi, my name is {myname}."
        yourname replaces the '{yourname}' tag appearing in the sentences. {yourname} is a phrase
        used when someone calls you, for example, "Hi {yourname}, nice to see you."
        pair_loaders is a list of sentence-pair resource loaders. Sentence pair resources are
        collections of source/target text pairs that are used to generate corpus data.
        func_validate is a function that takes a source/target text pair and returns the
        validated and cleaned texts. Only validated texts are stored into the corpus.
        If omitted, all texts are stored.
        size_limit_KB is the size limit of each exported corpus file, in kilobytes (1024 bytes).
        If the size exceeds the limit, the corpus data is divided and multiple files are exported.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    corpus_store = corpus_utils.CorpusStore(func_validate=func_validate)

    def process(pairs):
        """ Loop through the list of src/tgt pairs, replace the resource tags
            (name, city etc.) and store the result into CorpusStore.
        """
        multipliers = [
            mpl.NameMultiplier(),
            mpl.CityMultiplier(),
            mpl.CountryMultiplier(),
            mpl.LocationMultiplier(),
            mpl.ThingMultiplier()
        ]
        for src, tgt in pairs:
            src = src.replace('{myname}', myname)
            tgt = tgt.replace('{myname}', myname)
            src = src.replace('{yourname}', yourname)
            tgt = tgt.replace('{yourname}', yourname)
            srcs = [src]
            tgts = [tgt]
            for multiplier in multipliers:
                srcs, tgts = multiplier.multiply(srcs, tgts)
            corpus_store.store_data(srcs, tgts)

    for pair_loader in pair_loaders:
        pairs = pair_loader.load()
        print(len(pairs), "pairs to process...")
        process(pairs)

    print("Exporting corpus...")
    exported_files = corpus_store.export_corpus(output_dir, size_limit_KB=size_limit_KB)
    corpus_store.print_report()
    print("Exported:", exported_files)
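# Usage sketch (illustrative only): a minimal, hypothetical driver for generate().
# The SampleSaluteLoader class, the drop_blank validator, the names and the paths below
# are assumptions made for illustration; only generate()'s signature and the loader /
# validator contracts described in its docstring come from this module.
class SampleSaluteLoader:
    """Hypothetical pair loader: load() returns a list of (source, target) sentence pairs."""
    def load(self):
        return [
            ('Hi, my name is {myname}.', 'Nice to meet you, {yourname}.'),
            ('Good morning, {yourname}.', 'Good morning. How are you today?'),
        ]

def drop_blank(source, target):
    """Hypothetical validator: reject the pair if either side is blank, otherwise return it cleaned."""
    source, target = source.strip(), target.strip()
    if not source or not target:
        return '', ''
    return source, target

if __name__ == '__main__':
    generate('./corpus_out', 'Alice', 'Bob', [SampleSaluteLoader()],
             func_validate=drop_blank, size_limit_KB=500)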
def test_parse_h3(self):
    """Test H3"""
    vocab = vocab_utils.VocabStore()
    corpus_store = corpus_utils.CorpusStore(vocab)
    parser = phb.Parser(corpus_store, 'h3')
    parser.parse(self.htmlStruct)
    expected = [
        ['H3-A', 'body H3-A-1'],
        ['H3-A', 'body H3-A-1']
    ]
    self.assertListEqual(expected, corpus_store.data)
def test_parse_h1(self):
    """Test H1"""
    vocab = vocab_utils.VocabStore()
    corpus_store = corpus_utils.CorpusStore(vocab)
    parser = phb.Parser(corpus_store, 'h1')
    parser.parse(self.htmlStruct)
    expected = [
        ['H1-A', 'body H1-A-1\nbody H2-A-1\nbody H2-B-1\nbody H2-B-2\nbody H3-A-1\nbody H2-C-1'],
        ['H1-B', 'body H3-A-1\nbody H2-A-1']
    ]
    self.assertListEqual(expected, corpus_store.data)
def test_AtomicParserJpn(self):
    """Test the targeted atomic parser with a Japanese sentence."""
    vocab = vocab_utils.VocabStore()
    corpus_store = corpus_utils.CorpusStore(vocab)
    parser = pat.Parser(corpus_store)
    parser.parse(self.html)
    expected = [
        ['test', 'ヘッダー1'],
        ['ヘッダー1', 'こんにちは。'],
        ['こんにちは。', 'さようなら。'],
    ]
    self.assertListEqual(expected, corpus_store.data)
    expected = ['test', 'ヘッダー', '1', 'こんにちは', '。', 'さようなら']
    self.assertListEqual(expected, vocab.words_new)
def test_validation(self):
    """ Test if validation works.
        The validator checks the src and tgt texts, and the texts are stored
        only if they are valid.
    """
    def validator(source, target):
        if 'boom' in source or 'boom' in target:
            return '', ''
        else:
            return source, target

    corpus_store = corpus_utils.CorpusStore(func_validate=validator)
    corpus_store.store_data('source text', 'target text')
    corpus_store.store_data('boom', 'target text')
    corpus_store.store_data('source text', 'boom')
    self.assertTrue(corpus_store.data, ['source text', 'target text'])
    corpus_store.clear()
def test_AtomicHeaderBodyParser(self):
    """Test the targeted atomic parser which is going to be in actual use."""
    vocab = vocab_utils.VocabStore()
    corpus_store = corpus_utils.CorpusStore(vocab)
    parser = pah.Parser(corpus_store)
    parser.parse(self.htmlStruct)
    expected = [
        ['H4-A', 'body H4-A-1'],
        ['H1-A', 'body H1-A-1'],
        ['H1-A', 'H3-A'],
        ['H1-A', 'H2-A'],
        ['H3-A', 'body H3-A-1'],
        ['H2-A', 'body H2-A-1'],
        ['H1-B', 'body H1-B-1']
    ]
    self.assertListEqual(expected, corpus_store.data)
def test_AtomicHeaderBodyParserJpn(self):
    """Test the targeted atomic parser with a Japanese sentence."""
    vocab = vocab_utils.VocabStore()
    corpus_store = corpus_utils.CorpusStore(vocab)
    parser = pah.Parser(corpus_store)
    html = ('<html><head><title>test</title></head>'
            '<body>'
            '<h1>ヘッダー1</h1>'
            '<p>こんにちは。さようなら。</p>'
            '</body></html>')
    parser.parse(html)
    expected = [
        ['ヘッダー1', 'こんにちは。'],
        ['ヘッダー1', 'さようなら。']
    ]
    self.assertListEqual(expected, corpus_store.data)
def test_parse_script(self):
    """Test script element within a font element"""
    vocab = vocab_utils.VocabStore()
    corpus_store = corpus_utils.CorpusStore(vocab)
    parser = phb.Parser(corpus_store)
    html = ('<html><head><title>test</title></head>'
            '<body>'
            '<h1>H1-A</h1>'
            '<font>font element body'
            '<script>script shouldn\'t be parsed</script>'
            '</font>'
            '</body></html>')
    parser.parse(html)
    expected = [
        ['H1-A', 'font element body'],
    ]
    self.assertListEqual(expected, corpus_store.data)
def compile(input_path, vocab_path, output_dir):
    """ Compile the corpus files and generate a set of NMT data files (train/dev/test).
        input_path is the corpus data, either a folder path or a file path, given as an absolute path.
        vocab_path is the vocabulary file, either a folder path or a file path, given as an absolute path.
        If a folder path is given, the file name defaults to 'vocab.src'.
        output_dir is the path to the folder where the data set is generated.
    """
    # Create the output directory if it does not exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Create the vocab file directory if it does not exist, and get the file path.
    if not os.path.isfile(vocab_path):
        if not os.path.exists(vocab_path):
            os.makedirs(vocab_path)
        vocab_path = os.path.join(vocab_path, 'vocab.src')

    # Store the compilation details into a log file.
    with open(os.path.join(output_dir, 'compile.log'), 'w') as lf:
        def log_print(*arg):
            """ Log print function. """
            texts = [str(elem) for elem in arg]
            log = ' '.join(texts)
            print(log)
            timeString = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            lf.write("{0} {1}\n".format(timeString, log))

        if os.path.isfile(input_path):
            log_print("The input file is", input_path)
            log_print("The vocab file is", vocab_path)
            log_print("The output directory is", output_dir)
            files = [[input_path, os.path.basename(input_path)]]
        else:
            input_dir = input_path
            log_print("The input directory is", input_dir)
            log_print("The vocab file is", vocab_path)
            log_print("The output directory is", output_dir)
            log_print("Searching corpus files in the input directory...")
            files = file_utils.get_filelist_in_path("cor", input_dir, True)

        vocab = vocab_utils.VocabStore(vocab_path)
        corpus_store = corpus_utils.CorpusStore(vocab)
        log_print("Total", len(files), "files to process. Loading...")
        for idx, file in enumerate(files):
            f_abst = file[0]  # absolute path
            f_rel = file[1]   # relative path
            log_print("(", idx, "of", len(files), ") file", f_rel)
            # Import and restore the corpus store.
            # Don't restore the vocabulary here; it's time consuming. It'll be restored during export later on.
            corpus_store.import_corpus(f_abst, False)

        # Split the corpus data randomly into 3 blocks - train, dev and test.
        # The distribution ratio is train 98%, dev 1% and test 1%.
        # Be careful not to make the dev and test files too big, otherwise Tensorflow training
        # fails with out-of-memory (even on a GPU machine).
        train, dev, test = corpus_store.split_rnd((0.98, 0.01, 0.01))

        def process(corpus_store, subject, size_limit_KB=None):
            """ size_limit_KB is the limit of the file size to be written, in kilobytes (1024 bytes). """
            # Export the corpus data into a file. The vocabulary is also restored here.
            log_print("Exporting the", subject, "data into file...")
            corpus_store.export_to_file(output_dir, subject, size_limit_KB, True)
            corpus_store.print_report(log_print)

        # Generate each file set
        process(train, "train")
        process(dev, "dev", 100)
        process(test, "test", 100)

        # Generate the vocabulary file that contains the words detected in all 3 file sets.
        vocab.sort_by_unicode()
        vocab.save_to_file()
        vocab.print_report(log_print)
        vocab.save_unicode_list(vocab_path + '.txt')
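# Usage sketch (illustrative only): how compile() might be invoked once the generator has
# written *.cor corpus files. The three paths below are placeholders, not part of the module.
compile(os.path.abspath('./corpus_out'),   # input_path: folder containing the *.cor corpus files
        os.path.abspath('./vocab'),        # vocab_path: 'vocab.src' is created here if missing
        os.path.abspath('./nmt_data'))     # output_dir: train/dev/test data files are written here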