def test_create_training_data_from_txt(self):
    """End-to-end check of TrainingDataFactory.create() with TXT sources.

    For every row in wpg_data.csv, parses the matching EPUB to narrative
    text, pre-processes it, and writes it to a temporary .txt file named
    after the ISBN (row[1]).  Then builds training data from those txt
    files and asserts the expected matrix shapes, cleaning up the
    temporary files afterwards.
    """
    training_data_fact = training_data_factory.TrainingDataFactory()
    parser = epub_to_txt_parser.EPubToTxtParser()
    text_preprocessor = txt_pre_processor.TxtPreProcessor()
    tmp_txt_files = []
    with open(os.path.join(sfsf_config.get_data_dir(), 'wpg_data.csv'),
              'r', encoding="utf-8") as csv_infile:
        csv_reader = csv.reader(csv_infile, delimiter=',', quotechar='"')
        headers = next(csv_reader)  # skip the CSV header row
        for row in csv_reader:
            # row[1] is the ISBN used to locate the epub and name the txt.
            text = parser.narrative_from_epub_to_txt(
                training_data_fact.lookup_epub_filename(row[1]))
            text = text_preprocessor.transform(text)
            tmp_txt_file_name = os.path.join(
                sfsf_config.get_txt_dir(), '{i}.txt'.format(i=row[1]))
            tmp_txt_files.append(tmp_txt_file_name)
            # Context manager guarantees the handle is closed even if
            # write() raises — the original open/close pair leaked on error.
            with open(tmp_txt_file_name, 'w', encoding='utf-8') as txt_file:
                txt_file.write(text)
            print(row[1])
    training_result = training_data_fact.create(
        'wpg_data.csv', 2, source=sfsf_config.TXT)
    for file_name in tmp_txt_files:
        os.remove(file_name)
    self.assertEqual((253, 21627), training_result['x'].shape)
    self.assertEqual((253, ), training_result['y'].shape)
    self.assertEqual(253, len(training_result['isbns']))
def print_info(items, message):
    """Print a tab-separated size/word-count report for the given items.

    For each item, looks up the txt file named after item[1] (presumably
    an ISBN — confirm against caller) in the txt directory, then prints
    one line per item, sorted ascending by file size: id, item[4],
    size in KiB, whitespace-delimited word count, and item[2].
    """
    file_sizes = []
    for item in items:
        file_size = os.path.getsize(
            os.path.join(sfsf_config.get_txt_dir(),
                         '{i}.txt'.format(i=item[1])))
        file_sizes.append([item[1], item[4], file_size, item[2]])
    file_sizes.sort(key=lambda x: int(x[2]))
    print('-------', message, '------')
    for item in file_sizes:
        # `with` closes the handle — the original never closed it (leak).
        with open(os.path.join(sfsf_config.get_txt_dir(),
                               '{i}.txt'.format(i=item[0])),
                  'r', encoding="utf-8") as txt_file:
            # Raw string for the regex: '\s' in a plain literal is an
            # invalid escape sequence in modern Python.
            nwords = len(re.findall(r'\s+', txt_file.read()))
        print(item[0], item[1], round(item[2] / 1024), nwords, item[3],
              sep='\t')
def sample_txts(self, isbn_data, sample_size):
    """Read each ISBN's txt file and return a list of text samples.

    For every isbn_info row, opens '<isbn>.txt' (isbn_info[1]) in the txt
    directory and delegates to self.sample_string to extract a sample of
    sample_size.  Rows whose txt file is missing are skipped with a
    message rather than aborting the whole run.
    """
    samples = []
    for isbn_info in isbn_data:
        file_path = os.path.join(sfsf_config.get_txt_dir(),
                                 '{i}.txt'.format(i=isbn_info[1]))
        try:
            # Context manager replaces the original open/read/close so the
            # handle is released even if read() raises.
            with open(file_path, 'r', encoding='utf-8',
                      errors='ignore') as txt_file:
                narrative_text = txt_file.read()
            samples.append(
                self.sample_string(isbn_info[1], narrative_text,
                                   sample_size))
        except FileNotFoundError:
            # Best-effort: missing source files are reported, not fatal.
            print("Skipping ISBN {0}".format(isbn_info[1]))
    return samples
def test_config(self):
    """Verify sfsf_config constants and environment-dependent directories."""
    # Constant values exposed by the config module.
    self.assertEqual('txt', sfsf_config.TXT)
    self.assertEqual('epub', sfsf_config.EPUB)
    self.assertEqual('production', sfsf_config.PRODUCTION)
    self.assertEqual('test', sfsf_config.DEVELOPMENT)

    # Build the expected paths relative to this test file's parent dir.
    here = os.path.dirname(os.path.abspath(__file__))
    base_dir = os.path.abspath(os.path.join(here, os.pardir))
    expected_test_dir = os.path.join(base_dir, 'sfsf/../data/test')
    expected_prod_dir = os.path.join(base_dir, 'sfsf/../data/production')
    expected_epub_dir = os.path.join(base_dir, 'sfsf/../data/test/epub')
    expected_txt_dir = os.path.join(base_dir, 'sfsf/../data/test/txt')

    # Default environment points at the test data directory.
    self.assertEqual(expected_test_dir, sfsf_config.get_data_dir())

    # Switching to production changes the data dir accordingly.
    sfsf_config.set_env(sfsf_config.PRODUCTION)
    self.assertEqual(expected_prod_dir, sfsf_config.get_data_dir())

    # Switching back to development restores the test paths.
    sfsf_config.set_env(sfsf_config.DEVELOPMENT)
    self.assertEqual(expected_test_dir, sfsf_config.get_data_dir())
    self.assertEqual(expected_epub_dir, sfsf_config.get_epub_dir())
    self.assertEqual(expected_txt_dir, sfsf_config.get_txt_dir())
from sfsf import epub_to_txt_parser
from sfsf import txt_pre_processor
from sfsf import training_data_factory

# sfsf_config.set_env( sfsf_config.DEVELOPMENT )

# Batch-convert every EPUB listed in wpg_data.csv to a pre-processed
# txt file named after its ISBN (row[1]).  Failures on individual rows
# are reported with a traceback and the batch continues.
with open(os.path.join(sfsf_config.get_data_dir(), 'wpg_data.csv'),
          'r', encoding="utf-8") as csv_infile:
    training_data_fact = training_data_factory.TrainingDataFactory()
    parser = epub_to_txt_parser.EPubToTxtParser()
    text_preprocessor = txt_pre_processor.TxtPreProcessor()
    csv_reader = csv.reader(csv_infile, delimiter=',', quotechar='"')
    tmp_txt_files = []
    headers = next(csv_reader)  # skip the CSV header row
    for row in csv_reader:
        try:
            text = parser.narrative_from_epub_to_txt(
                training_data_fact.lookup_epub_filename(row[1]))
            text = text_preprocessor.transform(text)
            tmp_txt_file_name = os.path.join(
                sfsf_config.get_txt_dir(), '{i}.txt'.format(i=row[1]))
            tmp_txt_files.append(tmp_txt_file_name)
            # Context manager closes the file even if write() raises.
            with open(tmp_txt_file_name, 'w', encoding='utf8') as txt_file:
                txt_file.write(text)
            print(row[1], end=' ')
        except Exception:
            # `except Exception` instead of a bare `except:` so that
            # KeyboardInterrupt/SystemExit still abort the batch; per-row
            # errors are logged with a traceback and processing continues.
            print('\n', 'Caught an error for {r}'.format(r=row[1]), '\n',
                  traceback.format_exc())