Ejemplo n.º 1
0
 def test_create_training_data_from_txt(self):
     """Build training data from temporary .txt files and check its shape.

     Every data row of ``wpg_data.csv`` is run through the EPUB parser and
     the text preprocessor, and the result is written to ``<row[1]>.txt``
     in the configured txt directory. Training data is then created from
     those text files, the temporary files are deleted again, and the
     shapes of the result are asserted.
     """
     training_data_fact = training_data_factory.TrainingDataFactory()
     parser = epub_to_txt_parser.EPubToTxtParser()
     text_preprocessor = txt_pre_processor.TxtPreProcessor()
     csv_path = os.path.join(sfsf_config.get_data_dir(), 'wpg_data.csv')
     tmp_txt_files = []
     try:
         with open(csv_path, 'r', encoding="utf-8") as csv_infile:
             csv_reader = csv.reader(csv_infile,
                                     delimiter=',',
                                     quotechar='"')
             next(csv_reader)  # skip the header row
             for row in csv_reader:
                 # row[1] is the lookup key for the EPUB and the file stem
                 # (presumably the ISBN -- confirm against the CSV schema).
                 text = parser.narrative_from_epub_to_txt(
                     training_data_fact.lookup_epub_filename(row[1]))
                 text = text_preprocessor.transform(text)
                 tmp_txt_file_name = os.path.join(
                     sfsf_config.get_txt_dir(),
                     '{i}.txt'.format(i=row[1]))
                 # Register before writing so a partially written file is
                 # still cleaned up.
                 tmp_txt_files.append(tmp_txt_file_name)
                 with open(tmp_txt_file_name, 'w',
                           encoding='utf-8') as txt_file:
                     txt_file.write(text)
                 print(row[1])
         training_result = training_data_fact.create('wpg_data.csv',
                                                     2,
                                                     source=sfsf_config.TXT)
     finally:
         # Remove the temporary files even when conversion or create()
         # raised, so a failing run does not pollute the txt directory.
         for file_name in tmp_txt_files:
             if os.path.exists(file_name):
                 os.remove(file_name)
     self.assertEqual((253, 21627), training_result['x'].shape)
     self.assertEqual((253, ), training_result['y'].shape)
     self.assertEqual(253, len(training_result['isbns']))
Ejemplo n.º 2
0
def print_info(items, message):
    """Print a size-sorted, tab-separated report about the items' text files.

    For every item the file ``<item[1]>.txt`` in the configured txt
    directory is inspected. After a banner line containing *message*, one
    row per item is printed, smallest file first, with these columns:
    ``item[1]``, ``item[4]``, the file size divided by 1024 and rounded,
    the number of whitespace runs in the file, and ``item[2]``.

    Args:
        items: Iterable of indexable records; indices 1, 2 and 4 are read.
        message: Text shown in the banner line.
    """
    records = []
    for item in items:
        txt_path = os.path.join(sfsf_config.get_txt_dir(),
                                '{i}.txt'.format(i=item[1]))
        records.append(
            (item[1], item[4], os.path.getsize(txt_path), item[2], txt_path))

    # Stable sort on the byte size only, so equally sized files keep their
    # input order.
    records.sort(key=lambda record: record[2])
    print('-------', message, '------')
    for stem, label, file_size, extra, txt_path in records:
        # The context manager closes each handle; the original leaked one
        # open file per item.
        with open(txt_path, 'r', encoding="utf-8") as txt_file:
            # Counts runs of whitespace as an approximation of the word
            # count. Raw string avoids the invalid "\s" escape warning.
            nwords = len(re.findall(r'\s+', txt_file.read()))
        print(stem,
              label,
              round(file_size / 1024),
              nwords,
              extra,
              sep='\t')
Ejemplo n.º 3
0
 def sample_txts(self, isbn_data, sample_size):
     """Collect one sample per record whose text file exists.

     For each record, ``<isbn_info[1]>.txt`` is read from the configured
     txt directory (undecodable bytes are ignored) and handed to
     ``self.sample_string`` together with ``isbn_info[1]`` and
     *sample_size*. Records whose file is missing are reported on stdout
     and skipped.

     Args:
         isbn_data: Iterable of indexable records; index 1 is used as the
             file stem.
         sample_size: Passed through unchanged to ``self.sample_string``.

     Returns:
         List of the values returned by ``self.sample_string``, in input
         order.
     """
     samples = []
     for isbn_info in isbn_data:
         try:
             txt_path = os.path.join(sfsf_config.get_txt_dir(),
                                     '{i}.txt'.format(i=isbn_info[1]))
             # ``with`` guarantees the handle is closed even if read()
             # raises; the original leaked it in that case.
             with open(txt_path, 'r', encoding='utf-8',
                       errors='ignore') as txt_file:
                 narrative_text = txt_file.read()
             samples.append(
                 self.sample_string(isbn_info[1], narrative_text,
                                    sample_size))
         except FileNotFoundError:
             print("Skipping ISBN {0}".format(isbn_info[1]))
     return samples
Ejemplo n.º 4
0
 def test_config(self):
     """Check sfsf_config's constants and its environment-dependent paths."""
     self.assertEqual('txt', sfsf_config.TXT)
     self.assertEqual('epub', sfsf_config.EPUB)
     self.assertEqual('production', sfsf_config.PRODUCTION)
     self.assertEqual('test', sfsf_config.DEVELOPMENT)
     # Expected paths are built relative to the parent of this file's
     # directory.
     path_to_here = os.path.abspath(
         os.path.join(os.path.dirname(os.path.abspath(__file__)),
                      os.pardir))
     path_to_test = os.path.join(path_to_here, 'sfsf/../data/test')
     path_to_data = os.path.join(path_to_here, 'sfsf/../data/production')
     path_to_test_epub = os.path.join(path_to_here,
                                      'sfsf/../data/test/epub')
     path_to_test_txt = os.path.join(path_to_here, 'sfsf/../data/test/txt')
     # Before any set_env call the data dir must be the test one.
     self.assertEqual(path_to_test, sfsf_config.get_data_dir())
     sfsf_config.set_env(sfsf_config.PRODUCTION)
     try:
         self.assertEqual(path_to_data, sfsf_config.get_data_dir())
     finally:
         # Always switch back: a failed assertion must not leave the
         # module-wide environment pointing at production for later tests.
         sfsf_config.set_env(sfsf_config.DEVELOPMENT)
     self.assertEqual(path_to_test, sfsf_config.get_data_dir())
     self.assertEqual(path_to_test_epub, sfsf_config.get_epub_dir())
     self.assertEqual(path_to_test_txt, sfsf_config.get_txt_dir())
Ejemplo n.º 5
0
from sfsf import epub_to_txt_parser
from sfsf import txt_pre_processor
from sfsf import training_data_factory

# sfsf_config.set_env( sfsf_config.DEVELOPMENT )

# Run every data row of wpg_data.csv through the EPUB parser and the text
# preprocessor, and write the result to <row[1]>.txt in the configured txt
# directory. A failure for one row is reported and does not stop the run.
with open(os.path.join(sfsf_config.get_data_dir(), 'wpg_data.csv'),
          'r',
          encoding="utf-8") as csv_infile:
    training_data_fact = training_data_factory.TrainingDataFactory()
    parser = epub_to_txt_parser.EPubToTxtParser()
    text_preprocessor = txt_pre_processor.TxtPreProcessor()
    csv_reader = csv.reader(csv_infile, delimiter=',', quotechar='"')
    tmp_txt_files = []
    headers = next(csv_reader)  # consume the header row
    for row in csv_reader:
        try:
            text = parser.narrative_from_epub_to_txt(
                training_data_fact.lookup_epub_filename(row[1]))
            text = text_preprocessor.transform(text)
            tmp_txt_file_name = os.path.join(sfsf_config.get_txt_dir(),
                                             '{i}.txt'.format(i=row[1]))
            tmp_txt_files.append(tmp_txt_file_name)
            # The context manager closes the file even if write() fails.
            with open(tmp_txt_file_name, 'w', encoding='utf8') as txt_file:
                txt_file.write(text)
            print(row[1], end=' ')
        except Exception:
            # Best effort per row. Catching Exception rather than using a
            # bare except lets Ctrl-C (KeyboardInterrupt) and SystemExit
            # still abort the run.
            print('\n', 'Caught an error for {r}'.format(r=row[1]), '\n',
                  traceback.format_exc())