Esempio n. 1
0
 def test_create_training_data_from_txt(self):
     """Build training data from generated .txt files and check its shape.

     For every data row of ``wpg_data.csv`` the EPUB looked up via
     ``row[1]`` is converted to text, pre-processed and written to
     ``<txt dir>/<row[1]>.txt``. The factory is then run with
     ``source=sfsf_config.TXT`` (presumably so that it reads those
     files -- confirm against ``TrainingDataFactory.create``).

     The generated files are removed in a ``finally`` clause so that a
     failing conversion or factory call does not leave them behind.
     """
     training_data_fact = training_data_factory.TrainingDataFactory()
     parser = epub_to_txt_parser.EPubToTxtParser()
     text_preprocessor = txt_pre_processor.TxtPreProcessor()
     tmp_txt_files = []
     try:
         with open(os.path.join(sfsf_config.get_data_dir(), 'wpg_data.csv'),
                   'r',
                   encoding="utf-8") as csv_infile:
             csv_reader = csv.reader(csv_infile,
                                     delimiter=',',
                                     quotechar='"')
             next(csv_reader)  # skip the header row
             for row in csv_reader:
                 text = parser.narrative_from_epub_to_txt(
                     training_data_fact.lookup_epub_filename(row[1]))
                 text = text_preprocessor.transform(text)
                 tmp_txt_file_name = os.path.join(
                     sfsf_config.get_txt_dir(), '{i}.txt'.format(i=row[1]))
                 with open(tmp_txt_file_name, 'w',
                           encoding='utf-8') as txt_file:
                     # Register the file only once it exists, so the
                     # cleanup below never tries to remove a missing path.
                     tmp_txt_files.append(tmp_txt_file_name)
                     txt_file.write(text)
                 print(row[1])
         training_result = training_data_fact.create('wpg_data.csv',
                                                     2,
                                                     source=sfsf_config.TXT)
     finally:
         for file_name in tmp_txt_files:
             os.remove(file_name)
     self.assertEqual((253, 21627), training_result['x'].shape)
     self.assertEqual((253, ), training_result['y'].shape)
     self.assertEqual(253, len(training_result['isbns']))
Esempio n. 2
0
 def test_get_top_bottom(self):
     """Compare ``get_top_bottom`` samples with the extreme sales figures.

     The minimum and maximum of the ``'totaal afzet'`` column of
     ``wpg_data.csv`` must equal element ``[4]`` of ``result[1][1]`` and
     ``result[0][0]`` respectively (presumably index 0 holds the best
     sellers and index 1 the worst sellers -- confirm against
     ``TrainingDataFactory.get_top_bottom``).
     """
     factory = training_data_factory.TrainingDataFactory()
     top_bottom = factory.get_top_bottom('wpg_data.csv', cull=2)

     csv_path = os.path.join(sfsf_config.get_data_dir(), 'wpg_data.csv')
     sales = pandas.read_csv(csv_path)['totaal afzet']

     lowest_sale = min(sales)
     lowest_sample = top_bottom[1][1]
     self.assertEqual(lowest_sale, int(lowest_sample[4]))

     highest_sale = max(sales)
     highest_sample = top_bottom[0][0]
     self.assertEqual(highest_sale, int(highest_sample[4]))
Esempio n. 3
0
 def test_get_linear_items_data(self):
     """Check that the sample EPUB yields exactly five linear items."""
     parser = epub_to_txt_parser.EPubToTxtParser()
     # The EPUB path is relative to the configured data directory.
     path_to_nantas = os.path.join(
         sfsf_config.get_data_dir(),
         'epub/20150602093137_9789460422515.epub')
     items = parser.get_linear_items_data(path_to_nantas)
     self.assertEqual(len(items), 5)
Esempio n. 4
0
 def test_config(self):
     """Check the ``sfsf_config`` constants and environment-based paths.

     The data directory must be ``data/test`` before any switch and
     after selecting DEVELOPMENT, and ``data/production`` after selecting
     PRODUCTION. The switch back to DEVELOPMENT happens in a ``finally``
     clause so that a failing PRODUCTION assertion does not leave
     ``sfsf_config`` in the PRODUCTION environment for whatever runs next.
     """
     self.assertEqual('txt', sfsf_config.TXT)
     self.assertEqual('epub', sfsf_config.EPUB)
     self.assertEqual('production', sfsf_config.PRODUCTION)
     self.assertEqual('test', sfsf_config.DEVELOPMENT)
     # Parent directory of the directory that contains this file.
     path_to_here = os.path.abspath(
         os.path.join(os.path.dirname(os.path.abspath(__file__)),
                      os.pardir))
     # The expected paths are compared verbatim, so the un-normalised
     # 'sfsf/../' segment has to stay exactly as written.
     path_to_test = os.path.join(path_to_here, 'sfsf/../data/test')
     path_to_data = os.path.join(path_to_here, 'sfsf/../data/production')
     path_to_test_epub = os.path.join(path_to_here,
                                      'sfsf/../data/test/epub')
     path_to_test_txt = os.path.join(path_to_here, 'sfsf/../data/test/txt')
     self.assertEqual(path_to_test, sfsf_config.get_data_dir())
     sfsf_config.set_env(sfsf_config.PRODUCTION)
     try:
         self.assertEqual(path_to_data, sfsf_config.get_data_dir())
     finally:
         sfsf_config.set_env(sfsf_config.DEVELOPMENT)
     self.assertEqual(path_to_test, sfsf_config.get_data_dir())
     self.assertEqual(path_to_test_epub, sfsf_config.get_epub_dir())
     self.assertEqual(path_to_test_txt, sfsf_config.get_txt_dir())
Esempio n. 5
0
 def get_isbn_data(self, wpg_data_file):
     """Return book records from a CSV file, best-selling first.

     Args:
         wpg_data_file: Name of a CSV file inside the directory returned
             by ``sfsf_config.get_data_dir()``. It is read as ISO-8859-1
             and its first row is skipped as a header.

     Returns:
         A list of ``[row[0], row[1], row[2], row[3], row[11]]`` lists
         (NUR, ISBN, title, author, total sales), sorted on total sales
         in descending order. The list is empty when the file has no
         data rows.

     Raises:
         IndexError: If a row has fewer than 12 columns.
         ValueError: If a total-sales cell cannot be parsed by ``int``.
     """
     csv_path = os.path.join(sfsf_config.get_data_dir(), wpg_data_file)
     # newline='' leaves line-ending handling to the csv module, as its
     # documentation recommends for file objects passed to csv.reader.
     with open(csv_path, 'r', encoding="ISO-8859-1",
               newline='') as csv_infile:
         csv_reader = csv.reader(csv_infile, delimiter=',', quotechar='"')
         # Skip the header row; the default keeps a completely empty
         # file from raising StopIteration.
         next(csv_reader, None)
         # Select NUR, ISBN, title, author and total sales.
         isbn_data = [[row[0], row[1], row[2], row[3], row[11]]
                      for row in csv_reader]
     # Sort on total copies sold, highest first.
     isbn_data.sort(key=lambda record: int(record[4]), reverse=True)
     return isbn_data
 def test_save_load(self):
     """Train a model, save it, reload it and check its predictions.

     A model is built from ``wpg_data.csv``, saved under the name
     ``'test_save_load_model'`` and discarded. A fresh instance then
     loads that name and must predict a value above 0.9 (first element
     of each prediction) for the sampled text fragments.

     The saved ``.h5`` file is removed in a ``finally`` clause so that a
     failing step or assertion does not leave it in the data directory.
     """
     factory = training_data_factory.TrainingDataFactory()
     training_data = factory.create('wpg_data.csv', 2)
     model = deep_learning_model.DeepLearningModel()
     # Arguments: training data (x, y), batch size, epochs.
     model.build((training_data['x'], training_data['y']), 10, 5)
     model.save('test_save_load_model')
     try:
         # Drop the trained instance; the predictions below are made by
         # a new instance that loads the saved model.
         del model
         vect = training_data['vectorizer']
         isbn_info = [['', 9789023449416, '']]
         # Only the last four sampled tuples are used.
         test_tuples = factory.sample_epubs(isbn_info, 1000)[-4:]
         test_samples = [
             test_sample
             for tupel in test_tuples
             for test_sample in tupel[1]
         ]
         test_tdm = vect.transform(test_samples)
         model = deep_learning_model.DeepLearningModel()
         model.load('test_save_load_model')
         predictions = model.predict(numpy.array(test_tdm.toarray()))
         for prediction in predictions:
             assert prediction[0] > 0.9, prediction
     finally:
         os.remove(
             os.path.join(sfsf_config.get_data_dir(),
                          'test_save_load_model.h5'))
 def load(self, name):
     """Replace ``self.model`` with the model stored as ``<name>.h5``.

     The file is looked up in ``sfsf_config.get_data_dir()``. The current
     ``self.model`` attribute is deleted before the new model is loaded,
     and ``self.x_dim`` is refreshed from element ``[1]`` of the
     ``input_shape`` of the layer named ``'primary_input'``.
     """
     del self.model
     data_dir = sfsf_config.get_data_dir()
     model_path = os.path.join(data_dir, '{n}.h5'.format(n=name))
     self.model = load_model(model_path)
     input_layer = self.model.get_layer('primary_input')
     self.x_dim = input_layer.input_shape[1]
 def save(self, name):
     """Save ``self.model`` as ``<name>.h5`` in the configured data dir."""
     model = self.model
     file_name = '{n}.h5'.format(n=name)
     model.save(os.path.join(sfsf_config.get_data_dir(), file_name))
Esempio n. 9
0
import csv
import os
import traceback
from sfsf import sfsf_config
from sfsf import epub_to_txt_parser
from sfsf import txt_pre_processor
from sfsf import training_data_factory

# sfsf_config.set_env( sfsf_config.DEVELOPMENT )

with open(os.path.join(sfsf_config.get_data_dir(), 'wpg_data.csv'),
          'r',
          encoding="utf-8") as csv_infile:
    training_data_fact = training_data_factory.TrainingDataFactory()
    parser = epub_to_txt_parser.EPubToTxtParser()
    text_preprocessor = txt_pre_processor.TxtPreProcessor()
    csv_reader = csv.reader(csv_infile, delimiter=',', quotechar='"')
    tmp_txt_files = []
    headers = next(csv_reader)
    for row in csv_reader:
        try:
            text = parser.narrative_from_epub_to_txt(
                training_data_fact.lookup_epub_filename(row[1]))
            text = text_preprocessor.transform(text)
            tmp_txt_file_name = os.path.join(sfsf_config.get_txt_dir(),
                                             '{i}.txt'.format(i=row[1]))
            tmp_txt_files.append(tmp_txt_file_name)
            txt_file = open(tmp_txt_file_name, 'w', encoding='utf8')
            txt_file.write(text)
            txt_file.close()
            print(row[1], end=' ')