def test_build_deep_learning_model(self):
    """Building a model on the full training set should reach 100% accuracy."""
    data = training_data_factory.TrainingDataFactory().create('wpg_data.csv', 2)
    dl_model = deep_learning_model.DeepLearningModel()
    # build(training_data, batch_size, epochs)
    achieved = dl_model.build((data['x'], data['y']), 10, 5)
    self.assertEqual(100.00, achieved)
def test_create_training_data_from_txt(self):
    """End-to-end check: convert each epub listed in wpg_data.csv to a txt
    file, build training data with source=TXT, then clean up the txt files.

    Asserts the expected document-term-matrix shape, label shape and isbn
    count (253 titles, 21627 features for the test fixture data).
    """
    training_data_fact = training_data_factory.TrainingDataFactory()
    parser = epub_to_txt_parser.EPubToTxtParser()
    text_preprocessor = txt_pre_processor.TxtPreProcessor()
    tmp_txt_files = []
    with open(os.path.join(sfsf_config.get_data_dir(), 'wpg_data.csv'),
              'r', encoding="utf-8") as csv_infile:
        csv_reader = csv.reader(csv_infile, delimiter=',', quotechar='"')
        next(csv_reader)  # skip the header row (value itself is unused)
        for row in csv_reader:
            text = parser.narrative_from_epub_to_txt(
                training_data_fact.lookup_epub_filename(row[1]))
            text = text_preprocessor.transform(text)
            tmp_txt_file_name = os.path.join(
                sfsf_config.get_txt_dir(), '{i}.txt'.format(i=row[1]))
            tmp_txt_files.append(tmp_txt_file_name)
            # with-statement instead of open/write/close: no leaked handle
            # if write() raises.
            with open(tmp_txt_file_name, 'w', encoding='utf-8') as txt_file:
                txt_file.write(text)
            print(row[1])
    try:
        training_result = training_data_fact.create(
            'wpg_data.csv', 2, source=sfsf_config.TXT)
    finally:
        # Always remove the temporary txt files, even when create() fails.
        for file_name in tmp_txt_files:
            os.remove(file_name)
    self.assertEqual((253, 21627), training_result['x'].shape)
    self.assertEqual((253, ), training_result['y'].shape)
    self.assertEqual(253, len(training_result['isbns']))
def do_train(train_sample):
    """Create training data from the sampled titles and pickle it to disk.

    train_sample: dict with "top" and "bottom" lists of isbn rows (as
    produced by do_sample). Writes 'sfsf_training_data.pickle'.
    """
    training_factory = training_data_factory.TrainingDataFactory()
    training_data = training_factory.create_by_sample(
        'wpg_data.csv',
        train_sample["top"],
        train_sample["bottom"],
        sample_size=5000,
        source=sfsf_config.TXT)
    # with-statement instead of a bare open(): guarantees the pickle file
    # is flushed and closed (the original leaked the handle).
    with open('sfsf_training_data.pickle', 'wb') as pickle_file:
        pickle.dump(training_data, pickle_file)
def test_prediction_wrong_test_dimension(self):
    """predict() must raise when test input dimensions don't match training."""
    training = training_data_factory.TrainingDataFactory().create('wpg_data.csv', 2)
    dl_model = deep_learning_model.DeepLearningModel()
    # build(training_data, batch_size, epochs)
    dl_model.build((training['x'], training['y']), 10, 5)
    bad_input = numpy.array([[1, 2, 3], [1, 2, 3]])
    with self.assertRaises(deep_learning_model.TestingDimensionError):
        dl_model.predict(bad_input)
def test_get_top_bottom(self):
    """The top/bottom samples must line up with the extreme sales figures
    ('totaal afzet' column) found in the source CSV."""
    factory = training_data_factory.TrainingDataFactory()
    samples_tuple = factory.get_top_bottom('wpg_data.csv', cull=2)
    # Renamed local from `csv` to `wpg_data`: the old name shadowed the
    # csv module.
    wpg_data = pandas.read_csv(
        os.path.join(sfsf_config.get_data_dir(), 'wpg_data.csv'))
    smallest_sale = min(wpg_data['totaal afzet'])
    # samples_tuple[1] is the bottom group; column 4 holds the sales figure.
    self.assertEqual(smallest_sale, int(samples_tuple[1][1][4]))
    highest_sale = max(wpg_data['totaal afzet'])
    # samples_tuple[0] is the top group.
    self.assertEqual(highest_sale, int(samples_tuple[0][0][4]))
def test_sampling(self):
    """sample_epubs() yields one (isbn, list-of-text-chunks) tuple per row."""
    factory = training_data_factory.TrainingDataFactory()
    samples = factory.sample_epubs([['', 9789023449416, '']], 1000)
    # expect a tuple (isbn, list of strings)
    self.assertEqual(1, len(samples))
    isbn, chunks = samples[0]
    self.assertEqual(9789023449416, isbn)
    self.assertEqual(72, len(chunks))
    for chunk in chunks:
        self.assertEqual(str, type(chunk))
def test_file_name_lookup(self):
    """lookup_epub_filename() resolves an ISBN string to its epub file name."""
    training_data = training_data_factory.TrainingDataFactory()
    # assertTrue instead of bare assert: unittest reporting, and asserts
    # are not stripped under -O.
    self.assertTrue(
        training_data.lookup_epub_filename('9789044964264').endswith(
            '20160113113032_9789044964264.epub'))

    # One should be returned in the case of multiples, don't care which one.
    def any_which(file_name):
        # def instead of a lambda assignment (PEP 8 E731).
        return (file_name.endswith('20150602093137_9789023449416.epub')
                or file_name.endswith('20160113113032_9789023449416.epub'))

    result = training_data.lookup_epub_filename('9789023449416')
    self.assertTrue(any_which(result))
def do_sample(top_chunk_scores, bottom_chunk_scores, wpg_data_file):
    """Partition the isbn rows into those whose isbn appears in the top
    chunk scores and those appearing in the bottom chunk scores."""
    factory = training_data_factory.TrainingDataFactory()
    # get_isbn_data returns rows sorted by sales
    all_rows = factory.get_isbn_data(wpg_data_file)
    in_top = [row for row in all_rows if row[1] in top_chunk_scores]
    in_bottom = [row for row in all_rows if row[1] in bottom_chunk_scores]
    return in_top, in_bottom
def test_predictions(self):
    """Text samples from a known top-selling title should be scored as top
    (prediction above 0.9) by a model trained on the full data set."""
    factory = training_data_factory.TrainingDataFactory()
    training_data = factory.create('wpg_data.csv', 2)
    model = deep_learning_model.DeepLearningModel()
    # build(training_data, batch_size, epochs); accuracy itself is not
    # asserted here.
    model.build((training_data['x'], training_data['y']), 10, 5)
    vect = training_data['vectorizer']
    isbn_info = [['', 9789023449416, '']]
    test_tuples = factory.sample_epubs(isbn_info, 1000)[-4:]
    test_samples = [
        sample for isbn_samples in test_tuples for sample in isbn_samples[1]
    ]
    test_tdm = vect.transform(test_samples)
    predictions = model.predict(numpy.array(test_tdm.toarray()))
    # enumerate() index was unused; iterate directly and use the unittest
    # assertion for a better failure message.
    for prediction in predictions:
        self.assertGreater(prediction[0], 0.9)
def do_sample(wpg_data_file, train_size, test_size, total_size):
    """Draw shuffled train/test samples from the best- and worst-selling
    titles (the first and last total_size rows of the sales-sorted data)."""
    factory = training_data_factory.TrainingDataFactory()
    # returns data sorted by sales
    isbn_data = factory.get_isbn_data(wpg_data_file)
    best_sellers = isbn_data[:total_size]
    worst_sellers = isbn_data[-total_size:]
    shuffle(best_sellers)
    shuffle(worst_sellers)
    train_sample = {
        "top": best_sellers[:train_size["top"]],
        "bottom": worst_sellers[:train_size["bottom"]],
    }
    test_sample = {
        "top": best_sellers[-test_size["top"]:],
        "bottom": worst_sellers[-test_size["bottom"]:],
    }
    return train_sample, test_sample
def do_test(test_sample, train_size, test_size, total_size, iteration):
    """Build a model on the pickled training data, predict the held-out
    top/bottom test samples, save the model and write a CSV report.

    test_sample: dict with "top" and "bottom" lists of isbn rows.
    train_size, test_size, total_size, iteration: only embedded in the
    report file name.
    Relies on the module-level report() helper to write/print each row.
    """
    training_factory = training_data_factory.TrainingDataFactory()
    # with-statement: the original leaked the pickle file handle.
    with open('sfsf_training_data.pickle', 'rb') as pickle_file:
        training_data = pickle.load(pickle_file)
    vectorizer = training_data['vectorizer']
    testing_data_top = training_factory.sample_txts(test_sample["top"], 5000)
    testing_data_bottom = training_factory.sample_txts(test_sample["bottom"],
                                                       5000)
    # One isbn entry per text sample so predictions can be matched back.
    # (Loop variables were named `tuple`, shadowing the builtin; renamed.)
    isbns_top = [pair[0] for pair in testing_data_top for _ in pair[1]]
    isbns_bottom = [pair[0] for pair in testing_data_bottom for _ in pair[1]]
    testing_tdm_top = vectorizer.transform(
        [sample for pair in testing_data_top for sample in pair[1]])
    testing_tdm_bottom = vectorizer.transform(
        [sample for pair in testing_data_bottom for sample in pair[1]])
    model = deep_learning_model.DeepLearningModel()
    # build(training_data, batch_size, epochs)
    model.build((training_data['x'], training_data['y']), 10, 5)
    model.save('sfsf_deeplearning_model_{d}'.format(
        d=datetime.now().strftime('%Y%m%d_%H%M')))
    predictions_top = []
    predictions_bottom = []
    predictions = model.predict(numpy.array(testing_tdm_top.toarray()))
    for idx, prediction in enumerate(predictions):
        predictions_top.append((isbns_top[idx], prediction[0]))
    predictions = model.predict(numpy.array(testing_tdm_bottom.toarray()))
    for idx, prediction in enumerate(predictions):
        predictions_bottom.append((isbns_bottom[idx], prediction[0]))
    # reporting
    report_file_name = 'report-total-{total}-train-{train}-test-{test}-iteration-{i}-date-{d}.csv'.format(
        total=total_size,
        train=train_size,
        test=test_size,
        i=iteration,
        d=datetime.now().strftime('%Y%m%d_%H%M'))
    # Reuse the factory created above (the original instantiated a second one).
    isbn_data = training_factory.get_isbn_data('wpg_data.csv')
    # newline='' per the csv module docs; with-statement closes the file
    # even when an exception occurs mid-report.
    with open(report_file_name, 'w', encoding='utf8',
              newline='') as report_file:
        csv_writer = csv.writer(report_file, delimiter=',')
        headers = [
            'deep_learning_data_type', 'NUR', 'ISBN', 'title', 'author',
            'total sold', 'prediction'
        ]
        csv_writer.writerow(headers)
        print('\t'.join(headers))
        training_isbns_reported = []
        # combine training data y and isbns; report each training isbn once
        for idx, item in enumerate(training_data['y']):
            if training_data['isbns'][idx] not in training_isbns_reported:
                # 1 is top, 0 is flop
                report_row = [
                    'training_top' if item == 1 else 'training_bottom'
                ]
                # isbn: find the matching row in the wpg data
                isbn_info = next(inf for inf in isbn_data
                                 if inf[1] == training_data['isbns'][idx])
                report_row.extend(isbn_info)
                report_row.append('NA')  # training rows have no prediction
                report(report_row, csv_writer)
                training_isbns_reported.append(training_data['isbns'][idx])
        for isbn, score in predictions_top:
            report_row = ['testing_top']
            # isbn: find the matching row in the wpg data
            isbn_info = next(inf for inf in isbn_data if inf[1] == isbn)
            report_row.extend(isbn_info)
            report_row.append(str(score))
            report(report_row, csv_writer)
        for isbn, score in predictions_bottom:
            report_row = ['testing_bottom']
            # isbn: find the matching row in the wpg data
            isbn_info = next(inf for inf in isbn_data if inf[1] == isbn)
            report_row.extend(isbn_info)
            report_row.append(str(score))
            report(report_row, csv_writer)
def get_text_chunks(sample_data):
    """Return sampled text chunks (sample_size=5000) for the given rows."""
    factory = training_data_factory.TrainingDataFactory()
    return factory.sample_txts(sample_data, sample_size=5000)
def test_create_training_data(self):
    """create() yields the expected matrix and label shapes for the fixture."""
    factory = training_data_factory.TrainingDataFactory()
    result = factory.create('wpg_data.csv', 2)
    self.assertEqual((253, 21627), result['x'].shape)
    self.assertEqual((253, ), result['y'].shape)
import csv import os import traceback from sfsf import sfsf_config from sfsf import epub_to_txt_parser from sfsf import txt_pre_processor from sfsf import training_data_factory # sfsf_config.set_env( sfsf_config.DEVELOPMENT ) with open(os.path.join(sfsf_config.get_data_dir(), 'wpg_data.csv'), 'r', encoding="utf-8") as csv_infile: training_data_fact = training_data_factory.TrainingDataFactory() parser = epub_to_txt_parser.EPubToTxtParser() text_preprocessor = txt_pre_processor.TxtPreProcessor() csv_reader = csv.reader(csv_infile, delimiter=',', quotechar='"') tmp_txt_files = [] headers = next(csv_reader) for row in csv_reader: try: text = parser.narrative_from_epub_to_txt( training_data_fact.lookup_epub_filename(row[1])) text = text_preprocessor.transform(text) tmp_txt_file_name = os.path.join(sfsf_config.get_txt_dir(), '{i}.txt'.format(i=row[1])) tmp_txt_files.append(tmp_txt_file_name) txt_file = open(tmp_txt_file_name, 'w', encoding='utf8') txt_file.write(text) txt_file.close() print(row[1], end=' ')