def run_train_data(train_data_csv):
    """Run the similarity pipeline on slices of one CSV and persist the result.

    Both the training and the evaluation data sets are built from
    ``train_data_csv``; output is written to a fresh, timestamped
    ``outputdata/train_<YYYYmmdd_HHMMSS>`` directory located relative to this
    module.

    Args:
        train_data_csv: Path to the data file handed to ``AristoData``.
    """
    # NOTE(review): the second AristoData argument presumably selects rows;
    # if so, the evaluation rows (100-109) are a subset of the training rows
    # (0-1999) -- confirm this overlap is intentional.
    training_rows = range(0, 2000)
    evaluation_rows = range(100, 110)
    aristo_train_data = AristoData(train_data_csv, training_rows)
    aristo_test_data = AristoData(train_data_csv, evaluation_rows)

    for data_set in (aristo_test_data, aristo_train_data):
        data_set.print_summary()

    pipeline = SimilarityPipeline(
        train_data=aristo_train_data,
        test_data=aristo_test_data,
    )
    pipeline.run_pipeline()

    # The timestamp is taken after the pipeline has run, so the directory
    # name reflects completion time rather than start time.
    module_dir = os.path.dirname(__file__)
    timestamp = time.strftime('%Y%m%d_%H%M%S')
    relative_out = "../../../outputdata/train_{}".format(timestamp)
    out_dir = os.path.join(module_dir, relative_out)
    os.makedirs(out_dir)
    pipeline.write_to_disk(out_dir)
def run_test_data(data_csv):
    """Run the Solr/Wikipedia pipeline on ``data_csv`` and print its score.

    A timestamped ``outputdata/test_<YYYYmmdd_HHMMSS>`` directory (relative to
    this module) is created first; it is passed to ``setup_log`` and is also
    where the pipeline results are written.

    Args:
        data_csv: Path to the data file handed to ``AristoData``.
    """
    # The output directory must exist before setup_log is called with it.
    module_dir = os.path.dirname(__file__)
    timestamp = time.strftime('%Y%m%d_%H%M%S')
    relative_out = "../../../outputdata/test_{}".format(timestamp)
    out_dir = os.path.join(module_dir, relative_out)
    os.makedirs(out_dir)
    logger = setup_log(out_dir)

    aristo_data = AristoData(data_csv)
    aristo_data.print_summary()

    pipeline = SolrWikipediaAllAnswerThenQuestionPipeline(
        data=aristo_data,
        logger=logger,
    )
    pipeline.run_pipeline()
    pipeline.write_to_disk(out_dir)

    score = pipeline.score()
    print(score)
class IntegrationTestAristoData(unittest.TestCase):
    """Integration tests for ``AristoData`` against the on-disk training set.

    Every test loads ``inputdata/training_set.tsv`` (resolved relative to this
    module), so the file must be present for the suite to run.
    """

    def setUp(self):
        """Load the training set into a fresh ``AristoData`` for each test."""
        data_file_path = os.path.join(
            os.path.dirname(__file__), "../../../inputdata/training_set.tsv")
        # Printed so a missing-file failure shows which path was attempted.
        print(os.path.abspath(data_file_path))
        self._aristo_data = AristoData(data_file_path)

    def test_should_print_summary(self):
        """Smoke test: printing the summary must not raise."""
        self._aristo_data.print_summary()

    def test_should_get_x(self):
        """``x`` exposes five columns."""
        self.assertEqual(
            len(self._aristo_data.x.columns), 5,
            "The expected number of columns does not match the actual")

    def test_should_get_y_columns(self):
        """``y`` exposes a single column."""
        self.assertEqual(
            len(self._aristo_data.y.columns), 1,
            "The expected number of columns does not match the actual")

    def test_should_get_all_questions_as_raw(self):
        """The raw dump of all questions is a string."""
        # assertIsInstance reports the actual type on failure, unlike
        # assertTrue(type(...) is str) which only says "False is not true".
        self.assertIsInstance(
            self._aristo_data.get_all_questions_as_raw(), str)

    def test_should_get_all_questions_as_list(self):
        """The flattened question/answer list has 2500 * 5 entries."""
        # presumably 2500 rows in the training set, each contributing 5
        # items -- verify against the data file.
        expected_rows = 2500
        expected_items_per_row = 5
        self.assertEqual(
            len(self._aristo_data.get_all_questions_answers_as_list()),
            expected_rows * expected_items_per_row)