def download_corpus(data_csv):
    """Build a keyword set from the Aristo questions and answers, then
    download matching Wikipedia articles into a timestamped corpus file.

    :param data_csv: path to the Aristo question CSV/TSV file
    """
    aristo_data = AristoData(data_csv)
    # Concatenate the question text and all four answer columns into one
    # blob so keyword extraction sees the full vocabulary.
    data = " ".join([
        aristo_data.get_column_as_raw("A", join_rows_by=","),
        aristo_data.get_all_questions_as_raw(),
        aristo_data.get_column_as_raw("B", join_rows_by=","),
        aristo_data.get_column_as_raw("C", join_rows_by=","),
        aristo_data.get_column_as_raw("D", join_rows_by=","),
    ])
    kc = KnowledgeCreator()
    key_words = get_key_words(data)
    # Timestamped file name avoids clobbering earlier corpus downloads.
    corpus_file = os.path.join(
        os.path.dirname(__file__),
        "../../../corpus2/mediafile_{}.xml".format(time.strftime('%Y%m%d_%H%M%S')))
    kc.download_wikipedia_articles(key_words, corpus_file)
def run_test_data(data_csv):
    """Run the Solr Wikipedia pipeline over the given data set, persist the
    results to a timestamped output directory, and print the score.

    :param data_csv: path to the Aristo test-set CSV/TSV file
    """
    # Timestamped directory keeps each run's output separate.
    out_dir = os.path.join(
        os.path.dirname(__file__),
        "../../../outputdata/test_{}".format(time.strftime('%Y%m%d_%H%M%S')))
    os.makedirs(out_dir)
    logger = setup_log(out_dir)
    aristo_data = AristoData(data_csv)
    aristo_data.print_summary()
    pipeline = SolrWikipediaAllAnswerThenQuestionPipeline(data=aristo_data, logger=logger)
    pipeline.run_pipeline()
    # Fixed: was write_to_disk((out_dir)) — redundant extra parentheses.
    pipeline.write_to_disk(out_dir)
    print(pipeline.score())
class IntegrationTestAristoData(unittest.TestCase):
    """Integration tests exercising AristoData against the real training TSV."""

    def setUp(self):
        # Resolve the shared training set relative to this test file.
        data_file_path = os.path.join(
            os.path.dirname(__file__), "../../../inputdata/training_set.tsv")
        print(os.path.abspath(data_file_path))
        self._aristo_data = AristoData(data_file_path)

    def test_should_print_summary(self):
        # Smoke test: just verify the summary prints without raising.
        self._aristo_data.print_summary()

    def test_should_get_x(self):
        self.assertEqual(len(self._aristo_data.x.columns), 5,
                         "The expected number of columns does not match the actual")

    def test_should_get_y_columns(self):
        self.assertEqual(len(self._aristo_data.y.columns), 1,
                         "The expected number of columns does not match the actual")

    def test_should_get_all_questions_as_raw(self):
        # Fixed: assertIsInstance replaces the non-idiomatic
        # `type(...) is str` check and gives a clearer failure message.
        self.assertIsInstance(self._aristo_data.get_all_questions_as_raw(), str)

    def test_should_get_all_questions_as_list(self):
        # Presumably 2500 questions x 5 entries (question + 4 answers) —
        # TODO confirm against AristoData.get_all_questions_answers_as_list.
        self.assertEqual(
            len(self._aristo_data.get_all_questions_answers_as_list()), 2500 * 5)
def run_train_data(train_data_csv):
    """Train the similarity pipeline on the first 2000 rows of the training
    file, evaluate on rows 100-110, and persist results to a timestamped
    output directory.

    :param train_data_csv: path to the Aristo training CSV/TSV file
    """
    aristo_train_data = AristoData(train_data_csv, range(0, 2000))
    # NOTE(review): rows 100-110 fall inside the 0-2000 training range, so
    # the "test" split overlaps the training data — confirm this is intended.
    aristo_test_data = AristoData(train_data_csv, range(100, 110))
    aristo_test_data.print_summary()
    aristo_train_data.print_summary()
    pipeline = SimilarityPipeline(train_data=aristo_train_data, test_data=aristo_test_data)
    pipeline.run_pipeline()
    out_dir = os.path.join(
        os.path.dirname(__file__),
        "../../../outputdata/train_{}".format(time.strftime('%Y%m%d_%H%M%S')))
    os.makedirs(out_dir)
    # Fixed: was write_to_disk((out_dir)) — redundant extra parentheses.
    pipeline.write_to_disk(out_dir)
def setUp(self):
    """Load the training TSV (located relative to this file) into an AristoData fixture."""
    input_path = os.path.join(os.path.dirname(__file__), "../../../inputdata/training_set.tsv")
    print(os.path.abspath(input_path))
    self._aristo_data = AristoData(input_path)