예제 #1
0
def download_corpus(data_csv):
    """Download Wikipedia articles for the key words found in a data file.

    The raw text of columns "A" to "D" (rows joined by ",") and of all
    questions is loaded through ``AristoData`` and concatenated with single
    spaces. Key words are extracted from that text with ``get_key_words`` and
    passed to ``KnowledgeCreator.download_wikipedia_articles`` together with a
    timestamped output path under ``../../../corpus2`` (relative to this
    module).

    Args:
        data_csv: Location of the data file, handed to ``AristoData`` as-is.
    """
    dataset = AristoData(data_csv)

    # Text order is: column A, all questions, then columns B, C and D.
    column_a_text = dataset.get_column_as_raw("A", join_rows_by=",")
    questions_text = dataset.get_all_questions_as_raw()
    other_columns_text = [
        dataset.get_column_as_raw(column, join_rows_by=",")
        for column in ("B", "C", "D")
    ]
    data = " ".join([column_a_text, questions_text] + other_columns_text)

    kc = KnowledgeCreator()
    key_words = get_key_words(data)

    # A timestamp in the file name gives every run its own output file.
    module_dir = os.path.dirname(__file__)
    timestamp = time.strftime('%Y%m%d_%H%M%S')
    corpus_file = os.path.join(
        module_dir, "../../../corpus2/mediafile_{}.xml".format(timestamp)
    )
    kc.download_wikipedia_articles(key_words, corpus_file)
class IntegrationTestAristoData(unittest.TestCase):
    """Integration tests that run AristoData against the training set file.

    Every test works on a fresh ``AristoData`` built in ``setUp`` from
    ``../../../inputdata/training_set.tsv``, resolved relative to this module.
    """

    def setUp(self):
        # Resolve the data file relative to this module so the tests do not
        # depend on the current working directory.
        data_file_path = os.path.join(os.path.dirname(__file__), "../../../inputdata/training_set.tsv")
        # Echo the absolute path to make a wrong location easy to spot.
        print(os.path.abspath(data_file_path))
        self._aristo_data = AristoData(data_file_path)

    def test_should_print_summary(self):
        # Smoke test: only checks that printing the summary does not raise.
        self._aristo_data.print_summary()

    def test_should_get_x(self):
        self.assertEqual(len(self._aristo_data.x.columns), 5,
                         "The expected number of columns does not match the actual")

    def test_should_get_y_columns(self):
        self.assertEqual(len(self._aristo_data.y.columns), 1,
                         "The expected number of columns does not match the actual")

    def test_should_get_all_questions_as_raw(self):
        # assertIsInstance names the offending value and the expected type on
        # failure, instead of the opaque "False is not true".
        self.assertIsInstance(self._aristo_data.get_all_questions_as_raw(), str)

    def test_should_get_all_questions_as_list(self):
        # NOTE(review): 2500 * 5 is presumably 2500 rows times 5 text fields
        # per row (question plus four answers) -- confirm against the data.
        self.assertEqual(len(self._aristo_data.get_all_questions_answers_as_list()), 2500 * 5,
                         "The expected number of entries does not match the actual")