def test_cluster_ncluster():
    """
    Cluster nine documents built from three distinct texts with
    ``n_clusters=3``.

    For every clusterer under test, documents with identical text must get
    the same label and exactly three different labels must be used.
    """
    texts = ('First set', 'Second group', 'Third cluster')
    # Three copies of each text, numbered 0..8 in order.
    data = list(enumerate(text for text in texts for _ in range(3)))

    for classifier in ('cluster.ward', 'cluster.spectral'):
        vectorized = Chain().load('data.simple', data).process('vectorize.sklearn')
        baf = vectorized.process(classifier, n_clusters=3)
        result = baf.get_chain('result')

        # Each consecutive triple holds the same text, so it must share a label.
        for first in (0, 3, 6):
            assert result[first] == result[first + 1] == result[first + 2]
        assert len(set(result)) == 3
def test_cluster_ncluster():
    """
    Check clustering with an explicit ``n_clusters=3``.

    Nine documents (three copies each of three texts) are vectorized and
    clustered; copies of the same text must land in the same cluster and
    the result must contain exactly three distinct labels.
    """
    first, second, third = 'First set', 'Second group', 'Third cluster'
    samples = [first] * 3 + [second] * 3 + [third] * 3
    data = list(enumerate(samples))

    for classifier in ('cluster.ward', 'cluster.spectral'):
        chain = Chain().load('data.simple', data)
        chain = chain.process('vectorize.sklearn')
        chain = chain.process(classifier, n_clusters=3)
        labels = chain.get_chain('result')

        assert labels[0] == labels[1] == labels[2]
        assert labels[3] == labels[4] == labels[5]
        assert labels[6] == labels[7] == labels[8]
        assert len(set(labels)) == 3
def test_core_data():
    """
    Load dummy data through ``data.simple`` and check that the object
    stored under ``base_data`` exposes exactly that data.
    """
    expected = ((0, "0"), (1, "1"))

    chain = Chain()
    chain.load("data.simple", expected)

    loaded = chain.get("base_data")
    assert loaded.data == expected
def test_core_chain():
    '''
    Exercise key lookup on a chain and on its last processor.

    ``Chain.get`` must raise ``KeyError`` for an unknown key and return the
    value once it has been stored in ``Chain.data``. After loading data,
    ``base_data`` must be reachable both through the last processor's
    ``get`` and through ``Chain.get_chain``; a value stored on the last
    processor must be reachable through ``Chain.get_chain`` as well.
    '''
    data = ((0, '0'), (1, '1'))
    test_subject = Chain()

    # Only the failing lookup belongs inside the raises block: a comparison
    # wrapped around it would never be evaluated.
    with pytest.raises(KeyError):
        test_subject.get('test')

    test_subject.data['test'] = 'ok'
    assert test_subject.get('test') == 'ok'

    test_subject.load('data.simple', data)

    # get data from the main class through the chain
    assert test_subject.chain[-1].get('base_data').data == data
    assert test_subject.get_chain('base_data').data == data

    # get data stored directly on the last element of the chain
    test_subject.chain[-1]._data['last_data'] = 'last_data'
    assert test_subject.get_chain('last_data') == 'last_data'
def test_core_data():
    '''
    Feed two dummy rows to ``data.simple`` and verify that ``base_data``
    gives the same rows back.
    '''
    rows = ((0, '0'), (1, '1'))
    subject = Chain()
    subject.load('data.simple', rows)

    base_data = subject.get('base_data')
    assert base_data.data == rows
def test_cluster():
    """
    Run load -> vectorize -> ward clustering on three documents.

    Checks the type of each chain step, that one result entry is produced
    per document, and that the result read through ``get_chain`` matches
    what the last step returns and stores.
    """
    data = (
        (0, 'data test 1'),
        (1, 'Data test 2'),
        (2, 'other data test 3.'),
    )
    loaded = Chain().load('data.simple', data)
    test_subject = loaded.process('vectorize.sklearn').process('cluster.ward')

    expected_steps = (SimpleDataSource, CountVectorizer, WardClusterizer)
    for position, step_type in enumerate(expected_steps):
        assert isinstance(test_subject.chain[position], step_type)

    result = test_subject.get_chain('result')
    assert len(result) == len(data)

    last_step = test_subject.chain[-1]
    labels = result.tolist()
    assert labels == last_step.get('result').tolist()
    assert labels == last_step._data['result'].tolist()
def test_cluster():
    """
    Build a three-step chain (simple data source, sklearn vectorizer, ward
    clusterer) and verify the step types and the clustering result access.
    """
    documents = ('data test 1', 'Data test 2', 'other data test 3.')
    data = tuple(enumerate(documents))

    test_subject = Chain().load('data.simple', data)
    test_subject = test_subject.process('vectorize.sklearn')
    test_subject = test_subject.process('cluster.ward')

    source_step = test_subject.chain[0]
    assert isinstance(source_step, SimpleDataSource)
    vectorizer_step = test_subject.chain[1]
    assert isinstance(vectorizer_step, CountVectorizer)
    cluster_step = test_subject.chain[2]
    assert isinstance(cluster_step, WardClusterizer)

    result = test_subject.get_chain('result')
    # One entry per input document.
    assert len(result) == len(data)
    # The same result is visible through the last step's getter and storage.
    assert result.tolist() == test_subject.chain[-1].get('result').tolist()
    assert result.tolist() == test_subject.chain[-1]._data['result'].tolist()
def test_data_simple():
    """
    Load three (uid, text) rows with ``data.simple`` and check that
    ``base_data`` and ``main_data`` compare equal and that the data source
    returns the texts and the uids as separate lists.
    """
    rows = (
        (0, 'data test 1'),
        (1, 'Data test 2'),
        (2, 'other data test 3.'),
    )
    test_subject = Chain().load('data.simple', rows)

    source = test_subject.data['base_data']
    assert test_subject.data['base_data'] == test_subject.data['main_data']

    expected_texts = ['data test 1', 'Data test 2', 'other data test 3.']
    assert source.get_data() == expected_texts
    assert source.get_uids() == [0, 1, 2]
def test_core_data_chain():
    '''
    Load data twice into the same chain: after each load,
    ``get_chain('data_source')`` must expose the data loaded last.
    '''
    payloads = (
        ((0, '0'), (1, '1')),
        ((0, "new"), (1, "new")),
    )
    test_subject = Chain()
    for payload in payloads:
        test_subject.load('data.simple', payload)
        assert test_subject.get_chain('data_source').data == payload
def test_core_data_chain():
    """
    Verify that a second ``load`` overrides the first: the ``data_source``
    found through the chain always carries the most recently loaded data.
    """
    initial = ((0, "0"), (1, "1"))
    replacement = ((0, "new"), (1, "new"))
    chain = Chain()

    chain.load("data.simple", initial)
    current_source = chain.get_chain("data_source")
    assert current_source.data == initial

    chain.load("data.simple", replacement)
    current_source = chain.get_chain("data_source")
    assert current_source.data == replacement
def test_core_load_json():
    """
    Build a chain from ``data1.json`` located next to this module and check
    the ``lang`` entry and the data held by the last data source.
    """
    here = os.path.dirname(__file__)
    json_path = os.path.join(here, "data1.json")
    baf = Chain.load_json(path=json_path)

    expected = [
        (1, [u'ceci', u'\xeatre', u'un', u'test', u'.']),
        (2, [u'cette', u'phrase', u'\xeatre', u'un', u'autre', u'test', u'.']),
    ]
    assert baf.data['lang'] == 'fr'
    assert baf.data_sources[-1].data == expected
def test_data_save_load():
    """
    Round-trip a vectorized chain through ``pickle.save`` / ``pickle.load``.

    The vectorizer found in the reloaded chain must transform a sample
    document exactly like the vectorizer of the original chain, and both
    must produce the expected count vector.
    """
    data = ((0, "data test 1"), (1, "Data test 2"), (2, "other data test 3."))
    test_subject = Chain().load("data.simple", data)
    test_subject.process("vectorize.sklearn")

    with tempfile.TemporaryDirectory() as tmpdirname:
        save_path = os.path.join(tmpdirname, "save.pkl")
        test_subject.process("pickle.save", save_path)
        # Reload and compare while the temporary directory (and the pickle
        # file inside it) still exists.
        load_test = Chain().load("pickle.load", save_path)

        sample = ("test",)
        reloaded = (
            load_test.get_chain("vectorizer").transform(sample).toarray()[0].tolist()
        )
        original = (
            test_subject.get_chain("vectorizer").transform(sample).toarray()[0].tolist()
        )
        assert reloaded == original == [0, 0, 1]
def test_core_load_yaml():
    """
    Build a chain from ``data2.yaml`` located next to this module and check
    the ``lang`` entry and the data held by the last data source.
    """
    yaml_path = os.path.join(os.path.dirname(__file__), "data2.yaml")
    baf = Chain.load_yaml(path=yaml_path)

    expected = [
        (1, [u'this', u'be', u'a', u'test', u'.']),
        (2, [u'another', u'test', u'sentence', u'.']),
        (3, [u'yet', u'another', u'sentence', u'.']),
        (4, [u'two', u'sentence', u'this', u'time', u'.']),
    ]
    assert baf.data['lang'] == 'en'
    # Show the actual data in the captured output before comparing it.
    print(baf.data_sources[-1].data)
    assert baf.data_sources[-1].data == expected
def test_core_chain():
    """
    Exercise key lookup on a chain and on its last processor.

    ``Chain.get`` must raise ``KeyError`` for an unknown key and return the
    value once it has been stored in ``Chain.data``. After loading data,
    ``base_data`` must be reachable both through the last processor's
    ``get`` and through ``Chain.get_chain``; a value stored on the last
    processor must be reachable through ``Chain.get_chain`` as well.
    """
    data = ((0, "0"), (1, "1"))
    test_subject = Chain()

    # Only the failing lookup belongs inside the raises block: a comparison
    # wrapped around it would never be evaluated.
    with pytest.raises(KeyError):
        test_subject.get("test")

    test_subject.data["test"] = "ok"
    assert test_subject.get("test") == "ok"

    test_subject.load("data.simple", data)

    # get data from the main class through the chain
    assert test_subject.chain[-1].get("base_data").data == data
    assert test_subject.get_chain("base_data").data == data

    # get data stored directly on the last element of the chain
    test_subject.chain[-1]._data["last_data"] = "last_data"
    assert test_subject.get_chain("last_data") == "last_data"
def fixture():
    """
    Build a processor chain and load three short test documents into it
    through ``data.simple``; the result of that load call is returned.
    """
    documents = ('data test 1', 'Data test 2', 'other data test 3.')
    # Pair each document with its position: (0, ...), (1, ...), (2, ...).
    data = tuple(enumerate(documents))
    return Chain().load('data.simple', data)