def test_cluster_ncluster():
    '''
    Three distinct texts, each repeated three times, must end up in three
    clusters when ``n_clusters=3`` is passed to every tested classifier.
    '''
    CLASSIFIERS = ('cluster.ward', 'cluster.spectral')
    G1 = 'First set'
    G2 = 'Second group'
    G3 = 'Third cluster'
    # (uid, text) pairs: uids 0-2 share G1, 3-5 share G2, 6-8 share G3.
    data = list(enumerate([G1] * 3 + [G2] * 3 + [G3] * 3))

    for classifier in CLASSIFIERS:
        loaded = Chain().load('data.simple', data)
        vectorized = loaded.process('vectorize.sklearn')
        baf = vectorized.process(classifier, n_clusters=3)
        result = baf.get_chain('result')

        # Every block of three identical texts must share one label.
        for start in (0, 3, 6):
            assert result[start] == result[start + 1] == result[start + 2]
        # ...and the three blocks must use three different labels.
        assert len(set(result)) == 3
def test_core_chain():
    '''
    Create a chain and test that ``get`` / ``get_chain`` return data
    stored on the chain itself and on the last element of the chain.
    '''
    data = (
        (0, '0'),
        (1, '1'),
    )
    test_subject = Chain()

    # Nothing is stored under 'test' yet, so the lookup itself must raise.
    # The call is made bare: comparing its result would never execute.
    with pytest.raises(KeyError):
        test_subject.get('test')

    test_subject.data['test'] = 'ok'
    assert test_subject.get('test') == 'ok'

    test_subject.load('data.simple', data)
    # get data from the main class through the chain
    assert test_subject.chain[-1].get('base_data').data == data
    assert test_subject.get_chain('base_data').data == data

    # get data from the chain
    test_subject.chain[-1]._data['last_data'] = 'last_data'
    assert test_subject.get_chain('last_data') == 'last_data'
def test_data_simple():
    '''
    Load three (uid, text) pairs through 'data.simple' and check the
    texts and uids exposed by the resulting base data object.
    '''
    samples = (
        (0, 'data test 1'),
        (1, 'Data test 2'),
        (2, 'other data test 3.'),
    )
    test_subject = Chain().load('data.simple', samples)
    base_data = test_subject.data['base_data']

    # Base and main data are expected to compare equal after loading.
    assert test_subject.data['base_data'] == test_subject.data['main_data']

    expected_texts = ['data test 1', 'Data test 2', 'other data test 3.']
    assert base_data.get_data() == expected_texts
    assert base_data.get_uids() == [0, 1, 2]
def test_core_data():
    '''
    Load dummy data and check that it is available as 'base_data'.
    '''
    samples = ((0, '0'), (1, '1'))

    test_subject = Chain()
    test_subject.load('data.simple', samples)

    loaded = test_subject.get('base_data')
    assert loaded.data == samples
def test_core_data_chain():
    '''
    Loading data a second time must make the newest data the one
    returned for 'data_source'.
    '''
    first = ((0, '0'), (1, '1'))
    second = ((0, "new"), (1, "new"))
    test_subject = Chain()

    # Load each data set in order; after every load the chain must
    # report exactly the data that was loaded last.
    for current in (first, second):
        test_subject.load('data.simple', current)
        assert test_subject.get_chain('data_source').data == current
def test_cluster():
    '''
    Run load -> vectorize -> ward clustering and check both the types of
    the chain elements and the ways the clustering result can be read.
    '''
    data = (
        (0, 'data test 1'),
        (1, 'Data test 2'),
        (2, 'other data test 3.'),
    )
    loaded = Chain().load('data.simple', data)
    vectorized = loaded.process('vectorize.sklearn')
    test_subject = vectorized.process('cluster.ward')

    # One chain element per pipeline step, in the order they were added.
    expected_types = (SimpleDataSource, CountVectorizer, WardClusterizer)
    for position, expected in enumerate(expected_types):
        assert isinstance(test_subject.chain[position], expected)

    result = test_subject.get_chain('result')
    assert len(result) == len(data)

    # The same result must be reachable through the last element's
    # accessor and through its underlying storage.
    last = test_subject.chain[-1]
    assert result.tolist() == last.get('result').tolist()
    assert result.tolist() == last._data['result'].tolist()
def fixture():
    '''
    Build a chain with three sample (uid, text) pairs loaded through
    'data.simple' and return the result of the load call.
    '''
    samples = (
        (0, 'data test 1'),
        (1, 'Data test 2'),
        (2, 'other data test 3.'),
    )
    chain = Chain()
    return chain.load('data.simple', samples)