import numpy as np
import pandas as pd
import pytest

# NOTE: the module paths below are assumptions inferred from the config value
# 'eval.pipeline.multivectors.KmeansVectorizer' used in this file; adjust them
# to match the actual package layout.
from discoutils.thesaurus_loader import DenseVectors
from eval.pipeline.feature_extractors import FeatureExtractor
from eval.pipeline.multivectors import cluster_vectors
from eval.utils.data_utils import get_tokenized_data
from eval.evaluate import run_experiment


@pytest.fixture
def clusters(tmpdir):
    vector_path = 'tests/resources/twos.vectors.txt'
    put_path = str(tmpdir.join('clusters_unit_test.hdf'))
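    # run k-means over the toy vectors and write the cluster assignments to HDF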
    cluster_vectors(vector_path, put_path, n_clusters=4, n_jobs=1)

    clusters = pd.read_hdf(put_path, key='clusters').clusters
    for i in range(4):
        # a and e, b and f, etc., belong to the same cluster
        assert clusters[i] == clusters[i + 4]
    return put_path


def test_distributional_with_vector_clusters(conf, tmpdir):
    # generate random vectors for the appropriate features and cluster them first
    x_tr, _, _, _ = get_tokenized_data(conf['training_data'], conf['tokenizer'])
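    # extract every feature seen in the training documents; each gets a vector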
    feats = FeatureExtractor().extract_features_from_tree_list([foo[0] for foo in x_tr])
    vectors = np.random.random((len(feats), 10))
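    # wrap the random matrix in a DenseVectors store indexed by feature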
    v = DenseVectors(pd.DataFrame(vectors, index=feats))
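    # persist the vectors as a dense HDF store for cluster_vectors to consume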
    tmpfile = str(tmpdir.join('tmp_random_vectors'))
    v.to_tsv(tmpfile, dense_hd5=True)

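    # cluster the random feature vectors, mirroring the fixture above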
    tmpclusters = str(tmpdir.join('tmp_random_clusters'))
    cluster_vectors(tmpfile, tmpclusters, n_clusters=5, n_jobs=1)

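    # point the pipeline at the cluster assignments and the k-means vectorizer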
    conf['vector_sources']['neighbours_file'] = []
    conf['vectorizer']['class'] = 'eval.pipeline.multivectors.KmeansVectorizer'
    conf['vector_sources']['clusters_file'] = tmpclusters
    # the features of the document are cluster ids, not phrases,
    # so there is no point in checking whether they are in the thesaurus
    conf['feature_selection']['must_be_in_thesaurus'] = False

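    # run the full experiment at each debug verbosity; this is a smoke test,
    # so completing without an exception is the success criterion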
    for debug_level in [0, 1, 2]:
        conf['debug_level'] = debug_level
        run_experiment(conf)