def clusters(tmpdir):
    """Cluster the bundled test vectors and return the path to the result.

    Runs k-means (4 clusters, single job) over the `twos` vector fixture,
    writing assignments to an HDF file under *tmpdir*. Sanity-checks that
    the i-th and (i+4)-th entries land in the same cluster before handing
    the path back to the caller.
    """
    vector_path = 'tests/resources/twos.vectors.txt'
    put_path = str(tmpdir.join('clusters_unit_test.hdf'))

    cluster_vectors(vector_path, put_path, n_clusters=4, n_jobs=1)
    assignments = pd.read_hdf(put_path, key='clusters').clusters

    # a and e, b and f, etc, belong to the same cluster
    for i in range(4):
        assert assignments[i] == assignments[i + 4]

    return put_path
def test_distributional_with_vector_clusters(conf, tmpdir):
    """End-to-end run of the distributional pipeline with clustered vectors.

    Generates random 10-dim vectors for the features extracted from the
    training data, clusters them into 5 groups, points *conf* at the
    resulting clusters file / KmeansVectorizer, and runs the experiment
    at every debug level.
    """
    # generate random vectors for the the appropriate features and cluster them first
    x_tr, _, _, _ = get_tokenized_data(conf['training_data'], conf['tokenizer'])
    feats = FeatureExtractor().extract_features_from_tree_list([tree[0] for tree in x_tr])

    random_vectors = np.random.random((len(feats), 10))
    dense = DenseVectors(pd.DataFrame(random_vectors, index=feats))

    tmpfile = str(tmpdir.join('tmp_random_vectors'))
    dense.to_tsv(tmpfile, dense_hd5=True)

    tmpclusters = str(tmpdir.join('tmp_random_clusters'))
    cluster_vectors(tmpfile, tmpclusters, n_clusters=5, n_jobs=1)

    conf['vector_sources']['neighbours_file'] = []
    conf['vectorizer']['class'] = 'eval.pipeline.multivectors.KmeansVectorizer'
    conf['vector_sources']['clusters_file'] = tmpclusters
    # the features of the document are cluster ids, not phrases
    # no point in checking in they are in the thesaurus
    conf['feature_selection']['must_be_in_thesaurus'] = False

    for debug_level in (0, 1, 2):
        conf['debug_level'] = debug_level
        run_experiment(conf)