def random_cluster(input_path, output_path, num_clusters, cluster_path,
                   num_reducers):
    """Freeze and launch the ``random_cluster.py`` Hadoop job.

    The job reads ``input_path`` and writes to ``<output_path>/<iter_cnt>``,
    where ``iter_cnt`` is the module-level iteration counter; the counter is
    incremented here before the output name is built.

    Args:
        input_path: Input location passed to ``hadoopy.launch`` as ``in_name``.
        output_path: Base directory under which the numbered output is placed.
        num_clusters: Exported to the job as ``NUM_CLUSTERS`` (``%d`` format).
        cluster_path: Not used by this function.
        num_reducers: Not used by this function.

    Returns:
        None.
    """
    def next_output_dir():
        # Bump the shared module-level counter and name the new output dir.
        global iter_cnt
        iter_cnt += 1
        return '%s/%d' % (output_path, iter_cnt)

    script = 'random_cluster.py'
    hadoopy.freeze(script_path=script,
                   shared_libs=SHARED_LIBS,
                   modules=['vitrieve_algorithms'],
                   remove_dir=True)

    # The output directory is claimed only after the freeze step has run.
    destination = next_output_dir()
    job_env = ['NUM_CLUSTERS=%d' % num_clusters]
    # No combiner is requested for this job (combiner=True stays disabled).
    hadoopy.launch(in_name=input_path,
                   out_name=destination,
                   cmdenvs=job_env,
                   script_path=script,
                   frozen_path='frozen')
def canopy(input_path, output_path, num_clusters, cluster_path, num_reducers):
    """Run the canopy pipeline as a sequence of Hadoopy jobs.

    Every job writes to ``<output_path>/<iter_cnt>``; ``iter_cnt`` is the
    module-level iteration counter and is incremented once per launched job.

    Steps, in order:
        1. Freeze and launch ``canopy_cluster.py`` on ``input_path``, then
           call ``consolidate_clusters`` on its output to produce
           ``canopies.pkl``.
        2. Freeze ``canopy_cluster_assign.py`` and launch it (no reducer) on
           ``input_path``; that output becomes the input of step 4.
        3. Launch ``canopy_cluster_assign.py`` again on ``cluster_path``, then
           call ``consolidate_canopy_clusters`` on its output to produce
           ``clusters.pkl``.
        4. Freeze and launch ``kmeans_canopy_cluster.py`` on the output of
           step 2.

    Args:
        input_path: Input location for steps 1 and 2.
        output_path: Base directory under which numbered outputs are placed.
        num_clusters: Not used by this function.
        cluster_path: Input location for the second assignment job (step 3).
        num_reducers: Not used by this function.

    Returns:
        None.
    """
    def advance_output():
        # Bump the shared module-level counter and name the new output dir.
        global iter_cnt
        iter_cnt += 1
        return '%s/%d' % (output_path, iter_cnt)

    def latest_output():
        # Directory name for the current counter value (no increment).
        return '%s/%d' % (output_path, iter_cnt)

    def launch_assign(source):
        # One canopy-assignment job; argument lists are rebuilt per call.
        hadoopy.launch(in_name=source,
                       out_name=advance_output(),
                       script_path='canopy_cluster_assign.py',
                       cmdenvs=['CANOPY_SOFT_DIST=%s' % soft_dist,
                                'CANOPIES_PKL=' + canopies_pkl],
                       files=canopies_pkl,
                       reducer=None,
                       frozen_path='frozen')

    # Distances are handed to the jobs as strings via environment variables.
    soft_dist = str(4000.)
    hard_dist = str(250.)
    canopies_pkl = 'canopies.pkl'
    clusters_pkl = 'clusters.pkl'

    # Step 1: canopy job, consolidated into canopies.pkl.
    hadoopy.freeze(script_path='canopy_cluster.py',
                   shared_libs=SHARED_LIBS,
                   modules=['vitrieve_algorithms', 'nn_l2sqr_c'],
                   remove_dir=True)
    # NOTE(review): this ships 'nn_l2sqr.py' while NN_MODULE names
    # 'nn_l2sqr_c' (step 4 ships 'nn_l2sqr_c.py') - confirm this is intended.
    hadoopy.launch(in_name=input_path,
                   out_name=advance_output(),
                   script_path='canopy_cluster.py',
                   files='nn_l2sqr.py',
                   cmdenvs=['NN_MODULE=nn_l2sqr_c',
                            'CANOPY_SOFT_DIST=%s' % soft_dist,
                            'CANOPY_HARD_DIST=%s' % hard_dist],
                   frozen_path='frozen')
    consolidate_clusters(latest_output(), canopies_pkl)

    # Step 2: assignment job over the original input.
    hadoopy.freeze(script_path='canopy_cluster_assign.py', remove_dir=True)
    launch_assign(input_path)
    # From here on, the assigned points are the input for the final job.
    input_path = latest_output()

    # Step 3: assignment job over cluster_path, consolidated into
    # clusters.pkl.
    launch_assign(cluster_path)
    consolidate_canopy_clusters(latest_output(), clusters_pkl)

    # Step 4: k-means job over the output of step 2.
    hadoopy.freeze(script_path='kmeans_canopy_cluster.py',
                   shared_libs=SHARED_LIBS,
                   modules=['vitrieve_algorithms', 'nn_l2sqr_c'],
                   remove_dir=True)
    hadoopy.launch(in_name=input_path,
                   out_name=advance_output(),
                   script_path='kmeans_canopy_cluster.py',
                   cmdenvs=['CLUSTERS_PKL=%s' % clusters_pkl,
                            'CANOPY_SOFT_DIST=%s' % soft_dist,
                            'NN_MODULE=nn_l2sqr_c'],
                   files=['nn_l2sqr_c.py', clusters_pkl],
                   frozen_path='frozen')