Esempio n. 1
0
def random_cluster(input_path, output_path, num_clusters, cluster_path, num_reducers):
    """Freeze ``random_cluster.py`` and run it as one Hadoop job.

    The job reads ``input_path`` and writes to ``<output_path>/<iter_cnt>``,
    where ``iter_cnt`` is the module-level iteration counter; the counter is
    advanced by one for this run.  ``num_clusters`` is handed to the script
    through the ``NUM_CLUSTERS`` environment variable.

    ``cluster_path`` and ``num_reducers`` are accepted but not used here.
    """
    global iter_cnt
    script = 'random_cluster.py'

    hadoopy.freeze(script_path=script,
                   shared_libs=SHARED_LIBS,
                   modules=['vitrieve_algorithms'],
                   remove_dir=True)

    # Advance the counter only once freezing has succeeded, so a failed
    # freeze does not consume an output directory number.
    iter_cnt += 1
    job_output = '%s/%d' % (output_path, iter_cnt)
    job_env = ['NUM_CLUSTERS=%d' % (num_clusters)]

    # The combiner stays disabled for this job (combiner=True was left
    # commented out by the original author).
    hadoopy.launch(in_name=input_path,
                   out_name=job_output,
                   cmdenvs=job_env,
                   script_path=script,
                   frozen_path='frozen')
Esempio n. 2
0
def canopy(input_path, output_path, num_clusters, cluster_path, num_reducers):
    """Run the canopy / k-means pipeline as a sequence of Hadoop jobs.

    Every job writes to a fresh ``<output_path>/<iter_cnt>`` directory, where
    ``iter_cnt`` is the module-level iteration counter (incremented once per
    launched job).  The steps are:

    1. ``canopy_cluster.py`` over ``input_path``; the job output directory is
       passed to ``consolidate_clusters`` together with ``canopies.pkl``.
    2. ``canopy_cluster_assign.py`` (no reducer) over ``input_path``; this
       job's output directory becomes the input of step 4.
    3. ``canopy_cluster_assign.py`` (no reducer) over ``cluster_path``; the
       output directory is passed to ``consolidate_canopy_clusters`` together
       with ``clusters.pkl``.
    4. ``kmeans_canopy_cluster.py`` over the output of step 2.

    ``num_clusters`` and ``num_reducers`` are accepted but not used here.
    """
    def inc_path():
        # Advance the shared counter and return the next output directory.
        global iter_cnt
        iter_cnt += 1
        return '%s/%d' % (output_path, iter_cnt)

    def prev_path():
        # Output directory of the most recently launched job.
        return '%s/%d' % (output_path, iter_cnt)

    # Canopy distance thresholds, passed to the scripts as strings.
    soft = str(4000.)
    hard = str(250.)

    canopies_pkl = 'canopies.pkl'
    clusters_pkl = 'clusters.pkl'
    soft_env = 'CANOPY_SOFT_DIST=%s' % (soft)

    # --- Step 1: build canopies -------------------------------------------
    hadoopy.freeze(script_path='canopy_cluster.py',
                   shared_libs=SHARED_LIBS,
                   modules=['vitrieve_algorithms', 'nn_l2sqr_c'],
                   remove_dir=True)
    # `files` is always passed as a list (never a bare string) so that it is
    # treated as a collection of paths, matching the final launch below.
    # NOTE(review): this ships 'nn_l2sqr.py' while NN_MODULE and the frozen
    # modules name 'nn_l2sqr_c' (and the last job ships 'nn_l2sqr_c.py') --
    # confirm which file is intended.
    hadoopy.launch(in_name=input_path,
                   out_name=inc_path(),
                   script_path='canopy_cluster.py',
                   files=['nn_l2sqr.py'],
                   cmdenvs=['NN_MODULE=nn_l2sqr_c',
                            soft_env,
                            'CANOPY_HARD_DIST=%s' % (hard)],
                   frozen_path='frozen')
    consolidate_clusters(prev_path(), canopies_pkl)

    # --- Step 2: assign input points to canopies (no reducer) -------------
    assign_env = (soft_env, 'CANOPIES_PKL=' + canopies_pkl)
    hadoopy.freeze(script_path='canopy_cluster_assign.py',
                   remove_dir=True)
    hadoopy.launch(in_name=input_path,
                   out_name=inc_path(),
                   script_path='canopy_cluster_assign.py',
                   cmdenvs=list(assign_env),
                   files=[canopies_pkl],
                   reducer=None,
                   frozen_path='frozen')
    # The assigned points feed the final k-means job.
    input_path = prev_path()

    # --- Step 3: assign the initial clusters to canopies (no reducer) -----
    # Reuses the frozen canopy_cluster_assign.py from step 2.
    hadoopy.launch(in_name=cluster_path,
                   out_name=inc_path(),
                   script_path='canopy_cluster_assign.py',
                   cmdenvs=list(assign_env),
                   files=[canopies_pkl],
                   reducer=None,
                   frozen_path='frozen')
    consolidate_canopy_clusters(prev_path(), clusters_pkl)

    # --- Step 4: k-means over the canopy-assigned points -------------------
    hadoopy.freeze(script_path='kmeans_canopy_cluster.py',
                   shared_libs=SHARED_LIBS,
                   modules=['vitrieve_algorithms', 'nn_l2sqr_c'],
                   remove_dir=True)
    hadoopy.launch(in_name=input_path,
                   out_name=inc_path(),
                   script_path='kmeans_canopy_cluster.py',
                   cmdenvs=['CLUSTERS_PKL=%s' % (clusters_pkl),
                            soft_env,
                            'NN_MODULE=nn_l2sqr_c'],
                   files=['nn_l2sqr_c.py', clusters_pkl],
                   frozen_path='frozen')