sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", AWS_ACCESS_KEY_ID)
sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", AWS_SECRET_ACCESS_KEY)

for item in d:
    name = item
    fp = "s3n://bucket-data/" + name
    p = d[item]
    data = read_song_text(sc, fp, p)

    # Build sparse vectors from the data.
    data_sparse_list = data.map(lambda x: (x[0][1], x[1]))
    rdd_1 = data.map(lambda x: (x[0][0], x[1]))

    # Initialize the parameters.
    m, n, b, c = 1000, 1000, 25, 2

    # Create the model for LSH.
    model = lsh.run(data_sparse_list, p, m, n, b, c)
    print("start printing filename %s" % name)

    # Get similar buckets.
    cnt = model.buckets.count()

    # Print the result and the time taken.
    timetaken = time.time() - start_time
    print 'Found %s clusters.' % cnt
    print("--- %s seconds ---" % timetaken)
    tup = cnt, timetaken
    d[name] = cnt, (time.time() - start_time)

    # Write log file.
    f = open('output.txt', 'a')
    mystr = str(time.time())
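# A minimal alternative sketch (not part of the script above): the same s3n
# credentials can be supplied through SparkConf using Spark's "spark.hadoop.*"
# property prefix, which Spark copies into the Hadoop configuration, avoiding
# the private sc._jsc handle. AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY are
# assumed to be defined elsewhere, as in the loop above.
conf = (SparkConf()
        .set("spark.hadoop.fs.s3n.awsAccessKeyId", AWS_ACCESS_KEY_ID)
        .set("spark.hadoop.fs.s3n.awsSecretAccessKey", AWS_SECRET_ACCESS_KEY))
sc = SparkContext(conf=conf)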
help = "Number of times to hash the elements. Larger numbers diversify " + "signatures, increasing likelihood similar vectors will be hashed together. " + "This is also the length of the signature. [DEFAULT: 1000]") parser.add_argument("-b", "--bands", type = int, default = 25, help = "Number of bands. Each band will have (n / b) elements. Larger " + "numbers of elements increase confidence in element similarity. [DEFAULT: 25]") parser.add_argument("-c", "--minbucketsize", type = int, default = 2, help = "Minimum bucket size (0 to disable). Buckets with fewer than this " + "number of elements will be dropped. [DEFAULT: 2]") args = vars(parser.parse_args()) sc = SparkContext(conf = SparkConf()) # Read the input data. print now(), 'Starting' raw_lines, data = read_text(sc, args['input']) p = 65537 m, n, b, c = args['bins'], args['numrows'], args['bands'], args['minbucketsize'] vector_buckets = lsh.run(data, p, m, n, b, c) bucket_ids = vector_buckets.map(lambda (vector, bucket): bucket).distinct() print now(), 'Found %s clusters.' % bucket_ids.count() bucket_vectors = vector_buckets.map(lambda (vector, bucket): (bucket, vector)).groupByKey() for (bucket, vectors) in bucket_vectors.collect(): print 'Bucket %s' % bucket for vector in vectors: print '\tDocument %s: %s ...' % (vector, raw_lines[vector][:100]) print '*' * 40
parser = argparse.ArgumentParser(description = 'Spark LSH', epilog = 'lol lsh',
    add_help = 'How to use', prog = 'python driver.py <arguments>')
parser.add_argument("-i", "--input", required = True,
    help = "Input directory of text files.")

# Optional parameters.
parser.add_argument("-m", "--bins", type = int, default = 1000,
    help = "Number of bins into which to hash the data. Smaller numbers " +
        "increase collisions, producing larger clusters. [DEFAULT: 1000]")
parser.add_argument("-n", "--numrows", type = int, default = 1000,
    help = "Number of times to hash the elements. Larger numbers diversify " +
        "signatures, increasing likelihood similar vectors will be hashed together. " +
        "This is also the length of the signature. [DEFAULT: 1000]")
parser.add_argument("-b", "--bands", type = int, default = 25,
    help = "Number of bands. Each band will have (n / b) elements. Larger " +
        "numbers of elements increase confidence in element similarity. [DEFAULT: 25]")
parser.add_argument("-c", "--minbucketsize", type = int, default = 2,
    help = "Minimum bucket size (0 to disable). Buckets with fewer than this " +
        "number of elements will be dropped. [DEFAULT: 2]")

args = vars(parser.parse_args())
sc = SparkContext(conf = SparkConf())

# Read the input data.
data = read_text(sc, args['input'])

p = 65537
m, n, b, c = args['bins'], args['numrows'], args['bands'], args['minbucketsize']

model = lsh.run(data, p, m, n, b, c)
print 'Found %s clusters.' % model.buckets.count()
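# Example invocation (a sketch; the input path 'data/text_files' is
# hypothetical, the flags and defaults come from the parser above):
#
#   spark-submit driver.py -i data/text_files -m 1000 -n 1000 -b 25 -c 2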
help="Number of times to hash the elements. Larger numbers diversify " + "signatures, increasing likelihood similar vectors will be hashed together. " + "This is also the length of the signature. [DEFAULT: 1000]") parser.add_argument( "-b", "--bands", type=int, default=25, help="Number of bands. Each band will have (n / b) elements. Larger " + "numbers of elements increase confidence in element similarity. [DEFAULT: 25]" ) parser.add_argument( "-c", "--minbucketsize", type=int, default=2, help="Minimum bucket size (0 to disable). Buckets with fewer than this " + "number of elements will be dropped. [DEFAULT: 2]") args = vars(parser.parse_args()) sc = SparkContext(conf=SparkConf()) # Read the input data. data = read_text(sc, args['input']) p = 65537 m, n, b, c = args['bins'], args['numrows'], args['bands'], args[ 'minbucketsize'] model = lsh.run(data, p, m, n, b, c) print 'Found %s clusters.' % model.buckets.count()