Example No. 1
def getStatsForSSAMR():
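    # For each of 10 batches of 50,000 messages: write every pair of user documents
    # as a CJSONProtocol record, push the file to HDFS, run the
    # StreamSimilarityAggregationMR job on Hadoop, then log the iteration time
    # (and cumulative message count) as JSON.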
    batchSize = 50000
    default_experts_twitter_stream_settings['ssa_threshold'] = 0.75
    for id in range(0, 10):
        ts = time.time()
        fileName = time_to_process_points + '%s/%s' % (batchSize, id)
        iteration_file = '%s_%s' % (batchSize, id)
        print 'Generating data for ', iteration_file
        with open(iteration_file, 'w') as fp:
            [
                fp.write(CJSONProtocol.write('x', [doc1, doc2]) + '\n') for
                doc1, doc2 in combinations(iterateUserDocuments(fileName), 2)
            ]
        os.system('hadoop fs -put %s %s' % (iteration_file, hdfsUnzippedPath))
        StreamSimilarityAggregationMR.estimate(
            hdfsUnzippedPath + '/%s' % iteration_file,
            args='-r hadoop'.split(),
            jobconf={
                'mapred.map.tasks': 25,
                'mapred.task.timeout': 7200000,
                'mapred.reduce.tasks': 25
            })

        os.system('hadoop fs -rmr %s' %
                  (hdfsUnzippedPath + '/%s' % iteration_file))
        os.system('rm -rf %s' % iteration_file)
        iteration_data = {
            'iteration_time': time.time() - ts,
            'type': 'ssa_mr',
            'number_of_messages': batchSize * (id + 1),
            'batch_size': batchSize
        }
        FileIO.writeToFileAsJson(iteration_data, ssa_mr_stats_file)
def generateDocsForSSAMR():
    # For each stream length, dump all pairs of user documents as CJSONProtocol
    # records, gzip the file, and push it to HDFS for the SSA MapReduce job.
    for length in [1000000, 1100000, 1200000]:
        tf = TweetsFile(length, **experts_twitter_stream_settings)
        iteration_file = clustering_quality_experts_ssa_mr_folder + str(length)
        print "Generating data for ", iteration_file
        with open(iteration_file, "w") as fp:
            [
                fp.write(CJSONProtocol.write("x", [doc1, doc2]) + "\n")
                for doc1, doc2 in combinations(tf._iterateUserDocuments(), 2)
            ]
        os.system("gzip %s" % iteration_file)
        print "hadoop fs -put %s.gz %s" % (iteration_file, hdfsPath)
        os.system("hadoop fs -put %s.gz %s" % (iteration_file, hdfsPath))
Example No. 5
def createFileForNextIteration(data):
    with open(iteration_file, 'w') as fp:
        [fp.write(CJSONProtocol.write(k, v) + '\n') for k, v in data.iteritems()]
Example No. 6
def createFileForNextIteration(data):
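    # Write each (key, value) pair from data as one CJSONProtocol-encoded record per line.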
    with open(iteration_file, 'w') as fp:
        [
            fp.write(CJSONProtocol.write(k, v) + '\n')
            for k, v in data.iteritems()
        ]
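
A note on the pattern shared by all of these examples: the list comprehension wrapped around fp.write(...) is used purely for its side effect and builds a throwaway list of None values; a plain for loop expresses the same thing more idiomatically. A minimal sketch of the equivalent loop, reusing the names from the examples above (writePairsAsRecords and pairs are illustrative only, with pairs standing in for the combinations(...) iterator):

def writePairsAsRecords(pairs, iteration_file):
    # Hypothetical helper: same behavior as the comprehensions above, writing
    # one CJSONProtocol record per line with an ordinary loop.
    with open(iteration_file, 'w') as fp:
        for doc1, doc2 in pairs:
            fp.write(CJSONProtocol.write('x', [doc1, doc2]) + '\n')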