Example #1
0
def benchmark_naive_bayes(ctx, timer):
  
  print "#worker:", ctx.num_workers
  N = 100000 * ctx.num_workers
  D = 128
  
  # create data
  data = expr.randint(N, D, low=0, high=D, tile_hint=(N/ctx.num_workers, D))
  labels = expr.eager(expr.shuffle(data, _init_label_mapper))
    
  #util.log_warn('data:%s, label:%s', data.glom(), labels.glom())   
  
  util.log_warn('begin train')
  t1 = datetime.now()
  model = fit(data, labels, D)
  t2 = datetime.now()
  util.log_warn('train time:%s ms', millis(t1,t2))

  correct = 0
  for i in range(10):
    new_data = expr.randint(1, D, low=0, high=D, tile_hint=(1, D))
    new_label = predict(model, new_data)
    #print 'point %s, predict %s' % (new_data.glom(), new_label)
   
    new_data = new_data.glom()
    if np.isclose(new_data[0, new_label], np.max(new_data)):
      correct += 1
  print 'predict precision:', correct * 1.0 / 10
Example #2
0
def benchmark_naive_bayes(ctx, timer):
  
  print "#worker:", ctx.num_workers
  #N = 100000 * ctx.num_workers
  N = 10000 * 64
  D = 128
  
  # create data
  data = expr.randint(N, D, low=0, high=D, tile_hint=(N, D/ctx.num_workers))
  labels = expr.shuffle(expr.ndarray((data.shape[0], 1), dtype=np.int), _init_label_mapper,
                        kw={'data': data}, shape_hint=(data.shape[0], 1), 
                        cost_hint={hash(data):{'00': 0, '10': np.prod(data.shape)}}
                       )
    
  #util.log_warn('data:%s, label:%s', data.glom(), labels.glom())   
  
  util.log_warn('begin train')
  t1 = datetime.now()
  model = fit(data, labels, D)
  t2 = datetime.now()
  util.log_warn('train time:%s ms', millis(t1,t2))

  correct = 0
  for i in range(10):
    new_data = expr.randint(1, D, low=0, high=D, tile_hint=(1, D))
    new_label = predict(model, new_data)
    #print 'point %s, predict %s' % (new_data.glom(), new_label)
   
    new_data = new_data.glom()
    if np.isclose(new_data[0, new_label], np.max(new_data)):
      correct += 1
  print 'predict precision:', correct * 1.0 / 10
Example #3
0
def benchmark_lda(ctx, timer):

    print "#worker:", ctx.num_workers
    NUM_TERMS = 160
    NUM_DOCS = 200 * ctx.num_workers
    #NUM_DOCS = 10 * 64

    # create data
    # NUM_TERMS = 41807
    # NUM_DOCS = 21578
    # terms_docs_matrix = from_file("/scratch/cq/numpy_dense_matrix", sparse = False, tile_hint = (NUM_TERMS, int((NUM_DOCS + ctx.num_workers - 1) / ctx.num_workers))).evaluate()

    terms_docs_matrix = expr.randint(NUM_TERMS, NUM_DOCS, low=0, high=100)

    max_iter = 3
    k_topics = 16

    t1 = datetime.now()
    doc_topics, topic_term_count = learn_topics(terms_docs_matrix,
                                                k_topics,
                                                max_iter=max_iter)
    doc_topics.optimized().evaluate()
    topic_term_count.optimized().evaluate()
    t2 = datetime.now()
    time_cost = millis(t1, t2)
    util.log_warn('total_time:%s ms, train time per iteration:%s ms',
                  time_cost, time_cost / max_iter)
Example #4
0
def benchmark_als(ctx, timer):
  print "#worker:", ctx.num_workers
  #USER_SIZE = 400 * ctx.num_workers
  USER_SIZE = 200 * 64
  MOVIE_SIZE = 12800
  num_features = 20
  num_iter = 5
  
  A = expr.eager(expr.randint(USER_SIZE, MOVIE_SIZE, low=0, high=5, tile_hint=(USER_SIZE/ctx.num_workers, MOVIE_SIZE)))
  
  util.log_warn('begin als!')
  t1 = datetime.now()
  U, M = als(A, implicit_feedback=True, num_features=num_features, num_iter=num_iter)
  U.force()
  M.force()
  t2 = datetime.now()
  cost_time = millis(t1,t2)
  print "total cost time:%s ms, per iter cost time:%s ms" % (cost_time, cost_time/num_iter)
Example #5
0
def benchmark_als(ctx, timer):
  print "#worker:", ctx.num_workers
  #USER_SIZE = 100 * ctx.num_workers
  USER_SIZE = 320
  #USER_SIZE = 200 * 64
  MOVIE_SIZE = 12800
  num_features = 20
  num_iter = 2
  
  A = expr.randint(USER_SIZE, MOVIE_SIZE, low=0, high=5, tile_hint=(USER_SIZE, util.divup(MOVIE_SIZE, ctx.num_workers)))
  #A = expr.randint(USER_SIZE, MOVIE_SIZE, low=0, high=5)
  
  util.log_warn('begin als!')
  t1 = datetime.now()
  U, M = als(A, implicit_feedback=True, num_features=num_features, num_iter=num_iter)
  U.force()
  M.force()
  t2 = datetime.now()
  cost_time = millis(t1,t2)
  print "total cost time:%s ms, per iter cost time:%s ms" % (cost_time, cost_time/num_iter)
Example #6
0
def benchmark_lda(ctx, timer):
  
  print "#worker:", ctx.num_workers
  NUM_TERMS = 160
  NUM_DOCS = 200 * ctx.num_workers
  #NUM_DOCS = 10 * 64

  # create data
  # NUM_TERMS = 41807
  # NUM_DOCS = 21578
  # terms_docs_matrix = from_file("/scratch/cq/numpy_dense_matrix", sparse = False, tile_hint = (NUM_TERMS, int((NUM_DOCS + ctx.num_workers - 1) / ctx.num_workers))).force()
  
  terms_docs_matrix = expr.randint(NUM_TERMS, NUM_DOCS, low=0, high=100)
  
  max_iter = 3
  k_topics = 16
  
  t1 = datetime.now()
  doc_topics, topic_term_count = learn_topics(terms_docs_matrix, k_topics, max_iter=max_iter)
  doc_topics.optimized().force()
  topic_term_count.optimized().force()
  t2 = datetime.now()
  time_cost = millis(t1,t2)
  util.log_warn('total_time:%s ms, train time per iteration:%s ms', time_cost, time_cost/max_iter)