def monitor(event):
    st = datetime.datetime.now()
    stt = time.time()
    nworker = event['nworker']
    pid = event['pid']
    if 'roundtime' not in event:
        event['roundtime'] = 250
    if 'waittime' not in event:
        event['waittime'] = event['roundtime'] * 2 / 3
    timer = s3func.timer([event['waittime'], event['roundtime']])
    s3 = boto3.client('s3')
    s32 = boto3.resource('s3')
    # The control file 'flag/work_pt' acts as a kill switch: if it has been
    # deleted, the monitor stops.
    flag = s3func.s3_download_file(s3, AWS_S3_bucket, 'flag/work_pt', '/tmp/work', 0, 1, 0)
    if flag == 0:
        print('monitor terminated!!!!!!')
        return
    s3.put_object(Bucket=AWS_S3_bucket, Body=str(st), Key='timestamp/timestamp_monitor')
    finished = [0 for i in range(nworker)]
    timer.local_start(0)
    bresult = 0.0
    bpos = 0
    # Poll S3 until every worker has reported a result; keep the best accuracy seen.
    while 1:
        if sum(finished) == nworker:
            break
        for now in range(nworker):
            if finished[now] == 0:
                tresult = timer.query()
                if tresult[0] == 1:
                    return 0
                flag = s3func.s3_download_file(
                    s3, AWS_S3_bucket,
                    'timestamp/timestamp_trainresult_' + str(pid) + '_' + str(now),
                    '/tmp/result', 0, 1, 0)
                if flag == 1:
                    finished[now] = 1
                    with open('/tmp/result', 'r') as f:
                        temp = f.read()
                    r = float(temp)
                    if r > bresult:
                        bresult = r
                        bpos = now
    s3.put_object(Bucket=AWS_S3_bucket, Body=str([bresult, bpos]),
                  Key='timestamp/timestamp_final_result')
    et = time.time()
    st = s3func.s3_read_file_v2(s3, AWS_S3_bucket, 'timestamp/timestamp_startup.tsp', 0, 1, 0)
    filerecord = s3func.s3_read_file_v2(s3, AWS_S3_bucket, 'results/timecost', 0, 1, 0)
    if filerecord == 0:
        filerecord = ''
    filerecord += str(et - float(st)) + '\n'
    s3.put_object(Bucket=AWS_S3_bucket, Body=filerecord, Key='results/timecost')
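
# ---------------------------------------------------------------------------
# The functions in this file depend on s3func.timer only through three calls:
# s3func.timer([waittime, roundtime]), timer.local_start(i), and timer.query().
# Below is a minimal sketch of a compatible helper. The field layout of
# query() returning [timeout_flag, elapsed_seconds] is an assumption inferred
# from how monitor() and train() consume the result; it is not the actual
# s3func implementation.
class _TimerSketch(object):
    def __init__(self, limits):
        self.limits = limits                     # e.g. [waittime, roundtime]
        self.starts = [time.time()] * len(limits)

    def local_start(self, i):
        # restart the i-th clock
        self.starts[i] = time.time()

    def query(self):
        # [1, elapsed] once the first limit is exceeded, else [0, elapsed]
        elapsed = time.time() - self.starts[0]
        return [1 if elapsed > self.limits[0] else 0, elapsed]
# ---------------------------------------------------------------------------
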
def train(event):
    st = datetime.datetime.now()
    stt = time.time()
    tcount = time.time()
    if 'roundtime' not in event:
        event['roundtime'] = 250
    tend = event['roundtime']
    ns = event['ns']
    pos = event['pos']
    mlayers = event['mlayers']
    maxiter = event['maxiter']
    nowiter = event['nowiter']
    funcname = event['funcname']
    if 'batchnumber' not in event:
        event['batchnumber'] = 1
    bn = event['batchnumber']
    pid = event['pid']
    if 'testtime' not in event:
        event['testtime'] = 10
    if 'waittime' not in event:
        event['waittime'] = tend * 2 / 3
    if 'learningrate' not in event:
        event['learningrate'] = 0.1
    waittime = event['waittime']
    timer = s3func.timer([waittime, tend])
    if 'round' not in event:
        event['round'] = 0
    else:
        event['round'] += 1
    rounditer = event['rounditer']
    s3 = boto3.client('s3')
    s32 = boto3.resource('s3')
    client = boto3.client('lambda', region_name=AWS_region)
    if nowiter == 0 and pos == 0:
        s3.put_object(Bucket=AWS_S3_bucket, Body=str(stt),
                      Key='timestamp/timestamp_train_start_' + str(pid))
    response = client.get_function(FunctionName=funcname)
    filerecord = s3func.s3_read_file_v2(s3, AWS_S3_bucket, 'results/result', 0, 1, 0)
    if filerecord == 0:
        filerecord = ''
    filerecord += ('=====' + ' merge: ' + str(mlayers) + ' samples: ' + str(ns)
                   + ' memory: ' + str(event['memory']) + ' testtime left :'
                   + str(event['testtime']) + ' starttime: ' + str(st) + '\n')
    filerecord += '=====' + str(stt) + '\n'
    data = ('train round ' + str(event['round']) + ', round time '
            + str(event['roundtime']) + ', start at ' + str(st)
            + ' ##' + str(time.time()) + '\n')
    data += ('info: pos ' + str(pos) + ', memory '
             + str(response['Configuration']['MemorySize'])
             + ', mlayers ' + str(mlayers) + ', ns ' + str(ns) + '\n')
    print('=' * 5, 'train node', pos, '=' * 5, 'train phase start')
    # Shard the ns samples across the mlayers[-1] workers as evenly as
    # possible: the first (ns % workers) workers take one extra sample.
    split = 500
    base = int(ns / mlayers[-1])
    remin = ns % mlayers[-1]
    sn = 0
    for n in range(pos):
        sn += base
        if remin:
            sn += 1
            remin -= 1
    en = sn + base
    if remin:
        en += 1
    print('=' * 5, 'train node', pos, '=' * 5, 'read samples from', sn, 'to', en)
    train_x = []
    train_y = []
    sfile = int(sn / split)
    efile = int((en - 1) / split)
    print('=' * 5, 'train node', pos, '=' * 5, 'read files from', sfile, 'to', efile)
    data += ('start up time: ' + str(time.time() - stt) + ' ##' + str(time.time())
             + ' ##' + str(stt) + '--' + str(time.time()) + '\n')
    s3.put_object(Bucket=AWS_S3_bucket, Body=data,
                  Key='timestamp/timestamp_train_' + str(pid) + '_' + str(pos)
                      + '_' + str(event['round']) + '.tsp')
    # ========================================= read ========================================
    stt = time.time()
    if os.path.exists('/tmp/samples_save'):
        # A warm Lambda container keeps /tmp, so the shard may already be cached.
        print('=' * 5, 'train node', pos, '=' * 5, 'found samples!!!')
        with open('/tmp/samples_save', 'rb') as f:
            temp = pickle.load(f)
        # os.remove('/tmp/samples_save_'+str(pos))
        train_x = temp['data']
        train_y = temp['label']
        data += ('found samples!!! time: ' + str(time.time() - stt) + ' ##'
                 + str(time.time()) + ' ##' + str(stt) + '--' + str(time.time()) + '\n')
    else:
        print('=' * 5, 'train node', pos, '=' * 5, 'samples not found, downloading')
        for now in range(sfile, efile + 1):
            print('downloading', now, 'from range', sfile, efile + 1)
            flag = s3func.s3_download_file(s3, AWS_S3_bucket,
                                           'data/samples_cifar_' + str(now),
                                           '/tmp/samples', 0, 1, 0)
            if flag == 0:
                print('=' * 5, 'train node', pos, '=' * 5,
                      'ERROR!!!: fail to read sample file:', now)
            with open('/tmp/samples', 'rb') as f:
                temp = pickle.load(f)
            sread = max([split * now, sn]) - split * now
            eread = min([split * (now + 1), en]) - split * now
            if train_x == []:
                train_x = temp['data'][sread:eread]
                train_y = temp['label'][sread:eread]
            else:
                # Append to the accumulated shard. The original appended to
                # temp['data'] instead, which discarded everything read so far.
                train_x = np.append(train_x, temp['data'][sread:eread], axis=0)
                train_y = np.append(train_y, temp['label'][sread:eread], axis=0)
        if os.path.exists('/tmp/samples'):
            os.remove('/tmp/samples')
        with open('/tmp/samples_save', 'wb') as f:
            pickle.dump({'data': train_x, 'label': train_y}, f)
        data += ('read from ' + str(sfile) + ' to ' + str(efile) + ' time: '
                 + str(time.time() - stt) + ' ##' + str(time.time()) + ' ##'
                 + str(stt) + '--' + str(time.time()) + '\n')
    data += 'samples length: ' + str(len(train_x)) + '\n'
    if nowiter == 0 and pos == 0:
        s3.put_object(Bucket=AWS_S3_bucket, Body=str(time.time()),
                      Key='timestamp/timestamp_train_start_' + str(pid))
    # ========================================= read ========================================
    # ========================================= initialize ==================================
    stt = time.time()
    x, y, output, global_step, y_pred_cls, params = model()
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=output, labels=y))
    # optimizer = tf.train.RMSPropOptimizer(learning_rate=1e-3).minimize(loss, global_step=global_step)
    # Build the optimizer once, outside the loop: the original rebuilt it every
    # iteration, which keeps adding ops to the graph. The learning rate is
    # constant across iterations, so this is behavior-preserving.
    optimizer = tf.train.GradientDescentOptimizer(
        learning_rate=event['learningrate']).minimize(loss, global_step=global_step)
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    data += ('training initialize time: ' + str(time.time() - stt) + ' ##'
             + str(stt) + '--' + str(time.time()) + '\n')
    # ========================================= initialize ==================================
    # ========================================= LOOP start ==================================
    avgitertime = 0.0
    avgitertimereal = 0.0
    minitertimereal = 100000.0
    timerecord = []
    smt = 0.0
    while nowiter < maxiter:
        itertime = [0.0]
        stiter = time.time()
        flag = s3func.s3_download_file(s3, AWS_S3_bucket, 'flag/work_' + str(pid),
                                       '/tmp/work', 0, 1, 0)
        if flag == 0:
            print('=' * 5, 'train node', pos, '=' * 5, 'Abandon!!!! pid:', pid)
            return
        stt = time.time()
        print('+' * 5, 'train node', pos, 'pid', pid, '+' * 5,
              'now start iteration', nowiter)
        print('=' * 5, 'train node', pos, '=' * 5, 'now start iteration', nowiter)
        stt2 = time.time()
        flag = s3func.s3_download_file_timer(
            s3, AWS_S3_bucket, 'data/modelcifar_' + str(pid) + '_' + str(nowiter),
            '/tmp/model', timer, 0, 0)
        itertime[0] += time.time() - stt2
        data += ('training ' + str(nowiter) + ' model waiting time: '
                 + str(time.time() - stt2) + ' ##' + str(stt2) + '--'
                 + str(time.time()) + '\n')
        if flag == 0:
            if timer.query()[1] > waittime / 4:
                # Ran out of round budget while waiting: checkpoint and exit.
                print('++++++++lambda train', pos, 'at iteration', nowiter,
                      'end at', datetime.datetime.now())
                s3.put_object(Bucket=AWS_S3_bucket, Body=data,
                              Key='timestamp/timestamp_train_' + str(pid) + '_'
                                  + str(pos) + '_' + str(event['round']) + '.tsp')
                event['nowiter'] = nowiter
                return
            else:
                print('=' * 5, 'train node', pos, '=' * 5,
                      'ERROR!!!: fail to read model', nowiter)
                s3.put_object(Bucket=AWS_S3_bucket,
                              Body='fail to read model ' + str(nowiter),
                              Key='error/error_train_' + str(pid) + '_' + str(pos))
                return
        if nowiter >= (event['round'] + 1) * rounditer:
            # Round budget reached: persist state and re-invoke self for the next round.
            if [0, 0] in event['mergepos']:
                s3.put_object(Bucket=AWS_S3_bucket, Body=filerecord, Key='results/result')
            print('++++++++lambda train', pos, 'at iteration', nowiter,
                  'end at', datetime.datetime.now())
            s3.put_object(Bucket=AWS_S3_bucket, Body=data,
                          Key='timestamp/timestamp_train_' + str(pid) + '_' + str(pos)
                              + '_' + str(event['round']) + '.tsp')
            event['nowiter'] = nowiter
            invoke_lambda(client, funcname, event)
            return
        stt2 = time.time()
        with open('/tmp/model', 'rb') as f:
            temp = pickle.load(f)
        if temp[0] == []:
            print('=' * 5, 'train node', pos, '=' * 5,
                  'ERROR!!!: model format wrong', nowiter)
            s3.put_object(Bucket=AWS_S3_bucket,
                          Body='model format wrong ' + str(nowiter),
                          Key='error/error_train_' + str(pid) + '_' + str(pos))
            return
        for i in range(len(temp)):
            sess.run(tf.assign(params[i], temp[i]))
        itertime[0] += time.time() - stt2
        data += ('training ' + str(nowiter) + ' download model time: '
                 + str(time.time() - stt) + ' ##' + str(stt) + '--'
                 + str(time.time()) + '\n')
        stt = time.time()
        bs = len(train_x) // bn
        print('=' * 5, 'train node', pos, '=' * 5, 'train start')
        for b in range(bn):
            i_global, _ = sess.run(
                [global_step, optimizer],
                feed_dict={
                    x: train_x[b * bs:(b + 1) * bs, :],
                    y: train_y[b * bs:(b + 1) * bs, :]
                })
        # ========================= train
        event['learningrate'] = event['learningrate']  # no-op: rate is constant between rounds
        itertime[0] += time.time() - stt
        data += ('training ' + str(nowiter) + ' train time: '
                 + str(time.time() - stt) + ' ##' + str(stt) + '--'
                 + str(time.time()) + '\n')
        stt = time.time()
        pw = []
        for i in range(len(params)):
            pw.append(sess.run(params[i]))
        with open('/tmp/model', 'wb') as f:
            pickle.dump(pw, f)
        print('=' * 5, 'train node', pos, '=' * 5, 'write result as layer',
              len(mlayers) - 1, 'node', pos)
        s32.Bucket(AWS_S3_bucket).upload_file(
            '/tmp/model', 'data/modelcifar_' + str(pid) + '_'
            + str(len(mlayers) - 1) + '_' + str(pos))
        itertime[0] += time.time() - stt
        data += ('training ' + str(nowiter) + ' model write time: '
                 + str(time.time() - stt) + ' ##' + str(stt) + '--'
                 + str(time.time()) + '\n')
        if len(event['mergepos']) > 0:
            mergepos = copy.deepcopy(event['mergepos'])
            thismergepos = mergepos[0]
            del mergepos[0]
            # tempd=merge(mlayers,thismergepos,mergepos,nowiter,timer,max(waittime,tend-time.time()+tcount),itertime,pid)
            smt = time.time()
            tempd = merge(mlayers, thismergepos, mergepos, nowiter, timer,
                          waittime, itertime, pid)
            smt = time.time() - smt
            if tempd == 0:
                return
            elif tempd == 1:
                print('++++++++lambda train', pos, 'at iteration', nowiter,
                      'end at', datetime.datetime.now())
                s3.put_object(Bucket=AWS_S3_bucket, Body=data,
                              Key='timestamp/timestamp_train_' + str(pid) + '_'
                                  + str(pos) + '_' + str(event['round']) + '.tsp')
                event['nowiter'] = nowiter
                return
            else:
                data += tempd
        data += ('training ' + str(nowiter) + ' valid iteration time: '
                 + str(itertime[0]) + '\n')
        print('-' * 5, 'train node', pos, '-' * 5, 'now end iteration', nowiter)
        avgitertime += itertime[0]
        """
        if nowiter>=min(10,maxiter-1) and [0,0] in event['mergepos']:
            s3.put_object(Bucket=AWS_S3_bucket,Body=str(avgitertime/(nowiter+1)), Key='timestamp/timestamp_iteration_'+str(pid)+'.tsp')
        """
        thisitertime = time.time() - stiter
        filerecord += str(time.time()) + '\n'
        # filerecord+=str(time.time()-stiter)+'\n'
        if thisitertime < minitertimereal:
            minitertimereal = thisitertime
        if nowiter >= 2:
            avgitertimereal += time.time() - stiter
            insert_sort(time.time() - stiter, timerecord)
        # filerecord+=str(time.time()-stiter)+'\n'
        # filerecord+=str(smt)+'\n'
        if nowiter >= min(10, maxiter - 1) and [0, 0] in event['mergepos']:
            pass  # stat uploads disabled in this variant
            # s3.put_object(Bucket=AWS_S3_bucket,Body=str(timerecord), Key='timestamp/timestamp_iteration_each_'+str(pid)+'.tsp')
            # s3.put_object(Bucket=AWS_S3_bucket,Body=str(avg_no_abnormal(timerecord,2)), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
            # s3.put_object(Bucket=AWS_S3_bucket,Body=str(find_median(timerecord)), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
            # s3.put_object(Bucket=AWS_S3_bucket,Body=str(avgitertimereal/(nowiter-2+1)), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
            # s3.put_object(Bucket=AWS_S3_bucket,Body=str(minitertimereal), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
        nowiter += 1
    if [0, 0] in event['mergepos']:
        s3.put_object(Bucket=AWS_S3_bucket, Body=filerecord, Key='results/result')
    """
    event['testtime']-=1
    if event['testtime']>0:
        inputs={'state':0,'mod':0,'slayers':[],'mlayers':event['mlayers'],'layers':[100,100,100,100,100],'pos':[0,0],'ns':1000000,'maxiter':10,'nowiter':0,'roundtime':250}
        inputs['testtime']=event['testtime']
        invoke_lambda(client,'testfunc',inputs)
        time.sleep(10)
    """
    if [0, 0] in event['mergepos']:
        s3.put_object(Bucket=AWS_S3_bucket, Body=str(time.time()),
                      Key='timestamp/timestamp_train_end_' + str(pid))
    s3.put_object(Bucket=AWS_S3_bucket, Body=data,
                  Key='timestamp/timestamp_train_' + str(pid) + '_' + str(pos)
                      + '_' + str(event['round']) + '.tsp')
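
# ---------------------------------------------------------------------------
# The even-sharding arithmetic used above (deriving sn/en from ns,
# mlayers[-1], and pos) is easy to get wrong by one; here is a minimal
# standalone sketch with the same logic, under the assumption that worker
# `pos` takes the pos-th contiguous block (the helper name is hypothetical):
def _shard_range(ns, nworker, pos):
    # the first (ns % nworker) workers receive one extra sample
    base, extra = divmod(ns, nworker)
    sn = pos * base + min(pos, extra)
    en = sn + base + (1 if pos < extra else 0)
    return sn, en
# e.g. _shard_range(10, 3, 0) == (0, 4); _shard_range(10, 3, 2) == (7, 10)
# ---------------------------------------------------------------------------
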
def train(event):
    st = datetime.datetime.now()
    stt = time.time()
    tcount = time.time()
    if 'roundtime' not in event:
        event['roundtime'] = 250
    tend = event['roundtime']
    ns = event['ns']
    pos = event['pos']
    layers = event['layers']
    lr = event['lr']
    if 'batchnumber' not in event:
        event['batchnumber'] = 1
    pid = event['pid']
    if 'testtime' not in event:
        event['testtime'] = 10
    if 'waittime' not in event:
        event['waittime'] = tend * 2 / 3
    waittime = event['waittime']
    timer = s3func.timer([waittime, tend])
    # waittime=tend/4
    if 'round' not in event:
        event['round'] = 0
    else:
        event['round'] += 1
    rounditer = event['rounditer']
    s3 = boto3.client('s3')
    s32 = boto3.resource('s3')
    flag = s3func.s3_download_file(s3, AWS_S3_bucket, 'flag/work_pt', '/tmp/work', 0, 1, 0)
    if flag == 0:
        print('terminated!!!!!!')
        return
    client = boto3.client('lambda', region_name=AWS_region)
    filerecord = s3func.s3_read_file_v2(s3, AWS_S3_bucket, 'results/result', 0, 1, 0)
    if filerecord == 0:
        filerecord = ''
    filerecord += '=====' + ' starttime: ' + str(st) + '\n'
    filerecord += '=====' + str(stt) + '\n'
    data = ('train round ' + str(event['round']) + ', round time '
            + str(event['roundtime']) + ', start at ' + str(st)
            + ' ##' + str(time.time()) + '\n')
    data += 'info: pos ' + str(pos) + '\n'
    print('=' * 5, 'train node', pos, '=' * 5, 'train phase start')
    data += ('start up time: ' + str(time.time() - stt) + ' ##' + str(time.time())
             + ' ##' + str(stt) + '--' + str(time.time()) + '\n')
    s3.put_object(Bucket=AWS_S3_bucket, Body=data,
                  Key='timestamp/timestamp_train_' + str(pid) + '_' + str(pos)
                      + '_' + str(event['round']))
    # ========================================= read ========================================
    stt = time.time()
    print('=' * 5, 'train node', pos, '=' * 5, 'downloading samples')
    flag = s3func.s3_download_file(s3, AWS_S3_bucket, 'data/samples_mnist_x',
                                   '/tmp/mnist_x', 0, 1, 0)
    if flag == 0:
        print('=' * 5, 'train node', pos, '=' * 5, 'ERROR!!!: fail to read sample file x')
    flag = s3func.s3_download_file(s3, AWS_S3_bucket, 'data/samples_mnist_y',
                                   '/tmp/mnist_y', 0, 1, 0)
    if flag == 0:
        print('=' * 5, 'train node', pos, '=' * 5, 'ERROR!!!: fail to read sample file y')
    flag = s3func.s3_download_file(s3, AWS_S3_bucket, 'data/samples_mnist_test_x',
                                   '/tmp/mnist_test_x', 0, 1, 0)
    if flag == 0:
        print('=' * 5, 'train node', pos, '=' * 5, 'ERROR!!!: fail to read test file x')
    flag = s3func.s3_download_file(s3, AWS_S3_bucket, 'data/samples_mnist_test_y',
                                   '/tmp/mnist_test_y', 0, 1, 0)
    if flag == 0:
        print('=' * 5, 'train node', pos, '=' * 5, 'ERROR!!!: fail to read test file y')
    train_x = extract_data('/tmp/mnist_x', 60000)
    train_y = extract_labels('/tmp/mnist_y', 60000)
    test_x = extract_data('/tmp/mnist_test_x', 10000)
    test_y = extract_labels('/tmp/mnist_test_y', 10000)
    train_x = train_x.reshape([60000, 28 * 28])
    test_x = test_x.reshape([10000, 28 * 28])
    data += 'samples length: ' + str(len(train_x)) + '\n'
    # ========================================= read ========================================
    # ========================================= initialize ==================================
    stt = time.time()
    outputs, x, y, labels, train_op, weights, biases = model(layers, lr)
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    data += ('training initialize time: ' + str(time.time() - stt) + ' ##'
             + str(stt) + '--' + str(time.time()) + '\n')
    # ========================================= initialize ==================================
    # ========================================= LOOP start ==================================
    num_iterations = event['maxiter']
    bn = event['batchnumber']
    alltime = 0.0
    for it in range(num_iterations):
        st = time.time()
        bs = len(train_x) // bn
        for b in range(bn):
            batch_xs = train_x[b * bs:(b + 1) * bs]
            batch_ys = train_y[b * bs:(b + 1) * bs]
            sess.run(train_op, feed_dict={x: batch_xs, y: batch_ys})
        alltime += time.time() - st
    # Evaluate on the held-out set and publish the accuracy plus the tried
    # hyperparameters so the coordinator can pick the best configuration.
    result = sess.run(outputs, feed_dict={x: test_x})
    acc = (np.argmax(result[-1], axis=1) == test_y).mean()
    s3.put_object(Bucket=AWS_S3_bucket, Body=str(acc),
                  Key='timestamp/timestamp_trainresult_' + str(pid) + '_' + str(pos))
    s3.put_object(Bucket=AWS_S3_bucket,
                  Body=str([event['layers'], event['lr'], event['batchnumber'],
                            event['maxiter']]),
                  Key='timestamp/timestamp_traininfo_' + str(pid) + '_' + str(pos))
    if pos == event['nworker'] - 1:
        # The last worker advances the pipeline to the next stage.
        event['state'] = 2
        cevent = copy.deepcopy(event)
        invoke_lambda(client, 'nntffunc_1', cevent)
    s3.put_object(Bucket=AWS_S3_bucket, Body=data,
                  Key='timestamp/timestamp_train_' + str(pid) + '_' + str(pos)
                      + '_' + str(event['round']))
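
# ---------------------------------------------------------------------------
# For illustration, an event that could drive the MNIST train() above. Only
# the key names are taken from the code; the concrete values are assumptions:
_example_event = {
    'pid': 0,              # experiment id, used in S3 key names
    'pos': 0,              # this worker's index
    'nworker': 4,          # total workers; the last one advances the state machine
    'ns': 60000,           # number of training samples
    'layers': [784, 100, 10],  # MLP layer sizes passed to model()
    'lr': 0.1,             # learning rate
    'maxiter': 10,         # training iterations
    'batchnumber': 20,     # mini-batches per iteration
    'rounditer': 15,       # iterations per Lambda round
    'roundtime': 250,      # seconds per round; waittime defaults to 2/3 of this
}
# ---------------------------------------------------------------------------
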
def train(event):
    st = datetime.datetime.now()
    stt = time.time()
    tcount = time.time()
    if 'roundtime' not in event:
        event['roundtime'] = 250
    tend = event['roundtime']
    ns = event['ns']
    pos = event['pos']
    mlayers = event['mlayers']
    layers = event['layers']
    maxiter = event['maxiter']
    nowiter = event['nowiter']
    funcname = event['funcname']
    if 'batchnumber' not in event:
        event['batchnumber'] = 1
    bn = event['batchnumber']
    pid = event['pid']
    if 'testtime' not in event:
        event['testtime'] = 10
    if 'waittime' not in event:
        event['waittime'] = tend * 2 / 3
    waittime = event['waittime']
    timer = s3func.timer([waittime, tend])
    # waittime=tend/4
    if 'round' not in event:
        event['round'] = 0
    else:
        event['round'] += 1
    rounditer = event['rounditer']
    s3 = boto3.client('s3')
    s32 = boto3.resource('s3')
    client = boto3.client('lambda', region_name=AWS_region)
    if nowiter == 0 and pos == 0:
        s3.put_object(Bucket=AWS_S3_bucket, Body=str(stt),
                      Key='timestamp/timestamp_train_start_' + str(pid))
    response = client.get_function(FunctionName=funcname)
    filerecord = s3func.s3_read_file_v2(s3, AWS_S3_bucket, 'results/result', 0, 1, 0)
    if filerecord == 0:
        filerecord = ''
    filerecord += ('=====' + ' merge: ' + str(mlayers) + ' samples: ' + str(ns)
                   + ' memory: ' + str(event['memory']) + ' testtime left :'
                   + str(event['testtime']) + ' starttime: ' + str(st) + '\n')
    filerecord += '=====' + str(stt) + '\n'
    data = ('train round ' + str(event['round']) + ', round time '
            + str(event['roundtime']) + ', start at ' + str(st)
            + ' ##' + str(time.time()) + '\n')
    data += ('info: pos ' + str(pos) + ', memory '
             + str(response['Configuration']['MemorySize']) + ', mlayers '
             + str(mlayers) + ', ns ' + str(ns) + ', layers ' + str(layers) + '\n')
    print('=' * 5, 'train node', pos, '=' * 5, 'train phase start')
    # Shard the ns samples evenly across the mlayers[-1] workers; the first
    # (ns % workers) workers take one extra sample.
    split = 10000
    base = int(ns / mlayers[-1])
    remin = ns % mlayers[-1]
    sn = 0
    for n in range(pos):
        sn += base
        if remin:
            sn += 1
            remin -= 1
    en = sn + base
    if remin:
        en += 1
    print('=' * 5, 'train node', pos, '=' * 5, 'read samples from', sn, 'to', en)
    training_inputs = []
    training_outputs = []
    sfile = int(sn / split)
    efile = int((en - 1) / split)
    print('=' * 5, 'train node', pos, '=' * 5, 'read files from', sfile, 'to', efile)
    data += ('start up time: ' + str(time.time() - stt) + ' ##' + str(time.time())
             + ' ##' + str(stt) + '--' + str(time.time()) + '\n')
    s3.put_object(Bucket=AWS_S3_bucket, Body=data,
                  Key='timestamp/timestamp_train_' + str(pid) + '_' + str(pos)
                      + '_' + str(event['round']) + '.tsp')
    # ========================================= read ========================================
    stt = time.time()
    if os.path.exists('/tmp/samples_save'):
        # Warm container: the shard is still cached in /tmp.
        print('=' * 5, 'train node', pos, '=' * 5, 'found samples!!!')
        with open('/tmp/samples_save', 'rb') as f:
            temp = pickle.load(f)
        # os.remove('/tmp/samples_save_'+str(pos))
        training_inputs = temp[0]
        training_outputs = temp[1]
        data += ('found samples!!! time: ' + str(time.time() - stt) + ' ##'
                 + str(time.time()) + ' ##' + str(stt) + '--' + str(time.time()) + '\n')
    else:
        print('=' * 5, 'train node', pos, '=' * 5, 'samples not found, downloading')
        for now in range(sfile, efile + 1):
            print('downloading', now, 'from range', sfile, efile + 1)
            # flag=s3func.s3_download_file_v2(s3,s32,AWS_S3_bucket,'data/samples_'+str(now),'/tmp/samples',0,1,0)
            flag = s3func.s3_download_file(s3, AWS_S3_bucket, 'data/samples_' + str(now),
                                           '/tmp/samples', 0, 1, 0)
            if flag == 0:
                print('=' * 5, 'train node', pos, '=' * 5,
                      'ERROR!!!: fail to read sample file:', now)
            with open('/tmp/samples', 'rb') as f:
                temp = pickle.load(f)
            sread = max([split * now, sn]) - split * now
            eread = min([split * (now + 1), en]) - split * now
            if training_inputs == []:
                training_inputs = temp[0][sread:eread]
                training_outputs = temp[1][sread:eread]
            else:
                training_inputs = np.append(training_inputs, temp[0][sread:eread], axis=0)
                training_outputs = np.append(training_outputs, temp[1][sread:eread], axis=0)
            # training_inputs.extend(temp[0][sread:eread])
            # training_outputs.extend(temp[1][sread:eread])
        if os.path.exists('/tmp/samples'):
            os.remove('/tmp/samples')
        with open('/tmp/samples_save', 'wb') as f:
            pickle.dump([training_inputs, training_outputs], f)
        data += ('read from ' + str(sfile) + ' to ' + str(efile) + ' time: '
                 + str(time.time() - stt) + ' ##' + str(time.time()) + ' ##'
                 + str(stt) + '--' + str(time.time()) + '\n')
    data += 'samples length: ' + str(len(training_inputs)) + '\n'
    if nowiter == 0 and pos == 0:
        s3.put_object(Bucket=AWS_S3_bucket, Body=str(time.time()),
                      Key='timestamp/timestamp_train_start_2_' + str(pid))
    # ========================================= read ========================================
    # ========================================= initialize ==================================
    stt = time.time()
    weights = [[] for i in range(len(layers))]
    biases = [[] for i in range(len(layers))]
    outputs = [[] for i in range(len(layers))]
    for l in range(1, len(layers)):
        weights[l] = tf.Variable(tf.random_normal([layers[l - 1], layers[l]]))
        biases[l] = tf.Variable(tf.random_normal([1, layers[l]]))
    x = tf.placeholder(tf.float32, [None, layers[0]])
    y = tf.placeholder(tf.float32, [None, layers[-1]])
    outputs[0] = x
    for l in range(1, len(layers)):
        outputs[l] = tf.nn.tanh(tf.add(tf.matmul(outputs[l - 1], weights[l]), biases[l]))
    cost = 0.5 * (y - outputs[-1]) ** 2
    # train=tf.train.AdamOptimizer(0.1).minimize(cost)
    train = tf.train.GradientDescentOptimizer(1e-1).minimize(cost)
    init = tf.global_variables_initializer()
    session = tf.Session()
    session.run(init)
    data += ('training initialize time: ' + str(time.time() - stt) + ' ##'
             + str(stt) + '--' + str(time.time()) + '\n')
    # ========================================= initialize ==================================
    # ========================================= LOOP start ==================================
    avgitertime = 0.0
    avgitertimereal = 0.0
    minitertimereal = 100000.0
    timerecord = []
    smt = 0.0
    while nowiter < maxiter:
        itertime = [0.0]
        stiter = time.time()
        # flag=s3func.s3_download_file_v2(s3,s32,AWS_S3_bucket,'flag/work_'+str(pid),'/tmp/work',0,1,0)
        flag = s3func.s3_download_file(s3, AWS_S3_bucket, 'flag/work_' + str(pid),
                                       '/tmp/work', 0, 1, 0)
        if flag == 0:
            print('=' * 5, 'train node', pos, '=' * 5, 'Abandon!!!! pid:', pid)
            return
        stt = time.time()
        print('+' * 5, 'train node', pos, 'pid', pid, '+' * 5,
              'now start iteration', nowiter)
        print('=' * 5, 'train node', pos, '=' * 5, 'now start iteration', nowiter)
        stt2 = time.time()
        # flag=s3func.s3_download_file_v2(s3,s32,AWS_S3_bucket,'data/model_'+str(pid)+'_'+str(nowiter),'/tmp/model',max(waittime,tend-time.time()+tcount),0,0)
        # flag=s3func.s3_download_file(s3,AWS_S3_bucket,'data/model_'+str(pid)+'_'+str(nowiter),'/tmp/model',max(waittime,tend-time.time()+tcount),0,0)
        flag = s3func.s3_download_file_timer(
            s3, AWS_S3_bucket, 'data/model_' + str(pid) + '_' + str(nowiter),
            '/tmp/model', timer, 0, 0)
        itertime[0] += time.time() - stt2
        data += ('training ' + str(nowiter) + ' model waiting time: '
                 + str(time.time() - stt2) + ' ##' + str(stt2) + '--'
                 + str(time.time()) + '\n')
        # print 'flag',flag
        if flag == 0:
            if timer.query()[1] > waittime / 4:
                # Ran out of round budget while waiting: checkpoint and exit.
                print('++++++++lambda train', pos, 'at iteration', nowiter,
                      'end at', datetime.datetime.now())
                """
                with open('/tmp/samples_save_'+str(pos), 'w') as f:
                    pickle.dump([training_inputs,training_outputs], f)
                """
                s3.put_object(Bucket=AWS_S3_bucket, Body=data,
                              Key='timestamp/timestamp_train_' + str(pid) + '_'
                                  + str(pos) + '_' + str(event['round']) + '.tsp')
                event['nowiter'] = nowiter
                # invoke_lambda(client,funcname,event)
                return
            else:
                print('=' * 5, 'train node', pos, '=' * 5,
                      'ERROR!!!: fail to read model', nowiter)
                s3.put_object(Bucket=AWS_S3_bucket,
                              Body='fail to read model ' + str(nowiter),
                              Key='error/error_train_' + str(pid) + '_' + str(pos))
                return
        if nowiter >= (event['round'] + 1) * rounditer:
            # Round budget reached: persist state and re-invoke self for the next round.
            print('++++++++lambda train', pos, 'at iteration', nowiter,
                  'end at', datetime.datetime.now())
            if [0, 0] in event['mergepos']:
                s3.put_object(Bucket=AWS_S3_bucket, Body=filerecord, Key='results/result')
            """
            with open('/tmp/samples_save_'+str(pos), 'w') as f:
                pickle.dump([training_inputs,training_outputs], f)
            """
            s3.put_object(Bucket=AWS_S3_bucket, Body=data,
                          Key='timestamp/timestamp_train_' + str(pid) + '_' + str(pos)
                              + '_' + str(event['round']) + '.tsp')
            event['nowiter'] = nowiter
            invoke_lambda(client, funcname, event)
            return
        stt2 = time.time()
        with open('/tmp/model', 'rb') as f:
            temp = pickle.load(f)
        if temp[0] == []:
            print('=' * 5, 'train node', pos, '=' * 5,
                  'ERROR!!!: model format wrong', nowiter)
            s3.put_object(Bucket=AWS_S3_bucket,
                          Body='model format wrong ' + str(nowiter),
                          Key='error/error_train_' + str(pid) + '_' + str(pos))
            return
        for l in range(1, len(layers)):
            session.run(tf.assign(weights[l], temp[0][l]))
            session.run(tf.assign(biases[l], temp[1][l]))
        itertime[0] += time.time() - stt2
        data += ('training ' + str(nowiter) + ' download model time: '
                 + str(time.time() - stt) + ' ##' + str(stt) + '--'
                 + str(time.time()) + '\n')
        stt = time.time()
        print('=' * 5, 'train node', pos, '=' * 5, 'train start')
        bs = len(training_inputs) // bn
        for b in range(bn):
            session.run(train, feed_dict={x: training_inputs[b * bs:(b + 1) * bs, :],
                                          y: training_outputs[b * bs:(b + 1) * bs, :]})
        # ========================= train
        itertime[0] += time.time() - stt
        data += ('training ' + str(nowiter) + ' train time: ' + str(time.time() - stt)
                 + ' ##' + str(stt) + '--' + str(time.time()) + '\n')
        stt = time.time()
        rew = session.run(weights)
        reb = session.run(biases)
        model = [rew, reb]
        with open('/tmp/model', 'wb') as f:
            pickle.dump(model, f)
        print('=' * 5, 'train node', pos, '=' * 5, 'write result as layer',
              len(mlayers) - 1, 'node', pos)
        s32.Bucket(AWS_S3_bucket).upload_file(
            '/tmp/model',
            'data/model_' + str(pid) + '_' + str(len(mlayers) - 1) + '_' + str(pos))
        # s3.put_object(Bucket=AWS_S3_bucket,Body='true', Key='flag/flag_'+str(pid)+'_'+str(nowiter)+'_'+str(len(mlayers)-1)+'_'+str(pos))
        itertime[0] += time.time() - stt
        data += ('training ' + str(nowiter) + ' model write time: '
                 + str(time.time() - stt) + ' ##' + str(stt) + '--'
                 + str(time.time()) + '\n')
        if len(event['mergepos']) > 0:
            mergepos = copy.deepcopy(event['mergepos'])
            thismergepos = mergepos[0]
            del mergepos[0]
            # tempd=merge(mlayers,thismergepos,mergepos,nowiter,timer,max(waittime,tend-time.time()+tcount),itertime,pid)
            smt = time.time()
            tempd = merge(mlayers, thismergepos, mergepos, nowiter, timer, waittime,
                          itertime, pid)
            smt = time.time() - smt
            if tempd == 0:
                return
            elif tempd == 1:
                print('++++++++lambda train', pos, 'at iteration', nowiter,
                      'end at', datetime.datetime.now())
                """
                with open('/tmp/samples_save_'+str(pos), 'w') as f:
                    pickle.dump([training_inputs,training_outputs], f)
                """
                s3.put_object(Bucket=AWS_S3_bucket, Body=data,
                              Key='timestamp/timestamp_train_' + str(pid) + '_'
                                  + str(pos) + '_' + str(event['round']) + '.tsp')
                event['nowiter'] = nowiter
                # invoke_lambda(client,funcname,event)
                return
            else:
                data += tempd
        data += ('training ' + str(nowiter) + ' valid iteration time: '
                 + str(itertime[0]) + '\n')
        print('-' * 5, 'train node', pos, '-' * 5, 'now end iteration', nowiter)
        avgitertime += itertime[0]
        """
        if nowiter>=min(10,maxiter-1) and [0,0] in event['mergepos']:
            s3.put_object(Bucket=AWS_S3_bucket,Body=str(avgitertime/(nowiter+1)), Key='timestamp/timestamp_iteration_'+str(pid)+'.tsp')
        """
        thisitertime = time.time() - stiter
        filerecord += str(time.time()) + '\n'
        # filerecord+=str(time.time()-stiter)+'\n'
        if thisitertime < minitertimereal:
            minitertimereal = thisitertime
        # insert_sort(time.time()-stiter,timerecord)
        if nowiter >= 2:
            # Skip the first two (cold-start) iterations when recording times.
            avgitertimereal += time.time() - stiter
            insert_sort(time.time() - stiter, timerecord)
        # filerecord+=str(smt)+'\n'
        if nowiter >= min(10, maxiter - 1) and [0, 0] in event['mergepos']:
            s3.put_object(Bucket=AWS_S3_bucket, Body=str(timerecord),
                          Key='timestamp/timestamp_iteration_each_' + str(pid) + '.tsp')
            # s3.put_object(Bucket=AWS_S3_bucket,Body=str(avg_no_abnormal(timerecord,2)), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
            # s3.put_object(Bucket=AWS_S3_bucket,Body=str(float(np.mean(timerecord))), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
            s3.put_object(Bucket=AWS_S3_bucket,
                          Body=str(float(np.mean(extract_valid(timerecord)))),
                          Key='timestamp/timestamp_iteration_real_' + str(pid) + '.tsp')
            # s3.put_object(Bucket=AWS_S3_bucket,Body=str(avgitertimereal/(nowiter-2+1)), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
            # s3.put_object(Bucket=AWS_S3_bucket,Body=str(minitertimereal), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
        nowiter += 1
    if [0, 0] in event['mergepos']:
        s3.put_object(Bucket=AWS_S3_bucket, Body=filerecord, Key='results/result')
    """
    event['testtime']-=1
    if event['testtime']>0:
        inputs={'state':0,'mod':0,'batchnumber':20,'slayers':[],'mlayers':[1,100],'layers':[20,100,100,100,100,100,1],'pos':[0,0],'ns':1000000,'maxiter':10,'nowiter':0,'roundtime':250,'rounditer':15}
        inputs['testtime']=event['testtime']
        invoke_lambda(client,'testfunc',inputs)
        time.sleep(10)
    """
    if [0, 0] in event['mergepos'] and nowiter >= maxiter:
        s3.put_object(Bucket=AWS_S3_bucket, Body=str(time.time()),
                      Key='timestamp/timestamp_train_end_' + str(pid))
    s3.put_object(Bucket=AWS_S3_bucket, Body=data,
                  Key='timestamp/timestamp_train_' + str(pid) + '_' + str(pos)
                      + '_' + str(event['round']) + '.tsp')
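
# ---------------------------------------------------------------------------
# insert_sort() and extract_valid() are helpers defined elsewhere in the repo.
# From their use above, insert_sort keeps timerecord sorted as samples arrive,
# and extract_valid filters the record before the mean is taken. Minimal
# sketches of compatible implementations follow; the trimming rule (dropping
# the slowest quartile as stragglers) is an assumption, not the repo's actual
# policy.
import bisect

def _insert_sort_sketch(value, record):
    # insert value into the already-sorted list in place
    bisect.insort(record, value)

def _extract_valid_sketch(record):
    # keep the fastest 75% of iteration times, discarding straggler rounds
    keep = max(1, int(len(record) * 0.75))
    return record[:keep]
# ---------------------------------------------------------------------------
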
def monitor(event):
    s3 = boto3.client('s3')
    client = boto3.client('lambda', region_name=AWS_region)
    data = ''
    print('monitor begin')
    # ==================================== load info ====================================
    if 'process' not in event:
        print('ERROR!!!!! process key not found')
        s3.put_object(Bucket=AWS_S3_bucket, Body='process key not found',
                      Key='error/error_monitor.tsp')
        return
    if 'round' not in event:
        event['round'] = 0
    process = event['process']
    tend = 250
    cratio = event['cratio']      # cost ratio
    pcratio = event['pcratio']    # performance/cost ratio
    rangen = event['rangen']
    rangemem = event['rangemem']
    maxchange = event['maxchange']
    changestep = event['changestep']
    lastpid = []
    data += ('basic info: ' + 'pcratio: ' + str(pcratio) + ' rangen: ' + str(rangen)
             + ' maxchange: ' + str(maxchange) + ' changestep: ' + str(changestep) + '\n')
    if 'avgrecord' not in event:
        event['avgrecord'] = [[0.0, 0] for i in range(20)]
    if 'lastslop' not in event:
        event['lastslop'] = []
    if 'regmod' not in event:
        event['regmod'] = 0
    if 'regcount' not in event:
        event['regcount'] = [0, 0, 0]
    diffrange = event['diffrange']
    tcount = time.time()
    data += 'UTC:' + str(datetime.datetime.now()) + '. ' + str(tcount) + '\n'
    s3.put_object(Bucket=AWS_S3_bucket, Body=data,
                  Key='timestamp/timestamp_monitor_' + str(event['round']) + '.tsp')
    if event['round'] > 0:
        temp = s3func.s3_read_file_v2(s3, AWS_S3_bucket,
                                      'timestamp/timestamp_monitor_'
                                      + str(event['round'] - 1) + '.tsp', 0, 1, 0)
        if temp != 0:
            data += temp
            data += '=============================='
    best = process[0]
    # ==================================== load info ====================================
    while time.time() - tcount < tend:
        # Load the control file "work_monitor". If it has been deleted
        # manually, the monitor stops.
        flag = s3func.s3_download_file(s3, AWS_S3_bucket, 'flag/work_monitor',
                                       '/tmp/work', 0, 1, 0)
        if flag == 0:
            print('monitor terminated!!!!!!')
            return
        # Alternate between tuning the worker count (regmod 0) and the memory
        # size (regmod 1); stop after regtimes[2] full alternations.
        if event['regmod'] == 0:
            if event['regcount'][0] >= event['regtimes'][0]:
                event['regcount'][0] = 0
                event['regmod'] = 1
        if event['regmod'] == 1:
            if event['regcount'][1] >= event['regtimes'][1]:
                event['regcount'][1] = 0
                event['regmod'] = 0
                event['regcount'][2] += 1
                if event['regcount'][2] >= event['regtimes'][2]:
                    print('final result:', best)
                    data += 'final result: ' + str(best) + '\n'
                    print('reached the maximum regression constraint, terminated!!!!!')
                    data += 'reached the maximum regression constraint, terminated!!!!!\n'
                    s3.put_object(Bucket=AWS_S3_bucket, Body=data,
                                  Key='timestamp/timestamp_monitor_'
                                      + str(event['round']) + '.tsp')
                    return
        times = [-1, -1]
        print('regression times:', event['regcount'])
        data += 'regression times: ' + str(event['regcount']) + '\n'
        # +++++++++++++++++++++ first candidate configuration +++++++++++++++++++++
        temp = s3func.s3_read_file_v2(s3, AWS_S3_bucket,
                                      'timestamp/timestamp_startstructure_'
                                      + str(process[0][0]) + '.tsp', 0, 1, 0)
        if temp == 0:
            cevent = copy.deepcopy(event)
            cevent['pid'] = process[0][0]
            cevent['mlayers'] = [1, process[0][1]]
            cevent['state'] = 2
            cevent['funcname'] = process[0][3]
            cevent['memory'] = process[0][2]
            cevent['round'] = -1
            if lastpid != []:
                cevent['modelname'] = 'data/model_' + str(lastpid) + '_new'
            else:
                cevent['modelname'] = 'data/model'
            print('invoke', process[0][3])
            invoke_lambda(client, process[0][3], cevent)
            temp = s3func.s3_read_file_v2(s3, AWS_S3_bucket,
                                          'timestamp/timestamp_startstructure_'
                                          + str(process[0][0]) + '.tsp',
                                          tend - time.time() + tcount, 0, 0)
        data += ('waiting for the first iteration data: '
                 + 'timestamp/timestamp_iteration_each_' + str(process[0][0])
                 + '.tsp' + '\n')
        s3.put_object(Bucket=AWS_S3_bucket, Body=data,
                      Key='timestamp/timestamp_monitor_' + str(event['round']) + '.tsp')
        temp = 0
        while tend - time.time() + tcount > 0 and temp == 0:
            flag = s3func.s3_download_file(s3, AWS_S3_bucket, 'flag/work_monitor',
                                           '/tmp/work', 0, 1, 0)
            if flag == 0:
                print('monitor terminated!!!!!!')
                return
            temp = s3func.s3_read_file_v2(s3, AWS_S3_bucket,
                                          'timestamp/timestamp_iteration_each_'
                                          + str(process[0][0]) + '.tsp', 0, 1, 0)
            time.sleep(5)
        if temp == 0:
            print('ERROR!!!!! cannot read timestamp', process[0])
            s3.put_object(Bucket=AWS_S3_bucket,
                          Body='cannot read timestamp: ' + str(process[0]),
                          Key='error/error_monitor.tsp')
            break
            # return
        times[0] = json.loads(temp)
        s3func.s3_delete_file(s3, AWS_S3_bucket, 'flag/work_' + str(process[0][0]))
        s3func.s3_clear_bucket(AWS_S3_bucket, 'timestamp/timestamp_train_' + str(process[0][0]))
        lastpid = process[0][0]
        data += 'the time cost is: ' + str(times[0]) + '\n'
        # --------------------- second candidate configuration ---------------------
        temp = s3func.s3_read_file_v2(s3, AWS_S3_bucket,
                                      'timestamp/timestamp_startstructure_'
                                      + str(process[1][0]) + '.tsp', 0, 1, 0)
        if temp == 0:
            cevent = copy.deepcopy(event)
            cevent['pid'] = process[1][0]
            cevent['mlayers'] = [1, process[1][1]]
            cevent['state'] = 2
            cevent['funcname'] = process[1][3]
            cevent['memory'] = process[1][2]
            cevent['round'] = -1
            if lastpid != []:
                cevent['modelname'] = 'data/model_' + str(lastpid) + '_new'
            else:
                cevent['modelname'] = 'data/model'
            print('invoke', process[1][3])
            invoke_lambda(client, process[1][3], cevent)
            temp = s3func.s3_read_file_v2(s3, AWS_S3_bucket,
                                          'timestamp/timestamp_startstructure_'
                                          + str(process[1][0]) + '.tsp',
                                          tend - time.time() + tcount, 0, 0)
        data += ('waiting for the second iteration data: '
                 + 'timestamp/timestamp_iteration_each_' + str(process[1][0])
                 + '.tsp' + '\n')
        s3.put_object(Bucket=AWS_S3_bucket, Body=data,
                      Key='timestamp/timestamp_monitor_' + str(event['round']) + '.tsp')
        temp = 0
        while tend - time.time() + tcount > 0 and temp == 0:
            flag = s3func.s3_download_file(s3, AWS_S3_bucket, 'flag/work_monitor',
                                           '/tmp/work', 0, 1, 0)
            if flag == 0:
                print('monitor terminated!!!!!!')
                return
            temp = s3func.s3_read_file_v2(s3, AWS_S3_bucket,
                                          'timestamp/timestamp_iteration_each_'
                                          + str(process[1][0]) + '.tsp', 0, 1, 0)
            time.sleep(5)
        if temp == 0:
            print('ERROR!!!!! cannot read timestamp', process[1])
            s3.put_object(Bucket=AWS_S3_bucket,
                          Body='cannot read timestamp: ' + str(process[1]),
                          Key='error/error_monitor.tsp')
            break
            # return
        times[1] = json.loads(temp)
        s3func.s3_delete_file(s3, AWS_S3_bucket, 'flag/work_' + str(process[1][0]))
        s3func.s3_clear_bucket(AWS_S3_bucket, 'timestamp/timestamp_train_' + str(process[1][0]))
        lastpid = process[1][0]
        data += 'the time cost is: ' + str(times[1]) + '\n'
        # +++++++++++++++ compare and propose the next configuration +++++++++++++++
        if event['regmod'] == 0:
            # Tuning the worker count: the target F is the median iteration time.
            times[0] = float(np.median(times[0]))
            times[1] = float(np.median(times[1]))
            if times[1] < times[0]:
                process.reverse()
                times.reverse()
            best = process[0]
            nextfunc = process[1][3]
            F = times[0] - times[1]
            data += (str(process[0]) + ': ' + str(times[0]) + '. '
                     + str(process[1]) + ': ' + str(times[1]) + '. \n')
            data += 'delta target value is ' + str(F) + '\n'
            if process[0][1] == process[1][1]:
                nextn = process[0][1]
            else:
                # Secant step on the worker count, with a momentum-like term on
                # the change of slope.
                if event['lastslop'] == []:
                    change = -pcratio[0] * F / (process[0][1] - process[1][1])
                    event['lastslop'] = F / (process[0][1] - process[1][1])
                else:
                    change = (-pcratio[0] * F / (process[0][1] - process[1][1])
                              + pcratio[1] * (F / (process[0][1] - process[1][1])
                                              - event['lastslop']))
                    event['lastslop'] = F / (process[0][1] - process[1][1])
                change = ceil_step(change, changestep[0])
                if change > maxchange[0]:
                    change = maxchange[0]
                elif change < -maxchange[0]:
                    change = -maxchange[0]
                nextn = int(process[0][1] + change)
            nextmem = process[0][2]
        else:
            # Tuning the memory size: the target is a cost metric
            # (or performance-per-cost, depending on OPTIMIZATION).
            metric = [0.0, 0.0]
            if OPTIMIZATION == 0:
                metric[0] = float(np.median([(process[0][2] * ts) for ts in times[0]]))
                metric[1] = float(np.median([(process[1][2] * ts) for ts in times[1]]))
            else:
                metric[0] = float(np.median([1.0 / (process[0][2] * ts ** 2) for ts in times[0]]))
                metric[1] = float(np.median([1.0 / (process[1][2] * ts ** 2) for ts in times[1]]))
            if metric[1] < metric[0]:
                # if process[1][2]*times[1]<process[0][2]*times[0]:
                process.reverse()
                times.reverse()
                metric.reverse()
            best = process[0]
            nextfunc = process[1][3]
            F = metric[0] - metric[1]
            data += (str(process[0]) + ': ' + str(metric[0]) + '. '
                     + str(process[1]) + ': ' + str(metric[1]) + '. \n')
            data += 'delta target value is ' + str(F) + '\n'
            nextn = process[0][1]
            if process[0][2] == process[1][2]:
                nextmem = process[0][2]
            else:
                # change=-pcratio[2]*F/(process[0][2]-process[1][2])
                if event['lastslop'] == []:
                    change = -pcratio[2] * F / (process[0][2] - process[1][2])
                    event['lastslop'] = F / (process[0][2] - process[1][2])
                else:
                    change = (-pcratio[2] * F / (process[0][2] - process[1][2])
                              + pcratio[3] * (F / (process[0][2] - process[1][2])
                                              - event['lastslop']))
                    event['lastslop'] = F / (process[0][2] - process[1][2])
                # change=-change # for 1/mt2
                change = ceil_step(change, changestep[1])
                if change > maxchange[1]:
                    change = maxchange[1]
                elif change < -maxchange[1]:
                    change = -maxchange[1]
                nextmem = int(process[0][2] + change)
        nextpid = max(process[0][0], process[1][0]) + 1
        if nextn > rangen[1]:
            nextn = rangen[1]
        elif nextn < rangen[0]:
            nextn = rangen[0]
        if nextmem > rangemem[1]:
            nextmem = rangemem[1]
        elif nextmem < rangemem[0]:
            nextmem = rangemem[0]
        # s3func.s3_clear_bucket(AWS_S3_bucket,'data/model_'+str(process[1][0]))
        process[1] = process[0]
        process[0] = [nextpid, nextn, nextmem, nextfunc]
        # process[1][0]=max(process[0][0],process[1][0])+1
        event['process'] = process
        response = client.update_function_configuration(
            FunctionName=nextfunc,
            MemorySize=nextmem,
        )
        time.sleep(5)
        print('---------', process)
        if abs(process[0][1] - process[1][1]) < 1 and process[0][2] == process[1][2]:
            # Converged: the two candidate configurations are identical.
            print('terminated!!!')
            print('final result:', best)
            data += 'final result: ' + str(best) + '\n'
            # data+='the final result is '+str(process[0])
            s3.put_object(Bucket=AWS_S3_bucket, Body=data,
                          Key='timestamp/timestamp_monitor_' + str(event['round']) + '.tsp')
            return
        data += '---------' + str(process) + '\n'
        if event['regmod'] == 0:
            event['regcount'][0] += 1
        else:
            event['regcount'][1] += 1
        s3.put_object(Bucket=AWS_S3_bucket, Body=data,
                      Key='timestamp/timestamp_monitor_' + str(event['round']) + '.tsp')
    # Out of time budget for this invocation: chain into a fresh Lambda round.
    event['round'] += 1
    invoke_lambda(client, event['funcname'], event)
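
# ---------------------------------------------------------------------------
# ceil_step() quantizes the proposed change onto the tuning grid before it is
# clamped to [-maxchange, maxchange]. A minimal sketch, assuming it rounds the
# magnitude up to the next multiple of `step` while keeping the sign (the
# actual helper is defined elsewhere in the repo):
import math

def _ceil_step_sketch(change, step):
    if change == 0:
        return 0
    sign = 1 if change > 0 else -1
    return sign * step * int(math.ceil(abs(change) / float(step)))

# The monitor's update itself is a secant step with a momentum-like term:
#   slope  = (F(c0) - F(c1)) / (c0 - c1)
#   change = -pcratio[0] * slope + pcratio[1] * (slope - lastslope)
# where c is the worker count (or memory size, with pcratio[2:4]) and F is
# the median iteration time (or the cost metric when tuning memory).
# ---------------------------------------------------------------------------
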