def _merge_node_hosts(mlayers):
    # Map every merge node to the worker that executes it.
    #
    # mlayers[l] is the number of nodes in layer l; the last layer holds the
    # workers. Returns a list indexed by merge layer (0 first) whose entry
    # [layer][node] is the index, within the last layer, of the worker that
    # hosts that merge node.
    hosts = []
    # Walk from the deepest merge layer upwards so each layer can be resolved
    # through the layer directly below it.
    for layer in reversed(range(len(mlayers) - 1)):
        base, extra = divmod(mlayers[layer + 1], mlayers[layer])
        last_children = []
        end = 0
        for _ in range(mlayers[layer]):
            # The first `extra` nodes take one additional child so that every
            # child of the lower layer is assigned.
            end += base
            if extra > 0:
                end += 1
                extra -= 1
            last_children.append(end - 1)
        if hosts:
            # Children of the upper layers are merge nodes themselves;
            # translate them into the workers hosting them.
            last_children = [hosts[-1][child] for child in last_children]
        hosts.append(last_children)
    hosts.reverse()
    return hosts


def start_structure(event):
    # Publish the initial model and launch one Lambda invocation per worker.
    #
    # event keys read: 'pid', 'mlayers', 'nowiter', 'funcname', optional
    # 'modelname' and 'fixedmlayers'. The keys 'mlayers', 'pos', 'state' and
    # 'mergepos' are (re)written before each invocation.
    # Returns None. When the requested model cannot be downloaded an error
    # object is stored under 'error/error_start_<pid>' and no worker is started.
    pid = event['pid']
    mlayers = event['mlayers']
    nowiter = event['nowiter']
    s3 = boto3.client('s3')
    s3.put_object(Bucket=AWS_S3_bucket, Body='true',
                  Key='flag/work_' + str(pid))
    data = ''
    print('structure start: %s' % (pid,))

    if 'modelname' in event:
        data += 'modelname found: ' + event['modelname'] + '\n'
        flag = s3func.s3_download_file(0, AWS_S3_bucket, event['modelname'],
                                       '/tmp/model', 0, 1, 0)
        if flag == 0:
            print('ERROR!!! %s not found' % (event['modelname'],))
            s3.put_object(Bucket=AWS_S3_bucket,
                          Body=event['modelname'] + ' not found',
                          Key='error/error_start_' + str(pid))
            # Without the model there is nothing valid to publish; going on
            # would upload whatever file is left in /tmp/model (or fail).
            return
    else:
        data += 'modelname not found\n'
        s3func.s3_download_file(0, AWS_S3_bucket, 'data/modelcifar',
                                '/tmp/model', 0, 1, 0)

    s32 = boto3.resource('s3')
    s32.Bucket(AWS_S3_bucket).upload_file(
        '/tmp/model', 'data/modelcifar_' + str(pid) + '_' + str(nowiter))
    client = boto3.client('lambda', region_name=AWS_region)

    if event.get('fixedmlayers', 0) == 0:
        # The layout is not pinned by the caller: derive it from the number of
        # workers.
        mlayers = estimate_merging(mlayers[-1])
        if len(mlayers) > 4:
            mlayers = estimate_best_layers(mlayers[-1], 2)
        event['mlayers'] = mlayers
    data += 'mlayers is: ' + str(mlayers) + '\n'

    merge_hosts = _merge_node_hosts(mlayers)
    for worker in range(mlayers[-1]):
        mergepos = [[layer, hosts.index(worker)]
                    for layer, hosts in enumerate(merge_hosts)
                    if worker in hosts]
        # Deepest layer first.
        mergepos.reverse()
        event['pos'] = worker
        event['state'] = 1
        event['mergepos'] = mergepos
        data += ('work node ' + str(event['pos']) + ', mergepos: ' +
                 str(event['mergepos']) + '\n')
        invoke_lambda(client, event['funcname'], event)

    s3.put_object(Bucket=AWS_S3_bucket, Body=data,
                  Key='timestamp/timestamp_startstructure_' + str(pid) + '.tsp')
def monitor(event): st = datetime.datetime.now() stt = time.time() nworker = event['nworker'] pid = event['pid'] if 'roundtime' not in list(event.keys()): event['roundtime'] = 250 if 'waittime' not in list(event.keys()): event['waittime'] = event['roundtime'] * 2 / 3 timer = s3func.timer([event['waittime'], event['roundtime']]) s3 = boto3.client('s3') s32 = boto3.resource('s3') flag = s3func.s3_download_file(s3, AWS_S3_bucket, 'flag/work_pt', '/tmp/work', 0, 1, 0) if flag == 0: print('monitor terminated!!!!!!') return s3.put_object(Bucket=AWS_S3_bucket, Body=str(st), Key='timestamp/timestamp_monitor') finished = [0 for i in range(nworker)] timer.local_start(0) bresult = 0.0 bpos = 0 while 1: if sum(finished) == nworker: break for now in range(nworker): if finished[now] == 0: tresult = timer.query() if tresult[0] == 1: return 0 flag = s3func.s3_download_file( s3, AWS_S3_bucket, 'timestamp/timestamp_trainresult_' + str(pid) + '_' + str(now), '/tmp/result', 0, 1, 0) if flag == 1: finished[now] = 1 with open('/tmp/result', 'r') as f: temp = f.read() r = float(temp) if r > bresult: bresult = r bpos = now s3.put_object(Bucket=AWS_S3_bucket, Body=str([bresult, bpos]), Key='timestamp/timestamp_final_result') et = time.time() st = s3func.s3_read_file_v2(s3, AWS_S3_bucket, 'timestamp/timestamp_startup.tsp', 0, 1, 0) filerecord = s3func.s3_read_file_v2(s3, AWS_S3_bucket, 'results/timecost', 0, 1, 0) if filerecord == 0: filerecord = '' filerecord += str(et - float(st)) + '\n' s3.put_object(Bucket=AWS_S3_bucket, Body=filerecord, Key='results/timecost') """
def merge(mlayers, pos, mergepos, nowiter, timer, waittime, itertime, pid):
    # Sum the child models of one merge node and publish the result to S3.
    #
    # mlayers   node count per layer; the last layer holds the workers.
    # pos       [layer, node] of the merge node handled by this call.
    # mergepos  further [layer, node] positions handled afterwards by
    #           recursion; the list is consumed in place.
    # nowiter   current iteration number (used in keys and log lines).
    # timer     object offering local_start() and query(); query()[0] == 1
    #           aborts the wait (presumably a timeout -- confirm in s3func).
    # waittime  an abort with query()[1] <= waittime / 4 is reported as error.
    # itertime  one-element list accumulating the time spent, updated in place.
    # pid       job id used in every S3 key.
    #
    # Returns the timing log (str) on success and 0 when the wait was aborted.
    stt = time.time()
    layer, node = pos[0], pos[1]
    tag = '===== merge node at layer %s node %s =====' % (layer, node)
    log_head = ('merge ' + str(nowiter) + ' layer ' + str(layer) + ' node ' +
                str(node))
    data = ''

    # Children of this node are the files sn .. en-1 of the layer below; the
    # first `remin` nodes of a layer take one extra child.
    base, remin = divmod(mlayers[layer + 1], mlayers[layer])
    print(tag + ' merge phase start')
    sn = 0
    for _ in range(node):
        sn += base
        if remin:
            sn += 1
            remin -= 1
    en = sn + base
    if remin:
        en += 1
    print(tag + ' merge model file at layer %s from %s to %s'
          % (layer + 1, sn, en))

    s3 = boto3.client('s3')
    s32 = boto3.resource('s3')
    itertime[0] += time.time() - stt
    data += (log_head + ' merge ' + str([sn, en - 1]) + ' start up time: ' +
             str(time.time() - stt) + ' ##' + str(stt) + '--' +
             str(time.time()) + '\n')
    #============================================start==============================================
    stt = time.time()
    print(tag + ' iteration %s' % (nowiter,))
    params = []
    itertime[0] += 1
    child_prefix = 'data/modelcifar_' + str(pid) + '_' + str(layer + 1) + '_'
    finished = [0] * (en - sn)
    timer.local_start(0)
    # Poll S3 until every child model has been fetched and accumulated.
    while sum(finished) != en - sn:
        for now in range(sn, en):
            if finished[now - sn] != 0:
                continue
            tresult = timer.query()
            if tresult[0] == 1:
                if not tresult[1] > waittime / 4:
                    print(tag + ' ERROR!!!: fail to read model: layer %s'
                          ' finished state %s' % (layer + 1, str(finished)))
                    s3.put_object(
                        Bucket=AWS_S3_bucket,
                        Body='fail to read model at iteration ' + str(nowiter) +
                        ' at layer ' + str(layer + 1) + ', finished state' +
                        str(finished),
                        Key='error/error_merge_' + str(pid) + '_' + str(layer) +
                        '_' + str(node))
                return 0
            flag = s3func.s3_download_file(s3, AWS_S3_bucket,
                                           child_prefix + str(now),
                                           '/tmp/model', 0, 1, 1)
            if flag != 1:
                continue
            finished[now - sn] = 1
            # Pickle data is binary: text mode fails on Python 3.
            # NOTE(review): the file comes from S3; unpickling is only safe
            # while the bucket is trusted.
            with open('/tmp/model', 'rb') as f:
                temp = pickle.load(f)
            if not temp[0] == []:
                if params == []:
                    params = temp
                else:
                    for i in range(len(temp)):
                        params[i] = params[i] + temp[i]
    data += (log_head + ' model read time: ' + str(time.time() - stt) + ' ##' +
             str(stt) + '--' + str(time.time()) + '\n')

    stt = time.time()
    if layer == 0:
        print(tag + ' now is the final node')
        if not params == []:
            # The top node turns the sum over all workers into the average.
            for i in range(len(params)):
                params[i] = params[i] / mlayers[-1]
        with open('/tmp/model', 'wb') as f:
            pickle.dump(params, f)
        bucket = s32.Bucket(AWS_S3_bucket)
        bucket.upload_file('/tmp/model', 'data/modelcifar_' + str(pid) + '_new')
        bucket.upload_file('/tmp/model',
                           'data/modelcifar_' + str(pid) + '_' + str(nowiter + 1))
    else:
        with open('/tmp/model', 'wb') as f:
            pickle.dump(params, f)
        print(tag + ' write model as layer %s node %s' % (layer, node))
        s32.Bucket(AWS_S3_bucket).upload_file(
            '/tmp/model',
            'data/modelcifar_' + str(pid) + '_' + str(layer) + '_' + str(node))
    itertime[0] += time.time() - stt
    data += (log_head + ' model write time: ' + str(time.time() - stt) + ' ##' +
             str(stt) + '--' + str(time.time()) + '\n')

    if len(mergepos) > 0:
        # This invocation also hosts a merge node further up: handle it next.
        thismergepos = mergepos.pop(0)
        tempd = merge(mlayers, thismergepos, mergepos, nowiter, timer,
                      waittime, itertime, pid)
        if isinstance(tempd, int):
            return tempd
        data += tempd
    return data
def train(event):
    # Run training iterations for one worker and take part in model merging.
    #
    # event keys read: 'ns', 'pos', 'mlayers', 'maxiter', 'nowiter',
    # 'funcname', 'pid', 'rounditer', 'memory', 'mergepos'. The keys
    # 'roundtime', 'batchnumber', 'testtime', 'waittime' and 'learningrate'
    # receive defaults when absent; 'round' is created or incremented and
    # 'nowiter' is updated before the function hands over to a new invocation.
    # Always returns None; progress and errors are reported through S3 objects.
    st = datetime.datetime.now()
    stt = time.time()
    event.setdefault('roundtime', 250)
    tend = event['roundtime']
    ns = event['ns']
    pos = event['pos']
    mlayers = event['mlayers']
    maxiter = event['maxiter']
    nowiter = event['nowiter']
    funcname = event['funcname']
    event.setdefault('batchnumber', 1)
    bn = event['batchnumber']
    pid = event['pid']
    event.setdefault('testtime', 10)
    event.setdefault('waittime', tend * 2 / 3)
    event.setdefault('learningrate', 0.1)
    waittime = event['waittime']
    timer = s3func.timer([waittime, tend])
    if 'round' in event:
        event['round'] += 1
    else:
        event['round'] = 0
    rounditer = event['rounditer']

    tag = '===== train node %s =====' % (pos,)
    trace_key = ('timestamp/timestamp_train_' + str(pid) + '_' + str(pos) +
                 '_' + str(event['round']) + '.tsp')
    s3 = boto3.client('s3')
    s32 = boto3.resource('s3')
    client = boto3.client('lambda', region_name=AWS_region)

    def suspend(trace, iteration):
        # Log the end of this invocation, store the trace and remember the
        # iteration a follow-up invocation has to resume from.
        print('++++++++lambda train %s at iteration %s end at %s'
              % (pos, iteration, datetime.datetime.now()))
        s3.put_object(Bucket=AWS_S3_bucket, Body=trace, Key=trace_key)
        event['nowiter'] = iteration

    if nowiter == 0 and pos == 0:
        s3.put_object(Bucket=AWS_S3_bucket, Body=str(stt),
                      Key='timestamp/timestamp_train_start_' + str(pid))
    response = client.get_function(FunctionName=funcname)
    filerecord = s3func.s3_read_file_v2(s3, AWS_S3_bucket, 'results/result',
                                        0, 1, 0)
    if filerecord == 0:
        # A 0 result is treated as "no record yet".
        filerecord = ''
    filerecord += ('=====' + ' merge: ' + str(mlayers) + ' samples: ' +
                   str(ns) + ' memory: ' + str(event['memory']) +
                   ' testtime left :' + str(event['testtime']) +
                   ' starttime: ' + str(st) + '\n')
    filerecord += '=====' + str(stt) + '\n'
    data = ('train round ' + str(event['round']) + ', round time ' +
            str(event['roundtime']) + ', start at ' + str(st) + ' ##' +
            str(time.time()) + '\n')
    data += ('info: pos ' + str(pos) + ', memory ' +
             str(response['Configuration']['MemorySize']) + ', mlayers ' +
             str(mlayers) + ', ns ' + str(ns) + '\n')
    print(tag + ' train phase start')

    # Samples sn .. en-1 belong to this worker; the first `remin` workers take
    # one extra sample. Sample files hold `split` samples each.
    split = 500
    base, remin = divmod(ns, mlayers[-1])
    sn = 0
    for _ in range(pos):
        sn += base
        if remin:
            sn += 1
            remin -= 1
    en = sn + base
    if remin:
        en += 1
    print(tag + ' read samples from %s to %s' % (sn, en))
    sfile = sn // split
    efile = (en - 1) // split
    print(tag + ' read files from %s to %s' % (sfile, efile))
    data += ('start up time: ' + str(time.time() - stt) + ' ##' +
             str(time.time()) + ' ##' + str(stt) + '--' + str(time.time()) +
             '\n')
    s3.put_object(Bucket=AWS_S3_bucket, Body=data, Key=trace_key)

    #=========================================read========================================
    stt = time.time()
    train_x = []
    train_y = []
    if os.path.exists('/tmp/samples_save'):
        # A previous invocation in the same container left its samples behind.
        print(tag + ' found samples!!!')
        # Pickle data is binary: text mode fails on Python 3.
        with open('/tmp/samples_save', 'rb') as f:
            temp = pickle.load(f)
        train_x = temp['data']
        train_y = temp['label']
        data += ('found samples!!! time: ' + str(time.time() - stt) + ' ##' +
                 str(time.time()) + ' ##' + str(stt) + '--' +
                 str(time.time()) + '\n')
    else:
        print(tag + ' samples not found, downloading')
        first_chunk = True
        for now in range(sfile, efile + 1):
            print('downloading %s from range %s %s' % (now, sfile, efile + 1))
            flag = s3func.s3_download_file(s3, AWS_S3_bucket,
                                           'data/samples_cifar_' + str(now),
                                           '/tmp/samples', 0, 1, 0)
            if flag == 0:
                # NOTE(review): as before, the failure is only logged and the
                # read below is still attempted.
                print(tag + ' ERROR!!!: fail to read sample file: %s' % (now,))
            with open('/tmp/samples', 'rb') as f:
                temp = pickle.load(f)
            sread = max(split * now, sn) - split * now
            eread = min(split * (now + 1), en) - split * now
            if first_chunk:
                train_x = temp['data'][sread:eread]
                train_y = temp['label'][sread:eread]
                first_chunk = False
            else:
                # Extend what was loaded so far (the file contents must not
                # replace the samples of the earlier files).
                train_x = np.append(train_x, temp['data'][sread:eread], axis=0)
                train_y = np.append(train_y, temp['label'][sread:eread], axis=0)
        if os.path.exists('/tmp/samples'):
            os.remove('/tmp/samples')
        with open('/tmp/samples_save', 'wb') as f:
            pickle.dump({'data': train_x, 'label': train_y}, f)
        data += ('read from ' + str(sfile) + ' to ' + str(efile) + ' time: ' +
                 str(time.time() - stt) + ' ##' + str(time.time()) + ' ##' +
                 str(stt) + '--' + str(time.time()) + '\n')
    data += 'samples length: ' + str(len(train_x)) + '\n'
    if nowiter == 0 and pos == 0:
        s3.put_object(Bucket=AWS_S3_bucket, Body=str(time.time()),
                      Key='timestamp/timestamp_train_start_' + str(pid))

    #=========================================initialize==================================
    stt = time.time()
    x, y, output, global_step, y_pred_cls, params = model()
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=output, labels=y))
    # Built once: the learning rate does not change between iterations, and
    # creating the op inside the loop would add a new op to the graph each time.
    optimizer = tf.train.GradientDescentOptimizer(
        learning_rate=event['learningrate']).minimize(
            loss, global_step=global_step)
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    data += ('training initialize time: ' + str(time.time() - stt) + ' ##' +
             str(stt) + '--' + str(time.time()) + '\n')

    #=========================================LOOP start========================================
    # Floor division keeps the batch bounds usable as slice indices.
    bs = len(train_x) // bn
    # NOTE(review): timerecord is not read in this function.
    timerecord = []
    while nowiter < maxiter:
        itertime = [0.0]
        stiter = time.time()
        flag = s3func.s3_download_file(s3, AWS_S3_bucket,
                                       'flag/work_' + str(pid), '/tmp/work',
                                       0, 1, 0)
        if flag == 0:
            print(tag + ' Abandon!!!! pid: %s' % (pid,))
            return
        stt = time.time()
        print('+++++ train node %s pid %s +++++ now start iteration %s'
              % (pos, pid, nowiter))
        print(tag + ' now start iteration %s' % (nowiter,))

        stt2 = time.time()
        flag = s3func.s3_download_file_timer(
            s3, AWS_S3_bucket,
            'data/modelcifar_' + str(pid) + '_' + str(nowiter), '/tmp/model',
            timer, 0, 0)
        itertime[0] += time.time() - stt2
        data += ('training ' + str(nowiter) + ' model waiting time: ' +
                 str(time.time() - stt2) + ' ##' + str(stt2) + '--' +
                 str(time.time()) + '\n')
        if flag == 0:
            if timer.query()[1] > waittime / 4:
                suspend(data, nowiter)
            else:
                print(tag + ' ERROR!!!: fail to read model %s' % (nowiter,))
                s3.put_object(Bucket=AWS_S3_bucket,
                              Body='fail to read model ' + str(nowiter),
                              Key='error/error_train_' + str(pid) + '_' +
                              str(pos))
            return

        if nowiter >= (event['round'] + 1) * rounditer:
            # This round is used up: hand over to a fresh invocation.
            if [0, 0] in event['mergepos']:
                s3.put_object(Bucket=AWS_S3_bucket, Body=filerecord,
                              Key='results/result')
            suspend(data, nowiter)
            invoke_lambda(client, funcname, event)
            return

        stt2 = time.time()
        with open('/tmp/model', 'rb') as f:
            temp = pickle.load(f)
        if temp[0] == []:
            print(tag + ' ERROR!!!: model format wrong %s' % (nowiter,))
            s3.put_object(Bucket=AWS_S3_bucket,
                          Body='model format wrong ' + str(nowiter),
                          Key='error/error_train_' + str(pid) + '_' + str(pos))
            return
        for i in range(len(temp)):
            sess.run(tf.assign(params[i], temp[i]))
        itertime[0] += time.time() - stt2
        data += ('training ' + str(nowiter) + ' download model time: ' +
                 str(time.time() - stt) + ' ##' + str(stt) + '--' +
                 str(time.time()) + '\n')

        stt = time.time()
        print(tag + ' train start')
        for b in range(bn):
            sess.run(
                [global_step, optimizer],
                feed_dict={
                    x: train_x[b * bs:(b + 1) * bs, :],
                    y: train_y[b * bs:(b + 1) * bs, :]
                })
        itertime[0] += time.time() - stt
        data += ('training ' + str(nowiter) + ' train time: ' +
                 str(time.time() - stt) + ' ##' + str(stt) + '--' +
                 str(time.time()) + '\n')

        stt = time.time()
        pw = [sess.run(param) for param in params]
        with open('/tmp/model', 'wb') as f:
            pickle.dump(pw, f)
        print(tag + ' write result as layer %s node %s'
              % (len(mlayers) - 1, pos))
        s32.Bucket(AWS_S3_bucket).upload_file(
            '/tmp/model', 'data/modelcifar_' + str(pid) + '_' +
            str(len(mlayers) - 1) + '_' + str(pos))
        itertime[0] += time.time() - stt
        data += ('training ' + str(nowiter) + ' model write time: ' +
                 str(time.time() - stt) + ' ##' + str(stt) + '--' +
                 str(time.time()) + '\n')

        if len(event['mergepos']) > 0:
            # merge() consumes the list it is given, so work on a copy.
            mergepos = copy.deepcopy(event['mergepos'])
            thismergepos = mergepos.pop(0)
            tempd = merge(mlayers, thismergepos, mergepos, nowiter, timer,
                          waittime, itertime, pid)
            if tempd == 0:
                return
            elif tempd == 1:
                suspend(data, nowiter)
                return
            else:
                data += tempd

        data += ('training ' + str(nowiter) + ' valid iteration time: ' +
                 str(itertime[0]) + '\n')
        print('----- train node %s ----- now end iteration %s'
              % (pos, nowiter))
        filerecord += str(time.time()) + '\n'
        if nowiter >= 2:
            insert_sort(time.time() - stiter, timerecord)
        nowiter += 1

    if [0, 0] in event['mergepos']:
        s3.put_object(Bucket=AWS_S3_bucket, Body=filerecord,
                      Key='results/result')
        s3.put_object(Bucket=AWS_S3_bucket, Body=str(time.time()),
                      Key='timestamp/timestamp_train_end_' + str(pid))
    s3.put_object(Bucket=AWS_S3_bucket, Body=data, Key=trace_key)
def tfpredict(layers, num, modelnum):
    # Print the accuracy of stored models 0 .. modelnum on random samples.
    #
    # layers    unit count per network layer (layers[0] inputs, layers[-1]
    #           outputs).
    # num       passed to gen_random_sample together with layers.
    # modelnum  index of the last model 'data/model_0_<i>' to evaluate.
    # Returns None; one accuracy value is printed per model.
    [samples, ds] = gen_random_sample(layers, num)
    samples = np.array(samples)
    ds = np.array(ds)
    # First half of the rows is labelled -1, second half +1. Floor division
    # keeps the bound usable as a slice index on Python 3 as well.
    half = len(ds) // 2
    ds[0:half, :] = -1.0
    ds[half:, :] = 1.0
    print(ds)
    # Shuffle inputs and labels together, then split them again.
    samples = np.append(samples, ds, axis=1)
    np.random.shuffle(samples)
    training_outputs = samples[:, -layers[-1]:]
    training_inputs = samples[:, 0:-layers[-1]]
    print('%s %s' % (training_inputs.shape, training_outputs.shape))

    depth = len(layers)
    # Index 0 stays an empty placeholder so that layer l uses index l.
    weights = [[] for _ in range(depth)]
    biases = [[] for _ in range(depth)]
    for l in range(1, depth):
        weights[l] = tf.Variable(tf.random_normal([layers[l - 1], layers[l]]))
        biases[l] = tf.Variable(tf.random_normal([1, layers[l]]))
    x = tf.placeholder(tf.float32, [None, layers[0]])
    outputs = [x]
    for l in range(1, depth):
        outputs.append(
            tf.nn.tanh(tf.add(tf.matmul(outputs[l - 1], weights[l]),
                              biases[l])))

    session = tf.Session()
    session.run(tf.global_variables_initializer())
    for mo in range(modelnum + 1):
        s3func.s3_download_file(0, 'lf-source', 'data/model_0_' + str(mo),
                                '/tmp/model', 0, 1, 0)
        # Pickle data is binary: text mode fails on Python 3.
        # NOTE(review): unpickling is only safe while the bucket is trusted.
        with open('/tmp/model', 'rb') as f:
            temp = pickle.load(f)
        for l in range(1, depth):
            session.run(tf.assign(weights[l], temp[0][l]))
            session.run(tf.assign(biases[l], temp[1][l]))
        raw = session.run(outputs[-1], feed_dict={x: training_inputs})
        # Threshold the tanh output to the labels -1 / +1; a row counts as
        # wrong when any of its outputs differs from the label.
        predicted = np.where(raw >= 0, 1, -1)
        error = int(np.sum(np.any(training_outputs != predicted, axis=1)))
        print(1.0 - float(error) / len(training_outputs))
def tfmultitrain(layers, num): #simulate lambda training [samples, ds] = gen_random_sample(layers, num) samples = np.array(samples) ds = np.array(ds) ds[0:len(ds) / 2, :] = -1.0 ds[len(ds) / 2:, :] = 1.0 print ds samples = np.append(samples, ds, axis=1) np.random.shuffle(samples) ds = samples[:, -layers[-1]:] samples = samples[:, 0:-layers[-1]] print samples.shape, ds.shape training_inputs = samples training_outputs = ds weights = [[] for i in range(len(layers))] biases = [[] for i in range(len(layers))] outputs = [[] for i in range(len(layers))] for l in range(len(layers)): if l > 0: weights[l] = tf.Variable( tf.random_normal([layers[l - 1], layers[l]])) biases[l] = tf.Variable(tf.random_normal([1, layers[l]])) x = tf.placeholder(tf.float32, [None, layers[0]]) y = tf.placeholder(tf.float32, [None, layers[-1]]) outputs[0] = x for l in range(len(layers)): if l > 0: outputs[l] = tf.nn.tanh( tf.add(tf.matmul(outputs[l - 1], weights[l]), biases[l])) cost = 0.5 * (y - outputs[-1])**2 #train=tf.train.AdamOptimizer(0.01).minimize(cost) train = tf.train.GradientDescentOptimizer(1e-2).minimize(cost) init = tf.global_variables_initializer() session = tf.Session() session.run(init) st = time.time() s3func.s3_download_file(0, 'lf-source', 'data/model_0_10', '/tmp/model', 0, 1, 0) with open('/tmp/model', 'r') as f: temp = pickle.load(f) for l in range(len(layers)): if l > 0: session.run(tf.assign(weights[l], temp[0][l])) session.run(tf.assign(biases[l], temp[1][l])) rew = session.run(weights) reb = session.run(biases) startw = rew startb = reb nn = 10 bn = 10 bs = len(samples) / nn / bn for i in range(0): allw = [] allb = [] for n in range(nn): #simulate every parallel worker for l in range(len(layers)): if l > 0: session.run(tf.assign(weights[l], startw[l])) session.run(tf.assign(biases[l], startb[l])) for b in range(bn): #every batch rs = n * bn * bs + b * bs re = n * bn * bs + (b + 1) * bs print[i, n, b, rs, re] session.run(train, feed_dict={ x: training_inputs[rs:re, :], y: 
training_outputs[rs:re, :] }) if allw == []: allw = session.run(weights) allb = session.run(biases) else: add_weights(allw, session.run(weights)) add_weights(allb, session.run(biases)) startw = allw startb = allb print 'time cost:----------------', time.time() - st results = session.run(outputs, feed_dict={x: training_inputs}) test_output = results[-1] test_output = list(test_output) for i in range(len(test_output)): for j in range(len(test_output[i])): if test_output[i][j] >= 0: test_output[i][j] = 1 else: test_output[i][j] = -1 error = 0 for i in range(len(training_outputs)): if training_outputs[i] != test_output[i]: #print training_outputs[i],test_output[i] error += 1 print float(error) / len(training_outputs) """
def train(event):
    # Train one MNIST model with the hyper-parameters in `event` and report
    # its test accuracy through S3.
    #
    # event keys read: 'pos', 'layers', 'lr', 'pid', 'maxiter', 'nworker';
    # 'roundtime', 'batchnumber', 'testtime' and 'waittime' receive defaults
    # when absent and 'round' is created or incremented. The worker with the
    # highest position sets 'state' to 2 and invokes 'nntffunc_1' with a copy
    # of the event.
    # Returns None.
    started_at = datetime.datetime.now()
    stt = time.time()
    event.setdefault('roundtime', 250)
    tend = event['roundtime']
    pos = event['pos']
    layers = event['layers']
    lr = event['lr']
    event.setdefault('batchnumber', 1)
    pid = event['pid']
    event.setdefault('testtime', 10)
    event.setdefault('waittime', tend * 2 / 3)
    if 'round' in event:
        event['round'] += 1
    else:
        event['round'] = 0

    tag = '===== train node %s =====' % (pos,)
    trace_key = ('timestamp/timestamp_train_' + str(pid) + '_' + str(pos) +
                 '_' + str(event['round']))
    s3 = boto3.client('s3')
    flag = s3func.s3_download_file(s3, AWS_S3_bucket, 'flag/work_pt',
                                   '/tmp/work', 0, 1, 0)
    if flag == 0:
        print('terminated!!!!!!')
        return
    client = boto3.client('lambda', region_name=AWS_region)

    data = ('train round ' + str(event['round']) + ', round time ' +
            str(event['roundtime']) + ', start at ' + str(started_at) + ' ##' +
            str(time.time()) + '\n')
    data += 'info: pos ' + str(pos) + '\n'
    print(tag + ' train phase start')
    data += ('start up time: ' + str(time.time() - stt) + ' ##' +
             str(time.time()) + ' ##' + str(stt) + '--' + str(time.time()) +
             '\n')
    s3.put_object(Bucket=AWS_S3_bucket, Body=data, Key=trace_key)

    #=========================================read========================================
    print(tag + ' downloading samples')
    downloads = [
        ('data/samples_mnist_x', '/tmp/mnist_x', 'sample file x'),
        ('data/samples_mnist_y', '/tmp/mnist_y', 'sample file y'),
        ('data/samples_mnist_test_x', '/tmp/mnist_test_x', 'test file x'),
        ('data/samples_mnist_test_y', '/tmp/mnist_test_y', 'test file y'),
    ]
    for key, path, label in downloads:
        flag = s3func.s3_download_file(s3, AWS_S3_bucket, key, path, 0, 1, 0)
        if flag == 0:
            # NOTE(review): as before, a failed download is only logged.
            print(tag + ' ERROR!!!: fail to read ' + label)
    train_x = extract_data('/tmp/mnist_x', 60000)
    train_y = extract_labels('/tmp/mnist_y', 60000)
    test_x = extract_data('/tmp/mnist_test_x', 10000)
    test_y = extract_labels('/tmp/mnist_test_y', 10000)
    train_x = train_x.reshape([60000, 28 * 28])
    test_x = test_x.reshape([10000, 28 * 28])
    data += 'samples length: ' + str(len(train_x)) + '\n'

    #=========================================initialize==================================
    stt = time.time()
    outputs, x, y, labels, train_op, weights, biases = model(layers, lr)
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    data += ('training initialize time: ' + str(time.time() - stt) + ' ##' +
             str(stt) + '--' + str(time.time()) + '\n')

    #=========================================LOOP start========================================
    num_iterations = event['maxiter']
    bn = event['batchnumber']
    # Floor division keeps the batch bounds usable as slice indices on
    # Python 3 as well.
    bs = len(train_x) // bn
    for _ in range(num_iterations):
        for b in range(bn):
            batch_xs = train_x[b * bs:(b + 1) * bs]
            batch_ys = train_y[b * bs:(b + 1) * bs]
            sess.run(train_op, feed_dict={x: batch_xs, y: batch_ys})

    result = sess.run(outputs, feed_dict={x: test_x})
    acc = (np.argmax(result[-1], axis=1) == test_y).mean()
    s3.put_object(Bucket=AWS_S3_bucket, Body=str(acc),
                  Key='timestamp/timestamp_trainresult_' + str(pid) + '_' +
                  str(pos))
    s3.put_object(Bucket=AWS_S3_bucket,
                  Body=str([event['layers'], event['lr'],
                            event['batchnumber'], event['maxiter']]),
                  Key='timestamp/timestamp_traininfo_' + str(pid) + '_' +
                  str(pos))
    if pos == event['nworker'] - 1:
        # The last worker starts the follow-up stage.
        event['state'] = 2
        cevent = copy.deepcopy(event)
        invoke_lambda(client, 'nntffunc_1', cevent)
    s3.put_object(Bucket=AWS_S3_bucket, Body=data, Key=trace_key)
def merge(mlayers, pos, mergepos, nowiter, timer, waittime, itertime, pid):
    # Sum the weights and biases of one merge node's children and publish the
    # result to S3.
    #
    # mlayers   node count per layer; the last layer holds the workers.
    # pos       [layer, node] of the merge node handled by this call.
    # mergepos  further [layer, node] positions handled afterwards by
    #           recursion; the list is consumed in place.
    # nowiter   current iteration number (used in keys and log lines).
    # timer     object offering local_start() and query(); query()[0] == 1
    #           aborts the wait (presumably a timeout -- confirm in s3func).
    # waittime  an abort with query()[1] <= waittime / 4 is reported as error.
    # itertime  one-element list accumulating the time spent, updated in place.
    # pid       job id used in every S3 key.
    #
    # Returns the timing log (str) on success and 0 when the wait was aborted.
    stt = time.time()
    layer, node = pos[0], pos[1]
    tag = '===== merge node at layer %s node %s =====' % (layer, node)
    log_head = ('merge ' + str(nowiter) + ' layer ' + str(layer) + ' node ' +
                str(node))
    data = ''

    # Children of this node are the files sn .. en-1 of the layer below; the
    # first `remin` nodes of a layer take one extra child.
    base, remin = divmod(mlayers[layer + 1], mlayers[layer])
    print(tag + ' merge phase start')
    sn = 0
    for _ in range(node):
        sn += base
        if remin:
            sn += 1
            remin -= 1
    en = sn + base
    if remin:
        en += 1
    print(tag + ' merge model file at layer %s from %s to %s'
          % (layer + 1, sn, en))

    s3 = boto3.client('s3')
    s32 = boto3.resource('s3')
    itertime[0] += time.time() - stt
    data += (log_head + 'merge ' + str([sn, en - 1]) + ' start up time: ' +
             str(time.time() - stt) + ' ##' + str(stt) + '--' +
             str(time.time()) + '\n')
    #============================================start==============================================
    stt = time.time()
    print(tag + ' iteration %s' % (nowiter,))
    weights = []
    biases = []
    itertime[0] += 1
    child_prefix = 'data/model_' + str(pid) + '_' + str(layer + 1) + '_'
    finished = [0] * (en - sn)
    timer.local_start(0)
    # Poll S3 until every child model has been fetched and accumulated.
    while sum(finished) != en - sn:
        for now in range(sn, en):
            if finished[now - sn] != 0:
                continue
            tresult = timer.query()
            if tresult[0] == 1:
                if not tresult[1] > waittime / 4:
                    print(tag + ' ERROR!!!: fail to read model: layer %s'
                          % (layer + 1,))
                    s3.put_object(
                        Bucket=AWS_S3_bucket,
                        Body='fail to read model at iteration ' + str(nowiter) +
                        ' at layer ' + str(layer + 1),
                        Key='error/error_merge_' + str(pid) + '_' + str(layer) +
                        '_' + str(node))
                return 0
            flag = s3func.s3_download_file(s3, AWS_S3_bucket,
                                           child_prefix + str(now),
                                           '/tmp/model', 0, 1, 1)
            if flag != 1:
                continue
            finished[now - sn] = 1
            # Binary mode is required for pickle on Python 3 and is harmless
            # on Python 2.
            # NOTE(review): the file comes from S3; unpickling is only safe
            # while the bucket is trusted.
            with open('/tmp/model', 'rb') as f:
                temp = pickle.load(f)
            if not temp[0] == []:
                if weights == []:
                    weights = temp[0]
                    biases = temp[1]
                else:
                    add_weights(weights, temp[0])
                    add_weights(biases, temp[1])
    data += (log_head + ' model read time: ' + str(time.time() - stt) + ' ##' +
             str(stt) + '--' + str(time.time()) + '\n')

    stt = time.time()
    if layer == 0:
        print(tag + ' now is the final node')
        if not weights == []:
            # The top node turns the sum over all workers into the average.
            div_weights(weights, mlayers[-1])
            div_weights(biases, mlayers[-1])
        print('%s %s' % (len(weights), len(biases)))
        merged = [weights, biases]
        with open('/tmp/model', 'wb') as f:
            pickle.dump(merged, f)
        bucket = s32.Bucket(AWS_S3_bucket)
        bucket.upload_file('/tmp/model', 'data/model_' + str(pid) + '_new')
        bucket.upload_file('/tmp/model',
                           'data/model_' + str(pid) + '_' + str(nowiter + 1))
    else:
        merged = [weights, biases]
        with open('/tmp/model', 'wb') as f:
            pickle.dump(merged, f)
        print(tag + ' write model as layer %s node %s' % (layer, node))
        s32.Bucket(AWS_S3_bucket).upload_file(
            '/tmp/model',
            'data/model_' + str(pid) + '_' + str(layer) + '_' + str(node))
    itertime[0] += time.time() - stt
    data += (log_head + ' model write time: ' + str(time.time() - stt) + ' ##' +
             str(stt) + '--' + str(time.time()) + '\n')

    if len(mergepos) > 0:
        # This invocation also hosts a merge node further up: handle it next.
        thismergepos = mergepos.pop(0)
        tempd = merge(mlayers, thismergepos, mergepos, nowiter, timer,
                      waittime, itertime, pid)
        if isinstance(tempd, int):
            return tempd
        data += tempd
    return data
def train(event):
    """Run one round of worker training driven by the ``event`` dict.

    Visible flow:
      1. Read settings from ``event`` (filling defaults for 'roundtime',
         'batchnumber', 'testtime', 'waittime', 'round').
      2. Load this worker's slice of the samples, either from the local cache
         ``/tmp/samples_save`` or from S3 objects ``data/samples_<k>``.
      3. Build a fully connected tanh network with TensorFlow.
      4. Per iteration: check the ``flag/work_<pid>`` control object, wait for
         ``data/model_<pid>_<nowiter>``, load it, train ``bn`` batches, upload
         the local model, and run ``merge`` for every position in
         ``event['mergepos']``.
      5. Write timing logs to S3; re-invoke the function through
         ``invoke_lambda`` when the per-round iteration budget is used up.

    Returns None on every path.

    NOTE(review): the indentation of this function was reconstructed from
    whitespace-collapsed source; spots where the nesting is a guess carry
    their own NOTE(review) comment.
    """
    st=datetime.datetime.now()
    stt=time.time()
    tcount=time.time()
    if 'roundtime' not in event.keys():
        event['roundtime']=250
    tend=event['roundtime']
    ns=event['ns']
    pos=event['pos']
    mlayers=event['mlayers']
    layers=event['layers']
    maxiter=event['maxiter']
    nowiter=event['nowiter']
    funcname=event['funcname']
    if 'batchnumber' not in event.keys():
        event['batchnumber']=1
    bn=event['batchnumber']
    pid=event['pid']
    if 'testtime' not in event.keys():
        event['testtime']=10
    if 'waittime' not in event.keys():
        event['waittime']=tend*2/3
    waittime=event['waittime']
    timer=s3func.timer([waittime,tend])
    #waittime=tend/4
    # 'round' counts invocations: absent on the first call, bumped on each re-invoke.
    if 'round' not in event.keys():
        event['round']=0
    else:
        event['round']+=1
    rounditer=event['rounditer']
    s3 = boto3.client('s3')
    s32 = boto3.resource('s3')
    client=boto3.client('lambda',region_name = AWS_region)
    # Only worker 0 at iteration 0 records the global training start time.
    if nowiter==0 and pos==0:
        s3.put_object(Bucket=AWS_S3_bucket,Body=str(stt), Key='timestamp/timestamp_train_start_'+str(pid))
    response = client.get_function(
        FunctionName=funcname,
    )
    # s3_read_file_v2 returning 0 is treated as "no result file yet".
    filerecord=s3func.s3_read_file_v2(s3,AWS_S3_bucket,'results/result',0,1,0)
    if filerecord==0:
        filerecord=''
    filerecord+='====='+' merge: '+str(mlayers)+' samples: '+str(ns)+' memory: '+str(event['memory'])+' testtime left :'+str(event['testtime'])+' starttime: '+str(st)+'\n'
    filerecord+='====='+str(stt)+'\n'
    data='train round '+str(event['round'])+', round time '+str(event['roundtime'])+', start at '+str(st)+' ##'+str(time.time())+'\n'
    data+='info: pos '+str(pos)+', memory '+str(response['Configuration']['MemorySize'])+', mlayers '+str(mlayers)+', ns '+str(ns)+', layers '+str(layers)+'\n'
    print '='*5,'train node',pos,'='*5,'train phase start'
    # Sample files are assumed to hold `split` samples each (used below to map
    # sample indices to file numbers).
    split=10000
    # Divide the ns samples over the mlayers[-1] workers; the first `remin`
    # workers take one extra sample. [sn, en) is this worker's range.
    base=int(ns/mlayers[-1])
    remin=ns%mlayers[-1]
    sn=0
    for n in range(pos):
        sn+=base
        if remin:
            sn+=1
            remin-=1
    en=sn+base
    if remin:
        en+=1
    print '='*5,'train node',pos,'='*5,'read samples from',sn,'to',en
    training_inputs=[]
    training_outputs=[]
    sfile=int(sn/split)
    efile=int((en-1)/split)
    print '='*5,'train node',pos,'='*5,'read files from',sfile,'to',efile
    data+='start up time: '+str(time.time()-stt)+' ##'+str(time.time())+' ##'+str(stt)+'--'+str(time.time())+'\n'
    s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_train_'+str(pid)+'_'+str(pos)+'_'+str(event['round'])+'.tsp')
    #=========================================read========================================
    stt=time.time()
    # Reuse samples cached in /tmp by an earlier invocation when present.
    if os.path.exists('/tmp/samples_save'):
        print '='*5,'train node',pos,'='*5,'found samples!!!'
        with open('/tmp/samples_save', 'r') as f:
            temp=pickle.load(f)
        #os.remove('/tmp/samples_save_'+str(pos))
        training_inputs=temp[0]
        training_outputs=temp[1]
        data+='found samples!!! time: '+str(time.time()-stt)+' ##'+str(time.time())+' ##'+str(stt)+'--'+str(time.time())+'\n'
    else:
        print '='*5,'train node',pos,'='*5,'samples not found, downloading'
        for now in range(sfile,efile+1):
            print 'downloading',now,'from range',sfile,efile+1
            #flag=s3func.s3_download_file_v2(s3,s32,AWS_S3_bucket,'data/samples_'+str(now),'/tmp/samples',0,1,0)
            flag=s3func.s3_download_file(s3,AWS_S3_bucket,'data/samples_'+str(now),'/tmp/samples',0,1,0)
            # A failed download is only logged; the open() below still runs.
            if flag==0:
                print '='*5,'train node',pos,'='*5,'ERROR!!!: fail to read sample file:',now
            with open('/tmp/samples', 'r') as f:
                temp=pickle.load(f)
            # Clip this worker's [sn, en) range to the part inside file `now`.
            sread=max([split*now,sn])-split*now
            eread=min([split*(now+1),en])-split*now
            # NOTE(review): after the first file training_inputs may be a
            # numpy array, in which case `== []` is not a plain bool -- confirm.
            if training_inputs==[]:
                training_inputs=temp[0][sread:eread]
                training_outputs=temp[1][sread:eread]
            else:
                training_inputs=np.append(training_inputs,temp[0][sread:eread],axis=0)
                training_outputs=np.append(training_outputs,temp[1][sread:eread],axis=0)
                #training_inputs.extend(temp[0][sread:eread])
                #training_outputs.extend(temp[1][sread:eread])
        # NOTE(review): nesting of the cleanup and cache write (after the
        # download loop, inside the else) is reconstructed -- confirm.
        if os.path.exists('/tmp/samples'):
            os.remove('/tmp/samples')
        with open('/tmp/samples_save', 'w') as f:
            pickle.dump([training_inputs,training_outputs], f)
        data+='read from '+str(sfile)+' to '+str(efile)+' time: '+str(time.time()-stt)+' ##'+str(time.time())+' ##'+str(stt)+'--'+str(time.time())+'\n'
    data+='samples length: '+str(len(training_inputs))+'\n'
    if nowiter==0 and pos==0:
        s3.put_object(Bucket=AWS_S3_bucket,Body=str(time.time()), Key='timestamp/timestamp_train_start_2_'+str(pid))
    #=========================================read========================================
    #=========================================initialize==================================
    stt=time.time()
    # Index 0 of weights/biases stays an empty list; layer l>0 gets variables
    # of shape [layers[l-1], layers[l]] and [1, layers[l]].
    weights=[[] for i in range(len(layers))]
    biases=[[] for i in range(len(layers))]
    outputs=[[] for i in range(len(layers))]
    for l in range(len(layers)):
        if l>0:
            weights[l]=tf.Variable(tf.random_normal([layers[l-1], layers[l]]))
            biases[l]=tf.Variable(tf.random_normal([1, layers[l]]))
    x=tf.placeholder(tf.float32,[None,layers[0]])
    y=tf.placeholder(tf.float32,[None,layers[-1]])
    outputs[0]=x
    for l in range(len(layers)):
        if l>0:
            outputs[l]= tf.nn.tanh(tf.add(tf.matmul(outputs[l-1], weights[l]), biases[l]))
    cost=0.5*(y-outputs[-1])**2
    #train=tf.train.AdamOptimizer(0.1).minimize(cost)
    # The local name `train` shadows this function inside its own body.
    train=tf.train.GradientDescentOptimizer(1e-1).minimize(cost)
    init=tf.global_variables_initializer()
    session=tf.Session()
    session.run(init)
    data+='training initialize time: '+str(time.time()-stt)+' ##'+str(stt)+'--'+str(time.time())+'\n'
    #=========================================initialize==================================
    #=========================================LOOP start========================================
    avgitertime=0.0
    avgitertimereal=0.0
    minitertimereal=100000.0
    timerecord=[]
    smt=0.0
    while nowiter<maxiter:
        # itertime is a one-element list so merge() can add to it in place.
        itertime=[0.0]
        stiter=time.time()
        #flag=s3func.s3_download_file_v2(s3,s32,AWS_S3_bucket,'flag/work_'+str(pid),'/tmp/work',0,1,0)
        # The control object flag/work_<pid> must exist; otherwise the worker stops.
        flag=s3func.s3_download_file(s3,AWS_S3_bucket,'flag/work_'+str(pid),'/tmp/work',0,1,0)
        if flag==0:
            print '='*5,'train node',pos,'='*5,'Abandon!!!! pid:',pid
            return
        stt=time.time()
        print '+'*5,'train node',pos,'pid',pid,'+'*5,'now start iteration',nowiter
        print '='*5,'train node',pos,'='*5,'now start iteration',nowiter
        stt2=time.time()
        #flag=s3func.s3_download_file_v2(s3,s32,AWS_S3_bucket,'data/model_'+str(pid)+'_'+str(nowiter),'/tmp/model',max(waittime,tend-time.time()+tcount),0,0)
        #flag=s3func.s3_download_file(s3,AWS_S3_bucket,'data/model_'+str(pid)+'_'+str(nowiter),'/tmp/model',max(waittime,tend-time.time()+tcount),0,0)
        # Wait (bounded by `timer`) for the merged model of this iteration.
        flag=s3func.s3_download_file_timer(s3,AWS_S3_bucket,'data/model_'+str(pid)+'_'+str(nowiter),'/tmp/model',timer,0,0)
        itertime[0]+=time.time()-stt2
        data+='training '+str(nowiter)+' model waiting time: '+str(time.time()-stt2)+' ##'+str(stt2)+'--'+str(time.time())+'\n'
        #print 'flag',flag
        if flag==0:
            # Model not obtained: flush the log and stop, or record an error.
            if timer.query()[1]>waittime/4:
                print '++++++++lambda train',pos,'at iteration',nowiter,'end at',datetime.datetime.now()
                """ with open('/tmp/samples_save_'+str(pos), 'w') as f: pickle.dump([training_inputs,training_outputs], f) """
                s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_train_'+str(pid)+'_'+str(pos)+'_'+str(event['round'])+'.tsp')
                event['nowiter']=nowiter
                #invoke_lambda(client,funcname,event)
                return
            else:
                print '='*5,'train node',pos,'='*5,'ERROR!!!: fail to read model',nowiter
                s3.put_object(Bucket=AWS_S3_bucket,Body='fail to read model '+str(nowiter), Key='error/error_train_'+str(pid)+'_'+str(pos))
                return
        # Iteration budget for this invocation used up: save state in the
        # event and hand over to a fresh invocation of the same function.
        if nowiter>=(event['round']+1)*rounditer:
            print '++++++++lambda train',pos,'at iteration',nowiter,'end at',datetime.datetime.now()
            if [0,0] in event['mergepos']:
                s3.put_object(Bucket=AWS_S3_bucket,Body=filerecord, Key='results/result')
            """ with open('/tmp/samples_save_'+str(pos), 'w') as f: pickle.dump([training_inputs,training_outputs], f) """
            s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_train_'+str(pid)+'_'+str(pos)+'_'+str(event['round'])+'.tsp')
            event['nowiter']=nowiter
            invoke_lambda(client,funcname,event)
            return
        stt2=time.time()
        with open('/tmp/model', 'r') as f:
            temp=pickle.load(f)
        if temp[0]==[]:
            print '='*5,'train node',pos,'='*5,'ERROR!!!: model format wrong',nowiter
            s3.put_object(Bucket=AWS_S3_bucket,Body='model format wrong '+str(nowiter), Key='error/error_train_'+str(pid)+'_'+str(pos))
            return
        # Load the downloaded weights/biases into the TF variables.
        for l in range(len(layers)):
            if l>0:
                session.run(tf.assign(weights[l],temp[0][l]))
                session.run(tf.assign(biases[l],temp[1][l]))
        itertime[0]+=time.time()-stt2
        data+='training '+str(nowiter)+' download model time: '+str(time.time()-stt)+' ##'+str(stt)+'--'+str(time.time())+'\n'
        stt=time.time()
        print '='*5,'train node',pos,'='*5,'train start'
        # Batch size; any remainder samples beyond bn*bs are not used.
        bs=len(training_inputs)/bn
        for b in range(bn):
            session.run(train,feed_dict={x:training_inputs[b*bs:(b+1)*bs,:],y:training_outputs[b*bs:(b+1)*bs,:]})#=========================train
        itertime[0]+=time.time()-stt
        data+='training '+str(nowiter)+' train time: '+str(time.time()-stt)+' ##'+str(stt)+'--'+str(time.time())+'\n'
        stt=time.time()
        rew=session.run(weights)
        reb=session.run(biases)
        model=[rew,reb]
        with open('/tmp/model', 'w') as f:
            pickle.dump(model, f)
        print '='*5,'train node',pos,'='*5,'write result as layer',len(mlayers)-1,'node',pos
        # Workers publish as nodes of the deepest layer, len(mlayers)-1.
        s32.Bucket(AWS_S3_bucket).upload_file('/tmp/model', 'data/model_'+str(pid)+'_'+str(len(mlayers)-1)+'_'+str(pos))
        #s3.put_object(Bucket=AWS_S3_bucket,Body='true', Key='flag/flag_'+str(pid)+'_'+str(nowiter)+'_'+str(len(mlayers)-1)+'_'+str(pos))
        itertime[0]+=time.time()-stt
        data+='training '+str(nowiter)+' model write time: '+str(time.time()-stt)+' ##'+str(stt)+'--'+str(time.time())+'\n'
        if len(event['mergepos'])>0:
            # Work on a copy: merge() consumes the list it is given.
            mergepos=copy.deepcopy(event['mergepos'])
            thismergepos=mergepos[0]
            del mergepos[0]
            #tempd=merge(mlayers,thismergepos,mergepos,nowiter,timer,max(waittime,tend-time.time()+tcount),itertime,pid)
            smt=time.time()
            tempd=merge(mlayers,thismergepos,mergepos,nowiter,timer,waittime,itertime,pid)
            smt=time.time()-smt
            # merge() returns log text on success and an int on failure.
            if tempd==0:
                return
            elif tempd==1:
                print '++++++++lambda train',pos,'at iteration',nowiter,'end at',datetime.datetime.now()
                """ with open('/tmp/samples_save_'+str(pos), 'w') as f: pickle.dump([training_inputs,training_outputs], f) """
                s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_train_'+str(pid)+'_'+str(pos)+'_'+str(event['round'])+'.tsp')
                event['nowiter']=nowiter
                #invoke_lambda(client,funcname,event)
                return
            else:
                data+=tempd
        data+='training '+str(nowiter)+' valid iteration time: '+str(itertime[0])+'\n'
        print '-'*5,'train node',pos,'-'*5,'now end iteration',nowiter
        avgitertime+=itertime[0]
        """ if nowiter>=min(10,maxiter-1) and [0,0] in event['mergepos']: s3.put_object(Bucket=AWS_S3_bucket,Body=str(avgitertime/(nowiter+1)), Key='timestamp/timestamp_iteration_'+str(pid)+'.tsp') """
        thisitertime=time.time()-stiter
        filerecord+=str(time.time())+'\n'
        #filerecord+=str(time.time()-stiter)+'\n'
        if thisitertime<minitertimereal:
            minitertimereal=thisitertime
        #insert_sort(time.time()-stiter,timerecord)
        # The first two iterations are left out of the timing record.
        if nowiter>=2:
            avgitertimereal+=time.time()-stiter
            insert_sort(time.time()-stiter,timerecord)
        #filerecord+=str(smt)+'\n'
        # The worker whose mergepos contains [0,0] publishes iteration timings.
        if nowiter>=min(10,maxiter-1) and [0,0] in event['mergepos']:
            s3.put_object(Bucket=AWS_S3_bucket,Body=str(timerecord), Key='timestamp/timestamp_iteration_each_'+str(pid)+'.tsp')
            #s3.put_object(Bucket=AWS_S3_bucket,Body=str(avg_no_abnormal(timerecord,2)), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
            #s3.put_object(Bucket=AWS_S3_bucket,Body=str(float(np.mean(timerecord))), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
            s3.put_object(Bucket=AWS_S3_bucket,Body=str(float(np.mean(extract_valid(timerecord)))), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
            #s3.put_object(Bucket=AWS_S3_bucket,Body=str(avgitertimereal/(nowiter-2+1)), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
            #s3.put_object(Bucket=AWS_S3_bucket,Body=str(minitertimereal), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
        nowiter+=1
    # NOTE(review): the statements below are placed after the while loop;
    # that nesting is reconstructed -- confirm.
    if [0,0] in event['mergepos']:
        s3.put_object(Bucket=AWS_S3_bucket,Body=filerecord, Key='results/result')
    """ event['testtime']-=1 if event['testtime']>0: inputs={'state':0,'mod':0,'batchnumber':20,'slayers':[],'mlayers':[1,100],'layers':[20,100,100,100,100,100,1],'pos':[0,0],'ns':1000000,'maxiter':10,'nowiter':0,'roundtime':250,'rounditer':15} inputs['testtime']=event['testtime'] invoke_lambda(client,'testfunc',inputs) time.sleep(10) """
    if [0,0] in event['mergepos'] and nowiter>=maxiter:
        s3.put_object(Bucket=AWS_S3_bucket,Body=str(time.time()), Key='timestamp/timestamp_train_end_'+str(pid))
    s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_train_'+str(pid)+'_'+str(pos)+'_'+str(event['round'])+'.tsp')
def monitor(event):
    """Tune worker count and Lambda memory by comparing two configurations.

    ``event['process']`` holds two entries of the form
    ``[pid, n_workers, memory, funcname]``.  Within a 250 s budget the loop
    starts a training run for each entry that has no start-structure
    timestamp yet, waits for its per-iteration timings, compares the two runs
    and derives the next configuration to try.  ``event['regmod']`` selects
    what is tuned: 0 adjusts the worker count from median iteration time,
    anything else adjusts memory from a memory*time metric.  When the budget
    is used up the function re-invokes itself with ``event['round']`` bumped.

    Returns None on every path.

    NOTE(review): the indentation of this function was reconstructed from
    whitespace-collapsed source; spots where the nesting is a guess carry
    their own NOTE(review) comment.
    """
    s3 = boto3.client('s3')
    client=boto3.client('lambda',region_name = AWS_region)
    data=''
    print 'monitor begin'
    #====================================================load info===================================================================
    if 'process' not in event.keys():
        print 'ERROR!!!!! process key not found'
        s3.put_object(Bucket=AWS_S3_bucket,Body='process key not found', Key='error/error_monitor.tsp')
        return
    if 'round' not in event.keys():
        event['round']=0
    process=event['process']
    # Time budget (seconds) for one invocation of the monitor.
    tend=250
    cratio=event['cratio']# cost ratio
    pcratio=event['pcratio']# performance/cost ratio
    rangen=event['rangen']
    rangemem=event['rangemem']
    maxchange=event['maxchange']
    changestep=event['changestep']
    # [] means "no finished run yet"; later holds the pid of the latest run.
    lastpid=[]
    data+='basic info: '+'pcratio: '+str(pcratio)+'rangen: '+str(rangen)+'maxchange: '+str(maxchange)+'changestep: '+str(changestep)+'\n'
    if 'avgrecord' not in event.keys():
        event['avgrecord']=[[0.0,0] for i in range(20)]
    if 'lastslop' not in event.keys():
        event['lastslop']=[]
    if 'regmod' not in event.keys():
        event['regmod']=0
    # regcount = [steps in mode 0, steps in mode 1, completed mode cycles].
    if 'regcount' not in event.keys():
        event['regcount']=[0,0,0]
    diffrange=event['diffrange']
    tcount=time.time()
    data+='UTC:'+str(datetime.datetime.now())+'. '+str(tcount)+'\n'
    s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_monitor_'+str(event['round'])+'.tsp')
    # Carry the previous round's log forward into this round's log.
    if event['round']>0:
        temp=s3func.s3_read_file_v2(s3,AWS_S3_bucket,'timestamp/timestamp_monitor_'+str(event['round']-1)+'.tsp',0,1,0)
        if temp!=0:
            data+=temp
            # NOTE(review): nesting of this separator line is reconstructed -- confirm.
            data+='=============================='
    best=process[0]
    #====================================================load info===================================================================
    while time.time()-tcount<tend:
        #load the control file "work_monitor". If it is deleted manually, monitor stops.
        flag=s3func.s3_download_file(s3,AWS_S3_bucket,'flag/work_monitor','/tmp/work',0,1,0)
        if flag==0:
            print 'monitor terminated!!!!!!'
            return
        # Switch between the two tuning modes once a mode has run
        # event['regtimes'][mode] steps.
        if event['regmod']==0:
            if event['regcount'][0]>=event['regtimes'][0]:
                event['regcount'][0]=0
                event['regmod']=1
        # NOTE(review): nesting of the cycle counter and the termination
        # check inside this branch is reconstructed -- confirm.
        if event['regmod']==1:
            if event['regcount'][1]>=event['regtimes'][1]:
                event['regcount'][1]=0
                event['regmod']=0
                event['regcount'][2]+=1
                if event['regcount'][2]>=event['regtimes'][2]:
                    print 'final result:',best
                    data+='final result: '+str(best)+'\n'
                    print 'achieve the maximum regression constraint, terminated!!!!!'
                    data+='achieve the maximum regression constraint, terminated!!!!!\n'
                    s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_monitor_'+str(event['round'])+'.tsp')
                    return
        times=[-1,-1]
        print 'regression times:',event['regcount']
        data+='regression times: '+str(event['regcount'])+'\n'
        #+++++++++++++++++++++++++++++++++++++++++++++++++++++++
        # Start the run for process[0] unless its start-structure timestamp
        # already exists in S3.
        temp=s3func.s3_read_file_v2(s3,AWS_S3_bucket,'timestamp/timestamp_startstructure_'+str(process[0][0])+'.tsp',0,1,0)
        if temp==0:
            cevent=copy.deepcopy(event)
            cevent['pid']=process[0][0]
            cevent['mlayers']=[1,process[0][1]]
            cevent['state']=2
            cevent['funcname']=process[0][3]
            cevent['memory']=process[0][2]
            cevent['round']=-1
            # Continue from the previous run's merged model when there is one.
            if lastpid!=[]:
                cevent['modelname']='data/model_'+str(lastpid)+'_new'
            else:
                cevent['modelname']='data/model'
            print 'invoke',process[0][3]
            invoke_lambda(client,process[0][3],cevent)
            temp=s3func.s3_read_file_v2(s3,AWS_S3_bucket,'timestamp/timestamp_startstructure_'+str(process[0][0])+'.tsp',tend-time.time()+tcount,0,0)
        data+='waiting the first iteration data:'+'timestamp/timestamp_iteration_real_'+str(process[0][0])+'.tsp'+'\n'
        s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_monitor_'+str(event['round'])+'.tsp')
        temp=0
        # Poll for the timing list of process[0] until it appears or the budget runs out.
        while tend-time.time()+tcount>0 and temp==0:
            flag=s3func.s3_download_file(s3,AWS_S3_bucket,'flag/work_monitor','/tmp/work',0,1,0)
            if flag==0:
                print 'monitor terminated!!!!!!'
                return
            temp=s3func.s3_read_file_v2(s3,AWS_S3_bucket,'timestamp/timestamp_iteration_each_'+str(process[0][0])+'.tsp',0,1,0)
            time.sleep(5)
        if temp==0:
            print 'ERROR!!!!! connot read timestamp',process[0]
            s3.put_object(Bucket=AWS_S3_bucket,Body='connot read timestamp: '+str(process[0]), Key='error/error_monitor.tsp')
            # Leave the main loop; the code after it runs next.
            break
            #return
        times[0]=json.loads(temp)
        # Stop that run (remove its work flag) and clear its train timestamps.
        s3func.s3_delete_file(s3,AWS_S3_bucket,'flag/work_'+str(process[0][0]))
        s3func.s3_clear_bucket(AWS_S3_bucket,'timestamp/timestamp_train_'+str(process[0][0]))
        lastpid=process[0][0]
        data+='the time cost is: '+str(times[0])+'\n'
        #--------------------------------------------------------
        # Same procedure for process[1].
        temp=s3func.s3_read_file_v2(s3,AWS_S3_bucket,'timestamp/timestamp_startstructure_'+str(process[1][0])+'.tsp',0,1,0)
        if temp==0:
            cevent=copy.deepcopy(event)
            cevent['pid']=process[1][0]
            cevent['mlayers']=[1,process[1][1]]
            cevent['state']=2
            cevent['funcname']=process[1][3]
            cevent['memory']=process[1][2]
            cevent['round']=-1
            if lastpid!=[]:
                cevent['modelname']='data/model_'+str(lastpid)+'_new'
            else:
                cevent['modelname']='data/model'
            print 'invoke',process[1][3]
            invoke_lambda(client,process[1][3],cevent)
            temp=s3func.s3_read_file_v2(s3,AWS_S3_bucket,'timestamp/timestamp_startstructure_'+str(process[1][0])+'.tsp',tend-time.time()+tcount,0,0)
        data+='waiting the second iteration data:'+'timestamp/timestamp_iteration_real_'+str(process[1][0])+'.tsp'+'\n'
        s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_monitor_'+str(event['round'])+'.tsp')
        temp=0
        while tend-time.time()+tcount>0 and temp==0:
            flag=s3func.s3_download_file(s3,AWS_S3_bucket,'flag/work_monitor','/tmp/work',0,1,0)
            if flag==0:
                print 'monitor terminated!!!!!!'
                return
            temp=s3func.s3_read_file_v2(s3,AWS_S3_bucket,'timestamp/timestamp_iteration_each_'+str(process[1][0])+'.tsp',0,1,0)
            time.sleep(5)
        if temp==0:
            print 'ERROR!!!!! connot read timesteamp',process[1]
            s3.put_object(Bucket=AWS_S3_bucket,Body='connot read timestamp: '+str(process[1]), Key='error/error_monitor.tsp')
            break
            #return
        times[1]=json.loads(temp)
        s3func.s3_delete_file(s3,AWS_S3_bucket,'flag/work_'+str(process[1][0]))
        s3func.s3_clear_bucket(AWS_S3_bucket,'timestamp/timestamp_train_'+str(process[1][0]))
        lastpid=process[1][0]
        data+='the time cost is: '+str(times[1])+'\n'
        #+++++++++++++++++++++++++++++++++++++++++++++++++++++++
        if event['regmod']==0:
            # Mode 0: tune the worker count using median iteration time.
            times[0]=float(np.median(times[0]))
            times[1]=float(np.median(times[1]))
            # Order the pair so that process[0] is the faster configuration.
            if times[1]<times[0]:
                process.reverse()
                times.reverse()
            best=process[0]
            nextfunc=process[1][3]
            F=times[0]-times[1]
            data+=str(process[0])+': '+str(times[0])+'. '+str(process[1])+': '+str(times[1])+'. \n'
            data+='delta target value is '+str(F)+'\n'
            if process[0][1]==process[1][1]:
                nextn=process[0][1]
            else:
                # Step along the slope F/(n0-n1); once a previous slope is
                # known, a second term uses the change of slope.
                if event['lastslop']==[]:
                    change=-pcratio[0]*F/(process[0][1]-process[1][1])
                    event['lastslop']=F/(process[0][1]-process[1][1])
                else:
                    change=-pcratio[0]*F/(process[0][1]-process[1][1])+pcratio[1]*(F/(process[0][1]-process[1][1])-event['lastslop'])
                    event['lastslop']=F/(process[0][1]-process[1][1])
                change=ceil_step(change,changestep[0])
                # Clamp the step to +/- maxchange[0].
                if change>maxchange[0]:
                    change=maxchange[0]
                elif change<-maxchange[0]:
                    change=-maxchange[0]
                nextn=int(process[0][1]+change)
            nextmem=process[0][2]
        else:
            # Other mode: tune memory. The metric is the median of
            # memory*time, or of 1/(memory*time^2) when OPTIMIZATION != 0.
            metric=[0.0,0.0]
            if OPTIMIZATION==0:
                metric[0]=float(np.median([(process[0][2]*ts) for ts in times[0]]))
                metric[1]=float(np.median([(process[1][2]*ts) for ts in times[1]]))
            else:
                metric[0]=float(np.median([1.0/(process[0][2]*ts**2) for ts in times[0]]))
                metric[1]=float(np.median([1.0/(process[1][2]*ts**2) for ts in times[1]]))
            # Order the pair so that process[0] has the smaller metric.
            if metric[1]<metric[0]:
                #if process[1][2]*times[1]<process[0][2]*times[0]:
                process.reverse()
                times.reverse()
                metric.reverse()
            best=process[0]
            nextfunc=process[1][3]
            F=metric[0]-metric[1]
            data+=str(process[0])+': '+str(metric[0])+'. '+str(process[1])+': '+str(metric[1])+'. \n'
            data+='delta target value is '+str(F)+'\n'
            nextn=process[0][1]
            if process[0][2]==process[1][2]:
                nextmem=process[0][2]
            else:
                #change=-pcratio[2]*F/(process[0][2]-process[1][2])
                if event['lastslop']==[]:
                    change=-pcratio[2]*F/(process[0][2]-process[1][2])
                    event['lastslop']=F/(process[0][2]-process[1][2])
                else:
                    change=-pcratio[2]*F/(process[0][2]-process[1][2])+pcratio[3]*(F/(process[0][2]-process[1][2])-event['lastslop'])
                    event['lastslop']=F/(process[0][2]-process[1][2])
                #change=-change#for 1/mt2
                change=ceil_step(change,changestep[1])
                if change>maxchange[1]:
                    change=maxchange[1]
                elif change<-maxchange[1]:
                    change=-maxchange[1]
                nextmem=int(process[0][2]+change)
        nextpid=max(process[0][0],process[1][0])+1
        # Keep the proposed configuration inside the allowed ranges.
        if nextn>rangen[1]:
            nextn=rangen[1]
        elif nextn<rangen[0]:
            nextn=rangen[0]
        if nextmem>rangemem[1]:
            nextmem=rangemem[1]
        elif nextmem<rangemem[0]:
            nextmem=rangemem[0]
        #s3func.s3_clear_bucket(AWS_S3_bucket,'data/model_'+str(process[1][0]))
        # The better configuration moves to slot 1; the new candidate takes slot 0.
        process[1]=process[0]
        process[0]=[nextpid,nextn,nextmem,nextfunc]
        #process[1][0]=max(process[0][0],process[1][0])+1
        event['process']=process
        # Apply the new memory size to the function that runs the candidate.
        response = client.update_function_configuration(
            FunctionName=nextfunc,
            MemorySize=nextmem,
        )
        time.sleep(5)
        print '---------',process
        # Stop when the candidate equals the current best configuration.
        if abs(process[0][1]-process[1][1])<1 and process[0][2]==process[1][2]:
            print 'terminated!!!'
            print 'final result:',best
            data+='final result: '+str(best)+'\n'
            #data+='the final result is '+str(process[0])
            s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_monitor_'+str(event['round'])+'.tsp')
            return
        data+= '---------'+str(process)+'\n'
        if event['regmod']==0:
            event['regcount'][0]+=1
        else:
            event['regcount'][1]+=1
    # NOTE(review): the statements below are placed after the while loop
    # (budget exhausted or `break`); that nesting is reconstructed -- confirm.
    s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_monitor_'+str(event['round'])+'.tsp')
    event['round']+=1
    invoke_lambda(client,event['funcname'],event)
def test_result(layers,num):
    """Evaluate the model ``data/model_0_new`` on the stored samples.

    Downloads ``data/samples_0`` .. ``data/samples_98`` from S3, rebuilds the
    tanh network described by ``layers``, loads the saved weights and biases,
    thresholds every network output to +1 / -1 and prints the fraction of
    samples whose thresholded output differs from the stored target,
    followed by the elapsed time in seconds.

    Args:
        layers: list of layer widths, input layer first.
        num: not used by this function; kept so existing callers still work.

    Raises:
        IOError: if a sample file cannot be downloaded.  Previously the
            failure was ignored and a stale ``/tmp/samples`` was re-read.
    """
    st = time.time()
    s3 = boto3.client('s3')

    # Collect per-file chunks and join them once at the end instead of
    # re-copying the growing array with np.append on every file.
    input_chunks = []
    output_chunks = []
    loaded = 0
    for now in range(99):
        print(now)
        flag = s3func.s3_download_file(s3,AWS_S3_bucket,'data/samples_'+str(now),'/tmp/samples',0,1,0)
        if flag == 0:
            raise IOError('fail to read sample file: data/samples_'+str(now))
        # NOTE: pickle.load executes arbitrary code from the file; the S3
        # bucket is assumed to be trusted.
        with open('/tmp/samples', 'rb') as f:
            temp = pickle.load(f)
        print(len(temp[0]))
        input_chunks.append(temp[0])
        output_chunks.append(temp[1])
        loaded += len(temp[0])
        print(loaded)
    training_inputs = np.concatenate(input_chunks, axis=0)
    training_outputs = np.concatenate(output_chunks, axis=0)
    print(len(training_inputs))

    # Same network layout as train(): index 0 is a placeholder slot, every
    # layer l>0 is tanh(outputs[l-1] x weights[l] + biases[l]).
    nlayers = len(layers)
    weights = [[] for _ in range(nlayers)]
    biases = [[] for _ in range(nlayers)]
    outputs = [[] for _ in range(nlayers)]
    for l in range(1, nlayers):
        weights[l] = tf.Variable(tf.random_normal([layers[l-1], layers[l]]))
        biases[l] = tf.Variable(tf.random_normal([1, layers[l]]))
    x = tf.placeholder(tf.float32,[None,layers[0]])
    outputs[0] = x
    for l in range(1, nlayers):
        outputs[l] = tf.nn.tanh(tf.add(tf.matmul(outputs[l-1], weights[l]), biases[l]))

    session = tf.Session()
    try:
        session.run(tf.global_variables_initializer())
        # The download result is not checked here, as before: a missing
        # model surfaces as an error from open() or pickle.load below.
        s3func.s3_download_file(s3,AWS_S3_bucket,'data/model_0_new','/tmp/model',0,1,0)
        with open('/tmp/model', 'rb') as f:
            temp = pickle.load(f)
        for l in range(1, nlayers):
            session.run(tf.assign(weights[l],temp[0][l]))
            session.run(tf.assign(biases[l],temp[1][l]))
        raw_output = session.run(outputs[-1],feed_dict={x:training_inputs})
    finally:
        # Release the TensorFlow session even if loading or inference fails.
        session.close()

    # Threshold to +1 / -1 and count the samples with any mismatching output.
    # Comparing whole rows avoids the ambiguous truth value of a
    # multi-element array comparison.
    test_output = np.where(np.asarray(raw_output) >= 0, 1, -1)
    expected = np.asarray(training_outputs).reshape(len(test_output), -1)
    error = int(np.sum(np.any(expected != test_output, axis=1)))
    print(float(error)/len(training_outputs))
    print(time.time()-st)