Example #1
def monitor(event):
    st = datetime.datetime.now()
    stt = time.time()
    nworker = event['nworker']
    pid = event['pid']
    if 'roundtime' not in event:
        event['roundtime'] = 250
    if 'waittime' not in event:
        event['waittime'] = event['roundtime'] * 2 / 3
    timer = s3func.timer([event['waittime'], event['roundtime']])
    s3 = boto3.client('s3')
    s32 = boto3.resource('s3')
    flag = s3func.s3_download_file(s3, AWS_S3_bucket, 'flag/work_pt',
                                   '/tmp/work', 0, 1, 0)
    if flag == 0:
        print('monitor terminated!!!!!!')
        return
    s3.put_object(Bucket=AWS_S3_bucket,
                  Body=str(st),
                  Key='timestamp/timestamp_monitor')
    finished = [0 for i in range(nworker)]
    timer.local_start(0)
    bresult = 0.0
    bpos = 0
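    # Poll S3 until every worker has posted a result file, keeping the best
    # accuracy seen so far and the worker position that produced it.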
    while True:
        if sum(finished) == nworker:
            break
        for now in range(nworker):
            if finished[now] == 0:
                tresult = timer.query()
                if tresult[0] == 1:
                    return 0
                flag = s3func.s3_download_file(
                    s3, AWS_S3_bucket, 'timestamp/timestamp_trainresult_' +
                    str(pid) + '_' + str(now), '/tmp/result', 0, 1, 0)
                if flag == 1:
                    finished[now] = 1
                    with open('/tmp/result', 'r') as f:
                        temp = f.read()
                    r = float(temp)
                    if r > bresult:
                        bresult = r
                        bpos = now
    s3.put_object(Bucket=AWS_S3_bucket,
                  Body=str([bresult, bpos]),
                  Key='timestamp/timestamp_final_result')
    et = time.time()
    st = s3func.s3_read_file_v2(s3, AWS_S3_bucket,
                                'timestamp/timestamp_startup.tsp', 0, 1, 0)
    filerecord = s3func.s3_read_file_v2(s3, AWS_S3_bucket, 'results/timecost',
                                        0, 1, 0)
    if filerecord == 0:
        filerecord = ''
    filerecord += str(et - float(st)) + '\n'
    s3.put_object(Bucket=AWS_S3_bucket,
                  Body=filerecord,
                  Key='results/timecost')
    """
Example #2
def train(event):
    st = datetime.datetime.now()
    stt = time.time()
    tcount = time.time()
    if 'roundtime' not in event:
        event['roundtime'] = 250
    tend = event['roundtime']
    ns = event['ns']
    pos = event['pos']
    mlayers = event['mlayers']
    maxiter = event['maxiter']
    nowiter = event['nowiter']
    funcname = event['funcname']
    if 'batchnumber' not in event:
        event['batchnumber'] = 1
    bn = event['batchnumber']
    pid = event['pid']
    if 'testtime' not in event:
        event['testtime'] = 10
    if 'waittime' not in event:
        event['waittime'] = tend * 2 / 3
    if 'learningrate' not in event:
        event['learningrate'] = 0.1
    waittime = event['waittime']
    timer = s3func.timer([waittime, tend])
    if 'round' not in event:
        event['round'] = 0
    else:
        event['round'] += 1
    rounditer = event['rounditer']
    s3 = boto3.client('s3')
    s32 = boto3.resource('s3')
    client = boto3.client('lambda', region_name=AWS_region)
    if nowiter == 0 and pos == 0:
        s3.put_object(Bucket=AWS_S3_bucket,
                      Body=str(stt),
                      Key='timestamp/timestamp_train_start_' + str(pid))
    response = client.get_function(FunctionName=funcname)
    filerecord = s3func.s3_read_file_v2(s3, AWS_S3_bucket, 'results/result', 0,
                                        1, 0)
    if filerecord == 0:
        filerecord = ''
    filerecord += '=====' + ' merge: ' + str(mlayers) + ' samples: ' + str(
        ns) + ' memory: ' + str(event['memory']) + ' testtime left :' + str(
            event['testtime']) + ' starttime: ' + str(st) + '\n'
    filerecord += '=====' + str(stt) + '\n'
    data = 'train round ' + str(event['round']) + ', round time ' + str(
        event['roundtime']) + ', start at ' + str(st) + ' ##' + str(
            time.time()) + '\n'
    data += 'info: pos ' + str(pos) + ', memory ' + str(
        response['Configuration']['MemorySize']) + ', mlayers ' + str(
            mlayers) + ', ns ' + str(ns) + '\n'
    print('=' * 5, 'train node', pos, '=' * 5, 'train phase start')
    split = 500
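    # Shard the ns samples across the mlayers[-1] leaf workers: each gets
    # `base` samples and the first (ns % mlayers[-1]) workers take one extra,
    # so [sn, en) is this node's contiguous slice of the dataset.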
    base = int(ns / mlayers[-1])
    remin = ns % mlayers[-1]
    sn = 0
    for n in range(pos):
        sn += base
        if remin:
            sn += 1
            remin -= 1
    en = sn + base
    if remin:
        en += 1
    print('=' * 5, 'train node', pos, '=' * 5, 'read samples from', sn, 'to',
          en)
    train_x = []
    train_y = []
    sfile = int(sn / split)
    efile = int((en - 1) / split)
    print('=' * 5, 'train node', pos, '=' * 5, 'read files from', sfile, 'to',
          efile)
    data += 'start up time: ' + str(time.time() - stt) + ' ##' + str(
        time.time()) + ' ##' + str(stt) + '--' + str(time.time()) + '\n'
    s3.put_object(Bucket=AWS_S3_bucket,
                  Body=data,
                  Key='timestamp/timestamp_train_' + str(pid) + '_' +
                  str(pos) + '_' + str(event['round']) + '.tsp')
    #=========================================read========================================
    stt = time.time()
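    # /tmp persists across invocations of a warm Lambda container, so a
    # previous round may already have left the samples on local disk.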
    if os.path.exists('/tmp/samples_save'):
        print('=' * 5, 'train node', pos, '=' * 5, 'found samples!!!')
        with open('/tmp/samples_save', 'rb') as f:
            temp = pickle.load(f)
        #os.remove('/tmp/samples_save_'+str(pos))
        train_x = temp['data']
        train_y = temp['label']
        data += 'found samples!!! time: ' + str(
            time.time() - stt) + ' ##' + str(
                time.time()) + ' ##' + str(stt) + '--' + str(
                    time.time()) + '\n'
    else:
        print('=' * 5, 'train node', pos, '=' * 5,
              'samples not found, downloading')
        for now in range(sfile, efile + 1):
            print('downloading', now, 'from range', sfile, efile + 1)
            flag = s3func.s3_download_file(s3, AWS_S3_bucket,
                                           'data/samples_cifar_' + str(now),
                                           '/tmp/samples', 0, 1, 0)
            if flag == 0:
                print('=' * 5, 'train node', pos, '=' * 5,
                      'ERROR!!!: fail to read sample file:', now)
            with open('/tmp/samples', 'rb') as f:
                temp = pickle.load(f)
            sread = max([split * now, sn]) - split * now
            eread = min([split * (now + 1), en]) - split * now

            if len(train_x) == 0:
                train_x = temp['data'][sread:eread]
                train_y = temp['label'][sread:eread]
            else:
                # Accumulate this file's slice onto what has been read so far.
                train_x = np.append(train_x,
                                    temp['data'][sread:eread],
                                    axis=0)
                train_y = np.append(train_y,
                                    temp['label'][sread:eread],
                                    axis=0)

        if os.path.exists('/tmp/samples'):
            os.remove('/tmp/samples')
        with open('/tmp/samples_save', 'wb') as f:
            pickle.dump({'data': train_x, 'label': train_y}, f)
        data += 'read from ' + str(sfile) + ' to ' + str(
            efile) + ' time: ' + str(time.time() - stt) + ' ##' + str(
                time.time()) + ' ##' + str(stt) + '--' + str(
                    time.time()) + '\n'
    data += 'samples length: ' + str(len(train_x)) + '\n'
    if nowiter == 0 and pos == 0:
        s3.put_object(Bucket=AWS_S3_bucket,
                      Body=str(time.time()),
                      Key='timestamp/timestamp_train_start_' + str(pid))
    #=========================================read========================================
    #=========================================initialize==================================
    stt = time.time()
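    # model() is defined elsewhere in this module (not shown); it is assumed
    # to build the CIFAR network and return the input/label placeholders,
    # logits, global step, predicted classes, and the trainable parameters.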
    x, y, output, global_step, y_pred_cls, params = model()

    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=output, labels=y))
    #optimizer = tf.train.RMSPropOptimizer(learning_rate=1e-3).minimize(loss, global_step=global_step)
    #optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(loss, global_step=global_step)
    # Build the optimizer once, outside the training loop, so the TF1 graph
    # does not grow with every iteration.
    optimizer = tf.train.GradientDescentOptimizer(
        learning_rate=event['learningrate']).minimize(
            loss, global_step=global_step)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    data += 'training initialize time: ' + str(
        time.time() - stt) + ' ##' + str(stt) + '--' + str(time.time()) + '\n'
    #=========================================initialize==================================
    #=========================================LOOP start========================================
    avgitertime = 0.0
    avgitertimereal = 0.0
    minitertimereal = 100000.0
    timerecord = []
    smt = 0.0
    while nowiter < maxiter:
        itertime = [0.0]
        stiter = time.time()
        flag = s3func.s3_download_file(s3, AWS_S3_bucket,
                                       'flag/work_' + str(pid), '/tmp/work', 0,
                                       1, 0)
        if flag == 0:
            print('=' * 5, 'train node', pos, '=' * 5, 'Abandon!!!! pid:', pid)
            return
        stt = time.time()
        print('+' * 5, 'train node', pos, 'pid', pid, '+' * 5,
              'now start iteration', nowiter)
        print('=' * 5, 'train node', pos, '=' * 5, 'now start iteration',
              nowiter)
        stt2 = time.time()
        flag = s3func.s3_download_file_timer(
            s3, AWS_S3_bucket,
            'data/modelcifar_' + str(pid) + '_' + str(nowiter), '/tmp/model',
            timer, 0, 0)
        itertime[0] += time.time() - stt2
        data += 'training ' + str(nowiter) + ' model waiting time: ' + str(
            time.time() - stt2) + ' ##' + str(stt2) + '--' + str(
                time.time()) + '\n'
        if flag == 0:
            if timer.query()[1] > waittime / 4:
                print('++++++++lambda train', pos, 'at iteration', nowiter,
                      'end at', datetime.datetime.now())
                s3.put_object(Bucket=AWS_S3_bucket,
                              Body=data,
                              Key='timestamp/timestamp_train_' + str(pid) +
                              '_' + str(pos) + '_' + str(event['round']) +
                              '.tsp')
                event['nowiter'] = nowiter
                return
            else:
                print('=' * 5, 'train node', pos, '=' * 5,
                      'ERROR!!!: fail to read model', nowiter)
                s3.put_object(Bucket=AWS_S3_bucket,
                              Body='fail to read model ' + str(nowiter),
                              Key='error/error_train_' + str(pid) + '_' +
                              str(pos))
                return
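        # Round budget exhausted: flush the timing log and re-invoke this
        # same function so training continues past the Lambda time limit.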
        if nowiter >= (event['round'] + 1) * rounditer:
            if [0, 0] in event['mergepos']:
                s3.put_object(Bucket=AWS_S3_bucket,
                              Body=filerecord,
                              Key='results/result')
            print('++++++++lambda train', pos, 'at iteration', nowiter,
                  'end at', datetime.datetime.now())
            s3.put_object(Bucket=AWS_S3_bucket,
                          Body=data,
                          Key='timestamp/timestamp_train_' + str(pid) + '_' +
                          str(pos) + '_' + str(event['round']) + '.tsp')
            event['nowiter'] = nowiter
            invoke_lambda(client, funcname, event)
            return
        stt2 = time.time()
        with open('/tmp/model', 'rb') as f:
            temp = pickle.load(f)
        if temp[0] == []:
            print('=' * 5, 'train node', pos, '=' * 5,
                  'ERROR!!!: model format wrong', nowiter)
            s3.put_object(Bucket=AWS_S3_bucket,
                          Body='model format wrong ' + str(nowiter),
                          Key='error/error_train_' + str(pid) + '_' + str(pos))
            return
        for i in range(len(temp)):
            sess.run(tf.assign(params[i], temp[i]))
        itertime[0] += time.time() - stt2
        data += 'training ' + str(nowiter) + ' download model time: ' + str(
            time.time() - stt) + ' ##' + str(stt) + '--' + str(
                time.time()) + '\n'
        stt = time.time()
        bs = len(train_x) // bn
        print('=' * 5, 'train node', pos, '=' * 5, 'train start')
        for b in range(bn):
            i_global, _ = sess.run(
                [global_step, optimizer],
                feed_dict={
                    x: train_x[b * bs:(b + 1) * bs, :],
                    y: train_y[b * bs:(b + 1) * bs, :]
                })  #=========================train
        itertime[0] += time.time() - stt
        data += 'training ' + str(nowiter) + ' train time: ' + str(
            time.time() - stt) + ' ##' + str(stt) + '--' + str(
                time.time()) + '\n'
        stt = time.time()
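        # Publish this worker's updated weights as the leaf layer of the
        # merge tree; merge() below combines models up the tree.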
        pw = []
        for i in range(len(params)):
            pw.append(sess.run(params[i]))
        with open('/tmp/model', 'wb') as f:
            pickle.dump(pw, f)
        print('=' * 5, 'train node', pos, '=' * 5, 'write result as layer',
              len(mlayers) - 1, 'node', pos)
        s32.Bucket(AWS_S3_bucket).upload_file(
            '/tmp/model', 'data/modelcifar_' + str(pid) + '_' +
            str(len(mlayers) - 1) + '_' + str(pos))
        itertime[0] += time.time() - stt
        data += 'training ' + str(nowiter) + ' model write time: ' + str(
            time.time() - stt) + ' ##' + str(stt) + '--' + str(
                time.time()) + '\n'
        if len(event['mergepos']) > 0:
            mergepos = copy.deepcopy(event['mergepos'])
            thismergepos = mergepos[0]
            del mergepos[0]
            #tempd=merge(mlayers,thismergepos,mergepos,nowiter,timer,max(waittime,tend-time.time()+tcount),itertime,pid)
            smt = time.time()
            tempd = merge(mlayers, thismergepos, mergepos, nowiter, timer,
                          waittime, itertime, pid)
            smt = time.time() - smt
            if tempd == 0:
                return
            elif tempd == 1:
                print('++++++++lambda train', pos, 'at iteration', nowiter,
                      'end at', datetime.datetime.now())
                s3.put_object(Bucket=AWS_S3_bucket,
                              Body=data,
                              Key='timestamp/timestamp_train_' + str(pid) +
                              '_' + str(pos) + '_' + str(event['round']) +
                              '.tsp')
                event['nowiter'] = nowiter
                return
            else:
                data += tempd
        data += 'training ' + str(nowiter) + ' valid iteration time: ' + str(
            itertime[0]) + '\n'
        print('-' * 5, 'train node', pos, '-' * 5, 'now end iteration',
              nowiter)
        avgitertime += itertime[0]
        """
		if nowiter>=min(10,maxiter-1) and [0,0] in event['mergepos']:
			s3.put_object(Bucket=AWS_S3_bucket,Body=str(avgitertime/(nowiter+1)), Key='timestamp/timestamp_iteration_'+str(pid)+'.tsp')
		"""
        thisitertime = time.time() - stiter
        filerecord += str(time.time()) + '\n'
        #filerecord+=str(time.time()-stiter)+'\n'
        if thisitertime < minitertimereal:
            minitertimereal = thisitertime
        if nowiter >= 2:
            avgitertimereal += time.time() - stiter
            insert_sort(time.time() - stiter, timerecord)
            #filerecord+=str(time.time()-stiter)+'\n'
            #filerecord+=str(smt)+'\n'
        if nowiter >= min(10, maxiter - 1) and [0, 0] in event['mergepos']:
            pass
            #s3.put_object(Bucket=AWS_S3_bucket,Body=str(timerecord), Key='timestamp/timestamp_iteration_each_'+str(pid)+'.tsp')
            #s3.put_object(Bucket=AWS_S3_bucket,Body=str(avg_no_abnormal(timerecord,2)), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
            #s3.put_object(Bucket=AWS_S3_bucket,Body=str(find_median(timerecord)), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
            #s3.put_object(Bucket=AWS_S3_bucket,Body=str(avgitertimereal/(nowiter-2+1)), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
            #s3.put_object(Bucket=AWS_S3_bucket,Body=str(minitertimereal), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
        nowiter += 1
    if [0, 0] in event['mergepos']:
        s3.put_object(Bucket=AWS_S3_bucket,
                      Body=filerecord,
                      Key='results/result')
        """
		event['testtime']-=1
		if event['testtime']>0:
			inputs={'state':0,'mod':0,'slayers':[],'mlayers':event['mlayers'],'layers':[100,100,100,100,100],'pos':[0,0],'ns':1000000,'maxiter':10,'nowiter':0,'roundtime':250}
			inputs['testtime']=event['testtime']
			invoke_lambda(client,'testfunc',inputs)
			time.sleep(10)
		"""
    if [0, 0] in event['mergepos']:
        s3.put_object(Bucket=AWS_S3_bucket,
                      Body=str(time.time()),
                      Key='timestamp/timestamp_train_end_' + str(pid))
    s3.put_object(Bucket=AWS_S3_bucket,
                  Body=data,
                  Key='timestamp/timestamp_train_' + str(pid) + '_' +
                  str(pos) + '_' + str(event['round']) + '.tsp')
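Example #2 paces each round with s3func.timer and chains rounds through
invoke_lambda; neither is shown. The sketches below match the usage above;
the tuple returned by query() and the retry behaviour of
s3_download_file_timer are inferred assumptions.

import json
import time


class timer(object):
    # timer([waittime, roundtime]); query() is assumed to return
    # (round_deadline_reached, seconds_left_of_the_wait_budget).
    def __init__(self, limits):
        self.waittime, self.roundtime = limits
        self.start = time.time()

    def local_start(self, offset):
        # Restart the clock, optionally backdated by `offset` seconds.
        self.start = time.time() - offset

    def query(self):
        elapsed = time.time() - self.start
        return (1 if elapsed > self.roundtime else 0, self.waittime - elapsed)


def s3_download_file_timer(s3, bucket, key, path, timer, a, b):
    # Retry the download until it succeeds or the wait budget runs out
    # (uses the s3_download_file sketch from the notes after example #1).
    while True:
        if s3_download_file(s3, bucket, key, path, 0, 1, 0) == 1:
            return 1
        if timer.query()[1] <= 0:
            return 0
        time.sleep(1)


def invoke_lambda(client, funcname, event):
    # Fire-and-forget asynchronous (re-)invocation with the event as payload.
    client.invoke(FunctionName=funcname,
                  InvocationType='Event',
                  Payload=json.dumps(event))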
Example #3
def train(event):
	st=datetime.datetime.now()
	stt=time.time()
	tcount=time.time()
	if 'roundtime' not in event.keys():
		event['roundtime']=250
	tend=event['roundtime']
	ns=event['ns']
	pos=event['pos']
	layers=event['layers']
	lr=event['lr']
	if 'batchnumber' not in event.keys():
		event['batchnumber']=1
	pid=event['pid']
	if 'testtime' not in event.keys():
		event['testtime']=10
	if 'waittime' not in event.keys():
		event['waittime']=tend*2/3
	waittime=event['waittime']
	timer=s3func.timer([waittime,tend])
	#waittime=tend/4
	if 'round' not in event.keys():
		event['round']=0
	else:
		event['round']+=1
	rounditer=event['rounditer']
	s3 = boto3.client('s3')
	s32 = boto3.resource('s3')
	flag=s3func.s3_download_file(s3,AWS_S3_bucket,'flag/work_pt','/tmp/work',0,1,0)
	if flag==0:
		print 'terminated!!!!!!'
		return
	client=boto3.client('lambda',region_name = AWS_region)
	filerecord=s3func.s3_read_file_v2(s3,AWS_S3_bucket,'results/result',0,1,0)
	if filerecord==0:
		filerecord=''
	filerecord+='====='+' starttime: '+str(st)+'\n'
	filerecord+='====='+str(stt)+'\n'
	data='train round '+str(event['round'])+', round time '+str(event['roundtime'])+', start at '+str(st)+' ##'+str(time.time())+'\n'
	data+='info: pos '+str(pos)+'\n'
	print '='*5,'train node',pos,'='*5,'train phase start'
	data+='start up time: '+str(time.time()-stt)+' ##'+str(time.time())+' ##'+str(stt)+'--'+str(time.time())+'\n'
	s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_train_'+str(pid)+'_'+str(pos)+'_'+str(event['round']))
	#=========================================read========================================
	stt=time.time()
	print '='*5,'train node',pos,'='*5,'downloading samples'
	flag=s3func.s3_download_file(s3,AWS_S3_bucket,'data/samples_mnist_x','/tmp/mnist_x',0,1,0)
	if flag==0:
		print '='*5,'train node',pos,'='*5,'ERROR!!!: fail to read sample file x'
	flag=s3func.s3_download_file(s3,AWS_S3_bucket,'data/samples_mnist_y','/tmp/mnist_y',0,1,0)
	if flag==0:
		print '='*5,'train node',pos,'='*5,'ERROR!!!: fail to read sample file y'
	flag=s3func.s3_download_file(s3,AWS_S3_bucket,'data/samples_mnist_test_x','/tmp/mnist_test_x',0,1,0)
	if flag==0:
		print '='*5,'train node',pos,'='*5,'ERROR!!!: fail to read test file x'
	flag=s3func.s3_download_file(s3,AWS_S3_bucket,'data/samples_mnist_test_y','/tmp/mnist_test_y',0,1,0)
	if flag==0:
		print '='*5,'train node',pos,'='*5,'ERROR!!!: fail to read test file y'
	train_x = extract_data('/tmp/mnist_x', 60000)
	train_y = extract_labels('/tmp/mnist_y', 60000)
	test_x = extract_data('/tmp/mnist_test_x', 10000)
	test_y = extract_labels('/tmp/mnist_test_y', 10000)
	train_x=train_x.reshape([60000,28*28])
	test_x=test_x.reshape([10000,28*28])
	data+='samples length: '+str(len(train_x))+'\n'
	#=========================================read========================================
	#=========================================initialize==================================
	stt=time.time()
	outputs,x,y,labels,train_op,weights,biases=model(layers,lr)
	sess = tf.Session()
	sess.run(tf.global_variables_initializer())
	data+='training initialize time: '+str(time.time()-stt)+' ##'+str(stt)+'--'+str(time.time())+'\n'
	#=========================================initialize==================================
	#=========================================LOOP start========================================
	num_iterations=event['maxiter']
	bn=event['batchnumber']
	alltime=0.0
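	# Local training only: no model exchange between workers here; each node
	# trains its own copy and reports just the final test accuracy via S3.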
	for it in range(num_iterations):
		st=time.time()
		bs=len(train_x)/bn
		for b in range(bn):
			batch_xs = train_x[b*bs:(b+1)*bs]
			batch_ys = train_y[b*bs:(b+1)*bs]
			sess.run(train_op, feed_dict={x: batch_xs, y: batch_ys})
		alltime+=time.time()-st
	result=sess.run(outputs,feed_dict={x: test_x})
	acc=(np.argmax(result[-1], axis=1)==test_y).mean()
	s3.put_object(Bucket=AWS_S3_bucket,Body=str(acc), Key='timestamp/timestamp_trainresult_'+str(pid)+'_'+str(pos))
	s3.put_object(Bucket=AWS_S3_bucket,Body=str([event['layers'],event['lr'],event['batchnumber'],event['maxiter']]), Key='timestamp/timestamp_traininfo_'+str(pid)+'_'+str(pos))
	if pos==event['nworker']-1:
		event['state']=2
		cevent=copy.deepcopy(event)
		invoke_lambda(client,'nntffunc_1',cevent)
	s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_train_'+str(pid)+'_'+str(pos)+'_'+str(event['round']))
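extract_data and extract_labels are not shown; a conventional raw MNIST IDX
reader such as the sketch below fits how they are called, though the exact
file layout and scaling are assumptions.

import numpy as np


def extract_data(filename, num_images, image_size=28):
    # Skip the 16-byte IDX image header, then read num_images 28x28 images.
    with open(filename, 'rb') as f:
        f.read(16)
        buf = f.read(image_size * image_size * num_images)
    data = np.frombuffer(buf, dtype=np.uint8).astype(np.float32) / 255.0
    return data.reshape(num_images, image_size, image_size)


def extract_labels(filename, num_images):
    # Skip the 8-byte IDX label header, then read one byte per label.
    with open(filename, 'rb') as f:
        f.read(8)
        buf = f.read(num_images)
    return np.frombuffer(buf, dtype=np.uint8).astype(np.int64)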
Example #4
def train(event):
	st=datetime.datetime.now()
	stt=time.time()
	tcount=time.time()
	if 'roundtime' not in event.keys():
		event['roundtime']=250
	tend=event['roundtime']
	ns=event['ns']
	pos=event['pos']
	mlayers=event['mlayers']
	layers=event['layers']
	maxiter=event['maxiter']
	nowiter=event['nowiter']
	funcname=event['funcname']
	if 'batchnumber' not in event.keys():
		event['batchnumber']=1
	bn=event['batchnumber']
	pid=event['pid']
	if 'testtime' not in event.keys():
		event['testtime']=10
	if 'waittime' not in event.keys():
		event['waittime']=tend*2/3
	waittime=event['waittime']
	timer=s3func.timer([waittime,tend])
	#waittime=tend/4
	if 'round' not in event.keys():
		event['round']=0
	else:
		event['round']+=1
	rounditer=event['rounditer']
	s3 = boto3.client('s3')
	s32 = boto3.resource('s3')
	client=boto3.client('lambda',region_name = AWS_region)
	if nowiter==0 and pos==0:
		s3.put_object(Bucket=AWS_S3_bucket,Body=str(stt), Key='timestamp/timestamp_train_start_'+str(pid))
	response = client.get_function(
		FunctionName=funcname,
	)
	filerecord=s3func.s3_read_file_v2(s3,AWS_S3_bucket,'results/result',0,1,0)
	if filerecord==0:
		filerecord=''
	filerecord+='====='+' merge: '+str(mlayers)+' samples: '+str(ns)+' memory: '+str(event['memory'])+' testtime left :'+str(event['testtime'])+' starttime: '+str(st)+'\n'
	filerecord+='====='+str(stt)+'\n'
	data='train round '+str(event['round'])+', round time '+str(event['roundtime'])+', start at '+str(st)+' ##'+str(time.time())+'\n'
	data+='info: pos '+str(pos)+', memory '+str(response['Configuration']['MemorySize'])+', mlayers '+str(mlayers)+', ns '+str(ns)+', layers '+str(layers)+'\n'
	print '='*5,'train node',pos,'='*5,'train phase start'
	split=10000
	base=int(ns/mlayers[-1])
	remin=ns%mlayers[-1]
	sn=0
	for n in range(pos):
		sn+=base
		if remin:
			sn+=1
			remin-=1
	en=sn+base
	if remin:
		en+=1
	print '='*5,'train node',pos,'='*5,'read samples from',sn,'to',en
	training_inputs=[]
	training_outputs=[]
	sfile=int(sn/split)
	efile=int((en-1)/split)
	print '='*5,'train node',pos,'='*5,'read files from',sfile,'to',efile
	data+='start up time: '+str(time.time()-stt)+' ##'+str(time.time())+' ##'+str(stt)+'--'+str(time.time())+'\n'
	s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_train_'+str(pid)+'_'+str(pos)+'_'+str(event['round'])+'.tsp')
	#=========================================read========================================
	stt=time.time()
	if os.path.exists('/tmp/samples_save'):
		print '='*5,'train node',pos,'='*5,'found samples!!!'
		with open('/tmp/samples_save', 'r') as f:
			temp=pickle.load(f)
		#os.remove('/tmp/samples_save_'+str(pos))
		training_inputs=temp[0]
		training_outputs=temp[1]
		data+='found samples!!! time: '+str(time.time()-stt)+' ##'+str(time.time())+' ##'+str(stt)+'--'+str(time.time())+'\n'
	else:
		print '='*5,'train node',pos,'='*5,'samples not found, downloading'
		for now in range(sfile,efile+1):
			print 'downloading',now,'from range',sfile,efile+1
			#flag=s3func.s3_download_file_v2(s3,s32,AWS_S3_bucket,'data/samples_'+str(now),'/tmp/samples',0,1,0)
			flag=s3func.s3_download_file(s3,AWS_S3_bucket,'data/samples_'+str(now),'/tmp/samples',0,1,0)
			if flag==0:
				print '='*5,'train node',pos,'='*5,'ERROR!!!: fail to read sample file:',now
			with open('/tmp/samples', 'r') as f:
				temp=pickle.load(f)
			sread=max([split*now,sn])-split*now
			eread=min([split*(now+1),en])-split*now
			
			if training_inputs==[]:
				training_inputs=temp[0][sread:eread]
				training_outputs=temp[1][sread:eread]
			else:
				training_inputs=np.append(training_inputs,temp[0][sread:eread],axis=0)
				training_outputs=np.append(training_outputs,temp[1][sread:eread],axis=0)
			
			#training_inputs.extend(temp[0][sread:eread])
			#training_outputs.extend(temp[1][sread:eread])
		if os.path.exists('/tmp/samples'):
			os.remove('/tmp/samples')
		with open('/tmp/samples_save', 'w') as f:
			pickle.dump([training_inputs,training_outputs], f)
		data+='read from '+str(sfile)+' to '+str(efile)+' time: '+str(time.time()-stt)+' ##'+str(time.time())+' ##'+str(stt)+'--'+str(time.time())+'\n'
	data+='samples length: '+str(len(training_inputs))+'\n'
	if nowiter==0 and pos==0:
		s3.put_object(Bucket=AWS_S3_bucket,Body=str(time.time()), Key='timestamp/timestamp_train_start_2_'+str(pid))
	#=========================================read========================================
	#=========================================initialize==================================
	stt=time.time()
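	# Build a fully connected tanh network by hand: weights[l] and biases[l]
	# connect layer l-1 to layer l, and outputs[l] holds each layer's activation.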
	weights=[[] for i in range(len(layers))]
	biases=[[] for i in range(len(layers))]
	outputs=[[] for i in range(len(layers))]
	
	for l in range(len(layers)):
		if l>0:
			weights[l]=tf.Variable(tf.random_normal([layers[l-1], layers[l]]))
			biases[l]=tf.Variable(tf.random_normal([1, layers[l]]))
	
	x=tf.placeholder(tf.float32,[None,layers[0]])
	y=tf.placeholder(tf.float32,[None,layers[-1]])
	
	outputs[0]=x
	for l in range(len(layers)):
		if l>0:
			outputs[l]= tf.nn.tanh(tf.add(tf.matmul(outputs[l-1], weights[l]), biases[l]))
	
	cost=0.5*(y-outputs[-1])**2

	#train=tf.train.AdamOptimizer(0.1).minimize(cost)
	train=tf.train.GradientDescentOptimizer(1e-1).minimize(cost)
	
	init=tf.global_variables_initializer()
	session=tf.Session()
	session.run(init)
	data+='training initialize time: '+str(time.time()-stt)+' ##'+str(stt)+'--'+str(time.time())+'\n'
	#=========================================initialize==================================
	#=========================================LOOP start========================================
	avgitertime=0.0
	avgitertimereal=0.0
	minitertimereal=100000.0
	timerecord=[]
	smt=0.0
	while nowiter<maxiter:
		itertime=[0.0]
		stiter=time.time()
		#flag=s3func.s3_download_file_v2(s3,s32,AWS_S3_bucket,'flag/work_'+str(pid),'/tmp/work',0,1,0)
		flag=s3func.s3_download_file(s3,AWS_S3_bucket,'flag/work_'+str(pid),'/tmp/work',0,1,0)
		if flag==0:
			print '='*5,'train node',pos,'='*5,'Abandon!!!! pid:',pid
			return
		stt=time.time()
		print '+'*5,'train node',pos,'pid',pid,'+'*5,'now start iteration',nowiter
		print '='*5,'train node',pos,'='*5,'now start iteration',nowiter
		stt2=time.time()
		#flag=s3func.s3_download_file_v2(s3,s32,AWS_S3_bucket,'data/model_'+str(pid)+'_'+str(nowiter),'/tmp/model',max(waittime,tend-time.time()+tcount),0,0)
		#flag=s3func.s3_download_file(s3,AWS_S3_bucket,'data/model_'+str(pid)+'_'+str(nowiter),'/tmp/model',max(waittime,tend-time.time()+tcount),0,0)
		flag=s3func.s3_download_file_timer(s3,AWS_S3_bucket,'data/model_'+str(pid)+'_'+str(nowiter),'/tmp/model',timer,0,0)
		itertime[0]+=time.time()-stt2
		data+='training '+str(nowiter)+' model waiting time: '+str(time.time()-stt2)+' ##'+str(stt2)+'--'+str(time.time())+'\n'
		#print 'flag',flag
		if flag==0:
			if timer.query()[1]>waittime/4:
				print '++++++++lambda train',pos,'at iteration',nowiter,'end at',datetime.datetime.now()
				"""
				with open('/tmp/samples_save_'+str(pos), 'w') as f:
					pickle.dump([training_inputs,training_outputs], f)
				"""
				s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_train_'+str(pid)+'_'+str(pos)+'_'+str(event['round'])+'.tsp')
				event['nowiter']=nowiter
				#invoke_lambda(client,funcname,event)
				return
			else:
				print '='*5,'train node',pos,'='*5,'ERROR!!!: fail to read model',nowiter
				s3.put_object(Bucket=AWS_S3_bucket,Body='fail to read model '+str(nowiter), Key='error/error_train_'+str(pid)+'_'+str(pos))
				return
		if nowiter>=(event['round']+1)*rounditer:
			print '++++++++lambda train',pos,'at iteration',nowiter,'end at',datetime.datetime.now()
			if [0,0] in event['mergepos']:
				s3.put_object(Bucket=AWS_S3_bucket,Body=filerecord, Key='results/result')
			"""
			with open('/tmp/samples_save_'+str(pos), 'w') as f:
				pickle.dump([training_inputs,training_outputs], f)
			"""
			s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_train_'+str(pid)+'_'+str(pos)+'_'+str(event['round'])+'.tsp')
			event['nowiter']=nowiter
			invoke_lambda(client,funcname,event)
			return
		stt2=time.time()
		with open('/tmp/model', 'r') as f:
			temp=pickle.load(f)
		if temp[0]==[]:
			print '='*5,'train node',pos,'='*5,'ERROR!!!: model format wrong',nowiter
			s3.put_object(Bucket=AWS_S3_bucket,Body='model format wrong '+str(nowiter), Key='error/error_train_'+str(pid)+'_'+str(pos))
			return
		for l in range(len(layers)):
			if l>0:
				session.run(tf.assign(weights[l],temp[0][l]))
				session.run(tf.assign(biases[l],temp[1][l]))
		itertime[0]+=time.time()-stt2
		data+='training '+str(nowiter)+' download model time: '+str(time.time()-stt)+' ##'+str(stt)+'--'+str(time.time())+'\n'
		stt=time.time()
		print '='*5,'train node',pos,'='*5,'train start'
		bs=len(training_inputs)/bn
		for b in range(bn):
			session.run(train,feed_dict={x:training_inputs[b*bs:(b+1)*bs,:],y:training_outputs[b*bs:(b+1)*bs,:]})#=========================train
		itertime[0]+=time.time()-stt
		data+='training '+str(nowiter)+' train time: '+str(time.time()-stt)+' ##'+str(stt)+'--'+str(time.time())+'\n'
		stt=time.time()
		rew=session.run(weights)
		reb=session.run(biases)
		model=[rew,reb]
		with open('/tmp/model', 'w') as f:
			pickle.dump(model, f)
		print '='*5,'train node',pos,'='*5,'write result as layer',len(mlayers)-1,'node',pos
		s32.Bucket(AWS_S3_bucket).upload_file('/tmp/model', 'data/model_'+str(pid)+'_'+str(len(mlayers)-1)+'_'+str(pos))
		#s3.put_object(Bucket=AWS_S3_bucket,Body='true', Key='flag/flag_'+str(pid)+'_'+str(nowiter)+'_'+str(len(mlayers)-1)+'_'+str(pos))
		itertime[0]+=time.time()-stt
		data+='training '+str(nowiter)+' model write time: '+str(time.time()-stt)+' ##'+str(stt)+'--'+str(time.time())+'\n'
		if len(event['mergepos'])>0:
			mergepos=copy.deepcopy(event['mergepos'])
			thismergepos=mergepos[0]
			del mergepos[0]
			#tempd=merge(mlayers,thismergepos,mergepos,nowiter,timer,max(waittime,tend-time.time()+tcount),itertime,pid)
			smt=time.time()
			tempd=merge(mlayers,thismergepos,mergepos,nowiter,timer,waittime,itertime,pid)
			smt=time.time()-smt
			if tempd==0:
				return
			elif tempd==1:
				print '++++++++lambda train',pos,'at iteration',nowiter,'end at',datetime.datetime.now()
				"""
				with open('/tmp/samples_save_'+str(pos), 'w') as f:
					pickle.dump([training_inputs,training_outputs], f)
				"""
				s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_train_'+str(pid)+'_'+str(pos)+'_'+str(event['round'])+'.tsp')
				event['nowiter']=nowiter
				#invoke_lambda(client,funcname,event)
				return
			else:
				data+=tempd
		data+='training '+str(nowiter)+' valid iteration time: '+str(itertime[0])+'\n'
		print '-'*5,'train node',pos,'-'*5,'now end iteration',nowiter
		avgitertime+=itertime[0]
		"""
		if nowiter>=min(10,maxiter-1) and [0,0] in event['mergepos']:
			s3.put_object(Bucket=AWS_S3_bucket,Body=str(avgitertime/(nowiter+1)), Key='timestamp/timestamp_iteration_'+str(pid)+'.tsp')
		"""
		thisitertime=time.time()-stiter
		filerecord+=str(time.time())+'\n'
		#filerecord+=str(time.time()-stiter)+'\n'
		if thisitertime<minitertimereal:
			minitertimereal=thisitertime
		#insert_sort(time.time()-stiter,timerecord)
		if nowiter>=2:
			avgitertimereal+=time.time()-stiter
			insert_sort(time.time()-stiter,timerecord)
			#filerecord+=str(smt)+'\n'
		if nowiter>=min(10,maxiter-1) and [0,0] in event['mergepos']:
			s3.put_object(Bucket=AWS_S3_bucket,Body=str(timerecord), Key='timestamp/timestamp_iteration_each_'+str(pid)+'.tsp')
			#s3.put_object(Bucket=AWS_S3_bucket,Body=str(avg_no_abnormal(timerecord,2)), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
			#s3.put_object(Bucket=AWS_S3_bucket,Body=str(float(np.mean(timerecord))), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
			s3.put_object(Bucket=AWS_S3_bucket,Body=str(float(np.mean(extract_valid(timerecord)))), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
			#s3.put_object(Bucket=AWS_S3_bucket,Body=str(avgitertimereal/(nowiter-2+1)), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
			#s3.put_object(Bucket=AWS_S3_bucket,Body=str(minitertimereal), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
		nowiter+=1
	if [0,0] in event['mergepos']:
		s3.put_object(Bucket=AWS_S3_bucket,Body=filerecord, Key='results/result')
		"""
		event['testtime']-=1
		if event['testtime']>0:
			inputs={'state':0,'mod':0,'batchnumber':20,'slayers':[],'mlayers':[1,100],'layers':[20,100,100,100,100,100,1],'pos':[0,0],'ns':1000000,'maxiter':10,'nowiter':0,'roundtime':250,'rounditer':15}
			inputs['testtime']=event['testtime']
			invoke_lambda(client,'testfunc',inputs)
			time.sleep(10)
		"""
	if [0,0] in event['mergepos'] and nowiter>=maxiter:
		s3.put_object(Bucket=AWS_S3_bucket,Body=str(time.time()), Key='timestamp/timestamp_train_end_'+str(pid))
	s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_train_'+str(pid)+'_'+str(pos)+'_'+str(event['round'])+'.tsp')
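insert_sort, find_median, and extract_valid maintain a sorted record of
per-iteration times; they are not shown, so the sketches below are
assumptions consistent with how examples #2 and #4 call them.

def insert_sort(value, record):
    # Insert value into the already-sorted list `record`, keeping it sorted.
    i = 0
    while i < len(record) and record[i] < value:
        i += 1
    record.insert(i, value)


def find_median(record):
    # Median of a sorted list.
    n = len(record)
    if n % 2:
        return record[n // 2]
    return 0.5 * (record[n // 2 - 1] + record[n // 2])


def extract_valid(record, trim=2):
    # Drop the `trim` largest entries (assumed outlier filter) before averaging.
    if len(record) > trim:
        return record[:-trim]
    return record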