def start_structure(event):
	st=datetime.datetime.now()
	pid=event['pid']
	mlayers=event['mlayers']
	nowiter=event['nowiter']
	s3 = boto3.client('s3')
	s3.put_object(Bucket=AWS_S3_bucket,Body='true', Key='flag/work_'+str(pid))
	data=''
	print 'structure start:',pid
	if 'modelname' in event.keys():
		data+='modelname found: '+event['modelname']+'\n'
		flag=s3func.s3_download_file(0,AWS_S3_bucket,event['modelname'],'/tmp/model',0,1,0)
		if flag==0:
			print 'ERROR!!!',event['modelname'],'not found'
			s3.put_object(Bucket=AWS_S3_bucket,Body=event['modelname']+' not found', Key='error/error_start_'+str(pid))
	else:
		data+='modelname not found\n'
		s3func.s3_download_file(0,AWS_S3_bucket,'data/modelcifar','/tmp/model',0,1,0)
	s32 = boto3.resource('s3')
	s32.Bucket(AWS_S3_bucket).upload_file('/tmp/model', 'data/modelcifar_'+str(pid)+'_'+str(nowiter))
	
	client=boto3.client('lambda',region_name = AWS_region)
	
	if ('fixedmlayers' not in event.keys()) or event['fixedmlayers']==0:
		mlayers=estimate_merging(mlayers[-1])
		if len(mlayers)>4:
			mlayers=estimate_best_layers(mlayers[-1],2)
		event['mlayers']=mlayers
		data+='mlayers is: '+str(mlayers)+'\n'
	
	search=range(len(mlayers)-1)
	search.reverse()
	mstart=[]
	for l in search:
		base=int(mlayers[l+1]/mlayers[l])
		remin=mlayers[l+1]%mlayers[l]
		temp=[]
		now=0
		for i in range(mlayers[l]):
			now+=base
			if remin>0:
				now+=1
				remin-=1
			temp.append(now-1)
		if l!=len(mlayers)-2:
			for i in range(len(temp)):
				temp[i]=mstart[-1][temp[i]]
		mstart.append(temp)
	mstart.reverse()
	for i in range(mlayers[-1]):
		event['pos']=i
		event['state']=1
		event['mergepos']=[]
		for l in range(len(mstart)):
			if i in mstart[l]:
				event['mergepos'].append([l,mstart[l].index(i)])
		event['mergepos'].reverse()
		data+='work node '+str(event['pos'])+', mergepos: '+str(event['mergepos'])+'\n'
		invoke_lambda(client,event['funcname'],event)
	s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_startstructure_'+str(pid)+'.tsp')
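The mstart bookkeeping above is terse; here is a standalone sketch of the same assignment logic, recreated for illustration (merge_assignments is not a name from the source). For each tree layer l, mstart[l][j] is the index of the leaf worker responsible for merge node j of layer l.

def merge_assignments(mlayers):
    mstart = []
    for l in range(len(mlayers) - 2, -1, -1):
        base, remin = divmod(mlayers[l + 1], mlayers[l])
        temp, now = [], 0
        for _ in range(mlayers[l]):
            now += base
            if remin > 0:
                now += 1
                remin -= 1
            temp.append(now - 1)  # last child index of this node's group
        if l != len(mlayers) - 2:
            temp = [mstart[-1][t] for t in temp]  # map down to leaf workers
        mstart.append(temp)
    mstart.reverse()
    return mstart

# merge_assignments([1, 2, 4]) == [[3], [1, 3]]: leaf workers 1 and 3 merge
# the two layer-1 groups, and worker 3 also performs the final layer-0 merge.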
Example 2
def monitor(event):
    st = datetime.datetime.now()
    stt = time.time()
    nworker = event['nworker']
    pid = event['pid']
    if 'roundtime' not in list(event.keys()):
        event['roundtime'] = 250
    if 'waittime' not in list(event.keys()):
        event['waittime'] = event['roundtime'] * 2 / 3
    timer = s3func.timer([event['waittime'], event['roundtime']])
    s3 = boto3.client('s3')
    s32 = boto3.resource('s3')
    flag = s3func.s3_download_file(s3, AWS_S3_bucket, 'flag/work_pt',
                                   '/tmp/work', 0, 1, 0)
    if flag == 0:
        print('monitor terminated!!!!!!')
        return
    s3.put_object(Bucket=AWS_S3_bucket,
                  Body=str(st),
                  Key='timestamp/timestamp_monitor')
    finished = [0 for i in range(nworker)]
    timer.local_start(0)
    bresult = 0.0
    bpos = 0
    while 1:
        if sum(finished) == nworker:
            break
        for now in range(nworker):
            if finished[now] == 0:
                tresult = timer.query()
                if tresult[0] == 1:
                    return 0
                flag = s3func.s3_download_file(
                    s3, AWS_S3_bucket, 'timestamp/timestamp_trainresult_' +
                    str(pid) + '_' + str(now), '/tmp/result', 0, 1, 0)
                if flag == 1:
                    finished[now] = 1
                    with open('/tmp/result', 'r') as f:
                        temp = f.read()
                    r = float(temp)
                    if r > bresult:
                        bresult = r
                        bpos = now
    s3.put_object(Bucket=AWS_S3_bucket,
                  Body=str([bresult, bpos]),
                  Key='timestamp/timestamp_final_result')
    et = time.time()
    st = s3func.s3_read_file_v2(s3, AWS_S3_bucket,
                                'timestamp/timestamp_startup.tsp', 0, 1, 0)
    filerecord = s3func.s3_read_file_v2(s3, AWS_S3_bucket, 'results/timecost',
                                        0, 1, 0)
    if filerecord == 0:
        filerecord = ''
    filerecord += str(et - float(st)) + '\n'
    s3.put_object(Bucket=AWS_S3_bucket,
                  Body=filerecord,
                  Key='results/timecost')
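monitor discovers per-worker results by polling S3 until the expected keys exist. Below is a minimal sketch of that pattern with plain boto3; wait_for_keys is an illustrative helper, not part of the original s3func module.

import time
import botocore.exceptions

def wait_for_keys(s3, bucket, keys, deadline_s, poll_s=1.0):
    # Probe each missing key until all exist or the deadline passes;
    # return the set of keys that appeared in time.
    remaining = set(keys)
    start = time.time()
    while remaining and time.time() - start < deadline_s:
        for key in list(remaining):
            try:
                s3.head_object(Bucket=bucket, Key=key)  # cheap existence check
                remaining.discard(key)
            except botocore.exceptions.ClientError:
                pass  # not uploaded yet; retry on the next sweep
        if remaining:
            time.sleep(poll_s)
    return set(keys) - remaining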
    """
def merge(mlayers, pos, mergepos, nowiter, timer, waittime, itertime, pid):
    stt = time.time()
    tcount = time.time()
    layer = pos[0]
    node = pos[1]
    data = ''

    base = int(mlayers[layer + 1] / mlayers[layer])
    remin = mlayers[layer + 1] % mlayers[layer]
    print('=' * 5, 'merge node at layer', layer, 'node', node, '=' * 5,
          'merge phase start')
    sn = 0
    for n in range(node):
        sn += base
        if remin:
            sn += 1
            remin -= 1
    en = sn + base
    if remin:
        en += 1
    print('=' * 5, 'merge node at layer', layer, 'node', node, '=' * 5,
          'merge model file at layer', layer + 1, 'from', sn, 'to', en)
    s3 = boto3.client('s3')
    s32 = boto3.resource('s3')
    itertime[0] += time.time() - stt
    data += 'merge ' + str(nowiter) + ' layer ' + str(layer) + ' node ' + str(
        node) + ' merge ' + str([sn, en - 1]) + ' start up time: ' + str(
            time.time() - stt) + ' ##' + str(stt) + '--' + str(
                time.time()) + '\n'

    #============================================start==============================================
    stt = time.time()
    print('=' * 5, 'merge node at layer', layer, 'node', node, '=' * 5,
          'iteration', nowiter)
    params = []
    flagt = 0.0
    modelt = 0.0
    itertime[0] += 1
    #files=['data/modelcifar_'+str(pid)+'_'+str(layer+1)+'_'+str(now) for now in range(sn,en)]

    finished = [0 for i in range(en - sn)]
    timer.local_start(0)
    while 1:
        if sum(finished) == (en - sn):
            break
        for now in range(sn, en):
            if finished[now - sn] == 0:
                tresult = timer.query()
                if tresult[0] == 1:
                    if tresult[1] > waittime / 4:
                        return 0
                    else:
                        print('=' * 5, 'merge node at layer', layer, 'node',
                              node, '=' * 5,
                              'ERROR!!!: fail to read model: layer', layer + 1,
                              'finished state', str(finished))
                        s3.put_object(
                            Bucket=AWS_S3_bucket,
                            Body='fail to read model at iteration ' +
                            str(nowiter) + ' at layer ' + str(layer + 1) +
                            ', finished state' + str(finished),
                            Key='error/error_merge_' + str(pid) + '_' +
                            str(layer) + '_' + str(node))
                        return 0
                flag = s3func.s3_download_file(
                    s3, AWS_S3_bucket, 'data/modelcifar_' + str(pid) + '_' +
                    str(layer + 1) + '_' + str(now), '/tmp/model', 0, 1, 1)
                if flag == 1:
                    finished[now - sn] = 1
                    with open('/tmp/model', 'rb') as f:
                        temp = pickle.load(f)
                    if not temp[0] == []:
                        if params == []:
                            params = temp
                        else:
                            for i in range(len(temp)):
                                params[i] = params[i] + temp[i]
    data += 'merge ' + str(nowiter) + ' layer ' + str(layer) + ' node ' + str(
        node) + ' model read time: ' + str(time.time() - stt) + ' ##' + str(
            stt) + '--' + str(time.time()) + '\n'
    stt = time.time()
    if layer == 0:
        print('=' * 5, 'merge node at layer', layer, 'node', node, '=' * 5,
              'now is the final node')
        if not params == []:
            for i in range(len(params)):
                params[i] = params[i] / mlayers[-1]
        with open('/tmp/model', 'wb') as f:
            pickle.dump(params, f)
        s32 = boto3.resource('s3')
        s32.Bucket(AWS_S3_bucket).upload_file(
            '/tmp/model', 'data/modelcifar_' + str(pid) + '_new')
        s32.Bucket(AWS_S3_bucket).upload_file(
            '/tmp/model',
            'data/modelcifar_' + str(pid) + '_' + str(nowiter + 1))
    else:
        with open('/tmp/model', 'wb') as f:
            pickle.dump(params, f)
        print('=' * 5, 'merge node at layer', layer, 'node', node, '=' * 5,
              'write model as layer', layer, 'node', node)
        s32.Bucket(AWS_S3_bucket).upload_file(
            '/tmp/model',
            'data/modelcifar_' + str(pid) + '_' + str(layer) + '_' + str(node))
    itertime[0] += time.time() - stt
    data += 'merge ' + str(nowiter) + ' layer ' + str(layer) + ' node ' + str(
        node) + ' model write time: ' + str(time.time() - stt) + ' ##' + str(
            stt) + '--' + str(time.time()) + '\n'
    if len(mergepos) > 0:
        thismergepos = mergepos[0]
        del mergepos[0]
        tempd = merge(mlayers, thismergepos, mergepos, nowiter, timer,
                      waittime, itertime, pid)
        if type(tempd) == int:
            return tempd
        else:
            data += tempd

    return data
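End to end, the merge tree computes a model average: interior nodes sum their children's parameter lists, and the layer-0 root divides by the number of leaves mlayers[-1]. A compact restatement follows (not code from the source, and assuming plain NumPy arrays rather than the [] placeholders used above):

import numpy as np

def average_models(models):
    # models: one parameter list per worker; returns their element-wise mean.
    n = len(models)
    return [sum(layer_params) / float(n) for layer_params in zip(*models)]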
def train(event):
    st = datetime.datetime.now()
    stt = time.time()
    tcount = time.time()
    if 'roundtime' not in list(event.keys()):
        event['roundtime'] = 250
    tend = event['roundtime']
    ns = event['ns']
    pos = event['pos']
    mlayers = event['mlayers']
    maxiter = event['maxiter']
    nowiter = event['nowiter']
    funcname = event['funcname']
    if 'batchnumber' not in list(event.keys()):
        event['batchnumber'] = 1
    bn = event['batchnumber']
    pid = event['pid']
    if 'testtime' not in list(event.keys()):
        event['testtime'] = 10
    if 'waittime' not in list(event.keys()):
        event['waittime'] = tend * 2 / 3
    if 'learningrate' not in list(event.keys()):
        event['learningrate'] = 0.1
    waittime = event['waittime']
    timer = s3func.timer([waittime, tend])
    if 'round' not in list(event.keys()):
        event['round'] = 0
    else:
        event['round'] += 1
    rounditer = event['rounditer']
    s3 = boto3.client('s3')
    s32 = boto3.resource('s3')
    client = boto3.client('lambda', region_name=AWS_region)
    if nowiter == 0 and pos == 0:
        s3.put_object(Bucket=AWS_S3_bucket,
                      Body=str(stt),
                      Key='timestamp/timestamp_train_start_' + str(pid))
    response = client.get_function(FunctionName=funcname, )
    filerecord = s3func.s3_read_file_v2(s3, AWS_S3_bucket, 'results/result', 0,
                                        1, 0)
    if filerecord == 0:
        filerecord = ''
    filerecord += '=====' + ' merge: ' + str(mlayers) + ' samples: ' + str(
        ns) + ' memory: ' + str(event['memory']) + ' testtime left :' + str(
            event['testtime']) + ' starttime: ' + str(st) + '\n'
    filerecord += '=====' + str(stt) + '\n'
    data = 'train round ' + str(event['round']) + ', round time ' + str(
        event['roundtime']) + ', start at ' + str(st) + ' ##' + str(
            time.time()) + '\n'
    data += 'info: pos ' + str(pos) + ', memory ' + str(
        response['Configuration']['MemorySize']) + ', mlayers ' + str(
            mlayers) + ', ns ' + str(ns) + '\n'
    print('=' * 5, 'train node', pos, '=' * 5, 'train phase start')
    split = 500
    base = int(ns / mlayers[-1])
    remin = ns % mlayers[-1]
    sn = 0
    for n in range(pos):
        sn += base
        if remin:
            sn += 1
            remin -= 1
    en = sn + base
    if remin:
        en += 1
    print('=' * 5, 'train node', pos, '=' * 5, 'read samples from', sn, 'to',
          en)
    train_x = []
    train_y = []
    sfile = int(sn / split)
    efile = int((en - 1) / split)
    print('=' * 5, 'train node', pos, '=' * 5, 'read files from', sfile, 'to',
          efile)
    data += 'start up time: ' + str(time.time() - stt) + ' ##' + str(
        time.time()) + ' ##' + str(stt) + '--' + str(time.time()) + '\n'
    s3.put_object(Bucket=AWS_S3_bucket,
                  Body=data,
                  Key='timestamp/timestamp_train_' + str(pid) + '_' +
                  str(pos) + '_' + str(event['round']) + '.tsp')
    #=========================================read========================================
    stt = time.time()
    if os.path.exists('/tmp/samples_save'):
        print('=' * 5, 'train node', pos, '=' * 5, 'found samples!!!')
        with open('/tmp/samples_save', 'rb') as f:
            temp = pickle.load(f)
        #os.remove('/tmp/samples_save_'+str(pos))
        train_x = temp['data']
        train_y = temp['label']
        data += 'found samples!!! time: ' + str(
            time.time() - stt) + ' ##' + str(
                time.time()) + ' ##' + str(stt) + '--' + str(
                    time.time()) + '\n'
    else:
        print('=' * 5, 'train node', pos, '=' * 5,
              'samples not found, downloading')
        for now in range(sfile, efile + 1):
            print('downloading', now, 'from range', sfile, efile + 1)
            flag = s3func.s3_download_file(s3, AWS_S3_bucket,
                                           'data/samples_cifar_' + str(now),
                                           '/tmp/samples', 0, 1, 0)
            if flag == 0:
                print('=' * 5, 'train node', pos, '=' * 5,
                      'ERROR!!!: fail to read sample file:', now)
            with open('/tmp/samples', 'rb') as f:
                temp = pickle.load(f)
            sread = max([split * now, sn]) - split * now
            eread = min([split * (now + 1), en]) - split * now

            if train_x == []:
                train_x = temp['data'][sread:eread]
                train_y = temp['label'][sread:eread]
            else:
                # accumulate this file's slice onto what has been read so far
                train_x = np.append(train_x,
                                    temp['data'][sread:eread],
                                    axis=0)
                train_y = np.append(train_y,
                                    temp['label'][sread:eread],
                                    axis=0)

        if os.path.exists('/tmp/samples'):
            os.remove('/tmp/samples')
        with open('/tmp/samples_save', 'wb') as f:
            pickle.dump({'data': train_x, 'label': train_y}, f)
        data += 'read from ' + str(sfile) + ' to ' + str(
            efile) + ' time: ' + str(time.time() - stt) + ' ##' + str(
                time.time()) + ' ##' + str(stt) + '--' + str(
                    time.time()) + '\n'
    data += 'samples length: ' + str(len(train_x)) + '\n'
    if nowiter == 0 and pos == 0:
        s3.put_object(Bucket=AWS_S3_bucket,
                      Body=str(time.time()),
                      Key='timestamp/timestamp_train_start_' + str(pid))
    #=========================================read========================================
    #=========================================initialize==================================
    stt = time.time()
    x, y, output, global_step, y_pred_cls, params = model()

    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=output, labels=y))
    #optimizer = tf.train.RMSPropOptimizer(learning_rate=1e-3).minimize(loss, global_step=global_step)
    #optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(loss, global_step=global_step)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    data += 'training initialize time: ' + str(
        time.time() - stt) + ' ##' + str(stt) + '--' + str(time.time()) + '\n'
    #=========================================initialize==================================
    #=========================================LOOP start========================================
    avgitertime = 0.0
    avgitertimereal = 0.0
    minitertimereal = 100000.0
    timerecord = []
    smt = 0.0
    while nowiter < maxiter:
        itertime = [0.0]
        stiter = time.time()
        flag = s3func.s3_download_file(s3, AWS_S3_bucket,
                                       'flag/work_' + str(pid), '/tmp/work', 0,
                                       1, 0)
        if flag == 0:
            print('=' * 5, 'train node', pos, '=' * 5, 'Abandon!!!! pid:', pid)
            return
        stt = time.time()
        print('+' * 5, 'train node', pos, 'pid', pid, '+' * 5,
              'now start iteration', nowiter)
        print('=' * 5, 'train node', pos, '=' * 5, 'now start iteration',
              nowiter)
        stt2 = time.time()
        flag = s3func.s3_download_file_timer(
            s3, AWS_S3_bucket,
            'data/modelcifar_' + str(pid) + '_' + str(nowiter), '/tmp/model',
            timer, 0, 0)
        itertime[0] += time.time() - stt2
        data += 'training ' + str(nowiter) + ' model waiting time: ' + str(
            time.time() - stt2) + ' ##' + str(stt2) + '--' + str(
                time.time()) + '\n'
        if flag == 0:
            if timer.query()[1] > waittime / 4:
                print('++++++++lambda train', pos, 'at iteration', nowiter,
                      'end at', datetime.datetime.now())
                s3.put_object(Bucket=AWS_S3_bucket,
                              Body=data,
                              Key='timestamp/timestamp_train_' + str(pid) +
                              '_' + str(pos) + '_' + str(event['round']) +
                              '.tsp')
                event['nowiter'] = nowiter
                return
            else:
                print('=' * 5, 'train node', pos, '=' * 5,
                      'ERROR!!!: fail to read model', nowiter)
                s3.put_object(Bucket=AWS_S3_bucket,
                              Body='fail to read model ' + str(nowiter),
                              Key='error/error_train_' + str(pid) + '_' +
                              str(pos))
                return
        if nowiter >= (event['round'] + 1) * rounditer:
            if [0, 0] in event['mergepos']:
                s3.put_object(Bucket=AWS_S3_bucket,
                              Body=filerecord,
                              Key='results/result')
            print('++++++++lambda train', pos, 'at iteration', nowiter,
                  'end at', datetime.datetime.now())
            s3.put_object(Bucket=AWS_S3_bucket,
                          Body=data,
                          Key='timestamp/timestamp_train_' + str(pid) + '_' +
                          str(pos) + '_' + str(event['round']) + '.tsp')
            event['nowiter'] = nowiter
            invoke_lambda(client, funcname, event)
            return
        stt2 = time.time()
        with open('/tmp/model', 'rb') as f:
            temp = pickle.load(f)
        if temp[0] == []:
            print('=' * 5, 'train node', pos, '=' * 5,
                  'ERROR!!!: model format wrong', nowiter)
            s3.put_object(Bucket=AWS_S3_bucket,
                          Body='model format wrong ' + str(nowiter),
                          Key='error/error_train_' + str(pid) + '_' + str(pos))
            return
        for i in range(len(temp)):
            sess.run(tf.assign(params[i], temp[i]))
        itertime[0] += time.time() - stt2
        data += 'training ' + str(nowiter) + ' download model time: ' + str(
            time.time() - stt) + ' ##' + str(stt) + '--' + str(
                time.time()) + '\n'
        stt = time.time()
        bs = len(train_x) // bn  # integer batch size for slicing
        print('=' * 5, 'train node', pos, '=' * 5, 'train start')
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=event['learningrate']).minimize(
                loss, global_step=global_step)
        for b in range(bn):
            i_global, _ = sess.run(
                [global_step, optimizer],
                feed_dict={
                    x: train_x[b * bs:(b + 1) * bs, :],
                    y: train_y[b * bs:(b + 1) * bs, :]
                })  #=========================train
        itertime[0] += time.time() - stt
        data += 'training ' + str(nowiter) + ' train time: ' + str(
            time.time() - stt) + ' ##' + str(stt) + '--' + str(
                time.time()) + '\n'
        stt = time.time()
        pw = []
        for i in range(len(params)):
            pw.append(sess.run(params[i]))
        with open('/tmp/model', 'wb') as f:
            pickle.dump(pw, f)
        print('=' * 5, 'train node', pos, '=' * 5, 'write result as layer',
              len(mlayers) - 1, 'node', pos)
        s32.Bucket(AWS_S3_bucket).upload_file(
            '/tmp/model', 'data/modelcifar_' + str(pid) + '_' +
            str(len(mlayers) - 1) + '_' + str(pos))
        itertime[0] += time.time() - stt
        data += 'training ' + str(nowiter) + ' model write time: ' + str(
            time.time() - stt) + ' ##' + str(stt) + '--' + str(
                time.time()) + '\n'
        if len(event['mergepos']) > 0:
            mergepos = copy.deepcopy(event['mergepos'])
            thismergepos = mergepos[0]
            del mergepos[0]
            #tempd=merge(mlayers,thismergepos,mergepos,nowiter,timer,max(waittime,tend-time.time()+tcount),itertime,pid)
            smt = time.time()
            tempd = merge(mlayers, thismergepos, mergepos, nowiter, timer,
                          waittime, itertime, pid)
            smt = time.time() - smt
            if tempd == 0:
                return
            elif tempd == 1:
                print('++++++++lambda train', pos, 'at iteration', nowiter,
                      'end at', datetime.datetime.now())
                s3.put_object(Bucket=AWS_S3_bucket,
                              Body=data,
                              Key='timestamp/timestamp_train_' + str(pid) +
                              '_' + str(pos) + '_' + str(event['round']) +
                              '.tsp')
                event['nowiter'] = nowiter
                return
            else:
                data += tempd
        data += 'training ' + str(nowiter) + ' valid iteration time: ' + str(
            itertime[0]) + '\n'
        print('-' * 5, 'train node', pos, '-' * 5, 'now end iteration',
              nowiter)
        avgitertime += itertime[0]
        """
		if nowiter>=min(10,maxiter-1) and [0,0] in event['mergepos']:
			s3.put_object(Bucket=AWS_S3_bucket,Body=str(avgitertime/(nowiter+1)), Key='timestamp/timestamp_iteration_'+str(pid)+'.tsp')
		"""
        thisitertime = time.time() - stiter
        filerecord += str(time.time()) + '\n'
        #filerecord+=str(time.time()-stiter)+'\n'
        if thisitertime < minitertimereal:
            minitertimereal = thisitertime
        if nowiter >= 2:
            avgitertimereal += time.time() - stiter
            insert_sort(time.time() - stiter, timerecord)
            #filerecord+=str(time.time()-stiter)+'\n'
            #filerecord+=str(smt)+'\n'
        if nowiter >= min(10, maxiter - 1) and [0, 0] in event['mergepos']:
            pass
            #s3.put_object(Bucket=AWS_S3_bucket,Body=str(timerecord), Key='timestamp/timestamp_iteration_each_'+str(pid)+'.tsp')
            #s3.put_object(Bucket=AWS_S3_bucket,Body=str(avg_no_abnormal(timerecord,2)), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
            #s3.put_object(Bucket=AWS_S3_bucket,Body=str(find_median(timerecord)), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
            #s3.put_object(Bucket=AWS_S3_bucket,Body=str(avgitertimereal/(nowiter-2+1)), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
            #s3.put_object(Bucket=AWS_S3_bucket,Body=str(minitertimereal), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
        nowiter += 1
    if [0, 0] in event['mergepos']:
        s3.put_object(Bucket=AWS_S3_bucket,
                      Body=filerecord,
                      Key='results/result')
        """
		event['testtime']-=1
		if event['testtime']>0:
			inputs={'state':0,'mod':0,'slayers':[],'mlayers':event['mlayers'],'layers':[100,100,100,100,100],'pos':[0,0],'ns':1000000,'maxiter':10,'nowiter':0,'roundtime':250}
			inputs['testtime']=event['testtime']
			invoke_lambda(client,'testfunc',inputs)
			time.sleep(10)
		"""
    if [0, 0] in event['mergepos']:
        s3.put_object(Bucket=AWS_S3_bucket,
                      Body=str(time.time()),
                      Key='timestamp/timestamp_train_end_' + str(pid))
    s3.put_object(Bucket=AWS_S3_bucket,
                  Body=data,
                  Key='timestamp/timestamp_train_' + str(pid) + '_' +
                  str(pos) + '_' + str(event['round']) + '.tsp')
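train reads a fairly large set of event fields. The illustrative dict below collects every key the function consumes; the values are placeholders, not taken from the source.

event = {
    'pid': 0,                 # run id used in S3 key names
    'pos': 0,                 # this worker's index among the leaves
    'ns': 50000,              # total number of training samples
    'mlayers': [1, 2, 4],     # merge-tree widths; mlayers[-1] is the leaf count
    'maxiter': 100,           # total iterations to run
    'nowiter': 0,             # iteration to resume from
    'funcname': 'trainfunc',  # Lambda function name for re-invocation
    'mergepos': [],           # merge duties assigned by start_structure
    'memory': 1024,           # recorded in the result log
    'rounditer': 15,          # iterations per Lambda invocation round
    # optional keys defaulted inside train(): 'roundtime', 'waittime',
    # 'batchnumber', 'testtime', 'learningrate', 'round'
}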
Example 5
def tfpredict(layers, num, modelnum):
    [samples, ds] = gen_random_sample(layers, num)
    samples = np.array(samples)
    ds = np.array(ds)
    ds[0:len(ds) / 2, :] = -1.0
    ds[len(ds) / 2:, :] = 1.0
    print ds
    samples = np.append(samples, ds, axis=1)
    np.random.shuffle(samples)
    ds = samples[:, -layers[-1]:]
    samples = samples[:, 0:-layers[-1]]
    print samples.shape, ds.shape
    training_inputs = samples
    training_outputs = ds

    weights = [[] for i in range(len(layers))]
    biases = [[] for i in range(len(layers))]
    outputs = [[] for i in range(len(layers))]
    for l in range(len(layers)):
        if l > 0:
            weights[l] = tf.Variable(
                tf.random_normal([layers[l - 1], layers[l]]))
            biases[l] = tf.Variable(tf.random_normal([1, layers[l]]))

    x = tf.placeholder(tf.float32, [None, layers[0]])
    y = tf.placeholder(tf.float32, [None, layers[-1]])

    outputs[0] = x
    for l in range(len(layers)):
        if l > 0:
            outputs[l] = tf.nn.tanh(
                tf.add(tf.matmul(outputs[l - 1], weights[l]), biases[l]))

    cost = 0.5 * (y - outputs[-1])**2

    #train=tf.train.AdamOptimizer(0.01).minimize(cost)
    train = tf.train.GradientDescentOptimizer(1e-2).minimize(cost)

    init = tf.global_variables_initializer()
    session = tf.Session()
    session.run(init)
    st = time.time()
    for mo in range(modelnum + 1):
        s3func.s3_download_file(0, 'lf-source', 'data/model_0_' + str(mo),
                                '/tmp/model', 0, 1, 0)
        with open('/tmp/model', 'r') as f:
            temp = pickle.load(f)
        for l in range(len(layers)):
            if l > 0:
                session.run(tf.assign(weights[l], temp[0][l]))
                session.run(tf.assign(biases[l], temp[1][l]))

        results = session.run(outputs, feed_dict={x: training_inputs})
        test_output = results[-1]
        test_output = list(test_output)
        for i in range(len(test_output)):
            for j in range(len(test_output[i])):
                if test_output[i][j] >= 0:
                    test_output[i][j] = 1
                else:
                    test_output[i][j] = -1
        error = 0
        for i in range(len(training_outputs)):
            if training_outputs[i] != test_output[i]:
                #print training_outputs[i],test_output[i]
                error += 1
        print 1.0 - float(error) / len(training_outputs)
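The thresholding loop at the end of tfpredict can be stated in vectorized form. sign_accuracy below is an illustrative equivalent (a row counts as correct only when every component matches), not a function from the source.

import numpy as np

def sign_accuracy(raw_outputs, targets):
    # Map network outputs to {-1, +1} by sign, then score whole rows.
    pred = np.where(np.asarray(raw_outputs) >= 0, 1, -1)
    return float(np.mean(np.all(pred == np.asarray(targets), axis=1)))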
Example 6
def tfmultitrain(layers, num):  #simulate lambda training
    [samples, ds] = gen_random_sample(layers, num)
    samples = np.array(samples)
    ds = np.array(ds)
    ds[0:len(ds) / 2, :] = -1.0
    ds[len(ds) / 2:, :] = 1.0
    print ds
    samples = np.append(samples, ds, axis=1)
    np.random.shuffle(samples)
    ds = samples[:, -layers[-1]:]
    samples = samples[:, 0:-layers[-1]]
    print samples.shape, ds.shape
    training_inputs = samples
    training_outputs = ds

    weights = [[] for i in range(len(layers))]
    biases = [[] for i in range(len(layers))]
    outputs = [[] for i in range(len(layers))]
    for l in range(len(layers)):
        if l > 0:
            weights[l] = tf.Variable(
                tf.random_normal([layers[l - 1], layers[l]]))
            biases[l] = tf.Variable(tf.random_normal([1, layers[l]]))

    x = tf.placeholder(tf.float32, [None, layers[0]])
    y = tf.placeholder(tf.float32, [None, layers[-1]])

    outputs[0] = x
    for l in range(len(layers)):
        if l > 0:
            outputs[l] = tf.nn.tanh(
                tf.add(tf.matmul(outputs[l - 1], weights[l]), biases[l]))

    cost = 0.5 * (y - outputs[-1])**2

    #train=tf.train.AdamOptimizer(0.01).minimize(cost)
    train = tf.train.GradientDescentOptimizer(1e-2).minimize(cost)

    init = tf.global_variables_initializer()
    session = tf.Session()
    session.run(init)
    st = time.time()

    s3func.s3_download_file(0, 'lf-source', 'data/model_0_10', '/tmp/model', 0,
                            1, 0)
    with open('/tmp/model', 'r') as f:
        temp = pickle.load(f)
    for l in range(len(layers)):
        if l > 0:
            session.run(tf.assign(weights[l], temp[0][l]))
            session.run(tf.assign(biases[l], temp[1][l]))

    rew = session.run(weights)
    reb = session.run(biases)
    startw = rew
    startb = reb

    nn = 10
    bn = 10
    bs = len(samples) / nn / bn
    for i in range(0):
        allw = []
        allb = []
        for n in range(nn):  #simulate every parallel worker
            for l in range(len(layers)):
                if l > 0:
                    session.run(tf.assign(weights[l], startw[l]))
                    session.run(tf.assign(biases[l], startb[l]))
            for b in range(bn):  #every batch
                rs = n * bn * bs + b * bs
                re = n * bn * bs + (b + 1) * bs
                print [i, n, b, rs, re]
                session.run(train,
                            feed_dict={
                                x: training_inputs[rs:re, :],
                                y: training_outputs[rs:re, :]
                            })
            if allw == []:
                allw = session.run(weights)
                allb = session.run(biases)
            else:
                add_weights(allw, session.run(weights))
                add_weights(allb, session.run(biases))
        startw = allw
        startb = allb
    print 'time cost:----------------', time.time() - st
    results = session.run(outputs, feed_dict={x: training_inputs})
    test_output = results[-1]
    test_output = list(test_output)
    for i in range(len(test_output)):
        for j in range(len(test_output[i])):
            if test_output[i][j] >= 0:
                test_output[i][j] = 1
            else:
                test_output[i][j] = -1
    error = 0
    for i in range(len(training_outputs)):
        if training_outputs[i] != test_output[i]:
            #print training_outputs[i],test_output[i]
            error += 1
    print float(error) / len(training_outputs)
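add_weights and div_weights are called here and in merge but never defined in these snippets. Below is a minimal sketch consistent with their usage (in-place per-layer accumulation and scaling, skipping the empty layer-0 placeholders); this is an assumption, not the original implementation.

def add_weights(acc, other):
    # Accumulate per-layer parameters element-wise; layer 0 holds an empty
    # placeholder in these snippets, so skip empty entries.
    for l in range(len(acc)):
        if len(acc[l]) > 0:
            acc[l] = acc[l] + other[l]

def div_weights(acc, k):
    # Scale per-layer parameters in place (used by the root merge node).
    for l in range(len(acc)):
        if len(acc[l]) > 0:
            acc[l] = acc[l] / float(k)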
    """
def train(event):
	st=datetime.datetime.now()
	stt=time.time()
	tcount=time.time()
	if 'roundtime' not in event.keys():
		event['roundtime']=250
	tend=event['roundtime']
	ns=event['ns']
	pos=event['pos']
	layers=event['layers']
	lr=event['lr']
	if 'batchnumber' not in event.keys():
		event['batchnumber']=1
	pid=event['pid']
	if 'testtime' not in event.keys():
		event['testtime']=10
	if 'waittime' not in event.keys():
		event['waittime']=tend*2/3
	waittime=event['waittime']
	timer=s3func.timer([waittime,tend])
	#waittime=tend/4
	if 'round' not in event.keys():
		event['round']=0
	else:
		event['round']+=1
	rounditer=event['rounditer']
	s3 = boto3.client('s3')
	s32 = boto3.resource('s3')
	flag=s3func.s3_download_file(s3,AWS_S3_bucket,'flag/work_pt','/tmp/work',0,1,0)
	if flag==0:
		print 'terminated!!!!!!'
		return
	client=boto3.client('lambda',region_name = AWS_region)
	filerecord=s3func.s3_read_file_v2(s3,AWS_S3_bucket,'results/result',0,1,0)
	if filerecord==0:
		filerecord=''
	filerecord+='====='+' starttime: '+str(st)+'\n'
	filerecord+='====='+str(stt)+'\n'
	data='train round '+str(event['round'])+', round time '+str(event['roundtime'])+', start at '+str(st)+' ##'+str(time.time())+'\n'
	data+='info: pos '+str(pos)+'\n'
	print '='*5,'train node',pos,'='*5,'train phase start'
	data+='start up time: '+str(time.time()-stt)+' ##'+str(time.time())+' ##'+str(stt)+'--'+str(time.time())+'\n'
	s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_train_'+str(pid)+'_'+str(pos)+'_'+str(event['round']))
	#=========================================read========================================
	stt=time.time()
	print '='*5,'train node',pos,'='*5,'downloading samples'
	flag=s3func.s3_download_file(s3,AWS_S3_bucket,'data/samples_mnist_x','/tmp/mnist_x',0,1,0)
	if flag==0:
		print '='*5,'train node',pos,'='*5,'ERROR!!!: fail to read sample file x'
	flag=s3func.s3_download_file(s3,AWS_S3_bucket,'data/samples_mnist_y','/tmp/mnist_y',0,1,0)
	if flag==0:
		print '='*5,'train node',pos,'='*5,'ERROR!!!: fail to read sample file y'
	flag=s3func.s3_download_file(s3,AWS_S3_bucket,'data/samples_mnist_test_x','/tmp/mnist_test_x',0,1,0)
	if flag==0:
		print '='*5,'train node',pos,'='*5,'ERROR!!!: fail to read test file x'
	flag=s3func.s3_download_file(s3,AWS_S3_bucket,'data/samples_mnist_test_y','/tmp/mnist_test_y',0,1,0)
	if flag==0:
		print '='*5,'train node',pos,'='*5,'ERROR!!!: fail to read test file y'
	train_x = extract_data('/tmp/mnist_x', 60000)
	train_y = extract_labels('/tmp/mnist_y', 60000)
	test_x = extract_data('/tmp/mnist_test_x', 10000)
	test_y = extract_labels('/tmp/mnist_test_y', 10000)
	train_x=train_x.reshape([60000,28*28])
	test_x=test_x.reshape([10000,28*28])
	data+='samples length: '+str(len(train_x))+'\n'
	#=========================================read========================================
	#=========================================initialize==================================
	stt=time.time()
	outputs,x,y,labels,train_op,weights,biases=model(layers,lr)
	sess = tf.Session()
	sess.run(tf.global_variables_initializer())
	data+='training initialize time: '+str(time.time()-stt)+' ##'+str(stt)+'--'+str(time.time())+'\n'
	#=========================================initialize==================================
	#=========================================LOOP start========================================
	num_iterations=event['maxiter']
	bn=event['batchnumber']
	alltime=0.0
	for it in range(num_iterations):
		st=time.time()
		bs=len(train_x)/bn
		for b in range(bn):
			batch_xs = train_x[b*bs:(b+1)*bs]
			batch_ys = train_y[b*bs:(b+1)*bs]
			sess.run(train_op, feed_dict={x: batch_xs, y: batch_ys})
		alltime+=time.time()-st
	result=sess.run(outputs,feed_dict={x: test_x})
	acc=(np.argmax(result[-1], axis=1)==test_y).mean()
	s3.put_object(Bucket=AWS_S3_bucket,Body=str(acc), Key='timestamp/timestamp_trainresult_'+str(pid)+'_'+str(pos))
	s3.put_object(Bucket=AWS_S3_bucket,Body=str([event['layers'],event['lr'],event['batchnumber'],event['maxiter']]), Key='timestamp/timestamp_traininfo_'+str(pid)+'_'+str(pos))
	if pos==event['nworker']-1:
		event['state']=2
		cevent=copy.deepcopy(event)
		invoke_lambda(client,'nntffunc_1',cevent)
	s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_train_'+str(pid)+'_'+str(pos)+'_'+str(event['round']))
def merge(mlayers,pos,mergepos,nowiter,timer,waittime,itertime,pid):
	stt=time.time()
	tcount=time.time()
	layer=pos[0]
	node=pos[1]
	data=''
	
	base=int(mlayers[layer+1]/mlayers[layer])
	remin=mlayers[layer+1]%mlayers[layer]
	print '='*5,'merge node at layer',layer,'node',node,'='*5,'merge phase start'
	sn=0
	for n in range(node):
		sn+=base
		if remin:
			sn+=1
			remin-=1
	en=sn+base
	if remin:
		en+=1
	print '='*5,'merge node at layer',layer,'node',node,'='*5,'merge model file at layer',layer+1,'from',sn,'to',en
	s3 = boto3.client('s3')
	s32 = boto3.resource('s3')
	itertime[0]+=time.time()-stt
	data+='merge '+str(nowiter)+' layer '+str(layer)+' node '+str(node)+' merge '+str([sn,en-1])+' start up time: '+str(time.time()-stt)+' ##'+str(stt)+'--'+str(time.time())+'\n'
	
	#============================================start==============================================
	stt=time.time()
	print '='*5,'merge node at layer',layer,'node',node,'='*5,'iteration',nowiter
	weights=[]
	biases=[]
	flagt=0.0
	modelt=0.0
	itertime[0]+=1
	files=['data/model_'+str(pid)+'_'+str(layer+1)+'_'+str(now) for now in range(sn,en)]
	#flag=s3func.s3_check_multi_exist(s3,AWS_S3_bucket,'data/model_',files,waittime,0)
	finished=[0 for i in range(en-sn)]
	timer.local_start(0)
	while 1:
		if sum(finished)==(en-sn):
			break
		for now in range(sn,en):
			if finished[now-sn]==0:
				tresult=timer.query()
				if tresult[0]==1:
					if tresult[1]>waittime/4:
						return 0
					else:
						print '='*5,'merge node at layer',layer,'node',node,'='*5,'ERROR!!!: fail to read model: layer',layer+1
						s3.put_object(Bucket=AWS_S3_bucket,Body='fail to read model at iteration '+str(nowiter)+' at layer '+str(layer+1), Key='error/error_merge_'+str(pid)+'_'+str(layer)+'_'+str(node))
						return 0
				flag=s3func.s3_download_file(s3,AWS_S3_bucket,'data/model_'+str(pid)+'_'+str(layer+1)+'_'+str(now),'/tmp/model',0,1,1)
				if flag==1:
					finished[now-sn]=1
					with open('/tmp/model', 'r') as f:
						temp=pickle.load(f)
					if not temp[0]==[]:
						if weights==[]:
							weights=temp[0]
							biases=temp[1]
						else:
							add_weights(weights,temp[0])
							add_weights(biases,temp[1])
	data+='merge '+str(nowiter)+' layer '+str(layer)+' node '+str(node)+' model read time: '+str(time.time()-stt)+' ##'+str(stt)+'--'+str(time.time())+'\n'
	stt=time.time()
	if layer==0:
		print '='*5,'merge node at layer',layer,'node',node,'='*5,'now is the final node'
		if not weights==[]:
			div_weights(weights,mlayers[-1])
			div_weights(biases,mlayers[-1])
		print len(weights),len(biases)
		model=[weights,biases]
		with open('/tmp/model', 'w') as f:
			pickle.dump(model, f)
		s32 = boto3.resource('s3')
		s32.Bucket(AWS_S3_bucket).upload_file('/tmp/model', 'data/model_'+str(pid)+'_new')
		s32.Bucket(AWS_S3_bucket).upload_file('/tmp/model', 'data/model_'+str(pid)+'_'+str(nowiter+1))
	else:
		model=[weights,biases]
		with open('/tmp/model', 'w') as f:
			pickle.dump(model, f)
		print '='*5,'merge node at layer',layer,'node',node,'='*5,'write model as layer',layer,'node',node
		s32.Bucket(AWS_S3_bucket).upload_file('/tmp/model', 'data/model_'+str(pid)+'_'+str(layer)+'_'+str(node))
		#s3.put_object(Bucket=AWS_S3_bucket,Body='true', Key='flag/flag_'+str(pid)+'_'+str(nowiter)+'_'+str(layer)+'_'+str(node))
	itertime[0]+=time.time()-stt
	data+='merge '+str(nowiter)+' layer '+str(layer)+' node '+str(node)+' model write time: '+str(time.time()-stt)+' ##'+str(stt)+'--'+str(time.time())+'\n'
	if len(mergepos)>0:
		thismergepos=mergepos[0]
		del mergepos[0]
		tempd=merge(mlayers,thismergepos,mergepos,nowiter,timer,waittime,itertime,pid)
		if type(tempd)==int:
			return tempd
		else:
			data+=tempd
	
	return data
def train(event):
	st=datetime.datetime.now()
	stt=time.time()
	tcount=time.time()
	if 'roundtime' not in event.keys():
		event['roundtime']=250
	tend=event['roundtime']
	ns=event['ns']
	pos=event['pos']
	mlayers=event['mlayers']
	layers=event['layers']
	maxiter=event['maxiter']
	nowiter=event['nowiter']
	funcname=event['funcname']
	if 'batchnumber' not in event.keys():
		event['batchnumber']=1
	bn=event['batchnumber']
	pid=event['pid']
	if 'testtime' not in event.keys():
		event['testtime']=10
	if 'waittime' not in event.keys():
		event['waittime']=tend*2/3
	waittime=event['waittime']
	timer=s3func.timer([waittime,tend])
	#waittime=tend/4
	if 'round' not in event.keys():
		event['round']=0
	else:
		event['round']+=1
	rounditer=event['rounditer']
	s3 = boto3.client('s3')
	s32 = boto3.resource('s3')
	client=boto3.client('lambda',region_name = AWS_region)
	if nowiter==0 and pos==0:
		s3.put_object(Bucket=AWS_S3_bucket,Body=str(stt), Key='timestamp/timestamp_train_start_'+str(pid))
	response = client.get_function(
		FunctionName=funcname,
	)
	filerecord=s3func.s3_read_file_v2(s3,AWS_S3_bucket,'results/result',0,1,0)
	if filerecord==0:
		filerecord=''
	filerecord+='====='+' merge: '+str(mlayers)+' samples: '+str(ns)+' memory: '+str(event['memory'])+' testtime left :'+str(event['testtime'])+' starttime: '+str(st)+'\n'
	filerecord+='====='+str(stt)+'\n'
	data='train round '+str(event['round'])+', round time '+str(event['roundtime'])+', start at '+str(st)+' ##'+str(time.time())+'\n'
	data+='info: pos '+str(pos)+', memory '+str(response['Configuration']['MemorySize'])+', mlayers '+str(mlayers)+', ns '+str(ns)+', layers '+str(layers)+'\n'
	print '='*5,'train node',pos,'='*5,'train phase start'
	split=10000
	base=int(ns/mlayers[-1])
	remin=ns%mlayers[-1]
	sn=0
	for n in range(pos):
		sn+=base
		if remin:
			sn+=1
			remin-=1
	en=sn+base
	if remin:
		en+=1
	print '='*5,'train node',pos,'='*5,'read samples from',sn,'to',en
	training_inputs=[]
	training_outputs=[]
	sfile=int(sn/split)
	efile=int((en-1)/split)
	print '='*5,'train node',pos,'='*5,'read files from',sfile,'to',efile
	data+='start up time: '+str(time.time()-stt)+' ##'+str(time.time())+' ##'+str(stt)+'--'+str(time.time())+'\n'
	s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_train_'+str(pid)+'_'+str(pos)+'_'+str(event['round'])+'.tsp')
	#=========================================read========================================
	stt=time.time()
	if os.path.exists('/tmp/samples_save'):
		print '='*5,'train node',pos,'='*5,'found samples!!!'
		with open('/tmp/samples_save', 'r') as f:
			temp=pickle.load(f)
		#os.remove('/tmp/samples_save_'+str(pos))
		training_inputs=temp[0]
		training_outputs=temp[1]
		data+='found samples!!! time: '+str(time.time()-stt)+' ##'+str(time.time())+' ##'+str(stt)+'--'+str(time.time())+'\n'
	else:
		print '='*5,'train node',pos,'='*5,'samples not found, downloading'
		for now in range(sfile,efile+1):
			print 'downloading',now,'from range',sfile,efile+1
			#flag=s3func.s3_download_file_v2(s3,s32,AWS_S3_bucket,'data/samples_'+str(now),'/tmp/samples',0,1,0)
			flag=s3func.s3_download_file(s3,AWS_S3_bucket,'data/samples_'+str(now),'/tmp/samples',0,1,0)
			if flag==0:
				print '='*5,'train node',pos,'='*5,'ERROR!!!: fail to read sample file:',now
			with open('/tmp/samples', 'r') as f:
				temp=pickle.load(f)
			sread=max([split*now,sn])-split*now
			eread=min([split*(now+1),en])-split*now
			
			if training_inputs==[]:
				training_inputs=temp[0][sread:eread]
				training_outputs=temp[1][sread:eread]
			else:
				training_inputs=np.append(training_inputs,temp[0][sread:eread],axis=0)
				training_outputs=np.append(training_outputs,temp[1][sread:eread],axis=0)
			
			#training_inputs.extend(temp[0][sread:eread])
			#training_outputs.extend(temp[1][sread:eread])
		if os.path.exists('/tmp/samples'):
			os.remove('/tmp/samples')
		with open('/tmp/samples_save', 'w') as f:
			pickle.dump([training_inputs,training_outputs], f)
		data+='read from '+str(sfile)+' to '+str(efile)+' time: '+str(time.time()-stt)+' ##'+str(time.time())+' ##'+str(stt)+'--'+str(time.time())+'\n'
	data+='samples length: '+str(len(training_inputs))+'\n'
	if nowiter==0 and pos==0:
		s3.put_object(Bucket=AWS_S3_bucket,Body=str(time.time()), Key='timestamp/timestamp_train_start_2_'+str(pid))
	#=========================================read========================================
	#=========================================initialize==================================
	stt=time.time()
	weights=[[] for i in range(len(layers))]
	biases=[[] for i in range(len(layers))]
	outputs=[[] for i in range(len(layers))]
	
	for l in range(len(layers)):
		if l>0:
			weights[l]=tf.Variable(tf.random_normal([layers[l-1], layers[l]]))
			biases[l]=tf.Variable(tf.random_normal([1, layers[l]]))
	
	x=tf.placeholder(tf.float32,[None,layers[0]])
	y=tf.placeholder(tf.float32,[None,layers[-1]])
	
	outputs[0]=x
	for l in range(len(layers)):
		if l>0:
			outputs[l]= tf.nn.tanh(tf.add(tf.matmul(outputs[l-1], weights[l]), biases[l]))
	
	cost=0.5*(y-outputs[-1])**2

	#train=tf.train.AdamOptimizer(0.1).minimize(cost)
	train=tf.train.GradientDescentOptimizer(1e-1).minimize(cost)
	
	init=tf.global_variables_initializer()
	session=tf.Session()
	session.run(init)
	data+='training initialize time: '+str(time.time()-stt)+' ##'+str(stt)+'--'+str(time.time())+'\n'
	#=========================================initialize==================================
	#=========================================LOOP start========================================
	avgitertime=0.0
	avgitertimereal=0.0
	minitertimereal=100000.0
	timerecord=[]
	smt=0.0
	while nowiter<maxiter:
		itertime=[0.0]
		stiter=time.time()
		#flag=s3func.s3_download_file_v2(s3,s32,AWS_S3_bucket,'flag/work_'+str(pid),'/tmp/work',0,1,0)
		flag=s3func.s3_download_file(s3,AWS_S3_bucket,'flag/work_'+str(pid),'/tmp/work',0,1,0)
		if flag==0:
			print '='*5,'train node',pos,'='*5,'Abandon!!!! pid:',pid
			return
		stt=time.time()
		print '+'*5,'train node',pos,'pid',pid,'+'*5,'now start iteration',nowiter
		print '='*5,'train node',pos,'='*5,'now start iteration',nowiter
		stt2=time.time()
		#flag=s3func.s3_download_file_v2(s3,s32,AWS_S3_bucket,'data/model_'+str(pid)+'_'+str(nowiter),'/tmp/model',max(waittime,tend-time.time()+tcount),0,0)
		#flag=s3func.s3_download_file(s3,AWS_S3_bucket,'data/model_'+str(pid)+'_'+str(nowiter),'/tmp/model',max(waittime,tend-time.time()+tcount),0,0)
		flag=s3func.s3_download_file_timer(s3,AWS_S3_bucket,'data/model_'+str(pid)+'_'+str(nowiter),'/tmp/model',timer,0,0)
		itertime[0]+=time.time()-stt2
		data+='training '+str(nowiter)+' model waiting time: '+str(time.time()-stt2)+' ##'+str(stt2)+'--'+str(time.time())+'\n'
		#print 'flag',flag
		if flag==0:
			if timer.query()[1]>waittime/4:
				print '++++++++lambda train',pos,'at iteration',nowiter,'end at',datetime.datetime.now()
				"""
				with open('/tmp/samples_save_'+str(pos), 'w') as f:
					pickle.dump([training_inputs,training_outputs], f)
				"""
				s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_train_'+str(pid)+'_'+str(pos)+'_'+str(event['round'])+'.tsp')
				event['nowiter']=nowiter
				#invoke_lambda(client,funcname,event)
				return
			else:
				print '='*5,'train node',pos,'='*5,'ERROR!!!: fail to read model',nowiter
				s3.put_object(Bucket=AWS_S3_bucket,Body='fail to read model '+str(nowiter), Key='error/error_train_'+str(pid)+'_'+str(pos))
				return
		if nowiter>=(event['round']+1)*rounditer:
			print '++++++++lambda train',pos,'at iteration',nowiter,'end at',datetime.datetime.now()
			if [0,0] in event['mergepos']:
				s3.put_object(Bucket=AWS_S3_bucket,Body=filerecord, Key='results/result')
			"""
			with open('/tmp/samples_save_'+str(pos), 'w') as f:
				pickle.dump([training_inputs,training_outputs], f)
			"""
			s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_train_'+str(pid)+'_'+str(pos)+'_'+str(event['round'])+'.tsp')
			event['nowiter']=nowiter
			invoke_lambda(client,funcname,event)
			return
		stt2=time.time()
		with open('/tmp/model', 'r') as f:
			temp=pickle.load(f)
		if temp[0]==[]:
			print '='*5,'train node',pos,'='*5,'ERROR!!!: model format wrong',nowiter
			s3.put_object(Bucket=AWS_S3_bucket,Body='model format wrong '+str(nowiter), Key='error/error_train_'+str(pid)+'_'+str(pos))
			return
		for l in range(len(layers)):
			if l>0:
				session.run(tf.assign(weights[l],temp[0][l]))
				session.run(tf.assign(biases[l],temp[1][l]))
		itertime[0]+=time.time()-stt2
		data+='training '+str(nowiter)+' download model time: '+str(time.time()-stt)+' ##'+str(stt)+'--'+str(time.time())+'\n'
		stt=time.time()
		print '='*5,'train node',pos,'='*5,'train start'
		bs=len(training_inputs)/bn
		for b in range(bn):
			session.run(train,feed_dict={x:training_inputs[b*bs:(b+1)*bs,:],y:training_outputs[b*bs:(b+1)*bs,:]})#=========================train
		itertime[0]+=time.time()-stt
		data+='training '+str(nowiter)+' train time: '+str(time.time()-stt)+' ##'+str(stt)+'--'+str(time.time())+'\n'
		stt=time.time()
		rew=session.run(weights)
		reb=session.run(biases)
		model=[rew,reb]
		with open('/tmp/model', 'w') as f:
			pickle.dump(model, f)
		print '='*5,'train node',pos,'='*5,'write result as layer',len(mlayers)-1,'node',pos
		s32.Bucket(AWS_S3_bucket).upload_file('/tmp/model', 'data/model_'+str(pid)+'_'+str(len(mlayers)-1)+'_'+str(pos))
		#s3.put_object(Bucket=AWS_S3_bucket,Body='true', Key='flag/flag_'+str(pid)+'_'+str(nowiter)+'_'+str(len(mlayers)-1)+'_'+str(pos))
		itertime[0]+=time.time()-stt
		data+='training '+str(nowiter)+' model write time: '+str(time.time()-stt)+' ##'+str(stt)+'--'+str(time.time())+'\n'
		if len(event['mergepos'])>0:
			mergepos=copy.deepcopy(event['mergepos'])
			thismergepos=mergepos[0]
			del mergepos[0]
			#tempd=merge(mlayers,thismergepos,mergepos,nowiter,timer,max(waittime,tend-time.time()+tcount),itertime,pid)
			smt=time.time()
			tempd=merge(mlayers,thismergepos,mergepos,nowiter,timer,waittime,itertime,pid)
			smt=time.time()-smt
			if tempd==0:
				return
			elif tempd==1:
				print '++++++++lambda train',pos,'at iteration',nowiter,'end at',datetime.datetime.now()
				"""
				with open('/tmp/samples_save_'+str(pos), 'w') as f:
					pickle.dump([training_inputs,training_outputs], f)
				"""
				s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_train_'+str(pid)+'_'+str(pos)+'_'+str(event['round'])+'.tsp')
				event['nowiter']=nowiter
				#invoke_lambda(client,funcname,event)
				return
			else:
				data+=tempd
		data+='training '+str(nowiter)+' valid iteration time: '+str(itertime[0])+'\n'
		print '-'*5,'train node',pos,'-'*5,'now end iteration',nowiter
		avgitertime+=itertime[0]
		"""
		if nowiter>=min(10,maxiter-1) and [0,0] in event['mergepos']:
			s3.put_object(Bucket=AWS_S3_bucket,Body=str(avgitertime/(nowiter+1)), Key='timestamp/timestamp_iteration_'+str(pid)+'.tsp')
		"""
		thisitertime=time.time()-stiter
		filerecord+=str(time.time())+'\n'
		#filerecord+=str(time.time()-stiter)+'\n'
		if thisitertime<minitertimereal:
			minitertimereal=thisitertime
		#insert_sort(time.time()-stiter,timerecord)
		if nowiter>=2:
			avgitertimereal+=time.time()-stiter
			insert_sort(time.time()-stiter,timerecord)
			#filerecord+=str(smt)+'\n'
		if nowiter>=min(10,maxiter-1) and [0,0] in event['mergepos']:
			s3.put_object(Bucket=AWS_S3_bucket,Body=str(timerecord), Key='timestamp/timestamp_iteration_each_'+str(pid)+'.tsp')
			#s3.put_object(Bucket=AWS_S3_bucket,Body=str(avg_no_abnormal(timerecord,2)), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
			#s3.put_object(Bucket=AWS_S3_bucket,Body=str(float(np.mean(timerecord))), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
			s3.put_object(Bucket=AWS_S3_bucket,Body=str(float(np.mean(extract_valid(timerecord)))), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
			#s3.put_object(Bucket=AWS_S3_bucket,Body=str(avgitertimereal/(nowiter-2+1)), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
			#s3.put_object(Bucket=AWS_S3_bucket,Body=str(minitertimereal), Key='timestamp/timestamp_iteration_real_'+str(pid)+'.tsp')
		nowiter+=1
	if [0,0] in event['mergepos']:
		s3.put_object(Bucket=AWS_S3_bucket,Body=filerecord, Key='results/result')
		"""
		event['testtime']-=1
		if event['testtime']>0:
			inputs={'state':0,'mod':0,'batchnumber':20,'slayers':[],'mlayers':[1,100],'layers':[20,100,100,100,100,100,1],'pos':[0,0],'ns':1000000,'maxiter':10,'nowiter':0,'roundtime':250,'rounditer':15}
			inputs['testtime']=event['testtime']
			invoke_lambda(client,'testfunc',inputs)
			time.sleep(10)
		"""
	if [0,0] in event['mergepos'] and nowiter>=maxiter:
		s3.put_object(Bucket=AWS_S3_bucket,Body=str(time.time()), Key='timestamp/timestamp_train_end_'+str(pid))
	s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_train_'+str(pid)+'_'+str(pos)+'_'+str(event['round'])+'.tsp')
Example 10
def monitor(event):
	s3 = boto3.client('s3')
	client=boto3.client('lambda',region_name = AWS_region)
	data=''
	print 'monitor begin'
	#====================================================load info===================================================================
	if 'process' not in event.keys():
		print 'ERROR!!!!! process key not found'
		s3.put_object(Bucket=AWS_S3_bucket,Body='process key not found', Key='error/error_monitor.tsp')
		return
	if 'round' not in event.keys():
		event['round']=0
	process=event['process']
	tend=250
	cratio=event['cratio']# cost ratio
	pcratio=event['pcratio']# performance/cost ratio
	rangen=event['rangen']
	rangemem=event['rangemem']
	maxchange=event['maxchange']
	changestep=event['changestep']
	lastpid=[]
	data+='basic info: pcratio: '+str(pcratio)+', rangen: '+str(rangen)+', maxchange: '+str(maxchange)+', changestep: '+str(changestep)+'\n'
	if 'avgrecord' not in event.keys():
		event['avgrecord']=[[0.0,0] for i in range(20)]
	if 'lastslop' not in event.keys():
		event['lastslop']=[]
	if 'regmod' not in event.keys():
		event['regmod']=0
	if 'regcount' not in event.keys():
		event['regcount']=[0,0,0]
	diffrange=event['diffrange']
	tcount=time.time()
	data+='UTC:'+str(datetime.datetime.now())+'. '+str(tcount)+'\n'
	s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_monitor_'+str(event['round'])+'.tsp')
	if event['round']>0:
		temp=s3func.s3_read_file_v2(s3,AWS_S3_bucket,'timestamp/timestamp_monitor_'+str(event['round']-1)+'.tsp',0,1,0)
		if temp!=0:
			data+=temp
	data+='=============================='
	best=process[0]
	#====================================================load info===================================================================
	while time.time()-tcount<tend:
		#load the control file "work_monitor". If it is deleted manually, monitor stops.
		flag=s3func.s3_download_file(s3,AWS_S3_bucket,'flag/work_monitor','/tmp/work',0,1,0)
		if flag==0:
			print 'monitor terminated!!!!!!'
			return
		if event['regmod']==0:
			if event['regcount'][0]>=event['regtimes'][0]:
				event['regcount'][0]=0
				event['regmod']=1
		if event['regmod']==1:
			if event['regcount'][1]>=event['regtimes'][1]:
				event['regcount'][1]=0
				event['regmod']=0
				event['regcount'][2]+=1
		if event['regcount'][2]>=event['regtimes'][2]:
			print 'final result:',best
			data+='final result: '+str(best)+'\n'
			print 'achieve the maximum regression constraint, terminated!!!!!'
			data+='achieve the maximum regression constraint, terminated!!!!!\n'
			s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_monitor_'+str(event['round'])+'.tsp')
			return
		times=[-1,-1]
		print 'regression times:',event['regcount']
		data+='regression times: '+str(event['regcount'])+'\n'
		#+++++++++++++++++++++++++++++++++++++++++++++++++++++++
		temp=s3func.s3_read_file_v2(s3,AWS_S3_bucket,'timestamp/timestamp_startstructure_'+str(process[0][0])+'.tsp',0,1,0)
		if temp==0:
			cevent=copy.deepcopy(event)
			cevent['pid']=process[0][0]
			cevent['mlayers']=[1,process[0][1]]
			cevent['state']=2
			cevent['funcname']=process[0][3]
			cevent['memory']=process[0][2]
			cevent['round']=-1
			if lastpid!=[]:
				cevent['modelname']='data/model_'+str(lastpid)+'_new'
			else:
				cevent['modelname']='data/model'
			print 'invoke',process[0][3]
			invoke_lambda(client,process[0][3],cevent)
			temp=s3func.s3_read_file_v2(s3,AWS_S3_bucket,'timestamp/timestamp_startstructure_'+str(process[0][0])+'.tsp',tend-time.time()+tcount,0,0)
		data+='waiting for the first iteration data: '+'timestamp/timestamp_iteration_each_'+str(process[0][0])+'.tsp'+'\n'
		s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_monitor_'+str(event['round'])+'.tsp')
		temp=0
		while tend-time.time()+tcount>0 and temp==0:
			flag=s3func.s3_download_file(s3,AWS_S3_bucket,'flag/work_monitor','/tmp/work',0,1,0)
			if flag==0:
				print 'monitor terminated!!!!!!'
				return
			temp=s3func.s3_read_file_v2(s3,AWS_S3_bucket,'timestamp/timestamp_iteration_each_'+str(process[0][0])+'.tsp',0,1,0)
			time.sleep(5)
		if temp==0:
			print 'ERROR!!!!! cannot read timestamp',process[0]
			s3.put_object(Bucket=AWS_S3_bucket,Body='cannot read timestamp: '+str(process[0]), Key='error/error_monitor.tsp')
			break
			#return
		times[0]=json.loads(temp)
		s3func.s3_delete_file(s3,AWS_S3_bucket,'flag/work_'+str(process[0][0]))
		s3func.s3_clear_bucket(AWS_S3_bucket,'timestamp/timestamp_train_'+str(process[0][0]))
		lastpid=process[0][0]
		data+='the time cost is: '+str(times[0])+'\n'
		#--------------------------------------------------------
		temp=s3func.s3_read_file_v2(s3,AWS_S3_bucket,'timestamp/timestamp_startstructure_'+str(process[1][0])+'.tsp',0,1,0)
		if temp==0:
			cevent=copy.deepcopy(event)
			cevent['pid']=process[1][0]
			cevent['mlayers']=[1,process[1][1]]
			cevent['state']=2
			cevent['funcname']=process[1][3]
			cevent['memory']=process[1][2]
			cevent['round']=-1
			if lastpid!=[]:
				cevent['modelname']='data/model_'+str(lastpid)+'_new'
			else:
				cevent['modelname']='data/model'
			print 'invoke',process[1][3]
			invoke_lambda(client,process[1][3],cevent)
			temp=s3func.s3_read_file_v2(s3,AWS_S3_bucket,'timestamp/timestamp_startstructure_'+str(process[1][0])+'.tsp',tend-time.time()+tcount,0,0)
		data+='waiting for the second iteration data: '+'timestamp/timestamp_iteration_each_'+str(process[1][0])+'.tsp'+'\n'
		s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_monitor_'+str(event['round'])+'.tsp')
		temp=0
		while tend-time.time()+tcount>0 and temp==0:
			flag=s3func.s3_download_file(s3,AWS_S3_bucket,'flag/work_monitor','/tmp/work',0,1,0)
			if flag==0:
				print 'monitor terminated!!!!!!'
				return
			temp=s3func.s3_read_file_v2(s3,AWS_S3_bucket,'timestamp/timestamp_iteration_each_'+str(process[1][0])+'.tsp',0,1,0)
			time.sleep(5)
		if temp==0:
			print 'ERROR!!!!! cannot read timestamp',process[1]
			s3.put_object(Bucket=AWS_S3_bucket,Body='cannot read timestamp: '+str(process[1]), Key='error/error_monitor.tsp')
			break
			#return
		times[1]=json.loads(temp)
		s3func.s3_delete_file(s3,AWS_S3_bucket,'flag/work_'+str(process[1][0]))
		s3func.s3_clear_bucket(AWS_S3_bucket,'timestamp/timestamp_train_'+str(process[1][0]))
		lastpid=process[1][0]
		data+='the time cost is: '+str(times[1])+'\n'
		#+++++++++++++++++++++++++++++++++++++++++++++++++++++++
		if event['regmod']==0:
			times[0]=float(np.median(times[0]))
			times[1]=float(np.median(times[1]))
			if times[1]<times[0]:
				process.reverse()
				times.reverse()
			best=process[0]
			nextfunc=process[1][3]
			F=times[0]-times[1]
			data+=str(process[0])+': '+str(times[0])+'. '+str(process[1])+': '+str(times[1])+'. \n'
			data+='delta target value is '+str(F)+'\n'
			if process[0][1]==process[1][1]:
				nextn=process[0][1]
			else:
				if event['lastslop']==[]:
					change=-pcratio[0]*F/(process[0][1]-process[1][1])
					event['lastslop']=F/(process[0][1]-process[1][1])
				else:
					change=-pcratio[0]*F/(process[0][1]-process[1][1])+pcratio[1]*(F/(process[0][1]-process[1][1])-event['lastslop'])
					event['lastslop']=F/(process[0][1]-process[1][1])
				change=ceil_step(change,changestep[0])
				if change>maxchange[0]:
					change=maxchange[0]
				elif change<-maxchange[0]:
					change=-maxchange[0]
				nextn=int(process[0][1]+change)
			nextmem=process[0][2]
		else:
			metric=[0.0,0.0]
			if OPTIMIZATION==0:
				metric[0]=float(np.median([(process[0][2]*ts) for ts in times[0]]))
				metric[1]=float(np.median([(process[1][2]*ts) for ts in times[1]]))
			else:
				metric[0]=float(np.median([1.0/(process[0][2]*ts**2) for ts in times[0]]))
				metric[1]=float(np.median([1.0/(process[1][2]*ts**2) for ts in times[1]]))
			if metric[1]<metric[0]:
			#if process[1][2]*times[1]<process[0][2]*times[0]:
				process.reverse()
				times.reverse()
				metric.reverse()
			best=process[0]
			nextfunc=process[1][3]
			F=metric[0]-metric[1]
			data+=str(process[0])+': '+str(metric[0])+'. '+str(process[1])+': '+str(metric[1])+'. \n'
			data+='delta target value is '+str(F)+'\n'
			nextn=process[0][1]
			if process[0][2]==process[1][2]:
				nextmem=process[0][2]
			else:
				#change=-pcratio[2]*F/(process[0][2]-process[1][2])
				if event['lastslop']==[]:
					change=-pcratio[2]*F/(process[0][2]-process[1][2])
					event['lastslop']=F/(process[0][2]-process[1][2])
				else:
					change=-pcratio[2]*F/(process[0][2]-process[1][2])+pcratio[3]*(F/(process[0][2]-process[1][2])-event['lastslop'])
					event['lastslop']=F/(process[0][2]-process[1][2])
				#change=-change#for 1/mt2
				change=ceil_step(change,changestep[1])
				if change>maxchange[1]:
					change=maxchange[1]
				elif change<-maxchange[1]:
					change=-maxchange[1]
				nextmem=int(process[0][2]+change)
		nextpid=max(process[0][0],process[1][0])+1
		if nextn>rangen[1]:
			nextn=rangen[1]
		elif nextn<rangen[0]:
			nextn=rangen[0]
		if nextmem>rangemem[1]:
			nextmem=rangemem[1]
		elif nextmem<rangemem[0]:
			nextmem=rangemem[0]
		#s3func.s3_clear_bucket(AWS_S3_bucket,'data/model_'+str(process[1][0]))
		process[1]=process[0]
		process[0]=[nextpid,nextn,nextmem,nextfunc]
		#process[1][0]=max(process[0][0],process[1][0])+1
		event['process']=process
		response = client.update_function_configuration(
			FunctionName=nextfunc,
			MemorySize=nextmem,
		)
		time.sleep(5)
		print '---------',process
		if abs(process[0][1]-process[1][1])<1 and process[0][2]==process[1][2]:
			print 'terminated!!!'
			print 'final result:',best
			data+='final result: '+str(best)+'\n'
			#data+='the final result is '+str(process[0])
			s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_monitor_'+str(event['round'])+'.tsp')
			return
		data+= '---------'+str(process)+'\n'
		if event['regmod']==0:
			event['regcount'][0]+=1
		else:
			event['regcount'][1]+=1
		s3.put_object(Bucket=AWS_S3_bucket,Body=data, Key='timestamp/timestamp_monitor_'+str(event['round'])+'.tsp')
	event['round']+=1
	invoke_lambda(client,event['funcname'],event)
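The search step above is a secant-style update with a momentum-like correction: change = -pcratio[0]*F/delta + pcratio[1]*(F/delta - lastslop), where delta is the difference in worker count (or memory size), after which the result is quantized by ceil_step and clamped to +/-maxchange. ceil_step itself is not shown anywhere in these snippets; one plausible implementation (an assumption) is:

import math

def ceil_step(x, step):
    # Assumed behaviour: round the magnitude of a raw change up to the next
    # whole multiple of `step`, preserving sign, so each probe moves at
    # least one full step. Assumes step > 0.
    if x == 0:
        return 0
    return math.copysign(math.ceil(abs(x) / float(step)) * step, x)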
Example 11
def test_result(layers,num):
	st=time.time()
	s32 = boto3.resource('s3')
	s3 = boto3.client('s3')
	training_inputs=[]
	training_outputs=[]
	
	for now in range(99):
		print now
		flag=s3func.s3_download_file(s3,AWS_S3_bucket,'data/samples_'+str(now),'/tmp/samples',0,1,0)
		with open('/tmp/samples', 'r') as f:
			temp=pickle.load(f)
			print len(temp[0])
		if training_inputs==[]:
			training_inputs=temp[0]
			training_outputs=temp[1]
		else:
			training_inputs=np.append(training_inputs,temp[0],axis=0)
			training_outputs=np.append(training_outputs,temp[1],axis=0)
		print len(training_inputs)
	
	print len(training_inputs)
	
	weights=[[] for i in range(len(layers))]
	biases=[[] for i in range(len(layers))]
	outputs=[[] for i in range(len(layers))]
	for l in range(len(layers)):
		if l>0:
			weights[l]=tf.Variable(tf.random_normal([layers[l-1], layers[l]]))
			biases[l]=tf.Variable(tf.random_normal([1, layers[l]]))
	
	
	x=tf.placeholder(tf.float32,[None,layers[0]])
	y=tf.placeholder(tf.float32,[None,layers[-1]])
	
	outputs[0]=x
	for l in range(len(layers)):
		if l>0:
			outputs[l]= tf.nn.tanh(tf.add(tf.matmul(outputs[l-1], weights[l]), biases[l]))
	
	cost=0.5*(y-outputs[-1])**2
	
	
	init=tf.global_variables_initializer()
	session=tf.Session()
	session.run(init)
	
	flag=s3func.s3_download_file(s3,AWS_S3_bucket,'data/model_0_new','/tmp/model',0,1,0)
	with open('/tmp/model', 'r') as f:
		temp=pickle.load(f)
	for l in range(len(layers)):
		if l>0:
			session.run(tf.assign(weights[l],temp[0][l]))
			session.run(tf.assign(biases[l],temp[1][l]))
	
	results=session.run(outputs,feed_dict={x:training_inputs})
	test_output=results[-1]
	test_output=list(test_output)
	for i in range(len(test_output)):
		for j in range(len(test_output[i])):
			if test_output[i][j]>=0:
				test_output[i][j]=1
			else:
				test_output[i][j]=-1
	error=0
	for i in range(len(training_outputs)):
		if training_outputs[i]!=test_output[i]:
			error+=1
	print float(error)/len(training_outputs)
	print time.time()-st
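test_result expects the same layer shape the merged model stored at 'data/model_0_new' was trained with. For the configuration that appears in the commented-out inputs earlier in this listing, an illustrative call (values not confirmed by the source) would be:

test_result([20, 100, 100, 100, 100, 100, 1], 1000000)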