def handle35(FileObject):
    """Read a block-change record: position, block type and metadata."""
    return {
        "x": DataUtil.readInt(FileObject),
        "y": DataUtil.readByte(FileObject),
        "z": DataUtil.readInt(FileObject),
        "BlockType": DataUtil.readShort(FileObject),
        "MetaData": DataUtil.readByte(FileObject),
    }
def handleFA(FileObject):
    """Read a plugin/channel message: channel name plus a raw byte payload."""
    channel = DataUtil.readString(FileObject)
    payload_size = DataUtil.readShort(FileObject)
    payload = DataUtil.readByteArray(FileObject, payload_size)
    return {'Channel': channel, 'message': payload}
def handle09(FileObject):
    """Read a respawn-style record: dimension, difficulty, mode, height, world."""
    return {
        "Dimension": DataUtil.readInt(FileObject),
        "Difficulty": DataUtil.readByte(FileObject),
        "Mode": DataUtil.readByte(FileObject),
        "Height": DataUtil.readShort(FileObject),
        "World": DataUtil.readString(FileObject),
    }
def handle34(FileObject):
    """Read a multi-block-change header; the data payload itself is skipped."""
    chunk_x = DataUtil.readInt(FileObject)
    chunk_z = DataUtil.readInt(FileObject)
    affected = DataUtil.readShort(FileObject)
    data_size = DataUtil.readInt(FileObject)
    # Consume the payload without parsing it (format not handled yet).
    FileObject.read(data_size)
    return {"ChunkX": chunk_x, "ChunkZ": chunk_z, "AffectedBlocks": affected}
def handle11(FileObject):
    """Read an entity id plus an int/byte/int position triple."""
    entity_id = DataUtil.readInt(FileObject)
    FileObject.read(1)  # one unused byte in this packet
    return {
        "EntityID": entity_id,
        "x": DataUtil.readInt(FileObject),
        "y": DataUtil.readByte(FileObject),
        "z": DataUtil.readInt(FileObject),
    }
def createFeatureFileGrayScaleSequences(imageSize, imageDirectory, featuresDirectory, featuresName, log):
    """Build a grayscale feature file from class/sequence image folders.

    Each output row is [class_index, pixel0, pixel1, ...] for one image;
    all rows are written with DataUtil.writeSingleFile at the end.
    """
    gray_rows = []
    for class_index, class_dir in enumerate(os.listdir(imageDirectory)):
        for sequence in os.listdir(imageDirectory + "/" + class_dir):
            sequence_path = imageDirectory + os.sep + class_dir + os.sep + sequence + os.sep
            for image in os.listdir(sequence_path):
                # Skip text/thumbnail artefacts that live next to the images.
                if "txt" in image or "db" in image:
                    continue
                log.printMessage(("Reading:", sequence_path + image))
                img = cv2.imread(sequence_path + image)
                features = grayImage(img, imageSize, False, "")
                features = whiten(features)
                features = resize(features, imageSize)
                row = [int(class_index)]
                for pixel_row in features:
                    row.extend(pixel_row)
                gray_rows.append(row)
    DataUtil.writeSingleFile(gray_rows, featuresDirectory + os.sep + featuresName, False)
def handle1A(FileObject):
    """Read an entity id, an int position triple, and a count."""
    return {
        "EntityID": DataUtil.readInt(FileObject),
        "x": DataUtil.readInt(FileObject),
        "y": DataUtil.readInt(FileObject),
        "z": DataUtil.readInt(FileObject),
        "Count": DataUtil.readShort(FileObject),
    }
def handle47(FileObject):
    """Read an entity id plus an int position triple, skipping one flag byte."""
    entity_id = DataUtil.readInt(FileObject)
    FileObject.read(1)  # boolean flag, intentionally ignored
    return {
        "EntityID": entity_id,
        "x": DataUtil.readInt(FileObject),
        "y": DataUtil.readInt(FileObject),
        "z": DataUtil.readInt(FileObject),
    }
def handle68(FileObject):
    """Read a window's full slot list: id, slot count, then each slot."""
    window_id = DataUtil.readByte(FileObject)
    slot_count = DataUtil.readShort(FileObject)
    slots = [DataUtil.readSlotData(FileObject) for _ in range(slot_count)]
    return {"WindowID": window_id, "Count": slot_count, "Slots": slots}
def handle20(FileObject):
    """Read an entity look record: id, yaw, pitch."""
    return {
        'EntityID': DataUtil.readInt(FileObject),
        'Yaw': DataUtil.readByte(FileObject),
        'Pitch': DataUtil.readByte(FileObject),
    }
def handleCE(FileObject):
    """Read a scoreboard-objective record: name, display text, remove flag."""
    return {
        'Name': DataUtil.readString(FileObject),
        'Display Name': DataUtil.readString(FileObject),
        'Remove': DataUtil.readBoolean(FileObject),
    }
def handle05(FileObject):
    """Read an entity-equipment record: entity id, slot index, slot data."""
    return {
        'EntityID': DataUtil.readInt(FileObject),
        'Slot': DataUtil.readShort(FileObject),
        'Item': DataUtil.readSlotData(FileObject),
    }
def handle08(FileObject):
    """Read a health-update record: health, food, saturation."""
    return {
        'health': DataUtil.readShort(FileObject),
        'food': DataUtil.readShort(FileObject),
        'saturation': DataUtil.readFloat(FileObject),
    }
def handle07(FileObject):
    """Read a use-entity record: acting entity, target entity, button flag."""
    return {
        'userID': DataUtil.readInt(FileObject),
        'targetID': DataUtil.readInt(FileObject),
        'mButton': DataUtil.readBoolean(FileObject),
    }
def handle69(FileObject):
    """Read a window-property update: window id, property index, value."""
    return {
        'WindowID': DataUtil.readByte(FileObject),
        'Property': DataUtil.readShort(FileObject),
        'Value': DataUtil.readShort(FileObject),
    }
def handle6A(FileObject):
    """Read a transaction confirmation: window id, action, accepted flag."""
    return {
        'WindowID': DataUtil.readByte(FileObject),
        'ActionType': DataUtil.readShort(FileObject),
        'Accepted': DataUtil.readBoolean(FileObject),
    }
def handle06(FileObject):
    """Read an int x/y/z position triple."""
    return {
        'x': DataUtil.readInt(FileObject),
        'y': DataUtil.readInt(FileObject),
        'z': DataUtil.readInt(FileObject),
    }
def handle67(FileObject):
    """Read a set-slot record: window id, slot index, slot data."""
    return {
        'WindowID': DataUtil.readByte(FileObject),
        'Slot': DataUtil.readShort(FileObject),
        'SlotData': DataUtil.readSlotData(FileObject),
    }
def handleC9(FileObject):
    """Read a player-list entry: name, online flag, ping."""
    return {
        'PlayerName': DataUtil.readString(FileObject),
        'Online': DataUtil.readBoolean(FileObject),
        'Ping': DataUtil.readShort(FileObject),
    }
def handle2B(FileObject):
    """Read an experience update: bar fraction, level, total experience."""
    return {
        'ExpBar': DataUtil.readFloat(FileObject),
        'Level': DataUtil.readShort(FileObject),
        'TotalExp': DataUtil.readShort(FileObject),
    }
def handle27(FileObject):
    """Read an attach-entity record: entity, vehicle, leash flag."""
    return {
        'EntityID': DataUtil.readInt(FileObject),
        'VehicleID': DataUtil.readInt(FileObject),
        'Leash': DataUtil.readBoolean(FileObject),
    }
def handle85(FileObject):
    """Read a byte entity id followed by an int x/y/z position triple."""
    # NOTE(review): EntityID is read as a single byte here, unlike the
    # int reads used by the other entity handlers — confirm against spec.
    return {
        'EntityID': DataUtil.readByte(FileObject),
        'x': DataUtil.readInt(FileObject),
        'y': DataUtil.readInt(FileObject),
        'z': DataUtil.readInt(FileObject),
    }
def handle83(FileObject):
    """Read an item-data record: type, id, then a length-prefixed byte blob."""
    item_type = DataUtil.readShort(FileObject)
    item_id = DataUtil.readShort(FileObject)
    text_length = DataUtil.readShort(FileObject)
    blob = DataUtil.readByteArray(FileObject, text_length)
    return {'ItemType': item_type, 'ItemID': item_id, 'Text': blob}
def handle2C(FileObject):
    """Read an entity-properties record: id plus name->double pairs."""
    entity_id = DataUtil.readInt(FileObject)
    pair_count = DataUtil.readInt(FileObject)
    properties = {}
    for _ in range(pair_count):
        # Key must be read before its value to keep the stream in sync.
        name = DataUtil.readString(FileObject)
        properties[name] = DataUtil.readDouble(FileObject)
    return {"EntityID": entity_id, "Properties": properties}
def handleCF(FileObject):
    """Read a score-update record: item name, remove flag, score name, value."""
    return {
        'Item Name': DataUtil.readString(FileObject),
        'Remove': DataUtil.readBoolean(FileObject),
        'Score Name': DataUtil.readString(FileObject),
        'Value': DataUtil.readInt(FileObject),
    }
def handle1F(FileObject):
    """Read an entity relative-move record: id plus byte x/y/z deltas."""
    return {
        'EntityID': DataUtil.readInt(FileObject),
        'x': DataUtil.readByte(FileObject),
        'y': DataUtil.readByte(FileObject),
        'z': DataUtil.readByte(FileObject),
    }
def handle29(FileObject):
    """Read an entity-effect record: id, effect, amplifier, duration."""
    return {
        'EntityID': DataUtil.readInt(FileObject),
        'EffectID': DataUtil.readByte(FileObject),
        'Amplifier': DataUtil.readByte(FileObject),
        'Duration': DataUtil.readShort(FileObject),
    }
def handle64(FileObject):
    """Read an open-window record: id, inventory type, title, slot count."""
    return {
        'WindowID': DataUtil.readByte(FileObject),
        'InventoryType': DataUtil.readByte(FileObject),
        'WindowTitle': DataUtil.readString(FileObject),
        'NumberOfSlots': DataUtil.readByte(FileObject),
    }
def handleCA(FileObject):
    """Read a player-abilities record: flags byte, fly speed, walk speed."""
    return {
        "Flags": DataUtil.readByte(FileObject),
        "Fly Speed": DataUtil.readFloat(FileObject),
        "Walk Speed": DataUtil.readFloat(FileObject),
    }
def handle1C(FileObject):
    """Read an entity-velocity record: id plus short x/y/z velocities."""
    return {
        'EntityID': DataUtil.readInt(FileObject),
        'VelocityX': DataUtil.readShort(FileObject),
        'VelocityY': DataUtil.readShort(FileObject),
        'VelocityZ': DataUtil.readShort(FileObject),
    }
if __name__ == '__main__':
    # CLI entry point: <script> graph [-t topic] [-m max_iter]
    # (Python 2 syntax: print statement, str.decode.)
    if len(sys.argv) < 2:
        print '''Expected input format: %s graph [-t topic] [-m max_iter]
        graph: graph file path
        -t: specify topic words
        topic: topic words seperated by '-', default with ML words
        -m: specify max iter count
        max_iter: max iter count, default with 20
        ''' % sys.argv[0]
        sys.exit(1)
    load_path = sys.argv[1]
    topic_words = Config._ML_WORDS  # default topic word set
    max_iter = 20  # default iteration cap
    _id = 2
    # Walk the remaining argv as flag/value pairs: -t <topic>, -m <max_iter>.
    while _id < len(sys.argv) - 1:
        if sys.argv[_id] == '-t':
            topic_words = sys.argv[_id + 1].decode('utf-8')
        elif sys.argv[_id] == '-m':
            max_iter = int(sys.argv[_id + 1])
        _id += 2
    # Load the graph, run manifold ranking, and report results.
    G = du.load_graph(load_path)
    mr = ManifoldRank(G, topic_words=topic_words, max_iter=max_iter)
    mr.rank()
    mr.test(verbose=True)
# To add a new cell, type '#%%' # To add a new markdown cell, type '#%% [markdown]' # %% import scipy.io as sio import numpy as np import DataUtil as du import math import matplotlib.pyplot as plot import scipy.stats as sst np.seterr(divide='ignore') # %% # Load data from spamData.mat xtrain, ytrain, xtest, ytest = du.loadData('spamData.mat') # Log-transformation xtrain = np.log(xtrain + 0.1) xtest = np.log(xtest + 0.1) # %% def calcPosteriorProdictiveDist(xtrain, ytrain, xtest): # Get ML Estimation of mu, variance and lambda lambdaMl = du.getLambdaML(ytrain) # Get an array of unique classes, C classes = [0, 1] # Init logP(y = c | x, D) array, index being c
def train_model(hf, f_type, nql=25, nqa=32, numberOfChoices=5, feature_shape=None, lr=0.01, batch_size=8, total_epoch=100, pretrained_model=None, pca_mat_init_file=None):
    """Train the video-QA model on MovieQA clips and save a checkpoint per epoch.

    Args:
        hf: video-feature store handle passed through to exe_model
            (presumably an open HDF5 file — TODO confirm against caller).
        f_type: feature-type tag used only to name the export directory.
        nql / nqa: question / answer sequence lengths for the placeholders.
        numberOfChoices: answer candidates per question.
        feature_shape: per-sample video feature shape appended to the
            batch dimension of the video placeholder.
        lr: learning rate forwarded to build_model.
        batch_size / total_epoch: training loop parameters.
        pretrained_model: optional checkpoint path to restore before training.
        pca_mat_init_file: pickle path for the PCA projection matrix;
            recomputed via linear_project_pca_initialization when missing.
    """
    mqa = MovieQA.DataLoader()
    stories_for_create_dict, full_video_QAs = mqa.get_story_qa_data('full', 'subtitle')
    stories_for_create_dict = DataUtil.preprocess_stories(stories_for_create_dict, max_words=40)
    w2v_mqa_model_filename = './model/movie_plots_1364.d-300.mc1.w2v'
    w2v_model = w2v.load(w2v_mqa_model_filename, kind='bin')
    # Create vocabulary (index 0 = empty, 1 = unknown token).
    v2i = DataUtil.create_vocabulary_word2vec(full_video_QAs, stories_for_create_dict, word_thresh=1, w2v_vocab=w2v_model, v2i={'': 0, 'UNK': 1})
    # get 'video-based' QA task training set
    _, trained_video_QAs = mqa.get_video_list('train', 'qa_clips')  # key: 'train:<id>', value: list of related clips
    _, val_video_QAs = mqa.get_video_list('val', 'qa_clips')
    '''
        model parameters
    '''
    size_voc = len(v2i)
    print('building model ...')
    # Reuse a cached PCA matrix when available; otherwise compute and cache it.
    if os.path.exists(pca_mat_init_file):
        pca_mat = pickle.load(open(pca_mat_init_file, 'r'))
    else:
        pca_mat = linear_project_pca_initialization(hf, feature_shape, d_w2v=300, output_path=pca_mat_init_file)
    print('pca_mat.shape:', pca_mat.shape)
    # Graph inputs: video features, question tokens, answer-choice tokens, labels.
    input_video = tf.placeholder(tf.float32, shape=(None,) + feature_shape, name='input_video')
    input_question = tf.placeholder(tf.int32, shape=(None, nql), name='input_question')
    input_answer = tf.placeholder(tf.int32, shape=(None, numberOfChoices, nqa), name='input_answer')
    y = tf.placeholder(tf.float32, shape=(None, numberOfChoices))
    train, loss, scores = build_model(input_video, input_question, input_answer, v2i, w2v_model, pca_mat=pca_mat, d_w2v=300, d_lproj=300, answer_index=y, lr=lr)
    '''
        configure && runtime environment
    '''
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.3
    # sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
    config.log_device_placement = False
    sess = tf.Session(config=config)
    init = tf.global_variables_initializer()
    sess.run(init)
    '''
        training parameters
    '''
    with open('train_split.json') as fid:
        trdev = json.load(fid)

    def getTrainDevSplit(trained_video_QAs, trdev):
        # Partition QAs by whether their movie key is in the 'train' split.
        train_data = []
        dev_data = []
        for k, qa in enumerate(trained_video_QAs):
            if qa.imdb_key in trdev['train']:
                train_data.append(qa)
            else:
                dev_data.append(qa)
        return train_data, dev_data

    train_data, dev_data = getTrainDevSplit(trained_video_QAs, trdev)
    print('total training samples: %d' % len(train_data))
    with sess.as_default():
        saver = tf.train.Saver(sharded=True, max_to_keep=total_epoch)
        if pretrained_model is not None:
            saver.restore(sess, pretrained_model)
            print('restore pre trained file:' + pretrained_model)
        max_acc = 0.0
        for epoch in xrange(total_epoch):
            # # shuffle
            print('Epoch: %d/%d, Batch_size: %d' % (epoch + 1, total_epoch, batch_size))
            # train phase
            tic = time.time()
            total_acc, total_loss = exe_model(sess, train_data, batch_size, v2i, hf, feature_shape, loss, scores, input_video, input_question, input_answer, y, numberOfChoices=5, train=train, nql=25, nqa=32)
            print('    --Train--, Loss: %.5f, Acc: %.5f.......Time:%.3f' % (total_loss, total_acc, time.time() - tic))
            # dev phase (train=None disables the update op in exe_model)
            tic = time.time()
            total_acc, total_loss = exe_model(sess, dev_data, batch_size, v2i, hf, feature_shape, loss, scores, input_video, input_question, input_answer, y, numberOfChoices=5, train=None, nql=25, nqa=32)
            print('    --Train-val--, Loss: %.5f, Acc: %.5f.......Time:%.3f' % (total_loss, total_acc, time.time() - tic))
            # eval phase
            # if total_acc > max_acc:
            #     max_acc = total_acc
            tic = time.time()
            total_acc, total_loss = exe_model(sess, val_video_QAs, batch_size, v2i, hf, feature_shape, loss, scores, input_video, input_question, input_answer, y, numberOfChoices=5, train=None, nql=25, nqa=32)
            print('    --Val--, Loss: %.5f, Acc: %.5f.......Time:%.3f' % (total_loss, total_acc, time.time() - tic))
            # save model (every epoch, regardless of validation accuracy)
            export_path = '/home/xyj/usr/local/saved_model/vqa_baseline/video_classifier_semantic' + '_' + f_type + '/' + 'lr' + str(lr) + '_f' + str(feature_shape[0])
            if not os.path.exists(export_path):
                os.makedirs(export_path)
                print('mkdir %s' % export_path)
            save_path = saver.save(sess, export_path + '/' + 'E' + str(epoch + 1) + '_A' + str(total_acc) + '.ckpt')
            print("Model saved in file: %s" % save_path)
def handle17(FileObject):
    """Read an add-object record; speed fields are present only when data > 0."""
    packet = {
        'EntityID': DataUtil.readInt(FileObject),
        'Type': DataUtil.readByte(FileObject),
        'x': DataUtil.readInt(FileObject),
        'y': DataUtil.readInt(FileObject),
        'z': DataUtil.readInt(FileObject),
        'yaw': DataUtil.readByte(FileObject),
        'pitch': DataUtil.readByte(FileObject),
    }
    data = DataUtil.readInt(FileObject)
    if data > 0:
        packet['SpeedX'] = DataUtil.readShort(FileObject)
        packet['SpeedY'] = DataUtil.readShort(FileObject)
        packet['SpeedZ'] = DataUtil.readShort(FileObject)
    return packet
def handle15(FileObject):
    """Read a spawn-item record; item details are present only when ItemID != -1."""
    entity_id = DataUtil.readInt(FileObject)
    item_id = DataUtil.readShort(FileObject)
    item_fields = {}
    if item_id != -1:
        item_fields['Count'] = DataUtil.readByte(FileObject)
        item_fields['Damage'] = DataUtil.readShort(FileObject)
        extra_len = DataUtil.readShort(FileObject)
        if extra_len != -1:
            # TODO: find out what this does and do stuff accrodingly
            FileObject.read(extra_len)
    packet = {
        'EntityID': entity_id,
        'ItemID': item_id,
        'x': DataUtil.readInt(FileObject),
        'y': DataUtil.readInt(FileObject),
        'z': DataUtil.readInt(FileObject),
        'Rotation': DataUtil.readByte(FileObject),
        'Pitch': DataUtil.readByte(FileObject),
        'Roll': DataUtil.readByte(FileObject),
    }
    packet.update(item_fields)
    return packet
rank_sum = 0 neighbors = self.graph[key] for n in neighbors: if self.ranks[n] is not None: outlinks = len(self.graph.neighbors(n)) rank_sum += (1 / float(outlinks)) * self.ranks[n] # actual page rank compution self.ranks[key] = ((1 - float(self.d)) * (1/float(self.V))) + self.d*rank_sum if __name__ == '__main__': if len(sys.argv) != 2: print 'Expected input format: python pageRank.py <graph file path>' sys.exit(1) G = du.load_graph(load_path=sys.argv[1]) p = PageRank(G) p.rank() sorted_r = sorted(p.ranks.iteritems(), key=operator.itemgetter(1), reverse=True) cnt = 100 for key, weight in sorted_r: if not isinstance(key, unicode): if key<10000000000: _acc = Account.objects.get(w_uid=key) key = u'%s\t%s' % (_acc.real_category, _acc) else: _wb = Weibo.objects.get(w_id=key) key = u'%s\t%s' % (_wb.real_category, _wb.text[:20]) else: pass # word
def train_model(pretrained_model=None):
    """Train the subtitle-only MovieQA model and checkpoint every epoch.

    Args:
        pretrained_model: optional checkpoint path restored before training.

    Builds word-index stories/questions/answers, trains with the model from
    build_model, validates each epoch, and saves a checkpoint named with the
    epoch number and validation accuracy.
    """
    task = 'video-based'  # video-based or subtitle-based
    mqa = MovieQA.DataLoader()
    # get 'subtitile-based' QA task dataset
    trained_stories, trained_video_QAs = mqa.get_story_qa_data('train', 'subtitle')
    # Create vocabulary (index 0 = empty, 1 = unknown token).
    QA_words, v2i = DataUtil.create_vocabulary(trained_video_QAs, trained_stories, word_thresh=2, v2i={'': 0, 'UNK': 1})
    # get 'video-based' QA task training set
    val_stories, val_video_QAs = mqa.get_story_qa_data('val', 'subtitle')
    '''
        model parameters
    '''
    # preprocess trained_stories into word-index form
    size_voc = len(v2i)
    trained_stories, max_sentences, max_words = DataUtil.normalize_documents(trained_stories, v2i, max_words=20)
    val_stories, _, _ = DataUtil.normalize_documents(val_stories, v2i, max_words=20)
    print('trained_stories... max setences:%d, max words:%d' % (max_sentences, max_words))
    # Cap story length; overrides the measured max_sentences above.
    max_sentences = 1500
    story_shape = (max_sentences, max_words)
    timesteps_q = 16  # sequences length for question
    timesteps_a = 10  # sequences length for anwser
    numberOfChoices = 5  # for input choices, one for correct, one for wrong answer
    word_embedding_size = 300
    sentence_embedding_size = 100
    common_space_dim = 512
    print('building model ...')
    # Graph inputs: story tokens, question tokens, answer-choice tokens, labels.
    input_stories = tf.placeholder(tf.int32, shape=(None, max_sentences, max_words), name='input_stories')
    input_question = tf.placeholder(tf.int32, shape=(None, timesteps_q), name='input_question')
    input_answer = tf.placeholder(tf.int32, shape=(None, numberOfChoices, timesteps_a), name='input_answer')
    y = tf.placeholder(tf.float32, shape=(None, numberOfChoices))
    train, loss, scores = build_model(input_stories, input_question, size_voc, word_embedding_size, sentence_embedding_size, input_answer, common_space_dim, answer_index=y, lr=0.001)
    '''
        configure && runtime environment
    '''
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.5
    # sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
    config.log_device_placement = False
    sess = tf.Session(config=config)
    init = tf.global_variables_initializer()
    sess.run(init)
    '''
        training parameters
    '''
    batch_size = 32
    total_train_qa = len(trained_video_QAs)
    total_val_qa = len(val_video_QAs)
    num_train_batch = int(round(total_train_qa * 1.0 / batch_size))
    num_val_batch = int(round(total_val_qa * 1.0 / batch_size))
    total_epoch = 50
    export_path = '/home/xyj/usr/local/saved_model/vqa_baseline/rankloss_subtitle_only'
    if not os.path.exists(export_path):
        os.makedirs(export_path)
        print('mkdir %s' % export_path)
    print('total training samples: %d' % total_train_qa)
    with sess.as_default():
        saver = tf.train.Saver(sharded=True, max_to_keep=total_epoch)
        if pretrained_model is not None:
            saver.restore(sess, pretrained_model)
            print('restore pre trained file:' + pretrained_model)
        for epoch in xrange(total_epoch):
            # # shuffle the training QAs each epoch
            np.random.shuffle(trained_video_QAs)
            for batch_idx in xrange(num_train_batch):
                batch_qa = trained_video_QAs[batch_idx * batch_size:min((batch_idx + 1) * batch_size, total_train_qa)]
                data_q, data_a, data_y = DataUtil.getBatchIndexedQAs(batch_qa, QA_words, v2i, nql=16, nqa=10, numOfChoices=numberOfChoices)
                data_s = DataUtil.getBatchIndexedStories(batch_qa, trained_stories, v2i, story_shape)
                _, l, s = sess.run([train, loss, scores], feed_dict={input_stories: data_s, input_question: data_q, input_answer: data_a, y: data_y})
                # Accuracy = fraction of batch where argmax score matches label.
                num_correct = np.sum(np.where(np.argmax(s, axis=-1) == np.argmax(data_y, axis=-1), 1, 0))
                Acc = num_correct * 1.0 / batch_size
                print('--Training--, Epoch: %d/%d, Batch: %d/%d, Batch_size: %d Loss: %.5f, Acc: %.5f' % (epoch + 1, total_epoch, batch_idx + 1, num_train_batch, batch_size, l, Acc))
            print('---------Validation---------')
            total_correct_num = 0
            for batch_idx in xrange(num_val_batch):
                batch_qa = val_video_QAs[batch_idx * batch_size:min((batch_idx + 1) * batch_size, total_val_qa)]
                data_q, data_a, data_y = DataUtil.getBatchIndexedQAs(batch_qa, QA_words, v2i, nql=16, nqa=10, numOfChoices=numberOfChoices)
                data_s = DataUtil.getBatchIndexedStories(batch_qa, val_stories, v2i, story_shape)
                l, s = sess.run([loss, scores], feed_dict={input_stories: data_s, input_question: data_q, input_answer: data_a, y: data_y})
                num_correct = np.sum(np.where(np.argmax(s, axis=-1) == np.argmax(data_y, axis=-1), 1, 0))
                Acc = num_correct * 1.0 / batch_size
                total_correct_num += num_correct
                print('--Valid--, Epoch: %d/%d, Batch: %d/%d, Batch_size: %d Loss: %.5f, Acc: %.5f' % (epoch + 1, total_epoch, batch_idx + 1, num_val_batch, batch_size, l, Acc))
            total_correct_num = total_correct_num * 1.0 / total_val_qa
            print('--Valid--, val acc: %.5f' % (total_correct_num))
            # save model (checkpoint name embeds epoch and validation accuracy)
            save_path = saver.save(sess, export_path + '/' + 'E' + str(epoch + 1) + '_A' + str(total_correct_num) + '.ckpt')
            print("Model saved in file: %s" % save_path)
def test_model(model_file, output_file, hf):
    """Run the trained video-QA model on the test split and write predictions.

    Args:
        model_file: checkpoint path restored into the session.
        output_file: text file receiving one "<qid> <choice>" line per QA.
        hf: video-feature store handle passed to getBatchVideoFeature
            (presumably an open HDF5 file — TODO confirm against caller).
    """
    mqa = MovieQA.DataLoader()
    _, test_video_QAs = mqa.get_video_list('test', 'qa_clips')
    # get 'subtitile-based' QA task dataset — used only to rebuild the
    # same vocabulary the model was trained with.
    stories, trained_video_QAs = mqa.get_story_qa_data('train', 'subtitle')
    # Create vocabulary (index 0 = empty, 1 = unknown token).
    QA_words, v2i = DataUtil.create_vocabulary(trained_video_QAs, stories, word_thresh=2, v2i={'': 0, 'UNK': 1})
    '''
        model parameters
    '''
    size_voc = len(v2i)
    video_feature_dims = 2048
    timesteps_v = 16  # sequences length for video
    story_shape = (timesteps_v, video_feature_dims)
    timesteps_q = 16  # sequences length for question
    timesteps_a = 10  # sequences length for anwser
    numberOfChoices = 5  # for input choices, one for correct, one for wrong answer
    word_embedding_size = 300
    sentence_embedding_size = 512
    visual_embedding_dims = 512
    common_space_dim = 512
    print('building model ...')
    input_stories = tf.placeholder(tf.int32, shape=(None, timesteps_v, video_feature_dims), name='input_stories')
    input_question = tf.placeholder(tf.int32, shape=(None, timesteps_q), name='input_question')
    input_answer = tf.placeholder(tf.int32, shape=(None, numberOfChoices, timesteps_a), name='input_answer')
    # Inference graph only: no labels, isTest=True returns scores alone.
    scores = build_model(input_stories, visual_embedding_dims, input_question, size_voc, word_embedding_size, sentence_embedding_size, input_answer, common_space_dim, answer_index=None, lr=0.01, isTest=True)
    '''
        configure && runtime environment
    '''
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.2
    # sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
    config.log_device_placement = False
    sess = tf.Session(config=config)
    init = tf.global_variables_initializer()
    sess.run(init)
    # load model weights from the given checkpoint
    saver = tf.train.Saver(sharded=True, max_to_keep=5)
    saver.restore(sess, model_file)
    '''
        parameters
    '''
    batch_size = 64
    total_test_qa = len(test_video_QAs)
    num_test_batch = int(round(total_test_qa * 1.0 / batch_size))
    with open(output_file, 'w') as wf:
        with sess.as_default():
            for batch_idx in xrange(num_test_batch):
                batch_qa = test_video_QAs[batch_idx * batch_size:min((batch_idx + 1) * batch_size, total_test_qa)]
                data_q, data_a = DataUtil.getBatchTestIndexedQAs(batch_qa, QA_words, v2i, nql=16, nqa=10, numOfChoices=numberOfChoices)
                data_v = DataUtil.getBatchVideoFeature(batch_qa, QA_words, hf, story_shape)
                s = sess.run([scores], feed_dict={input_stories: data_v, input_question: data_q, input_answer: data_a})
                # Predicted answer = highest-scoring choice per question.
                res = np.argmax(s[0], axis=-1)
                for idx, qa in enumerate(batch_qa):
                    wf.write('%s %d\n' % (qa.qid, res[idx]))
                print('--Valid--, Batch: %d/%d, Batch_size: %d' % (batch_idx + 1, num_test_batch, batch_size))
def _read_and_process_data(trainFile, propertiesFile):
    """Read and process data.

    Map categorical features to embeddings and indicators for dnn.

    Return:
        inter: pandas.DataFrame, "x" to be split in train_test_split
        y: pandas.Series, "y" to be split in train_test_split
        numeric_features: features for linear regression to train on
        deep_columns: features for deep neural network to train on
    """
    print("Start reading and processing data")
    df_train = DataUtil.readTrainFile(trainFile)
    df_properties = DataUtil.readPropertiesFile(propertiesFile)
    # Join transactions onto parcel properties.
    inter = pd.merge(df_properties, df_train, how="inner", on=["parcelid"])
    # Split the transaction date into year/month/day features.
    inter['transactiondate'] = pd.to_datetime(df_train["transactiondate"])
    inter['transaction_year'] = inter['transactiondate'].dt.year
    inter['transaction_month'] = inter['transactiondate'].dt.month
    inter['transaction_day'] = inter['transactiondate'].dt.day
    y = inter['logerror']
    inter = inter.drop(Parameters.getColumnsToDrop(), axis=1)
    # Numeric columns: fill missing values with a -1 sentinel.
    numeric_cols = inter.select_dtypes(exclude=['object'])
    numeric_cols = numeric_cols.fillna(-1)
    inter[numeric_cols.columns] = numeric_cols
    numeric_features = [tf.feature_column.numeric_column(col) for col in numeric_cols.columns]
    # Categorical (object-dtype) columns: fill missing with the 'none' category.
    categorical_cols = inter.select_dtypes(include=['object'])
    categorical_cols = categorical_cols.fillna('none')
    inter[categorical_cols.columns] = categorical_cols
    # Region columns get larger hash buckets plus embeddings; the rest get
    # small hash buckets plus indicator (one-hot) columns.
    complex_features = ["regionidcity", "regionidneighborhood", "regionidzip"]
    simple_categorical_features = [tf.feature_column.categorical_column_with_hash_bucket(col, hash_bucket_size=100) for col in categorical_cols if col not in complex_features]
    complex_categorical_features = [tf.feature_column.categorical_column_with_hash_bucket(col, hash_bucket_size=500) for col in complex_features]
    deep_indicator_columns = [tf.feature_column.indicator_column(col) for col in simple_categorical_features]
    deep_embedding_columns = [tf.feature_column.embedding_column(col, dimension=8) for col in complex_categorical_features]
    deep_columns = numeric_features + deep_indicator_columns + deep_embedding_columns
    return inter, y, numeric_features, deep_columns
def handle10(FileObject):
    """Read a held-item-change record: the selected slot index."""
    return {'SlotID': DataUtil.readShort(FileObject)}
def handle12(FileObject):
    """Read an entity-animation record: entity id and animation id."""
    return {
        'EntityID': DataUtil.readInt(FileObject),
        'AnimationID': DataUtil.readByte(FileObject),
    }
def trainIters(model_name, inputVoc, outputVoc, pairs, dev_pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, input_embedding, output_embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every, check_every, clip, corpus_name, loadFilename):
    """Run the seq2seq training loop with periodic checkpointing.

    Batches pairs/dev_pairs up front, trains for n_iteration iterations,
    prints average loss every print_every iterations, and every check_every
    iterations saves a checkpoint, evaluates dev perplexity, keeps the best
    model in 'best_ppl.tar', and early-stops after Params.break_count
    consecutive non-improvements.
    """
    weight = torch.ones(outputVoc.num_words)
    weight[Params.PAD_token] = 0
    # Padding tokens are excluded from the loss.
    criterion = nn.NLLLoss(ignore_index=Params.PAD_token)
    # criterion = nn.NLLLoss(weight=weight, ignore_index=Params.PAD_token)
    # Pre-build all training batches once; reused cyclically below.
    training_batches = []
    batch_num = int(math.ceil(len(pairs) / batch_size))
    print("Batch Number (Train):", batch_num)
    for i in range(batch_num):
        batch_data = DataUtil.batch2TrainData(inputVoc, outputVoc, pairs[i * batch_size: (i + 1) * batch_size])
        training_batches.append(batch_data)
    dev_batches = []
    dev_batch_num = int(math.ceil(len(dev_pairs) / batch_size))
    for i in range(dev_batch_num):
        dev_batch_data = DataUtil.batch2TrainData(inputVoc, outputVoc, dev_pairs[i * batch_size: (i + 1) * batch_size])
        dev_batches.append(dev_batch_data)
    # Initializations
    print('Initializing ...')
    start_iteration = 1
    if loadFilename:
        # NOTE(review): `checkpoint` is not defined in this function —
        # presumably a module-level value loaded from loadFilename; verify.
        start_iteration = checkpoint['iteration'] + 1
    print_loss = 0
    larger_count = 0  # consecutive checks without dev-perplexity improvement
    best_dev_ppl = sys.maxsize
    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        # Cycle through the pre-built batches.
        training_batch = training_batches[(iteration - 1) % batch_num]
        # Extract fields from batch
        input_variable, lengths, target_variable, max_target_len = training_batch
        # Run a training iteration with batch
        loss = train(input_variable, lengths, target_variable, max_target_len, criterion, encoder, decoder, encoder_optimizer, decoder_optimizer, clip)
        print_loss += loss
        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0
        # Save checkpoint
        if (iteration % check_every == 0):
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, Params.hidden_size))
            if not os.path.exists(directory):
                os.makedirs(directory)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'input_voc_dict': inputVoc.__dict__,
                'output_voc_dict': outputVoc.__dict__,
                'input_embedding': input_embedding.state_dict(),
                'output_embedding': output_embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))
            # Evaluate dev perplexity in eval mode (disables dropout etc.).
            encoder.eval()
            decoder.eval()
            dev_ppl = EvaluateUtil.calc_ppl(encoder, decoder, outputVoc.num_words, dev_batches, Params.PAD_token)
            if (dev_ppl < best_dev_ppl):
                # New best model: overwrite the best-perplexity checkpoint.
                best_dev_ppl = dev_ppl
                torch.save({
                    'iteration': iteration,
                    'en': encoder.state_dict(),
                    'de': decoder.state_dict(),
                    'en_opt': encoder_optimizer.state_dict(),
                    'de_opt': decoder_optimizer.state_dict(),
                    'loss': loss,
                    'input_voc_dict': inputVoc.__dict__,
                    'output_voc_dict': outputVoc.__dict__,
                    'input_embedding': input_embedding.state_dict(),
                    'output_embedding': output_embedding.state_dict()
                }, os.path.join(directory, '{}.tar'.format('best_ppl')))
                larger_count = 0
            else:
                larger_count += 1
            print("#CHECK POINT# Iteration: {}; Best PPL: {:.4f}; Current PPL: {:.4f}; Larger count: {}".format(iteration, best_dev_ppl, dev_ppl, larger_count))
            # Back to training mode before the next iteration.
            encoder.train()
            decoder.train()
            if (larger_count > Params.break_count):
                print("BREAK: Meet Break Count")
                break
def train_model(pretrained_model=None):
    """Train the subtitle-based MovieQA baseline with a TF1 session.

    Builds the vocabulary from the full subtitle stories + QA pairs, converts
    train/val stories to matrix form, constructs the linear-projection model,
    then runs 100 epochs of mini-batch training with per-epoch validation,
    saving a checkpoint after every epoch.

    :param pretrained_model: optional checkpoint path to restore before training.
    """
    # NOTE(review): `task` is assigned but never read below — looks vestigial.
    task = 'video-based' # video-based or subtitle-based
    mqa = MovieQA.DataLoader()
    # Full split is used only to build the vocabulary.
    stories_for_create_dict, full_video_QAs = mqa.get_story_qa_data('full', 'subtitle')
    stories_for_create_dict = DataUtil.preprocess_stories(stories_for_create_dict,max_words=40)
    w2v_mqa_model_filename = './model/movie_plots_1364.d-300.mc1.w2v'
    w2v_model = w2v.load(w2v_mqa_model_filename, kind='bin')
    # Create vocabulary
    v2i = DataUtil.create_vocabulary_word2vec(full_video_QAs, stories_for_create_dict, word_thresh=1, w2v_vocab=w2v_model, v2i={'': 0, 'UNK':1})
    trained_stories, trained_video_QAs = mqa.get_story_qa_data('train', 'subtitle')
    trained_stories = DataUtil.preprocess_stories(trained_stories,max_words=40)
    # Matrix form fixes (max_sentences, max_words) from the training split...
    trained_stories,max_sentences,max_words = DataUtil.data_in_matrix_form(trained_stories, v2i)
    print('used_stories_for_create_dict... max setences:%d, max words:%d' %(max_sentences,max_words))
    val_stories, val_video_QAs = mqa.get_story_qa_data('val', 'subtitle')
    val_stories = DataUtil.preprocess_stories(val_stories,max_words=40)
    # ...and the validation split is padded/truncated to the same shape.
    val_stories,_,_ = DataUtil.data_in_matrix_form(val_stories, v2i,max_sentences=max_sentences,max_words=max_words)
    '''
        model parameters
    '''
    # preprocess trained_stories
    size_voc = len(v2i)
    story_shape = (max_sentences,max_words)
    nql=25 # sequences length for question
    nqa=32 # sequences length for anwser
    numberOfChoices = 5 # for input choices, one for correct, one for wrong answer
    print('building model ...')
    # TF1 placeholders for story/question/answer token ids and one-hot labels.
    input_stories = tf.placeholder(tf.int32, shape=(None, max_sentences, max_words),name='input_stories')
    input_question = tf.placeholder(tf.int32, shape=(None,nql), name='input_question')
    input_answer = tf.placeholder(tf.int32, shape=(None,numberOfChoices,nqa), name='input_answer')
    y = tf.placeholder(tf.float32,shape=(None, numberOfChoices))
    #build_model_with_linearProj
    train,loss,scores = build_model_with_linearProj(input_stories, input_question, input_answer, v2i, w2v_model, answer_index=y, lr=0.01, isTest=False)
    '''
        configure && runtime environment
    '''
    # Cap GPU memory use at half the card so other jobs can share it.
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.5
    # sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
    config.log_device_placement=False
    sess = tf.Session(config=config)
    init = tf.global_variables_initializer()
    sess.run(init)
    '''
        training parameters
    '''
    batch_size = 8
    total_train_qa = len(trained_video_QAs)
    total_val_qa = len(val_video_QAs)
    num_train_batch = int(round(total_train_qa*1.0/batch_size))
    num_val_batch = int(round(total_val_qa*1.0/batch_size))
    total_epoch = 100
    # NOTE(review): hard-coded, user-specific checkpoint directory.
    export_path = '/home/xyj/usr/local/saved_model/vqa_baseline/cross_entropy_subtitle_only_word2vec_TrainStoreis4Dict'
    if not os.path.exists(export_path):
        os.makedirs(export_path)
        print('mkdir %s' %export_path)
    print('total training samples: %d' %total_train_qa)
    with sess.as_default():
        saver = tf.train.Saver(sharded=True,max_to_keep=20)
        if pretrained_model is not None:
            saver.restore(sess, pretrained_model)
            print('restore pre trained file:' + pretrained_model)
        for epoch in xrange(total_epoch):
            # shuffle
            np.random.shuffle(trained_video_QAs)
            total_correct_num = 0
            total_loss = 0.0
            for batch_idx in xrange(num_train_batch):
                batch_qa = trained_video_QAs[batch_idx*batch_size:min((batch_idx+1)*batch_size,total_train_qa)]
                data_q,data_a,data_y = DataUtil.getBatchIndexedQAs(batch_qa, v2i, nql=nql, nqa=nqa, numOfChoices=numberOfChoices)
                data_s = DataUtil.getBatchIndexedStories(batch_qa,trained_stories,v2i,story_shape)
                # print(data_q,data_a)
                # One optimizer step; also fetch loss and per-choice scores.
                _, l, s = sess.run([train,loss,scores],feed_dict={input_stories:data_s, input_question:data_q, input_answer:data_a, y:data_y})
                # A prediction is correct when argmax(score) matches the label.
                num_correct = np.sum(np.where(np.argmax(s,axis=-1)==np.argmax(data_y,axis=-1),1,0))
                Acc = num_correct*1.0/batch_size
                total_correct_num += num_correct
                total_loss += l
                print('--Training--, Epoch: %d/%d, Batch: %d/%d, Batch_size: %d Loss: %.5f, Acc: %.5f' %(epoch+1,total_epoch,batch_idx+1,num_train_batch,batch_size,l,Acc))
            total_correct_num = total_correct_num*1.0/total_train_qa
            total_loss = total_loss/num_train_batch
            print('--Train--, train acc: %.5f || loss: %.5f' %(total_correct_num,total_loss))
            print('---------Validation---------')
            total_correct_num = 0
            for batch_idx in xrange(num_val_batch):
                batch_qa = val_video_QAs[batch_idx*batch_size:min((batch_idx+1)*batch_size,total_val_qa)]
                data_q,data_a,data_y = DataUtil.getBatchIndexedQAs(batch_qa, v2i, nql=nql, nqa=nqa, numOfChoices=numberOfChoices)
                data_s = DataUtil.getBatchIndexedStories(batch_qa,val_stories,v2i,story_shape)
                # Validation: fetch loss/scores only, no optimizer step.
                l, s = sess.run([loss,scores],feed_dict={input_stories:data_s, input_question:data_q, input_answer:data_a, y:data_y})
                num_correct = np.sum(np.where(np.argmax(s,axis=-1)==np.argmax(data_y,axis=-1),1,0))
                Acc = num_correct*1.0/batch_size
                total_correct_num += num_correct
                print('--Valid--, Epoch: %d/%d, Batch: %d/%d, Batch_size: %d Loss: %.5f, Acc: %.5f' %(epoch+1,total_epoch,batch_idx+1,num_val_batch,batch_size,l,Acc))
            total_correct_num = total_correct_num*1.0/total_val_qa
            print('--Valid--, val acc: %.5f' %(total_correct_num))
            #save model
            # Checkpoint name encodes epoch and validation accuracy.
            save_path = saver.save(sess, export_path+'/'+'E'+str(epoch+1)+'_A'+str(total_correct_num)+'.ckpt')
            print("Model saved in file: %s" % save_path)
USE_CUDA = torch.cuda.is_available() device = torch.device("cuda" if USE_CUDA else "cpu") TRAIN_FILE = '../data/train.data' VALID_FILE = '../data/valid.data' TEST_FILE = '../data/test.data' # TRAIN_FILE = "../data/conversation.data" # VALID_FILE = "../data/conversation.data" # TEST_FILE = "../data/conversation.data" pairs = FileUtil.read_pairs(TRAIN_FILE) inputVoc, outputVoc = DataUtil.prepareVoc(pairs) dev_pairs = FileUtil.read_pairs(VALID_FILE) test_pairs = FileUtil.read_pairs(TEST_FILE) def train(input_variable, lengths, target_variable, max_target_len, criterion, encoder, decoder, encoder_optimizer, decoder_optimizer, clip): global device batch_size = input_variable.size()[1] # Zero gradients encoder_optimizer.zero_grad() decoder_optimizer.zero_grad() # Set device options
def sendCD(socket, payload): #packet id socket.send("\xCD") #payload - byte DataUtil.sendByte(socket, payload)
trainPl = variables["General"]["TrainingData"] testPl = variables["General"]["TestingData"] #paramteres for deciding what version of the Tsetlin Machine #and what the pro gram should do convolutional = variables["Connect4"]["Convolutional"] parallel = variables["Connect4"]["Parallel"] CrossVal = variables["General"]["CrossEvaluation"] FindClauses = variables["General"]["FindClauses"] #------------------------------------------------------------ print("Getting Data") #Load the data training = DataUtil.LoadFile(dataPath + trainPl) testing = DataUtil.LoadFile(dataPath + testPl) print(str(len(training[0])) + " entries") #Transform the data to correct form TrainX = np.array(TsUtil.ReshapeData(training[0], convolutional)) TrainY = np.array(training[1]) TestX = np.array(TsUtil.ReshapeData(testing[0], convolutional)) TestY = np.array(testing[1]) #------------------------------------------------------------ print("Setting Up Machine") #Get parameters for Tsetlin Machine clauses = int(variables["Connect4"]["Clause"]) T = variables["Connect4"]["T"]
def handle16(FileObject): CollectedID = DataUtil.readInt(FileObject) CollectorID = DataUtil.readInt(FileObject) return {'CollectedID': CollectedID, 'CollectorID': CollectorID}
def sendFF(socket, reason): #string - disconnect reason DataUtil.sendString(socket, reason)
def exe_model(sess, data, batch_size, v2i, hf, feature_shape, loss, scores, input_video, input_question, input_answer, y, numberOfChoices=5, train=None, nql=25, nqa=32): if train is not None: np.random.shuffle(data) total_data = len(data) num_batch = int(round(total_data * 1.0 / batch_size)) total_correct_num = 0 total_loss = 0.0 for batch_idx in xrange(num_batch): batch_qa = data[batch_idx * batch_size:min((batch_idx + 1) * batch_size, total_data)] data_q, data_a, data_y = DataUtil.getBatchIndexedQAs( batch_qa, v2i, nql=nql, nqa=nqa, numOfChoices=numberOfChoices) data_v = DataUtil.getBatchVideoFeatureFromQid(batch_qa, hf, feature_shape) if train is not None: _, l, s = sess.run( [train, loss, scores], feed_dict={ input_video: data_v, input_question: data_q, input_answer: data_a, y: data_y }) else: l, s = sess.run( [loss, scores], feed_dict={ input_video: data_v, input_question: data_q, input_answer: data_a, y: data_y }) num_correct = np.sum( np.where( np.argmax(s, axis=-1) == np.argmax(data_y, axis=-1), 1, 0)) total_correct_num += num_correct total_loss += l # print('--Training--, Epoch: %d/%d, Batch: %d/%d, Batch_size: %d Loss: %.5f, Acc: %.5f' %(epoch+1,total_epoch,batch_idx+1,num_batch,batch_size,l,Acc)) total_acc = total_correct_num * 1.0 / total_data total_loss = total_loss / num_batch return total_acc, total_loss
def send03(socket, message): #packet id socket.send("\x03") #-----string - message-----# DataUtil.sendString(socket, message)
def send00(socket, KAid): #packet id socket.send("\x00") #int - keep alive id DataUtil.sendInt(socket, KAid)