def main():
    # Driver: loads the configured dataset and, for each epoch, feeds every
    # training example of every training map through a SeqToSeq model,
    # accumulating the per-example cost.
    # TODO preprocess the input file to get standard vectors
    configuration = config.get_config()
    filepath = configuration['datafile_path']
    processed_data = DataProcessing.ProcessData(filepath)
    """ Model designing part """
    # TODO design encoder
    for epi in range(configuration['max_epochs']):
        # print "training epoch ", epi #
        err = 0.0  # summed cost over this epoch
        num_steps = 0
        # TODO: shuffle the training data and train this epoch ##
        train_start = time.time()  #
        seq_lang_numpy = []
        seq_world_numpy = []
        seq_action_numpy = []
        for name_map in configuration['maps_train']:
            max_steps = len(processed_data.dict_data['train'][name_map])
            print 'max_steps=', max_steps
            for idx_data, data in enumerate(
                    processed_data.dict_data['train'][name_map]):
                # seq_lang_numpy, seq_world_numpy and seq_action_numpy will be set
                seq_lang_numpy, seq_world_numpy, seq_action_numpy = processed_data.process_one_data(
                    idx_data, name_map, 'train')
                # np.concatenate((seq_lang_numpy, seq_lang))
                # np.concatenate((seq_world_numpy, seq_world))
                # np.concatenate((seq_action_numpy, seq_action))
                """ trainer = Instantiates the model """
                # NOTE(review): a fresh SeqToSeq is constructed for every
                # example, so no parameters can persist across examples --
                # confirm this is intentional and not a leftover from a
                # symbolic (build-once) API.
                model = models.SeqToSeq()
                cost_numpy = model.build_model(
                    seq_lang_numpy,  # list of word indices
                    seq_world_numpy,  # matrix of dim (len(one_data['cleanpath'])*78
                    seq_action_numpy  # index value of 1 in one hot vector of action
                )
                print "Cost!!------", cost_numpy
                print "type = ", type(cost_numpy)
                print "shape = ", cost_numpy.shape
                print "---Cost_numpy___=", cost_numpy
                err += cost_numpy
                if idx_data % 100 == 99:
                    print "training i-th out of N in map : ", (idx_data, max_steps, name_map)  #
            num_steps += max_steps  #
        # Average cost per map-step for this epoch (currently unused beyond here).
        train_err = err / num_steps
def trainIters(encoder, attn_decoder, n_iters, learning_rate, print_every=1000, plot_every=100): # TODO preprocess the input file to get standard vectors configuration = config.get_config() filepath = configuration['datafile_path'] """divides the data into train and dev""" processed_data = DataProcessing.ProcessData(filepath) run_model = DataProcessing.RunModel() """ Model designing part """ # TODO design encoder # max_action_len = 30 start = time.time() plot_losses = [] print_loss_total = 0 # Reset every print_every plot_loss_total = 0 # Reset every plot_every encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate) decoder_optimizer = optim.SGD(attn_decoder.parameters(), lr=learning_rate) count = 0 folds = configuration['folds'] criterion = nn.NLLLoss() """ Training part """ for epi in range(n_iters): # print "training epoch ", epi # train_err_epoch = [] val_err_epoch = [] accuracy_for_epoch = [] # TODO: shuffle the training data and train this epoch ## train_start = time.time() # for fold in range(folds): print "Fold: ", fold train_err = 0.0 num_steps = 0 seq_lang_numpy = [] seq_world_numpy = [] seq_action_numpy = [] for name_map in configuration['maps_train'][fold]: max_steps = len(processed_data.dict_data['train'][name_map]) print 'max_steps=', max_steps for idx_data, data in enumerate( processed_data.dict_data['train'][name_map]): count += 1 # seq_lang_numpy, seq_world_numpy and seq_action_numpy will be set seq_lang_numpy, seq_world_numpy, seq_action_numpy = processed_data.process_one_data( idx_data, name_map, 'train') seq_lang_numpy = Variable( torch.LongTensor(seq_lang_numpy).view(-1, 1)) seq_world_numpy = Variable( torch.FloatTensor(seq_world_numpy)) seq_action_numpy = Variable( torch.LongTensor(seq_action_numpy).view(-1, 1)) """ trainer = Instantiates the model """ loss = train(idx_data, name_map, seq_lang_numpy, seq_world_numpy, seq_action_numpy, encoder, attn_decoder, encoder_optimizer, decoder_optimizer, criterion, processed_data, "train", 
run_model) train_err += loss print_loss_total += loss plot_loss_total += loss if idx_data % 100 == 99: print "training i-th out of N in map : ", (idx_data, max_steps, name_map) if count % print_every == 0: print_loss_avg = print_loss_total / print_every print_loss_total = 0 print "----------------calculating training loss------------" print "TimeSince=", time_since(start, count / n_iters) print "Itr=", count print " Percentage of code run=", count / n_iters * 100 print "Loss=", print_loss_avg print "--------------------------------------------" print "" print "" # if idx_data == 20: # break num_steps += max_steps # avg_train_err = train_err / num_steps train_err_epoch.append(avg_train_err) print "validating ... " # val_err = 0.0 num_steps = 0 dev_start = time.time() # for name_map in configuration['maps_train'][fold]: max_steps = len(processed_data.dict_data['dev'][name_map]) for idx_data, data in enumerate( processed_data.dict_data['dev'][name_map]): count += 1 # seq_lang_numpy, seq_world_numpy and seq_action_numpy will be set seq_lang_numpy, seq_world_numpy, seq_action_numpy = processed_data.process_one_data( idx_data, name_map, 'dev') seq_lang_numpy = Variable( torch.LongTensor(seq_lang_numpy).view(-1, 1)) seq_world_numpy = Variable( torch.FloatTensor(seq_world_numpy)) seq_action_numpy = Variable( torch.LongTensor(seq_action_numpy).view(-1, 1)) """ trainer = Instantiates the model """ loss = train(idx_data, name_map, seq_lang_numpy, seq_world_numpy, seq_action_numpy, encoder, attn_decoder, encoder_optimizer, decoder_optimizer, criterion, processed_data, "dev", run_model) val_err += loss print_loss_total += loss plot_loss_total += loss if idx_data % 100 == 99: print "training i-th out of N in map : ", (idx_data, max_steps, name_map) if count % print_every == 0: print_loss_avg = print_loss_total / print_every print_loss_total = 0 print "----------------calculating validation loss------------" print "TimeSince=", time_since(start, count / n_iters) print "Itr=", 
count print " Percentage of code run=", (count / n_iters) * 100 print "Loss=", print_loss_avg print "--------------------------------------------" print "" print "" # if idx_data == 20: # break num_steps += max_steps avg_val_error = val_err / num_steps val_err_epoch.append(avg_val_error) print "Epoch = ", epi, " Train error = ", avg_train_err, " Validation error = ", avg_val_error, " diff = "\ , avg_train_err - avg_val_error # Add testing code here test_map_name = configuration['map_test'][fold] print "maptest = ", test_map_name cnt_success = 0 tag_split = 'train' cnt, total_tuples1, _, _ = evaluate(encoder, attn_decoder, tag_split, test_map_name, processed_data, run_model) cnt_success += cnt tag_split = 'dev' cnt, total_tuples2, _, _ = evaluate(encoder, attn_decoder, tag_split, test_map_name, processed_data, run_model) cnt_success += cnt accuracy_for_fold = (cnt_success / ((total_tuples1 + total_tuples2) * 1.0)) * 100 accuracy_for_epoch.append(accuracy_for_fold) print "Accuracy:for fold:", fold, "= ", accuracy_for_fold, " %" avg_train_err_epi = (sum(train_err_epoch) / 3.0) avg_val_error_epi = (sum(val_err_epoch) / 3.0) avg_accuracy_epi = (sum(accuracy_for_epoch) / 3.0) print "Average train error for epoch ", epi, ": ", avg_train_err_epi print "Average val error for epoch ", epi, ": ", avg_val_error_epi print "Average accuracy for epoch ", epi, ": ", avg_accuracy_epi print "Train error - val error : ", avg_train_err_epi - avg_val_error_epi # Save the model after every epoch tracks = configuration['save_filepath'] id_process = os.getpid() time_current = datetime.datetime.now().isoformat() tag_model = '_PID=' + str(id_process) + '_TIME=' + time_current path_track = tracks + 'track' + "_3FoldEpoch_" + str( epi) + "_" + tag_model + '/' command_mkdir = 'mkdir -p ' + os.path.abspath(path_track) os.system(command_mkdir) # ENCODER_PATH = path_track + 'encoder.pkl' DECODER_PATH = path_track + 'decoder.pkl' torch.save(encoder, ENCODER_PATH) torch.save(attn_decoder, 
DECODER_PATH)
def evaluate(encoder, decoder, tag_split, max_length=MAX_LENGTH):
    # Greedily decode an action sequence for every example of `tag_split` on
    # the first configured test map, and count how many decoded runs end at
    # the gold goal position.
    #
    # NOTE(review): the 3-fold trainIters in this file calls
    # evaluate(encoder, attn_decoder, tag_split, test_map_name,
    # processed_data, run_model) and unpacks FOUR return values, while this
    # definition takes only (encoder, decoder, tag_split[, max_length]) and
    # returns THREE values -- these look like different revisions; confirm
    # which contract is current before reusing this function.
    configuration = config.get_config()
    filepath = configuration['datafile_path']
    # Always evaluates on the first test map, regardless of caller.
    name_map = configuration['map_test'][0]
    processed_data = DataProcessing.ProcessData(filepath)
    run_model = DataProcessing.RunModel()
    all_actions = []      # decoded action list per example
    all_attentions = []   # attention weights per decoding step, per example
    cnt_success = 0       # examples whose final position matches the gold path end
    for idx_data, data in enumerate(
            processed_data.dict_data[tag_split][name_map]):
        # Gold action indices (argmax of each one-hot action vector).
        # NOTE(review): `actions` is built but never used below.
        actions = []
        for act in data['action']:
            actions.append(np.argmax(act))
        seq_lang_numpy, seq_world_numpy, seq_action_numpy = processed_data.process_one_data(
            idx_data, name_map, tag_split)
        seq_lang = Variable(torch.LongTensor(seq_lang_numpy).view(-1, 1))
        seq_world = Variable(torch.FloatTensor(seq_world_numpy))
        seq_action = Variable(torch.LongTensor(seq_action_numpy).view(-1, 1))
        input_length = seq_lang.size()[0]
        pos_start, pos_end = processed_data.get_pos(idx_data, name_map,
                                                    tag_split)
        pos_curr = pos_start
        encoder_hidden = encoder.initHidden()
        encoder_outputs = Variable(torch.zeros(max_length,
                                               encoder.hidden_size))
        encoder_outputs = encoder_outputs.cuda(
        ) if use_cuda else encoder_outputs
        # Encode the instruction token by token, keeping every step's output
        # for the attention decoder.
        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(seq_lang[ei],
                                                     encoder_hidden)
            # encoder_outputs[ei] is an extra term when compared to that in train function
            encoder_outputs[ei] = encoder_outputs[ei] + encoder_output[0][0]
        # Seed the decoder with the world features of the start position.
        decoder_input = seq_world[0]
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input
        decoder_hidden = encoder_hidden.view(1, 1, encoder.hidden_size)
        decoded_actions = []
        decoder_attentions = torch.zeros(max_length, max_length)
        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            # Greedy decoding: take the highest-scoring action index.
            topv, topi = decoder_output.data.topk(1)
            ni = topi[0][0]
            pos_curr = run_model.take_one_step(pos_curr, ni)
            # world state of next position
            decoder_input = run_model.get_feat_current_position(
                pos_curr, name_map)
            decoder_input = Variable(torch.FloatTensor([decoder_input]))
            decoder_input = decoder_input.cuda() if use_cuda else decoder_input
            if ni == STOP:
                # presumably 3 is the index of the STOP action -- confirm
                decoded_actions.append(3)
                break
            else:
                decoded_actions.append(ni)
        all_actions.append(decoded_actions)
        all_attentions.append(decoder_attentions[:di + 1])
        # Success iff the agent stops at the gold final position.
        if check_position_end(pos_curr, data['cleanpath'][-1]):
            cnt_success += 1
        print "decoded action = ", decoded_actions
    return cnt_success, all_actions, all_attentions
def SampleTest(encoder,
               decoder,
               idx_data,
               sentence,
               map_name,
               max_length=MAX_LENGTH):
    """ idx_data: this is the index number of test data of 'l' map's dev set"""
    # Decode a single dev example: returns the decoded action indices, the
    # attention weights actually used, and the gold path for the example.
    configuration = config.get_config()
    filepath = configuration['datafile_path']
    # NOTE(review): decoding uses configuration['map_test'][0] rather than
    # the `map_name` argument (which is only passed to get_data_tuple) --
    # confirm this is intentional.
    name_map = configuration['map_test'][0]
    processed_data = DataProcessing.ProcessData(filepath)
    run_model = DataProcessing.RunModel()
    idx_data, path = get_data_tuple(idx_data, sentence, processed_data,
                                    map_name)
    all_actions = []
    all_attentions = []
    print "Given instruction: ", sentence
    seq_lang_numpy, seq_world_numpy, seq_action_numpy = processed_data.process_one_data(
        idx_data, name_map, 'dev')
    seq_lang = Variable(torch.LongTensor(seq_lang_numpy).view(-1, 1))
    seq_world = Variable(torch.FloatTensor(seq_world_numpy))
    seq_action = Variable(torch.LongTensor(seq_action_numpy).view(-1, 1))
    input_length = seq_lang.size()[0]
    pos_start, pos_end = processed_data.get_pos(idx_data, name_map, 'dev')
    pos_curr = pos_start
    encoder_hidden = encoder.initHidden()
    encoder_outputs = Variable(torch.zeros(max_length, encoder.hidden_size))
    encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs
    # Encode the instruction token by token, keeping every step's output for
    # the attention decoder.
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(seq_lang[ei], encoder_hidden)
        # encoder_outputs[ei] is an extra term when compared to that in train function
        encoder_outputs[ei] = encoder_outputs[ei] + encoder_output[0][0]
    # Seed the decoder with the world features of the start position.
    decoder_input = seq_world[0]
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input
    decoder_hidden = encoder_hidden.view(1, 1, encoder.hidden_size)
    decoded_actions = []
    decoder_attentions = torch.zeros(max_length, max_length)
    for di in range(max_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        decoder_attentions[di] = decoder_attention.data
        # Greedy decoding: take the highest-scoring action index.
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        pos_curr = run_model.take_one_step(pos_curr, ni)
        # world state of next position
        decoder_input = run_model.get_feat_current_position(pos_curr, name_map)
        decoder_input = Variable(torch.FloatTensor([decoder_input]))
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input
        if ni == STOP:
            # presumably 3 is the index of the STOP action -- confirm
            decoded_actions.append(3)
            break
        else:
            decoded_actions.append(ni)
    print "decoded action = ", decoded_actions
    return decoded_actions, decoder_attentions[:di + 1], path
def trainIters(encoder, attn_decoder, n_iters, learning_rate, print_every=1000, plot_every=100): # TODO preprocess the input file to get standard vectors configuration = config.get_config() filepath = configuration['datafile_path'] """divides the data into train and dev""" processed_data = DataProcessing.ProcessData(filepath) """ Model designing part """ # TODO design encoder # max_action_len = 30 start = time.time() plot_losses = [] print_loss_total = 0 # Reset every print_every plot_loss_total = 0 # Reset every plot_every encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate) decoder_optimizer = optim.SGD(attn_decoder.parameters(), lr=learning_rate) count = 0 criterion = nn.NLLLoss() """ Training part """ for epi in range(n_iters): # print "training epoch ", epi # train_err = 0.0 num_steps = 0 # TODO: shuffle the training data and train this epoch ## train_start = time.time() # seq_lang_numpy = [] seq_world_numpy = [] seq_action_numpy = [] for name_map in configuration['maps_train'][0]: max_steps = len(processed_data.dict_data['train'][name_map]) print 'max_steps=', max_steps for idx_data, data in enumerate( processed_data.dict_data['train'][name_map]): count += 1 # seq_lang_numpy, seq_world_numpy and seq_action_numpy will be set seq_lang_numpy, seq_world_numpy, seq_action_numpy = processed_data.process_one_data( idx_data, name_map, 'train') seq_lang_numpy = Variable( torch.LongTensor(seq_lang_numpy).view(-1, 1)) seq_world_numpy = Variable(torch.FloatTensor(seq_world_numpy)) seq_action_numpy = Variable( torch.LongTensor(seq_action_numpy).view(-1, 1)) """ trainer = Instantiates the model """ loss = train(idx_data, name_map, seq_lang_numpy, seq_world_numpy, seq_action_numpy, encoder, attn_decoder, encoder_optimizer, decoder_optimizer, criterion, processed_data, flag="train") train_err += loss print_loss_total += loss plot_loss_total += loss if idx_data % 100 == 99: print "training i-th out of N in map : ", (idx_data, max_steps, name_map) if count % 
print_every == 0: print_loss_avg = print_loss_total / print_every print_loss_total = 0 print "----------------calculating training loss------------" print "TimeSince=", time_since(start, count / n_iters) print "Itr=", count print " Percentage of code run=", count / n_iters * 100 print "Loss=", print_loss_avg print "--------------------------------------------" print "" print "" # if idx_data == 20: # break num_steps += max_steps # avg_train_err = train_err / num_steps print "validating ... " # val_err = 0.0 num_steps = 0 dev_start = time.time() # for name_map in configuration['maps_train'][0]: max_steps = len(processed_data.dict_data['dev'][name_map]) for idx_data, data in enumerate( processed_data.dict_data['dev'][name_map]): count += 1 # seq_lang_numpy, seq_world_numpy and seq_action_numpy will be set seq_lang_numpy, seq_world_numpy, seq_action_numpy = processed_data.process_one_data( idx_data, name_map, 'dev') seq_lang_numpy = Variable( torch.LongTensor(seq_lang_numpy).view(-1, 1)) seq_world_numpy = Variable(torch.FloatTensor(seq_world_numpy)) seq_action_numpy = Variable( torch.LongTensor(seq_action_numpy).view(-1, 1)) """ trainer = Instantiates the model """ loss = train(idx_data, name_map, seq_lang_numpy, seq_world_numpy, seq_action_numpy, encoder, attn_decoder, encoder_optimizer, decoder_optimizer, criterion, processed_data, flag="validate") val_err += loss print_loss_total += loss plot_loss_total += loss if idx_data % 100 == 99: print "training i-th out of N in map : ", (idx_data, max_steps, name_map) if count % print_every == 0: print_loss_avg = print_loss_total / print_every print_loss_total = 0 print "----------------calculating validation loss------------" print "TimeSince=", time_since(start, count / n_iters) print "Itr=", count print " Percentage of code run=", (count / n_iters) * 100 print "Loss=", print_loss_avg print "--------------------------------------------" print "" print "" # if idx_data == 20: # break num_steps += max_steps avg_val_error = 
val_err / num_steps print "Epoch = ", epi, " Train error = ", avg_train_err, " Validation error = ", avg_val_error tracks = configuration['save_filepath'] id_process = os.getpid() time_current = datetime.datetime.now().isoformat() tag_model = '_PID=' + str(id_process) + '_TIME=' + time_current path_track = tracks + 'track' + "_Global_Epoch_" + str( epi) + "_" + tag_model + '/' command_mkdir = 'mkdir -p ' + os.path.abspath(path_track) os.system(command_mkdir) # ENCODER_PATH = path_track + 'encoder.pkl' DECODER_PATH = path_track + 'decoder.pkl' torch.save(encoder, ENCODER_PATH) torch.save(attn_decoder, DECODER_PATH)
def StoreData():
    """Synchronise fetched hospital CSV exports with the Patients collection.

    If the collection already holds documents, each fetched file is matched
    against the stored patients via DataProcessing.ProcessData: files with no
    matches are inserted wholesale as new documents, while matched rows have
    their first history entry pushed onto the stored patient's history.  If
    the collection is empty, the first fetched file seeds it.

    Returns:
        True on completion.
    """
    # Base directory of the fetch step's CSV output; the long literal was
    # duplicated at both read sites, so it is hoisted here once.
    fetched_dir = ('/home/bizzzzzzzzzzzzu/Music/MedicalPortal/'
                   'MedicPortal DataProcessing/FetchedData/')
    patientDBSize = PatientDatabase.count_documents({})
    print(patientDBSize)
    if patientDBSize > 0:
        print('Patient COllection filled', patientDBSize)
        # Get all patient document From the database and change to Panda DataFrame
        patientData = PatientDatabase.find({})
        data = pd.DataFrame(list(patientData))
        for item in files_list:
            result = pd.read_csv(fetched_dir + item)
            # Link the datas if there are any matches between the stored and
            # the fetched
            matchedResult = DataProcessing.ProcessData(data, item)
            if matchedResult is None:
                print('None is Matched')
                '''Create New Document in Patients Collection'''
                # remove the _id column so no redendency appear
                del result['_id']
                PatientDatabase.insert_many(result.to_dict('records'))
            else:
                # Update the Patient Document
                print('Possible Matches', matchedResult, item)
                findPatientData = PatientDatabase.find({})
                for matchList in matchedResult:
                    # Get the history from the left (fetched) data frame.
                    history = json.loads(result.iloc[matchList[1]]['history'])
                    # Renamed from `id`, which shadowed the builtin.
                    patient_id = findPatientData[matchList[0]]['_id']
                    PatientDatabase.find_one_and_update(
                        {'_id': patient_id},
                        {'$push': {
                            'history': history[0]
                        }},
                        upsert=True)
    else:
        # Empty collection: seed it from the first fetched file.
        hospitalData = pd.read_csv(fetched_dir + files_list[0])
        del hospitalData['_id']
        print(type(hospitalData))
        # Re-parse each row's JSON history string into a one-element list so
        # Mongo stores it as an array.
        for i in range(0, len(hospitalData)):
            history = json.loads(hospitalData.iloc[i]['history'])
            hospitalData.loc[i, 'history'] = [history]
            print('EDITED LIST', hospitalData.iloc[i])
        PatientDatabase.insert_many(hospitalData.to_dict('records'))
    return True
def trainIters(encoder,
               attn_decoder,
               n_iters,
               learning_rate,
               print_every=2,
               plot_every=100):
    # Single-split training/validation loop over configuration['maps_train'][0].
    # NOTE(review): this variant still contains live debug `break`s after 21
    # examples per map, prints loss every 2 examples, and (unlike the sibling
    # trainIters definitions) never reports averaged errors or saves the
    # models afterwards -- it looks like a work-in-progress copy; confirm
    # before relying on it.
    # TODO preprocess the input file to get standard vectors
    configuration = config.get_config()
    filepath = configuration['datafile_path']
    """divides the data into train and dev"""
    processed_data = DataProcessing.ProcessData(filepath)
    """ Model designing part """
    # TODO design encoder
    # max_action_len = 30
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(attn_decoder.parameters(), lr=learning_rate)
    count = 0
    criterion = nn.NLLLoss()
    """ Training part """
    for epi in range(n_iters):
        # print "training epoch ", epi #
        err = 0.0  # NOTE(review): never accumulated below, so train_err stays 0
        num_steps = 0
        # TODO: shuffle the training data and train this epoch ##
        train_start = time.time()  #
        seq_lang_numpy = []
        seq_world_numpy = []
        seq_action_numpy = []
        for name_map in configuration['maps_train'][0]:
            max_steps = len(processed_data.dict_data['train'][name_map])
            print 'max_steps=', max_steps
            for idx_data, data in enumerate(
                    processed_data.dict_data['train'][name_map]):
                count += 1
                # seq_lang_numpy, seq_world_numpy and seq_action_numpy will be set
                seq_lang_numpy, seq_world_numpy, seq_action_numpy = processed_data.process_one_data(
                    idx_data, name_map, 'train')
                seq_lang_numpy = Variable(
                    torch.LongTensor(seq_lang_numpy).view(-1, 1))
                seq_world_numpy = Variable(torch.FloatTensor(seq_world_numpy))
                seq_action_numpy = Variable(
                    torch.LongTensor(seq_action_numpy).view(-1, 1))
                """ trainer = Instantiates the model """
                # One optimisation step on this single example.
                loss = train(idx_data, name_map, seq_lang_numpy,
                             seq_world_numpy, seq_action_numpy, encoder,
                             attn_decoder, encoder_optimizer,
                             decoder_optimizer, criterion, processed_data,
                             flag="train")
                print_loss_total += loss
                plot_loss_total += loss
                if idx_data % 100 == 99:
                    print "training i-th out of N in map : ", (
                        idx_data, max_steps, name_map)
                if count % print_every == 0:
                    print_loss_avg = print_loss_total / print_every
                    print_loss_total = 0
                    print "----------------calculating training loss------------"
                    # NOTE(review): count / n_iters is integer division in
                    # Python 2 -- likely truncates; confirm intent.
                    print "TimeSince=", time_since(start, count / n_iters)
                    print "Itr=", count
                    print " Percentage of code run=", count / n_iters * 100
                    print "Loss=", print_loss_avg
                    print "--------------------------------------------"
                    print ""
                    print ""
                if idx_data == 20:
                    break  # debug-only early exit after 21 examples
            num_steps += max_steps  #
        train_err = err / num_steps
        print "validating ... "  #
        err = 0.0
        num_steps = 0
        dev_start = time.time()  #
        for name_map in configuration['maps_train'][0]:
            max_steps = len(processed_data.dict_data['dev'][name_map])
            for idx_data, data in enumerate(
                    processed_data.dict_data['dev'][name_map]):
                count += 1
                # seq_lang_numpy, seq_world_numpy and seq_action_numpy will be set
                seq_lang_numpy, seq_world_numpy, seq_action_numpy = processed_data.process_one_data(
                    idx_data, name_map, 'dev')
                seq_lang_numpy = Variable(
                    torch.LongTensor(seq_lang_numpy).view(-1, 1))
                seq_world_numpy = Variable(torch.FloatTensor(seq_world_numpy))
                seq_action_numpy = Variable(
                    torch.LongTensor(seq_action_numpy).view(-1, 1))
                """ trainer = Instantiates the model """
                loss = train(idx_data, name_map, seq_lang_numpy,
                             seq_world_numpy, seq_action_numpy, encoder,
                             attn_decoder, encoder_optimizer,
                             decoder_optimizer, criterion, processed_data,
                             flag="validate")
                print_loss_total += loss
                plot_loss_total += loss
                if idx_data % 100 == 99:
                    print "training i-th out of N in map : ", (
                        idx_data, max_steps, name_map)
                if count % print_every == 0:
                    print_loss_avg = print_loss_total / print_every
                    print_loss_total = 0
                    print "----------------calculating validation loss------------"
                    print "TimeSince=", time_since(start, count / n_iters)
                    print "Itr=", count
                    print " Percentage of code run=", (count / n_iters) * 100
                    print "Loss=", print_loss_avg
                    print "--------------------------------------------"
                    print ""
                    print ""
                if idx_data == 20:
                    break  # debug-only early exit after 21 examples
            num_steps += max_steps