def __init__(self, project_name, model_name='new'):
    # prepare dataset
    prepare_dataset.prepare_dataset(project_name)
    # call ValidationClass
    validObj = validation.ValidateClass(project_name)
    # get all the paths and config file
    validObj.get_validated_paths()
    # get validated paths & config file
    self.paths = validObj.paths
    self.config = validObj.config
    # set id to current run
    self.run_id = self.get_run_id()
    # get model name from function arg
    self.model_name = model_name
    # create object of model class to operate model.py
    self.modelObj = model.ModelClass(project_name)
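# Usage sketch (not from the source): a minimal instantiation of the training
# wrapper whose __init__ is shown above, assuming it runs in the module that
# defines the class. The class name TrainClass and the project name
# 'speech_commands' are assumptions; only run_id, paths, and config come from
# the constructor itself.
if __name__ == '__main__':
    trainer = TrainClass('speech_commands', model_name='new')
    print(trainer.run_id)   # id assigned to this run
    print(trainer.config)   # validated config loaded by ValidateClass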
def label():
    file = request.files["file"]
    label = request.form['label']
    if os.path.isdir("dataset\\" + label):
        print("Word Already Exists")
    else:
        print("Word Does not exist")
        os.mkdir("dataset\\" + label)
    newDirectory = "dataset\\" + label
    file.save(os.path.join(newDirectory, file.filename))
    prepare_dataset.prepare_dataset(DATASET_PATH, JSON_PATH)
    train.main()
    print("=-=-=-=-=-=-= \n DONE TRAINING")
    return jsonify({})
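# Client-side sketch (assumption, not from the source): exercising the handler
# above as a Flask POST endpoint. The route path '/label' and the host/port
# are placeholders; the 'file' part and 'label' form field match what the
# handler reads.
import requests

with open('hello_01.wav', 'rb') as audio:
    resp = requests.post('http://localhost:5000/label',
                         files={'file': audio},
                         data={'label': 'hello'})
print(resp.json())  # handler returns an empty JSON object after retraining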
def run_pconsc(fold):
    k = fold - 1

    print 'Preparing data...'
    data, target, folds = prepare_dataset.prepare_dataset()

    print 'Fitting random forest...'
    forest = fit_data.fit_data(k, data, target, folds)
    data_io.save_random_forest(forest, constants.intermediate_path,
                               'pconsc_random_forest_' + str(k) + '.pkl.tar.gz')

    print 'Predicting test data...'
    predict_data.predict_data(k, data, folds, forest, 'pconsc/')
def run_pconsc2(fold):
    k = fold - 1

    print 'Preparing data...'
    base_data, target, folds = prepare_dataset.prepare_dataset()

    for i in range(constants.number_of_layers + 1):
        if i == 0:
            data = base_data
        else:
            print 'Getting layer ' + str(i) + ' data...'
            data = next_layer_dataset.next_layer_dataset(base_data, data, forest)

        print 'Fitting random forest...'
        forest = fit_data.fit_data(k, data, target, folds)
        data_io.save_random_forest(forest, constants.intermediate_path,
                                   'pconsc2_random_forest_' + str(k) + '_layer_' + str(i) + '.pkl.tar.gz')

        print 'Predicting test data...'
        predict_data.predict_data(k, data, folds, forest, 'pconsc2_layer_' + str(i) + '/')
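# Driver sketch (assumption): both pipelines above take a 1-based fold number.
# 'constants.number_of_folds' is a guessed name; the snippets only reference
# constants.intermediate_path and constants.number_of_layers.
if __name__ == '__main__':
    for fold in range(1, constants.number_of_folds + 1):
        run_pconsc(fold)
        run_pconsc2(fold)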
def node(test_node, test_leaf):
    node_dataset, leaf_dataset, test_dataset = prepare_dataset.prepare_dataset([[1], [7]])
    PATHS = ['./node_net.pth', './left_net.pth', './right_net.pth']

    if test_node:
        for h in range(1):
            training.train(leaf_dataset, PATHS[1])
            training.train(leaf_dataset, PATHS[2])
            for i in range(1, 3, 1):
                print(f'Number of epochs: {i}')
                node = Node.Node(node_dataset, PATHS[1], PATHS[2], PATHS[0])
                node.train(i)
                testing_node.test(test_dataset, PATHS)
                # print('NEW TRY')

    if test_leaf:
        training.train(leaf_dataset, PATHS[1])
        training.train(leaf_dataset, PATHS[2])
        testing_leaf.test(test_dataset, PATHS[1])
        testing_leaf.test(test_dataset, PATHS[2])
def main():
    """
    Generates the weights file based on the given text
    :return:
    """
    # Check for expected number of Arguments
    if len(argv) != number_of_args:
        exit("Invalid number of arguments")

    # Get train, test files path and output folder full path
    script, txt_file_path = argv

    # read txt and lowercase it
    txt = open(txt_file_path).read()
    txt = txt.lower()

    conversion_dic, n_chars, n_vocab = parse(txt)
    char_to_int = conversion_dic["char_to_int"]

    # prepare the dataset of input to output pairs encoded as integers
    seq_length = 100
    X, y, dataX = prepare_dataset(seq_length, txt, n_chars, char_to_int, n_vocab)

    # define the LSTM model
    model = Sequential()
    model.add(LSTM(512, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(512, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(512))
    model.add(Dropout(0.2))
    model.add(Dense(y.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    # define the checkpoint
    filepath = "weights-{epoch:02d}-{loss:.4f}.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1,
                                 save_best_only=True, mode='min')
    callbacks_list = [checkpoint]

    # fit the model
    model.fit(X, y, epochs=2, batch_size=128, callbacks=callbacks_list)
    print "done!"
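# Inferred sketch of prepare_dataset, reconstructed from its call sites above
# and in the generation script below: X feeds an LSTM with
# input_shape=(X.shape[1], X.shape[2]), patterns are later rebuilt as
# (1, seq_length, 1) and scaled by 1/n_vocab, and y is one-hot with y.shape[1]
# classes. This is a guess at the original implementation, not a copy of it.
import numpy
from keras.utils import np_utils


def prepare_dataset_sketch(seq_length, text, n_chars, char_to_int, n_vocab):
    dataX, dataY = [], []
    # slide a seq_length window over the text; each window predicts the next char
    for i in range(0, n_chars - seq_length):
        seq_in = text[i:i + seq_length]
        seq_out = text[i + seq_length]
        dataX.append([char_to_int[c] for c in seq_in])
        dataY.append(char_to_int[seq_out])
    # reshape to (samples, timesteps, features) and scale to [0, 1] for the LSTM
    X = numpy.reshape(dataX, (len(dataX), seq_length, 1)) / float(n_vocab)
    # one-hot encode the targets so the softmax layer has n_vocab outputs
    y = np_utils.to_categorical(dataY)
    return X, y, dataX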
import pandas as pd
import matplotlib.pyplot as plt

from prepare_dataset import prepare_dataset
from gains_losses_data import compute_gains_losses


def graph_builder_bar(graph):
    axes = graph.plot(kind='bar')
    plt.axhline(0, color='b')
    axes.legend(bbox_to_anchor=(1.5, 1.05))
    return plt.show()


# Load dataset
df_hh = prepare_dataset()
df_hh = compute_gains_losses(df_hh)

# Compute total revenue from policy
tax_revenue_total = (df_hh['total_tax_increase'] * df_hh['hh_weight']).sum() / 1000000
tax_revenue_per_uc = ((df_hh['total_tax_increase'] * df_hh['hh_weight']).sum()
                      / (df_hh['consumption_units'] * df_hh['hh_weight']).sum())
avg_loss_per_uc = ((df_hh['total_expenditures_increase'] * df_hh['hh_weight']).sum()
                   / (df_hh['consumption_units'] * df_hh['hh_weight']).sum())


def incidence_decile(
        data,
        targeted,
        amount
def main():
    """
    Receives 2 parameters - the text to use as a starting point, and the weights file
    :return: prints out 1000 characters based on the training and the chosen seed
    """
    # Check for expected number of Arguments
    if len(argv) != number_of_args:
        exit("Invalid number of arguments")

    # load ascii text and convert to lowercase
    filename = argv[1]  # text
    raw_text = open(filename).read()
    raw_text = raw_text.lower()

    # create mapping of unique chars to integers, and a reverse mapping
    chars = sorted(list(set(raw_text)))
    chars.insert(0, '\r')
    print chars
    char_to_int = dict((c, i) for i, c in enumerate(chars))
    int_to_char = dict((i, c) for i, c in enumerate(chars))

    # summarize the loaded data
    n_chars = len(raw_text)
    n_vocab = len(chars)
    print "Total Characters: ", n_chars
    print "Total Vocab: ", n_vocab

    # prepare the dataset of input to output pairs encoded as integers
    seq_length = 100
    X, y, dataX = prepare_dataset(seq_length, raw_text, n_chars, char_to_int, n_vocab)

    # define the LSTM model
    print(X.shape[1], X.shape[2])
    model = Sequential()
    model.add(LSTM(512, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(512, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(512))
    model.add(Dropout(0.2))
    model.add(Dense(y.shape[1], activation='softmax'))

    # load the network weights
    weightFile = argv[2]
    model.load_weights(weightFile)
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    # pick a random seed
    start = numpy.random.randint(0, len(dataX) - 1)
    pattern = dataX[start]
    print "Seed:"
    print "\"", ''.join([int_to_char[value] for value in pattern]), "\""

    # generate characters
    for i in range(1000):
        x = numpy.reshape(pattern, (1, len(pattern), 1))
        x = x / float(n_vocab)
        prediction = model.predict(x, verbose=0)
        index = numpy.argmax(prediction)
        result = int_to_char[index]
        seq_in = [int_to_char[value] for value in pattern]
        sys.stdout.write(result)
        pattern.append(index)
        pattern = pattern[1:len(pattern)]

    print "\nDone."
def excute(config):
    torch.cuda.empty_cache()
    torch.manual_seed(1)

    model = HDIClassifier(config['waves'], config['windows'], config['channel_n'], 2)
    device = torch.device(config['device']) if torch.cuda.is_available() else torch.device('cpu')
    model = model.to(device)

    optim = torch.optim.Adam(model.parameters(), lr=0.0005)
    loss_f = torch.nn.CrossEntropyLoss()  # FocalLoss(logits=True)

    epochs = 15
    batch_size = 270 * (int(math.ceil(3 / len(config['waves']))))
    tr_dataloader, vd_dataloader, te_dataloader = prepare_dataset(
        config['time_point'], batch_size, config['sqi'])

    loss_list = []
    result_df_list = []
    best_auroc = 0
    best_model_path = None

    save_path = make_save_folder(config)
    print(save_path)
    # /home/jjong/jjong/workplace/datathon_2019/pyfile
    shutil.copytree('./', os.path.join(save_path, 'pyfile'))
    print('Settings: {}M_{}'.format(config['time_point'], '_'.join(config['waves'])))

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        tr_pred_digit = []
        tr_pred_prob = []
        tr_target_digit = []
        for idx, (X, y) in enumerate(tr_dataloader):
            X, y = X.to(device), y.to(device).long()
            optim.zero_grad()
            output = model(X)
            # loss_f(output[:,1], tmp[1].squeeze(1).to(device))
            loss = loss_f(output, y.squeeze(1).to(device))
            loss.backward()
            optim.step()
            total_loss += loss.item()
            if idx % 50 == 0:
                print('{}[{}epoch][{}/{}iter][loss:{}]'.format(
                    config['device'], epoch, idx, len(tr_dataloader), loss.item()))
            tr_target_digit.extend(y.cpu().numpy().ravel().tolist())
            tr_pred_digit.extend(output.max(dim=1)[1].cpu().numpy().tolist())
            tr_pred_prob.extend(output[:, 1].detach().cpu().numpy().tolist())
        else:
            loss_list.append(total_loss / len(tr_dataloader))
            print('-----------<[{} epoch] Train Result>----------------'.format(epoch))
            print('Settings: {}M_{}'.format(config['time_point'], '_'.join(config['waves'])))
            print('Train total: [loss:{}]'.format(total_loss / len(tr_dataloader)))
            auroc = roc_auc_score(tr_target_digit, tr_pred_prob)
            auprc = average_precision_score(tr_target_digit, tr_pred_prob)
            print('AUROC : {}'.format(auroc))
            print('AUPRC : {}'.format(auprc))
            print(classification_report(tr_target_digit, tr_pred_digit,
                                        labels=[0, 1],
                                        target_names=['normal', 'Event']))
            report_dict = classification_report(tr_target_digit, tr_pred_digit,
                                                labels=[0, 1],
                                                target_names=['normal', 'Event'],
                                                output_dict=True)
            report_df = pd.DataFrame(report_dict)
            report_df['epoch'] = epoch
            report_df['state'] = 'train'
            report_df['auroc'] = auroc
            report_df['auprc'] = auprc
            result_df_list.append(report_df)

        '''----Validation----'''
        with torch.no_grad():
            vd_target_digit = []
            vd_pred_digit = []
            vd_pred_prob = []
            model.eval()
            for idx, (X, y) in enumerate(vd_dataloader):
                X, y = X.to(device), y.to(device)
                output = model(X)
                vd_target_digit.extend(y.cpu().numpy().ravel().tolist())
                vd_pred_digit.extend(output.max(dim=1)[1].cpu().numpy().tolist())
                vd_pred_prob.extend(output[:, 1].detach().cpu().numpy().tolist())
            else:
                print('-----------<[{} epoch] Valid Result>----------------'.format(epoch))
                print('Settings: {}M_{}'.format(config['time_point'], '_'.join(config['waves'])))
                auroc = roc_auc_score(vd_target_digit, vd_pred_prob)
                auprc = average_precision_score(vd_target_digit, vd_pred_prob)
                print('AUROC : {}'.format(auroc))
                print('AUPRC : {}'.format(auprc))
                print(classification_report(vd_target_digit, vd_pred_digit,
                                            labels=[0, 1],
                                            target_names=['normal', 'Event']))
                report_dict = classification_report(vd_target_digit, vd_pred_digit,
                                                    labels=[0, 1],
                                                    target_names=['normal', 'Event'],
                                                    output_dict=True)
                report_df = pd.DataFrame(report_dict)
                report_df['epoch'] = epoch
                report_df['state'] = 'valid'
                report_df['auroc'] = auroc
                report_df['auprc'] = auprc
                result_df_list.append(report_df)
                pd.concat(result_df_list, sort=True).to_csv(os.path.join(save_path, 'result_df.csv'))

                if auroc > best_auroc:
                    print('SAVED')
                    best_auroc = auroc
                    best_model_path = os.path.join(save_path, 'models',
                                                   '{}_{:.3f}.pth'.format(epoch, auroc))
                    torch.save({
                        'epoch': epoch,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optim.state_dict(),
                        'auroc': auroc,
                    }, best_model_path)

    '''----Test----'''
    with torch.no_grad():
        te_target_digit = []
        te_pred_digit = []
        te_pred_prob = []
        checkpoint = torch.load(best_model_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        model.eval()
        for idx, (X, y) in enumerate(te_dataloader):
            X, y = X.to(device), y.to(device)
            output = model(X)
            te_target_digit.extend(y.cpu().numpy().ravel().tolist())
            te_pred_digit.extend(output.max(dim=1)[1].cpu().numpy().tolist())
            te_pred_prob.extend(output[:, 1].detach().cpu().numpy().tolist())
        else:
            print('-----------< Test Result >----------------')
            print('[setting]: {}M_{}'.format(config['time_point'], '_'.join(config['waves'])))
            print('[Best Model]: ', best_model_path)
            auroc = roc_auc_score(te_target_digit, te_pred_prob)
            auprc = average_precision_score(te_target_digit, te_pred_prob)
            print('AUROC : {}'.format(auroc))
            print('AUPRC : {}'.format(auprc))
            print(classification_report(te_target_digit, te_pred_digit,
                                        labels=[0, 1],
                                        target_names=['normal', 'Event']))
            report_dict = classification_report(te_target_digit, te_pred_digit,
                                                labels=[0, 1],
                                                target_names=['normal', 'Event'],
                                                output_dict=True)
            report_df = pd.DataFrame(report_dict)
            report_df['epoch'] = epoch
            report_df['state'] = 'test'
            report_df['auroc'] = auroc
            report_df['auprc'] = auprc
            result_df_list.append(report_df)

            f1_score = report_dict['weighted avg']['f1-score']
            recall = report_dict['weighted avg']['recall']
            precision = report_dict['weighted avg']['precision']
            pd.concat(result_df_list, sort=True).to_csv(os.path.join(save_path, 'result_df.csv'))
            with open(os.path.join(
                    save_path,
                    '[{} {:.4f}]_[{}{:.4f}]_[{}{:.4f}]_[{}{:.4f}]_[{}{:.4f}].txt'.format(
                        'auroc', auroc, 'auprc', auprc, 'f1-score', f1_score,
                        'recall', recall, 'precision', precision)), 'w') as f:
                f.write(' ')

    del model
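# Note on the for/else blocks above (illustration, not from the source): the
# else clause of a for loop runs once the loop finishes without hitting break,
# which is how each per-epoch summary is attached to its dataloader pass.
for batch_idx in range(3):
    pass  # no break occurs
else:
    print('loop completed normally, summary code runs here')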
def mnist_cnn():
    start = time.time()
    groups = find_pairings.find_pairings()
    groups_a_l = {}
    acc = []
    losses = []

    for g in groups:
        print(f'Groups: {g}')
        digits = []
        for i in g:
            for k in i:
                digits.append(k)
        (tr_d, te_d) = prepare_dataset.prepare_dataset(digits)
        (train_d_set, test_d_set) = divide_dataset.divide_dataset(g, tr_d, te_d)
        loss = training.train(train_d_set)
        accuracy = testing.test(test_d_set)
        groups_a_l[str(g)] = [accuracy]
        groups_a_l[str(g)].append(loss)
        acc.append(accuracy)
        losses.append(loss)

    acc.sort()
    acc.reverse()
    losses.sort()

    for i in groups_a_l:
        if groups_a_l[i][0] == acc[0]:
            print(f'Highest accuracy: {acc[0]}, groups: {i}\n')
        if groups_a_l[i][0] == acc[1]:
            print(f'Second highest accuracy: {acc[1]}, groups: {i}\n')
        if groups_a_l[i][0] == acc[len(acc) - 1]:
            print(f'Lowest accuracy: {acc[len(acc) - 1]}, groups: {i}\n')
        if groups_a_l[i][0] == acc[len(acc) - 2]:
            print(f'Second lowest accuracy: {acc[len(acc) - 2]}, groups: {i}\n')

    for i in groups_a_l:
        if groups_a_l[i][1] == losses[0]:
            print(f'Lowest loss: {losses[0]}, groups: {i}\n')
        if groups_a_l[i][1] == losses[1]:
            print(f'Second lowest loss: {losses[1]}, groups: {i}\n')
        if groups_a_l[i][1] == losses[len(losses) - 1]:
            print(f'Highest loss: {losses[len(losses) - 1]}, groups: {i}\n')
        if groups_a_l[i][1] == losses[len(losses) - 2]:
            print(f'Second highest loss: {losses[len(losses) - 2]}, groups: {i}\n')

    finish = time.time()
    print(groups_a_l)
    print('Total seconds passed: %.3f' % (finish - start))

    x = losses
    y = []
    for i in x:
        for j in groups_a_l:
            if groups_a_l[j][1] == i:
                y.append(groups_a_l[j][0])
    plt.plot(x, y)
    plt.show()
def make_tree(num_epochs_l, num_epochs_n, leaf_groups, node_groups):
    start = time.time()

    leaf_PATHS = []
    for i in range(len(leaf_groups)):
        leaf_PATHS.append('./PATHS/leaf' + str(i + 1) + '_net.pth')

    leaves = []
    for i, leaf_group in enumerate(leaf_groups):
        (leaf_train_set, leaf_test_set) = prepare_leaf_dataset.prepare_leaf_dataset(leaf_group)
        leaves.append(Node.Node(leaf_train_set, leaf_test_set, None, None, leaf_PATHS[i], True))

    acc_leaves = []
    for leaf in leaves:
        leaf.train(num_epochs_l)
        acc_leaves.append(leaf.test())

    train_sets = []
    test_sets = []
    transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
    train_sets.append(torchvision.datasets.MNIST(root='./data', train=True,
                                                 download=True, transform=transform))
    test_sets.append(torchvision.datasets.MNIST(root='./data', train=False,
                                                download=True, transform=transform))
    for group in node_groups:
        (train_set, test_set) = prepare_dataset.prepare_dataset(group)
        train_sets.append(train_set)
        test_sets.append(test_set)

    node_PATHS = []
    for i in range(len(train_sets)):
        node_PATHS.append('./PATHS/node' + str(i + 1) + '_net.pth')

    train_sets.reverse()
    test_sets.reverse()
    node_PATHS.reverse()
    leaves.reverse()

    nodes = []
    i = 0
    for j in range(0, len(leaves), 2):
        nodes.append(Node.Node(train_sets[i], test_sets[i], leaves[j + 1], leaves[j],
                               node_PATHS[i], False))
        i += 1

    l = 0
    for k in range(i, len(train_sets), 1):
        nodes.append(Node.Node(train_sets[k], test_sets[k], nodes[l + 1], nodes[l],
                               node_PATHS[k], False))
        l += 2

    acc_nodes = []
    for node in nodes:
        node.train(num_epochs_n)
        acc_nodes.append(node.test())

    end = time.time()
    print(f'Seconds passed: {end - start}')
    print(f'Minutes passed: {(end - start) / 60}')
    print(f'Hours passed: {(end - start) / 3600}\n')

    return acc_leaves, acc_nodes
import os

from sklearn import cross_validation
from sklearn.datasets import load_files
from sklearn.metrics import classification_report
from classification import Classification

'''
Place the Classic dataset (data files) in a folder named "classic".
Create another folder "Classic_Dataset" with 4 subfolders named "med", "cran",
"cisi", "cacm". A hardcoded number (a, b, c, d) of files will be copied from
"classic" into the corresponding folders ("cacm", "med", "cisi", "cran").
'''

# Move dataset files into the required format
print "Preparing dataset..."
from prepare_dataset import prepare_dataset
prepare_dataset()

# Load the files/dataset
cwd = os.getcwd()
load_path = cwd + "/Classic_Dataset"
dataset = load_files(load_path, description=None, categories=None,
                     load_content=True, shuffle=False, encoding=None,
                     decode_error='strict', random_state=0)

# Class names and assigned numbers
class_names = list(dataset.target_names)
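# Inferred sketch (assumption): given the folder layout described in the
# docstring above, the imported prepare_dataset presumably copies a hardcoded
# number of files per class from "classic" into the Classic_Dataset subfolders.
# The per-class counts and the filename-prefix convention (e.g. "cacm.000001")
# are guesses, not taken from the original implementation.
import os
import shutil


def prepare_dataset_sketch(src="classic", dst="Classic_Dataset", counts=None):
    counts = counts or {"cacm": 100, "cisi": 100, "cran": 100, "med": 100}
    copied = dict((name, 0) for name in counts)
    for fname in sorted(os.listdir(src)):
        prefix = fname.split('.')[0]
        if prefix in counts and copied[prefix] < counts[prefix]:
            target_dir = os.path.join(dst, prefix)
            if not os.path.isdir(target_dir):
                os.makedirs(target_dir)
            shutil.copy(os.path.join(src, fname), os.path.join(target_dir, fname))
            copied[prefix] += 1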