def model_airports_individually(features_for_modeling, target_variable):
    """Fit one 3-neighbour KNN classifier per airline carrier and collect test scores.

    If ``weather_modeling_scores_test.csv`` exists in the working directory it
    is read and returned as-is; no modelling is done. This function does not
    write that file itself.

    Args:
        features_for_modeling: Column names used as model inputs. The list is
            not modified; ``"observation"`` and ``target_variable`` are added
            to a private copy.
        target_variable: Name of the column to predict.

    Returns:
        A DataFrame. When computed, it has one column per carrier and two rows
        indexed ``"accuracy"`` and ``"recall"`` (recall of the ``"True"``
        class), both taken from ``classification_report`` on the test split.
    """
    if os.path.exists("weather_modeling_scores_test.csv"):
        return pd.read_csv("weather_modeling_scores_test.csv")

    airline_carriers = [
        'WN', 'AA', 'AS', 'DL', 'F9', 'NK', 'OO', 'B6', 'UA', '9E', 'EV',
        'YX', 'YV', 'OH', 'MQ', 'VX', 'G4', 'HA'
    ]

    # Build the column selection on a copy: the original used ``+=`` on the
    # argument, which extended the caller's list on every call.
    selected_columns = list(features_for_modeling)
    selected_columns += ["observation", target_variable]

    # Everything up to the carrier filter is identical for every airline, so
    # it is prepared once instead of once per loop iteration.
    merged_df = wrangle.merge_flight_weather_data()
    merged_df = preprocessing.to_date_time(merged_df)
    merged_df = preprocessing.create_new_features(merged_df)
    merged_df = preprocessing.create_target_variable(merged_df)

    # add weather features
    merged_df["avg_weather_delay"] = merged_df.groupby(
        "Type").arr_delay.transform("mean")
    merged_df["type_severity"] = merged_df.Type + "_" + merged_df.Severity
    merged_df["avg_type_severity"] = merged_df.groupby(
        "type_severity").arr_delay.transform("mean")

    score = pd.DataFrame()
    for airline in airline_carriers:
        carrier_df = merged_df[(merged_df.op_carrier == airline)]
        carrier_df = carrier_df[selected_columns]
        carrier_df = carrier_df.set_index("observation")

        train, validate, test = preprocessing.split_data(carrier_df)
        X_train = train.drop(columns=target_variable)
        y_train = train[target_variable]
        X_validate = validate.drop(columns=target_variable)
        X_test = test.drop(columns=target_variable)
        y_test = test[target_variable]

        scaler, train_scaled, validate_scaled, test_scaled = preprocessing.min_max_scaler(
            X_train, X_validate, X_test)

        # Only the fitted estimator is needed; the predictions returned by
        # run_knn are replaced by predictions on the test split.
        knn, _ = run_knn(train_scaled, y_train, 3)
        y_pred = knn.predict(test_scaled)

        report = classification_report(y_test, y_pred, output_dict=True)
        report = pd.DataFrame.from_dict(report)
        actual_score = pd.DataFrame(
            {
                airline:
                [report.accuracy.values[0], report["True"].loc["recall"]]
            },
            index=["accuracy", "recall"])
        score = pd.concat([score, actual_score], axis=1)
    return score
def k_fold(manager, k, data, weights=0):
    """
    Run an ordered k-fold validation, retraining from scratch on every fold.

    manager: object providing ``params`` and ``get_pred``; its ``scaler`` and
        ``model`` attributes are overwritten on each fold
    k: Number of folds
    data: dataset handed to ``prp.split_data``; ``data.count()[0]`` is used
        as the number of samples
    weights: accepted but not used by this function

    Returns:
        Tuple ``(cms, histories)``: one confusion matrix and one training
        history per fold
    """
    fold_fraction = 1 / k
    sample_count = data.count()[0]

    matrices, training_logs = [], []
    for fold in range(k):
        print("Fold " + str(fold + 1) + '/' + str(k))

        # The test window of this fold starts at the fold's share of the data.
        offset = sample_count * fold // k
        split = prp.split_data(data,
                               test_size=fold_fraction,
                               start_index=offset,
                               ordered=True)

        # Run a complete training
        train_dataset, manager.scaler = prp.scale_and_format(
            split[0], split[1], split[2], split[3])
        manager.model, fold_history = mlp.run_training(
            train_dataset,
            layers_sizes=manager.params["layers_sizes"],
            layers_activations=manager.params["layers_activations"],
            epochs_nb=manager.params["epochs_nb"],
            batch_size=manager.params["batch_size"])

        predictions = manager.get_pred(split[1])
        matrices.append(
            confusion_matrix(split[3]["label"], predictions["label"]))
        training_logs.append(fold_history)

    return matrices, training_logs
def test_split_data(self):
    """
    Test shape of train and validation sets

    Splits the mock token vectors and labels with ``split_data(..., 2, 2)``
    and expects (2, 7) sample arrays and (2,) label arrays on both sides.
    """
    X_train, y_train, X_val, y_val = split_data(self.mock_token_vectors,
                                                self.mock_labels, 2, 2)
    self.assertEqual((2, 7), X_train.shape,
                     "Incorrect shape of training samples after split")
    self.assertEqual((2, 7), X_val.shape,
                     "Incorrect shape of validation samples after split")
    self.assertEqual((2, ), y_train.shape,
                     "Incorrect shape of training labels after split")
    # Failure message previously read "shpae".
    self.assertEqual((2, ), y_val.shape,
                     "Incorrect shape of validation labels after split")
def run_training(self, dataset=None, weight=0, loss_function = mlp.jaccard_distance, verbose=1):
    """
    Train a new neural network according to the manager's parameters.

    Stores the fitted scaler in ``self.scaler`` and the trained network in
    ``self.model``.

    dataset: DataFrame whose columns are features and lines are train samples
    weight: when truthy, weights are computed with ``prp.compute_weights``
        from the third element of the split and forwarded to the training
    loss_function: loss forwarded to ``mlp.run_training``
    verbose: verbosity forwarded to ``mlp.run_training``

    Returns:
        Training history
    """
    # Prepare data
    parts = prp.split_data(dataset, test_size=self.params["test_size"])
    train_dataset, self.scaler = prp.scale_and_format(
        *(parts[position] for position in range(4)))

    weight_list = prp.compute_weights(parts[2]) if weight else None

    # Train
    training_options = {
        "layers_sizes": self.params["layers_sizes"],
        "layers_activations": self.params["layers_activations"],
        "epochs_nb": self.params["epochs_nb"],
        "batch_size": self.params["batch_size"],
        "weight_list": weight_list,
        "loss_function": loss_function,
        "verbose": verbose,
    }
    self.model, history = mlp.run_training(train_dataset, **training_options)
    return history
def test_train_model(self):
    """
    Test if function returns trained model

    Builds a model from the mock embedding matrix, trains it on the mock
    IMDB split and checks that both elements of the result are present.
    """
    texts, labels = preprocess_labels(data_dir_path="data/mock_aclImdb",
                                      dataset="train")
    vectorized_texts, _word_index = tokenize_data(texts)
    mock_X_train, mock_y_train, mock_X_val, mock_y_val = split_data(
        vectorized_texts, labels)

    # The context manager closes the fixture file even if unpickling fails;
    # the original passed a bare open() to pickle.load and never closed it.
    with open("models/mock_glove.6B/mock_embedding_matrix.p",
              "rb") as matrix_file:
        mock_embedding_matrix = pickle.load(matrix_file)

    mock_model = build_model(mock_embedding_matrix)
    mock_trained_model = train_model(mock_model,
                                     (mock_X_train, mock_y_train),
                                     (mock_X_val, mock_y_val))
    self.assertIsNotNone(mock_trained_model[1], "no model trained")
    self.assertIsNotNone(mock_trained_model[0],
                         "history dict doesn't exist")
from model import TempNN, train_model
from preprocessing import (split_data, moving_average, scaler,
                           create_sequences, train_test, generated)

# Script entry point: load the half-hourly KWH column, clean it, cut it into
# batches, smooth and normalise each batch, then prepare per-batch tensors.
if __name__ == "__main__":
    values_dt = pd.read_csv(
        '../temp_ds/Power-Networks-LCL-June2015(withAcornGps)v2_2.csv',
        delimiter=',')
    values_dt = np.asarray(values_dt['KWH/hh (per half hour) '].dropna(
        how='any', axis=0))
    # Entries holding the literal string 'Null' are replaced by -1 so the
    # whole array can be cast to float32.
    values_dt[np.where(values_dt == 'Null')] = -1
    values_dt = values_dt.astype(np.float32)
    splited = split_data(values_dt, 50)  # Slice into 50 batches
    avg_splited = [
        moving_average(splited[i], 20) for i in range(len(splited))
    ]  # Average
    scalers_data = np.asarray([
        scaler(avg_splited[i]) for i in range(len(avg_splited))
    ])  # Normalise
    datas = scalers_data[:, 1]  # Data (batches)
    scalers = scalers_data[:, 0]  # Scalers
    model = TempNN(n_features=1, n_hidden=64, seq_len=30, n_layers=1)
    for i, data in enumerate(datas):
        print("Batch №%d" % i)
        X_train, y_train, X_test, y_test = train_test(data)
        # Targets are reshaped into a single column.
        y_train = torch.reshape(y_train, (-1, 1))
        y_test = torch.reshape(y_test, (-1, 1))
def __init__(self, csv_data_loc, images_dir, test__split_percentage):
    """Load the CSV data, convert the images and split them for training.

    csv_data_loc: location handed to ``get_data_from_csv``
    images_dir: directory handed to ``convert_images_to_numpy_array``
    test__split_percentage: third argument handed to ``split_data``

    A RuntimeError raised by any of the three steps is reported with
    ``print`` and then re-raised.
    """
    try:
        raw_records = get_data_from_csv(csv_data_loc)
        self._raw_data = raw_records

        image_arrays = convert_images_to_numpy_array(raw_records, images_dir)
        self._converted_data = image_arrays

        partitions = split_data(image_arrays, raw_records,
                                test__split_percentage)
        (self._training_x, self._training_y, self._training_val_x,
         self._training_val_y) = partitions
    except RuntimeError:
        print(failed_to_load_data_for_class)
        raise
import numpy as np import preprocessing as pre # --- LSTM nao balanceada tomando media em 10 intervalos --- data_num = 500 num_int = 2560 #Lendo todos os dados do experimento X, y = pre.load_3dim('dataset/', data_num, num_int) #Pegando a media em um numero de 10 intervalos para cada componente X = pre.med_intervalo_3dim(X, 10) #Remodelado as dimensões de y para ser aceito na dummy y = np.reshape(y, (y.shape[0], -1)) #Passando y para dummy variables y_dummy = pre.dummy_variables(y) #Separando em conjunto de treino e teste (pego de forma aleatoria, aleatorizando também as variáveis dependentes) X_train, X_test, y_train, y_test = pre.split_data(X, y_dummy, 0.2, None) #Padronizando dados X_train, X_test = pre.standardize_data(X_train, X_test) #Implementando a LSTM from keras.models import Sequential from keras.layers import LSTM from keras.layers import Embedding from keras.layers import Dense #Dimensao da camada invisivel hidden_size = 32 #Criando obbjeto da rede sl_model = Sequential() #gerando uma camada do tipo LSTM que recebe o numero de saidas, o tipo de funcao de ativacao geralmente a tangente hiperbolica # o quanto irei desconsiderar dos dados de entrada nessa camada e quanto irei desconsideradar do estado de recorrencia anterior sl_model.add( LSTM(units=hidden_size,
if inject_anom: if (np.random.binomial(1, 0.01))>0.5: y += np.random.uniform(-0.75,0.75) yield t, y ''' Createe training data ''' signal_generator = generate_signal(inject_anom=False) ''' Generate training data with ength of 1000''' data = [] for i in range(1000): t, sig = next(signal_generator) data.append(sig) train, test = preprocessing.split_data(data) timesteps = 5 X_train, Y_train = preprocessing.arange_data_for_sequence_model(data, timesteps, lookforward=1) def define_model(n_features, timesteps, batch_size=None, stateful=False, forward_pred=1): """ Defines and builds a model. This function can build both stateful and none stateful model. ------------------------------------------------------------------------------------------- args: n_features (int) - Number of features in the data timesteps (int) - number of look back timesteps the model uses to generate predictions batch_size (int) - model training batch size stateful (bool) - forward_pred (int) - number of forward steps to forecast
def _str_to_bool(value):
    """Parse a command-line boolean such as ``True``, ``false``, ``1`` or ``no``."""
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    if lowered in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got %r' % value)


def main():
    """Train and evaluate the sample classifier from the command line.

    Parses the command-line options, creates a time-stamped output directory
    with a ``classifier.log`` file, loads the sample matrix and label file,
    trains ``utils.Net`` for ``--max_epoch`` epochs and, when
    ``--plot_results`` is true, writes the accuracy and confusion plots and
    logs the weighted F1 score.

    Exits through ``sys.exit`` when ``--continuous_discrete`` is neither
    ``continuous`` nor ``discrete``.
    """
    parser = argparse.ArgumentParser(description='classifier')
    parser.add_argument('--sample_file', type=str, default='lung.emx.txt', help="the name of the GEM organized by samples (columns) by genes (rows)")
    parser.add_argument('--label_file', type=str, default='sample_condition.txt', help="name of the label file: two columns that maps the sample to the label")
    parser.add_argument('--output_name', type=str, default='tissue-run-1', help="name of the output directory to store the output files")
    #parser.add_argument('--overwrite_output', type=bool, default=False, help="overwrite the output directory file if it already exists")
    parser.add_argument('--batch_size', type=int, default=16, help="size of batches to split data")
    parser.add_argument('--max_epoch', type=int, default=100, help="number of passes through a dataset")
    parser.add_argument('--learning_rate', type=float, default=0.001, help="controls the rate at which the weights of the model update")
    # argparse %-formats help strings, so a literal percent sign must be
    # written as '%%'; the unescaped form made --help fail.
    parser.add_argument('--test_split', type=float, default=0.3, help="percentage of test data, the train data will be the remaining data. 30%% -> 0.3")
    parser.add_argument('--continuous_discrete', type=str, default='continuous', help="type of data in the sample file, typically RNA will be continous and DNA will be discrete")
    # type=bool would turn every non-empty string (including "False") into
    # True, so boolean flags are parsed explicitly.
    parser.add_argument('--plot_results', type=_str_to_bool, default=True, help="plots the sample distribution, training/test accuracy/loss, and confusion matrix")
    parser.add_argument('--use_gpu', type=_str_to_bool, default=False, help="true to use a gpu, false to use the cpu - if the node does not have a gpu then it will use the cpu")
    args = parser.parse_args()

    #If data is discrete, data should only range between 0-3
    #if args.continuous_discrete == "discrete":
        #args.input_num_classes = 4

    # Initialize file paths and create output folder
    LABEL_FILE = os.path.join(INPUT_DIR, args.label_file)
    SAMPLE_FILE = os.path.join(INPUT_DIR, args.sample_file)
    OUTPUT_DIR_FINAL = os.path.join(OUTPUT_DIR, args.output_name + "-" + str(datetime.today().strftime('%Y-%m-%d-%H:%M')))
    os.makedirs(OUTPUT_DIR_FINAL, exist_ok=True)

    # Create log file to keep track of model parameters
    logging.basicConfig(filename=os.path.join(OUTPUT_DIR_FINAL,'classifier.log'),
                        filemode='w',
                        format='%(message)s',
                        level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info('Classifer log file for ' + args.sample_file + ' - Started on ' + str(datetime.today().strftime('%Y-%m-%d-%H:%M')) + '\n')
    logger.info('Batch size: %d', args.batch_size)
    logger.info('Number of epochs: %d', args.max_epoch)
    logger.info('Learning Rate: %f', args.learning_rate)
    logger.info('Sample filename: ' + args.sample_file)
    logger.info('Output directory: ' + args.output_name)

    if args.continuous_discrete != 'continuous' and args.continuous_discrete != 'discrete':
        logger.error("ERROR: check that the continuous_discrete argument is spelled correctly.")
        logger.error(" only continuous or discrete data can be processed.")
        sys.exit("\nCommand line argument error. Please check the log file.\n")

    # Intialize gpu usage if desired
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda and args.use_gpu else "cpu")
    # The loaders honour --batch_size; they were previously fixed at 16
    # regardless of the value that was logged and passed to train()/test().
    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.batch_size}
    if use_cuda:
        # NOTE(review): these options (including shuffling) are only applied
        # when CUDA is available, independent of --use_gpu - confirm intent.
        cuda_kwargs = {'num_workers': 1,
                       'pin_memory': True,
                       'shuffle': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)

    # Load matrix, labels/weights, and number of samples
    column_names = ("sample", "label")
    matrix_df = pd.read_csv(SAMPLE_FILE, sep='\t', index_col=[0])
    labels_df = pd.read_csv(LABEL_FILE, names=column_names, delim_whitespace=True, header=None)

    # Error checking for same number of samples in both files and samples are unique
    samples_unique = set(labels_df.iloc[:,0])
    assert len(labels_df) == len(matrix_df.columns)
    assert len(labels_df) == len(samples_unique)

    labels, class_weights = preprocessing.labels_and_weights(labels_df)
    args.output_num_classes = len(labels)
    is_binary = False
    if len(labels) == 2:
        is_binary = True
        # NOTE(review): the attribute name below carries an extra trailing
        # 's', so args.output_num_classes stays 2 for binary data. Left
        # unchanged because correcting it alters the model's output width
        # and the value passed to split_data - confirm against train()/test().
        args.output_num_classess = 1

    # Define model paramters
    batch_size = args.batch_size
    max_epoch = args.max_epoch
    learning_rate = args.learning_rate #5e-4
    num_features = len(matrix_df.index)

    # Setup model
    model = utils.Net(input_seq_length=num_features,
                      output_num_classes=args.output_num_classes).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1)
    if is_binary:
        loss_fn = torch.nn.BCEWithLogitsLoss()
    else:
        loss_fn = torch.nn.CrossEntropyLoss()#(weight=class_weights)

    logger.info('Number of samples: %d\n', len(labels_df))
    logger.info('Labels: ')
    for i in range(len(labels)):
        logger.info(' %d - %s', i, labels[i])

    # Replace missing data with the global minimum of the dataset
    val_min = np.nanmin(matrix_df)
    matrix_df.fillna(val_min, inplace=True)

    # Transposing matrix to align with label file
    matrix_transposed_df = matrix_df.T

    # Create density and tsne plot
    graphs = Plotter(OUTPUT_DIR_FINAL)
    graphs.density(matrix_df)
    graphs.tsne(matrix_transposed_df, labels_df, labels, title=args.sample_file)

    train_data, test_data = preprocessing.split_data(matrix_transposed_df, labels_df, args.test_split, args.output_num_classes)

    # Convert tuple of df's to tuple of np's
    # Allows the dataset class to access w/ data[][] instead of data[].iloc[]
    train_data_np = (train_data[0].values, train_data[1].values)
    test_data_np = (test_data[0].values, test_data[1].values)

    train_dataset = dataset.Dataset(train_data_np)
    test_dataset = dataset.Dataset(test_data_np)
    train_generator = data.DataLoader(train_dataset, **train_kwargs, drop_last=False)
    test_generator = data.DataLoader(test_dataset, **test_kwargs, drop_last=False)
    # drop_last=True would drop the last batch if the sample size is not divisible by the batch size

    logger.info('\nTraining size: %d \nTesting size: %d\n', len(train_dataset), len(test_dataset))

    # Create variables to store accuracy and loss
    loss_meter = utils.AverageMeter()
    loss_meter.reset()
    train_stats = pd.DataFrame([], columns=['accuracy', 'loss'])
    test_stats = pd.DataFrame([], columns=['accuracy', 'loss'])

    # Train and test the model
    for epoch in range(max_epoch):
        train_stats = train(model, device, is_binary, train_generator, optimizer, loss_fn, batch_size, loss_meter, train_stats)
        test_stats = test(model, device, is_binary, test_generator, loss_fn, epoch, batch_size, loss_meter, test_stats, train_stats, logger)
        scheduler.step()

    # Training finished - Below is used for testing the network, plots and saving results
    if args.plot_results:
        y_predict_list = []
        y_target_list = []
        y_predict_list, y_target_list = forward(model, device, is_binary, test_generator, y_predict_list, y_target_list)

        graphs.accuracy(train_stats, test_stats, graphs_title=args.sample_file)
        graphs.confusion(y_predict_list, y_target_list, labels, cm_title=args.sample_file)
        logger.info("\n\nf1 score: %0.2f" % (f1_score(y_target_list, y_predict_list, average="weighted")))

    # With --max_epoch 0 the loop never runs and `epoch` is unbound, so the
    # final accuracy is only reported when at least one epoch completed.
    if max_epoch > 0:
        logger.info('\nFinal Accuracy: %2.3f', test_stats.iloc[epoch]['accuracy'])
    logger.info('\nFinished at ' + str(datetime.today().strftime('%Y-%m-%d-%H:%M')))
def train_GAN(params):
    """Train the generator/discriminator/classifier trio and report results.

    For every run in ``range(params['runs'])`` the networks are loaded via
    ``network.load_GAN``, trained for ``params['epochs']`` epochs on the
    labelled loader (resuming from ``params['start_run']`` /
    ``params['start_epoch']``), validated every ``params['save_step']``
    epochs and checkpointed. After all runs the accuracy curves and the
    confusion matrices are plotted, results are logged and, when
    ``params['prediction']`` is set, predictions are written to
    ``<network.S_PATH><name>_predictions.txt``.

    Args:
        params: Configuration dictionary. It is also mutated:
            ``start_run`` and ``start_epoch`` are updated as training
            progresses and persisted with ``network.save_Parameter``.

    Returns:
        None.
    """
    # -------------------
    #  Parameters
    # -------------------

    log(str(params), name=params['log_name'])

    # Clear remaining model
    if params['ratio_L'] < 1.0 or params['ratio_U'] < 1.0:
        network.clear(params['name'] + '_R' + str(params['start_run']))

    plt.close('all')

    # -------------------
    #  CUDA
    # -------------------

    cuda = torch.cuda.is_available()
    G_Loss = torch.nn.BCELoss()
    D_Loss = torch.nn.BCELoss()
    C_Loss = torch.nn.BCELoss()

    if cuda:
        G_Loss.cuda()
        D_Loss.cuda()
        C_Loss.cuda()
        floatTensor = torch.cuda.FloatTensor
        log("CUDA Training.", name=params['log_name'])
        network.clear_cache()
    else:
        floatTensor = torch.FloatTensor
        log("CPU Training.", name=params['log_name'])

    # -------------------
    #  Data scaling
    # -------------------
    # XTL ... Original labelled data
    # XTU ... Original unlabelled data
    # XTV ... Original validation data
    # XL  ... Labelled data
    # XU  ... Unlabelled data
    # XV  ... Validation data

    dset_L = params['dset_L']
    dset_U = params['dset_U']
    dset_V = params['dset_V']

    if dset_L == dset_U:
        X, Y = pp.get_data(params, dset_L)
        XTL, XTU, YTL, YTU = pp.split_data(X, Y)
    else:
        XTL, YTL = pp.get_data(params, dset_L)
        XTU, YTU = pp.get_data(params, dset_U)

    if dset_V is None:
        XTV, YTV = XTU, YTU
    else:
        XTV, YTV = pp.get_data(params, dset_V)

    XTL = pp.scale_minmax(XTL)
    XTU = pp.scale_minmax(XTU)
    XTV = pp.scale_minmax(XTV)

    if params['ratio_V'] < 1.0:
        # The validation subset is drawn with ratio_V, matching both the
        # guard above and the log message; ratio_L was used here before.
        XTV, YTV = pp.select_random(XTV, YTV, params['ratio_V'])
        log("Selected %s of validation samples." %
            (format(params['ratio_V'], '0.2f')),
            name=params['log_name'])

    DL_V = pp.get_dataloader(params, XTV, YTV, batch_size=1024)

    # -------------------
    #  Load accuracy
    # -------------------

    mat_accuracy_G, mat_accuracy_D, mat_accuracy_C = network.load_Acc(params)

    if (params['R_active']):
        mat_accuracy_R = network.load_R_Acc(params)

    # -------------------
    #  Final prediction
    # -------------------

    if (params['prediction']):
        Y_pred = torch.zeros(XTU.shape[0], 8)

    # -------------------
    #  Start Training
    # -------------------

    # Accumulators over all runs: validation labels, classifier predictions
    # and reference-classifier predictions.
    YF = None
    PF = None
    RF = None

    for run in range(params['runs']):

        # -------------------
        #  Labelled Data
        # -------------------

        XL, YL = XTL, YTL

        if params['ratio_L'] < 1.0:
            XL, YL = pp.select_random(XL, YL, params['ratio_L'])
            log("Selected %s of labelled samples." %
                (format(params['ratio_L'], '0.2f')),
                name=params['log_name'])

        count_L = YL.shape[0]
        log("Number of labelled samples = %d." % (count_L),
            name=params['log_name'])

        DL_L = pp.get_dataloader(params, XL, YL)

        # -------------------
        #  Unlabelled Data
        # -------------------

        XU, YU = XTU, YTU

        if params['ratio_U'] < 1.0:
            XU, YU = pp.select_random(XU, YU, params['ratio_U'])
            log("Selected %s of unlabelled samples." %
                (format(params['ratio_U'], '0.2f')),
                name=params['log_name'])

        log("Number of unlabelled samples = %d." % (XU.shape[0]),
            name=params['log_name'])

        DL_U_iter = pp.get_perm_dataloader(params, XU, YU)

        # -------------------
        #  Networks
        # -------------------

        G, D, C = network.load_GAN(run, params)

        if (params['R_active']):
            R = network.load_Ref(run, params)

        # -------------------
        #  Optimizers
        # -------------------

        optimizer_G = torch.optim.Adam(G.parameters(),
                                       lr=params['GLR'],
                                       betas=(params['GB1'], params['GB2']))
        optimizer_D = torch.optim.Adam(D.parameters(),
                                       lr=params['DLR'],
                                       betas=(params['DB1'], params['DB2']))
        optimizer_C = torch.optim.Adam(C.parameters(),
                                       lr=params['CLR'],
                                       betas=(params['CB1'], params['CB2']))

        if (params['R_active']):
            optimizer_R = torch.optim.Adam(R.parameters(),
                                           lr=params['CLR'],
                                           betas=(params['CB1'],
                                                  params['CB2']))

        # -------------------
        #  Training
        # -------------------

        if run >= params['start_run']:

            if params['oversampling']:
                # NOTE(review): DL_L was built before this point and is not
                # rebuilt, so the oversampled XL/YL do not appear to reach
                # the training loop - confirm whether that is intended.
                XL, YL = pp.over_sampling(params, XL, YL)
                log("Oversampling: created %d new labelled samples." %
                    (XL.shape[0] - count_L),
                    name=params['log_name'])

            for epoch in range(params['epochs']):

                # Jump to start epoch
                if run == params['start_run']:
                    if epoch < params['start_epoch']:
                        continue

                running_loss_G = 0.0
                running_loss_D = 0.0
                running_loss_C = 0.0

                # X1, P1 - Labelled Data, predicted Labels (C) | Regular training of classifier
                # W1 = (X1, Y1), A1 - Labelled Data, actual Labels, predicted Authenticity (D) | Real samples
                # W2 = (X2, Y2), A2 - Unlabelled Data, predicted Labels (C), predicted Authenticity (D) | Real data with fake labels
                # W3 = (X3, Y3), A3 - Synthetic Data (G), actual Labels, predicted Authenticity (D) | Fake data with real labels
                # W4 = (X4, Y4), A4 - Unlabelled Data, predicted Labels (C), predicted Authenticity (D) | Fake positive to prevent overfitting
                # XV, YV, PV - Validation Data, actual Labels, predicted Labels (C) | Validation samples
                # R1, F2, F3, R4 - Real/Fake Labels

                for i, data in enumerate(DL_L, 1):

                    loss_G = []
                    loss_D = []
                    loss_C = []

                    # -------------------
                    #  Train the classifier on real samples
                    # -------------------
                    X1, Y1 = data
                    W1 = torch.cat((X1, Y1), dim=1)
                    R1 = floatTensor(W1.shape[0], 1).fill_(1.0)

                    if params['C_basic_train']:
                        optimizer_C.zero_grad()
                        P1 = C(X1)
                        loss = C_Loss(P1, Y1)
                        loss_C.append(loss)
                        loss.backward()
                        optimizer_C.step()

                    if params['R_active']:
                        optimizer_R.zero_grad()
                        PR = R(X1)
                        loss = C_Loss(PR, Y1)
                        loss.backward()
                        optimizer_R.step()

                    # -------------------
                    #  Train the discriminator to label real samples
                    # -------------------
                    optimizer_D.zero_grad()
                    A1 = D(W1)
                    loss = D_Loss(A1, R1)
                    loss_D.append(loss)
                    loss.backward()
                    optimizer_D.step()

                    # -------------------
                    #  Classify unlabelled data
                    # -------------------
                    optimizer_C.zero_grad()
                    X2 = DL_U_iter.get_next()[0]
                    Y2 = C(X2)
                    W2 = torch.cat((X2, Y2), dim=1)

                    # -------------------
                    #  Train the classifier to label unlabelled samples
                    # -------------------
                    A2 = D(W2)
                    R2 = floatTensor(W2.shape[0], 1).fill_(1.0)
                    loss = C_Loss(A2, R2)
                    loss_C.append(loss)
                    loss.backward()
                    optimizer_C.step()

                    # -------------------
                    #  Train the discriminator to label predicted samples
                    # -------------------
                    optimizer_D.zero_grad()
                    A2 = D(W2.detach())
                    F2 = floatTensor(W2.shape[0], 1).fill_(0.0)
                    loss = D_Loss(A2, F2)
                    loss_D.append(loss)
                    loss.backward()
                    optimizer_D.step()

                    # -------------------
                    #  Train the discriminator to label fake positive samples
                    # -------------------
                    X4 = DL_U_iter.get_next()[0]
                    Y4 = C(X4)
                    W4 = torch.cat((X4, Y4), dim=1)

                    optimizer_D.zero_grad()
                    A4 = D(W4)
                    R4 = floatTensor(W4.shape[0], 1).fill_(1.0)
                    loss = D_Loss(A4, R4)
                    loss_D.append(loss)
                    loss.backward()
                    optimizer_D.step()

                    # -------------------
                    #  Create Synthetic Data
                    # -------------------
                    optimizer_G.zero_grad()
                    if params['G_label_sample']:
                        # Selected Labels from a uniform distribution of available labels
                        Y3 = floatTensor(
                            pp.get_one_hot_labels(
                                params=params,
                                num=Y1.shape[0] * params['G_label_factor']))
                    else:
                        # Select labels from current training batch
                        Y3 = torch.cat(
                            ([Y1 for _ in range(params['G_label_factor'])]),
                            dim=0)

                    Z = floatTensor(
                        np.random.normal(
                            0, 1, (Y3.shape[0], params['noise_shape'])))
                    I3 = torch.cat((Z, Y3), dim=1)
                    X3 = G(I3)
                    W3 = torch.cat((X3, Y3), dim=1)

                    # -------------------
                    #  Train the generator to fool the discriminator
                    # -------------------
                    A3 = D(W3)
                    R3 = floatTensor(W3.shape[0], 1).fill_(1.0)
                    loss = G_Loss(A3, R3)
                    loss_G.append(loss)
                    loss.backward()
                    optimizer_G.step()

                    # -------------------
                    #  Train the discriminator to label synthetic samples
                    # -------------------
                    optimizer_D.zero_grad()
                    A3 = D(W3.detach())
                    F3 = floatTensor(W3.shape[0], 1).fill_(0.0)
                    loss = D_Loss(A3, F3)
                    loss_D.append(loss)
                    loss.backward()
                    optimizer_D.step()

                    # -------------------
                    #  Calculate overall loss
                    # -------------------
                    running_loss_G += np.mean(
                        [loss.item() for loss in loss_G])
                    running_loss_D += np.mean(
                        [loss.item() for loss in loss_D])
                    running_loss_C += np.mean(
                        [loss.item() for loss in loss_C])

                # -------------------
                #  Post Epoch
                # -------------------

                # `i` is the number of batches seen in this epoch.
                logString = "[Run %d/%d] [Epoch %d/%d] [G loss: %f] [D loss: %f] [C loss: %f]" % (
                    run + 1, params['runs'], epoch + 1, params['epochs'],
                    running_loss_G / (i), running_loss_D / (i),
                    running_loss_C / (i))
                log(logString, save=False, name=params['log_name'])

                if (epoch + 1) % params['save_step'] == 0:
                    # Position of this checkpoint in the accuracy matrices.
                    idx = run, int(epoch / params['save_step']) + 1

                    acc_D_real = []
                    acc_D_vs_C = []
                    acc_D_vs_G = []
                    acc_C_real = []

                    for data in DL_V:

                        XV, YV = data

                        # Predict labels
                        PV = C(XV)

                        if params['R_active']:
                            PR = R(XV)
                            mat_accuracy_R[idx] = get_accuracy(PR, YV)
                            network.save_Ref(params['name'], run, R)
                            network.save_R_Acc(params, mat_accuracy_R)

                        # Generate Synthetic Data
                        Z = floatTensor(
                            np.random.normal(
                                0, 1, (YV.shape[0], params['noise_shape'])))
                        IV = torch.cat((Z, YV), dim=1)
                        XG = G(IV)

                        # Estimate Discriminator Accuracy
                        WV1 = torch.cat((XV, YV), dim=1)
                        WV2 = torch.cat((XV, PV), dim=1)
                        WV3 = torch.cat((XG, YV), dim=1)
                        RV1 = floatTensor(WV1.shape[0], 1).fill_(1.0)
                        FV2 = floatTensor(WV2.shape[0], 1).fill_(0.0)
                        FV3 = floatTensor(WV3.shape[0], 1).fill_(0.0)

                        AV1 = D(WV1)
                        AV2 = D(WV2)
                        AV3 = D(WV3)

                        acc_D_real.append(get_accuracy_binary(AV1, RV1))
                        acc_D_vs_C.append(get_accuracy_binary(AV2, FV2))
                        acc_D_vs_G.append(get_accuracy_binary(AV3, FV3))

                        acc_C_real.append(get_accuracy(PV, YV))

                    acc_D_real = np.mean(acc_D_real)
                    acc_D_vs_C = np.mean(acc_D_vs_C)
                    acc_D_vs_G = np.mean(acc_D_vs_G)
                    acc_D = .5 * acc_D_real + .25 * acc_D_vs_G + .25 * acc_D_vs_C
                    mat_accuracy_D[idx] = acc_D

                    acc_C_real = np.mean(acc_C_real)
                    acc_C_vs_D = 1.0 - acc_D_vs_C
                    acc_C = .5 * acc_C_real + .5 * acc_C_vs_D
                    # The stored classifier accuracy is the accuracy on real
                    # validation labels; the blended acc_C is only logged.
                    mat_accuracy_C[idx] = acc_C_real

                    acc_G = 1.0 - acc_D_vs_G
                    mat_accuracy_G[idx] = acc_G

                    logString = "[Run %d/%d] [Epoch %d/%d] [G acc: %f] [D acc: %f | vs Real: %f | vs G: %f | vs C: %f] [C acc: %f | vs Real: %f | vs D: %f]" % (
                        run + 1, params['runs'], epoch + 1, params['epochs'],
                        acc_G, acc_D, acc_D_real, acc_D_vs_G, acc_D_vs_C,
                        acc_C, acc_C_real, acc_C_vs_D)
                    log(logString, save=True, name=params['log_name'])

                    network.save_GAN(params['name'], run, G, D, C)
                    params['start_epoch'] = epoch + 1
                    network.save_Parameter(params)
                    network.save_Acc(params, mat_accuracy_G, mat_accuracy_D,
                                     mat_accuracy_C)

            # End of Training Run
            params['start_run'] = run + 1
            params['start_epoch'] = 0
            network.save_Parameter(params)

        # -------------------
        #  Post Run
        # -------------------

        acc_C_real = []

        for data in DL_V:

            XV, YV = data

            # # Generate Synthetic Data
            # Z = floatTensor(np.random.normal(0, 1, (YV.shape[0], params['noise_shape'])))
            # IV = torch.cat((Z,YV),dim=1)
            # XG = G(IV)

            # Classify Validation data
            PC = C(XV)
            acc_C_real.append(get_accuracy(PC, YV))

            # Identity checks: `== None` on a tensor is an element-wise
            # comparison request, not a test for "not yet assigned".
            if params['R_active']:
                if RF is None:
                    RF = R(XV)
                else:
                    RF = torch.cat((RF, R(XV).detach()), 0)

            if YF is None:
                YF = YV
                PF = PC
            else:
                YF = torch.cat((YF, YV), 0)
                PF = torch.cat((PF, PC), 0)

        mat_accuracy_C[run] = np.mean(acc_C_real)

        # -------------------
        #  Final prediction
        # -------------------

        if (params['prediction']):
            C.hard = False
            XP = pp.get_tensor(XTU, None)[0]
            YP = C(XP)
            Y_pred += YP.cpu().detach()
            C.hard = True

    # -------------------
    #  Post Training
    # -------------------

    timeline = np.arange(0, params['epochs'] + 1, params['save_step'])

    # -------------------
    #  Plot Accuracy
    # -------------------

    acc_G = np.mean(mat_accuracy_G, axis=0)
    std_G = np.std(mat_accuracy_G, axis=0)
    acc_D = np.mean(mat_accuracy_D, axis=0)
    std_D = np.std(mat_accuracy_D, axis=0)
    acc_C = np.mean(mat_accuracy_C, axis=0)
    std_C = np.std(mat_accuracy_C, axis=0)
    if params['R_active']:
        acc_R = np.mean(mat_accuracy_R, axis=0)

    fig, ax = plt.subplots()

    legend = []
    cmap = plt.get_cmap('gnuplot')
    indices = np.linspace(0, cmap.N, 7)
    colors = [cmap(int(i)) for i in indices]

    ax.plot(timeline, acc_C, c=colors[0], linestyle='solid')
    ax.fill_between(timeline,
                    acc_C - std_C,
                    acc_C + std_C,
                    alpha=0.3,
                    facecolor=colors[0])
    legend.append("Accuracy $A_C$")

    ax.plot(timeline, acc_D, c=colors[1], linestyle='dashed')
    ax.fill_between(timeline,
                    acc_D - std_D,
                    acc_D + std_D,
                    alpha=0.3,
                    facecolor=colors[1])
    legend.append("Accuracy $A_D$")

    ax.plot(timeline, acc_G, c=colors[2], linestyle='dotted')
    ax.fill_between(timeline,
                    acc_G - std_G,
                    acc_G + std_G,
                    alpha=0.3,
                    facecolor=colors[2])
    legend.append("Accuracy $A_G$")

    Y_max = 1.15
    if params['R_active']:
        ax.plot(timeline, acc_R, c=colors[3], linestyle='dashdot')
        legend.append("Accuracy $A_R$")

        # Relative gain of the classifier over the reference classifier;
        # the first entry is fixed at 0.
        perf = np.zeros_like(acc_C)
        perf[0] = 0.0
        perf[1:] = (acc_C[1:] - acc_R[1:]) / acc_R[1:]

        ax.plot(timeline, perf + 1, c=colors[4], linestyle='solid')
        legend.append("Performance $P_C$")

    ax.set_xlim(0.0, params['epochs'])
    ax.set_ylim(0.0, Y_max)

    ax.legend(legend, fontsize=20)
    ax.set_xlabel('Epoch', fontsize=20)
    ax.set_ylabel('Accuracy', fontsize=20)

    ax.grid()
    save_fig(params, 'eval', fig)

    # -------------------
    #  Compare Classifier to Baseline
    # -------------------

    if params['R_active']:
        maxC = np.argmax(acc_C, axis=0)
        bestC = acc_C[maxC]
        maxR = np.argmax(acc_R, axis=0)
        bestR = acc_R[maxR]
        log(' - Peak Accuracy: C: %s after %d epochs | R: %s after %d epochs | Inc: %s'
            % (format((bestC), '0.4f'), timeline[maxC], format(
                (bestR), '0.4f'), timeline[maxR],
               format((bestC - bestR) / bestR, '0.4f')),
            name='results')

        # NOTE(review): the figure has already been saved above, so this
        # updated limit is not applied to the plot.
        Y_max = max(Y_max, max(perf + 1) + 0.025)

        maxP = np.argmax(perf, axis=0)
        log(' - Hightest $P_C$: %s after %d epochs.' % (format(
            (perf[maxP]), '0.4f'), timeline[maxP]),
            name='results')

        # For each checkpoint of C, the number of checkpoints until R first
        # reaches at least the same accuracy (0 when it never does).
        adva = np.zeros_like(acc_C)
        for i, v1 in enumerate(acc_C):
            for j, v2 in enumerate(acc_R):
                if v2 >= v1:
                    adva[i] = j - i
                    break

        maxA = np.argmax(adva, axis=0)
        log(' - Biggest Advantage: %d epochs after %d epochs.' %
            (adva[maxA] * params['save_step'], timeline[maxA]),
            name='results')

    # -------------------
    #  Log Results
    # -------------------

    if params['evaluate']:
        log(" - %s ( %s | %s ): [C acc: %f ( ± %f )]" %
            (params['name'], params['dset_V'], params['location'], acc_C[-1],
             std_C[-1]),
            name='results')
    else:
        log(" - " + params['name'] +
            ": [C acc: %f ( ± %f )] [D acc: %f ( ± %f )] [G acc: %f ( ± %f )]"
            % (acc_C[-1], std_C[-1], acc_D[-1], std_D[-1], acc_G[-1],
               std_G[-1]),
            name='results')

    # -------------------
    #  Generate Confusion Matrix
    # -------------------

    YF = pp.one_hot_to_labels(params, YF)
    PF = pp.one_hot_to_labels(params, PF)

    con_mat = confusion_matrix(YF,
                               PF,
                               labels=None,
                               sample_weight=None,
                               normalize='true')
    if params['evaluate']:
        plot_confusion_matrix(con_mat,
                              params,
                              name='%s_%s' %
                              (params['dset_V'], params['location']),
                              title='Confusion matrix')
    else:
        plot_confusion_matrix(con_mat,
                              params,
                              name='C',
                              title='Confusion matrix')

    if params['R_active']:
        RF = pp.one_hot_to_labels(params, RF)
        con_mat = confusion_matrix(YF,
                                   RF,
                                   labels=None,
                                   sample_weight=None,
                                   normalize='true')
        plot_confusion_matrix(con_mat,
                              params,
                              name='R',
                              title='Confusion matrix')

    # -------------------
    #  Final prediction
    # -------------------

    if (params['prediction']):
        network.make_dir_pre()
        pred = torch.argmax(Y_pred, axis=1)
        # The context manager closes the file even if a write fails. Each
        # line repeats the 1-based predicted class 500 times.
        with open(network.S_PATH + params['name'] + '_predictions.txt',
                  "w") as f:
            for y in pred:
                f.write(' '.join(['%.6f' % (float(y.item() + 1))] * 500) +
                        '\n')
import numpy as np
import preprocessing as pre

# --- Balanced SVM taking the mean over intervals, 10 at a time ---

data_num = 500
num_int = 2560

# Read all of the experiment data
X, y = pre.load('dataset/', data_num, num_int)

# Take the mean over a number of 10 intervals for each component
X = pre.med_intervalo(X, 10)

# Balance the data
X, y = pre.proc_balanceado(X, y, data_num)

# Split into train and test sets (picked at random, shuffling the dependent
# variables as well)
X_train, X_test, y_train, y_test = pre.split_data(X, y, 0.2, None)

# Standardise the data
X_train, X_test = pre.standardize_data(X_train, X_test)

# Implement the SVM
from sklearn.svm import SVC
classifier = SVC(kernel='rbf', probability=True, gamma='auto')

# Train the SVM
classifier.fit(X_train, y_train.ravel())

# Predict the test results
y_pred = classifier.predict(X_test)
svm_predict = classifier.predict_proba(X_test)

# Build the confusion matrix of the SVM above
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print('\n\nConfusion Matrix: \n', cm)
def twod_array(array):
    """Return a two-column array: the first 2560 entries and the remainder."""
    head = array[:2560].reshape(-1, 1)
    tail = array[2560:].reshape(-1, 1)
    return np.hstack((head, tail))


# MLP - original dataset:
#######################################################################
data_num = 500
num_int = 2560
random_seed = 31

X, y = pre.load('dataset/', data_num, num_int)

# One-hot encode the labels; the encoder expects a single column.
y_dummy = y.reshape(-1, 1)
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder()
y_dummy = onehotencoder.fit_transform(y_dummy).toarray()

X_train, X_test, y_train, y_test = pre.split_data(X, y_dummy, 0.2,
                                                  random_seed)
X_train, X_test = pre.standardize_data(X_train, X_test)

from keras.models import Sequential
from keras.layers import Dense, Dropout

# One hidden sigmoid layer with dropout, softmax output sized to the number
# of one-hot columns.
mlp_cls = Sequential()
mlp_cls.add(
    Dense(units=128,
          kernel_initializer='uniform',
          activation='sigmoid',
          input_dim=X_train.shape[1]))
mlp_cls.add(Dropout(0.5))
mlp_cls.add(
    Dense(units=y_train.shape[1],
          kernel_initializer='uniform',
          activation='softmax'))
mlp_cls.compile(optimizer='adam',
                loss='categorical_crossentropy',
                metrics=['accuracy'])

# Snapshot of the freshly initialised weights.
weigths = mlp_cls.get_weights()
def train_evaluate_model(city, data, predict_n, look_back, hidden, epochs, ratio=0.7, cluster=True, load=False, uncertainty=True):
    """
    Train the model for one city, evaluate it and persist model, metrics and predictions.

    :param city: Name of the city
    :param data: Dataset
    :param predict_n: Number of steps ahead to be predicted
    :param look_back: number of history steps to include in training window
    :param hidden: Number of Hidden layer
    :param epochs: number of training epochs
    :param ratio: ratio of the full dataset to use in training
    :param cluster: whether to train on features from the city's cluster; when true the
        target column is "casos_est_<city>", otherwise "casos_est"
    :param load: Whether to load previously saved weights ("trained_<city>_model.h5")
        before training
    :param uncertainty: forwarded to ``evaluate``; when true, the 50th percentile along
        axis 2 of the out-of-sample predictions is what gets scored
    :return: tuple ``(predicted, X_test, Y_test, Y_train, factor)`` where ``predicted`` is
        the in-sample predictions followed by the out-of-sample ones and ``factor`` is the
        ``max_features`` entry of the target column
    """
    # Locate the target column
    target_name = "casos_est_{}".format(city) if cluster else "casos_est"
    target_col = list(data.columns).index(target_name)

    norm_data, max_features = normalize_data(data)
    factor = max_features[target_col]

    # Split into train and test windows
    X_train, Y_train, X_test, Y_test = split_data(
        norm_data,
        look_back=look_back,
        ratio=ratio,
        predict_n=predict_n,
        Y_column=target_col,
    )
    print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

    # Build, optionally warm-start, train and save the network
    model = build_model(hidden, X_train.shape[2], predict_n=predict_n, look_back=look_back)
    if load:
        model.load_weights("trained_{}_model.h5".format(city))
    train(model, X_train, Y_train, batch_size=1, epochs=epochs, geocode=city)
    model_path = '../saved_models/LSTM/{}/lstm_{}_epochs_{}.h5'.format(STATE, city, epochs)
    model.save(model_path)

    # Evaluate out of sample first, then in sample; only the predictions are used
    predicted_out, _ = evaluate(
        city, model, X_test, Y_test,
        label="out_of_sample_{}".format(city),
        uncertainty=uncertainty)
    predicted_in, _ = evaluate(
        city, model, X_train, Y_train,
        label="in_sample_{}".format(city),
        uncertainty=uncertainty)

    # Score the out-of-sample predictions
    point_forecast = np.percentile(predicted_out, 50, axis=2) if uncertainty else predicted_out
    scores = calculate_metrics(point_forecast, Y_test, factor)
    metrics_path = "../saved_models/LSTM/{}/metrics_lstm_{}_8pw.pkl".format(STATE, city)
    scores.to_pickle(metrics_path)

    # Persist the full prediction series
    predicted = np.concatenate((predicted_in, predicted_out), axis=0)
    predicted_path = "../saved_models/LSTM/{}/predicted_lstm_{}_8pw.pkl".format(STATE, city)
    with open(predicted_path, "wb") as handle:
        pickle.dump(predicted, handle)

    return predicted, X_test, Y_test, Y_train, factor
import numpy as np
import preprocessing as pre

# --- Unbalanced MLP using the mean of 10 intervals ---

data_num = 500
num_int = 2560

# Load all the experiment data
X, y = pre.load('dataset/', data_num, num_int)

# Take the mean over a number of 10 intervals for each component
X = pre.med_intervalo(X, 10)

# Reshape y so that it is accepted by the dummy-variable step
y = np.reshape(y, (y.shape[0], -1))

# Convert y to dummy variables
y_dummy = pre.dummy_variables(y)

# Split into training and test sets (drawn at random, shuffling the
# dependent variables as well)
X_train, X_test, y_train, y_test = pre.split_data(X, y_dummy, 0.2, None)

# Standardize the data
X_train, X_test = pre.standardize_data(X_train, X_test)

# Set up the MLP
from keras.models import Sequential  # module responsible for initializing the network
from keras.layers import Dense, Dropout  # module responsible for creating the network layers

mlp_cls = Sequential()

# Outputs of the first layer: (183+1)/2 = 92
# NOTE(review): the statement below is cut off in this source; its remaining
# arguments and closing parentheses are missing.
mlp_cls.add(
    Dense(units=92, kernel_initializer='uniform', activation='sigmoid',
def run(
    filename: str = '^GSPC.csv',
    frac_train: float = .8,
    save_plots: bool = False,
    show_plots: bool = False,
):
    """Fit a reference, a SARIMAX and a VARMAX model on the 'volume' series.

    The data is loaded from ``filename`` and transformed twice with
    ``preprocessing.adjust_to_seasonality``: first with ``['scale']``, then
    with ``['first_diff']`` applied to the scaled frame. For each of the two
    transformed frames the data is split into train/test and the three models
    are fitted in turn; their results are collected and pretty-printed.

    Args:
        filename: passed to ``get_data.get_data``.
        frac_train: passed to ``preprocessing.split_data``.
        save_plots: forwarded to each model's ``results`` call.
        show_plots: forwarded to each model's ``results`` call.

    Returns:
        None. The collected results are printed with ``pprint``.
    """
    raw = get_data.get_data(filename=filename)

    # preprocess: the differenced frame is derived from the scaled one
    scaled = raw.apply(preprocessing.adjust_to_seasonality, args=(['scale'],))
    differenced = scaled.apply(preprocessing.adjust_to_seasonality,
                               args=(['first_diff'],))

    def summarize(fitted, transform):
        # Shared results call; the plot title is the model's str() plus the transform
        plot_args = {'title': str(fitted) + '_' + transform, 'ylabel': transform}
        return fitted.results(show_plots=show_plots,
                              save_plots=save_plots,
                              plot_args=plot_args)

    statespace = model.statespace_models
    multivariate_cols = ['open', 'close', 'volume']

    res = {}
    for transform, frame in (('scaled', scaled), ('first_diff', differenced)):
        train, test = preprocessing.split_data(frame, frac_train=frac_train)

        # reference model
        reference = model.reference.Reference(train['volume'], test['volume'])
        res['reference; ' + transform] = summarize(reference, transform)

        # univariate model
        sarimax = statespace.Model(train['volume'],
                                   test['volume'],
                                   model=statespace.SARIMAX,
                                   trend='ct',
                                   order=(4, 1, 4),
                                   enforce_invertibility=False)
        res['sarimax; ' + transform] = summarize(sarimax, transform)

        # multivariate model
        varmax = statespace.Model(train[multivariate_cols],
                                  test[multivariate_cols],
                                  column='volume',
                                  model=statespace.VARMAX,
                                  trend='c',
                                  order=(4, 1))
        res['varmax; ' + transform] = summarize(varmax, transform)

    pprint(res)
def main(*kargs, **kwargs):
    """Run the comment-classification pipeline end to end.

    Stages, selected through ``kwargs['mode']``:

    * ``'preprocess'`` / ``'all'``: clean the texts, convert them to word and
      character sequences, build and save the embedding matrices and the
      pooled comment vectors. ``'preprocess'`` returns right after this stage.
    * any other mode: the sequence/vector columns already present in the
      loaded frames are parsed with ``parse_seq`` instead.
    * train every model listed under ``params['models']``, store validation
      predictions and metrics (``metrics.json`` in ``output_dir``).
      ``'validate'`` returns right after this stage.
    * otherwise fit a CatBoost meta-model on the validation predictions,
      apply it to the test frame and write the result files.

    Keyword arguments read: ``train``, ``test``, ``output``, ``word_embeds``,
    ``char_embeds``, ``logger``, ``mode``, ``max_words``,
    ``use_only_exists_words``, ``swear_words``, ``wrong_words``,
    ``format_embeds``, ``config``, ``output_dir``, ``norm_prob``,
    ``norm_prob_koef``, ``gpus``.

    :return: ``True`` when stopping after the ``'preprocess'`` or
        ``'validate'`` stage, otherwise ``None``.
    """
    get_kwargs(kwargs)
    train_fname = kwargs['train']
    test_fname = kwargs['test']
    result_fname = kwargs['output']
    word_embeds_fname = kwargs['word_embeds']
    char_embeds_fname = kwargs['char_embeds']
    logger_fname = kwargs['logger']
    mode = kwargs['mode']
    max_words = kwargs['max_words']
    use_only_exists_words = kwargs['use_only_exists_words']
    swear_words_fname = kwargs['swear_words']
    wrong_words_fname = kwargs['wrong_words']
    embeds_format = kwargs['format_embeds']
    config = kwargs['config']
    output_dir = kwargs['output_dir']
    norm_prob = kwargs['norm_prob']
    norm_prob_koef = kwargs['norm_prob_koef']
    gpus = kwargs['gpus']

    # Column names encode the vocabulary settings used to build the sequences
    seq_col_name_words = 'comment_seq_lw_use_exist{}_{}k'.format(
        int(use_only_exists_words), int(max_words / 1000))
    seq_col_name_ll3 = 'comment_seq_ll3_use_exist{}_{}k'.format(
        int(use_only_exists_words), int(max_words / 1000))

    # NOTE(review): model_file is not referenced again in this function.
    model_file = {
        'dense': os.path.join(output_dir, 'dense.h5'),
        'cnn': os.path.join(output_dir, 'cnn.h5'),
        'lstm': os.path.join(output_dir, 'lstm.h5'),
        'lr': os.path.join(output_dir, '{}_logreg.bin'),
        'catboost': os.path.join(output_dir, '{}_catboost.bin')
    }

    # ====Create logger====
    logger = Logger(logging.getLogger(), logger_fname)

    # ====Detect GPUs====
    logger.debug(device_lib.list_local_devices())

    # ====Load data====
    logger.info('Loading data...')
    train_df = load_data(train_fname)
    test_df = load_data(test_fname)

    target_labels = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
        'identity_hate'
    ]
    num_classes = len(target_labels)

    # ====Load additional data====
    logger.info('Loading additional data...')
    # NOTE(review): swear_words is loaded but not used later in this function.
    swear_words = load_data(swear_words_fname,
                            func=lambda x: set(x.T[0]),
                            header=None)
    wrong_words_dict = load_data(
        wrong_words_fname, func=lambda x: {val[0]: val[1] for val in x})

    # ====Load word vectors====
    logger.info('Loading embeddings...')
    embeds_word = Embeds().load(word_embeds_fname, embeds_format)
    embeds_ll3 = Embeds().load(char_embeds_fname, embeds_format)

    # ====Clean texts====
    # In other modes 'comment_text_clear' must already exist in the loaded
    # frames, because it is read below regardless of the mode.
    if mode in ('preprocess', 'all'):
        logger.info('Cleaning text...')
        train_df['comment_text_clear'] = clean_text(train_df['comment_text'],
                                                    wrong_words_dict,
                                                    autocorrect=True)
        test_df['comment_text_clear'] = clean_text(test_df['comment_text'],
                                                   wrong_words_dict,
                                                   autocorrect=True)
        train_df.to_csv(os.path.join(output_dir, 'train_clear.csv'),
                        index=False)
        test_df.to_csv(os.path.join(output_dir, 'test_clear.csv'),
                       index=False)

    # ====Calculate maximum seq length====
    logger.info('Calc text length...')
    train_df.fillna('__NA__', inplace=True)
    test_df.fillna('__NA__', inplace=True)
    train_df['text_len'] = train_df['comment_text_clear'].apply(
        lambda words: len(words.split()))
    test_df['text_len'] = test_df['comment_text_clear'].apply(
        lambda words: len(words.split()))
    # Mean + 3 standard deviations of the training text lengths
    max_seq_len = np.round(train_df['text_len'].mean() +
                           3 * train_df['text_len'].std()).astype(int)
    max_char_seq_len = 2000  # empirical
    logger.debug('Max seq length = {}'.format(max_seq_len))

    # ====Prepare data to NN====
    logger.info('Converting texts to sequences...')

    if mode in ('preprocess', 'all'):
        (train_df[seq_col_name_words], test_df[seq_col_name_words],
         word_index, train_df[seq_col_name_ll3], test_df[seq_col_name_ll3],
         ll3_index) = convert_text2seq(
             train_df['comment_text_clear'].tolist(),
             test_df['comment_text_clear'].tolist(),
             max_words,
             max_seq_len,
             max_char_seq_len,
             embeds_word,
             lower=True,
             oov_token='__NA__',
             uniq=False,
             use_only_exists_words=use_only_exists_words)
        logger.debug('Dictionary size use_exist{} = {}'.format(
            int(use_only_exists_words), len(word_index)))
        logger.debug('Char dict size use_exist{} = {}'.format(
            int(use_only_exists_words), len(ll3_index)))

        logger.info('Preparing embedding matrix...')
        words_not_found = embeds_word.set_matrix(max_words, word_index)
        # The character-level embedding matrix is random-normal, with the
        # same width as the word embeddings
        embeds_ll3.matrix = np.random.normal(size=(len(ll3_index),
                                                   embeds_word.shape[1]))
        embeds_ll3.word_index = ll3_index
        embeds_ll3.word_index_reverse = {
            val: key
            for key, val in ll3_index.items()
        }
        embeds_ll3.shape = np.shape(embeds_ll3.matrix)
        embeds_word.save(
            os.path.join(output_dir,
                         'wiki.embeds_lw.{}k'.format(int(max_words / 1000))))
        embeds_ll3.save(
            os.path.join(output_dir,
                         'wiki.embeds_ll3.{}k'.format(int(max_words / 1000))))

        # ====Get text vector====
        pooling = {
            'max': {
                'func': np.max
            },
            'avg': {
                'func': np.sum,
                'normalize': True
            },
            'sum': {
                'func': np.sum,
                'normalize': False
            }
        }
        for p in ['max', 'avg', 'sum']:
            train_df['comment_vec_{}'.format(
                p)] = train_df[seq_col_name_words].apply(
                    lambda x: embed_aggregate(x, embeds_word, **pooling[p]))
            test_df['comment_vec_{}'.format(
                p)] = test_df[seq_col_name_words].apply(
                    lambda x: embed_aggregate(x, embeds_word, **pooling[p]))
        train_df.to_csv(os.path.join(output_dir, 'train_clear1.csv'),
                        index=False)
        test_df.to_csv(os.path.join(output_dir, 'test_clear1.csv'),
                       index=False)
    else:
        # Sequence/vector columns come from the loaded frames and are parsed
        # back with parse_seq (presumably stored as text in the CSV -- confirm)
        for col in train_df.columns:
            if col.startswith('comment_seq'):
                train_df[col] = train_df[col].apply(
                    lambda x: parse_seq(x, int))
                test_df[col] = test_df[col].apply(lambda x: parse_seq(x, int))
            elif col.startswith('comment_vec'):
                train_df[col] = train_df[col].apply(
                    lambda x: parse_seq(x, float))
                test_df[col] = test_df[col].apply(
                    lambda x: parse_seq(x, float))

    logger.debug('Embedding matrix shape = {}'.format(embeds_word.shape))
    logger.debug('Number of null word embeddings = {}'.format(
        np.sum(np.sum(embeds_word.matrix, axis=1) == 0)))

    # ====END OF `PREPROCESS`====
    if mode == 'preprocess':
        return True

    # ====Train/test split data====
    x = np.array(train_df[seq_col_name_words].values.tolist())
    y = np.array(train_df[target_labels].values.tolist())
    x_train_nn, x_val_nn, y_train, y_val, train_idxs, val_idxs = split_data(
        x, y, test_size=0.2, shuffle=True, random_state=42)
    x_test_nn = np.array(test_df[seq_col_name_words].values.tolist())

    # The same row indices are reused so every feature view shares one split
    x_char = np.array(train_df[seq_col_name_ll3].values.tolist())
    x_char_train_nn = x_char[train_idxs]
    x_char_val_nn = x_char[val_idxs]
    x_char_test_nn = np.array(test_df[seq_col_name_ll3].values.tolist())

    x_train_tfidf = train_df['comment_text_clear'].values[train_idxs]
    x_val_tfidf = train_df['comment_text_clear'].values[val_idxs]
    x_test_tfidf = test_df['comment_text_clear'].values

    catboost_cols = catboost_features(train_df, test_df)
    x_train_cb = train_df[catboost_cols].values[train_idxs].T
    x_val_cb = train_df[catboost_cols].values[val_idxs].T
    x_test_cb = test_df[catboost_cols].values.T

    # ====Train models====
    nn_models = {'cnn': cnn, 'dense': dense, 'rnn': rnn}

    params = Params(config)

    metrics = {}
    predictions = {}
    for model_idx, param in enumerate(params['models']):
        for model_label, model_params in param.items():
            common = model_params.get('common', {})

            # Resolve the alias up front so that every branch below stores
            # its predictions under this model's own name. The fallback uses
            # the position of the entry in params['models'].
            model_alias = common.get('alias', None)
            if not model_alias:
                model_alias = '{}_{}'.format(model_label, model_idx)

            if common.get('warm_start', False) and os.path.exists(
                    common.get('model_file', '')):
                # NOTE(review): a warm-started model is only loaded here; it
                # is neither trained further nor used for predictions.
                logger.info('{} warm starting...'.format(model_label))
                model = load_model(common.get('model_file', None))
            elif model_label in nn_models:
                model = nn_models[model_label](embeds_word.matrix,
                                               embeds_ll3.matrix,
                                               num_classes,
                                               max_seq_len,
                                               max_char_seq_len,
                                               gpus=gpus,
                                               **model_params['init'])
                logger.info("training {} ...".format(model_label))
                # Only the 'dense' model takes both word and char inputs
                if model_label == 'dense':
                    x_tr = [x_train_nn, x_char_train_nn]
                    x_val = [x_val_nn, x_char_val_nn]
                    x_test = [x_test_nn, x_char_test_nn]
                else:
                    x_tr = x_train_nn
                    x_val = x_val_nn
                    x_test = x_test_nn
                hist = train(x_tr,
                             y_train,
                             model,
                             logger=logger,
                             **model_params['train'])
                predictions[model_alias] = model.predict(x_val)
                save_predictions(test_df, model.predict(x_test),
                                 target_labels, model_alias)
            elif model_label == 'tfidf':
                model = TFIDF(target_labels, **model_params['init'])
                model.fit(x_train_tfidf, y_train, **model_params['train'])
                predictions[model_alias] = model.predict(x_val_tfidf)
                save_predictions(test_df, model.predict(x_test_tfidf),
                                 target_labels, model_alias)
            elif model_label == 'catboost':
                model = CatBoost(target_labels, **model_params['init'])
                model.fit(x_train_cb,
                          y_train,
                          eval_set=(x_val_cb, y_val),
                          use_best_model=True)
                predictions[model_alias] = model.predict_proba(x_val_cb)
                save_predictions(test_df, model.predict_proba(x_test_cb),
                                 target_labels, model_alias)

            # Metrics exist only for models that produced validation
            # predictions in this run (not for warm-started ones).
            if model_alias in predictions:
                metrics[model_alias] = get_metrics(y_val,
                                                   predictions[model_alias],
                                                   target_labels)
                logger.debug('{} params:\n{}'.format(model_alias,
                                                     model_params))
                logger.debug('{} metrics:\n{}'.format(
                    model_alias, print_metrics(metrics[model_alias])))
            model.save(
                os.path.join(output_dir, model_params['common']['model_file']))

    logger.info('Saving metrics...')
    with open(os.path.join(output_dir, 'metrics.json'), 'w') as f:
        f.write(json.dumps(metrics))

    # ====END OF `VALIDATE`====
    if mode == 'validate':
        return True

    # Meta catboost
    logger.info('training catboost as metamodel...')

    x_meta = [
        predictions[model_alias] for model_alias in sorted(predictions.keys())
    ]
    # Fixed: this used to read x_train_meta, which is not assigned until the
    # train_test_split call below.
    # NOTE(review): np.array over a list of 2-D arrays is 3-D and .T reverses
    # all axes -- confirm this is the layout CatBoost.fit expects and that it
    # lines up with y_val in train_test_split.
    x_meta = np.array(x_meta).T

    x_train_meta, x_val_meta, y_train_meta, y_val_meta = train_test_split(
        x_meta, y_val, test_size=0.20, random_state=42)
    meta_model = CatBoost(target_labels,
                          loss_function='Logloss',
                          iterations=1000,
                          depth=6,
                          learning_rate=0.03,
                          rsm=1)
    meta_model.fit(x_train_meta,
                   y_train_meta,
                   eval_set=(x_val_meta, y_val_meta),
                   use_best_model=True)
    y_hat_meta = meta_model.predict_proba(x_val_meta)
    metrics_meta = get_metrics(y_val_meta, y_hat_meta, target_labels)
    # NOTE(review): the meta-model itself is not persisted.
    logger.debug('{} metrics:\n{}'.format('META',
                                          print_metrics(metrics_meta)))

    # ====Predict====
    logger.info('Applying models...')
    # Per-model prediction columns named '<alias>_<label>' are read from
    # test_df (presumably added by save_predictions -- confirm).
    test_cols = []
    for model_alias in sorted(predictions.keys()):
        for label in target_labels:
            test_cols.append('{}_{}'.format(model_alias, label))
    x_test = test_df[test_cols].values

    preds = meta_model.predict_proba(x_test)
    for i, label in enumerate(target_labels):
        test_df[label] = preds[:, i]

    # ====Normalize probabilities====
    if norm_prob:
        for label in target_labels:
            test_df[label] = norm_prob_koef * test_df[label]

    # ====Save results====
    logger.info('Saving results...')
    test_df[[
        'id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
        'identity_hate'
    ]].to_csv(result_fname, index=False, header=True)
    test_df.to_csv('{}_tmp'.format(result_fname), index=False, header=True)