def run_cross_validation(self, locs_in=["SAB"]):
    """Run k-fold cross validation on the data recorded at the given locations.

    Filters the dataset to the requested environments, shuffles it
    (seeded when ``self.random_state`` is set), and prints the metrics
    produced by ``utilities.cross_validation``.
    """
    print(" \n\n--- Cross Validation for locations: {}\n".format(locs_in))

    locations = utilities.get_locations(locs_in)

    # Restrict to rows recorded in the requested environments.
    frame = self.data.get_data()
    frame = frame[frame["Environment"].isin(locations)]

    label_encoder, pipeline = utilities.prepare_skl_interface(
        frame, self.classifier)

    # A single shuffle call covers both the seeded and unseeded cases:
    # random_state=None is sklearn's default (fresh RNG), so passing
    # self.random_state through directly is equivalent to the if/else.
    shuffled = skl.utils.shuffle(frame, random_state=self.random_state)

    # Compute and report the cross-validation metrics.
    output_metrics = utilities.cross_validation(
        pipeline, self.folds, shuffled, label_encoder,
        self.srp_dict, data_aug=self.data_aug)
    metrics.print_metrics(output_metrics, self.paper_metrics_only)
def test_with_svm(dataset_test, classifier, preprocessing, pca_processing,
                  show_testing_metrics, labels, names_test, names) -> List:
    """Evaluate a trained SVM classifier on a (PCA/KPCA-projected) test set.

    Args:
        dataset_test: raw test samples to be projected by `preprocess_dataset`.
        classifier: fitted estimator exposing `.predict`.
        preprocessing, pca_processing: transformation objects forwarded to
            `preprocess_dataset`.
        show_testing_metrics: whether to print detailed testing metrics
            (forced off when test names don't exist in the training set).
        labels: test label indices into `names_test`.
        names_test: names corresponding to the test labels.
        names: names known to the training set.

    Returns:
        Predicted name (from `names`) for every test sample.
    """
    # Apply PCA/KPCA transformation to testing training_data
    dataset_test_pca = preprocess_dataset(pca_processing, preprocessing,
                                          dataset_test)

    labels_test_mapped_to_labels_train = []
    testing_with_training_dataset = True
    for label in labels:
        try:
            label_mapped = list(names).index(names_test[label])
        # FIX: was a bare `except:` that swallowed every exception
        # (including KeyboardInterrupt). `index` raises ValueError when
        # the name is unknown; the `names_test[label]` lookup can raise
        # IndexError/KeyError depending on its container type.
        except (ValueError, IndexError, KeyError):
            # If name is not in training dataset, then label is not mapped
            label_mapped = label
            # We can assume that user is not testing the dataset
            testing_with_training_dataset = False
            show_testing_metrics = False
        labels_test_mapped_to_labels_train.append(label_mapped)

    # NOTE(review): fitting the scaler on the *test* data leaks test
    # statistics; ideally the scaler fitted on the training set would be
    # reused here — TODO confirm against the training pipeline.
    sc = StandardScaler()
    scaled_dataset_test_pca = sc.fit_transform(dataset_test_pca)

    # Test classifier
    y_pred = classifier.predict(scaled_dataset_test_pca)
    # classifier.save(preprocessing, pca_processing)

    # To obtain metrics
    print_metrics(y_pred, names, labels, labels_test_mapped_to_labels_train,
                  names_test, testing_with_training_dataset,
                  show_testing_metrics)
    return [names[int(y_pred[i])] for i in range(len(y_pred))]
def main():
    """Train a model on X_train/y_train, predict X_test, write solution.csv."""
    # Fixed seed so weight init and shuffling are reproducible.
    random_seed = 42
    torch.manual_seed(random_seed)

    args = parser.get()

    # Pre-saved numpy arrays produced by an earlier preprocessing step.
    X_train = load('./datas/X_train.npy')
    y_train = load('./datas/y_train.npy')
    X_test = load('./datas/X_test.npy')

    train_dataset = data.DatasetXy(X_train, y_train)
    test_dataset = data.DatasetX(X_test)
    data_class = data.Dataloader(args, train_dataset, test_dataset)
    train, test = data_class.train(), data_class.test()

    model = models.get(args)
    optimizer = optimizers.get(args, model.parameters())
    criterion = torch.nn.CrossEntropyLoss()

    # Training loop: runner.run with True performs optimizer updates and
    # returns the tracked metrics for the epoch.
    for epoch in range(args.epochs):
        train_metrics = runner.run(
            model, criterion, optimizer, train, True,
            {"loss": metrics.loss, "accuracy": metrics.accuracy},
        )
        metrics.print_metrics(train_metrics)

    # Inference pass (training disabled); runner returns per-batch outputs.
    y_test_pred = runner.run(
        model, criterion, optimizer, test, False,
        {"loss": metrics.loss, "accuracy": metrics.accuracy},
    )
    print(y_test_pred)

    # Flatten the per-batch prediction lists into one flat list.
    # (FIX: removed a stale region of commented-out experiments —
    # torch.max/round variants — that obscured the actual flow.)
    y_test_pred = [item for sublist in y_test_pred for item in sublist]
    y_test = np.asarray(y_test_pred)
    pd.DataFrame({
        "Id": np.arange(len(y_test)),
        "Category": y_test
    }).astype(int).to_csv("solution.csv", index=False)
def svm():
    """Train a linear SVM on a synthetic blob dataset and report accuracies.

    End-to-end demo: generate data, split train/val/test, fit, plot the
    loss history, and print accuracy metrics for all three splits.
    """
    # ********************* load the dataset and divide to X&y ***********************
    from sklearn.datasets import make_blobs
    X, Y = make_blobs(cluster_std=0.9, random_state=20, n_samples=1000,
                      centers=10, n_features=10)
    from Algorithms.ML_.helper.data_helper import split_train_val_test
    X, Xv, y, Yv, Xt, Yt = split_train_val_test(X, Y)
    print(X.shape, y.shape, Xv.shape, Yv.shape, Xt.shape, Yt.shape)

    # ********************* build model ***********************
    # (FIX: dropped unused imports — Activation, Hinge, Regularization,
    # L1, L12 were imported but never referenced.)
    from model import SVM
    from activation import Softmax
    from regularization import L2
    from optimizer import Vanilla
    model = SVM()
    learning_rate, reg_rate = 1e-3, 5e-1
    model.compile(alpha=learning_rate, lambda_=reg_rate, activation=Softmax(),
                  reg=L2(), opt=Vanilla())
    model.describe()

    # ********************* train ***********************
    loss_train, loss_val = model.train(X, y, val=(Xv, Yv), iter_=1000,
                                       return_loss=True, verbose=True, eps=1e-3)
    import matplotlib.pyplot as plt
    plt.plot(range(len(loss_train)), loss_train)
    plt.plot(range(len(loss_val)), loss_val)
    plt.legend(['train', 'val'])
    plt.xlabel('Iteration')
    plt.ylabel('Training loss')
    plt.title('Training Loss history')
    plt.show()

    # ********************* predict ***********************
    pred_train = model.predict(X)
    pred_val = model.predict(Xv)
    pred_test = model.predict(Xt)

    # (FIX: `import metrics` appeared twice; a second import of an
    # already-imported module is a no-op — removed the duplicate.)
    import metrics
    print('train accuracy=', metrics.accuracy(y, pred_train))
    print('val accuracy=', metrics.accuracy(Yv, pred_val))
    print('test accuracy=', metrics.accuracy(Yt, pred_test))
    print('null accuracy=', metrics.null_accuracy(y))
    metrics.print_metrics(Yt, pred_test)
def run_generalisation(self, train_locs=["DA"], test_locs=["DB"],
                       save_classifier=True):
    """Train on one set of locations and evaluate on a disjoint set.

    Prints overall accuracy plus per-class accuracy/precision/recall/IoU
    derived from the confusion matrix.
    """
    print("\n\n --- Generalization across locations --- \n")
    print("Locations in train set: {}".format(train_locs))
    print("Locations in test set: {}".format(test_locs))

    train_locs = utilities.get_locations(train_locs)
    test_locs = utilities.get_locations(test_locs)

    frame = self.data.get_data()
    label_encoder, pipeline = utilities.prepare_skl_interface(
        frame, self.classifier)

    # Partition the dataset by recording environment.
    in_train = frame["Environment"].isin(train_locs)
    in_test = frame["Environment"].isin(test_locs)
    accuracy, conf_mat = utilities.train_and_test(
        frame[in_train], frame[in_test], pipeline, label_encoder,
        self.srp_dict, save_cls=save_classifier,
        out_folder=self.output_folder)

    # Second tuple element is a zeroed spread placeholder (a single run
    # has no variance) in the shape print_metrics expects.
    all_metrics = {
        "overall_accuracy": (accuracy, 0),
        "per_class_accuracy":
            (metrics.getPCaccuracy(conf_mat), np.zeros(4)),
        "per_class_precision":
            (metrics.getPCPrecision(conf_mat), np.zeros(4)),
        "per_class_recall":
            (metrics.getPCRecall(conf_mat), np.zeros(4)),
        "per_class_iou":
            (metrics.getPCIoU(conf_mat), np.zeros(4)),
    }
    metrics.print_metrics(all_metrics, self.paper_metrics_only)
def main():
    """Per-epoch training followed by a validation pass."""
    # Add seed
    args = parser.get()

    dataset = data.Dataset(args)
    train, validation = dataset.train(), dataset.validation()

    model = models.get(args)
    optimizer = optimizers.get(args, model.parameters())
    criterion = torch.nn.CrossEntropyLoss()

    for epoch in range(args.epochs):
        # Training pass (flag True: runner performs optimizer updates).
        metrics.print_metrics(runner.run(
            model, criterion, optimizer, train, True,
            {"loss": metrics.loss, "accuracy": metrics.accuracy},
        ))
        # Validation pass (flag False: evaluation only).
        metrics.print_metrics(runner.run(
            model, criterion, optimizer, validation, False,
            {"loss": metrics.loss, "accuracy": metrics.accuracy},
        ))
def test_with_svm(dataset_test, classifier, preprocessing, pca_processing,
                  show_testing_metrics, labels_test, labels_train,
                  names_test, names):
    """Evaluate a trained SVM classifier on a PCA-projected test set.

    Args:
        dataset_test: raw test samples to be projected by `preprocess_dataset`.
        classifier: fitted estimator exposing `.predict`.
        preprocessing, pca_processing: transformation objects forwarded to
            `preprocess_dataset`.
        show_testing_metrics: whether to print detailed testing metrics
            (forced off when test names don't exist in the training set).
        labels_test: test label indices into `names_test`.
        labels_train: training labels (unused here; kept for interface
            compatibility with callers).
        names_test: names corresponding to the test labels.
        names: names known to the training set.

    Returns:
        Predicted name (from `names`) for every test sample.
    """
    # Apply PCA transformation to testing training_data
    dataset_test_pca = preprocess_dataset(pca_processing, preprocessing,
                                          dataset_test)

    labels_test_mapped_to_labels_train = []
    testing_with_training_dataset = True
    for label in labels_test:
        try:
            label_mapped = list(names).index(names_test[label])
        # FIX: was a bare `except:` that swallowed every exception
        # (including KeyboardInterrupt). `index` raises ValueError when
        # the name is unknown; the `names_test[label]` lookup can raise
        # IndexError/KeyError depending on its container type.
        except (ValueError, IndexError, KeyError):
            # If name is not in training dataset, then label is not mapped
            label_mapped = label
            # We can assume that user is not testing the dataset
            testing_with_training_dataset = False
            show_testing_metrics = False
        labels_test_mapped_to_labels_train.append(label_mapped)

    # NOTE(review): this prints the transformed data itself, not its shape —
    # message is misleading; consider printing np.shape(dataset_test_pca).
    print(f"Shape of test set {dataset_test_pca}")

    # Test classifier
    y_pred = classifier.predict(dataset_test_pca)
    # classifier.save(preprocessing, pca_processing)
    # (FIX: removed a commented-out image-reconstruction loop that was dead code.)

    # To obtain metrics
    print_metrics(y_pred, names, labels_test,
                  labels_test_mapped_to_labels_train, names_test,
                  testing_with_training_dataset, show_testing_metrics)
    return [names[int(y_pred[i])] for i in range(len(y_pred))]
predictions = model.predict_generator( ds.generate_test(BATCH_SIZE), steps=ds.num_batches_test(BATCH_SIZE), workers=7, use_multiprocessing=False, max_queue_size=BATCH_SIZE, verbose=1) # Get test targets from generator y_set = None for x, y in ds.generate_test(BATCH_SIZE): y_set = np.array(y) if y_set is None else np.vstack((y_set, y)) # Compute all the metrics metrics = compute_metrics(y_set, predictions, ds.num_classes) # Print metrics print_metrics(metrics) # Write test results to file with results_file_path.open('a') as f: f.write(F"\n{metrics_to_string(metrics)},{time.time() - start_time}") # Write confussion matrix to text file with cm_file_path.open('a') as f: f.write(f"===== Fold {fold} ======\n\n") f.write(str(metrics['Confusion matrix'])) f.write("\n\n\n\n") del model
def regression():
    """Train a 2-layer softmax network on synthetic blobs and report accuracy.

    End-to-end demo: generate data, split train/val/test, fit, plot the
    loss history, and print accuracy metrics for all three splits.
    """
    # ********************* load the dataset and divide to X&y ***********************
    from sklearn.datasets import make_blobs
    X, Y = make_blobs(cluster_std=0.9, random_state=20, n_samples=1000,
                      centers=10, n_features=10)
    from Algorithms.ML_.helper.data_helper import split_train_val_test
    X, Xv, y, Yv, Xt, Yt = split_train_val_test(X, Y)
    print(X.shape, y.shape, Xv.shape, Yv.shape, Xt.shape, Yt.shape)

    # ********************* build model ***********************
    # (FIX: dropped unused imports — Layer, Activation, Sigmoid,
    # Regularization, L1, L2, L12 and Vanilla were never referenced —
    # and removed the dead `model = Regression()` that was immediately
    # overwritten by the real construction below.)
    from model import Regression
    from layer import Dense
    from activation import Softmax, ReLU
    input_size = X.shape[1]
    hidden_size = 50
    num_classes = 10
    learning_rate, reg_rate = 1e-3, 0.5
    model = Regression([
        Dense(hidden_size, input_shape=(input_size, ), activation=ReLU(),
              alpha=learning_rate, lambda_=reg_rate),
    ])
    model += Dense(num_classes, activation=Softmax(), alpha=learning_rate,
                   lambda_=reg_rate)  # add layer with +=
    model.compile()
    model.describe()

    # ********************* train ***********************
    loss_train, loss_val = model.train(X, y, val=(Xv, Yv), iter_=5000,
                                       batch=32, return_loss=True,
                                       verbose=True)
    import matplotlib.pyplot as plt
    plt.plot(range(len(loss_train)), loss_train)
    plt.plot(range(len(loss_val)), loss_val)
    plt.legend(['train', 'val'])
    plt.xlabel('Iteration')
    plt.ylabel('Training loss')
    plt.title('Training Loss history')
    plt.show()

    # ********************* predict ***********************
    pred_train = model.predict(X)
    pred_val = model.predict(Xv)
    pred_test = model.predict(Xt)

    # (FIX: `import metrics` appeared twice — removed the duplicate.)
    import metrics
    print('train accuracy=', metrics.accuracy(y, pred_train))
    print('val accuracy=', metrics.accuracy(Yv, pred_val))
    print('test accuracy=', metrics.accuracy(Yt, pred_test))
    print('null accuracy=', metrics.null_accuracy(y))
    metrics.print_metrics(Yt, pred_test)
#train data image = misc.imread("pics/cat.jpg", flatten=True, mode="L") image_list = [image] image_list = tuple(image_list) #train auto = LSTMAutoencoder() auto.train(image_list, block_size, layer_step, number_of_layers, epoches, batch_size, split_rate) auto.save_models(num=str(num_try)) # auto.load_models(num = str(num_try)) #test image = misc.imread("pics/lena.jpg", flatten=True, mode="L") r = auto.encode(image) s = auto.decode(r) result = Image.fromarray((s * 255).astype(np.uint8)) result.save(path_to_result + "decoded.jpg") misc.imsave(path_to_result + 'using_imsave.jpg', s) r = np.array(r) s = np.array(s) print_metrics(image, r, s) write_to_file(path_to_result, image, r, s, len(image_list), block_size, layer_step, number_of_layers, epoches, batch_size, split_rate) num_try += 1
def main(*kargs, **kwargs):
    """Toxic-comment classification pipeline (ensemble v1).

    Stages: load/clean data -> build sequences + embedding matrix ->
    train NN models (cnn/lstm/concat/...) -> TFIDF+LogReg per label ->
    CatBoost meta-model over all base predictions -> predict the test
    set and write the submission CSV.

    All configuration arrives via **kwargs (see get_kwargs).
    """
    get_kwargs(kwargs)
    train_fname = kwargs['train']
    test_fname = kwargs['test']
    result_fname = kwargs['output']
    embeds_fname = kwargs['embeds']
    logger_fname = kwargs['logger']
    swear_words_fname = kwargs['swear_words']
    wrong_words_fname = kwargs['wrong_words']
    warm_start = kwargs['warm_start']
    format_embeds = kwargs['format_embeds']
    config = kwargs['config']
    train_clear = kwargs['train_clear']
    test_clear = kwargs['test_clear']
    output_dir = kwargs['output_dir']
    norm_prob = kwargs['norm_prob']
    norm_prob_koef = kwargs['norm_prob_koef']
    gpus = kwargs['gpus']

    # Where each trained model is persisted ('lr'/'catboost' are
    # per-label templates filled in with .format(label)).
    model_file = {
        'dense': os.path.join(output_dir, 'dense.h5'),
        'cnn': os.path.join(output_dir, 'cnn.h5'),
        'lstm': os.path.join(output_dir, 'lstm.h5'),
        'concat': os.path.join(output_dir, 'concat.h5'),
        'lr': os.path.join(output_dir, '{}_logreg.bin'),
        'catboost': os.path.join(output_dir, '{}_catboost.bin')
    }

    # ====Create logger====
    logger = Logger(logging.getLogger(), logger_fname)

    # ====Detect GPUs====
    logger.debug(device_lib.list_local_devices())

    # ====Load data====
    logger.info('Loading data...')
    train_df = load_data(train_fname)
    test_df = load_data(test_fname)

    target_labels = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
        'identity_hate'
    ]
    num_classes = len(target_labels)

    # ====Load additional data====
    logger.info('Loading additional data...')
    swear_words = load_data(swear_words_fname,
                            func=lambda x: set(x.T[0]),
                            header=None)
    wrong_words_dict = load_data(wrong_words_fname,
                                 func=lambda x: {val[0]: val[1] for val in x})

    tokinizer = RegexpTokenizer(r'\w+')
    # Split letter/digit runs like "abc123" / "123abc" during cleaning.
    regexps = [
        re.compile("([a-zA-Z]+)([0-9]+)"),
        re.compile("([0-9]+)([a-zA-Z]+)")
    ]

    # ====Load word vectors====
    logger.info('Loading embeddings...')
    embed_dim = 300  # fastText vector width
    embeds = Embeds(embeds_fname, 'fasttext', format=format_embeds)

    # ====Clean texts====
    logger.info('Cleaning text...')
    if warm_start:
        # Assumes train_clear/test_clear CSVs already hold cleaned text
        # from a previous run — TODO confirm.
        logger.info('Use warm start...')
    else:
        train_df['comment_text_clear'] = clean_text(train_df['comment_text'],
                                                    tokinizer,
                                                    wrong_words_dict,
                                                    swear_words, regexps)
        test_df['comment_text_clear'] = clean_text(test_df['comment_text'],
                                                   tokinizer,
                                                   wrong_words_dict,
                                                   swear_words, regexps)
        train_df.to_csv(train_clear, index=False)
        test_df.to_csv(test_clear, index=False)

    # ====Calculate maximum seq length====
    logger.info('Calc text length...')
    train_df.fillna('unknown', inplace=True)
    test_df.fillna('unknown', inplace=True)
    train_df['text_len'] = train_df['comment_text_clear'].apply(
        lambda words: len(words.split()))
    test_df['text_len'] = test_df['comment_text_clear'].apply(
        lambda words: len(words.split()))
    # mean + 3*std covers nearly all comments without padding to the max.
    max_seq_len = np.round(train_df['text_len'].mean() +
                           3 * train_df['text_len'].std()).astype(int)
    logger.debug('Max seq length = {}'.format(max_seq_len))

    # ====Prepare data to NN====
    logger.info('Converting texts to sequences...')
    max_words = 100000
    train_df['comment_seq'], test_df[
        'comment_seq'], word_index = convert_text2seq(
            train_df['comment_text_clear'].tolist(),
            test_df['comment_text_clear'].tolist(),
            max_words,
            max_seq_len,
            lower=True,
            char_level=False,
            uniq=True)
    logger.debug('Dictionary size = {}'.format(len(word_index)))

    logger.info('Preparing embedding matrix...')
    embedding_matrix, words_not_found = get_embedding_matrix(
        embed_dim, embeds, max_words, word_index)
    logger.debug('Embedding matrix shape = {}'.format(
        np.shape(embedding_matrix)))
    logger.debug('Number of null word embeddings = {}'.format(
        np.sum(np.sum(embedding_matrix, axis=1) == 0)))

    logger.info('Deleting unknown words from seq...')
    train_df['comment_seq'] = clean_seq(train_df['comment_seq'],
                                        embedding_matrix, max_seq_len)
    test_df['comment_seq'] = clean_seq(test_df['comment_seq'],
                                       embedding_matrix, max_seq_len)

    # ====Train/test split data====
    x = np.array(train_df['comment_seq'].tolist())
    y = np.array(train_df[target_labels].values)
    x_train_nn, x_test_nn, y_train_nn, y_test_nn, train_idxs, test_idxs = split_data(
        x, y, test_size=0.2, shuffle=True, random_state=42)
    test_df_seq = np.array(test_df['comment_seq'].tolist())
    y_nn = []  # collects each NN model's validation predictions
    logger.debug('X shape = {}'.format(np.shape(x_train_nn)))

    # ====Train models====
    params = Params(config)
    cnn = get_cnn(embedding_matrix,
                  num_classes,
                  max_seq_len,
                  num_filters=params.get('cnn').get('num_filters'),
                  l2_weight_decay=params.get('cnn').get('l2_weight_decay'),
                  dropout_val=params.get('cnn').get('dropout_val'),
                  dense_dim=params.get('cnn').get('dense_dim'),
                  add_sigmoid=True,
                  train_embeds=params.get('cnn').get('train_embeds'),
                  gpus=gpus)
    lstm = get_lstm(embedding_matrix,
                    num_classes,
                    max_seq_len,
                    l2_weight_decay=params.get('lstm').get('l2_weight_decay'),
                    lstm_dim=params.get('lstm').get('lstm_dim'),
                    dropout_val=params.get('lstm').get('dropout_val'),
                    dense_dim=params.get('lstm').get('dense_dim'),
                    add_sigmoid=True,
                    train_embeds=params.get('lstm').get('train_embeds'),
                    gpus=gpus)
    concat = get_concat_model(
        embedding_matrix,
        num_classes,
        max_seq_len,
        n_layers=params.get('concat').get('n_layers'),
        concat=params.get('concat').get('concat'),
        pool=params.get('concat').get('pool'),
        num_filters=params.get('concat').get('num_filters'),
        l2_weight_decay=params.get('concat').get('l2_weight_decay'),
        lstm_dim=params.get('concat').get('lstm_dim'),
        dropout_val=params.get('concat').get('dropout_val'),
        dense_dim=params.get('concat').get('dense_dim'),
        add_sigmoid=True,
        train_embeds=params.get('concat').get('train_embeds'),
        gpus=gpus)

    models = []
    for model_label in params.get('models'):
        if model_label == 'cnn':
            models.append([model_label, cnn])
        elif model_label == 'dense':
            # NOTE(review): `dense` is never defined in this function —
            # selecting the 'dense' model raises NameError. A
            # get_dense(...) construction appears to be missing.
            models.append([model_label, dense])
        elif model_label == 'lstm':
            models.append([model_label, lstm])
        elif model_label == 'concat':
            models.append([model_label, concat])
        else:
            raise ValueError(
                'Invalid model {}. Model hasn`t defined.'.format(model_label))

    for i in range(len(models)):
        model_label, model = models[i]
        logger.info("training {} ...".format(model_label))
        if params.get(model_label).get('warm_start') and os.path.exists(
                params.get(model_label).get('model_file')):
            # Reuse a previously trained model from disk.
            logger.info('{} warm starting...'.format(model_label))
            model = load_model(params.get(model_label).get('model_file'))
            models[i][1] = model
        else:
            hist = train(
                x_train_nn,
                y_train_nn,
                model,
                batch_size=params.get(model_label).get('batch_size'),
                num_epochs=params.get(model_label).get('num_epochs'),
                learning_rate=params.get(model_label).get('learning_rate'),
                early_stopping_delta=params.get(model_label).get(
                    'early_stopping_delta'),
                early_stopping_epochs=params.get(model_label).get(
                    'early_stopping_epochs'),
                use_lr_strategy=params.get(model_label).get('use_lr_strategy'),
                lr_drop_koef=params.get(model_label).get('lr_drop_koef'),
                epochs_to_drop=params.get(model_label).get('epochs_to_drop'),
                logger=logger)
        # Hold-out predictions feed the meta-model; test predictions are
        # stored as columns on test_df for the final blending step.
        y_nn.append(model.predict(x_test_nn))
        save_predictions(test_df, model.predict(test_df_seq), target_labels,
                         model_label)
        metrics = get_metrics(y_test_nn, y_nn[-1], target_labels)
        logger.debug('{} metrics:\n{}'.format(model_label,
                                              print_metrics(metrics)))
        logger.debug('Model path = {}'.format(model_file[model_label]))
        model.save(model_file[model_label])

    # TFIDF + LogReg: one binary logistic regression per target label.
    logger.info('training LogReg over tfidf...')
    train_tfidf, val_tfidf, test_tfidf, word_tfidf, char_tfidf = get_tfidf(
        train_df['comment_text_clear'].values[train_idxs],
        train_df['comment_text_clear'].values[test_idxs],
        test_df['comment_text_clear'].values)

    models_lr = []
    metrics_lr = {}
    y_tfidf = []
    for i, label in enumerate(target_labels):
        model = LogisticRegression(C=4.0,
                                   solver='sag',
                                   max_iter=1000,
                                   n_jobs=16)
        model.fit(train_tfidf, y_train_nn[:, i])
        y_tfidf.append(model.predict_proba(val_tfidf)[:, 1])
        test_df['tfidf_{}'.format(label)] = model.predict_proba(
            test_tfidf)[:, 1]
        metrics_lr[label] = calc_metrics(y_test_nn[:, i], y_tfidf[-1])
        models_lr.append(model)
        joblib.dump(model, model_file['lr'].format(label))
    metrics_lr['Avg'] = {
        'Logloss':
        np.mean([metric['Logloss'] for label, metric in metrics_lr.items()])
    }
    logger.debug('LogReg(TFIDF) metrics:\n{}'.format(
        print_metrics(metrics_lr)))

    # Bow for catboost: most informative words become binary BOW features.
    if params.get('catboost').get('add_bow'):
        top_pos_words = []
        top_neg_words = []
        for i in range(num_classes):
            top_pos_words.append([])
            top_neg_words.append([])
            top_pos_words[-1], top_neg_words[
                -1] = get_most_informative_features(
                    [word_tfidf, char_tfidf],
                    models_lr[i],
                    n=params.get('catboost').get('bow_top'))
        top_pos_words = list(
            set(
                np.concatenate([[val for score, val in top]
                                for top in top_pos_words])))
        top_neg_words = list(
            set(
                np.concatenate([[val for score, val in top]
                                for top in top_neg_words])))
        top = list(set(np.concatenate([top_pos_words, top_neg_words])))
        train_bow = get_bow(train_df['comment_text_clear'].values[train_idxs],
                            top)
        val_bow = get_bow(train_df['comment_text_clear'].values[test_idxs],
                          top)
        test_bow = get_bow(test_df['comment_text_clear'].values, top)
        logger.debug('Count bow words = {}'.format(len(top)))

    # Meta catboost: stack NN predictions + text-length stats + LogReg
    # probabilities (+ optional BOW) as features for a per-label CatBoost.
    logger.info('training catboost as metamodel...')
    train_df['text_unique_len'] = train_df['comment_text_clear'].apply(
        calc_text_uniq_words)
    test_df['text_unique_len'] = test_df['comment_text_clear'].apply(
        calc_text_uniq_words)
    train_df['text_unique_koef'] = train_df['text_unique_len'] / train_df[
        'text_len']
    test_df[
        'text_unique_koef'] = test_df['text_unique_len'] / test_df['text_len']

    text_len_features = train_df[[
        'text_len', 'text_unique_len', 'text_unique_koef'
    ]].values[test_idxs]

    x_train_catboost = []
    y_train_catboost = y_test_nn
    features = y_nn
    features.extend([text_len_features, np.array(y_tfidf).T])
    if params.get('catboost').get('add_bow'):
        features.append(val_bow)
    # One stacked feature vector per hold-out sample.
    for feature in zip(*features):
        x_train_catboost.append(np.concatenate(feature))

    models_cb = []
    metrics_cb = {}
    x_train_cb, x_val_cb, y_train_cb, y_val_cb = train_test_split(
        x_train_catboost, y_train_catboost, test_size=0.20, random_state=42)
    for i, label in enumerate(target_labels):
        model = CatBoostClassifier(
            loss_function='Logloss',
            iterations=params.get('catboost').get('iterations'),
            depth=params.get('catboost').get('depth'),
            rsm=params.get('catboost').get('rsm'),
            learning_rate=params.get('catboost').get('learning_rate'),
            device_config=params.get('catboost').get('device_config'))
        model.fit(x_train_cb,
                  y_train_cb[:, i],
                  eval_set=(x_val_cb, y_val_cb[:, i]),
                  use_best_model=True)
        y_hat_cb = model.predict_proba(x_val_cb)
        metrics_cb[label] = calc_metrics(y_val_cb[:, i], y_hat_cb[:, 1])
        models_cb.append(model)
        joblib.dump(model, model_file['catboost'].format(label))
    metrics_cb['Avg'] = {
        'Logloss':
        np.mean([metric['Logloss'] for label, metric in metrics_cb.items()])
    }
    logger.debug('CatBoost metrics:\n{}'.format(print_metrics(metrics_cb)))

    # ====Predict====
    # Rebuild the same stacked feature layout for the real test set.
    logger.info('Applying models...')
    text_len_features = test_df[[
        'text_len', 'text_unique_len', 'text_unique_koef'
    ]].values
    y_tfidf_test = test_df[[
        'tfidf_{}'.format(label) for label in target_labels
    ]].values
    x_test_cb = []
    features = []
    for model_label, _ in models:
        features.append(test_df[[
            '{}_{}'.format(model_label, label) for label in target_labels
        ]].values)
    features.extend([text_len_features, y_tfidf_test])
    if params.get('catboost').get('add_bow'):
        features.append(test_bow)
    for feature in tqdm(zip(*features)):
        x_test_cb.append(np.concatenate(feature))

    for label, model in zip(target_labels, models_cb):
        pred = model.predict_proba(x_test_cb)
        test_df[label] = np.array(list(pred))[:, 1]

    # ====Normalize probabilities====
    if norm_prob:
        for label in target_labels:
            test_df[label] = norm_prob_koef * test_df[label]

    # ====Save results====
    logger.info('Saving results...')
    test_df[[
        'id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
        'identity_hate'
    ]].to_csv(result_fname, index=False, header=True)
def main(*kargs, **kwargs):
    """Toxic-comment classification pipeline (ensemble v2).

    Modes: 'preprocess' (clean text, build sequences/embeddings, stop),
    'validate' (train base models, write metrics, stop), 'all' (also
    train a CatBoost meta-model and write the submission CSV).

    All configuration arrives via **kwargs (see get_kwargs).
    """
    get_kwargs(kwargs)
    train_fname = kwargs['train']
    test_fname = kwargs['test']
    result_fname = kwargs['output']
    word_embeds_fname = kwargs['word_embeds']
    char_embeds_fname = kwargs['char_embeds']
    logger_fname = kwargs['logger']
    mode = kwargs['mode']
    max_words = kwargs['max_words']
    use_only_exists_words = kwargs['use_only_exists_words']
    swear_words_fname = kwargs['swear_words']
    wrong_words_fname = kwargs['wrong_words']
    embeds_format = kwargs['format_embeds']
    config = kwargs['config']
    output_dir = kwargs['output_dir']
    norm_prob = kwargs['norm_prob']
    norm_prob_koef = kwargs['norm_prob_koef']
    gpus = kwargs['gpus']

    # Column names encode the preprocessing settings so different runs
    # can coexist in the same CSV (lw = lowercase words, ll3 = char 3-grams).
    seq_col_name_words = 'comment_seq_lw_use_exist{}_{}k'.format(
        int(use_only_exists_words), int(max_words / 1000))
    seq_col_name_ll3 = 'comment_seq_ll3_use_exist{}_{}k'.format(
        int(use_only_exists_words), int(max_words / 1000))

    # Persisted model paths ('lr'/'catboost' are per-label templates).
    model_file = {
        'dense': os.path.join(output_dir, 'dense.h5'),
        'cnn': os.path.join(output_dir, 'cnn.h5'),
        'lstm': os.path.join(output_dir, 'lstm.h5'),
        'lr': os.path.join(output_dir, '{}_logreg.bin'),
        'catboost': os.path.join(output_dir, '{}_catboost.bin')
    }

    # ====Create logger====
    logger = Logger(logging.getLogger(), logger_fname)

    # ====Detect GPUs====
    logger.debug(device_lib.list_local_devices())

    # ====Load data====
    logger.info('Loading data...')
    train_df = load_data(train_fname)
    test_df = load_data(test_fname)

    target_labels = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
        'identity_hate'
    ]
    num_classes = len(target_labels)

    # ====Load additional data====
    logger.info('Loading additional data...')
    swear_words = load_data(swear_words_fname,
                            func=lambda x: set(x.T[0]),
                            header=None)
    wrong_words_dict = load_data(wrong_words_fname,
                                 func=lambda x: {val[0]: val[1] for val in x})

    # ====Load word vectors====
    logger.info('Loading embeddings...')
    embeds_word = Embeds().load(word_embeds_fname, embeds_format)
    embeds_ll3 = Embeds().load(char_embeds_fname, embeds_format)

    # ====Clean texts====
    if mode in ('preprocess', 'all'):
        logger.info('Cleaning text...')
        train_df['comment_text_clear'] = clean_text(train_df['comment_text'],
                                                    wrong_words_dict,
                                                    autocorrect=True)
        test_df['comment_text_clear'] = clean_text(test_df['comment_text'],
                                                   wrong_words_dict,
                                                   autocorrect=True)
        train_df.to_csv(os.path.join(output_dir, 'train_clear.csv'),
                        index=False)
        test_df.to_csv(os.path.join(output_dir, 'test_clear.csv'),
                       index=False)

    # ====Calculate maximum seq length====
    logger.info('Calc text length...')
    train_df.fillna('__NA__', inplace=True)
    test_df.fillna('__NA__', inplace=True)
    train_df['text_len'] = train_df['comment_text_clear'].apply(
        lambda words: len(words.split()))
    test_df['text_len'] = test_df['comment_text_clear'].apply(
        lambda words: len(words.split()))
    # mean + 3*std covers nearly all comments without padding to the max.
    max_seq_len = np.round(train_df['text_len'].mean() +
                           3 * train_df['text_len'].std()).astype(int)
    max_char_seq_len = 2000  # empirical
    logger.debug('Max seq length = {}'.format(max_seq_len))

    # ====Prepare data to NN====
    logger.info('Converting texts to sequences...')
    if mode in ('preprocess', 'all'):
        train_df[seq_col_name_words], test_df[
            seq_col_name_words], word_index, train_df[
                seq_col_name_ll3], test_df[
                    seq_col_name_ll3], ll3_index = convert_text2seq(
                        train_df['comment_text_clear'].tolist(),
                        test_df['comment_text_clear'].tolist(),
                        max_words,
                        max_seq_len,
                        max_char_seq_len,
                        embeds_word,
                        lower=True,
                        oov_token='__NA__',
                        uniq=False,
                        use_only_exists_words=use_only_exists_words)
        logger.debug('Dictionary size use_exist{} = {}'.format(
            int(use_only_exists_words), len(word_index)))
        logger.debug('Char dict size use_exist{} = {}'.format(
            int(use_only_exists_words), len(ll3_index)))

        logger.info('Preparing embedding matrix...')
        words_not_found = embeds_word.set_matrix(max_words, word_index)
        # Char 3-gram embeddings are initialized randomly (same width as
        # the word embeddings) and learned during training.
        embeds_ll3.matrix = np.random.normal(size=(len(ll3_index),
                                                   embeds_word.shape[1]))
        embeds_ll3.word_index = ll3_index
        embeds_ll3.word_index_reverse = {
            val: key
            for key, val in ll3_index.items()
        }
        embeds_ll3.shape = np.shape(embeds_ll3.matrix)
        embeds_word.save(
            os.path.join(output_dir,
                         'wiki.embeds_lw.{}k'.format(int(max_words / 1000))))
        embeds_ll3.save(
            os.path.join(output_dir,
                         'wiki.embeds_ll3.{}k'.format(int(max_words / 1000))))

        # ====Get text vector====
        # Pool each comment's word vectors into one fixed-size vector.
        pooling = {
            'max': {
                'func': np.max
            },
            'avg': {
                'func': np.sum,
                'normalize': True
            },
            'sum': {
                'func': np.sum,
                'normalize': False
            }
        }
        for p in ['max', 'avg', 'sum']:
            train_df['comment_vec_{}'.format(
                p)] = train_df[seq_col_name_words].apply(
                    lambda x: embed_aggregate(x, embeds_word, **pooling[p]))
            test_df['comment_vec_{}'.format(
                p)] = test_df[seq_col_name_words].apply(
                    lambda x: embed_aggregate(x, embeds_word, **pooling[p]))
        train_df.to_csv(os.path.join(output_dir, 'train_clear1.csv'),
                        index=False)
        test_df.to_csv(os.path.join(output_dir, 'test_clear1.csv'),
                       index=False)
    else:
        # Sequences/vectors were serialized to CSV as strings on a previous
        # run; parse them back into numeric lists.
        for col in train_df.columns:
            if col.startswith('comment_seq'):
                train_df[col] = train_df[col].apply(
                    lambda x: parse_seq(x, int))
                test_df[col] = test_df[col].apply(lambda x: parse_seq(x, int))
            elif col.startswith('comment_vec'):
                train_df[col] = train_df[col].apply(
                    lambda x: parse_seq(x, float))
                test_df[col] = test_df[col].apply(
                    lambda x: parse_seq(x, float))

    logger.debug('Embedding matrix shape = {}'.format(embeds_word.shape))
    logger.debug('Number of null word embeddings = {}'.format(
        np.sum(np.sum(embeds_word.matrix, axis=1) == 0)))

    # ====END OF `PREPROCESS`====
    if mode == 'preprocess':
        return True

    # ====Train/test split data====
    x = np.array(train_df[seq_col_name_words].values.tolist())
    y = np.array(train_df[target_labels].values.tolist())
    x_train_nn, x_val_nn, y_train, y_val, train_idxs, val_idxs = split_data(
        x, y, test_size=0.2, shuffle=True, random_state=42)
    x_test_nn = np.array(test_df[seq_col_name_words].values.tolist())

    x_char = np.array(train_df[seq_col_name_ll3].values.tolist())
    x_char_train_nn = x_char[train_idxs]
    x_char_val_nn = x_char[val_idxs]
    x_char_test_nn = np.array(test_df[seq_col_name_ll3].values.tolist())

    x_train_tfidf = train_df['comment_text_clear'].values[train_idxs]
    x_val_tfidf = train_df['comment_text_clear'].values[val_idxs]
    x_test_tfidf = test_df['comment_text_clear'].values

    catboost_cols = catboost_features(train_df, test_df)
    x_train_cb = train_df[catboost_cols].values[train_idxs].T
    x_val_cb = train_df[catboost_cols].values[val_idxs].T
    x_test_cb = test_df[catboost_cols].values.T

    # ====Train models====
    # NOTE(review): `cnn`, `dense` and `rnn` are presumably model factory
    # functions imported at file level — not visible in this excerpt.
    nn_models = {'cnn': cnn, 'dense': dense, 'rnn': rnn}
    params = Params(config)
    # NOTE(review): this local `metrics` dict shadows any `metrics` module
    # imported at file level for the rest of the function.
    metrics = {}
    predictions = {}
    for param in params['models']:
        for model_label, model_params in param.items():
            if model_params.get('common', {}).get(
                    'warm_start', False) and os.path.exists(
                        model_params.get('common', {}).get('model_file', '')):
                logger.info('{} warm starting...'.format(model_label))
                model = load_model(
                    model_params.get('common', {}).get('model_file', None))
            elif model_label in nn_models:
                model = nn_models[model_label](embeds_word.matrix,
                                               embeds_ll3.matrix,
                                               num_classes,
                                               max_seq_len,
                                               max_char_seq_len,
                                               gpus=gpus,
                                               **model_params['init'])
                model_alias = model_params.get('common', {}).get('alias',
                                                                 None)
                if model_alias is None or not model_alias:
                    # NOTE(review): `i` is not defined in this loop (the
                    # loop variables are `param`/`model_label`) — this
                    # fallback raises NameError or uses a stale binding.
                    model_alias = '{}_{}'.format(model_label, i)
                logger.info("training {} ...".format(model_label))
                # The 'dense' model consumes both word and char sequences.
                if model_label == 'dense':
                    x_tr = [x_train_nn, x_char_train_nn]
                    x_val = [x_val_nn, x_char_val_nn]
                    x_test = [x_test_nn, x_char_test_nn]
                else:
                    x_tr = x_train_nn
                    x_val = x_val_nn
                    x_test = x_test_nn
                hist = train(x_tr,
                             y_train,
                             model,
                             logger=logger,
                             **model_params['train'])
                predictions[model_alias] = model.predict(x_val)
                save_predictions(test_df, model.predict(x_test),
                                 target_labels, model_alias)
            elif model_label == 'tfidf':
                # NOTE(review): this branch (and 'catboost' below) reuses
                # `model_alias` from a previous iteration — it is never
                # computed for non-NN models; looks like a bug.
                model = TFIDF(target_labels, **model_params['init'])
                model.fit(x_train_tfidf, y_train, **model_params['train'])
                predictions[model_alias] = model.predict(x_val_tfidf)
                save_predictions(test_df, model.predict(x_test_tfidf),
                                 target_labels, model_alias)
            elif model_label == 'catboost':
                model = CatBoost(target_labels, **model_params['init'])
                model.fit(x_train_cb,
                          y_train,
                          eval_set=(x_val_cb, y_val),
                          use_best_model=True)
                predictions[model_alias] = model.predict_proba(x_val_cb)
                save_predictions(test_df, model.predict_proba(x_test_cb),
                                 target_labels, model_alias)
            # Validation metrics for this model, keyed by its alias.
            metrics[model_alias] = get_metrics(y_val,
                                               predictions[model_alias],
                                               target_labels)
            logger.debug('{} params:\n{}'.format(model_alias, model_params))
            logger.debug('{} metrics:\n{}'.format(
                model_alias, print_metrics(metrics[model_alias])))
            model.save(
                os.path.join(output_dir,
                             model_params['common']['model_file']))

    logger.info('Saving metrics...')
    with open(os.path.join(output_dir, 'metrics.json'), 'w') as f:
        f.write(json.dumps(metrics))

    # ====END OF `VALIDATE`====
    if mode == 'validate':
        return True

    # Meta catboost: stack every base model's validation predictions as
    # features for the meta-model.
    logger.info('training catboost as metamodel...')
    x_meta = [
        predictions[model_alias] for model_alias in sorted(predictions.keys())
    ]
    # NOTE(review): `x_train_meta` is used here before it is ever assigned
    # (it is only bound by the train_test_split below) — this raises
    # NameError; almost certainly should be `np.array(x_meta).T`.
    x_meta = np.array(x_train_meta).T
    x_train_meta, x_val_meta, y_train_meta, y_val_meta = train_test_split(
        x_meta, y_val, test_size=0.20, random_state=42)
    meta_model = CatBoost(target_labels,
                          loss_function='Logloss',
                          iterations=1000,
                          depth=6,
                          learning_rate=0.03,
                          rsm=1)
    meta_model.fit(x_train_meta,
                   y_train_meta,
                   eval_set=(x_val_meta, y_val_meta),
                   use_best_model=True)
    y_hat_meta = meta_model.predict_proba(x_val_meta)
    metrics_meta = get_metrics(y_val_meta, y_hat_meta, target_labels)
    #model.save(os.path.join(output_dir, 'meta.catboost')
    logger.debug('{} metrics:\n{}'.format('META',
                                          print_metrics(metrics_meta)))

    # ====Predict====
    # Base-model test predictions were saved as '<alias>_<label>' columns.
    logger.info('Applying models...')
    test_cols = []
    for model_alias in sorted(predictions.keys()):
        for label in target_labels:
            test_cols.append('{}_{}'.format(model_alias, label))
    x_test = test_df[test_cols].values
    preds = meta_model.predict_proba(x_test)
    for i, label in enumerate(target_labels):
        test_df[label] = preds[:, i]

    # ====Normalize probabilities====
    if norm_prob:
        for label in target_labels:
            test_df[label] = norm_prob_koef * test_df[label]

    # ====Save results====
    logger.info('Saving results...')
    test_df[[
        'id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
        'identity_hate'
    ]].to_csv(result_fname, index=False, header=True)
    test_df.to_csv('{}_tmp'.format(result_fname), index=False, header=True)
    # (Tail of a function whose `def` line is above this excerpt: packs the
    # collected image paths and face counts into a DataFrame and returns it.)
    data = pd.DataFrame({'PATH': img_list, 'COUNTS': face_counts})
    return data


def print_output(paths, counts):
    """Print one formatted line per image using config.OUTPUT_FORMAT."""
    for i, path in enumerate(paths):
        # OUTPUT_FORMAT is a %-style template taking (filename, count).
        print(config.OUTPUT_FORMAT % (path.name, counts[i]))


if __name__ == '__main__':
    #Path.cwd().n
    # Parse arguments
    parser = argparse.ArgumentParser(description='Train/Test Model')
    parser.add_argument('path',
                        type=existing_path,
                        help='image file/directory for testing')
    parser.add_argument('--model',
                        type=str,
                        default='MTCNN',
                        choices=['MTCNN'],
                        help='The model to use for face counts, default MTCNN')
    parser.add_argument('--print-metrics',
                        action='store_true',
                        help='Compute and Output any metrics')
    args = parser.parse_args()

    logger = get_logger()
    logger.info("App Log initiated")

    # Build the dataset of image paths, run the face counter over it,
    # and report per-image counts (plus metrics when requested).
    data = get_data(args.path)
    logger.info("Data built with size %d" % (len(data)))
    model = models.get_model(args.model)()
    logger.info("Model built")
    counts = model.count_faces(data['PATH'])
    print_output(data['PATH'], counts)
    if args.print_metrics:
        metrics.print_metrics(data, counts)
    logger.info("Output/metrics printed, app done!")
from metrics import print_metrics
from PIL import Image
import numpy as np

# Smoke-test print_metrics by comparing a greyscale image against itself:
# load, convert to single-channel luminance, and turn into an ndarray.
source = Image.open('/home/maria/Documents/test/pics/lena.jpg')
image = np.array(source.convert('L'))
print_metrics(image, image, image)
def run_train_and_save_classifier(self,
                                  save_classifier=True,
                                  locs_in=None,
                                  data_split_ratio=0.15):
    """Train a classifier on data from the given locations and print metrics.

    The data is split into train/test sets stratified by location and class
    label, ensuring that clips from the same recording never end up in both
    splits (to avoid leakage).

    Args:
        save_classifier: if True, the trained classifier is persisted to
            ``self.output_folder`` by ``utilities.train_and_test``.
        locs_in: list of location short-codes; defaults to ``["SAB"]``.
        data_split_ratio: fraction of recordings held out for testing.
    """
    # FIX: avoid a shared mutable default argument; ["SAB"] stays the
    # effective default, so callers see identical behavior.
    if locs_in is None:
        locs_in = ["SAB"]
    locations = utilities.get_locations(locs_in)
    print(" \n\n--- Train a classifier for locations: {}\n".format(
        locations))
    print("Data split into train and test set at ratio: {}\n".format(
        data_split_ratio))

    data_in = self.data.get_data()
    data_in = data_in[data_in["Environment"].isin(locations)]
    label_encoder, pipeline = utilities.prepare_skl_interface(
        data_in, self.classifier)

    # Stratified split into train and test — stratification considers
    # location and the class labels.
    temp_df = data_in[['Environment', 'Recording ID', 'Class']]
    temp_df = temp_df.drop(
        temp_df[temp_df['Class'] == 'front'].index)  # avoid repeated Recording IDs
    train_bags, test_bags = train_test_split(
        temp_df,
        test_size=data_split_ratio,
        random_state=self.random_state,
        stratify=temp_df[['Environment', 'Class']])

    # Sanity check: no recording may contribute samples to both splits.
    for bag in list(test_bags['Recording ID']):
        if bag in list(train_bags['Recording ID']):
            print("Error: {}".format(bag))

    train_data = data_in[data_in['Recording ID'].isin(
        train_bags['Recording ID'])]
    test_data = data_in[data_in['Recording ID'].isin(
        test_bags['Recording ID'])]

    accuracy, conf_mat = utilities.train_and_test(
        train_data, test_data, pipeline, label_encoder, self.srp_dict,
        save_cls=save_classifier, out_folder=self.output_folder)

    # Metric tuples are (value, spread); spreads are zero for a single run.
    all_metrics = {
        "overall_accuracy": (accuracy, 0),
        "per_class_accuracy": (metrics.getPCaccuracy(conf_mat), np.zeros(4)),
        "per_class_precision": (metrics.getPCPrecision(conf_mat), np.zeros(4)),
        "per_class_recall": (metrics.getPCRecall(conf_mat), np.zeros(4)),
        "per_class_iou": (metrics.getPCIoU(conf_mat), np.zeros(4))
    }
    metrics.print_metrics(all_metrics, self.paper_metrics_only)
def _train_one_epoch(self, criterion, optimizer, training_data_loader,
                     train_metrics, train_metrics_results, epoch,
                     global_step, scheduler):
    """Run one training epoch over ``training_data_loader``.

    Performs forward/backward per batch, steps the optimizer (optionally
    aggregating gradients over several batches), logs per-batch losses and
    learning rates to TensorBoard, and appends each metric's epoch result to
    ``train_metrics_results``.

    Returns the updated ``global_step`` (incremented once per batch).
    """
    # Number of batches whose gradients are accumulated before an optimizer
    # step; 1 means a step after every batch.
    aggregate_batches = 1
    for m in train_metrics:
        m.reset()
    if self.state.cuda:
        self.model.cuda()
    self.model.train()
    optimizer.zero_grad()
    for idx, batch in enumerate(training_data_loader):
        # Batches are (inputs, targets) where each side is a *list* of tensors.
        assert (isinstance(batch[0], list) and isinstance(batch[1], list))
        data = [Variable(b) for b in batch[0]]
        target = [Variable(b, requires_grad=False) for b in batch[1]]
        if self.state.cuda:
            data = [d.cuda() for d in data]
            target = [t.cuda() for t in target]
        output = self.model(data)
        # A tuple/list of criteria is averaged; a single criterion as-is.
        if isinstance(criterion, (tuple, list)):
            loss_val = [c(output, target) for c in criterion]
            loss = sum(loss_val) / (len(loss_val))
        else:
            loss_val = criterion(output, target)
            loss = loss_val
        loss.backward()
        # Step only every `aggregate_batches` batches (gradient accumulation).
        if (idx + 1) % aggregate_batches == 0:
            #for name, param in self.model.named_parameters():
            #    self.tb_writer.add_scalar('misc/grad-max-{}'.format(name), torch.max(torch.abs(param.grad)).cpu().numpy(), global_step)
            #for param in self.model.parameters():
            #    param.grad.data = torch.clamp(param.grad.data, min=-1.0,max=1.0)
            optimizer.step()
            optimizer.zero_grad()
            if scheduler is not None:
                scheduler.step()
        for m in train_metrics:
            m.update(output, target)
        # NOTE(review): when `criterion` is a single callable, `loss_val` is
        # presumably still iterable (e.g. the criterion returns a list) —
        # a scalar tensor here would make this loop fail; verify.
        for idx, l in enumerate(loss_val):
            self.tb_writer.add_scalar('loss/loss-{}'.format(idx), l.item(),
                                      global_step)
        for idx, param_group in enumerate(optimizer.param_groups):
            self.tb_writer.add_scalar('misc/lr-{}'.format(idx),
                                      param_group['lr'], global_step)
        global_step = global_step + 1
    # Record and log each metric's result for this epoch.
    for m in train_metrics:
        train_metrics_results[m.name].append(m.get())
        metrics.print_metrics(self.tb_writer, m, 'train/', epoch)
    self.state.optimizer_state = optimizer.state_dict()
    return global_step
def _evaluate_and_save(self, evaluation_data_loader, split_into_tiles,
                       val_metrics, track_metric, val_metrics_results,
                       epoch, comparator):
    """Evaluate the model over ``evaluation_data_loader``, log validation
    metrics, and checkpoint when ``track_metric`` improves.

    Three inference paths:
      * tiled GPU inference (``split_into_tiles`` and not ``self.eval_cpu``),
      * whole-volume CPU inference (``self.eval_cpu``),
      * whole-volume GPU/CPU inference (default).

    ``comparator(val, best)`` decides whether the tracked metric improved;
    if so, ``self.state.best_val`` is updated and the model is saved.
    """
    for m in val_metrics:
        m.reset()
    self.model.eval()
    for batch in evaluation_data_loader:
        gc.collect()
        #torch.cuda.empty_cache()
        # Batches are (inputs, targets) where each side is a list of tensors.
        assert (isinstance(batch[0], list) and isinstance(batch[1], list))
        data = batch[0]
        target = batch[1]
        if split_into_tiles and not self.eval_cpu:
            #TODO: this is a workaround to support tiling for only signle input
            # add tiling for selected inputs ( not just the 0th one)
            # Assemble the full-size output by running the model on
            # overlapping tiles and copying back only each tile's center.
            output = torch.zeros_like(batch[1][0])
            input = batch[0][0]  # NOTE(review): shadows the `input` builtin
            # Tile geometry: each 192^3 tile contributes a 48^3 center,
            # surrounded by a 72-voxel border of context on every side —
            # presumably inputs are 5-D (N, C, D, H, W); verify.
            tile_shape = (192, 192, 192)
            center_shape = (48, 48, 48)
            border = (72, 72, 72)
            # Number of tile centers needed to cover each spatial axis.
            grid = [
                int(np.ceil(j / i))
                for i, j in zip(center_shape, input.shape[2:])
            ]
            for i in range(grid[0]):
                for j in range(grid[1]):
                    for k in range(grid[2]):
                        index_min, index_max = loader_helper.get_indices(
                            position=(i, j, k),
                            center_shape=center_shape,
                            border=border)
                        tile = loader_helper.copy(data=input,
                                                  tile_shape=tile_shape,
                                                  index_min=index_min,
                                                  index_max=index_max)
                        if self.state.cuda:
                            tile = tile.cuda()
                        with torch.no_grad():
                            out = self.model([tile])[0].detach().cpu()
                        loader_helper.copy_back(data=output,
                                                tile=out,
                                                center_shape=center_shape,
                                                index_min=index_min,
                                                index_max=index_max,
                                                border=border)
            output = [output]
        elif self.eval_cpu:
            # Evaluate on CPU (e.g. volumes too large for GPU memory).
            tmp_model = self.model.module.cpu()
            tmp_model.eval()
            with torch.no_grad():
                output = tmp_model(data)
        else:
            with torch.no_grad():
                if self.state.cuda:
                    data = [d.cuda() for d in data]
                    target = [t.cuda() for t in target]
                output = self.model(data)
        for m in val_metrics:
            m.update(target, output)
    # Log every metric; remember the one we track for checkpointing.
    val = 0.0
    for m in val_metrics:
        if m.name == track_metric:
            val = m.get()
        metrics.print_metrics(self.tb_writer, m, 'val/', epoch)
        val_metrics_results[m.name].append(m.get())
    if comparator(val, self.state.best_val):
        self.state.best_val = val
        self._save(suffix='best_model')
        print('model saved')
# Simulation entry point: parse the YAML config, build the scenario, pick
# the router for the greedy matcher, run the simulation, and print metrics.
parser = argparse.ArgumentParser(description="Simulation parameters")
parser.add_argument("--config", help="YAML config")
args = parser.parse_args()

with open(args.config) as cfg:
    config = yaml.load(cfg, Loader=yaml.FullLoader)

configure_root_logger()
configure_csv_logger(config["simulation"]["output"])

context, demand = create_scenario(config)

# Select the router implementation requested by the greedy-matcher section.
router_kind = config["solvers"]["greedy_matcher"]["router"]
if router_kind == "linear":
    router = routers.LinearRouter(context.clock,
                                  config["routers"]["linear"]["speed"])
elif router_kind == "osrm":
    router = routers.OSRMRouter(context.clock,
                                server=config["routers"]["osrm"]["server"])
else:
    raise Exception("Unknown router")
logging.info(f"Matcher router {router}")

matcher = GreedyMatcher(context, router, config)
simulator = Simulator(matcher, context)
simulator.simulate(demand, config["simulation"]["duration"])

print_metrics(config["simulation"]["output"], context.clock)
# NOTE(review): the lines up to `return ...` are the tail of an evaluation
# helper whose `def` lies above this chunk; indentation reconstructed.
    pcd3 = pred_ref_cloud
    o3d.visualization.draw_geometries([pcd1, pcd2, pcd3])

    # Reduce per-sample rotation/translation errors to summary statistics.
    r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic = \
        summary_metrics(r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic)
    return dura, r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic


if __name__ == '__main__':
    # Fix RNG seeds for reproducible evaluation runs.
    seed = 222
    random.seed(seed)
    np.random.seed(seed)

    args = config_params()

    # batch_size=1: registration metrics are computed per cloud pair.
    test_set = CustomData(args.root, args.infer_npts, False)
    test_loader = DataLoader(test_set, batch_size=1, shuffle=False)

    # Dispatch on the requested evaluation method and print its metrics.
    if args.method == 'benchmark':
        dura, r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic = \
            evaluate_benchmark(args, test_loader)
        print_metrics(args.method, dura, r_mse, r_mae, t_mse, t_mae,
                      r_isotropic, t_isotropic)
    elif args.method == 'icp':
        dura, r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic = \
            evaluate_icp(args, test_loader)
        print_metrics(args.method, dura, r_mse, r_mae, t_mse, t_mae,
                      r_isotropic, t_isotropic)
    else:
        raise NotImplementedError
def train_model(model, dataloaders, policy_learner, optimizer, scheduler,
                num_epochs, device, writer, n_images=None):
    """Train ``model`` for ``num_epochs``, drawing a fresh training loader
    from ``policy_learner`` each epoch and logging to ``writer``.

    Args:
        model: network to train/evaluate.
        dataloaders: dict with at least a 'val' loader; the 'train' loader
            is produced by calling ``policy_learner()`` every epoch.
        policy_learner: callable returning the training dataloader.
        optimizer: torch optimizer.
        scheduler: LR scheduler stepped once per epoch, or None/falsy.
        num_epochs: number of epochs to run.
        device: device inputs/labels are moved to.
        writer: TensorBoard-style summary writer.
        n_images: optional dict of per-phase sample counters; created when
            None so repeated calls can resume the global step counters.

    Returns:
        (model, n_images): the trained model and the updated counters.
    """
    loader = {'val': dataloaders['val']}
    best_loss = 1e10

    if n_images is None:
        n_images = {'train': 0, 'val': 0}

    for epoch in range(num_epochs):
        # A fresh training loader per epoch lets the policy adapt sampling.
        loader['train'] = policy_learner()

        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        since = time.time()

        # Each epoch has a training and validation phase.
        for phase in ['train', 'val']:
            if phase == 'train':
                if scheduler:
                    scheduler.step()
                for param_group in optimizer.param_groups:
                    print("LR", param_group['lr'])
                model.train()  # Set model to training mode
            else:
                model.eval()  # Set model to evaluate mode

            # Running sums of loss/F1 weighted by batch size.
            running = defaultdict(float)
            epoch_samples = 0

            for enum_id, (idxs, inputs, labels) in tqdm(
                    enumerate(loader[phase]), total=len(loader[phase])):
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward — track history only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    loss = dice_loss(outputs, labels)
                    acc_f1 = calc_f1(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        plot_grad_flow(epoch, enum_id,
                                       model.named_parameters())
                        optimizer.step()

                # statistics
                epoch_samples += inputs.size(0)
                n_images[phase] += inputs.size(0)
                writer.add_scalar(f'{phase}/loss', loss.data.cpu().numpy(),
                                  n_images[phase])

                # FIX: accumulate plain Python floats, not tensors —
                # summing the live `loss` tensor keeps autograd graphs
                # reachable across iterations and leaks memory over an epoch.
                running['loss'] += loss.item() * inputs.size(0)
                running['f1'] += float(acc_f1) * inputs.size(0)

            print_metrics(writer, running, epoch_samples, phase)
            epoch_loss = running['loss'] / epoch_samples
            writer.add_scalar(f'{phase}/epoch_loss', epoch_loss, epoch)
            epoch_f1 = running['f1'] / epoch_samples
            writer.add_scalar(f'{phase}/epoch_F1', epoch_f1, epoch)

        time_elapsed = time.time() - since
        print('{:.0f}m {:.0f}s'.format(time_elapsed // 60,
                                       time_elapsed % 60))

    # NOTE(review): best-model checkpointing was commented out upstream, so
    # `best_loss` is never updated and this prints the initial sentinel.
    print('Best val loss: {:4f}'.format(best_loss))

    return model, n_images