def fit(cnf, predict, per_patient, features_file, n_iter, blend_cnf, test_dir):
    config = util.load_module(cnf).config
    image_files = data.get_image_files(config.get('train_dir'))
    names = data.get_names(image_files)
    labels = data.get_labels(names).astype(np.float32)[:, np.newaxis]

    if features_file is not None:
        runs = {'run': [features_file]}
    else:
        runs = data.parse_blend_config(yaml.load(open(blend_cnf)))

    scalers = {run: StandardScaler() for run in runs}

    tr, te = data.split_indices(image_files, labels)

    y_preds = []
    for i in range(n_iter):
        print("iteration {} / {}".format(i + 1, n_iter))
        for run, files in runs.items():
            print("fitting features for run {}".format(run))
            X = data.load_features(files)
            X = scalers[run].fit_transform(X)
            X = data.per_patient_reshape(X) if per_patient else X
            est = get_estimator(X.shape[1], image_files, labels,
                                eval_size=0.0 if predict else 0.1)
            est.fit(X, labels)
            if not predict:
                y_pred = est.predict(X[te]).ravel()
                y_preds.append(y_pred)
                y_pred = np.mean(y_preds, axis=0)
                y_pred = np.clip(np.round(y_pred).astype(int),
                                 np.min(labels), np.max(labels))
                print("kappa after run {}, iter {}: {}".format(
                    run, i, util.kappa(labels[te], y_pred)))
                print("confusion matrix")
                print(confusion_matrix(labels[te], y_pred))
            else:
                X = data.load_features(files, test=True)
                X = scalers[run].transform(X)
                X = data.per_patient_reshape(X) if per_patient else X
                y_pred = est.predict(X).ravel()
                y_preds.append(y_pred)

    if predict:
        y_pred = np.mean(y_preds, axis=0)
        y_pred = np.clip(np.round(y_pred), np.min(labels),
                         np.max(labels)).astype(int)
        submission_filename = util.get_submission_filename()
        image_files = data.get_image_files(test_dir or config.get('test_dir'))
        names = data.get_names(image_files)
        image_column = pd.Series(names, name='image')
        level_column = pd.Series(y_pred, name='level')
        predictions = pd.concat([image_column, level_column], axis=1)
        print("tail of predictions file")
        print(predictions.tail())
        predictions.to_csv(submission_filename, index=False)
        print("saved predictions to {}".format(submission_filename))
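# A minimal sketch of the module-level imports the fit() snippet above appears
# to assume; `data`, `util`, and `get_estimator` are project-local names from
# the original repository and are not reproduced here.
import yaml
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix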
def process_post(self, post):
    """
    Process post received from the message queue.
    """
    # is this a post matching one or more persons?
    post_add = False
    text = data.normalize(post['text']).lower()
    self.first_person = None
    # check post language
    if data.get_text_language(text) == 'fr':
        for person in self.persons:
            names = data.get_names(person)
            if data.check_names(names, text, person['words']) == 1:
                # one more post for this person
                if not post_add:
                    post_add = True
                    # get next post id
                    post_id = self.db.incr('nextPostId')
                # add post to person's posts list
                key = 'person:%d:posts:%d' % (person['id'],
                                              self.stats_last_update)
                self.db.rpush(key, post_id)
                # update stats for this person
                self.update_person_stats(person)
        if post_add:
            # add post to db
            self.db.set_post(int(post_id), json.dumps(post))
            # add post id to current hour
            key = 'posts:%d' % (self.stats_last_update)
            self.db.rpush(key, post_id)
    else:
        logging.debug('found english word in %s', text)
def main(cnf, weights_from):
    config = util.load_module(cnf).config
    # print(config)
    if weights_from is None:
        weights_from = config.weights_file
    else:
        weights_from = str(weights_from)

    print(config.get('train_dir'))
    files = data.get_image_files(config.get('train_dir'))
    names = data.get_names(files)
    labels = data.get_labels(names).astype(np.float32)

    print("Checkpoint 5")
    net = create_net(config)
    print("Checkpoint 6")
    print(weights_from)
    # print(net.load_params_from())

    try:
        print("Checkpoint 7")
        net.load_params_from(weights_from)
        print("Checkpoint 8")
        print("loaded weights from {}".format(weights_from))
    except IOError:
        print("couldn't load weights, starting from scratch")

    print("fitting ...")
    print(files)
    print(labels)
    net.fit(files, labels)
def main(cnf, weights_from):
    config = util.load_module(cnf).config
    if weights_from is None:
        weights_from = config.weights_file
    else:
        weights_from = str(weights_from)

    files = data.get_image_files(config.get('train_dir'))
    names = data.get_names(files)
    labels = data.get_labels(names).astype(np.float32)
    net = create_net(config)

    try:
        net.load_params_from(weights_from)
        print("loaded weights from {}".format(weights_from))
    except IOError:
        print("couldn't load weights, starting from scratch")

    print("Shape of files: " + str(files.shape))
    print("Shape of labels: " + str(labels.shape))

    start = time.time()
    print("fitting ...")
    net.fit(files, labels)
    end = time.time()
    print("Time elapsed for fitting: " + str(end - start))
def main(cnf, classes, weights_from, predict):
    config = util.load_module(cnf).config
    files = data.get_image_files(config.get('train_dir'))
    names = data.get_names(files)
    names = [int(x) for x in names]
    data.classes = int(classes)
    labels = data.get_labels(names)
    net = create_net(config)
    print(files.shape)
    print(labels.shape)

    if predict:
        if weights_from is None:
            weights_from = config.weights_file
        else:
            weights_from = str(weights_from)
        print(weights_from)
        try:
            net.load_params_from(weights_from)
            print("loaded weights from {}".format(weights_from))
        except IOError:
            print("couldn't load weights, starting from scratch")

    if not predict:
        print("fitting ...")
        net.fit(files, labels)
    else:
        print("predicting ...")
        test_files = data.get_image_files(config.get('test_dir'))
        y_pred = net.predict(test_files)
        y_pred = y_pred.transpose()
        print(y_pred)
        y_pred = np.clip(np.round(y_pred), np.min(labels),
                         np.max(labels)).astype(int)
        # print(y_pred)
        submission_filename = util.get_submission_filename()
        image_files = data.get_image_files(config.get('test_dir'))
        names = data.get_names(image_files)
        image_column = pd.Series(names, name='photo_id')
        level_column = pd.DataFrame(y_pred)  # name='labels')
        level_column = level_column.apply(lambda x: string_submit(x))
        predictions = pd.concat([image_column, level_column], axis=1)
        print("tail of predictions file")
        print(predictions.tail())
        predictions.columns = ['photo_id', 'labels']
        predictions.to_csv(submission_filename, index=False)
        print("saved predictions to {}".format(submission_filename))
def main(directory, convert_directory, test, crop_size, extension):
    try:
        os.mkdir(convert_directory)
    except OSError:
        pass

    filenames = [os.path.join(dp, f) for dp, dn, fn in os.walk(directory)
                 for f in fn if f.endswith('jpeg') or f.endswith('tiff')]
    filenames = sorted(filenames)

    if test:
        names = data.get_names(filenames)
        y = data.get_labels(names)
        for f, level in zip(filenames, y):
            if level == 1:
                try:
                    img = convert(f, crop_size)
                    img.show()
                    Image.open(f).show()
                    real_raw_input = vars(__builtins__).get('raw_input', input)
                    real_raw_input('enter for next')
                except KeyboardInterrupt:
                    exit(0)

    print("Resizing images in {} to {}, this takes a while."
          "".format(directory, convert_directory))

    n = len(filenames)
    # process in batches, sometimes weird things happen with Pool on my machine
    batchsize = 500
    batches = n // batchsize + 1
    pool = Pool(N_PROC)

    args = []
    label = {}
    csv = open('trainLabels.csv')
    csv_lines = csv.readlines()[1:]
    for line in csv_lines:
        line = line.rstrip('\n')
        cols = line.split(',')
        label[cols[0]] = cols[1]
    csv.close()

    for f in filenames:
        args.append((convert, (directory, convert_directory, f, crop_size,
                               extension), label))

    for i in range(batches):
        print("batch {:>2} / {}".format(i + 1, batches))
        pool.map(process, args[i * batchsize: (i + 1) * batchsize])

    pool.close()

    print('done')
def main(directory, convert_directory, test, crop_size, extension):
    try:
        os.mkdir(convert_directory)
    except OSError:
        pass

    filenames = [os.path.join(dp, f) for dp, dn, fn in os.walk(directory)
                 for f in fn if f.endswith('jpeg') or f.endswith('tiff')]
    filenames = sorted(filenames)

    if test:
        names = data.get_names(filenames)
        y = data.get_labels(names)
        for f, level in zip(filenames, y):
            if level == 1:
                try:
                    img = convert(f, crop_size)
                    img.show()
                    Image.open(f).show()
                    real_raw_input = vars(__builtins__).get('raw_input', input)
                    real_raw_input('enter for next')
                except KeyboardInterrupt:
                    exit(0)

    print("Resizing images in {} to {}, this takes a while."
          "".format(directory, convert_directory))

    n = len(filenames)
    # process in batches, sometimes weird things happen with Pool on my machine
    batchsize = 500
    batches = n // batchsize + 1
    pool = Pool(N_PROC)

    args = []
    for f in filenames:
        args.append((convert, (directory, convert_directory, f, crop_size,
                               extension)))

    for i in range(batches):
        print("batch {:>2} / {}".format(i + 1, batches))
        pool.map(process, args[i * batchsize: (i + 1) * batchsize])

    pool.close()

    print('done')
def main(cnf, weights_from, fold, exp_run_folder, train_retina):
    config = util.load_module(cnf).config
    config.cnf['fold'] = fold  # <-- used to change the directories for weights_best, weights_epoch and weights_final
    config.cnf['exp_run_folder'] = exp_run_folder
    protocol = data.settings['protocol']

    if train_retina != 'train_retina':
        folds = yaml.load(open('folds/' + protocol + '.yml'))
        f0, f1 = fold.split('x')
        train_list = folds['Fold_' + f0][int(f1) - 1]
        files = data.get_image_files(config.get('train_dir'), train_list)
    else:
        files = data.get_image_files(config.get('train_dir'))

    if weights_from is None:
        weights_from = config.weights_file
    else:
        weights_from = str(weights_from)

    names = data.get_names(files)
    labels = data.get_labels(
        names, label_file='folds/' + protocol + '.csv').astype(np.int32)
    net = nn.create_net(config)

    try:
        net.load_params_from(weights_from)
        print("loaded weights from {}".format(weights_from))
    except IOError:
        print("couldn't load weights, starting from scratch")

    # print layer info
    print("## Layer information")
    import nolearn
    layer_info = nolearn.lasagne.PrintLayerInfo()
    print(layer_info._get_greeting(net))
    layer_info, legend = layer_info._get_layer_info_conv(net)
    print(layer_info)
    print(legend)

    print("fitting ...")
    net.fit(files, labels)
def main(cnf, weights_from):
    config = util.load_module(cnf).config
    if weights_from is None:
        weights_from = config.weights_file
    else:
        weights_from = str(weights_from)

    files = data.get_image_files(config.get('train_dir'))
    names = data.get_names(files)
    labels = data.get_labels(names).astype(np.float32)
    net = create_net(config)

    try:
        net.load_params_from(weights_from)
        print("loaded weights from {}".format(weights_from))
    except IOError:
        print("couldn't load weights, starting from scratch")

    print("fitting ...")
    net.fit(files, labels)
def build(cnf, weights_from):
    config = util.load_module(cnf).config
    if weights_from is None:
        weights_from = config.weights_file
    else:
        weights_from = str(weights_from)

    files = data.get_image_files(config.get('train_dir'))
    names = data.get_names(files)
    labels = data.get_labels(names).astype(np.float32)
    net = create_net(config)

    try:
        net.load_params_from(weights_from)
        print("loaded weights from {}".format(weights_from))
    except IOError:
        print("couldn't load weights, starting from scratch")

    print("fitting ...")
    # net.fit(files, labels)
    return net, files, names, labels
import argparse

import numpy as np
import tensorflow as tf

from model import ANN
import data

parser = argparse.ArgumentParser(description='Visualize ANN')
parser.add_argument('-d', '--dataset', type=str, default='mnist',
                    choices=data.get_names())
parser.add_argument('--num_iter', type=int, default=5000)
args = parser.parse_args()

dataset = data.init_dataset(name=args.dataset)
model = ANN(dataset.shape)
model.train(dataset.tr_data, dataset.tr_labels, num_iter=args.num_iter)
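# Example invocation of the script above; the file name "visualize.py" is an
# assumption for illustration, the --dataset and --num_iter flags come from
# the argparse definitions in the snippet:
#   python visualize.py --dataset mnist --num_iter 5000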
and the final loss for each minibatch is calculated from this average.
CrossEntropyLoss is used because this is a categorization task, since we're
predicting the next letter of the name given previous letters.
"""

import torch
import torch.nn as nn
import torch.optim as optim

import data
import helpers
from NamesRNN import NamesRNN
from hyperparameters import hps
import random

model = NamesRNN()
names = data.get_names(hps['filename'])
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=hps['learning_rate'])
batch_size = hps['batch_size']

for t in range(hps['epochs']):
    batch = random.sample(names, batch_size)

    # keep losses from each batch to be averaged later
    batch_losses = []

    for name in batch:
        xs, ys = helpers.name_to_xy(name)

        # keep losses from each sequence (name) to be averaged later
        seq_losses = []
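        # --- The original snippet is truncated above. The lines below are a
        # --- minimal sketch, not the original code, of how the remaining step
        # --- might look given the docstring: per-character losses are averaged
        # --- per name, then per minibatch, and one optimizer step is taken.
        # --- The model(x, hidden) call signature is an assumption.
        hidden = None
        for x, y in zip(xs, ys):
            y_pred, hidden = model(x, hidden)        # assumed forward signature
            seq_losses.append(criterion(y_pred, y))  # loss for this character
        # average per-character losses into this name's loss
        batch_losses.append(torch.stack(seq_losses).mean())

    # average per-name losses into the final minibatch loss and step
    loss = torch.stack(batch_losses).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()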
#@click.option('--test_dir', default=None, show_default=True,
#              help="Override directory with test set images.")
cnf = 'configs/c_512_5x5_32.py'
predict = True
per_patient = True
features_file = None
n_iter = 3
blend_cnf = 'blend.yml'
test_dir = None

#def fit(cnf, predict, per_patient, features_file, n_iter, blend_cnf, test_dir):
config = util.load_module(cnf).config
image_files = data.get_image_files(config.get('train_dir'))
names = data.get_names(image_files)
labels = data.get_labels(names).astype(np.float32)[:, np.newaxis]

if features_file is not None:
    runs = {'run': [features_file]}
else:
    runs = data.parse_blend_config(yaml.load(open(blend_cnf)))

scalers = {run: StandardScaler() for run in runs}

tr, te = data.split_indices(image_files, labels)

y_preds = []
for i in range(n_iter):
    print("iteration {} / {}".format(i + 1, n_iter))
    for run, files in runs.items():
def fit(cnf, exp_run_folder, classifier, features_file, n_iter, blend_cnf,
        test_dir, fold):
    config = util.load_module(cnf).config
    config.cnf['fold'] = fold  # <-- used to change the directories for weights_best, weights_epoch and weights_final
    config.cnf['exp_run_folder'] = exp_run_folder

    folds = yaml.load(open('folds/' + data.settings['protocol'] + '.yml'))
    f0, f1 = fold.split('x')
    train_list = folds['Fold_' + f0][int(f1) - 1]
    test_list = folds['Fold_' + f0][0 if f1 == '2' else 1]

    image_files = data.get_image_files(config.get('train_dir'), train_list)
    names = data.get_names(image_files)
    labels = data.get_labels(
        names, label_file='folds/' + data.settings['protocol'] + '.csv'
    ).astype(np.int32)[:, np.newaxis]

    if features_file is not None:
        runs = {'run': [features_file]}
    else:
        runs = {run: [os.path.join(exp_run_folder + '/data/features', f)
                      for f in files]
                for run, files in yaml.load(open(blend_cnf)).items()}

    scalers = {run: StandardScaler() for run in runs}

    y_preds = []
    y_preds_proba = []
    for i in range(n_iter):
        print("iteration {} / {}".format(i + 1, n_iter))
        for run, files in runs.items():
            files = [f.replace('f0xf1.npy', '{}.npy'.format(fold)) for f in files]
            if classifier is None:
                X_test = data.load_features(files, test=True)
                if data.settings['protocol'] != 'protocol3':
                    y_pred_proba = X_test
                    y_proba = []
                    for i in range(0, len(X_test)):
                        y_proba.append(y_pred_proba[i][1])  # using score from the positive
                    y_pred = np.clip(np.round(y_proba), 0, 1).astype(int)
                else:
                    y_pred_proba = est.predict_proba(X)
            else:
                print("fitting features for run {}".format(run))
                X_train = data.load_features(files)
                l2Norm = np.linalg.norm(X_train, axis=1)
                X_train = np.divide(X_train.T, l2Norm).T
                est = estimator(data.settings['protocol'], classifier,
                                X_train.shape[1], image_files, X_train, labels,
                                run, fold, eval_size=0.1)
                open(exp_run_folder + "/best_estimator_fold_{}.txt".format(fold),
                     "w").write(str(est))
                X_test = data.load_features(files, test=True)
                l2Norm = np.linalg.norm(X_test, axis=1)
                X_test = np.divide(X_test.T, l2Norm).T
                if data.settings['protocol'] != 'protocol3':
                    y_pred = est.predict(X_test).ravel()
                    y_pred_proba = est.predict_proba(X_test).ravel()
                    y_proba = []
                    for i in range(0, 2 * len(X_test), 2):
                        y_proba.append(y_pred_proba[i + 1])  # using score from the positive
                else:
                    y_pred_binary = est.predict(X_test)
                    y_pred = preprocessing.LabelBinarizer().fit([0, 1, 2])
                    y_pred = y_pred.inverse_transform(y_pred_binary)
                    y_proba = est.predict_proba(X_test)

    image_files = data.get_image_files(test_dir or config.get('test_dir'),
                                       test_list)
    names = data.get_names(image_files)
    labels = data.get_labels(
        names, label_file='folds/' + data.settings['protocol'] + '.csv'
    ).astype(np.int32)[:, np.newaxis]  # , per_patient=per_patient

    image_column = pd.Series(names, name='image')
    labels_column = pd.Series(np.squeeze(labels), name='true')
    level_column = pd.Series(y_pred, name='pred')
    if data.settings['protocol'] != 'protocol3':
        proba_column = pd.Series(y_proba, name='proba')
        predictions = pd.concat(
            [image_column, labels_column, level_column, proba_column], axis=1)
    else:
        proba_label_0 = pd.Series(y_proba[:, 0], name='proba_label_0')
        proba_label_1 = pd.Series(y_proba[:, 1], name='proba_label_1')
        proba_label_2 = pd.Series(y_proba[:, 2], name='proba_label_2')
        predictions = pd.concat(
            [image_column, labels_column, level_column,
             proba_label_0, proba_label_1, proba_label_2], axis=1)

    predictions.to_csv(exp_run_folder + "/ranked_list_fold_{}.csv".format(fold),
                       sep=';')
    print("tail of predictions")
    print(predictions.tail())

    acc = len([1 for l, y in zip(labels, y_pred) if l == y]) / float(len(labels))
    print("accuracy: {}".format(acc))
    print("confusion matrix")
    print(confusion_matrix(labels, y_pred))

    if data.settings['protocol'] != 'protocol3':
        auc = calc_auc(y_proba, labels, exp_run_folder, classifier, fold)
        print("AUC: {}".format(auc))
        average_precision = average_precision_score(labels, y_proba)
        print("average precision: {}".format(average_precision))
        c_matrix = confusion_matrix(labels, y_pred)
        print("sensitivity: {}".format(
            c_matrix[1][1] / (c_matrix[1][1] + c_matrix[0][1])))
        print("specificity: {}".format(
            c_matrix[0][0] / (c_matrix[0][0] + c_matrix[1][0])))
    else:
        y_test = label_binarize(labels, classes=[0, 1, 2])
        auc = roc_auc_score(y_test, y_proba, average='macro')
        print("AUC: {}".format(auc))
        average_precision = average_precision_score(y_test, y_proba,
                                                    average="macro")
        print("mean average precision: {}".format(average_precision))

    results = pd.concat([pd.Series(exp_run_folder, name='folder'),
                         pd.Series(fold, name='fold'),
                         pd.Series(auc, name='auc'),
                         pd.Series(average_precision, name='ap'),
                         pd.Series(acc, name='acc')], axis=1)
    with open('results.csv', 'a') as f:
        results.to_csv(f, header=False)