def main(cnf, weights_from):
    """Fit the network described by config module ``cnf`` and time the fit.

    :param cnf: config module reference handed to ``util.load_module``.
    :param weights_from: weights file to initialise from; when ``None`` the
        config's ``weights_file`` attribute is used instead.
    """
    config = util.load_module(cnf).config
    weights_from = (config.weights_file if weights_from is None
                    else str(weights_from))

    files = data.get_image_files(config.get('train_dir'))
    labels = data.get_labels(data.get_names(files)).astype(np.float32)

    net = create_net(config)
    try:
        net.load_params_from(weights_from)
        print("loaded weights from {}".format(weights_from))
    except IOError:
        # Unreadable weights are not fatal: training simply starts fresh.
        print("couldn't load weights starting from scratch")

    print("Shape of files: {}".format(files.shape))
    print("Shape of labels: {}".format(labels.shape))

    started = time.time()
    print("fitting ...")
    net.fit(files, labels)
    finished = time.time()
    print("Time elapsed for fitting: {}".format(finished - started))
def main():
    """Print ROC scores for several blends of the stored scores and submit one.

    Loads the per-model score vectors, reports the best single model and the
    score of the mean / weighted / weight-selected blends (with and without
    the models whose name contains ``'raw:'``), then submits the
    weight-selected blend of the non-raw models.

    The output is identical to the former Python 2 ``print x,`` / ``print y``
    pairs, but the code now also runs on Python 3.
    """
    scores = load('scores')
    labels = get_labels()

    # list(...) is required on Python 3, where dict.values() is a view that
    # np.vstack does not accept as a sequence of arrays.
    scores_with_raw = np.vstack(list(scores.values())).T
    scores_without_raw = np.vstack(
        [scores[n] for n in scores if 'raw:' not in n]).T

    def report(value, caption):
        # "value caption" on one line, as `print value,` + `print caption` did.
        print('{} {}'.format(value, caption))

    report('Best Model:',
           max([(calculateRocScore(scores[name]), name) for name in scores]))
    print('')

    report(calculateRocScore(scores_with_raw.mean(axis=1)),
           'Average data with raw')
    report(calculateRocScore(getWeightforLabel(scores_with_raw, labels)),
           'Weighted with raw data')
    report(calculateRocScore(getSelectedWeight(scores_with_raw, labels)),
           'Selected Weight with raw')
    print('')

    report(calculateRocScore(scores_without_raw.mean(axis=1)),
           'Mean without raw data')
    report(calculateRocScore(getWeightforLabel(scores_without_raw, labels)),
           'Weighted data without raw')
    report(calculateRocScore(getSelectedWeight(scores_without_raw, labels)),
           'Selected weight without raw data')
    print('')

    final = getSelectedWeight(scores_without_raw, labels)
    # Only the rows after the first len(labels) are submitted
    # (presumably the unlabeled test rows -- confirm against `load`).
    submit(final[len(labels):])
def main():
    """Print AUC for several blends of the stored scores and submit one.

    Reports the best single model and the AUC of the simple-average,
    weighted and weight-selected blends (with and without the models whose
    name contains ``'raw:'``), then submits the weight-selected blend of the
    non-raw models.

    The previous body mixed Python 2 print statements with half-converted
    ``print (...)`` calls and had an unbalanced parenthesis, so it did not
    parse on either Python version. Every line is now a single ``print(...)``
    call with one string argument, which behaves the same on Python 2 and 3.
    """
    scores = load('scores')
    labels = get_labels()

    # list(...) is required on Python 3, where dict.values() is a view.
    scores_with_raw = np.vstack(list(scores.values())).T
    scores_without_raw = np.vstack(
        [scores[n] for n in scores if 'raw:' not in n]).T

    def report(value, caption):
        # One output line of the form "value caption".
        print('{} {}'.format(value, caption))

    report('Best Model:', max([(auc(scores[name]), name) for name in scores]))
    print('')

    report(auc(scores_with_raw.mean(axis=1)), 'Simple Average')
    report(auc(weighted(scores_with_raw, labels)), 'Weighted')
    report(auc(weight_selected(scores_with_raw, labels)), 'Weight selected')
    print('')

    report(auc(scores_without_raw.mean(axis=1)),
           'Simple Average (without raw)')
    report(auc(weighted(scores_without_raw, labels)),
           'Weighted (without raw)')
    report(auc(weight_selected(scores_without_raw, labels)),
           'Weight selected (without raw)')
    print('')

    final = weight_selected(scores_without_raw, labels)
    # Only the rows after the first len(labels) are submitted.
    submit(final[len(labels):])


if __name__ == "__main__":
    main()
def fit(cnf, predict, per_patient, features_file, n_iter, blend_cnf, test_dir):
    """Fit one estimator per feature run and blend their predictions.

    :param cnf: config module reference handed to ``util.load_module``.
    :param predict: when truthy, predict on the test features and write a
        submission CSV; otherwise report kappa and a confusion matrix on the
        held-out indices after every fit.
    :param per_patient: when truthy, features go through
        ``data.per_patient_reshape`` before fitting/predicting.
    :param features_file: a single features file to use as the only run;
        when ``None`` the runs are read from ``blend_cnf``.
    :param n_iter: number of times every run is refitted.
    :param blend_cnf: path of the YAML blend configuration.
    :param test_dir: overrides the config's ``test_dir`` when truthy.
    """
    config = util.load_module(cnf).config
    image_files = data.get_image_files(config.get('train_dir'))
    names = data.get_names(image_files)
    labels = data.get_labels(names).astype(np.float32)[:, np.newaxis]

    if features_file is not None:
        runs = {'run': [features_file]}
    else:
        # Close the file deterministically and parse with the safe loader:
        # yaml.load() without an explicit Loader is deprecated/rejected by
        # current PyYAML and can construct arbitrary objects.
        with open(blend_cnf) as blend_file:
            runs = data.parse_blend_config(yaml.safe_load(blend_file))

    scalers = {run: StandardScaler() for run in runs}

    # Only the held-out indices are needed here.
    _, te = data.split_indices(image_files, labels)

    y_preds = []
    for i in range(n_iter):
        print("iteration {} / {}".format(i + 1, n_iter))
        for run, files in runs.items():
            print("fitting features for run {}".format(run))
            X = data.load_features(files)
            X = scalers[run].fit_transform(X)
            X = data.per_patient_reshape(X) if per_patient else X
            est = get_estimator(X.shape[1], image_files, labels,
                                eval_size=0.0 if predict else 0.1)
            est.fit(X, labels)
            if not predict:
                y_pred = est.predict(X[te]).ravel()
                y_preds.append(y_pred)
                # Score the running mean of all predictions made so far.
                y_pred = np.mean(y_preds, axis=0)
                y_pred = np.clip(np.round(y_pred).astype(int),
                                 np.min(labels), np.max(labels))
                print("kappa after run {}, iter {}: {}".format(
                    run, i, util.kappa(labels[te], y_pred)))
                print("confusion matrix")
                print(confusion_matrix(labels[te], y_pred))
            else:
                X = data.load_features(files, test=True)
                # Reuse the scaler that was just fitted on the train features.
                X = scalers[run].transform(X)
                X = data.per_patient_reshape(X) if per_patient else X
                y_pred = est.predict(X).ravel()
                y_preds.append(y_pred)

    if predict:
        y_pred = np.mean(y_preds, axis=0)
        y_pred = np.clip(np.round(y_pred),
                         np.min(labels), np.max(labels)).astype(int)

        submission_filename = util.get_submission_filename()
        image_files = data.get_image_files(test_dir or config.get('test_dir'))
        names = data.get_names(image_files)
        image_column = pd.Series(names, name='image')
        level_column = pd.Series(y_pred, name='level')
        predictions = pd.concat([image_column, level_column], axis=1)

        print("tail of predictions file")
        print(predictions.tail())

        predictions.to_csv(submission_filename, index=False)
        print("saved predictions to {}".format(submission_filename))
def main():
    """Print AUC for several blends of the stored scores and submit one.

    Reports the best single model and the AUC of the simple-average,
    weighted and weight-selected blends (with and without the models whose
    name contains ``"raw:"``), then submits the weight-selected blend of the
    non-raw models.

    Output matches the former Python 2 print statements; the body now also
    runs on Python 3.
    """
    scores = load("scores")
    labels = get_labels()

    # list(...) is required on Python 3, where dict.values() is a view.
    scores_with_raw = np.vstack(list(scores.values())).T
    scores_without_raw = np.vstack(
        [scores[n] for n in scores if "raw:" not in n]).T

    def report(value, caption):
        # "value caption" on one line, as `print value,` + `print caption` did.
        print("{} {}".format(value, caption))

    report("Best Model:", max([(auc(scores[name]), name) for name in scores]))
    print("")

    report(auc(scores_with_raw.mean(axis=1)), "Simple Average")
    report(auc(weighted(scores_with_raw, labels)), "Weighted")
    report(auc(weight_selected(scores_with_raw, labels)), "Weight selected")
    print("")

    report(auc(scores_without_raw.mean(axis=1)),
           "Simple Average (without raw)")
    report(auc(weighted(scores_without_raw, labels)),
           "Weighted (without raw)")
    report(auc(weight_selected(scores_without_raw, labels)),
           "Weight selected (without raw)")
    print("")

    final = weight_selected(scores_without_raw, labels)
    # Only the rows after the first len(labels) are submitted.
    submit(final[len(labels):])
def main(cnf, weights_from):
    """Fit the network described by ``cnf``, tracing progress on stdout.

    :param cnf: config module reference handed to ``util.load_module``.
    :param weights_from: weights file to initialise from; when ``None`` the
        config's ``weights_file`` attribute is used instead.
    """
    config = util.load_module(cnf).config
    weights_from = (config.weights_file if weights_from is None
                    else str(weights_from))

    train_dir = config.get('train_dir')
    print(train_dir)
    files = data.get_image_files(train_dir)
    labels = data.get_labels(data.get_names(files)).astype(np.float32)

    print("Checkpoint 5")
    net = create_net(config)
    print("Checkpoint 6")
    print(weights_from)

    try:
        print("Checkpoint 7")
        net.load_params_from(weights_from)
        print("Checkpoint 8")
        print("loaded weights from {}".format(weights_from))
    except IOError:
        # Unreadable weights are not fatal: training simply starts fresh.
        print("couldn't load weights starting from scratch")

    print("fitting ...")
    for item in (files, labels):
        print(item)
    net.fit(files, labels)
def main():
    """Stage the input files, print blend AUCs and submit one blend.

    First moves ``score``, ``trainfile`` and ``testfile`` into the
    ``generated/`` and ``data/`` directories via shell commands and calls
    ``get_test()``. Then reports the best single model and the AUC of the
    simple-average, weighted and weight-selected blends (with and without the
    models whose name contains ``'raw:'``) and submits the weight-selected
    blend of the non-raw models.

    Output matches the former Python 2 print statements; the body now also
    runs on Python 3.
    """
    # Fixed command strings only -- no external input reaches the shell.
    os.system("mkdir generated; mv score generated/score")
    os.system("mkdir data; mv trainfile data/train.tsv; mv testfile data/test.tsv")
    get_test()

    scores = load('score')
    labels = get_labels()

    # list(...) is required on Python 3, where dict.values() is a view.
    scores_with_raw = np.vstack(list(scores.values())).T
    scores_without_raw = np.vstack(
        [scores[n] for n in scores if 'raw:' not in n]).T

    def report(value, caption):
        # "value caption" on one line, as `print value,` + `print caption` did.
        print('{} {}'.format(value, caption))

    report('Best Model:', max([(auc(scores[name]), name) for name in scores]))
    print('')

    report(auc(scores_with_raw.mean(axis=1)), 'Simple Average')
    report(auc(weighted(scores_with_raw, labels)), 'Weighted')
    report(auc(weight_selected(scores_with_raw, labels)), 'Weight selected')
    print('')

    report(auc(scores_without_raw.mean(axis=1)),
           'Simple Average (without raw)')
    report(auc(weighted(scores_without_raw, labels)),
           'Weighted (without raw)')
    report(auc(weight_selected(scores_without_raw, labels)),
           'Weight selected (without raw)')
    print('')

    final = weight_selected(scores_without_raw, labels)
    # Only the rows after the first len(labels) are submitted.
    submit(final[len(labels):])
def handle_image_generation(classifier, feature_set, imagepath, title=''):
    '''
    Train a classifier and return its scores on the train and test split.
    Save a contour image of its predictions if it is only trained on two features.

    :param classifier: A string or object describing a classifier.
    :param feature_set: A list of column names describing the feature set to train the model on.
    :param imagepath: The path to store the contour plot.
    :param title: The title of the plot; may contain ``{train_score}`` and
                  ``{test_score}`` placeholders. If formatting fails the raw title is used.
    :return: The train and test scores for the classifier.
    '''
    train_table, test_table = get_split_table()
    train_labels, test_labels = get_labels(train_table, test_table)

    classifier = fit(classifier, feature_set, train_table)
    train_score = classifier.score(train_table[feature_set], train_labels)
    test_score = classifier.score(test_table[feature_set], test_labels)

    if len(feature_set) == 2:
        fig = plt.figure()
        try:
            ax = visualize_confidence(classifier, train_table, *feature_set)
            plot_with_columns(train_table, *feature_set, ax=ax, marker='+', label='train')
            plot_with_columns(test_table, *feature_set, ax=ax, label='test')
            ax.legend()
            try:
                ax.set_title(
                    title.format(train_score=train_score, test_score=test_score))
            except (KeyError, IndexError, ValueError):
                # str.format raises KeyError/IndexError for unknown
                # placeholders and ValueError for malformed ones; in every
                # case fall back to the unformatted title.
                ax.set_title(title)
            fig.savefig(imagepath)
        finally:
            # pyplot keeps every figure alive until it is closed explicitly;
            # without this, repeated calls accumulate open figures.
            plt.close(fig)

    return train_score, test_score
def fit(cnf, predict, per_patient, features_file, n_iter, blend_cnf, test_dir):
    """Fit one estimator per feature run and blend their predictions.

    :param cnf: config module reference handed to ``util.load_module``.
    :param predict: when truthy, predict on the test features and write a
        submission CSV; otherwise report kappa and a confusion matrix on the
        held-out indices after every fit.
    :param per_patient: when truthy, features go through
        ``data.per_patient_reshape`` before fitting/predicting.
    :param features_file: a single features file to use as the only run;
        when ``None`` the runs are read from ``blend_cnf``.
    :param n_iter: number of times every run is refitted.
    :param blend_cnf: path of the YAML blend configuration.
    :param test_dir: overrides the config's ``test_dir`` when truthy.
    """
    config = util.load_module(cnf).config
    image_files = data.get_image_files(config.get('train_dir'))
    names = data.get_names(image_files)
    labels = data.get_labels(names).astype(np.float32)[:, np.newaxis]

    if features_file is not None:
        runs = {'run': [features_file]}
    else:
        # Close the file deterministically and parse with the safe loader:
        # yaml.load() without an explicit Loader is deprecated/rejected by
        # current PyYAML and can construct arbitrary objects.
        with open(blend_cnf) as blend_file:
            runs = data.parse_blend_config(yaml.safe_load(blend_file))

    scalers = {run: StandardScaler() for run in runs}

    # Only the held-out indices are needed here.
    _, te = data.split_indices(image_files, labels)

    y_preds = []
    for i in range(n_iter):
        print("iteration {} / {}".format(i + 1, n_iter))
        for run, files in list(runs.items()):
            print("fitting features for run {}".format(run))
            X = data.load_features(files)
            X = scalers[run].fit_transform(X)
            X = data.per_patient_reshape(X) if per_patient else X
            est = get_estimator(X.shape[1], image_files, labels,
                                eval_size=0.0 if predict else 0.1)
            est.fit(X, labels)
            if not predict:
                y_pred = est.predict(X[te]).ravel()
                y_preds.append(y_pred)
                # Score the running mean of all predictions made so far.
                y_pred = np.mean(y_preds, axis=0)
                y_pred = np.clip(np.round(y_pred).astype(int),
                                 np.min(labels), np.max(labels))
                print("kappa after run {}, iter {}: {}".format(
                    run, i, util.kappa(labels[te], y_pred)))
                print("confusion matrix")
                print(confusion_matrix(labels[te], y_pred))
            else:
                X = data.load_features(files, test=True)
                # Reuse the scaler that was just fitted on the train features.
                X = scalers[run].transform(X)
                X = data.per_patient_reshape(X) if per_patient else X
                y_pred = est.predict(X).ravel()
                y_preds.append(y_pred)

    if predict:
        y_pred = np.mean(y_preds, axis=0)
        y_pred = np.clip(np.round(y_pred),
                         np.min(labels), np.max(labels)).astype(int)

        submission_filename = util.get_submission_filename()
        image_files = data.get_image_files(test_dir or config.get('test_dir'))
        names = data.get_names(image_files)
        image_column = pd.Series(names, name='image')
        level_column = pd.Series(y_pred, name='level')
        predictions = pd.concat([image_column, level_column], axis=1)

        print("tail of predictions file")
        print(predictions.tail())

        predictions.to_csv(submission_filename, index=False)
        print("saved predictions to {}".format(submission_filename))
def _get_next_minibatch(self):
    """Return the next minibatch as ``{'data': ..., 'labels': ...}``.

    Pulls the next ``(data, labels, _)`` triple from ``self.iterator``. When
    the iterator is exhausted it is rebuilt from the configured data and
    label files via ``self.sampleIter`` and the first item of the new
    iterator is returned.

    Uses the ``next()`` builtin instead of the Python-2-only ``.next()``
    method so the layer works with iterators on Python 2 and 3.
    """
    try:
        dataBlob, labelBlob, _ = next(self.iterator)
    except StopIteration:
        # Exhausted: start another pass over the same files.
        filenames = data.get_sentence(self.config.get('datafile'))
        labels = data.get_labels(self.config.get('labelfile'))
        self.iterator = iter(self.sampleIter(filenames, labels))
        dataBlob, labelBlob, _ = next(self.iterator)
    return {'data': dataBlob, 'labels': labelBlob}
def main(directory, convert_directory, test, crop_size, extension):
    """Convert every jpeg/tiff image under ``directory`` using a process pool.

    :param directory: root directory that is walked for image files.
    :param convert_directory: output directory; created if it is missing.
    :param test: when truthy, first show the converted and the original image
        for every file whose label equals 1, waiting for Enter in between
        (Ctrl-C exits).
    :param crop_size: forwarded to ``convert``.
    :param extension: forwarded to ``process`` as part of each task.

    Each task is ``(convert, (directory, convert_directory, f, crop_size,
    extension), label)`` where ``label`` maps the first to the second column
    of ``trainLabels.csv``.
    """
    try:
        os.mkdir(convert_directory)
    except OSError:
        # Best effort, as before (typically: the directory already exists).
        pass

    filenames = sorted(
        os.path.join(dp, f)
        for dp, dn, fn in os.walk(directory)
        for f in fn
        if f.endswith(('jpeg', 'tiff')))

    if test:
        names = data.get_names(filenames)
        y = data.get_labels(names)

        # `__builtins__` is a module in __main__ but a plain dict in imported
        # modules; vars() on a dict raises TypeError, so handle both.
        builtin_names = (__builtins__ if isinstance(__builtins__, dict)
                         else vars(__builtins__))
        real_raw_input = builtin_names.get('raw_input', input)

        for f, level in zip(filenames, y):
            if level == 1:
                try:
                    img = convert(f, crop_size)
                    img.show()
                    Image.open(f).show()
                    real_raw_input('enter for next')
                except KeyboardInterrupt:
                    # Same effect as exit(0) without relying on the `site`
                    # module having installed that helper.
                    raise SystemExit(0)

    print("Resizing images in {} to {}, this takes a while."
          "".format(directory, convert_directory))

    n = len(filenames)
    # process in batches, sometimes weird things happen with Pool on my machine
    batchsize = 500
    # Ceiling division: `n // batchsize + 1` scheduled an extra, empty batch
    # whenever n was an exact multiple of batchsize (including n == 0).
    batches = (n + batchsize - 1) // batchsize

    # Read the labels before starting the pool so a missing CSV does not leave
    # worker processes behind; `with` closes the file even on a parse error.
    label = {}
    with open('trainLabels.csv') as label_file:
        next(label_file, None)  # skip the header row
        for line in label_file:
            cols = line.rstrip('\n').split(',')
            label[cols[0]] = cols[1]

    args = [
        (convert, (directory, convert_directory, f, crop_size, extension), label)
        for f in filenames
    ]

    pool = Pool(N_PROC)
    try:
        for i in range(batches):
            print("batch {:>2} / {}".format(i + 1, batches))
            pool.map(process, args[i * batchsize: (i + 1) * batchsize])
    finally:
        pool.close()
        pool.join()

    print('done')
def generate_train_test_segments():
    """Split the labels by user into train/test and save both segment files.

    Users 1..30 are shuffled and the first 21 become the training users. An
    entry goes to the training set when element 1 of its key is one of those
    users. The segments of each subset are written to ``train_segments.txt``
    and ``test_segments.txt`` respectively.
    """
    labels_per_file = data.get_labels()

    user_ids = list(range(1, 31))
    random.shuffle(user_ids)
    train_users = user_ids[:21]

    def belongs_to_train(exp_user):
        return exp_user[1] in train_users

    train_labels, test_labels = split_dict(labels_per_file, belongs_to_train)

    outputs = (
        (train_labels, 'train_segments.txt'),
        (test_labels, 'test_segments.txt'),
    )
    for subset, filename in outputs:
        segments = segmenting.segment_activities(subset)
        segmenting.save_segments(segments, filename)
def get_scores(data):
    """Return the stacked predictions of 10 CV folds plus one full-data fit.

    For every ``KFold(len(labels), 10)`` split, ``predict`` is trained on the
    training rows and applied to the held-out rows. A final ``predict`` call
    trains on the first ``len(labels)`` rows and is applied to the remaining
    rows. All results are concatenated with ``np.hstack`` in that order.
    """
    labels = get_labels()
    n_labeled = len(labels)

    collected = [
        predict(data[train_idx], labels[train_idx], data[test_idx])
        for train_idx, test_idx in KFold(n_labeled, 10)
    ]
    collected.append(predict(data[:n_labeled], labels, data[n_labeled:]))
    return np.hstack(collected)
def main(directory, convert_directory, test, crop_size, extension):
    """Convert every jpeg/tiff image under ``directory`` using a process pool.

    :param directory: root directory that is walked for image files.
    :param convert_directory: output directory; created if it is missing.
    :param test: when truthy, first show the converted and the original image
        for every file whose label equals 1, waiting for Enter in between
        (Ctrl-C exits).
    :param crop_size: forwarded to ``convert``.
    :param extension: forwarded to ``process`` as part of each task.

    Each task is ``(convert, (directory, convert_directory, f, crop_size,
    extension))``.
    """
    try:
        os.mkdir(convert_directory)
    except OSError:
        # Best effort, as before (typically: the directory already exists).
        pass

    filenames = sorted(
        os.path.join(dp, f)
        for dp, dn, fn in os.walk(directory)
        for f in fn
        if f.endswith(('jpeg', 'tiff')))

    if test:
        names = data.get_names(filenames)
        y = data.get_labels(names)

        # `__builtins__` is a module in __main__ but a plain dict in imported
        # modules; vars() on a dict raises TypeError, so handle both.
        builtin_names = (__builtins__ if isinstance(__builtins__, dict)
                         else vars(__builtins__))
        real_raw_input = builtin_names.get('raw_input', input)

        for f, level in zip(filenames, y):
            if level == 1:
                try:
                    img = convert(f, crop_size)
                    img.show()
                    Image.open(f).show()
                    real_raw_input('enter for next')
                except KeyboardInterrupt:
                    # Same effect as exit(0) without relying on the `site`
                    # module having installed that helper.
                    raise SystemExit(0)

    print("Resizing images in {} to {}, this takes a while."
          "".format(directory, convert_directory))

    n = len(filenames)
    # process in batches, sometimes weird things happen with Pool on my machine
    batchsize = 500
    # Ceiling division: `n // batchsize + 1` scheduled an extra, empty batch
    # whenever n was an exact multiple of batchsize (including n == 0).
    batches = (n + batchsize - 1) // batchsize

    args = [
        (convert, (directory, convert_directory, f, crop_size, extension))
        for f in filenames
    ]

    pool = Pool(N_PROC)
    try:
        for i in range(batches):
            print("batch {:>2} / {}".format(i + 1, batches))
            pool.map(process, args[i * batchsize: (i + 1) * batchsize])
    finally:
        pool.close()
        pool.join()

    print('done')
def main(cnf, classes, weights_from, predict):
    """Train the network, or load weights and write a predictions CSV.

    :param cnf: config module reference handed to ``util.load_module``.
    :param classes: number of classes; stored as ``data.classes``.
    :param weights_from: weights file used in predict mode; when ``None`` the
        config's ``weights_file`` attribute is used instead.
    :param predict: when truthy, load weights and predict on the config's
        ``test_dir``; otherwise fit on ``train_dir``.

    All output now goes through ``print(...)`` calls with a single argument,
    which print the same text on Python 2 and no longer fail to parse on
    Python 3.
    """
    config = util.load_module(cnf).config
    files = data.get_image_files(config.get('train_dir'))
    names = data.get_names(files)
    names = [int(x) for x in names]
    data.classes = int(classes)
    labels = data.get_labels(names)
    net = create_net(config)

    print(files.shape)
    print(labels.shape)

    if predict:
        if weights_from is None:
            weights_from = config.weights_file
        else:
            weights_from = str(weights_from)
        print(weights_from)
        try:
            net.load_params_from(weights_from)
            print("loaded weights from {}".format(weights_from))
        except IOError:
            print("couldn't load weights starting from scratch")

    if not predict:
        print("fitting ...")
        net.fit(files, labels)
    else:
        print("predicting ...")
        test_files = data.get_image_files(config.get('test_dir'))
        y_pred = net.predict(test_files)
        y_pred = y_pred.transpose()
        print(y_pred)
        # Round to the nearest integer and keep it inside the label range
        # seen in the training labels.
        y_pred = np.clip(np.round(y_pred),
                         np.min(labels), np.max(labels)).astype(int)

        submission_filename = util.get_submission_filename()
        image_files = data.get_image_files(config.get('test_dir'))
        names = data.get_names(image_files)
        image_column = pd.Series(names, name='photo_id')
        level_column = pd.DataFrame(y_pred)
        level_column = level_column.apply(lambda x: string_submit(x))
        predictions = pd.concat([image_column, level_column], axis=1)

        print("tail of predictions file")
        print(predictions.tail())

        predictions.columns = ['photo_id', 'labels']
        predictions.to_csv(submission_filename, index=False)
        print("saved predictions to {}".format(submission_filename))
def main(cnf, classes, weights_from, predict):
    """Train the network, or load weights and write a predictions CSV.

    :param cnf: config module reference handed to ``util.load_module``.
    :param classes: number of classes; stored as ``data.classes``.
    :param weights_from: weights file used in predict mode; when ``None`` the
        config's ``weights_file`` attribute is used instead.
    :param predict: when truthy, load weights and predict on the config's
        ``test_dir``; otherwise fit on ``train_dir``.

    The bare Python 2 print statements were replaced by single-argument
    ``print(...)`` calls: identical output on Python 2, valid syntax on
    Python 3, and consistent with the other prints in this function.
    """
    config = util.load_module(cnf).config
    files = data.get_image_files(config.get('train_dir'))
    names = [int(x) for x in data.get_names(files)]
    data.classes = int(classes)
    labels = data.get_labels(names)
    net = create_net(config)

    print(files.shape)
    print(labels.shape)

    if predict:
        if weights_from is None:
            weights_from = config.weights_file
        else:
            weights_from = str(weights_from)
        print(weights_from)
        try:
            net.load_params_from(weights_from)
            print("loaded weights from {}".format(weights_from))
        except IOError:
            print("couldn't load weights starting from scratch")

    if not predict:
        print("fitting ...")
        net.fit(files, labels)
    else:
        print("predicting ...")
        test_files = data.get_image_files(config.get('test_dir'))
        y_pred = net.predict(test_files)
        y_pred = y_pred.transpose()
        print(y_pred)
        # Round to the nearest integer and keep it inside the label range
        # seen in the training labels.
        y_pred = np.clip(np.round(y_pred),
                         np.min(labels), np.max(labels)).astype(int)

        submission_filename = util.get_submission_filename()
        image_files = data.get_image_files(config.get('test_dir'))
        names = data.get_names(image_files)
        image_column = pd.Series(names, name='photo_id')
        level_column = pd.DataFrame(y_pred)
        level_column = level_column.apply(lambda x: string_submit(x))
        predictions = pd.concat([image_column, level_column], axis=1)

        print("tail of predictions file")
        print(predictions.tail())

        predictions.columns = ['photo_id', 'labels']
        predictions.to_csv(submission_filename, index=False)
        print("saved predictions to {}".format(submission_filename))
def setup(self, bottom, top):
    """Setup the ResamplerDataLayer.

    Loads the config module named by ``self.param_str``, builds the shared
    sample iterator over the configured data and label files, and reshapes
    the two top blobs to the configured batch size.
    """
    # The layer parameter string names the config module to load.
    cfg = util.load_module(self.param_str).config
    self.config = cfg

    filenames = data.get_sentence(cfg.get('datafile'))
    labels = data.get_labels(cfg.get('labelfile'))

    self.sampleIter = iterator.SharedIterator(
        cfg, deterministic=True, batch_size=cfg.get('batch_size'))
    self.iterator = iter(self.sampleIter(filenames, labels))

    self._name_to_top_map = {'data': 0, 'labels': 1}

    data_top, label_top = top[0], top[1]
    data_top.reshape(cfg.get('batch_size'), 3, cfg.get('h'), cfg.get('w'))
    label_top.reshape(cfg.get('batch_size'))
def main(cnf, weights_from, fold, exp_run_folder, train_retina):
    """Train the network on one fold (or on the whole training directory).

    :param cnf: config module reference handed to ``util.load_module``.
    :param weights_from: weights file to initialise from; when ``None`` the
        config's ``weights_file`` attribute is used instead.
    :param fold: fold id of the form ``'<a>x<b>'``; stored in the config and,
        unless ``train_retina == 'train_retina'``, used to select the
        training list from ``folds/<protocol>.yml``.
    :param exp_run_folder: experiment folder; stored in the config.
    :param train_retina: the literal ``'train_retina'`` trains on every image
        in ``train_dir`` instead of a fold's list.
    """
    config = util.load_module(cnf).config
    # used to change the directories for weights_best, weights_epoch and
    # weights_final
    config.cnf['fold'] = fold
    config.cnf['exp_run_folder'] = exp_run_folder

    protocol = data.settings['protocol']

    if train_retina != 'train_retina':
        # Close the file deterministically and parse with the safe loader:
        # yaml.load() without an explicit Loader is deprecated/rejected by
        # current PyYAML and can construct arbitrary objects.
        with open('folds/' + protocol + '.yml') as folds_file:
            folds = yaml.safe_load(folds_file)
        f0, f1 = fold.split('x')
        train_list = folds['Fold_' + f0][int(f1) - 1]
        files = data.get_image_files(config.get('train_dir'), train_list)
    else:
        files = data.get_image_files(config.get('train_dir'))

    if weights_from is None:
        weights_from = config.weights_file
    else:
        weights_from = str(weights_from)

    names = data.get_names(files)
    labels = data.get_labels(
        names, label_file='folds/' + protocol + '.csv').astype(np.int32)

    net = nn.create_net(config)

    try:
        net.load_params_from(weights_from)
        print("loaded weights from {}".format(weights_from))
    except IOError:
        print("couldn't load weights, starting from scratch")

    # Print layer info
    print("## Layer information")
    # Import the submodule explicitly: a bare `import nolearn` only exposes
    # `nolearn.lasagne` if something else happened to import it already.
    import nolearn.lasagne
    layer_info = nolearn.lasagne.PrintLayerInfo()
    print(layer_info._get_greeting(net))
    layer_info, legend = layer_info._get_layer_info_conv(net)
    print(layer_info)
    print(legend)

    print("fitting ...")
    net.fit(files, labels)
def main(cnf, weights_from):
    """Fit the network described by config module ``cnf``.

    :param cnf: config module reference handed to ``util.load_module``.
    :param weights_from: weights file to initialise from; when ``None`` the
        config's ``weights_file`` attribute is used instead.
    """
    config = util.load_module(cnf).config

    weights_path = weights_from
    if weights_path is not None:
        weights_path = str(weights_path)
    else:
        weights_path = config.weights_file

    image_files = data.get_image_files(config.get('train_dir'))
    image_names = data.get_names(image_files)
    targets = data.get_labels(image_names).astype(np.float32)

    net = create_net(config)
    try:
        net.load_params_from(weights_path)
        print("loaded weights from {}".format(weights_path))
    except IOError:
        # Unreadable weights are not fatal: training simply starts fresh.
        print("couldn't load weights starting from scratch")

    print("fitting ...")
    net.fit(image_files, targets)
def build(cnf, weights_from):
    """Create the network and load its training data without fitting it.

    :param cnf: config module reference handed to ``util.load_module``.
    :param weights_from: weights file to initialise from; when ``None`` the
        config's ``weights_file`` attribute is used instead.
    :return: ``(net, files, names, labels)``.
    """
    config = util.load_module(cnf).config

    weights_path = (config.weights_file if weights_from is None
                    else str(weights_from))

    files = data.get_image_files(config.get('train_dir'))
    names = data.get_names(files)
    labels = data.get_labels(names).astype(np.float32)

    net = create_net(config)
    try:
        net.load_params_from(weights_path)
        print("loaded weights from {}".format(weights_path))
    except IOError:
        # Unreadable weights are not fatal: the net keeps its initial state.
        print("couldn't load weights starting from scratch")

    # The message is still printed although the fit itself is left to the
    # caller, who receives everything needed to run it.
    print("fitting ...")
    return net, files, names, labels
# help="Override directory with test set images.") cnf = 'configs/c_512_5x5_32.py' predict = True per_patient = True features_file = None n_iter =3 blend_cnf = 'blend.yml' test_dir = None #def fit(cnf, predict, per_patient, features_file, n_iter, blend_cnf, test_dir): config = util.load_module(cnf).config image_files = data.get_image_files(config.get('train_dir')) names = data.get_names(image_files) labels = data.get_labels(names).astype(np.float32)[:, np.newaxis] if features_file is not None: runs = {'run': [features_file]} else: runs = data.parse_blend_config(yaml.load(open(blend_cnf))) scalers = {run: StandardScaler() for run in runs} tr, te = data.split_indices(image_files, labels) y_preds = [] for i in range(n_iter): print("iteration {} / {}".format(i + 1, n_iter)) for run, files in runs.items(): print("fitting features for run {}".format(run))
# 3. Test data test_tokens = test['tokens'] test_counts = test['counts'] args.num_docs_test = len(test_tokens) test_1_tokens = test['tokens_1'] test_1_counts = test['counts_1'] args.num_docs_test_1 = len(test_1_tokens) test_2_tokens = test['tokens_2'] test_2_counts = test['counts_2'] args.num_docs_test_2 = len(test_2_tokens) # 4. Labels can_classify = True if can_classify: _, labels_ts, _ = data.get_labels(args.data_path) _, embed_ts, _ = data.get_embeddings(args.data_path) embeddings = None if not args.train_embeddings: emb_path = args.emb_path vect_path = os.path.join(args.data_path.split('/')[0], 'embeddings.pkl') vectors = {} with open(emb_path, 'rb') as f: for l in f: line = l.decode().split() word = line[0] if word in vocab: vect = np.array(line[1:]).astype(np.float) vectors[word] = vect embeddings = np.zeros((vocab_size, args.emb_size))
# warnings.filterwarnings("ignore") RUN_CNN = False ROOT_PATH = 'input/synimg/' # relative to project folder if __name__ == '__main__': # Read input MAX_PER_CLASS = 500 MAX_TEST_ROWS = None train_data, train_images = read_data_from_file('synimg/train/data.csv', max_per_class=MAX_PER_CLASS) test_data, test_images = read_data_from_file( 'synimg/test/data_nostyle.csv', nrows=MAX_TEST_ROWS) label_encoder, train_data = get_labels( train_data, print_classes=False) # one-hot encode, returns in column 'style_id' if RUN_CNN: test_data = run_CNN_model(train_data, train_images, test_data, test_images) # skip over feature processing write_output(test_data, label_encoder) exit(0) # Preprocess features X_train, X_test = extract_features(train_images, cachefile="cache/train_{}".format(MAX_PER_CLASS)),\ extract_features(test_images, cachefile="cache/test".format(MAX_TEST_ROWS)) # X_train, X_test = rescale(X_train), rescale(X_test) y_train = list(train_data['style_id']) # Train / select model model = model_selection(
def auc(s):
    """Return the ROC AUC of ``s`` against the labels, rounded to 5 places.

    Only the first ``len(labels)`` entries of ``s`` are scored.
    """
    truth = get_labels()
    labeled_part = s[:len(truth)]
    return round(roc_auc_score(truth, labeled_part), 5)
acc_file, gyro_file = data.get_raw_acc_gyro(expr_id, user_id) acc_values = data.format_raw_data(acc_file) gyro_values = data.format_raw_data(gyro_file) for segment, label in segments: acc_seg = acc_values[segment[0]: segment[1]] gyro_seg = gyro_values[segment[0]: segment[1]] segmented_data.append(( { "acc": acc_seg, "gyro": gyro_seg }, label)) return np.array(segmented_data) def save_segments(segments, filename): with open(filename, 'wb') as output_file: pickle.dump(segments, output_file) def load_segments(filename): with open(filename, 'rb') as input_file: return pickle.load(input_file) if __name__ == '__main__': print(len(segment_activities(data.get_labels())))
def calculateRocScore(s):
    """Return the ROC AUC of ``s`` against the labels, rounded to 5 places.

    Only the first ``len(labels)`` entries of ``s`` are scored.
    """
    truth = get_labels()
    n_labeled = len(truth)
    score = roc_auc_score(truth, s[:n_labeled])
    return round(score, 5)
g, features, target_id_to_node, id_to_node = construct_graph(get_files(args.edges, args.training_dir), get_files(args.nodes, args.training_dir), args.target_ntype) mean, stdev, features = normalize(th.from_numpy(features)) print('feature mean shape:{}, std shape:{}'.format(mean.shape, stdev.shape)) g.nodes['target'].data['features'] = features print("Getting labels") n_nodes = g.number_of_nodes('target') labels, _, test_mask = get_labels(target_id_to_node, n_nodes, args.target_ntype, get_files(args.labels, args.training_dir), get_files(args.new_accounts, args.training_dir)) print("Got labels") labels = th.from_numpy(labels).float() test_mask = th.from_numpy(test_mask).float() n_nodes = th.sum(th.tensor([g.number_of_nodes(n_type) for n_type in g.ntypes])) n_edges = th.sum(th.tensor([g.number_of_edges(e_type) for e_type in g.etypes])) print("""----Data statistics------' #Nodes: {} #Edges: {} #Features Shape: {} #Labeled Test samples: {}""".format(n_nodes,
def fit(cnf, exp_run_folder, classifier, features_file, n_iter, blend_cnf,
        test_dir, fold):
    """Fit (or reuse) a classifier on one fold and report test metrics.

    For every iteration and every feature run, either the test features are
    taken directly as scores (``classifier is None``) or an estimator is
    built on the L2-normalised training features and applied to the
    L2-normalised test features. Afterwards a ranked list is written to
    ``<exp_run_folder>/ranked_list_fold_<fold>.csv``, accuracy, confusion
    matrix, AUC and average precision are printed, and one row is appended
    to ``results.csv``.

    NOTE(review): this body is Python 2 only (tuple-parameter lambda and
    ``len(filter(...))`` below).
    NOTE(review): the source was collapsed onto a few lines; the nesting
    shown here (reporting code after both loops) is a reconstruction --
    confirm against the original file.

    :param cnf: config module reference handed to ``util.load_module``.
    :param exp_run_folder: experiment folder used for inputs and outputs.
    :param classifier: classifier spec passed to ``estimator``; ``None``
        means "use the test features themselves as scores".
    :param features_file: single features file to use as the only run;
        when ``None`` the runs are read from ``blend_cnf``.
    :param n_iter: number of iterations over all runs.
    :param blend_cnf: path of the YAML file mapping run -> feature files.
    :param test_dir: overrides the config's ``test_dir`` when truthy.
    :param fold: fold id of the form ``'<a>x<b>'``.
    """
    config = util.load_module(cnf).config
    # used to change the directories for weights_best, weights_epoch and
    # weights_final
    config.cnf[
        'fold'] = fold
    config.cnf['exp_run_folder'] = exp_run_folder

    # NOTE(review): yaml.load without a Loader and without closing the file;
    # consider `with open(...)` + yaml.safe_load.
    folds = yaml.load(open('folds/' + data.settings['protocol'] + '.yml'))
    f0, f1 = fold.split('x')
    train_list = folds['Fold_' + f0][int(f1) - 1]
    # The test list is the other entry of the same fold pair.
    test_list = folds['Fold_' + f0][0 if f1 == '2' else 1]

    image_files = data.get_image_files(config.get('train_dir'), train_list)
    names = data.get_names(image_files)
    labels = data.get_labels(names,
                             label_file='folds/' + data.settings['protocol'] +
                             '.csv').astype(np.int32)[:, np.newaxis]

    if features_file is not None:
        runs = {'run': [features_file]}
    else:
        runs = {
            run: [
                os.path.join(exp_run_folder + '/data/features', f)
                for f in files
            ]
            for run, files in yaml.load(open(blend_cnf)).items()
        }

    # NOTE(review): scalers, y_preds and y_preds_proba are never used below.
    scalers = {run: StandardScaler() for run in runs}

    y_preds = []
    y_preds_proba = []
    for i in range(n_iter):
        print("iteration {} / {}".format(i + 1, n_iter))
        for run, files in runs.items():
            # Substitute the fold id into the feature file names.
            files = [
                f.replace('f0xf1.npy', '{}.npy'.format(fold)) for f in files
            ]

            if classifier is None:
                X_test = data.load_features(files, test=True)
                if data.settings['protocol'] != 'protocol3':
                    y_pred_proba = X_test
                    y_proba = []
                    # NOTE(review): this inner loop rebinds the outer `i`.
                    for i in range(0, len(X_test)):
                        y_proba.append(
                            y_pred_proba[i][1])  # using score from the positive
                    y_pred = np.clip(np.round(y_proba), 0, 1).astype(int)
                else:
                    # NOTE(review): `est` and `X` are not defined on this
                    # path unless an earlier loop pass set them, and
                    # y_pred / y_proba are not assigned here -- looks broken.
                    y_pred_proba = est.predict_proba(X)

            else:
                print("fitting features for run {}".format(run))
                X_train = data.load_features(files)
                # Scale every row to unit L2 norm.
                l2Norm = np.linalg.norm(X_train, axis=1)
                X_train = np.divide(X_train.T, l2Norm).T
                est = estimator(data.settings['protocol'],
                                classifier,
                                X_train.shape[1],
                                image_files,
                                X_train,
                                labels,
                                run,
                                fold,
                                eval_size=0.1)
                # NOTE(review): file handle is not closed explicitly.
                open(
                    exp_run_folder +
                    "/best_estimator_fold_{}.txt".format(fold),
                    "w").write(str(est))
                X_test = data.load_features(files, test=True)
                l2Norm = np.linalg.norm(X_test, axis=1)
                X_test = np.divide(X_test.T, l2Norm).T

                if data.settings['protocol'] != 'protocol3':
                    y_pred = est.predict(X_test).ravel()
                    y_pred_proba = est.predict_proba(X_test).ravel()
                    y_proba = []
                    # Every second value of the flattened probabilities,
                    # starting at index 1.
                    for i in range(0, 2 * len(X_test), 2):
                        y_proba.append(
                            y_pred_proba[i + 1])  # using score from the positive
                else:
                    y_pred_binary = est.predict(X_test)
                    # Map the binarised predictions back to labels 0/1/2.
                    y_pred = preprocessing.LabelBinarizer().fit([0, 1, 2])
                    y_pred = y_pred.inverse_transform(y_pred_binary)
                    y_proba = est.predict_proba(X_test)

    # From here on image_files / names / labels refer to the TEST list.
    image_files = data.get_image_files(test_dir or config.get('test_dir'),
                                       test_list)
    names = data.get_names(image_files)
    labels = data.get_labels(
        names, label_file='folds/' + data.settings['protocol'] +
        '.csv').astype(np.int32)[:, np.newaxis]

    image_column = pd.Series(names, name='image')
    labels_column = pd.Series(np.squeeze(labels), name='true')
    level_column = pd.Series(y_pred, name='pred')
    if data.settings['protocol'] != 'protocol3':
        proba_column = pd.Series(y_proba, name='proba')
        predictions = pd.concat(
            [image_column, labels_column, level_column, proba_column],
            axis=1)
    else:
        proba_label_0 = pd.Series(y_proba[:, 0], name='proba_label_0')
        proba_label_1 = pd.Series(y_proba[:, 1], name='proba_label_1')
        proba_label_2 = pd.Series(y_proba[:, 2], name='proba_label_2')
        predictions = pd.concat([
            image_column, labels_column, level_column, proba_label_0,
            proba_label_1, proba_label_2
        ],
                                axis=1)

    predictions.to_csv(exp_run_folder +
                       "/ranked_list_fold_{}.csv".format(fold),
                       sep=';')

    print("tail of predictions")
    print(predictions.tail())

    # Fraction of test rows whose prediction equals the true label.
    acc = len(filter(lambda (l, y): l == y, zip(labels,
                                                 y_pred))) / float(len(labels))
    print("accuracy: {}".format(acc))

    print("confusion matrix")
    print(confusion_matrix(labels, y_pred))

    if data.settings['protocol'] != 'protocol3':
        auc = calc_auc(y_proba, labels, exp_run_folder, classifier, fold)
        print("AUC: {}".format(auc))
        average_precision = average_precision_score(labels, y_proba)
        print("average precision: {}".format(average_precision))

        c_matrix = confusion_matrix(labels, y_pred)
        # NOTE(review): if confusion_matrix is scikit-learn's (rows = true
        # labels), these two ratios divide by column sums, i.e. they are
        # predictive values rather than sensitivity/specificity -- confirm.
        # Also an integer division on Python 2 unless true division is
        # enabled for this module.
        print("sensitivity: {}".format(c_matrix[1][1] /
                                       (c_matrix[1][1] + c_matrix[0][1])))
        print("specificity: {}".format(c_matrix[0][0] /
                                       (c_matrix[0][0] + c_matrix[1][0])))
    else:
        y_test = label_binarize(labels, classes=[0, 1, 2])
        auc = roc_auc_score(y_test, y_proba, average='macro')
        print("AUC: {}".format(auc))
        average_precision = average_precision_score(y_test,
                                                    y_proba,
                                                    average="macro")
        print("mean average precision: {}".format(average_precision))

    results = pd.concat([
        pd.Series(exp_run_folder, name='folder'),
        pd.Series(fold, name='fold'),
        pd.Series(auc, name='auc'),
        pd.Series(average_precision, name='ap'),
        pd.Series(acc, name='acc')
    ],
                        axis=1)
    # Appended without a header so repeated runs accumulate rows.
    with open('results.csv', 'a') as f:
        results.to_csv(f, header=False)
def setUpClass(self):
    """Load and prepare the training data once for the whole TestCase."""
    loaded = read_data_from_file('synimg/train/data.csv',
                                 max_per_class=MAX_PER_CLASS)
    self.train_data, self.train_images = loaded

    # one-hot encode, returns in column 'style_id'
    encoded = get_labels(self.train_data, print_classes=False)
    self.label_encoder, self.train_data = encoded

    self.X_train = extract_features(self.train_images)
    self.y_train = list(self.train_data['style_id'])
#Training samples is an array of samples taken from LCS Spring Split 2019 used to train the model #Testing samples is an array of samples taken from LCS Spring Playoffs 2020 used to test the model's accuracy #indexes of features params = [7, 8, 11, 16] #training samples and class size array inputs, sample_sizes = data.get_feature_vec(params) #scaling training samples inputs = data.scaled_feature_vec(inputs) #array of labels corresponding to training data #0: top #1: jg #2: mid #3: adc #4: sup type_label = data.get_labels(sample_sizes) #Fit SVM Model model = svm.LinearSVC(max_iter=10000) #model = svm.SVC(gamma='scale', C=1, kernel='rbf') model.fit(inputs, type_label) #takes in an int corresponding to position #prints position def classify(num): if num == 0: print('Top') elif num == 1: print('Jungle') elif num == 2:
def fit(classifier, feature_set, train_table):
    '''
    Train a classifier on the specified feature set.

    The target is the ``diabetes`` column encoded as 1 for ``'pos'`` and 0 for
    anything else. It is passed to ``classifier.fit`` as a one-dimensional
    Series (previously a one-column DataFrame, which is a column vector and
    not the 1-D target estimators expect).

    :param classifier: String or object describing a classifier.
    :param feature_set: The column names of the following table used for prediction.
    :param train_table: The dataframe used for training the model. It is not modified.
    :return: The trained classifier.
    :raises ValueError: If ``classifier`` is a string that is not a key of ``classifier_map``.
    '''
    if isinstance(classifier, str):
        if classifier not in classifier_map:
            raise ValueError(f'No classifier with name \'{classifier}\'')
        classifier = classifier_map[classifier]

    train_data = train_table[feature_set].copy()
    # apply() returns a new Series, so train_table itself stays untouched.
    train_labels = train_table['diabetes'].apply(
        lambda val: 1 if val == 'pos' else 0)

    classifier.fit(train_data, train_labels)
    return classifier


if __name__ == '__main__':
    from data import get_split_table, get_numerical_columns, get_labels
    import random

    train_table, test_table = get_split_table()
    train_labels, test_labels = get_labels(train_table, test_table)
    feature_set = get_numerical_columns(train_table)

    # Smoke test: train a randomly chosen registered classifier.
    classifier = fit(random.choice(list(classifier_map.keys())), feature_set, train_table)
    train_score = classifier.score(train_table[feature_set], train_labels)
    test_score = classifier.score(test_table[feature_set], test_labels)
    print(f'train_score {train_score} test_score {test_score}')
g, features, target_id_to_node, id_to_node = construct_graph( args.training_dir, args.edges, args.nodes, args.target_ntype) mean, stdev, features = normalize(th.from_numpy(features)) print('feature mean shape:{}, std shape:{}'.format(mean.shape, stdev.shape)) g.nodes['target'].data['features'] = features print("Getting labels") n_nodes = g.number_of_nodes('target') labels, _, test_mask = get_labels( target_id_to_node, n_nodes, args.target_ntype, os.path.join(args.training_dir, args.labels), os.path.join(args.training_dir, args.new_accounts)) print("Got labels") labels = th.from_numpy(labels).float() test_mask = th.from_numpy(test_mask).float() n_nodes = th.sum( th.tensor([g.number_of_nodes(n_type) for n_type in g.ntypes])) n_edges = th.sum( th.tensor([g.number_of_edges(e_type) for e_type in g.etypes])) print("""----Data statistics------' #Nodes: {} #Edges: {} #Features Shape: {}