Example #1
def main(cnf, weights_from):

    config = util.load_module(cnf).config

    if weights_from is None:
        weights_from = config.weights_file
    else:
        weights_from = str(weights_from)

    files = data.get_image_files(config.get('train_dir'))
    names = data.get_names(files)
    labels = data.get_labels(names).astype(np.float32)

    net = create_net(config)

    try:
        net.load_params_from(weights_from)
        print("loaded weights from {}".format(weights_from))
    except IOError:
        print("couldn't load weights starting from scratch")
    print("Shape of files: " + str(files.shape))
    print("Shape of labels: " + str(labels.shape))
    start = time.time()
    print("fitting ...")
    net.fit(files, labels)
    end = time.time()
    print("Time elapsed for fitting: " + str(end - start))
Example #2
def main():
    scores = load('scores')
    labels = get_labels()

    scores_with_raw = np.vstack(list(scores.values())).T
    scores_without_raw = np.vstack([scores[n] for n in scores if 'raw:' not in n]).T

    print('Best Model:', max([(calculateRocScore(scores[name]), name) for name in scores]))
    print()
    print(calculateRocScore(scores_with_raw.mean(axis=1)), 'Average data with raw')
    print(calculateRocScore(getWeightforLabel(scores_with_raw, labels)), 'Weighted with raw data')
    print(calculateRocScore(getSelectedWeight(scores_with_raw, labels)), 'Selected Weight with raw')
    print()
    print(calculateRocScore(scores_without_raw.mean(axis=1)), 'Mean without raw data')
    print(calculateRocScore(getWeightforLabel(scores_without_raw, labels)), 'Weighted data without raw')
    print(calculateRocScore(getSelectedWeight(scores_without_raw, labels)), 'Selected weight without raw data')
    print()

    final = getSelectedWeight(scores_without_raw, labels)
    submit(final[len(labels):])
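
getWeightforLabel and getSelectedWeight are not shown in this excerpt. As a purely illustrative sketch (not the original implementation), one plausible scheme weights each model's score column by its individual ROC AUC on the labeled rows:

from sklearn.metrics import roc_auc_score
import numpy as np

def auc_weighted_blend(score_matrix, labels):
    # Illustrative only: score_matrix has one column per model (train rows
    # first, test rows after). Weight each column by its ROC AUC on the
    # labeled prefix, then take the weighted average.
    aucs = np.array([roc_auc_score(labels, col[:len(labels)])
                     for col in score_matrix.T])
    weights = aucs / aucs.sum()
    return score_matrix @ weights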
Example #3
def main():
    scores = load('scores')
    labels = get_labels()

    scores_with_raw = np.vstack(list(scores.values())).T
    scores_without_raw = np.vstack([scores[n] for n in scores if 'raw:' not in n]).T

    print('Best Model:', max([(auc(scores[name]), name) for name in scores]))
    print()
    print(auc(scores_with_raw.mean(axis=1)), 'Simple Average')
    print(auc(weighted(scores_with_raw, labels)), 'Weighted')
    print(auc(weight_selected(scores_with_raw, labels)), 'Weight selected')
    print()
    print(auc(scores_without_raw.mean(axis=1)), 'Simple Average (without raw)')
    print(auc(weighted(scores_without_raw, labels)), 'Weighted (without raw)')
    print(auc(weight_selected(scores_without_raw, labels)), 'Weight selected (without raw)')
    print()

    final = weight_selected(scores_without_raw, labels)
    submit(final[len(labels):])


if __name__ == "__main__":
    main()
Example #4
def fit(cnf, predict, per_patient, features_file, n_iter, blend_cnf, test_dir):

    config = util.load_module(cnf).config
    image_files = data.get_image_files(config.get('train_dir'))
    names = data.get_names(image_files)
    labels = data.get_labels(names).astype(np.float32)[:, np.newaxis]

    if features_file is not None:
        runs = {'run': [features_file]}
    else:
        runs = data.parse_blend_config(yaml.load(open(blend_cnf)))

    scalers = {run: StandardScaler() for run in runs}

    tr, te = data.split_indices(image_files, labels)

    y_preds = []
    for i in range(n_iter):
        print("iteration {} / {}".format(i + 1, n_iter))
        for run, files in runs.items():
            print("fitting features for run {}".format(run))
            X = data.load_features(files)
            X = scalers[run].fit_transform(X)
            X = data.per_patient_reshape(X) if per_patient else X
            est = get_estimator(X.shape[1], image_files, labels,
                                eval_size=0.0 if predict else 0.1)
            est.fit(X, labels)
            if not predict:
                y_pred = est.predict(X[te]).ravel()
                y_preds.append(y_pred)
                y_pred = np.mean(y_preds, axis=0)
                y_pred = np.clip(np.round(y_pred).astype(int),
                                 np.min(labels), np.max(labels))
                print("kappa after run {}, iter {}: {}".format(
                    run, i, util.kappa(labels[te], y_pred)))
                print("confusion matrix")
                print(confusion_matrix(labels[te], y_pred))
            else:
                X = data.load_features(files, test=True)
                X = scalers[run].transform(X)
                X = data.per_patient_reshape(X) if per_patient else X
                y_pred = est.predict(X).ravel()
                y_preds.append(y_pred)

    if predict:
        y_pred = np.mean(y_preds, axis=0)
        y_pred = np.clip(np.round(y_pred),
                         np.min(labels), np.max(labels)).astype(int)
        submission_filename = util.get_submission_filename()
        image_files = data.get_image_files(test_dir or config.get('test_dir'))
        names = data.get_names(image_files)
        image_column = pd.Series(names, name='image')
        level_column = pd.Series(y_pred, name='level')
        predictions = pd.concat([image_column, level_column], axis=1)

        print("tail of predictions file")
        print(predictions.tail())

        predictions.to_csv(submission_filename, index=False)
        print("saved predictions to {}".format(submission_filename))
Example #5
def main():
    scores = load("scores")
    labels = get_labels()

    scores_with_raw = np.vstack(list(scores.values())).T
    scores_without_raw = np.vstack([scores[n] for n in scores if "raw:" not in n]).T

    print("Best Model:", max([(auc(scores[name]), name) for name in scores]))
    print()
    print(auc(scores_with_raw.mean(axis=1)), "Simple Average")
    print(auc(weighted(scores_with_raw, labels)), "Weighted")
    print(auc(weight_selected(scores_with_raw, labels)), "Weight selected")
    print()
    print(auc(scores_without_raw.mean(axis=1)), "Simple Average (without raw)")
    print(auc(weighted(scores_without_raw, labels)), "Weighted (without raw)")
    print(auc(weight_selected(scores_without_raw, labels)), "Weight selected (without raw)")
    print()

    final = weight_selected(scores_without_raw, labels)
    submit(final[len(labels):])
Example #6
def main(cnf, weights_from):

    config = util.load_module(cnf).config
    # print(config)
    if weights_from is None:
        weights_from = config.weights_file
    else:
        weights_from = str(weights_from)
    print(config.get('train_dir'))
    files = data.get_image_files(config.get('train_dir'))
    names = data.get_names(files)
    labels = data.get_labels(names).astype(np.float32)
    print("Checkpoint 5")
    net = create_net(config)
    print("Checkpoint 6")
    print(weights_from)
    # print(net.load_params_from())
    try:
        print("Checkpoint 7")
        net.load_params_from(weights_from)
        print("Checkpoint 8")
        print("loaded weights from {}".format(weights_from))
    except IOError:
        print("couldn't load weights starting from scratch")

    print("fitting ...")
    print(files)
    print(labels)
    net.fit(files, labels)
Example #7
def main():

    os.system("mkdir generated; mv score generated/score")
    os.system("mkdir data; mv trainfile data/train.tsv; mv testfile data/test.tsv")
    get_test()
    scores = load('score')
    labels = get_labels()

    scores_with_raw = np.vstack(list(scores.values())).T
    scores_without_raw = np.vstack([scores[n] for n in scores if 'raw:' not in n]).T

    print('Best Model:', max([(auc(scores[name]), name) for name in scores]))
    print()
    print(auc(scores_with_raw.mean(axis=1)), 'Simple Average')
    print(auc(weighted(scores_with_raw, labels)), 'Weighted')
    print(auc(weight_selected(scores_with_raw, labels)), 'Weight selected')
    print()
    print(auc(scores_without_raw.mean(axis=1)), 'Simple Average (without raw)')
    print(auc(weighted(scores_without_raw, labels)), 'Weighted (without raw)')
    print(auc(weight_selected(scores_without_raw, labels)), 'Weight selected (without raw)')
    print()

    final = weight_selected(scores_without_raw, labels)
    submit(final[len(labels):])
Example #8
def handle_image_generation(classifier, feature_set, imagepath, title=''):
    '''
    Train a classifier and return its scores on the train and test split.
    Save a contour image of its predictions if it is only trained on two features.

    :param classifier: A string or object describing a classifier.
    :param feature_set: A list of column names describing the feature set to train the model on.
    :param imagepath: The path to store the contour plot.
    :param title: The title of the plot with scores.
    :return: The train and test scores for the classifier.
    '''
    train_table, test_table = get_split_table()
    train_labels, test_labels = get_labels(train_table, test_table)
    classifier = fit(classifier, feature_set, train_table)
    train_score = classifier.score(train_table[feature_set], train_labels)
    test_score = classifier.score(test_table[feature_set], test_labels)

    if len(feature_set) == 2:
        fig = plt.figure()
        ax = visualize_confidence(classifier, train_table, *feature_set)
        plot_with_columns(train_table,
                          *feature_set,
                          ax=ax,
                          marker='+',
                          label='train')
        plot_with_columns(test_table, *feature_set, ax=ax, label='test')
        ax.legend()
        try:
            ax.set_title(
                title.format(train_score=train_score, test_score=test_score))
        except ValueError:
            ax.set_title(title)
        fig.savefig(imagepath)

    return train_score, test_score
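
A minimal usage sketch for the function above. The classifier name, feature names, and output path are hypothetical, and the title placeholders must match the keyword names the function passes to str.format:

# Hypothetical invocation; 'logistic', the feature names, and the path are illustrative.
train_score, test_score = handle_image_generation(
    'logistic',
    ['glucose', 'mass'],                # exactly two features, so a contour plot is saved
    'plots/logistic_contour.png',
    title='train={train_score:.2f} test={test_score:.2f}')
print(train_score, test_score)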
Example #9
def fit(cnf, predict, per_patient, features_file, n_iter, blend_cnf, test_dir):

    config = util.load_module(cnf).config
    image_files = data.get_image_files(config.get('train_dir'))
    names = data.get_names(image_files)
    labels = data.get_labels(names).astype(np.float32)[:, np.newaxis]

    if features_file is not None:
        runs = {'run': [features_file]}
    else:
        runs = data.parse_blend_config(yaml.load(open(blend_cnf)))

    scalers = {run: StandardScaler() for run in runs}

    tr, te = data.split_indices(image_files, labels)

    y_preds = []
    for i in range(n_iter):
        print("iteration {} / {}".format(i + 1, n_iter))
        for run, files in list(runs.items()):
            print("fitting features for run {}".format(run))
            X = data.load_features(files)
            X = scalers[run].fit_transform(X)
            X = data.per_patient_reshape(X) if per_patient else X
            est = get_estimator(X.shape[1], image_files, labels,
                                eval_size=0.0 if predict else 0.1)
            est.fit(X, labels)
            if not predict:
                y_pred = est.predict(X[te]).ravel()
                y_preds.append(y_pred)
                y_pred = np.mean(y_preds, axis=0)
                y_pred = np.clip(np.round(y_pred).astype(int),
                                 np.min(labels), np.max(labels))
                print("kappa after run {}, iter {}: {}".format(
                    run, i, util.kappa(labels[te], y_pred)))
                print("confusion matrix")
                print(confusion_matrix(labels[te], y_pred))
            else:
                X = data.load_features(files, test=True)
                X = scalers[run].transform(X)
                X = data.per_patient_reshape(X) if per_patient else X
                y_pred = est.predict(X).ravel()
                y_preds.append(y_pred)

    if predict:
        y_pred = np.mean(y_preds, axis=0)
        y_pred = np.clip(np.round(y_pred),
                         np.min(labels), np.max(labels)).astype(int)
        submission_filename = util.get_submission_filename()
        image_files = data.get_image_files(test_dir or config.get('test_dir'))
        names = data.get_names(image_files)
        image_column = pd.Series(names, name='image')
        level_column = pd.Series(y_pred, name='level')
        predictions = pd.concat([image_column, level_column], axis=1)

        print("tail of predictions file")
        print(predictions.tail())

        predictions.to_csv(submission_filename, index=False)
        print("saved predictions to {}".format(submission_filename))
Example #10
    def _get_next_minibatch(self):
        try:
            dataBlob, labelBlob, _ = next(self.iterator)
        except StopIteration:
            filenames = data.get_sentence(self.config.get('datafile'))
            labels = data.get_labels(self.config.get('labelfile'))
            self.iterator = iter(self.sampleIter(filenames, labels))
            dataBlob, labelBlob, _ = next(self.iterator)
        return {'data': dataBlob, 'labels': labelBlob}
Example #11
def main(directory, convert_directory, test, crop_size, extension):
    try:
        os.mkdir(convert_directory)
    except OSError:
        pass

    filenames = [os.path.join(dp, f) for dp, dn, fn in os.walk(directory)
                 for f in fn if f.endswith('jpeg') or f.endswith('tiff')]
    filenames = sorted(filenames)

    if test:
        names = data.get_names(filenames)
        y = data.get_labels(names)
        for f, level in zip(filenames, y):
            if level == 1:
                try:
                    img = convert(f, crop_size)
                    img.show()
                    Image.open(f).show()
                    real_raw_input = vars(__builtins__).get('raw_input', input)
                    real_raw_input('enter for next')
                except KeyboardInterrupt:
                    exit(0)

    print("Resizing images in {} to {}, this takes a while."
          "".format(directory, convert_directory))

    n = len(filenames)
    # process in batches, sometimes weird things happen with Pool on my machine
    batchsize = 500
    batches = n // batchsize + 1
    pool = Pool(N_PROC)

    args = []
    label = {}
    csv = open('trainLabels.csv')
    csv_lines = csv.readlines()[1:]
    for line in csv_lines:
        line = line.rstrip('\n')
        cols = line.split(',')
        label[cols[0]] = cols[1]
    csv.close()

    for f in filenames:
        args.append((convert, (directory, convert_directory, f, crop_size,
                               extension), label))


    for i in range(batches):
        print("batch {:>2} / {}".format(i + 1, batches))
        pool.map(process, args[i * batchsize: (i + 1) * batchsize])

    pool.close()

    print('done')
Example #12
def generate_train_test_segments():
    labels_per_file = data.get_labels()

    users = list(range(1, 31))
    random.shuffle(users)
    train_users = users[:21]
    train_labels, test_labels = split_dict(
        labels_per_file, lambda exp_user: exp_user[1] in train_users)

    segmenting.save_segments(segmenting.segment_activities(train_labels),
                             'train_segments.txt')
    segmenting.save_segments(segmenting.segment_activities(test_labels),
                             'test_segments.txt')
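
split_dict is not shown in this excerpt; a minimal sketch consistent with its use here (an assumption, not the original code) splits a dict into two by a predicate over its keys:

def split_dict(d, predicate):
    # Returns (matching, non_matching); here the keys are (experiment, user)
    # tuples, so the predicate selects files belonging to the training users.
    matching = {k: v for k, v in d.items() if predicate(k)}
    non_matching = {k: v for k, v in d.items() if not predicate(k)}
    return matching, non_matching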
Example #13
def get_scores(data):
    labels = get_labels()

    scores = []

    for train_idx, test_idx in KFold(len(labels), 10):
        score = predict(data[train_idx], labels[train_idx], data[test_idx])
        scores.append(score)

    score = predict(data[:len(labels)], labels, data[len(labels):])
    scores.append(score)

    return np.hstack(scores)
Example #14
def get_scores(data):
    labels = get_labels()

    scores = []

    for train_idx, test_idx in KFold(len(labels), 10):
        score = predict(data[train_idx], labels[train_idx], data[test_idx])
        scores.append(score)

    score = predict(data[:len(labels)], labels, data[len(labels):])
    scores.append(score)

    return np.hstack(scores)
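
Note: KFold(len(labels), 10) above is the pre-0.18 scikit-learn API (sklearn.cross_validation), which took the sample count and yielded index pairs directly. A sketch of the equivalent loop under current scikit-learn:

import numpy as np
from sklearn.model_selection import KFold

# Modern equivalent of `for train_idx, test_idx in KFold(len(labels), 10):`
for train_idx, test_idx in KFold(n_splits=10).split(np.arange(len(labels))):
    score = predict(data[train_idx], labels[train_idx], data[test_idx])
    scores.append(score)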
Example #15
def main(directory, convert_directory, test, crop_size, extension):

    try:
        os.mkdir(convert_directory)
    except OSError:
        pass

    filenames = [os.path.join(dp, f) for dp, dn, fn in os.walk(directory)
                 for f in fn if f.endswith('jpeg') or f.endswith('tiff')] 
    filenames = sorted(filenames)

    if test:
        names = data.get_names(filenames)
        y = data.get_labels(names)
        for f, level in zip(filenames, y):
            if level == 1:
                try:
                    img = convert(f, crop_size)
                    img.show()
                    Image.open(f).show()
                    real_raw_input = vars(__builtins__).get('raw_input', input)
                    real_raw_input('enter for next')
                except KeyboardInterrupt:
                    exit(0)

    print("Resizing images in {} to {}, this takes a while."
          "".format(directory, convert_directory))

    n = len(filenames)
    # process in batches, sometimes weird things happen with Pool on my machine
    batchsize = 500
    batches = n // batchsize + 1
    pool = Pool(N_PROC)

    args = []

    for f in filenames:
        args.append((convert, (directory, convert_directory, f, crop_size,
                               extension)))

    for i in range(batches):
        print("batch {:>2} / {}".format(i + 1, batches))
        pool.map(process, args[i * batchsize: (i + 1) * batchsize])

    pool.close()

    print('done')
Example #16
def main(cnf, classes, weights_from, predict):

    config = util.load_module(cnf).config
    files = data.get_image_files(config.get('train_dir'))
    names = data.get_names(files)
    names = [int(x) for x in names]
    data.classes = int(classes)
    labels = data.get_labels(names)
    net = create_net(config)

    print(files.shape)
    print(labels.shape)
    if predict:
        if weights_from is None:
            weights_from = config.weights_file
        else:
            weights_from = str(weights_from)
        print(weights_from)
        try:
            net.load_params_from(weights_from)
            print("loaded weights from {}".format(weights_from))
        except IOError:
            print("couldn't load weights, starting from scratch")
    if not predict:
        print("fitting ...")
        net.fit(files, labels)
    else:
        print("predicting ...")
        test_files = data.get_image_files(config.get('test_dir'))
        y_pred = net.predict(test_files)
        y_pred = y_pred.transpose()
        print(y_pred)
        y_pred = np.clip(np.round(y_pred), np.min(labels),
                         np.max(labels)).astype(int)
        # print(y_pred)
        submission_filename = util.get_submission_filename()
        image_files = data.get_image_files(config.get('test_dir'))
        names = data.get_names(image_files)
        image_column = pd.Series(names, name='photo_id')
        level_column = pd.DataFrame(y_pred)  # name='labels'
        level_column = level_column.apply(lambda x: string_submit(x))
        predictions = pd.concat([image_column, level_column], axis=1)
        print("tail of predictions file")
        print(predictions.tail())
        predictions.columns = ['photo_id', 'labels']
        predictions.to_csv(submission_filename, index=False)
        print("saved predictions to {}".format(submission_filename))
Example #17
def main(cnf, classes, weights_from, predict):

    config = util.load_module(cnf).config
    files = data.get_image_files(config.get('train_dir'))
    names = data.get_names(files)
    names = [int(x) for x in names]
    data.classes = int(classes)
    labels = data.get_labels(names)
    net = create_net(config)

    print(files.shape)
    print(labels.shape)
    if predict:
        if weights_from is None:
            weights_from = config.weights_file
        else:
            weights_from = str(weights_from)
        print(weights_from)
        try:
            net.load_params_from(weights_from)
            print("loaded weights from {}".format(weights_from))
        except IOError:
            print("couldn't load weights, starting from scratch")
    if not predict:
        print("fitting ...")
        net.fit(files, labels)
    else:
        print("predicting ...")
        test_files = data.get_image_files(config.get('test_dir'))
        y_pred = net.predict(test_files)
        y_pred = y_pred.transpose()
        print(y_pred)
        y_pred = np.clip(np.round(y_pred),
                         np.min(labels), np.max(labels)).astype(int)
        # print(y_pred)
        submission_filename = util.get_submission_filename()
        image_files = data.get_image_files(config.get('test_dir'))
        names = data.get_names(image_files)
        image_column = pd.Series(names, name='photo_id')
        level_column = pd.DataFrame(y_pred)  # name='labels'
        level_column = level_column.apply(lambda x: string_submit(x))
        predictions = pd.concat([image_column, level_column], axis=1)
        print("tail of predictions file")
        print(predictions.tail())
        predictions.columns = ['photo_id', 'labels']
        predictions.to_csv(submission_filename, index=False)
        print("saved predictions to {}".format(submission_filename))
Example #18
    def setup(self, bottom, top):
        """Setup the ResamplerDataLayer."""
        # parse the layer parameter string
        layer_config = self.param_str
        self.config = util.load_module(layer_config).config
        filenames = data.get_sentence(self.config.get('datafile'))
        labels = data.get_labels(self.config.get('labelfile'))
        self.sampleIter = iterator.SharedIterator(self.config, deterministic=True,
                                                  batch_size=self.config.get('batch_size'))
        self.iterator = iter(self.sampleIter(filenames, labels))

        self._name_to_top_map = {
            'data': 0,
            'labels': 1}

        top[0].reshape(self.config.get('batch_size'), 3,
                       self.config.get('h'), self.config.get('w'))

        top[1].reshape(self.config.get('batch_size'))
Example #19
def main(cnf, weights_from, fold, exp_run_folder, train_retina):
    config = util.load_module(cnf).config
    config.cnf['fold'] = fold  # used to change the directories for weights_best, weights_epoch and weights_final
    config.cnf['exp_run_folder'] = exp_run_folder
    protocol = data.settings['protocol']

    if train_retina != 'train_retina':
        folds = yaml.load(open('folds/' + protocol + '.yml'))
        f0, f1 = fold.split('x')
        train_list = folds['Fold_' + f0][int(f1) - 1]
        files = data.get_image_files(config.get('train_dir'), train_list)
    else:
        files = data.get_image_files(config.get('train_dir'))

    if weights_from is None:
        weights_from = config.weights_file
    else:
        weights_from = str(weights_from)

    names = data.get_names(files)
    labels = data.get_labels(names, label_file='folds/' + protocol +
                             '.csv').astype(np.int32)
    net = nn.create_net(config)

    try:
        net.load_params_from(weights_from)
        print("loaded weights from {}".format(weights_from))
    except IOError:
        print("couldn't load weights, starting from scratch")

    # Print layer info
    print("## Layer information")
    import nolearn
    layer_info = nolearn.lasagne.PrintLayerInfo()
    print(layer_info._get_greeting(net))
    layer_info, legend = layer_info._get_layer_info_conv(net)
    print(layer_info)
    print(legend)
    print("fitting ...")
    net.fit(files, labels)
Example #20
def main(cnf, weights_from):

    config = util.load_module(cnf).config

    if weights_from is None:
        weights_from = config.weights_file
    else:
        weights_from = str(weights_from)

    files = data.get_image_files(config.get('train_dir'))
    names = data.get_names(files)
    labels = data.get_labels(names).astype(np.float32)

    net = create_net(config)

    try:
        net.load_params_from(weights_from)
        print("loaded weights from {}".format(weights_from))
    except IOError:
        print("couldn't load weights starting from scratch")

    print("fitting ...")
    net.fit(files, labels)
Example #21
def build(cnf, weights_from):

    config = util.load_module(cnf).config

    if weights_from is None:
        weights_from = config.weights_file
    else:
        weights_from = str(weights_from)

    files = data.get_image_files(config.get('train_dir'))
    names = data.get_names(files)
    labels = data.get_labels(names).astype(np.float32)

    net = create_net(config)

    try:
        net.load_params_from(weights_from)
        print("loaded weights from {}".format(weights_from))
    except IOError:
        print("couldn't load weights starting from scratch")

    print("fitting ...")
    # net.fit(files, labels)
    return net, files, names, labels
Example #22
#              help="Override directory with test set images.")
cnf = 'configs/c_512_5x5_32.py'
predict = True
per_patient = True
features_file = None
n_iter = 3
blend_cnf = 'blend.yml'
test_dir = None


#def fit(cnf, predict, per_patient, features_file, n_iter, blend_cnf, test_dir):

config = util.load_module(cnf).config
image_files = data.get_image_files(config.get('train_dir'))
names = data.get_names(image_files)
labels = data.get_labels(names).astype(np.float32)[:, np.newaxis]

if features_file is not None:
    runs = {'run': [features_file]}
else:
    runs = data.parse_blend_config(yaml.load(open(blend_cnf)))

scalers = {run: StandardScaler() for run in runs}

tr, te = data.split_indices(image_files, labels)

y_preds = []
for i in range(n_iter):
    print("iteration {} / {}".format(i + 1, n_iter))
    for run, files in runs.items():
        print("fitting features for run {}".format(run))
Example #23
# 3. Test data
test_tokens = test['tokens']
test_counts = test['counts']
args.num_docs_test = len(test_tokens)
test_1_tokens = test['tokens_1']
test_1_counts = test['counts_1']
args.num_docs_test_1 = len(test_1_tokens)
test_2_tokens = test['tokens_2']
test_2_counts = test['counts_2']
args.num_docs_test_2 = len(test_2_tokens)

# 4. Labels
can_classify = True
if can_classify:
    _, labels_ts, _ = data.get_labels(args.data_path)
    _, embed_ts, _ = data.get_embeddings(args.data_path)

embeddings = None
if not args.train_embeddings:
    emb_path = args.emb_path
    vect_path = os.path.join(args.data_path.split('/')[0], 'embeddings.pkl')
    vectors = {}
    with open(emb_path, 'rb') as f:
        for l in f:
            line = l.decode().split()
            word = line[0]
            if word in vocab:
                vect = np.array(line[1:]).astype(float)
                vectors[word] = vect
    embeddings = np.zeros((vocab_size, args.emb_size))
Example #24
# warnings.filterwarnings("ignore")

RUN_CNN = False
ROOT_PATH = 'input/synimg/'  # relative to project folder

if __name__ == '__main__':
    # Read input
    MAX_PER_CLASS = 500
    MAX_TEST_ROWS = None

    train_data, train_images = read_data_from_file('synimg/train/data.csv',
                                                   max_per_class=MAX_PER_CLASS)
    test_data, test_images = read_data_from_file(
        'synimg/test/data_nostyle.csv', nrows=MAX_TEST_ROWS)
    label_encoder, train_data = get_labels(
        train_data,
        print_classes=False)  # one-hot encode, returns in column 'style_id'

    if RUN_CNN:
        test_data = run_CNN_model(train_data, train_images, test_data,
                                  test_images)  # skip over feature processing
        write_output(test_data, label_encoder)
        exit(0)
    # Preprocess features
    X_train, X_test = extract_features(train_images, cachefile="cache/train_{}".format(MAX_PER_CLASS)),\
        extract_features(test_images, cachefile="cache/test_{}".format(MAX_TEST_ROWS))
    # X_train, X_test = rescale(X_train), rescale(X_test)
    y_train = list(train_data['style_id'])

    # Train / select model
    model = model_selection(
Example #25
def auc(s):
    labels = get_labels()
    v = roc_auc_score(labels, s[:len(labels)])
    return round(v, 5)
Example #26
def auc(s):
    labels = get_labels()
    v = roc_auc_score(labels, s[:len(labels)])
    return round(v, 5)
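
The slice s[:len(labels)] assumes the score vector stacks training rows first and test rows after, as produced by get_scores above, so the AUC is computed on the labeled training portion only. A toy check under that assumption:

import numpy as np
from sklearn.metrics import roc_auc_score

labels = np.array([0, 1, 1, 0])                  # four labeled training rows
s = np.array([0.1, 0.9, 0.8, 0.3, 0.5, 0.7])     # train scores followed by test scores
print(round(roc_auc_score(labels, s[:len(labels)]), 5))  # 1.0: every positive outranks every negative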
Example #27
        acc_file, gyro_file = data.get_raw_acc_gyro(expr_id, user_id)
        acc_values = data.format_raw_data(acc_file)
        gyro_values = data.format_raw_data(gyro_file)

        for segment, label in segments:
            acc_seg = acc_values[segment[0]: segment[1]]
            gyro_seg = gyro_values[segment[0]: segment[1]]

            segmented_data.append((
                {
                    "acc": acc_seg,
                    "gyro": gyro_seg
                },
                label))

    return np.array(segmented_data)


def save_segments(segments, filename):
    with open(filename, 'wb') as output_file:
        pickle.dump(segments, output_file)


def load_segments(filename):
    with open(filename, 'rb') as input_file:
        return pickle.load(input_file)


if __name__ == '__main__':
    print(len(segment_activities(data.get_labels())))
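
A quick round-trip sketch for the pickle helpers above (the filename is illustrative):

segments = segment_activities(data.get_labels())
save_segments(segments, 'all_segments.txt')
assert len(load_segments('all_segments.txt')) == len(segments)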
Example #28
def calculateRocScore(s):
    labels = get_labels()
    v = roc_auc_score(labels, s[:len(labels)])
    return round(v, 5)
Example #29
    g, features, target_id_to_node, id_to_node = construct_graph(get_files(args.edges, args.training_dir),
                                                                 get_files(args.nodes, args.training_dir),
                                                                 args.target_ntype)

    mean, stdev, features = normalize(th.from_numpy(features))

    print('feature mean shape:{}, std shape:{}'.format(mean.shape, stdev.shape))

    g.nodes['target'].data['features'] = features

    print("Getting labels")
    n_nodes = g.number_of_nodes('target')

    labels, _, test_mask = get_labels(target_id_to_node,
                                               n_nodes,
                                               args.target_ntype,
                                               get_files(args.labels, args.training_dir),
                                               get_files(args.new_accounts, args.training_dir))
    print("Got labels")

    labels = th.from_numpy(labels).float()
    test_mask = th.from_numpy(test_mask).float()

    n_nodes = th.sum(th.tensor([g.number_of_nodes(n_type) for n_type in g.ntypes]))
    n_edges = th.sum(th.tensor([g.number_of_edges(e_type) for e_type in g.etypes]))

    print("""----Data statistics------'
                #Nodes: {}
                #Edges: {}
                #Features Shape: {}
                #Labeled Test samples: {}""".format(n_nodes,
Example #30
def fit(cnf, exp_run_folder, classifier, features_file, n_iter, blend_cnf,
        test_dir, fold):

    config = util.load_module(cnf).config
    config.cnf['fold'] = fold  # used to change the directories for weights_best, weights_epoch and weights_final
    config.cnf['exp_run_folder'] = exp_run_folder

    folds = yaml.load(open('folds/' + data.settings['protocol'] + '.yml'))
    f0, f1 = fold.split('x')
    train_list = folds['Fold_' + f0][int(f1) - 1]
    test_list = folds['Fold_' + f0][0 if f1 == '2' else 1]

    image_files = data.get_image_files(config.get('train_dir'), train_list)
    names = data.get_names(image_files)
    labels = data.get_labels(names,
                             label_file='folds/' + data.settings['protocol'] +
                             '.csv').astype(np.int32)[:, np.newaxis]

    if features_file is not None:
        runs = {'run': [features_file]}
    else:
        runs = {
            run: [
                os.path.join(exp_run_folder + '/data/features', f)
                for f in files
            ]
            for run, files in yaml.load(open(blend_cnf)).items()
        }

    scalers = {run: StandardScaler() for run in runs}

    y_preds = []
    y_preds_proba = []
    for i in range(n_iter):
        print("iteration {} / {}".format(i + 1, n_iter))
        for run, files in runs.items():
            files = [
                f.replace('f0xf1.npy', '{}.npy'.format(fold)) for f in files
            ]

            if classifier is None:
                X_test = data.load_features(files, test=True)
                if data.settings['protocol'] != 'protocol3':
                    y_pred_proba = X_test
                    y_proba = []
                    for i in range(0, len(X_test)):
                        y_proba.append(
                            y_pred_proba[i][1])  #using score from the positive
                    y_pred = np.clip(np.round(y_proba), 0, 1).astype(int)
                else:
                    y_pred_proba = est.predict_proba(X)
            else:
                print("fitting features for run {}".format(run))
                X_train = data.load_features(files)
                l2Norm = np.linalg.norm(X_train, axis=1)
                X_train = np.divide(X_train.T, l2Norm).T
                est = estimator(data.settings['protocol'],
                                classifier,
                                X_train.shape[1],
                                image_files,
                                X_train,
                                labels,
                                run,
                                fold,
                                eval_size=0.1)
                open(
                    exp_run_folder +
                    "/best_estimator_fold_{}.txt".format(fold),
                    "w").write(str(est))
                X_test = data.load_features(files, test=True)
                l2Norm = np.linalg.norm(X_test, axis=1)
                X_test = np.divide(X_test.T, l2Norm).T
                if data.settings['protocol'] != 'protocol3':
                    y_pred = est.predict(X_test).ravel()
                    y_pred_proba = est.predict_proba(X_test).ravel()
                    y_proba = []
                    for i in range(0, 2 * len(X_test), 2):
                        y_proba.append(
                            y_pred_proba[i +
                                         1])  #using score from the positive
                else:
                    y_pred_binary = est.predict(X_test)
                    y_pred = preprocessing.LabelBinarizer().fit([0, 1, 2])
                    y_pred = y_pred.inverse_transform(y_pred_binary)
                    y_proba = est.predict_proba(X_test)

    image_files = data.get_image_files(test_dir or config.get('test_dir'),
                                       test_list)
    names = data.get_names(image_files)
    labels = data.get_labels(
        names, label_file='folds/' + data.settings['protocol'] +
        '.csv').astype(np.int32)[:, np.newaxis]  # , per_patient=per_patient

    image_column = pd.Series(names, name='image')
    labels_column = pd.Series(np.squeeze(labels), name='true')

    level_column = pd.Series(y_pred, name='pred')
    if data.settings['protocol'] != 'protocol3':
        proba_column = pd.Series(y_proba, name='proba')
        predictions = pd.concat(
            [image_column, labels_column, level_column, proba_column], axis=1)
    else:
        proba_label_0 = pd.Series(y_proba[:, 0], name='proba_label_0')
        proba_label_1 = pd.Series(y_proba[:, 1], name='proba_label_1')
        proba_label_2 = pd.Series(y_proba[:, 2], name='proba_label_2')
        predictions = pd.concat([
            image_column, labels_column, level_column, proba_label_0,
            proba_label_1, proba_label_2
        ],
                                axis=1)

    predictions.to_csv(exp_run_folder +
                       "/ranked_list_fold_{}.csv".format(fold),
                       sep=';')

    print("tail of predictions")
    print(predictions.tail())
    acc = len([1 for l, y in zip(labels, y_pred) if l == y]) / float(len(labels))
    print("accuracy: {}".format(acc))
    print("confusion matrix")
    print(confusion_matrix(labels, y_pred))

    if data.settings['protocol'] != 'protocol3':
        auc = calc_auc(y_proba, labels, exp_run_folder, classifier, fold)
        print("AUC: {}".format(auc))
        average_precision = average_precision_score(labels, y_proba)
        print("average precision: {}".format(average_precision))
        c_matrix = confusion_matrix(labels, y_pred)
        print("sensitivity: {}".format(c_matrix[1][1] /
                                       (c_matrix[1][1] + c_matrix[0][1])))
        print("specificity: {}".format(c_matrix[0][0] /
                                       (c_matrix[0][0] + c_matrix[1][0])))
    else:
        y_test = label_binarize(labels, classes=[0, 1, 2])
        auc = roc_auc_score(y_test, y_proba, average='macro')
        print("AUC: {}".format(auc))
        average_precision = average_precision_score(y_test,
                                                    y_proba,
                                                    average="macro")
        print("mean average precision: {}".format(average_precision))

    results = pd.concat([
        pd.Series(exp_run_folder, name='folder'),
        pd.Series(fold, name='fold'),
        pd.Series(auc, name='auc'),
        pd.Series(average_precision, name='ap'),
        pd.Series(acc, name='acc')
    ],
                        axis=1)
    with open('results.csv', 'a') as f:
        results.to_csv(f, header=False)
Example #31
    def setUpClass(self):
        # Set up data for the whole TestCase
        self.train_data, self.train_images = read_data_from_file(
            'synimg/train/data.csv', max_per_class=MAX_PER_CLASS)
        self.label_encoder, self.train_data = get_labels(
            self.train_data, print_classes=False)  # one-hot encode, returns in column 'style_id'
        self.X_train = extract_features(self.train_images)
        self.y_train = list(self.train_data['style_id'])
Example #32
#Training samples is an array of samples taken from LCS Spring Split 2019 used to train the model
#Testing samples is an array of samples taken from LCS Spring Playoffs 2020 used to test the model's accuracy

#indexes of features
params = [7, 8, 11, 16]
#training samples and class size array
inputs, sample_sizes = data.get_feature_vec(params)
#scaling training samples
inputs = data.scaled_feature_vec(inputs)
#array of labels corresponding to training data
#0: top
#1: jg
#2: mid
#3: adc
#4: sup
type_label = data.get_labels(sample_sizes)

#Fit SVM Model
model = svm.LinearSVC(max_iter=10000)
#model = svm.SVC(gamma='scale', C=1, kernel='rbf')
model.fit(inputs, type_label)
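
# Not from the original: once fitted, the LinearSVC can label a scaled sample,
# returning 0-4 per the role mapping above, e.g.
#   classify(model.predict(inputs[:1])[0])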


#takes in an int corresponding to position
#prints position
def classify(num):
    if num == 0:
        print('Top')
    elif num == 1:
        print('Jungle')
    elif num == 2:
Example #33
def fit(classifier, feature_set, train_table):
    '''
    Train a classifier on the specified feature set.
    
    :param classifier: String or object describing a classifier.
    :param feature_set: The column names of train_table used for prediction.
    :param train_table: The dataframe used for training the model.
    :return: The trained classifier.
    '''
    if isinstance(classifier, str):
        if classifier not in classifier_map:
            raise ValueError(f'No classifier with name \'{classifier}\'')
        classifier = classifier_map[classifier]

    train_data = train_table[feature_set].copy()
    train_labels = train_table[['diabetes']].copy()
    train_labels['diabetes'] = train_labels['diabetes'].apply(lambda val: 1 if val == 'pos' else 0)
    classifier.fit(train_data, train_labels)
    return classifier


if __name__ == '__main__':
    from data import get_split_table, get_numerical_columns, get_labels
    train_table, test_table = get_split_table()
    train_labels, test_labels = get_labels(train_table, test_table)
    feature_set = get_numerical_columns(train_table)
    import random
    classifier = fit(random.choice(list(classifier_map.keys())), feature_set, train_table)
    train_score = classifier.score(train_table[feature_set], train_labels)
    test_score = classifier.score(test_table[feature_set], test_labels)
    print(f'train_score {train_score} test_score {test_score}')
Example #34
    g, features, target_id_to_node, id_to_node = construct_graph(
        args.training_dir, args.edges, args.nodes, args.target_ntype)

    mean, stdev, features = normalize(th.from_numpy(features))

    print('feature mean shape:{}, std shape:{}'.format(mean.shape,
                                                       stdev.shape))

    g.nodes['target'].data['features'] = features

    print("Getting labels")
    n_nodes = g.number_of_nodes('target')

    labels, _, test_mask = get_labels(
        target_id_to_node, n_nodes, args.target_ntype,
        os.path.join(args.training_dir, args.labels),
        os.path.join(args.training_dir, args.new_accounts))
    print("Got labels")

    labels = th.from_numpy(labels).float()
    test_mask = th.from_numpy(test_mask).float()

    n_nodes = th.sum(
        th.tensor([g.number_of_nodes(n_type) for n_type in g.ntypes]))
    n_edges = th.sum(
        th.tensor([g.number_of_edges(e_type) for e_type in g.etypes]))

    print("""----Data statistics------'
                #Nodes: {}
                #Edges: {}
                #Features Shape: {}