Example #1
def convert_data(train=True):
    # Choose the file list and the number of parallel workers per split.
    if train:
        files = config.trainfiles
        num = 16
    else:
        files = config.testfiles
        num = 7

    # Convert every file in parallel.
    # NOTE: convert_train_file is applied to the test files as well.
    utils.parmap(convert_train_file, files, num)
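All four snippets on this page funnel work through utils.parmap(func, items, num_jobs), whose implementation is not shown here. The following is a minimal sketch of what such a helper could look like, not the project's actual code. It spawns processes directly instead of using multiprocessing.Pool, because Example #4 below passes a lambda, which Pool.map cannot pickle; a fork start method is assumed.

from multiprocessing import Pipe, Process

def parmap(f, xs, num_workers=4):
    # Apply f to every element of xs in parallel, in chunks of
    # num_workers processes at a time, preserving input order.
    def worker(conn, x):
        conn.send(f(x))
        conn.close()

    results = []
    for start in range(0, len(xs), num_workers):
        chunk = xs[start:start + num_workers]
        pipes = [Pipe() for _ in chunk]
        procs = [Process(target=worker, args=(child, x))
                 for (parent, child), x in zip(pipes, chunk)]
        for p in procs:
            p.start()
        # Receive before joining so large results cannot fill the pipe
        # buffer and deadlock the workers.
        results.extend(parent.recv() for (parent, child) in pipes)
        for p in procs:
            p.join()
    return results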
Example #2
def run(X, y, XTest, yTest, foldsNum, path, inds, jobsNum=6, calcProbs=True):
    # y = y.astype(np.int)
    # yTest = yTest.astype(np.int)

    # heldoutSize = 1./foldsNum
    # X, idx = shuffle(X)
    # y = y[idx]
    # XTest, idx = shuffle(XTest)
    # yTest = yTest[idx]

    # scaler = preprocessing.StandardScaler().fit(X)
    # X = scaler.transform(X)
    # XTest = scaler.transform(XTest)

    # X = X[:5000, :]
    # y = y[:5000]
    # XTest = XTest[:500, :]
    # yTest = yTest[:500]

    # cv = StratifiedShuffleSplit(y, foldsNum, heldoutSize, random_state=0)
    cv = IndicesKFold(inds, foldsNum)  # , 4000, 1000)
    # scores = ['roc_auc', 'precision', 'recall']
    # aucScoreFunc = make_scorer(aucScore, greater_is_better=True)
    # scores = ['roc_auc']  # [aucScoreFunc]
    # scores = ['precision']
    # calcProbs = True

    # Standardize the features once, before the grid-search loop; refitting
    # the scaler inside the loop would re-scale X on every iteration.
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)
    XTest = scaler.transform(XTest)

    print('Start the grid search')
    t = time.time()
    for tuned_param in tuned_parameters:
        params = []
        for fold_num, (train_index, test_index) in enumerate(cv):
            params.append((X, y, train_index, test_index, tuned_param, fold_num, calcProbs))

        print(tuned_param)
        if jobsNum == 1:
            mapResults = [calc_cv_scores(p) for p in params]  # For debugging
        else:
            mapResults = utils.parmap(calc_cv_scores, params, jobsNum)

        cv_scores = np.array([score for (clf, score) in mapResults][:len(cv)])
        print(cv_scores)
        clf = mapResults[0][0]

        elapsed = time.time() - t
        print('Request took '+str(elapsed)+' sec.')
        print(str(datetime.now()))

        printResults(clf, XTest, yTest, calcProbs, path, cv_scores)
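IndicesKFold is a project-specific cross-validation iterator whose definition is not shown on this page. The loop above only needs it to be iterable, yielding (train_index, test_index) pairs, and to support len(). A hypothetical sketch, assuming inds assigns a trial id to each sample and that whole trials must stay within a single fold:

import numpy as np

class IndicesKFold(object):
    # Hypothetical reconstruction: split the unique trial ids in `inds`
    # into folds_num folds and yield the sample indices of each fold.
    def __init__(self, inds, folds_num):
        self.inds = np.asarray(inds)
        self.folds_num = folds_num

    def __iter__(self):
        trials = np.unique(self.inds)
        for test_trials in np.array_split(trials, self.folds_num):
            test_mask = np.isin(self.inds, test_trials)
            yield np.where(~test_mask)[0], np.where(test_mask)[0]

    def __len__(self):
        return self.folds_num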
Example #3
def run(x, y, x_test, y_test, folds_num, path, inds, jobs_num=6, calc_probs=True):
    # y = y.astype(np.int)
    # y_test = y_test.astype(np.int)

    # heldoutSize = 1./folds_num
    # x, idx = shuffle(x)
    # y = y[idx]
    # x_test, idx = shuffle(x_test)
    # y_test = y_test[idx]

    # scaler = preprocessing.StandardScaler().fit(x)
    # x = scaler.transform(x)
    # x_test = scaler.transform(x_test)

    # x = x[:5000, :]
    # y = y[:5000]
    # x_test = x_test[:500, :]
    # y_test = y_test[:500]

    # cv = StratifiedShuffleSplit(y, folds_num, heldoutSize, random_state=0)
    cv = IndicesKFold(inds, folds_num)  # , 4000, 1000)
    # scores = ['roc_auc', 'precision', 'recall']
    # auc_scoreFunc = make_scorer(auc_score, greater_is_better=True)
    # scores = ['roc_auc']  # [auc_scoreFunc]
    # scores = ['precision']
    # calc_probs = True

    parts_of_path = path.split('/')
    dump_path = os.path.join(DUMP_FOLDER, parts_of_path[-1])
    if not os.path.exists(dump_path):
        os.makedirs(dump_path)

    current_dump_folder = dump_path
    print('Start the grid search')
    t = time.time()
    for tuned_param in tuned_parameters:
        already_exist = check_if_params_were_calculated(dump_path, tuned_param)
        if already_exist:
            continue
        params = []
        for fold_num, (train_index, test_index) in enumerate(cv):
            params.append((x, y, train_index, test_index, tuned_param, fold_num, calc_probs, inds))

        print(tuned_param)
        if jobs_num == 1:
            map_results = [calc_cv_scores(p) for p in params]  # For debugging
        else:
            map_results = utils.parmap(calc_cv_scores, params, jobs_num)

        cv_scores = np.array([(score, score_trial) for (clf, score, score_trial) in map_results][:len(cv)])
        print(cv_scores)
        mean_cv_score = cv_scores[:, 0].mean()  # mean of the AUC column across folds
        print('============================== mean auc score is ' + str(mean_cv_score) + ' ==============================')
        clf = map_results[0][0]

        elapsed = time.time() - t
        print('Request took '+str(elapsed)+' sec.')
        print(str(datetime.now()))

        # scaler = preprocessing.StandardScaler().fit(x)
        # x = scaler.transform(x)
        # x_test = scaler.transform(x_test)

        # Short, filesystem-friendly name for the results; not used below.
        mini_path = path.split('/')[-1].replace(' ', '_')
        print_results(clf, x_test, y_test, calc_probs, path, None, cv_scores, current_dump_folder, mean_cv_score)
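Example #3 extends Example #2 with a resume mechanism: parameter settings whose results were already dumped to dump_path are skipped via check_if_params_were_calculated. That helper is not shown; a plausible sketch, assuming each tuned_param is a dict and dump files are named after a digest of it (the naming scheme is purely an assumption):

import hashlib
import os

def check_if_params_were_calculated(dump_path, tuned_param):
    # Derive a stable file name from the parameter setting and report
    # whether a dump for it already exists, so the search loop can skip it.
    key = hashlib.md5(repr(sorted(tuned_param.items())).encode()).hexdigest()
    return os.path.exists(os.path.join(dump_path, key + '.pkl'))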
Example #4
    sys.stdout.flush()

    return 1 - test_score


def run_single(train_files_glob, i):
    print('... reading')
    sys.stdout.flush()

    x_train, y_train = gen_vw.read_train_data(train_files_glob, i)
    x_test, y_test = gen_vw.read_test_data(train_files_glob, i)
    x_valid, y_valid = gen_vw.read_valid_data(train_files_glob, i)

    print('... reading done')
    sys.stdout.flush()

    x = T.matrix('x')
    y = T.ivector('y')

    error = train_batch(x, y, x_train, y_train, x_valid, y_valid, x_test, y_test)
    return 1 - error


if __name__ == '__main__':
    # tfile = config.train_folder + "train_subject0[1-4].mat"
    # print(run_single(tfile, 0))

    tfile = config.train_folder + "*.mat"
    precisions = utils.parmap(lambda i: run_single(tfile, i), len(glob(tfile)) - 1)
    print(sum(precisions) / len(precisions))
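Unlike the earlier examples, this call passes an integer rather than a list as parmap's second argument, which suggests a variant that maps over range(n). Expressed in terms of the list-based sketch under Example #1 (an assumption, not confirmed by this page):

def parmap_n(f, n, num_workers=4):
    # Assumed behaviour: apply f to each i in range(n), in parallel.
    return parmap(f, list(range(n)), num_workers)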