Example #1
def fine_tuning():

    # get unique drivers
    drivers = pd.read_csv('../input/driver_imgs_list.csv')
    unique_drivers = np.array(list((set(drivers['subject']))))

    dlist = list(set(drivers['subject']))
    clist = ['c0', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9']

    import itertools
    dc_list = list(itertools.product(dlist, clist))

    random.seed(random_state)
    random.shuffle(dc_list)

    kf = StratifiedKFold(map(lambda x: x[1:],np.array(dc_list)[:,1]), n_folds=nfolds,
                    shuffle=False, random_state=random_state)

    num_fold = 0
    fold_number = 0
    cv_pred_list = []
    cv_score = []

    all_pred_list = []

    for train_drivers, test_drivers in kf:

        #if fold_number <= 0:
        #    fold_number += 1
        #    continue

        # add test drivers for training i.e., semi-supervised learning
        #sample_rate_test = 0.3
        np.random.seed(fold_number)
        sample_rate_test = np.random.uniform(0.1, 0.3)
        print 'sample_rate_test:', sample_rate_test

        preds_test = pd.read_csv('submission/ensemble1_8.csv')

        # extract c9
        tmp_c9 = preds_test.iloc[(preds_test.c9>0.450).values,:]
        preds_test = preds_test.iloc[(preds_test.c9<=0.450).values,:]

        # extract features > 0.9, except c9
        tmp = preds_test.iloc[:,:10].max(1)
        tmp_idx = tmp.iloc[(tmp > 0.90).values].index
        preds_test = preds_test.ix[tmp_idx]

        # concat
        preds_test = pd.concat([preds_test, tmp_c9], axis=0)


        preds_test.reset_index(drop=True, inplace=True)

        preds_test_label = preds_test.iloc[:,:10].idxmax(1)
        preds_test_img = preds_test['img']

        preds_test_semi = pd.DataFrame()
        preds_test_semi['subject'] = ['p099'] * len(preds_test)
        preds_test_semi['classname'] = preds_test_label.values
        preds_test_semi['img'] = preds_test_img.values

        preds_test_semi['classname'] = 'test_' + preds_test_semi['classname']

        preds_test_semi = preds_test_semi.sample(int(len(preds_test) * sample_rate_test)
                        , random_state=fold_number)
        """
        try:
            print 'Fold{} training with drivers split'.format(fold_number)

            dc_list_tr = np.array(dc_list)[train_drivers].tolist()
            dc_list_te = np.array(dc_list)[test_drivers].tolist()

            print 'combination for validation'
            for i in dc_list_te:
                print i,
            print

            print 'number of validation drivers: {}'.format(len(set(np.array(dc_list_te)[:,0])))
            print 'number of validation class: {}'.format(len(set(np.array(dc_list_te)[:,1])))

            print pd.Series(np.array(dc_list_te)[:,0]).value_counts()
            print pd.Series(np.array(dc_list_te)[:,1]).value_counts()

            def f_tr(data):
                if data.tolist() in dc_list_tr:
                    return True
                else:
                    return False

            def f_te(data):
                if data.tolist() in dc_list_te:
                    return True
                else:
                    return False

            index_tr = drivers[['subject', 'classname']].apply(f_tr, axis=1).values
            index_te = drivers[['subject', 'classname']].apply(f_te, axis=1).values

            alltrain_drivers = drivers[index_tr]
            allvalid_drivers = drivers[index_te]


            # add test images for training
            alltrain_drivers = pd.concat([alltrain_drivers, preds_test_semi])


            alltrain_drivers.to_csv('../input/driver_imgs_list_alltrain.csv', index=False)
            allvalid_drivers.to_csv('../input/driver_imgs_list_allvalid.csv', index=False)

            print 'final training'
            model = final_training()


            # recalculate

            #print 'train drivers: {}'.format(train_drivers_fold)
            #print 'validation drivers: {}'.format(test_drivers_fold)

            samples_per_epoch = len(alltrain_drivers)
            nb_val_samples = len(allvalid_drivers)

            print 'training data: {}'.format(samples_per_epoch)
            print 'validation data: {}'.format(nb_val_samples)
            #samples_per_epoch = batch_size * (samples_per_epoch // batch_size)
            #nb_val_samples = batch_size * (nb_val_samples // batch_size)

            #
            #model = vgg_std16_model(img_rows, img_cols, color_type_global)

            #print 'create generator for saving bottleneck features'
            train_data_generator = generate_arrays_from_file( \
                                    '../input/driver_imgs_list_alltrain.csv'
                                    ,isvalidation=False, usingalldata=False)

            valid_data_generator = generate_arrays_from_file( \
                                    '../input/driver_imgs_list_allvalid.csv'
                                    , isvalidation=True, usingalldata=False)

            callbacks = [
                EarlyStopping(monitor='val_loss', patience=4, verbose=0),
            ]

            # training
            model.fit_generator(train_data_generator,
                        samples_per_epoch=samples_per_epoch, nb_epoch=nb_epoch_all,
                        nb_val_samples=nb_val_samples,
                        validation_data=valid_data_generator, max_q_size=10)


            predictions_valid = model.evaluate_generator(valid_data_generator,
                val_samples=nb_val_samples)
            #score = log_loss(Y_valid, predictions_valid)
            print('Score log_loss: ', predictions_valid)
            cv_score.append(predictions_valid)

            info_string = 'loss_' + str(predictions_valid) \
                        + '_r_' + str(img_rows) \
                        + '_c_' + str(img_cols) \
                        + '_folds_' + str(fold_number) \
                        + '_ep_' + str(nb_epoch)

            cv_pred_list.append('submission/' + info_string + '.csv')

            # predictions with new version
            test_data_generator = test_prediction('../input/imgs/test/*.jpg')

            preds = model.predict_generator(test_data_generator, val_samples=79726)

            save_pred(preds, '../input/imgs/test/*.jpg', \
                                        submission_name=info_string)

            del model
            gc.collect()


        except Exception as e:
            print str(e)



        # delete top model weights
        if os.path.exists(top_model_weights_path):
            os.remove(top_model_weights_path)
        """

        ### Using all data with random split
        try:
            print 'Fold{} training with all data'.format(fold_number)

            drivers = pd.read_csv('../input/driver_imgs_list.csv')

            # random split
            np.random.seed(fold_number)
            split_data = np.random.uniform(0.05, 0.15)

            random_index = random.sample(range(len(drivers)),int(len(drivers)*(1-split_data)))
            alltrain_drivers = drivers.iloc[(drivers.index.isin(random_index)), :]
            allvalid_drivers = drivers.iloc[~(drivers.index.isin(random_index)), :]

            # add test images for training
            alltrain_drivers = pd.concat([alltrain_drivers, preds_test_semi])

            alltrain_drivers.to_csv('../input/driver_imgs_list_alltrain.csv', index=False)
            allvalid_drivers.to_csv('../input/driver_imgs_list_allvalid.csv', index=False)


            print 'final training'
            model = final_training()


            # recalculate

            #print 'train drivers: {}'.format(train_drivers_fold)
            #print 'validation drivers: {}'.format(test_drivers_fold)

            samples_per_epoch = len(alltrain_drivers)
            nb_val_samples = len(allvalid_drivers)

            print 'training data: {}'.format(samples_per_epoch)
            print 'validation data: {}'.format(nb_val_samples)
            #samples_per_epoch = batch_size * (samples_per_epoch // batch_size)
            #nb_val_samples = batch_size * (nb_val_samples // batch_size)

            #
            #model = vgg_std16_model(img_rows, img_cols, color_type_global)

            #print 'create generator for saving bottleneck features'
            train_data_generator = generate_arrays_from_file( \
                                    '../input/driver_imgs_list_alltrain.csv'
                                    ,isvalidation=False, usingalldata=False)

            valid_data_generator = generate_arrays_from_file( \
                                    '../input/driver_imgs_list_allvalid.csv'
                                    , isvalidation=True, usingalldata=False)

            callbacks = [
                EarlyStopping(monitor='val_loss', patience=4, verbose=0),
            ]

            # training
            model.fit_generator(train_data_generator,
                        samples_per_epoch=samples_per_epoch, nb_epoch=nb_epoch_all,
                        nb_val_samples=nb_val_samples,
                        validation_data=valid_data_generator, max_q_size=10)
                        #callbacks=callbacks)


            predictions_valid = model.evaluate_generator(valid_data_generator,
                val_samples=nb_val_samples)
            #score = log_loss(Y_valid, predictions_valid)
            print('Score log_loss: ', predictions_valid)
            #cv_score.append(predictions_valid)

            info_string = 'all_loss_' + str(predictions_valid) \
                        + '_r_' + str(img_rows) \
                        + '_c_' + str(img_cols) \
                        + '_folds_' + str(fold_number) \
                        + '_ep_' + str(nb_epoch_all) \
                        + 'test' + str(sample_rate_test)

            all_pred_list.append('submission/' + info_string + '.csv')

            # predictions with new version
            test_data_generator = test_prediction('../input/imgs/test/*.jpg')

            preds = model.predict_generator(test_data_generator, val_samples=79726)

            save_pred(preds, '../input/imgs/test/*.jpg', \
                                        submission_name=info_string)

            del model
            gc.collect()


        except Exception as e:
            print str(e)
            fold_number += 1
            continue


        # next fold
        fold_number += 1


    #print 'CV mean: {:.6}, std: {:.6}'.format(np.mean(cv_score), np.std(cv_score))
    #averaging(cv_pred_list, 'ensemble_{}_CV{:.3}'.format(model_name, np.mean(cv_score)))
    averaging(all_pred_list, 'ensemble_{}_all'.format(model_name))

    #cv_all_pred_list = cv_pred_list[:]
    #cv_all_pred_list.extend(all_pred_list)
    #averaging(cv_all_pred_list, 'ensemble_{}_CV_all'.format(model_name))


    return
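
All of these snippets use the scikit-learn 0.17-era cross_validation API, where StratifiedKFold is built from the label array and iterated directly. For reference, a minimal sketch of the same fold loop with the current model_selection API (assuming feature and label arrays X and y, which are not part of the snippet above):

from sklearn.model_selection import StratifiedKFold  # scikit-learn >= 0.18

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train_idx, test_idx in skf.split(X, y):
    # indices come from split(); the labels are no longer passed to the constructor
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
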
Example #2
def afms(kcs,
         opps,
         actuals,
         stu,
         student_label,
         item_label,
         nfolds=3,
         seed=None):
    """
    Executes AFM+S on the provided data and returns model fits and parameter estimates
    """
    sv = DictVectorizer()
    qv = DictVectorizer()
    ov = DictVectorizer()

    S = sv.fit_transform(stu)
    Q = qv.fit_transform(kcs)
    O = ov.fit_transform(opps)

    X = hstack((S, Q, O))
    y = np.array(actuals)

    l2 = [1.0 for i in range(S.shape[1])]
    l2 += [0.0 for i in range(Q.shape[1])]
    l2 += [0.0 for i in range(O.shape[1])]

    bounds = [(None, None) for i in range(S.shape[1])]
    bounds += [(None, None) for i in range(Q.shape[1])]
    bounds += [(0, None) for i in range(O.shape[1])]

    X = X.toarray()
    X2 = Q.toarray()

    model = BoundedLogistic(first_bounds=bounds, first_l2=l2)
    model.fit(X, X2, y)
    coef_s = model.coef1_[0:S.shape[1]]
    coef_s = [[k, v, invlogit(v)]
              for k, v in sv.inverse_transform([coef_s])[0].items()]
    coef_q = model.coef1_[S.shape[1]:S.shape[1] + Q.shape[1]]
    coef_qint = qv.inverse_transform([coef_q])[0]
    coef_o = model.coef1_[S.shape[1] + Q.shape[1]:S.shape[1] + Q.shape[1] +
                          O.shape[1]]
    coef_qslope = ov.inverse_transform([coef_o])[0]
    coef_qslip = qv.inverse_transform([model.coef2_])[0]

    kc_vals = []
    all_kcs = set(coef_qint).union(set(coef_qslope)).union(set(coef_qslip))
    for kc in all_kcs:
        kc_vals.append([
            kc,
            coef_qint.setdefault(kc, 0.0),
            invlogit(coef_qint.setdefault(kc, 0.0)),
            coef_qslope.setdefault(kc, 0.0),
            coef_qslip.setdefault(kc, 0.0)
        ])

    cvs = [
        KFold(len(y), n_folds=nfolds, shuffle=True, random_state=seed),
        StratifiedKFold(y, n_folds=nfolds, shuffle=True, random_state=seed),
        LabelKFold(student_label, n_folds=nfolds),
        LabelKFold(item_label, n_folds=nfolds)
    ]

    # scores_header = []
    scores = []
    for cv in cvs:
        score = []
        for train_index, test_index in cv:
            X_train, X_test = X[train_index], X[test_index]
            X2_train, X2_test = X2[train_index], X2[test_index]
            y_train, y_test = y[train_index], y[test_index]
            model.fit(X_train, X2_train, y_train)
            score.append(model.mean_squared_error(X_test, X2_test, y_test))
        # scores_header.append(cv_name)
        scores.append(np.mean(np.sqrt(score)))

    return scores, kc_vals, coef_s
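
The LabelKFold splitters used in afms above were renamed GroupKFold in scikit-learn 0.18. A compact sketch of the student-level grouping with the current API (assuming arrays X, y and the student_label group ids from the function signature):

from sklearn.model_selection import GroupKFold

gkf = GroupKFold(n_splits=3)
# every student's rows land entirely in one fold; the labels go to split(), not the constructor
folds = list(gkf.split(X, y, groups=student_label))
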
                    DecisionTreeClassifier(criterion='entropy'),
                    GaussianNB(),
                    XGBClassifier(max_depth=5, n_estimators=500)
                ]
                # Mean scores in K-FOLD
                precisions = np.zeros((n_folds, len(classifiers)))
                recalls = np.zeros((n_folds, len(classifiers)))
                f_scores = np.zeros((n_folds, len(classifiers)))
                accuracies = np.zeros((n_folds, len(classifiers)))
                cv_s = np.zeros((n_folds * len(classifiers)))

                i = 0
                """ Performing cross validation """
                # print('Begin k-fold')
                for train_idx, test_idx in StratifiedKFold(labels,
                                                           n_folds=n_folds,
                                                           shuffle=True):
                    print("FOLD: " + str(i + 1))

                    X_train = texts[train_idx]
                    y_train = labels[train_idx]
                    X_test = texts[test_idx]
                    y_test = labels[test_idx]

                    models = list()
                    models.append(
                        lsa(vectorizer=vectorizer,
                            classifier=classifiers[0],
                            k=k))

                    if chosen_ds == 'filatova' and args.star:
Example #4
                         stop_words=None,
                         max_features=None,
                         decode_error='ignore')),
        #('tfidf', TfidfTransformer(use_idf=False)),
        ('clf', SVC(C=5.2, kernel='linear', probability=True))
    ])

    vot_clf = VotingClassifier(estimators=[('glove', glove_clf),
                                           ('linear', char_clf)],
                               voting='soft')

    print char_clf.named_steps

    print "TRAIN"
    print 80 * '='
    cv = StratifiedKFold(data.Stance, n_folds=10, shuffle=True, random_state=1)

    pred_stances = cross_val_predict(vot_clf,
                                     data.Abstract,
                                     data.Stance,
                                     cv=cv)

    print classification_report(data.Stance, pred_stances, digits=4)

    macro_f = fbeta_score(data.Stance,
                          pred_stances,
                          1.0,
                          labels=['AGAINST', 'FAVOR', 'NONE'],
                          average='weighted')

    print 'macro-average of F-score(FAVOR) and F-score(AGAINST): {:.4f}\n'.format(
Example #5
    INPUT_MASK_PATH = '/neurospin/brainomics/2018_euaims_leap_predict_vbm/results/VBM/1.5mm/data/mask.nii'

    NFOLDS_OUTER = 6
    NFOLDS_INNER = 5

    shutil.copy(INPUT_DATA_X, WD)
    shutil.copy(INPUT_DATA_y, WD)
    shutil.copy(INPUT_MASK_PATH, WD)
    #############################################################################
    ## Create config file
    y = np.load(INPUT_DATA_y)
    site = np.load(
        "/neurospin/brainomics/2018_euaims_leap_predict_vbm/results/VBM/1.5mm/by_age/data/adolescents/site.npy"
    )

    cv_outer = [[tr, te] for tr, te in StratifiedKFold(
        y.ravel(), n_folds=NFOLDS_OUTER, random_state=42)]
    cv_outer[0][0] = np.transpose(np.where(site != 1)).ravel()
    cv_outer[0][1] = np.transpose(np.where(site == 1)).ravel()

    cv_outer[1][0] = np.transpose(np.where(site != 2)).ravel()
    cv_outer[1][1] = np.transpose(np.where(site == 2)).ravel()

    cv_outer[2][0] = np.transpose(np.where(site != 3)).ravel()
    cv_outer[2][1] = np.transpose(np.where(site == 3)).ravel()

    cv_outer[3][0] = np.transpose(np.where(site != 4)).ravel()
    cv_outer[3][1] = np.transpose(np.where(site == 4)).ravel()

    cv_outer[4][0] = np.transpose(np.where(site != 5)).ravel()
    cv_outer[4][1] = np.transpose(np.where(site == 5)).ravel()
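
The per-site overrides above build what amounts to a leave-one-site-out outer loop. With the current scikit-learn API the same idea can be expressed directly; a sketch (assuming a feature array X alongside y and site, and ignoring the extra stratified fold kept in the hybrid construction above):

from sklearn.model_selection import LeaveOneGroupOut

logo = LeaveOneGroupOut()
# one fold per distinct site value; that site's subjects form the test set
cv_outer = [[tr, te] for tr, te in logo.split(X, y.ravel(), groups=site)]
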
Example #6
    def handle(self, *args, **options):
        print(settings.BASE_DIR, 'BASE_DIR')
        print(os.path.join(settings.BASE_DIR, '../'))

        page_num = options['n']
        save_flag = options['save']
        # self.stdout.write(str(page_num), ending='\n')

        # [Category_obj1, Category_obj2, ...]
        categories = get_categories("https://gunosy.com/")
        all_contents = []
        all_links = []
        all_labels = []
        for i, category in enumerate(categories):
            for page_num in range(1, page_num + 1):
                pager_query = '?page=%d' % page_num
                url = category.url + pager_query
                print(url)
                links, contents = get_links_and_contents(url)
                all_links.extend(links)
                all_contents.extend(contents)
                all_labels.extend([i] * len(links))

        res = get_words_matrix(all_contents)
        dictionary = corpora.Dictionary(res)
        dictionary.filter_extremes(no_below=10, no_above=0.2)
        # [ [(w_id1, w_id1_cnt), (w_id2, w_id2_cnt),...] ,
        #   [(w_id1, w_id1_cnt), (w_id2, w_id2_cnt),...] ,
        # ]
        bows = [dictionary.doc2bow(x) for x in res]

        X = np.array([(matutils.corpus2dense([vec], num_terms=len(dictionary)).T[0])
                      for vec in bows])
        y = np.array(all_labels)

        # cross validation
        skf = StratifiedKFold(y, n_folds=5, shuffle=True, random_state=7654)
        scores = []
        for train_index, test_index in skf:
            #print(train_index, test_index)
            train_X, test_X = X[train_index], X[test_index]
            train_y, test_y = y[train_index], y[test_index]
            score = evaluate_model(train_X, test_X, train_y, test_y)
            scores.append(score)
        scores = np.array(scores)
        print(scores)
        print('Result', np.mean(scores))

        if save_flag:
            def save_as_pickle(file_path, obj):
                with open(file_path, 'wb') as f:
                    pickle.dump(obj, f)

            # save to pickle
            cat_prob = calc_cat(y)
            word_cat_prob = calc_each_word_bar_cat(X, y)
            cvt_y_to_category_name = dict(list(starmap(lambda i, x: (
                i, x.name), enumerate(categories))))  # dict mapping label y to its category name
            dir_path = (os.path.join(settings.BASE_DIR, '../'))
            save_as_pickle(dir_path + 'dictionay.dump', dictionary)
            save_as_pickle(dir_path + 'cat_prob.dump', cat_prob)
            save_as_pickle(dir_path + 'word_cat_prob.dump', word_cat_prob)
            save_as_pickle(
                dir_path + 'cvt_y_to_category_name.dump', cvt_y_to_category_name)
            print('------ Save completed. ------')
        else:
            print('------ not saved. -------')
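
evaluate_model is a project helper that is not shown in this snippet; a minimal sketch of what such a function might do, assuming a multinomial naive Bayes classifier scored by accuracy (both choices are assumptions, not the original implementation):

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

def evaluate_model(train_X, test_X, train_y, test_y):
    # fit on the training fold, return accuracy on the held-out fold
    clf = MultinomialNB()
    clf.fit(train_X, train_y)
    return accuracy_score(test_y, clf.predict(test_X))
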
Example #7
def lstm_predict():
    dim = 100
    # preprocess for lstm
    print('preprocess data...')
    x_train, y_train = load_data()
    x_test, test_id = load_testdata()
    #     print(x_train.shape,x_test.shape)
    x_train = preprocess_data(x_train)
    x_test = preprocess_data(x_test)
    data = pd.concat([x_train, x_test], axis=0).astype(str)
    texts = [[word for word in document.split(' ')]
             for document in data.values]
    #     texts, label = select_data(texts,label)

    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            #token = int(token)
            #print token
            frequency[token] += 1

    texts = [[token for token in text if frequency[token] >= 20]
             for text in texts]
    vocab = set([word for doc in texts for word in doc])
    vocab_size = len(vocab)

    print('generate embedding_matrix...')
    embedding_matrix = np.zeros((vocab_size, dim))
    word2index = {}
    model = Word2Vec.load('../feature/predict/100w2vModel.m')
    #     model = get_pretrained_w2vmodel()
    for i, word in enumerate(vocab):
        word2index[word] = i
        # if model is selftrained
        embedding_matrix[i] = model[word]
        # if model is pretrained
#         if word in model.keys():
#             try:
#                 embedding_matrix[i] = model[word]
#             except:
#                 print('error: ',word)
#         else:
#             embedding_matrix[i] = np.zeros(dim)

    print('generate encoded_texts...')
    encoded_texts = []
    for doc in texts:
        encoded_doc = []
        for word in doc:
            encoded_doc.append(word2index[word])
        encoded_texts.append(encoded_doc)


#     print(encoded_texts[:100])
#     max_length = max([len(doc) for doc in texts])
    max_length = config['max_length']
    print('generate padded_texts...')
    padded_texts = pad_sequences(encoded_texts,
                                 maxlen=max_length,
                                 padding='post')

    x_train = padded_texts[:len(x_train)]
    x_test = padded_texts[len(x_train):]
    #     x_train, x_test, y_train, y_test = train_test_split(padded_texts, label, test_size=0.2,random_state=42)
    print(len(x_train), len(x_test), len(y_train))

    # lstm model structure
    print('Construct lstm model...')
    model = Sequential()
    embedding = Embedding(input_dim=vocab_size,
                          output_dim=dim,
                          mask_zero=True,
                          weights=[embedding_matrix],
                          input_length=max_length,
                          trainable=False)
    model.add(embedding)
    model.add(
        LSTM(units=50,
             activation='sigmoid',
             recurrent_activation='hard_sigmoid'))
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    # Compile and train the model
    print('Compiling the Model...')
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    print("Train...")
    skf = StratifiedKFold(y_train, n_folds=3, shuffle=True)
    new_train = np.zeros((len(x_train), 1))
    new_test = np.zeros((len(x_test), 1))

    for i, (trainid, valid) in enumerate(skf):
        print('fold' + str(i))
        train_x = x_train[trainid]
        train_y = y_train[trainid]
        val_x = x_train[valid]
        model.fit(train_x,
                  train_y,
                  batch_size=config['batch_size'],
                  epochs=config['n_epoch'],
                  verbose=1)
        new_train[valid] = model.predict_proba(val_x)
        new_test += model.predict_proba(x_test)

    new_test /= 3
    stacks = []
    stacks_name = []
    stack = np.vstack([new_train, new_test])
    stacks.append(stack)
    stacks = np.hstack(stacks)
    clf_stacks = pd.DataFrame(data=stacks, columns=['lstm'])
    clf_stacks.to_csv('../feature/predict/lstm_prob2.csv', index=0)
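
One caveat in the fold loop above: the Keras model is compiled once and then fit repeatedly, so each fold continues from the previous fold's weights and the out-of-fold predictions are not fully independent. A common variant rebuilds the network inside the loop; a sketch, assuming a build_lstm_model factory (hypothetical, not part of the original code) wrapping the same layers:

def build_lstm_model(vocab_size, dim, max_length, embedding_matrix):
    # same architecture as above, reinitialised from scratch for every fold
    m = Sequential()
    m.add(Embedding(input_dim=vocab_size, output_dim=dim, mask_zero=True,
                    weights=[embedding_matrix], input_length=max_length,
                    trainable=False))
    m.add(LSTM(units=50, activation='sigmoid',
               recurrent_activation='hard_sigmoid'))
    m.add(Dropout(0.5))
    m.add(Dense(1, activation='sigmoid'))
    m.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])
    return m

# inside the fold loop: model = build_lstm_model(vocab_size, dim, max_length, embedding_matrix)
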
from utility import formatAndPrintMetrics
from utility import cross_entropy
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing

fullTrainFile = 'C:/myD/workarea/KaggleWallmartWorkarea/kaggle_wallmart/data_CSV/train_svm_light.v2.new.txt'
X, Y = get_data(fullTrainFile)
le = preprocessing.LabelEncoder()
le.fit(Y)
Y = le.transform(Y) 

skf = StratifiedKFold(Y, n_folds=3, random_state=app_random_state_value)
skfList = list(skf)
train_index, test_index = skfList[0]
XD = X#.todense()
xTr, xTe = XD[train_index], XD[test_index]
yTr, yTe = Y[train_index], Y[test_index]

clf = MultinomialNB()
clf = SGDClassifier(loss="hinge", penalty="l2")
clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.8, max_depth=15, subsample=0.9, verbose=5, random_state=app_random_state_value)
clf = GaussianNB() #yhTeGNB = yhTe #yLogPGNB = yLogP
clf = RandomForestClassifier(n_estimators=500,verbose=1,n_jobs=4, random_state=app_random_state_value)

#### Temp code to experiment with single-class classification (binary 1/0)
yTrMod = [1 if a == 37 else 0 for a in yTr]
yTeMod = [1 if a == 37 else 0 for a in yTe]
Example #9
dataset = pd.read_json("../data/preprocessed.json")
dataset = dataset.reset_index(drop=True)


def in_arange(s, e, step):
    return np.append(np.arange(s, e, step), e)


#################################################

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold
skf = StratifiedKFold(dataset["highest_reaction"], n_folds=10)

# param_grid = {'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
#               'vect__min_df': in_arange(1, 30, 1),
#               'vect__max_df': in_arange(0.7, 1.0, 0.1),
#               'tfidf__use_idf': [True],
#               'clf__C': in_arange(0.1, 2.0, 0.3),
# }

# param_grid = {'vect__ngram_range': [(1, 2)],
#               'vect__min_df': in_arange(1, 20, 1),
#               'vect__max_df': in_arange(0.01, 0.5, 0.1),
#               'tfidf__use_idf': [True],
#               'clf__C': in_arange(0.1, 2.0, 0.1),
# }
Example #10
print data.shape
outcome_var = 'ckd'
predictor_var = [c for c in columns if c not in ["ckd"]]
X = data[predictor_var]
y = data[outcome_var]
print(X)
print(y)

# Create the RFE object and compute a cross-validated score.
#svc = SVC(kernel="linear")
from sklearn import linear_model

model = linear_model.LogisticRegression(fit_intercept=True, multi_class="ovr")
# The "accuracy" scoring is proportional to the number of correct
# classifications
rfecv = RFECV(estimator=model,
              step=1,
              cv=StratifiedKFold(y, 2),
              scoring='accuracy')
rfecv.fit(X, y)
print('accuracy scoring', rfecv.scoring)
print("Ranking of the features : %d" % rfecv.ranking_)

print("Optimal number of features : %d" % rfecv.n_features_)

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
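
rfecv.grid_scores_ used for the plot above was deprecated and later removed in newer scikit-learn releases in favor of cv_results_; with a recent version the same curve can be drawn roughly as follows (a sketch, assuming scikit-learn >= 1.0):

mean_scores = rfecv.cv_results_["mean_test_score"]
plt.plot(range(1, len(mean_scores) + 1), mean_scores)
plt.xlabel("Number of features selected")
plt.ylabel("Mean cross-validation score")
plt.show()
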
Example #11
                if np.isinf(x) or x > y_max:
                    x = y_max
                x_row.append(x)
            X.append(x_row)

        y = []

        for y_true in fs_list:
            if y_true >= TH:
                y.append(1)
            else:
                y.append(0)
        X = np.asarray(X)
        y = np.asarray(y)

        kf = StratifiedKFold(y,n_folds=10,random_state=0,shuffle=True)
        cm = np.zeros((2,2))
        for tr_ind,ts_ind in kf:
            if clf == 'SVM':
                lr = SVC(cache_size=20000,kernel='rbf',C=1,probability=True)
            if clf == 'LR':
                lr = LogisticRegression(C=0.0001,n_jobs=-1)
            if clf == 'RF':
                lr = RandomForestClassifier(n_estimators=100,criterion='gini',max_features='sqrt',n_jobs=-1)

            lr.fit(X[tr_ind],y[tr_ind])
            y_pred = lr.predict(X[ts_ind])
            y_true = y[ts_ind]

            cm_one_fold = confusion_matrix(y_true,y_pred)
            if cm_one_fold.shape == (2,2):
Example #12
        'num_var22_ult3',  # 0.03452566096423017
        'saldo_medio_var5_hace3',  # 0.04074650077760498
        'saldo_medio_var5_hace2',  # 0.04292379471228616
        'SumZeros',  # 0.04696734059097978
        'saldo_var30',  # 0.09611197511664074
        'var38',  # 0.1390357698289269
        'var15'
    ]  # 0.20964230171073095
    features = train.columns[1:-1]
    todrop = list(set(tokeep).difference(set(features)))
    train.drop(todrop, inplace=True, axis=1)
    test.drop(todrop, inplace=True, axis=1)
    features = train.columns[1:-1]
    split = 10
    skf = StratifiedKFold(train.TARGET.values,
                          n_folds=split,
                          shuffle=False,
                          random_state=42)

    train_preds = None
    test_preds = None
    visibletrain = blindtrain = train
    index = 0
    print('Change num_rounds to 350')
    num_rounds = 350
    params = {}
    params["objective"] = "binary:logistic"
    params["eta"] = 0.03
    params["subsample"] = 0.8
    params["colsample_bytree"] = 0.7
    params["silent"] = 1
    params["max_depth"] = 5
Example #13
def init():
    os.makedirs(WD, exist_ok=True)
    shutil.copy(os.path.join(DATA_PATH, 'X.npy'), WD)
    shutil.copy(os.path.join(DATA_PATH, 'y.npy'), WD)

    # VBM
    if DATA_TYPE == "image":
        shutil.copy(os.path.join(DATA_PATH, 'mask.nii'), WD)
    elif DATA_TYPE == "mesh":
        shutil.copy(os.path.join(DATA_PATH, 'mask.npy'), WD)
        shutil.copy(os.path.join(DATA_PATH, 'lrh.pial.gii'), WD)

    shutil.copy(os.path.join(DATA_PATH, "Atv.npz"), WD)

    site = np.load(
        "/neurospin/brainomics/2016_schizConnect/analysis/all_studies+VIP/VBM/all_subjects/data/site.npy"
    )

    ## Create config file
    os.chdir(WD)
    X = np.load("X.npy")
    y = np.load("y.npy")

    cv_outer = [[tr, te] for tr, te in StratifiedKFold(
        y.ravel(), n_folds=NFOLDS_OUTER, random_state=42)]
    cv_outer[0][0] = np.transpose(np.where(site != 1)).ravel()
    cv_outer[0][1] = np.transpose(
        np.where(site == 1)).ravel()  #CV00 TEST ON COBRE

    cv_outer[1][0] = np.transpose(np.where(site != 2)).ravel()
    cv_outer[1][1] = np.transpose(
        np.where(site == 2)).ravel()  #CV01 TEST ON NMORPHch

    cv_outer[2][0] = np.transpose(np.where(site != 3)).ravel()
    cv_outer[2][1] = np.transpose(
        np.where(site == 3)).ravel()  #CV02 TEST ON NUSDAST

    cv_outer[3][0] = np.transpose(np.where(site != 4)).ravel()
    cv_outer[3][1] = np.transpose(
        np.where(site == 4)).ravel()  #CV03 TEST ON VIP

    assert len(cv_outer[0][0]) == 442
    assert len(cv_outer[1][0]) == 526
    assert len(cv_outer[2][0]) == 336
    assert len(cv_outer[3][0]) == 514

    cv_outer[0][0] = cv_outer[0][0][:int(np.around(len(cv_outer[0][0]) * 0.1))]
    cv_outer[1][0] = cv_outer[1][0][:int(np.around(len(cv_outer[1][0]) * 0.1))]
    cv_outer[2][0] = cv_outer[2][0][:int(np.around(len(cv_outer[2][0]) * 0.1))]
    cv_outer[3][0] = cv_outer[3][0][:int(np.around(len(cv_outer[3][0]) * 0.1))]

    assert len(cv_outer[0][0]) == 44
    assert len(cv_outer[1][0]) == 53
    assert len(cv_outer[2][0]) == 34
    assert len(cv_outer[3][0]) == 51

    import collections
    cv = collections.OrderedDict()
    for cv_outer_i, (tr_val, te) in enumerate(cv_outer):
        cv["cv%02d/all" % (cv_outer_i)] = [tr_val, te]
        cv_inner = StratifiedKFold(y[tr_val].ravel(),
                                   n_folds=NFOLDS_INNER,
                                   random_state=42)
        for cv_inner_i, (tr, val) in enumerate(cv_inner):
            cv["cv%02d/cvnested%02d" %
               ((cv_outer_i), cv_inner_i)] = [tr_val[tr], tr_val[val]]
    for k in cv:
        cv[k] = [cv[k][0].tolist(), cv[k][1].tolist()]

    C_range = [[100], [10], [1], [1e-1], [1e-2], [1e-3], [1e-4], [1e-5],
               [1e-6], [1e-7], [1e-8], [1e-9]]

    config = dict(data=dict(X="X.npy", y="y.npy"),
                  params=C_range,
                  resample=cv,
                  structure_linear_operator_tv="Atv.npz",
                  map_output="results",
                  user_func=user_func_filename)
    json.dump(config, open(os.path.join(WD, "config_cv_largerange.json"), "w"))

    # Build utils files: sync (push/pull) and PBS
    import brainomics.cluster_gabriel as clust_utils
    cmd = "mapreduce.py --map  %s/config_cv_largerange.json" % WD_CLUSTER
    clust_utils.gabriel_make_qsub_job_files(WD,
                                            cmd,
                                            walltime="250:00:00",
                                            suffix="_cv_largerange",
                                            freecores=2)
Example #14
    #     count += 1
    # for i in negative_sample_index:
    #     data_set[count][0] = whole_negative_index[i][0]
    #     data_set[count][1] = whole_negative_index[i][1]
    #     data_set[count][2] = 0
    #     count += 1
    data_set = np.load('dataset/' + opts.r + '_train.npy')
    if opts.t == 'unique':
        pass

    else:
        test_auc_fold = []
        test_aupr_fold = []
        rs = np.random.randint(0, 1000, 1)[0]
        kf = StratifiedKFold(data_set[:, 2],
                             n_folds=10,
                             shuffle=True,
                             random_state=rs)

        for train_index, test_index in kf:
            DTItrain, DTItest = data_set[train_index], data_set[test_index]
            DTItrain, DTIvalid = train_test_split(DTItrain,
                                                  test_size=0.05,
                                                  random_state=rs)

            v_auc, v_aupr, t_auc, t_aupr = train_and_evaluate(
                DTItrain=DTItrain,
                DTIvalid=DTIvalid,
                DTItest=DTItest,
                graph=graph,
                num_steps=2000)
            test_auc_fold.append(t_auc)
Example #15
predict_data = pd.read_csv("../data/predict_data.csv")

########## tune the para ##########
# learning_rate
lambdas = [0.0001, 0.001, 0.01, 0.1, 1]
# n_estimators
ntree_list = [50, 100, 250, 500]
# max_depth
depth = [10, 25, 50]

param_grid = dict(learning_rate=lambdas,
                  n_estimators=ntree_list,
                  max_depth=depth)
# param_grid = dict(learning_rate  = lambdas, n_estimators = ntree_list, max_depth = depth)
train_data, train_label = get_lb_ft(train1, "Y_midprice")
cv = StratifiedKFold(labels, n_folds=3, random_state=20151204, shuffle=True)
grid = GridSearchCV(GradientBoostingClassifier(),
                    param_grid=param_grid,
                    cv=cv,
                    n_jobs=-1)
grid.fit(train_data, train_label)

print(grid.grid_scores_)
print("The best parameters are %s with a score of %0.4f" %
      (grid.best_params_, grid.best_score_))

# fit the best model
clf = GradientBoostingClassifier(
    learning_rate=grid.best_params_['learning_rate'],
    n_estimators=grid.best_params_['n_estimators'],
    max_depth=grid.best_params_['max_depth'])
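
Since GridSearchCV refits the winning configuration on the full training data when refit=True (the default), an already-fitted copy of the best model is also available directly, instead of rebuilding it from best_params_ as above:

# fitted GradientBoostingClassifier with the best learning_rate / n_estimators / max_depth
clf = grid.best_estimator_
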
Example #16
    ###### transform all the categorical variables with one hot
    ###### transformation and standardize the numerical variable
    ###### transform the target variable
    ##############################################################

    df = pd.read_csv('../data/census-income.data', header=None)
    X_ = dataProcess(df, catList, numList)
    le = LabelEncoder()
    y_ = le.fit_transform(df[41].values)

    ##############################################################
    ###### play with stratified K fold
    ##############################################################

    skf = StratifiedKFold(y_,
                          n_folds=5,
                          shuffle=True,
                          random_state=np.random.seed(10))

    for train_index_s, test_index_s in skf:

        print "length(train_index_s): ", len(train_index_s)
        print "Counter(train_index_s): ", Counter(y_[train_index_s])

        raw_input("press return")

    ##############################################################
    ###### re-balanced the data
    ##############################################################

    # new_train_index = dataBalance(y_,0.01)
    # X = X_[new_train_index,:]
def train_predict(train_file,
                  test_file,
                  predict_valid_file,
                  predict_test_file,
                  n_iter=100,
                  hidden=4,
                  lrate=.1,
                  n_fold=5):

    _, y_val = load_svmlight_file(train_file)

    cv = StratifiedKFold(y_val,
                         n_folds=n_fold,
                         shuffle=True,
                         random_state=2015)

    logging.info('Cross validation...')
    p_val = np.zeros_like(y_val)
    lloss = 0.
    for i_trn, i_val in cv:
        clf = NN(n=10000, h=hidden, a=lrate, seed=2015)

        logging.info('Epoch\tTrain\tValid')
        logging.info('=========================')
        for i_iter in range(n_iter):
            lloss_trn = 0.
            cnt_trn = 0
            for i, (x, y) in enumerate(clf.read_sparse(train_file)):
                if i in i_val:
                    p_val[i] = clf.predict(x)
                else:
                    p = clf.predict(x)
                    clf.update(x, p - y)
                    lloss_trn += logloss(y, p)
                    cnt_trn += 1

            lloss_trn /= cnt_trn
            lloss_val = log_loss(y_val[i_val], p_val[i_val])

            if (i_iter == 0) or ((i_iter + 1) % int(n_iter / 10)
                                 == 0) or (i_iter == n_iter - 1):
                logging.info('#{:4d}\t{:.4f}\t{:.4f}'.format(
                    i_iter + 1, lloss_trn, lloss_val))

        lloss += lloss_val

    logging.info('Log Loss = {:.4f}'.format(lloss / n_fold))

    logging.info('Retraining with 100% data...')
    clf = NN(n=10000, h=hidden, a=lrate, seed=2015)
    for i_iter in range(n_iter):
        for x, y in clf.read_sparse(train_file):
            p = clf.predict(x)
            clf.update(x, p - y)

        logging.info('#{:4d}'.format(i_iter + 1))

    _, y_tst = load_svmlight_file(test_file)
    p_tst = np.zeros_like(y_tst)
    for i, (x, _) in enumerate(clf.read_sparse(test_file)):
        p_tst[i] = clf.predict(x)

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
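
logloss is an external helper imported elsewhere in this project; a minimal sketch of the per-sample binary log loss it presumably computes, with clipping to avoid log(0) (an assumption about the helper, not its actual source):

import numpy as np

def logloss(y, p, eps=1e-15):
    # per-sample binary cross-entropy; p is clipped away from 0 and 1
    p = np.clip(p, eps, 1.0 - eps)
    return -(y * np.log(p) + (1.0 - y) * np.log(1.0 - p))
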
Example #18
    if not os.path.exists(WD):
        os.makedirs(WD)

    os.chdir(WD)

    #############################################################################
    ## Create config file
    y = np.load(INPUT_DATA_y)
    if os.path.exists("config.json"):
        inf = open("config.json", "r")
        old_conf = json.load(inf)
        cv = old_conf["resample"]
        inf.close()
    else:
        cv = [[tr.tolist(), te.tolist()]
              for tr, te in StratifiedKFold(y.ravel(), n_folds=5)]
    if cv[0] is not None:  # Make sure first fold is None
        cv.insert(0, None)
    # parameters grid
    # Re-run with
    tv_range = np.hstack([np.arange(0, 1., .1), [0.05, 0.01, 0.005, 0.001]])
    ratios = np.array([[1., 0., 1], [0., 1., 1], [.5, .5, 1], [.9, .1, 1],
                       [.1, .9, 1], [.01, .99, 1], [.001, .999, 1]])
    alphas = [.01, .05, .1, .5, 1.]
    k_range = [100, 1000, 10000, 100000, -1]
    l1l2tv = [
        np.array([[float(1 - tv), float(1 - tv), tv]]) * ratios
        for tv in tv_range
    ]
    l1l2tv.append(np.array([[0., 0., 1.]]))
    l1l2tv = np.concatenate(l1l2tv)
Example #19
        y = np.array(y_list)

        h5f = h5py.File('/data/MIMIC/Xy_seq' + str(sequence_length) + '.h5',
                        'w')
        h5f.create_dataset('X', data=X)
        h5f.create_dataset('y', data=y)
        h5f.close()

    else:
        h5f = h5py.File('/data/MIMIC/Xy_seq' + str(sequence_length) + '.h5',
                        'r')
        X = h5f['X'][:]
        y = h5f['y'][:]

    print('Train model')
    cv = StratifiedKFold(y, n_folds=5, random_state=123)
    roc_auc = {'lstm': []}

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        for j, (train, test) in enumerate(cv):
            roc_auc['lstm'].append(
                lstm_fit_predict(X[train], y[train], X[test], y[test],
                                 roc_auc))
            print('Cross fold: ', j, roc_auc)
        pkl.dump(
            roc_auc,
            open(
                '/data/MIMIC/lstm_encounter_scores_' + str(sequence_length) +
Example #20
from util import plot_tree

NUM_FOLDS = 10
PROP_VALIDATION = 0.1  #10%

NUM_TREES = 500
NUM_PARAMETERS = 10

# Withhold some proportion of the data set for validation later (stratify)
sss = StratifiedShuffleSplit(target, n_iter=1, test_size=PROP_VALIDATION)
for train_index, test_index in sss:
    X_train, X_test = data.iloc[train_index], data.iloc[test_index]
    y_train, y_test = target[train_index], target[test_index]

# Initialise folds on remaining data and store indexes for use later
kf = StratifiedKFold(y_train, n_folds=NUM_FOLDS)
fold_indexes = []
for train_index, test_index in kf:
    fold_indexes.append({"train_index": train_index, "test_index": test_index})

# Init score storing structures
importance_scores = np.zeros([NUM_FOLDS, len(all_parameters)])
importance_scores_stdev = np.zeros([NUM_FOLDS, len(all_parameters)])
tree_cv_scores = np.zeros(NUM_FOLDS)
parameter_union = np.zeros(len(all_parameters), dtype=int)

# For each fold, build a forest, skim the NUM_PARAMETERS best features as
# measured by the classifier's feature_importance property, then fit and score a
# single decision tree on that feature set.
for n_fold, indexer in enumerate(fold_indexes):
    print "\n[FRST] Constructing Forest#%d" % (n_fold + 1)
Example #21
            count = 0
            for seq in nseqs:
                seq = str(seq)
                for i in range(len(seq) - k + 1):
                    idx = kmers.index(tuple(seq[i:i + k]))
                    kmers_d[count, idx] += 1
                kmers_d[count, :] = np.divide(kmers_d[count, :],
                                              kmers_d[count, :].sum())
                count += 1

            #y = np_utils.to_categorical(y, max(y)+1)
            test_predicted1 = np.zeros((labels.shape[0], ))
            test_predicted2 = np.zeros((labels.shape[0], ))
            i = 0
            #For each one of the 10 fold, fit the classifier and test
            for train_idx, test_idx in StratifiedKFold(y, 10, True):
                i += 1
                print("fold " + str(i))
                #train_idx = get_balanced_classes(train_idx, np.argmax(y, axis=1))
                X_train, X_test = kmers_d[train_idx, :], kmers_d[test_idx, :]
                y_train, y_test = y[train_idx], y[test_idx]
                #Build new network
                #model = build_lstm(4)
                model1 = SVC(kernel='linear', class_weight='balanced', C=10)
                model2 = SVC(kernel='rbf', class_weight='balanced', C=10)
                model1.fit(X_train, y_train)
                model2.fit(X_train, y_train)
                test_predicted1[test_idx] = model1.predict(X_test)
                test_predicted2[test_idx] = model2.predict(X_test)
                #print(accuracy_score(np.argmax(y_test, axis=1), test_predicted[test_idx]))
# just applying it on the test set.

scaler = Scaler()

X = scaler.fit_transform(X)

# For an initial search, a logarithmic grid with basis
# 10 is often helpful. Using a basis of 2, a finer
# tuning can be achieved but at a much higher cost.

C_range = 10. ** np.arange(-3, 8)
gamma_range = 10. ** np.arange(-5, 4)

param_grid = dict(gamma=gamma_range, C=C_range)

grid = GridSearchCV(SVC(), param_grid=param_grid, cv=StratifiedKFold(y=Y, k=5))

grid.fit(X, Y)

print("The best classifier is: ", grid.best_estimator_)

# plot the scores of the grid
# grid_scores_ contains parameter settings and scores
score_dict = grid.grid_scores_

# We extract just the scores
scores = [x[1] for x in score_dict]
scores = np.array(scores).reshape(len(C_range), len(gamma_range))

# Make a nice figure
pl.figure(figsize=(8, 6))
Example #23
def main(argv):
    # Define command line options
    p = optparse.OptionParser(description='Pyxit',
                              prog='PyXit (PYthon piXiT)',
                              version='PyXit 0.1')

    p.add_option('--dir_ls',
                 type="string",
                 dest="dir_ls",
                 help="The learning set directory")
    p.add_option('--dir_ts',
                 type="string",
                 dest="dir_ts",
                 help="The training set directory")

    p.add_option('--cv_k_folds',
                 type="int",
                 dest="cv_k_folds",
                 help="The number of folds")
    p.add_option(
        '--cv_shuffle',
        default=False,
        action="store_true",
        dest="cv_shuffle",
        help="Whether cross-validation is performed using ShuffleSplit.")
    p.add_option('--cv_shuffle_test_fraction',
                 default=0.1,
                 type="float",
                 dest="cv_shuffle_test_fraction",
                 help="The proportion of data in shuffled test splits.")

    p.add_option('--pyxit_n_subwindows',
                 default=10,
                 type="int",
                 dest="pyxit_n_subwindows",
                 help="number of subwindows")
    p.add_option('--pyxit_min_size',
                 default=0.5,
                 type="float",
                 dest="pyxit_min_size",
                 help="min size")
    p.add_option('--pyxit_max_size',
                 default=1.0,
                 type="float",
                 dest="pyxit_max_size",
                 help="max size")
    p.add_option('--pyxit_target_width',
                 default=16,
                 type="int",
                 dest="pyxit_target_width",
                 help="target width")
    p.add_option('--pyxit_target_height',
                 default=16,
                 type="int",
                 dest="pyxit_target_height",
                 help="target height")
    p.add_option('--pyxit_interpolation',
                 default=2,
                 type="int",
                 dest="pyxit_interpolation",
                 help="interpolation method 1,2,3,4")
    p.add_option('--pyxit_transpose',
                 default=False,
                 action="store_true",
                 dest="pyxit_transpose",
                 help="transpose subwindows")
    p.add_option('--pyxit_colorspace',
                 default=2,
                 type="int",
                 dest="pyxit_colorspace",
                 help="colorspace 0=RGB, 1=TRGB, 2=HSV")
    p.add_option('--pyxit_fixed_size',
                 default=False,
                 action="store_true",
                 dest="pyxit_fixed_size",
                 help="extract fixed size subwindows")
    p.add_option('--pyxit_n_jobs',
                 default=1,
                 type="int",
                 dest="pyxit_n_jobs",
                 help="number of jobs")
    p.add_option('--pyxit_save_to',
                 type="string",
                 dest="pyxit_save_to",
                 help="file to save the model into")

    p.add_option('--forest_n_estimators',
                 default=10,
                 type="int",
                 dest="forest_n_estimators",
                 help="number of base estimators (T)")
    p.add_option('--forest_max_features',
                 default=1,
                 type="int",
                 dest="forest_max_features",
                 help="max features at test node (k)")
    p.add_option('--forest_min_samples_split',
                 default=1,
                 type="int",
                 dest="forest_min_samples_split",
                 help="minimum node sample size (nmin)")
    p.add_option('--forest_shared_mem',
                 default=False,
                 action="store_true",
                 dest="forest_shared_mem",
                 help="shared mem")

    p.add_option(
        '--svm',
        default=0,
        dest="svm",
        help=
        "final svm classifier: 0=nosvm, 1=libsvm, 2=liblinear, 3=lr-l1, 4=lr-l2",
        type="int")
    p.add_option('--svm_c',
                 default=1.0,
                 type="float",
                 dest="svm_c",
                 help="svm C")

    p.add_option('--quiet',
                 action="store_false",
                 default=True,
                 dest="verbose",
                 help="Turn off verbose mode")
    p.add_option('--verbose',
                 action="store_true",
                 default=True,
                 dest="verbose",
                 help="Turn on verbose mode")

    options, arguments = p.parse_args(args=argv)

    # Check for errors in the options
    e = None

    if not options.dir_ls:
        e = "--dir_ls needs to be set."

    elif options.dir_ts and options.cv_k_folds:
        e = "--dir_ts and --cv_k_folds cannot be set at the same time."

    elif options.pyxit_save_to and options.cv_k_folds:
        e = "--pyxit_save_to and --cv_k_folds cannot be set at the time."

    if e:
        print "Error: %s" % e
        print "Run with -h option for help."
        sys.exit(1)

    if options.verbose:
        print "[pyxit.main] Options = ", options

    # Load data
    if options.verbose:
        print "[pyxit.main] Loading data..."

    X, y = build_from_dir(options.dir_ls)

    classes = np.unique(y)
    n_classes = len(classes)
    y_original = y
    y = np.searchsorted(classes, y)

    # Instantiate classifiers
    if options.verbose:
        print "[pyxit.main] Initializing PyxitClassifier..."

    forest = ExtraTreesClassifier(
        n_estimators=options.forest_n_estimators,
        max_features=options.forest_max_features,
        min_samples_split=options.forest_min_samples_split,
        n_jobs=options.pyxit_n_jobs,
        verbose=options.verbose)

    pyxit = PyxitClassifier(base_estimator=forest,
                            n_subwindows=options.pyxit_n_subwindows,
                            min_size=options.pyxit_min_size,
                            max_size=options.pyxit_max_size,
                            target_width=options.pyxit_target_width,
                            target_height=options.pyxit_target_height,
                            interpolation=options.pyxit_interpolation,
                            transpose=options.pyxit_transpose,
                            colorspace=options.pyxit_colorspace,
                            fixed_size=options.pyxit_fixed_size,
                            n_jobs=options.pyxit_n_jobs,
                            verbose=options.verbose)

    if options.svm:
        if options.svm == SVM_LIBSVM:
            svm = SVC(probability=True, C=options.svm_c, kernel="linear")
        if options.svm == SVM_LIBLINEAR:
            svm = LinearSVC(C=options.svm_c)
        if options.svm == SVM_LRL1:
            svm = LogisticRegression(penalty="l1", C=options.svm_c)
        if options.svm == SVM_LRL2:
            svm = LogisticRegression(penalty="l2", C=options.svm_c)
        if options.svm == ET:
            svm = ExtraTreesClassifier(
                n_estimators=1000,
                max_features="sqrt",
                #max_features=1000,
                min_samples_split=2,
                n_jobs=options.pyxit_n_jobs,
                verbose=options.verbose)
        if options.svm == RF:
            svm = RandomForestClassifier(
                n_estimators=1000,
                #max_features=1000,
                max_features="sqrt",
                min_samples_split=2,
                n_jobs=options.pyxit_n_jobs,
                verbose=options.verbose)

        if options.svm == NN:
            svm = neighbors.KNeighborsClassifier(10)

    if options.verbose:
        print "[pyxit.main] PyxitClassifier ="
        print pyxit

        if options.svm:
            print "[pyxit.main] SVM ="
            print svm

    # Build and evaluate
    if options.dir_ls and not options.dir_ts and not options.cv_k_folds:
        if options.pyxit_save_to:
            fd = open(options.pyxit_save_to, "wb")
            pickle.dump(classes, fd, protocol=pickle.HIGHEST_PROTOCOL)

        if options.verbose:
            print "[pyxit.main] Fitting PyxitClassifier on %s" % options.dir_ls

        _X, _y = pyxit.extract_subwindows(X, y)
        pyxit.fit(X, y, _X=_X, _y=_y)

        if options.verbose:
            print "[pyxit.main] Saving PyxitClassifier into %s" % options.pyxit_save_to

        if options.pyxit_save_to:
            pickle.dump(pyxit, fd, protocol=pickle.HIGHEST_PROTOCOL)

        if options.svm:
            Xt = pyxit.transform(X, _X=_X, reset=True)

            if options.verbose:
                print "[pyxit.main] Fitting SVC on %s" % options.dir_ls

            svm.fit(Xt, y)

            if options.verbose:
                print "[pyxit.main] Saving SVM into %s" % options.pyxit_save_to

            if options.pyxit_save_to:
                pickle.dump(svm, fd, protocol=pickle.HIGHEST_PROTOCOL)

        if options.pyxit_save_to:
            fd.close()

    elif options.dir_ts:
        if options.pyxit_save_to:
            fd = open(options.pyxit_save_to, "wb")
            pickle.dump(classes, fd, protocol=pickle.HIGHEST_PROTOCOL)

        if options.verbose:
            print "[pyxit.main] Fitting PyxitClassifier on %s" % options.dir_ls

        _X, _y = pyxit.extract_subwindows(X, y)
        pyxit.fit(X, y, _X=_X, _y=_y)

        if options.pyxit_save_to:
            pickle.dump(pyxit, fd, protocol=pickle.HIGHEST_PROTOCOL)

        if options.svm:
            Xt = pyxit.transform(X, _X=_X, reset=True)

            if options.verbose:
                print "[pyxit.main] Fitting SVC on %s" % options.dir_ls

            svm.fit(Xt, y)

            if options.pyxit_save_to:
                pickle.dump(svm, fd, protocol=pickle.HIGHEST_PROTOCOL)

        if options.pyxit_save_to:
            fd.close()

        if options.verbose:
            print "[pyxit.main] Testing on %s" % options.dir_ts

        X_test, y_test = build_from_dir(options.dir_ts)
        y_test = np.searchsorted(classes, y_test)
        _X_test, _y_test = pyxit.extract_subwindows(X_test, y_test)
        y_true = y_test
        all_tested = np.ones(len(y_true), dtype=np.bool)

        if not options.svm:
            y_predict = pyxit.predict(X_test, _X=_X_test)
            y_proba = pyxit.predict_proba(X_test, _X=_X_test)

        else:
            Xt = pyxit.transform(X_test, _X=_X_test)
            y_predict = svm.predict(Xt)
            if options.svm != SVM_LIBLINEAR:
                y_proba = svm.predict_proba(Xt)

    elif options.cv_k_folds:
        if options.verbose:
            print "[pyxit.main] K-Fold cross-validation (K=%d)" % options.cv_k_folds

        _X, _y = pyxit.extract_subwindows(X, y)

        i = 1
        step = 100. / options.cv_k_folds

        y_true = y
        y_predict = np.empty(y_true.shape, dtype=y.dtype)
        y_proba = np.empty((y_true.shape[0], n_classes))
        all_tested = np.zeros(len(y_true), dtype=np.bool)

        cm = np.zeros((n_classes, n_classes), dtype=np.int32)

        if not options.cv_shuffle:
            cv = StratifiedKFold(y_true, options.cv_k_folds)
        else:
            cv = ShuffleSplit(len(X),
                              n_iter=options.cv_k_folds,
                              test_size=options.cv_shuffle_test_fraction)

        for train, test in cv:
            all_tested[test] = True
            _train = pyxit.extend_mask(train)
            _test = pyxit.extend_mask(test)

            if options.verbose:
                print "[pyxit.main] Fitting PyxitClassifier on fold %d" % i

            pyxit.fit(X[train], y[train], _X=_X[_train], _y=_y[_train])

            if options.svm:
                Xt = pyxit.transform(X[train], _X=_X[_train], reset=True)

                if options.verbose:
                    print "[pyxit.main] Fitting SVC on fold %d" % i

                svm.fit(Xt, y[train])

            if options.verbose:
                print "[pyxit.main] Testing on fold %d" % i

            if not options.svm:
                y_predict[test] = pyxit.predict(X[test], _X=_X[_test])
                y_proba[test] = pyxit.predict_proba(X[test], _X=_X[_test])

            else:
                Xt = pyxit.transform(X[test], _X=_X[_test])
                y_predict[test] = np.asarray(svm.predict(Xt), dtype=y.dtype)

                if hasattr(svm, "predict_proba"):
                    y_proba[test] = svm.predict_proba(Xt)
                print svm

            if options.verbose:
                print "[pyxit.main] Classification error on fold %d = %f" % (
                    i, 1.0 * np.sum(y_true[test] != y_predict[test]) /
                    len(y_true[test]))
                print "[pyxit.main] Cumulated confusion matrix ="
                cm += confusion_matrix(y_true[test], y_predict[test])
                print_cm(cm, classes)

            i += 1

    # Output some results
    if "all_tested" in locals():
        if options.verbose:
            print "---"
            print "[pyxit.main] Test coverage =", sum(all_tested) / (
                1.0 * len(all_tested))
            print "[pyxit.main] Overall classification error = %f" % (
                1.0 * np.sum(y_true[all_tested] != y_predict[all_tested]) /
                len(y_true[all_tested]))
            print "[pyxit.main] Overall confusion matrix ="
            print_cm(
                confusion_matrix(y_true[all_tested], y_predict[all_tested]),
                classes)

        #y_true = classes.take(y_true[all_tested], axis=0)
        y_predict = classes.take(y_predict[all_tested], axis=0)
        y_proba = np.max(y_proba, axis=1)
        d = {}
        for i in xrange(len(X)):
            d[X[i]] = (int(y_predict[i]), y_proba[i])
        return d
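A note on the API used above: this example (like most on this page) relies on the old sklearn.cross_validation signatures, where StratifiedKFold(y, k) and ShuffleSplit(n, n_iter=...) are built from the labels and iterated over directly. Below is a minimal, self-contained sketch of the same out-of-fold prediction loop written against the current sklearn.model_selection API; the toy data, the ExtraTreesClassifier standing in for PyxitClassifier, and all parameter values are illustrative assumptions, not part of the original example.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold

# toy data standing in for the image features / labels of the example above
X_demo, y_demo = make_classification(n_samples=300, n_classes=3,
                                     n_informative=5, random_state=0)
n_classes_demo = len(np.unique(y_demo))

clf = ExtraTreesClassifier(n_estimators=50, random_state=0)
y_oof = np.empty_like(y_demo)
proba_oof = np.empty((len(y_demo), n_classes_demo))
cm = np.zeros((n_classes_demo, n_classes_demo), dtype=np.int64)

# modern API: the splitter is configured with n_splits, and .split(X, y)
# yields the (train, test) index pairs
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for fold, (train, test) in enumerate(cv.split(X_demo, y_demo), start=1):
    clf.fit(X_demo[train], y_demo[train])
    y_oof[test] = clf.predict(X_demo[test])
    proba_oof[test] = clf.predict_proba(X_demo[test])
    cm += confusion_matrix(y_demo[test], y_oof[test],
                           labels=np.arange(n_classes_demo))
    print("fold %d error = %.4f" % (fold, np.mean(y_demo[test] != y_oof[test])))

print("overall error = %.4f" % np.mean(y_demo != y_oof))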
Example #24
0
def xgbLocalCVModel(taskName, config):
    params = config['params']
    config['task'] = taskName
    trainFeature, testFeature, trainLabel, trainUid, testUid = readFeature(
        config)
    if taskName == 'gender':
        rounds = config['roundsGender']
    elif taskName == 'age':
        rounds = config['roundsAge']
    else:
        rounds = config['roundsEdu']

    if config['multiClass'] == True:
        params['num_class'] = len(np.unique(trainLabel))
        print params['num_class']
    else:
        params['scale_pos_weight'] = (float)(len(
            trainLabel[trainLabel == 0])) / len(trainLabel[trainLabel == 1])
        print params['scale_pos_weight']

    if config['prob'] == True:
        params['objective'] = 'multi:softprob'
    else:
        params['objective'] = 'multi:softmax'

    print 'CV On XGB Model....'
    kfold = StratifiedKFold(y=trainLabel,
                            n_folds=config['folds'],
                            shuffle=True,
                            random_state=params['seed'])
    f = 0
    predict = []
    true = []
    uid = []
    for index1, index2 in kfold:
        print 'fold:' + str(f)
        print index1, index2
        localTrainFeature = trainFeature[index1, :]
        localTestFeature = trainFeature[index2, :]
        localTrainLabel = trainLabel[index1]
        localTestLabel = trainLabel[index2]
        localTestUid = trainUid[index2]
        uid = np.append(uid, localTestUid)
        print 'Build, Train and Predict XGB Model.....'
        #print localTrainFeature.shape[1]
        localPredict = xgbLocalModel(localTrainFeature, localTestFeature,
                                     localTrainLabel, localTestLabel, params,
                                     config, rounds)

        if config['prob'] == True:
            if f == 0:
                predict = localPredict
            else:
                predict = np.concatenate((predict, localPredict), axis=0)
        else:
            print error(localTestLabel, localPredict)
            predict = np.append(predict, localPredict)
            true = np.append(true, localTestLabel)
        f += 1
    if config['prob'] == True:
        return predict, uid
    else:
        print "Total error" + str(error(true, predict))
        return predict, uid
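The helper xgbLocalModel called inside the fold loop is not part of this example. A hypothetical stand-in using xgboost's native API, matching the way the arguments are passed above, could look like the following; the function name, the watchlist, and the absence of early stopping are assumptions, not the project's actual helper.

import xgboost as xgb

def xgbLocalModelSketch(train_X, test_X, train_y, test_y, params, config, rounds):
    # hypothetical stand-in for the xgbLocalModel helper used above
    dtrain = xgb.DMatrix(train_X, label=train_y)
    dtest = xgb.DMatrix(test_X, label=test_y)
    watchlist = [(dtrain, 'train'), (dtest, 'eval')]
    model = xgb.train(params, dtrain, num_boost_round=rounds,
                      evals=watchlist, verbose_eval=False)
    # 'multi:softprob' returns an (n_samples, num_class) probability matrix,
    # 'multi:softmax' returns hard class labels
    return model.predict(dtest)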
Example #25
0
train_file = opts['--train']
test_file = opts['--test']
pred_file = opts['--pred']
epoch = int(opts['--epoch'])
cv = int(opts['--cv'])
nfolds = int(opts['--folds'])
target_col = 'target'

if cv == 0:
    nfolds = 2

X, y, y_coded, ids_train, scaler = load_train_data(train_file)
X_test, ids_test = load_test_data(test_file, scaler)
num_classes = len(y[0])
num_features = X.shape[1]
skf = StratifiedKFold(y_coded, nfolds, random_state=2015)
ids_train_folds = np.empty(0)
for train_index, valid_index in skf:
    ids_train_folds = np.append(ids_train_folds, ids_train[valid_index])

#train = train.reindex(np.random.permutation(train.index))

param = {}
param['objective'] = 'binary:logistic'
param['eta'] = 0.1
param['booster'] = 'gblinear'
param['max_depth'] = 12
param['eval_metric'] = 'logloss'
param['silent'] = 1
param['nthread'] = 6
param['min_child_weight'] = 1
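The snippet stops before any model is trained. A small sketch (assumed, not from the original script) of how a param dict like this is handed to xgboost's native training API, reusing the `param` defined above on toy binary data:

import numpy as np
import xgboost as xgb

# toy binary data, only to illustrate consuming the param dict above
rng = np.random.RandomState(0)
X_demo = rng.rand(200, 10)
y_demo = (rng.rand(200) > 0.5).astype(int)

dtrain = xgb.DMatrix(X_demo, label=y_demo)
bst = xgb.train(param, dtrain, num_boost_round=50,
                evals=[(dtrain, 'train')], verbose_eval=10)
pred = bst.predict(dtrain)  # probabilities, since objective is binary:logistic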
Example #26
0
def stackFrame(data, config, clf_List):
    # -- get train/test features and train labels
    trainFeature = data['trainFeature']
    testFeature = data['testFeature']

    # -- get stack param from config
    cvfolds = config['folds']
    # -- stack train and test
    for j, clf in enumerate(clf_List):
        modelName = config['modelName'][j]
        LogInfo("Model-" + modelName)
        for labelIndex in range(3):
            labelName = 'trainLabel' + str(labelIndex + 1)
            LogInfo(labelName)
            trainLabel = data[labelName]
            skf = list(StratifiedKFold(trainLabel, cvfolds))
            config['task'] = config['taskList'][labelIndex]
            # -- define the stack model result
            blend_train = np.zeros(
                (trainFeature.shape[0], len(np.unique(trainLabel))))
            blend_test = np.zeros(
                (testFeature.shape[0], len(np.unique(trainLabel))))
            for i, (trainIndex, testIndex) in enumerate(skf):
                LogInfo("Fold-" + str(i))
                X_train = trainFeature[trainIndex]
                y_train = trainLabel[trainIndex]
                X_test = trainFeature[testIndex]
                y_test = trainLabel[testIndex]

                if clf == 'xgb':
                    y_pred, test_pred = xgbStackModel(X_train, X_test, y_train,
                                                      y_test, testFeature,
                                                      config)
                    blend_test += test_pred
                else:
                    clf.fit(X_train, y_train)
                    y_pred = clf.predict_proba(X_test)
                    y_pred_val = clf.predict(X_test)
                    test_pred = clf.predict_proba(testFeature)
                    blend_test += test_pred
                    evalerror(y_pred_val, y_test)

                blend_train[testIndex, :] = y_pred

            blend_test = blend_test / cvfolds

            if labelIndex == 0:
                train = blend_train
                test = blend_test
            else:
                train = np.concatenate([train, blend_train], axis=1)
                test = np.concatenate([test, blend_test], axis=1)

        train = pd.DataFrame(train,
                             columns=getColName(train.shape[1], modelName))
        test = pd.DataFrame(test, columns=getColName(test.shape[1], modelName))

        train.to_csv('../feature/stack-my/' + modelName + '_train_prob.csv',
                     index=False)
        test.to_csv('../feature/stack-my/' + modelName + '_test_prob.csv',
                    index=False)
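LogInfo and getColName are helpers from the surrounding project and are not shown in this example. Minimal hypothetical stand-ins with the behaviour the calls above appear to assume:

from datetime import datetime

def LogInfo(msg):
    # simple timestamped logging stand-in
    print('[%s] %s' % (datetime.now().strftime('%H:%M:%S'), msg))

def getColName(nCols, modelName):
    # e.g. getColName(3, 'xgb') -> ['xgb_0', 'xgb_1', 'xgb_2']
    return ['%s_%d' % (modelName, i) for i in range(nCols)]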
Example #27
0
                dtype=np.float32,
                usecols=np.concatenate([[
                    0
                ], important_indices[important_indices >= n_date_features] +
                                        1 - 1156])).values
],
                   axis=1)
y = pd.read_csv("../input/train_numeric.csv",
                index_col=0,
                dtype=np.float32,
                usecols=[0, 969]).values.ravel()

# In[ ]:

clf = XGBClassifier(max_depth=5, base_score=0.005)
cv = StratifiedKFold(y, n_folds=3)
preds = np.ones(y.shape[0])
for i, (train, test) in enumerate(cv):
    preds[test] = clf.fit(X[train], y[train]).predict_proba(X[test])[:, 1]
    print("fold {}, ROC AUC: {:.3f}".format(
        i, roc_auc_score(y[test], preds[test])))
print(roc_auc_score(y, preds))

# In[ ]:

# pick the best threshold out-of-fold
thresholds = np.linspace(0.01, 0.99, 50)
mcc = np.array([matthews_corrcoef(y, preds > thr) for thr in thresholds])
plt.plot(thresholds, mcc)
best_threshold = thresholds[mcc.argmax()]
print(mcc.max())
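As a small follow-on (not in the original notebook), the out-of-fold probabilities can be binarised at the threshold chosen above and the resulting MCC reported without the plot:

from sklearn.metrics import matthews_corrcoef

y_hat = (preds > best_threshold).astype(int)
print("threshold = %.3f  positives = %d  MCC = %.4f"
      % (best_threshold, y_hat.sum(), matthews_corrcoef(y, y_hat)))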
Example #28
0

test.drop(labels = ["v22",'v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v79','v81','v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1, inplace = True)


train = np.asarray(train, dtype=np.float32)        
labels = labels.ravel()



X = train
y = labels
X_submission = test
n_folds = 2

skf = list(StratifiedKFold(y, n_folds))


# BLEND 1
clfs = [
        RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
        RandomForestClassifier(n_estimators=80, max_features = "auto" ,min_samples_split = 30 , n_jobs=-1, criterion='entropy'),
        RandomForestClassifier(n_estimators=150, max_features = 80 ,min_samples_split = 50 , n_jobs=-1, criterion='entropy'),
        RandomForestClassifier(n_estimators=50, max_features = "auto" ,min_samples_split = 70 , n_jobs=-1, criterion='entropy'),
        ExtraTreesClassifier(n_estimators=120, n_jobs=-1, max_depth=50 , max_features = 60 ,  min_samples_leaf=40 , criterion='gini'),
        ExtraTreesClassifier(n_estimators=150, n_jobs=-1, max_depth=100 , max_features = 80 ,  min_samples_leaf=40 , criterion='entropy'),
        ExtraTreesClassifier(n_estimators=100, n_jobs=-1, max_depth=100 , max_features = 30 ,  min_samples_leaf=30 , criterion='entropy'),
        ExtraTreesClassifier(n_estimators=150, n_jobs=-1, max_depth=120 , max_features = "auto" ,  min_samples_leaf=20 , criterion='entropy')
        ]

print "Creating train and test sets for blending."
Example #29
0
testing_reduced.shape

#Hyperparameters tuning
run_gs = False

if run_gs:
    parameter_grid = {
        'max_depth': [4, 6, 8],
        'n_estimators': [50, 10],
        'max_features': ['sqrt', 'auto', 'log2'],
        'min_samples_split': [1, 3, 10],
        'min_samples_leaf': [1, 3, 10],
        'bootstrap': [True, False],
    }
    forest = RandomForestClassifier()
    cross_validation = StratifiedKFold(targets, n_folds=5)

    grid_search = GridSearchCV(forest,
                               scoring='accuracy',
                               param_grid=parameter_grid,
                               cv=cross_validation)

    grid_search.fit(training, targets)
    model = grid_search
    parameters = grid_search.best_params_

    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))
else:
    parameters = {
        'bootstrap': False,

gbc = GradientBoostingClassifier(n_estimators=200)
evaluate_model(gbc)

'''
0.971107544141252
0.8097014925373134
[0.65921788 0.82681564 0.81460674 0.8258427  0.85310734]
0.797066906117377
'''

single_model = RandomForestClassifier(n_estimators=100)
# model = GradientBoostingClassifier()
# model = LogisticRegression(C=0.5, penalty='l2', tol=1e-9)
rfecv = RFECV(estimator=single_model, step=1, cv=StratifiedKFold(train_y, 2), scoring='accuracy')
evaluate_model(rfecv)

'''
    0.9983948635634029
    0.7910447761194029
    [0.7877095  0.79888268 0.84269663 0.80898876 0.83050847]
    0.8137572093211297
'''


from sklearn.ensemble import VotingClassifier

voc = VotingClassifier([('lr', lr), ('rf', rfc), ('gbc', gbc)], voting='hard')
evaluate_model(voc)
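A small variant worth noting: with voting='hard' the ensemble counts class votes, while soft voting averages the members' predicted probabilities, which requires every estimator to implement predict_proba. A sketch, assuming the same lr, rfc and gbc estimators and evaluate_model helper as above:

voc_soft = VotingClassifier([('lr', lr), ('rf', rfc), ('gbc', gbc)], voting='soft')
evaluate_model(voc_soft)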