Example 1
    #     tile = np.moveaxis(tile, -1, 0)
    #     tile = np.expand_dims(tile, axis=0)
    #
    #     # Scale to [0, 1]
    #     tile = tile / 255
    #
    #     # Embed tile
    #     tile = torch.from_numpy(tile).float()
    #     tile = Variable(tile)
    #
    #     if cuda: tile = tile.cuda()
    #     z = tilenet.encode(tile)
    #     if cuda: z = z.cpu()
    #     z = z.data.numpy()
    #
    #     X[idx,:] = z
    # t1 = time()
    # print('Embedded {} tiles: {:0.3f}s'.format(config.n_tiles, t1-t0))
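
    # The commented-out block above predates modern PyTorch: torch.autograd.Variable
    # is deprecated. A minimal sketch of the same embedding step for one tile,
    # assuming a `tilenet` model with an .encode() method and an HxWxC uint8 array
    # (numpy and torch are assumed to be imported in the full script):
    def embed_tile(tile, tilenet, cuda=False):
        tile = np.moveaxis(tile, -1, 0)        # HWC -> CHW
        tile = np.expand_dims(tile, axis=0)    # add a batch dimension
        tile = torch.from_numpy(tile / 255.0).float()  # scale to [0, 1]
        if cuda:
            tile = tile.cuda()
        with torch.no_grad():                  # inference only; replaces Variable
            z = tilenet.encode(tile)
        return z.cpu().numpy()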

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split

    # Split the data and train an RF classifier
    X_trn, X_val, y_trn, y_val = train_test_split(X, y, test_size=0.2,
                                                  random_state=1)
    rf = RandomForestClassifier(n_estimators=1000,
                                max_depth=10,
                                max_features='sqrt',
                                random_state=1)
    rf.fit(X_trn, y_trn)
    print("AND EVAL", rf.eval(X_val, y_val))
Example 2
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn import tree
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from torch import optim
from xgboost import XGBClassifier

import lib  # project-local module providing clear_data, FFN and DataLoader

# `args` is assumed to be parsed (e.g. via argparse) elsewhere in the script


def main():
    print("Loading train data from {}".format(args.raw_data))
    df = pd.read_csv(args.raw_data)
    df_train_input_sc, df_train_target, df_test_input_sc, df_test_target = lib.clear_data(df, args)

    if args.algo == 'decisiontree':
        # Tuned for all features:      min_samples_leaf: 0.05, min_samples_split: 10, class_weight: None,
        #                              splitter: best, max_features: 10, criterion: entropy, max_depth: 7
        # Tuned for discrete features: min_samples_leaf: 0.05, min_samples_split: 3, class_weight: None,
        #                              splitter: best, max_features: 8, criterion: entropy, max_depth: 6

        model = tree.DecisionTreeClassifier(min_samples_leaf=0.05,
                                            min_samples_split=3,
                                            class_weight=None,
                                            splitter="best",
                                            max_features=8,
                                            criterion="entropy",
                                            max_depth=6)
        model.fit(df_train_input_sc, df_train_target)
        y_pred = model.predict(df_test_input_sc)

    elif args.algo == 'randomforest':
        # Tuned for all features:      random_state: 42, n_estimators: 1000, criterion: gini, max_depth: 7,
        #                              bootstrap: True, max_features: 5, min_samples_leaf: 7, min_samples_split: 7
        # Tuned for discrete features: random_state: 42, n_estimators: 100, criterion: gini, max_depth: 7,
        #                              bootstrap: True, max_features: 5, min_samples_leaf: 7, min_samples_split: 7

        model = RandomForestClassifier(random_state=42,  # params for the all-features setup
                                       n_estimators=1000,
                                       criterion="gini",
                                       max_depth=7,
                                       bootstrap=True,
                                       max_features=5,
                                       min_samples_leaf=7,
                                       min_samples_split=7)

        model.fit(df_train_input_sc, df_train_target)
        y_pred = model.predict(df_test_input_sc)

    elif args.algo == 'logisticregression':
        # Tuned for all features:      penalty: l1, random_state: 42, C: 0.05, tol: 0.01, intercept_scaling: 3,
        #                              fit_intercept: True, max_iter: 10
        # Tuned for discrete features: penalty: l2, random_state: 42, C: 0.05, tol: 0.1, intercept_scaling: 1,
        #                              fit_intercept: True, max_iter: 10

        model = LogisticRegression(penalty="l1",
                                   random_state=42,
                                   C=.05,
                                   tol=0.01,
                                   intercept_scaling=3,
                                   fit_intercept=True,
                                   max_iter=10)

        model.fit(df_train_input_sc, df_train_target)
        y_pred = model.predict(df_test_input_sc)

    elif args.algo == 'ADA':
        model = AdaBoostClassifier()
        model.fit(df_train_input_sc, df_train_target)
        y_pred = model.predict(df_test_input_sc)

    elif args.algo == 'XGB':
        model = XGBClassifier()
        model.fit(df_train_input_sc, df_train_target)
        y_pred = model.predict(df_test_input_sc)

    elif args.algo == 'FFN':
        model = lib.FFN(df_train_input_sc.shape[1], args.output_dim, args.num_classes)
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
        dataloader = lib.DataLoader(df_train_input_sc, df_train_target, args.batchsize)

        # training
        model.train()
        for epoch in range(args.num_epochs):
            sum_loss = 0
            cnt = 0
            for it, (input_data, target_data) in enumerate(dataloader):
                cnt += 1
                input_data = torch.Tensor(input_data)
                target_data = torch.LongTensor(target_data)
                optimizer.zero_grad()
                logit = model(input_data)
                # lib.FFN is assumed to output log-probabilities, as F.nll_loss expects
                loss = F.nll_loss(logit, target_data)
                sum_loss += loss.item()
                loss.backward()
                optimizer.step()
            print("Epoch: {} - loss: {}".format(epoch, float(sum_loss) / cnt))

        # testing
        model.eval()
        with torch.no_grad():
            input_data_test = torch.Tensor(df_test_input_sc)
            target_data_test = torch.LongTensor(df_test_target)
            logit = model(input_data_test)
            test_loss = F.nll_loss(logit, target_data_test)
            print("Test loss: {:0.4f}".format(test_loss.item()))
            y_pred = logit.argmax(dim=1)  # predicted class indices

    else:
        raise ValueError("Unknown algo: {}".format(args.algo))

    print(classification_report(df_test_target, y_pred))
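

# `lib.FFN` is project code not shown in this snippet. A minimal sketch of a
# network compatible with the loop above (a constructor taking input_dim,
# hidden_dim, num_classes, and log-probability outputs for F.nll_loss); the
# real class may differ:
import torch.nn as nn


class FFN(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        # log_softmax output pairs with F.nll_loss in the training loop
        return F.log_softmax(self.fc2(torch.relu(self.fc1(x))), dim=1)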