# Load the train and test splits. Both files are read as header-less,
# GB2312-encoded, whitespace-separated tables.
# `sep=r'\s+'` is the documented equivalent of the deprecated
# `delim_whitespace=True` option and parses the files the same way.
trainData = pd.read_table('../dataset/dataset1/train.txt',
                          header=None,
                          encoding='gb2312',
                          sep=r'\s+')
testData = pd.read_table('../dataset/dataset1/test.txt',
                         header=None,
                         encoding='gb2312',
                         sep=r'\s+')
# pop(3) removes column 3 from the frame and returns it as the label vector;
# whatever columns remain become the feature matrix.
trainLabel = np.array(trainData.pop(3))
trainData = np.array(trainData)
testLabel = np.array(testData.pop(3))
testData = np.array(testData)

# Benchmark the project's own decision tree: train, predict, score.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be distorted by system clock adjustments the way a
# time.time() difference can. Only the difference of the two readings is
# meaningful.
time_start1 = time.perf_counter()
clf1 = DecisionTreeClassifier()
clf1.train(trainData, trainLabel)
# The return value of predict() is not used here; the score is obtained from
# the classifier afterwards.
clf1.predict(testData)
# `accuarcy` is the method name as this script calls it on the classifier.
score1 = clf1.accuarcy(testLabel)
time_end1 = time.perf_counter()
print("Accuracy of self-DecisionTree: %f" % score1)
print("Runtime of self-DecisionTree:", time_end1 - time_start1)

# Benchmark the project's own random forest: train, predict, score.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be distorted by system clock adjustments the way a
# time.time() difference can. Only the difference of the two readings is
# meaningful.
time_start = time.perf_counter()
clf = RandomForestClassifier()
clf.train(trainData, trainLabel)
# The return value of predict() is not used here; the score is obtained from
# the classifier afterwards.
clf.predict(testData)
# `accuarcy` is the method name as this script calls it on the classifier.
score = clf.accuarcy(testLabel)
time_end = time.perf_counter()
print("Accuracy of RandomForest: %f" % score)
print("Runtime of RandomForest:", time_end - time_start)
# Splitting data
from sklearn.model_selection import train_test_split

# Target vector: the "label" column. Feature matrix: every other column.
y = data["label"].values
X = data.drop("label", axis=1).values

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
#%%
# My Algorithm
from RandomForest import RandomForestClassifier

# Settings passed to the project's own RandomForestClassifier.
forest_params = {
    "nb_trees": 200,
    "max_depth": 50,
    "n_estimators": 300,
    "max_workers": 5,
}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

# Report the score on the held-out split.
holdout_score = model.score(X_test, y_test)
print(holdout_score)

# Persist the fitted model so it can be reloaded later.
joblib.dump(model, "data/my_random_forest.joblib")

#%%
from sklearn.ensemble import RandomForestClassifier as RFC

# scikit-learn forest fitted on the same training split, using entropy as the
# split criterion and sqrt(n_features) candidate features per split.
rf = RFC(
    n_estimators=300,
    max_depth=50,
    criterion='entropy',
    max_features="sqrt",
    verbose=0,
)
rf.fit(X_train, y_train)
 # Pull two target columns out of the training label rows: index 1 feeds
 # `larangan`, index 2 feeds `informasi`. Indexing the column directly
 # replaces the former inner loop over range(0, 3) that kept only a single
 # value of y, and yields the same lists.
 larangan = [labels[1] for labels in labeltrain]
 informasi = [labels[2] for labels in labeltrain]
 # Each training row drops its first feature column and gets the target of
 # one label appended as its last element.
 # NOTE(review): `anjuran` is built before this excerpt, presumably from
 # label index 0 -- confirm.
 trainAnjuran = [row[1:] + [anjuran[i]] for i, row in enumerate(trainf)]
 trainLarangan = [row[1:] + [larangan[i]] for i, row in enumerate(trainf)]
 trainInformasi = [row[1:] + [informasi[i]] for i, row in enumerate(trainf)]
 # In[57]:
 # ## Build Model & Prediction
 # One forest per target (A, L, I), all created with the same settings.
 modelA, modelL, modelI = (
     RandomForestClassifier(rf_trees=80, rf_samples=1000) for _ in range(3)
 )
 modelA.fit(trainAnjuran)
 modelL.fit(trainLarangan)
 modelI.fit(trainInformasi)

 predictA, predictL, predictI = [], [], []
 forests = (modelA, modelL, modelI)
 collectors = (predictA, predictL, predictI)
 for y in range(len(testf)):
     # Query the three forests in A, L, I order for every test row; the first
     # column of the row is left out, as it is for the training rows.
     for forest, collected in zip(forests, collectors):
         collected.append(forest.predict(testf[y][1:]))

 # Regroup the per-target outputs into one [A, L, I] list per test row.
 prediction = [
     list(per_row) for per_row in zip(predictA, predictL, predictI)
 ]
 hammingloss, index = processor.getHammingLoss(labeltest, prediction)
# NOTE(review): `test` and `fold1` are defined outside this excerpt.
# Presumably `test` collects one held-out fold per cross-validation round
# (it is indexed as test[x] for x in range(5) further down) -- confirm.
test.append(fold1)


def split(dtrain, dtest):
    """Separate feature columns from label values for one train/test pair.

    For every row, the features are all elements except the last two.
    The training label is the row's last element, whereas the test label
    is the row's second-to-last element.

    NOTE(review): the two label columns differ (-1 for train, -2 for test).
    Nothing in this function says whether that asymmetry is intended --
    confirm against the data layout before changing it.

    Args:
        dtrain: sequence of indexable training rows.
        dtest: sequence of indexable test rows.

    Returns:
        A tuple ``(train, test, labeltrain, labeltest)`` of four lists.
    """
    train, labeltrain = [], []
    for row in dtrain:
        train.append(row[:-2])
        labeltrain.append(row[-1])

    test, labeltest = [], []
    for row in dtest:
        test.append(row[:-2])
        labeltest.append(row[-2])

    return train, test, labeltrain, labeltest


# Evaluate one forest per fold over five train/test pairs, collecting the
# Hamming loss and the accompanying index value returned for each fold.
index = []
performance = []
for x in range(5):
    trainf, testf, labeltrain, labeltest = split(train[x], test[x])
    model = RandomForestClassifier(rf_trees=80, rf_samples=1000)
    # fit() receives rows with the label appended as the last element; the
    # first feature column is left out of both training and test rows.
    trainfix = [
        features[1:] + [label] for features, label in zip(trainf, labeltrain)
    ]
    model.fit(trainfix)
    prediction = [model.predict(features[1:]) for features in testf]
    # Unpack into a fold-local name. Unpacking straight into `index` rebound
    # the accumulator list, so `index.append(index)` appended the returned
    # object to itself and the per-fold results were lost.
    hammingloss, fold_index = processor.getHammingLoss(labeltest, prediction)
    index.append(fold_index)
    performance.append(hammingloss)
# ## Split Data
#train, validation, labeltrain, labelval = train_test_split(extraction, label, test_size=0.30, random_state=42)
#anjuran = [labeltrain[x][y] for x in range(len(labeltrain)) for y in range(0,3) if y==0]
#larangan =[labeltrain[x][y] for x in range(len(labeltrain)) for y in range(0,3) if y==1]
#informasi = [labeltrain[x][y] for x in range(len(labeltrain)) for y in range(0,3) if y==2]