Esempio n. 1
0
def extractTrainAndTest(testRate):
    """Randomly split 'train_test.csv' into 'train.csv' and 'test.csv'.

    The first row of the source file is dropped and replaced by a generated
    header. For every remaining row one random.random() value is drawn: the
    row goes to the training set when the value is >= testRate, otherwise
    to the test set.
    """

    # all rows of the source CSV except its first row
    dataRows = RD.loadArray('train_test.csv', splitter=',')[1:]

    # header row: 'label' followed by '<i>_<j>' for a 32 x 32 grid
    side = 32
    header = ['label']
    header.extend(
        str(r) + '_' + str(c) for r in range(side) for c in range(side))

    # both outputs start with the same header row
    trainingData = [header]
    testData = [header]

    # one random draw per row, in file order, decides the destination
    for row in dataRows:
        destination = trainingData if random.random() >= testRate else testData
        destination.append(row)

    # save file (trainingData and testData)
    RD.saveArray('train.csv', trainingData, splitter=',', saveSize=1000)
    RD.saveArray('test.csv', testData, splitter=',', saveSize=1000)
Esempio n. 2
0
def deepLearningQ_training(Q, deviceName, epoch, printed):
    """Train the deep-Q model from a Q table and save it as 'deepQ_model'.

    Q has the form
        [[[s0], [q00, q01, ...]], [[s1], [q10, q11, ...]], ...]
    Each state is converted with stateTo1dArray() and the result is written
    to 'Q_input.txt'; the Q values are written unchanged to 'Q_output.txt'.
    Neither file is written when Q is empty.

    deviceName, epoch and printed are forwarded to DLmain.deepLearning,
    which also needs 'modelConfig.txt'.
    """

    # input array (converted states) and output array (Q values as they are)
    inputData = [stateTo1dArray(entry[0]) for entry in Q]
    outputData = [entry[1] for entry in Q]

    # save input and output array as file
    if inputData:
        RD.saveArray('Q_input.txt', inputData)
    if outputData:
        RD.saveArray('Q_output.txt', outputData)

    # train using deep learning and save the model (testInputFile and testOutputFile is None)
    # DON'T NEED TO APPLY SIGMOID to training output data, because DLmain.deeplearning applies it
    try:
        DLmain.deepLearning('Q_input.txt', 'Q_output.txt', None, None, None,
                            None, 0.0, None, 'modelConfig.txt', deviceName,
                            epoch, printed, 'deepQ_model')
    except Exception as err:
        # Still best-effort, but Ctrl-C / SystemExit are no longer swallowed
        # and the real cause is shown, since a missing file is only one of
        # the ways training can fail.
        print('Q_input.txt or Q_output.txt does not exist.')
        print('(reason: ' + repr(err) + ')')
Esempio n. 3
0
def lightGBM(TRI_array, TRO_array, TEI_array, count):
    """Train a binary LightGBM model and save its predictions.

    TRI_array / TRO_array are the training input / output and TEI_array is
    the test (or validation) input; all three are handed to
    create_dataframe(). count offsets the random seed and is part of the
    output file name 'lightGBM_tv_predict_<count>.txt', which receives the
    predictions as a single column.
    """

    # create Pandas DataFrame
    # tv_input  : test / validation input
    (train_input, train_output, tv_input) = create_dataframe(TRI_array, TRO_array, TEI_array)

    # convert to lightgbm dataset
    train_ds = lgb.Dataset(train_input, label=train_output)

    # set parameters
    # refer to https://www.kaggle.com/hiro5299834/tps-apr-2021-pseudo-labeling-voting-ensemble (0.81722)
    params = {'metric': 'binary_logloss',
              'objective': 'binary',
              'random_state': 2021 + count, # SEED = 2021...
              'learning_rate': 0.01,
              'min_child_samples': 150,
              'reg_alpha': 3e-5,
              'reg_lambda': 9e-2,
              'num_leaves': 20,
              'max_depth': 16,
              'colsample_bytree': 0.8,
              'subsample': 0.8,
              'subsample_freq': 2,
              'max_bin': 240}

    # create model
    # NOTE: train_ds is also passed as the evaluation set, so the early
    # stopping rounds are measured on the training data itself.
    model = lgb.train(params, train_ds, 2000, train_ds, verbose_eval=20, early_stopping_rounds=200)

    # predict and save as one column (N x 1)
    predict_tv = model.predict(tv_input)

    RD.saveArray('lightGBM_tv_predict_' + str(count) + '.txt', np.array([predict_tv]).T)
Esempio n. 4
0
def mergeTestResult(original, IDcol, files):
    """Pair every split's results with their IDs and save them sorted by ID.

    For each i in range(files), 'result_split_<i>.csv' (without its first
    row) is joined column-wise with column IDcol of '<original>_sub_<i>.txt'.
    The first two columns of every joined row are collected, sorted by the
    first column and written to 'result_split_final.txt'.
    """
    collected = []

    for fileNo in range(files):
        suffix = str(fileNo)

        # results of this split, without the column-title row
        predictions = RD.loadArray('result_split_' + suffix + '.csv', ',')[1:]

        # IDs of this split as a column: [0, 1, 2, ...] -> [[0], [1], [2], ...]
        subset = RD.loadArray(original + '_sub_' + suffix + '.txt')
        idColumn = [[value] for value in np.array(subset)[:, IDcol]]

        # join IDs and results side by side
        paired = np.concatenate(
            [np.array(idColumn), np.array(predictions)], axis=1)

        print('\n <<< IDs-Result for file ' + suffix + ' >>>')
        print(np.array(paired))

        # keep only (ID, first result column) of every row
        collected.extend([row[0], row[1]] for row in paired)

    # NOTE(review): the order depends on the element type produced by
    # loadArray (text IDs would sort lexicographically) - confirm.
    collected.sort(key=lambda pair: pair[0])

    # write to file
    RD.saveArray('result_split_final.txt', collected)
Esempio n. 5
0
def lightGBM(TRI_array, TRO_array, TEI_array, TEO_array, count, valid):
    """Train a LightGBM regression model and save its predictions.

    With valid equal to True, create_dataframe() is also given TEO_array,
    the model is evaluated on the resulting validation set, and the
    predictions go to 'lightGBM_tv_valid_<count>.txt'. Otherwise TEO_array
    is ignored, the training set doubles as evaluation set, and the
    predictions go to 'lightGBM_tv_predict_<count>.txt'.

    count offsets the random seed and is part of the output file name.
    """

    # Evaluated once. Kept as an equality test so that exactly the same
    # values of `valid` select validation mode as before.
    validating = (valid == True)

    # create Pandas DataFrame and convert to lightgbm dataset
    # tv_input  : test / validation input
    # tv_output : test / validation output
    if validating:
        (train_input, train_output, tv_input,
         tv_output) = create_dataframe(TRI_array, TRO_array, TEI_array,
                                       TEO_array)
    else:
        (train_input, train_output,
         tv_input) = create_dataframe(TRI_array, TRO_array, TEI_array)

    train_ds = lgb.Dataset(train_input, label=train_output)

    # evaluation set: the validation data if there is any, else the training data
    if validating:
        eval_ds = lgb.Dataset(tv_input, label=tv_output)
    else:
        eval_ds = train_ds

    # set parameters
    params = {
        'metric': 'AUC',
        'objective': 'regression',
        'random_state': 2021 + count,  # SEED = 2021...
        'learning_rate': 0.005,
        'min_child_samples': 256,
        'reg_alpha': 3e-5,
        'reg_lambda': 9e-2,
        'num_leaves': 32,
        'max_depth': 32,
        'colsample_bytree': 0.8,
        'subsample': 0.8,
        'subsample_freq': 2,
        'max_bin': 1024
    }

    # create model
    model = lgb.train(params,
                      train_ds,
                      4000,
                      eval_ds,
                      verbose_eval=30,
                      early_stopping_rounds=200)

    # predict and save as one column (N x 1)
    predict_tv = model.predict(tv_input)

    prefix = 'lightGBM_tv_valid_' if validating else 'lightGBM_tv_predict_'
    RD.saveArray(prefix + str(count) + '.txt', np.array([predict_tv]).T)
def readResult(pred, real, num, count):
    """Write a threshold-by-threshold classification report.

    pred holds predicted scores and real the matching labels; both must have
    the same length. The first data row of the report holds the correlation
    and ROC-AUC of the raw scores. Then, for i = 1 .. 249, the scores are
    binarised at threshold = count * round(1 - 0.95 ** i, 6) and one row
    with TP, TN, FP, FN, accuracy, correlation and ROC-AUC is added.
    Rows whose label is neither 0 nor 1 are not counted.

    The report is printed and saved to 'bert_val_report_<num>.txt'.
    """

    # assertion
    assert(len(pred) == len(real))

    # header row + overall statistics of the raw predictions
    result = [['thrs', 'TP', 'TN', 'FP', 'FN', 'accu', 'correl', 'roc-auc'],
              ['-', '-', '-', '-', '-', '-',
               round(np.corrcoef(pred, real)[0][1], 4),
               round(roc_auc_score(real, pred), 4)]]

    # (prediction, label) pairs
    vals = [[pred[i], real[i]] for i in range(len(pred))]

    for i in range(1, 250):
        threshold = count * round(1 - pow(0.95, i), 6)

        TP = 0
        TN = 0
        FP = 0
        FN = 0

        pred_binary = []
        real_binary = []

        for score, label in vals:
            if score >= threshold and label == 1:
                TP += 1
                pred_binary.append(1)
                real_binary.append(1)
            elif score < threshold and label == 0:
                TN += 1
                pred_binary.append(0)
                real_binary.append(0)
            elif score >= threshold and label == 0:
                FP += 1
                pred_binary.append(1)
                real_binary.append(0)
            elif score < threshold and label == 1:
                FN += 1
                pred_binary.append(0)
                real_binary.append(1)

        # compute correlation ('-' when it cannot be computed); only real
        # errors are caught so that Ctrl-C is not swallowed
        try:
            corr = round(np.corrcoef(pred_binary, real_binary)[0][1], 4)
        except Exception:
            corr = '-'

        # record result
        result.append([round(threshold, 4),
                       TP, TN, FP, FN,
                       round(float((TP+TN)/(TP+TN+FP+FN)), 4),
                       corr,
                       round(roc_auc_score(real_binary, pred_binary), 4)])

    # write result
    print(result)
    RD.saveArray('bert_val_report_' + str(num) + '.txt', result)
Esempio n. 7
0
def makeCsv():
    """Convert every image in 'images/' into one row of 'train_test.csv'.

    The CSV starts with a header row ('label', then '<row>_<col>' for each
    pixel of a 128-row x 192-column image). Each following row holds the
    label of one image followed by its pixel values, row by row:

    # label 0_0 0_1 0_2 ... 0_191 1_0 ...
    # 100   0   0   255 ... 0     51  ...
    # 103   0   255 204 ... 0     0   ...

    The label is looked up from the part of the file name before the first
    ')'. Each pixel value is int(sum(channels) / 3).
    """

    imgWidth = 192
    imgHeight = 128

    # write first row
    firstRow = ['label']
    for i in range(imgHeight):
        for j in range(imgWidth):
            firstRow.append(str(i) + '_' + str(j))

    csvArray = [firstRow]

    # file-name prefix -> label (built once instead of once per image)
    OFEC_list = {'train_car':100, 'train_bus':101, 'test_car':102, 'test_bus':103}

    # convert each image into a row for the CSV file
    files = os.listdir('images')

    for count, file in enumerate(files):

        # progress
        if count % 25 == 0: print(count, len(files))

        # open each image file; the context manager closes it again so that
        # a large image directory does not exhaust file handles
        with Image.open('images/' + file) as img:
            pixel = img.load()

            label = OFEC_list[file.split(')')[0]]
            thisRow = [label]

            # add each pixel
            # NOTE(review): dividing by 3 assumes three channels per pixel - confirm
            for i in range(imgHeight):
                for j in range(imgWidth):
                    thisRow.append(int(sum(pixel[j, i])/3))

        csvArray.append(thisRow)

    # save into file
    RD.saveArray('train_test.csv', csvArray, splitter=',', saveSize=50)
Esempio n. 8
0
def mergeTrain(TRI, TRO, TRIO):
    """Join two tab-separated array files column-wise into a third file.

    TRI and TRO are the files to read (in that order); TRIO is the file
    that receives the columns of TRI followed by the columns of TRO.
    """

    # load both files as arrays, TRI first
    loaded = [np.array(RD.loadArray(path, '\t')) for path in (TRI, TRO)]

    # put the columns next to each other and write the result
    RD.saveArray(TRIO, np.concatenate(loaded, axis=1), '\t', 500)
Esempio n. 9
0
def useAdvancedModels():
    """Save the array from read_val_report as 'final_test_output.txt'.

    The array returned by read_val_report.getPredAndRealArray is written as
    a single column.
    """

    # count arguments for lightGBM, DecisionTree, XGBoost and deep learning,
    # in the order getPredAndRealArray takes them
    modelCounts = (1, 0, 1, 1)

    predictions = read_val_report.getPredAndRealArray(*modelCounts, None, False)

    # [p0, p1, ...] -> [[p0], [p1], ...]
    RD.saveArray('final_test_output.txt', np.array([predictions]).T)
Esempio n. 10
0
def convertToNumeric():
    """Convert the MNIST CSV files into numeric text files.

    Does nothing when all four converted files can already be opened.
    Otherwise reads the first trainRows / testRows data rows of
    'mnist_train.csv' / 'mnist_test.csv', rescales every input value v to
    1.0 - v / 255.0 (inverting the color), one-hot encodes the labels with
    one_hot() and saves the four 'mnist_*_input.txt' / 'mnist_*_output.txt'
    files.
    """

    # number of train rows and test rows
    trainRows = 12000  # / 60000
    testRows = 2000  # / 10000

    convertedFiles = ('mnist_train_input.txt', 'mnist_train_output.txt',
                      'mnist_test_input.txt', 'mnist_test_output.txt')

    # check if converted data file exists
    try:
        for name in convertedFiles:
            with open(name, 'r'):
                pass
    except OSError:
        # at least one file is missing or unreadable -> build all of them
        pass
    else:
        return

    # TRAINING DATA
    # read and print col=1, ... and row=1, ... of the CSV file
    train_input = readCSV('mnist_train.csv', [1, None], [1, trainRows + 1])
    train_output = readCSV('mnist_train.csv', [0, 1], [1, trainRows + 1])

    # TEST DATA
    test_input = readCSV('mnist_test.csv', [1, None], [1, testRows + 1])
    test_output = readCSV('mnist_test.csv', [0, 1], [1, testRows + 1])

    # make training data, then test data, numeric (inverting the color)
    for rows, rowCount in ((train_input, trainRows), (test_input, testRows)):
        for i in range(rowCount):
            if i % 1000 == 0: print(i)

            # the column count is taken from the first row, as before
            for j in range(len(rows[0])):
                rows[i][j] = 1.0 - int(rows[i][j]) / 255.0

    # make output one-hot
    train_output = one_hot(list(train_output))
    test_output = one_hot(list(test_output))

    # save
    RD.saveArray('mnist_train_input.txt', train_input)
    RD.saveArray('mnist_train_output.txt', train_output)
    RD.saveArray('mnist_test_input.txt', test_input)
    RD.saveArray('mnist_test_output.txt', test_output)
Esempio n. 11
0
def writeResult(predict_tv, tv_output, VAL_rate, num, modelName):
    """Save the predictions and, in validation mode, return their RMSLE.

    The predictions are written one per row to
    '<modelName>_val_result_<num>.txt' when VAL_rate > 0, otherwise to
    '<modelName>_test_result_<num>.txt'; the saved values are still
    normalized.

    In validation mode both tv_output[0] and predict_tv are de-normalized
    (num == 0: formation_energy_ev_natom, otherwise bandgap_energy_ev) and
    rmsle() of the two is returned. In test mode 0 is returned.
    """
    validating = VAL_rate > 0

    # write result: one prediction per row
    rows = [[value] for value in predict_tv]
    tag = '_val_result_' if validating else '_test_result_'
    RD.saveArray(modelName + tag + str(num) + '.txt', rows, '\t', 500)

    # test mode: nothing to evaluate
    if not validating:
        return 0

    # factor and offset that undo the normalization of the chosen target
    if num == 0:  # formation_energy_ev_natom
        scale, shift = 0.104078, 0.187614
    else:  # bandgap_energy_ev
        scale, shift = 1.006635, 2.077205

    # de-normalize the expected values (held in the first row of tv_output)
    expected = [float(tv_output[0][i]) * scale + shift
                for i in range(len(tv_output[0]))]

    # de-normalize the predictions
    predicted = [float(predict_tv[i]) * scale + shift
                 for i in range(len(predict_tv))]

    # compute RMSLE and return
    print('\n\n ====[ ' + modelName + ' / valid=True ]====')
    return rmsle(expected, predicted)
Esempio n. 12
0
def deepLearningQ_training(Q, deviceName, epoch, printed):
    """Train a freshly defined model on the (normalized) Q table.

    Q : [state, action_reward, i (UAV/cluster index), k (device index)]

    Q Table           = [[[s0], [Q00, Q01, ...]], [[s1], [Q10, Q11, ...]], ...]
    convert to input  = converted version of [[s0], [s1], ...]
               output = original  version of [[Q00, Q01, ...], [Q10, Q11, ...], ...]
    where           s = [q[n][l], {a[n][l][k_l]}, {R[n][k_l]}]
           and      Q = reward

    Raw and normalized inputs/outputs are saved to 'Q_input.txt',
    'Q_output.txt', 'Q_input_normalized.txt' and 'Q_output_normalized.txt'
    (nothing is written when Q is empty). The normalized files are then
    loaded again and passed to trainDataWithModel().

    deviceName, epoch and printed are accepted but not used in this body.
    """

    model = defineModel()

    # input array (need to convert original array [s0])
    inputData = []
    for entry in Q:

        # convert into 1d array (valid if not converted)
        try:
            inputData.append(stateTo1dArray(entry[0], entry[3]))

        # executed if already converted to 1d array
        # (Exception instead of a bare except, so Ctrl-C still works)
        except Exception:
            inputData.append(entry[0])

    # output array (as original)
    outputData = [entry[1] for entry in Q]

    # save input and output array as file
    if inputData:
        RD.saveArray('Q_input.txt', inputData)
    if outputData:
        RD.saveArray('Q_output.txt', outputData)

    # save normalized data
    if inputData:
        normalizedInputData = normalize(inputData, False, 'input' + str(len(inputData)), True)
        normalizedOutputData = normalize(outputData, False, 'output' + str(len(inputData)), True)
        RD.saveArray('Q_input_normalized.txt', normalizedInputData)
        RD.saveArray('Q_output_normalized.txt', normalizedOutputData)

    # train using the normalized files
    # DON'T NEED TO APPLY SIGMOID to training output data, because DLmain.deeplearning applies it
    try:
        Q_input_normalized = np.array(RD.loadArray('Q_input_normalized.txt', '\t')).astype(float)
        Q_output_normalized = np.array(RD.loadArray('Q_output_normalized.txt', '\t')).astype(float)
        # NOTE(review): meaning of the constant 15 is defined by trainDataWithModel - confirm
        trainDataWithModel(Q_input_normalized, Q_output_normalized, model, 15)

    except Exception as err:
        # Still best-effort, but the real cause is shown: a missing file is
        # only one of the ways loading or training can fail.
        print('[train] Q_input_normalized.txt or Q_output_normalized.txt does not exist.')
        print('[train] (reason: ' + repr(err) + ')')
Esempio n. 13
0
def useTestOutput(fn, threshold):
    """Turn the first column of file fn into 'to_submit.txt'.

    With threshold None the values are written unchanged (as floats).
    Otherwise each value becomes 0 when it is below threshold and 1 when it
    is not.
    """

    # read file
    testResult = RD.loadArray(fn)

    # write final result
    finalResult = []

    for row in testResult:
        value = float(row[0])

        # identity test: `== None` would go through the threshold's own __eq__
        if threshold is None:
            finalResult.append([value])
        elif value < threshold:
            finalResult.append([0])
        else:
            finalResult.append([1])

    # write file
    RD.saveArray('to_submit.txt', finalResult)
Esempio n. 14
0
def DecisionTree(TRI_array, TRO_array, TEI_array, count):
    """Fit a decision tree classifier and save its predictions.

    TRI_array / TRO_array are the training input / output and TEI_array the
    test (or validation) input. count varies the tree depth, the minimum
    leaf size and the random seed, and is part of the output file name
    'DecisionTree_tv_predict_<count>.txt'.
    """

    # Pandas DataFrames: training input, training output, test/validation input
    frames = create_dataframe(TRI_array, TRO_array, TEI_array)
    train_input, train_output, tv_input = frames

    # refer to https://www.kaggle.com/hiro5299834/tps-apr-2021-pseudo-labeling-voting-ensemble (0.81722)
    #          https://www.kaggle.com/remekkinas/ensemble-learning-meta-classifier-for-stacking (0.81692)
    depth = 12 + count % 2
    leafSize = 6 + count // 2
    seed = 2021 + count

    classifier = DecisionTreeClassifier(max_depth=depth,
                                        min_samples_leaf=leafSize,
                                        random_state=seed)
    classifier.fit(train_input, train_output)

    # predict and save as one column (N x 1)
    labels = classifier.predict(tv_input)
    outName = 'DecisionTree_tv_predict_' + str(count) + '.txt'
    RD.saveArray(outName, np.array([labels]).T)
Esempio n. 15
0
def XGBoost(TRI_array, TRO_array, TEI_array, count):
    """Fit an XGBoost regressor and save its predictions.

    TRI_array / TRO_array are the training input / output and TEI_array the
    test (or validation) input. count is only used in the output file name
    'XGBoost_tv_predict_<count>.txt', which receives the predictions as a
    single column.
    """

    # create Pandas DataFrame
    # tv_input  : test / validation input
    (train_input, train_output, tv_input) = create_dataframe(TRI_array, TRO_array, TEI_array)

    # set parameters and create model
    # refer to https://www.datacamp.com/community/tutorials/xgboost-in-python
    # NOTE(review): newer XGBoost releases call this objective
    # 'reg:squarederror' - confirm against the installed version.
    model = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3,
                             learning_rate = 0.1,
                             max_depth = 5,
                             alpha = 10, n_estimators = 10)

    model.fit(train_input, train_output)

    # predict and save as one column (N x 1)
    predict_tv = model.predict(tv_input)

    RD.saveArray('XGBoost_tv_predict_' + str(count) + '.txt', np.array([predict_tv]).T)
Esempio n. 16
0
def extractTrainAndTest():
    """Split 'train_test.csv' into training and test files by label code.

    Row i of the data (the first CSV row is dropped) belongs to the test set
    when the first value of row i in 'label_list.csv' is >= 120, otherwise
    to the training set. The first column of every data row is replaced by
    its value modulo 2 (as text) before it is stored.

    Writes 'train.csv', 'test.csv', 'trainLabels.csv' and 'testLabels.csv'.
    """

    # all rows of the source CSV except its first row
    rows = RD.loadArray('train_test.csv', splitter=',')[1:]

    # load label list
    labelList = RD.loadArray('label_list.csv')

    # header row: 'label' followed by '<i>_<j>' for a 64 x 64 grid
    imgWidth = 64
    imgHeight = 64
    header = ['label']
    header.extend(
        str(r) + '_' + str(c)
        for r in range(imgHeight) for c in range(imgWidth))

    # both data files start with the same header row
    trainingData, testData = [header], [header]
    trainingLabel, testLabel = [], []

    for idx, row in enumerate(rows):
        labelRow = labelList[idx]

        # label codes from 120 upwards mark test rows
        isTest = int(labelRow[0]) >= 120

        # car: 0, bus: 1
        row[0] = str(int(row[0]) % 2)

        # append to training/test data
        if isTest:
            testData.append(row)
            testLabel.append(labelRow)
        else:
            trainingData.append(row)
            trainingLabel.append(labelRow)

    # save file (trainingData and testData)
    RD.saveArray('train.csv', trainingData, splitter=',', saveSize=500)
    RD.saveArray('test.csv', testData, splitter=',', saveSize=500)
    RD.saveArray('trainLabels.csv', trainingLabel)
    RD.saveArray('testLabels.csv', testLabel)
Esempio n. 17
0
def writeFinalOutput(reviews):
    """Write the normalized 'useful' vote counts of the first reviews rows.

    For each of the first `reviews` rows of 'yelp_training_set_review.csv',
    the second comma-separated entry inside the braces of column 1 is parsed
    as an integer (for example the 0 of 'useful': 0 in
    'funny': 0, 'useful': 0, 'cool': 0). The values are standardized with
    their mean and standard deviation and saved to 'train_output.txt';
    the mean and standard deviation go to 'train_output_avg_and_std.txt'.
    """

    reviewTable = np.array(pd.read_csv('yelp_training_set_review.csv'))

    usefulVotes = []
    for rowNo in range(reviews):

        # text between the first '{' and the following '}'
        inner = reviewTable[rowNo][1].split('{')[1].split('}')[0]

        # second entry, value after ': '
        secondEntry = inner.split(',')[1]
        usefulVotes.append(int(secondEntry.split(': ')[1]))

    # average and stddev of the vote counts
    avg = np.mean(usefulVotes)
    stddev = np.std(usefulVotes)

    # normalize; one value per row
    normalized = [[(votes - avg) / stddev] for votes in usefulVotes]

    # save
    RD.saveArray('train_output.txt', normalized)
    RD.saveArray('train_output_avg_and_std.txt', [[avg, stddev]])
Esempio n. 18
0
def writeOutput(valid, isValid):
    """Write column 1 ('Survived') of 'train.csv' as training output file(s).

    With valid equal to True, row i goes to 'train_valid_output.txt' when
    isValid[i] is True and to 'train_train_output.txt' otherwise.
    Without validation every row goes to 'train_output.txt' and isValid is
    not consulted.
    """

    trainData = np.array(pd.read_csv('train.csv'))
    train_output = []
    valid_output = []

    # evaluated once; equality keeps the set of values that select validation
    validating = (valid == True)

    print(trainData)

    for i in range(len(trainData)):
        # Only split off validation rows in validation mode. Previously a
        # True entry in isValid with valid != True hit an undefined
        # valid_output and raised NameError.
        if validating and isValid[i] == True:
            valid_output.append([trainData[i][1]])  # index of 'Survived' is 1
        else:
            train_output.append([trainData[i][1]])

    # save
    if validating:
        RD.saveArray('train_train_output.txt', train_output)
        RD.saveArray('train_valid_output.txt', valid_output)
    else:
        RD.saveArray('train_output.txt', train_output)
Esempio n. 19
0
def makeCsv():
    """Convert every image in 'images/' into 'train_test.csv' and 'label_list.csv'.

    'train_test.csv' starts with a header row ('label', then '<row>_<col>'
    for each pixel of a 64 x 64 image). Each following row holds 1 (a
    vehicle) or 0 (not a vehicle) followed by the pixel values of one
    image, row by row:

    # label 0_0 0_1 0_2 ... 0_63 1_0 ...
    # 1     0   0   255 ... 0    51  ...
    # 0     0   255 204 ... 0    0   ...

    'label_list.csv' holds, in the same order, the label code looked up
    from the part of the file name before the first ')'. Each pixel value
    is int(sum(channels) / 3).
    """

    imgWidth = 64
    imgHeight = 64

    # write first row
    firstRow = ['label']
    for i in range(imgHeight):
        for j in range(imgWidth):
            firstRow.append(str(i) + '_' + str(j))

    csvArray = [firstRow]

    # file-name prefix -> label code (built once instead of once per image)
    OFEC_list = {
        'NF_train': 100,
        'NL_train': 101,
        'NM_train': 102,
        'NR_train': 103,
        'VF_train': 110,
        'VL_train': 111,
        'VM_train': 112,
        'VR_train': 113,
        'NF_test': 120,
        'NL_test': 121,
        'NM_test': 122,
        'NR_test': 123,
        'VF_test': 130,
        'VL_test': 131,
        'VM_test': 132,
        'VR_test': 133
    }

    # convert each image into a row for the CSV file
    files = os.listdir('images')
    labelList = []

    for count, file in enumerate(files):

        # progress
        if count % 25 == 0: print(count, len(files))

        # open each image file; the context manager closes it again so that
        # a large image directory does not exhaust file handles
        with Image.open('images/' + file) as img:
            pixel = img.load()

            # add label
            label = OFEC_list[file.split(')')[0]]
            if label % 20 >= 10: thisRow = [1]  # a vehicle
            else: thisRow = [0]  # not a vehicle
            labelList.append([label])

            # add each pixel
            # NOTE(review): dividing by 3 assumes three channels per pixel - confirm
            for i in range(imgHeight):
                for j in range(imgWidth):
                    thisRow.append(int(sum(pixel[j, i]) / 3))

        csvArray.append(thisRow)

    # save into file
    RD.saveArray('train_test.csv', csvArray, splitter=',', saveSize=50)
    RD.saveArray('label_list.csv', labelList)
Esempio n. 20
0
def readAllSubs(size):
    """Create the id / input / output files and their per-delta splits.

    'train.csv' and 'test.csv' are always loaded. If any of the five base
    files cannot be opened, all five are (re)written from the loaded data:

      train.csv -> id, delta, start cells, stop cells (size * size each)
        train_id.txt     : id and delta
        train_input.txt  : delta and stop cells
        train_output.txt : delta and start cells
      test.csv  -> id, delta, stop cells
        test_id.txt      : id and delta
        test_input.txt   : delta and stop cells

    If any of the '*_sub_<0..4>.txt' files cannot be opened, the base files
    are split by delta (1..5) with RD.splitArray.
    """

    def allReadable(names):
        """Return True when every file in names can be opened for reading."""
        try:
            for name in names:
                with open(name, 'r'):
                    pass
        except OSError:
            return False
        return True

    # read train and test data
    train = RD.loadArray('train.csv', ',')
    test = RD.loadArray('test.csv', ',')

    baseNames = ['train_id', 'train_input', 'train_output',
                 'test_id', 'test_input']

    # write id-delta, input and output of training data
    # write id-delta and input         of test     data
    if not allReadable([base + '.txt' for base in baseNames]):
        # convert once instead of once per slice
        trainArr = np.array(train)
        testArr = np.array(test)
        cells = size * size

        RD.saveArray('train_id.txt', trainArr[:, 0:2])
        RD.saveArray(
            'train_input.txt',
            np.concatenate(
                [trainArr[:, 1:2], trainArr[:, cells + 2:2 * cells + 2]],
                axis=1))
        RD.saveArray('train_output.txt', trainArr[:, 1:cells + 2])

        RD.saveArray('test_id.txt', testArr[:, 0:2])
        RD.saveArray('test_input.txt', testArr[:, 1:cells + 2])

    # split train and test data into files
    subNames = [base + '_sub_' + str(i) + '.txt'
                for i in range(5) for base in baseNames]

    if not allReadable(subNames):
        # order of delta (1, 2, 3, 4, 5)
        deltaOrder = [[1], [2], [3], [4], [5]]

        # train_id_sub_X.txt     : id             of training data with delta X
        # train_input_sub_X.txt  : input  (stop)  of training data with delta X
        # train_output_sub_X.txt : output (start) of training data with delta X
        # test_id_sub_X.txt      : id             of test data with delta X
        # test_input_sub_X.txt   : input  (stop)  of test data with delta X
        RD.splitArray('train_id.txt', [1], deltaOrder, True)
        RD.splitArray('train_input.txt', [0], deltaOrder, True)
        RD.splitArray('train_output.txt', [0], deltaOrder, True)
        RD.splitArray('test_id.txt', [1], deltaOrder, True)
        RD.splitArray('test_input.txt', [0], deltaOrder, True)
Esempio n. 21
0
import sys
sys.path.insert(0, '../../AI_BASE')
import readData as RD
import numpy as np

if __name__ == '__main__':

    # the first column of every row of 'test_output.txt' holds a prediction
    predictions = RD.loadArray('test_output.txt')

    # add 8.0 to every prediction, one value per output row
    # NOTE(review): 8.0 presumably undoes an offset applied to the training
    # output - confirm against the preprocessing script.
    shifted = [[float(row[0]) + 8.0] for row in predictions]

    # write file
    RD.saveArray('to_submit.txt', shifted)
Esempio n. 22
0
        final_train_output0.append([
            (float(train_output0[i][0]) - 0.187614) / 0.104078
        ])
        final_train_output1.append([
            (float(train_output1[i][0]) - 2.077205) / 1.006635
        ])

    result += 'train input:\n'
    result += str(np.array(final_train_input)) + '\n'
    result += 'train output 0:\n'
    result += str(np.array(final_train_output0)) + '\n'
    result += 'train output 1:\n'
    result += str(np.array(final_train_output1)) + '\n'

    # save training data
    RD.saveArray('train_input.txt', final_train_input, '\t', 500)
    RD.saveArray('train_output_0.txt', final_train_output0, '\t', 500)
    RD.saveArray('train_output_1.txt', final_train_output1, '\t', 500)

    # TEST
    test_data = RD.loadArray('test_converted.csv', ',')
    test_input = np.array(test_data)[1:, 1:]

    result += 'test input:\n'
    result += str(np.array(test_input)) + '\n'

    # make final test input
    final_test_input = []

    for i in range(test_rows):
        if i % 1000 == 0: print(i)
Esempio n. 23
0
            max_lengths_train.append(
                convertForBert(data_to_train[i], None, print_interval,
                               tokenizer, None))
            max_lengths_test.append(
                convertForBert(data_to_test[i], None, print_interval,
                               tokenizer, None))
            max_lengths.append(max(max_lengths_train[i], max_lengths_test[i]))

        print('max length')
        print(max_lengths_train)
        print(max_lengths_test)
        print(max_lengths)

        # save max lengths
        RD.saveArray('bert_max_lengths_train.txt', [max_lengths_train], '\t',
                     500)
        RD.saveArray('bert_max_lengths_test.txt', [max_lengths_test], '\t',
                     500)
        RD.saveArray('bert_max_lengths.txt', [max_lengths], '\t', 500)

    # model 0: train_title   -> train_approved
    # model 1: train_essay1  -> train_approved
    # model 2: train_essay2  -> train_approved
    # model 3: train_essay3  -> train_approved
    # model 4: train_essay4  -> train_approved
    # model 5: train_summary -> train_approved
    for i in range(6):

        input_data = data_to_train[i]
        output_data = train_approved
        rows = len(input_data)
Esempio n. 24
0
    train_input = RD.loadArray(train_input_fn)
    train_output = RD.loadArray(train_output_fn)
    train_rows = len(train_input)

    print(' ==== before augmentation ====')
    print(np.shape(train_input))
    print(np.array(train_input))
    print('')
    print(np.shape(train_output))
    print(np.array(train_output))

    # augment training input and output
    for i in range(train_rows):
        train_input.append(train_input[i][::-1])

        if augment_test == True:
            train_output.append(train_output[i][::-1])
        else:
            train_output.append(train_output[i])

    print('\n ==== after augmentation ====')
    print(np.shape(train_input))
    print(np.array(train_input))
    print('')
    print(np.shape(train_output))
    print(np.array(train_output))

    # save
    RD.saveArray('train_input_augmented.txt', train_input, '\t', 1)
    RD.saveArray('train_output_augmented.txt', train_output, '\t', 1)
Esempio n. 25
0
            for k in range(1, len(i[j])):
                value = i[j][k]
                if float(i[j][k]) >= threshold:
                    i[j][k] = 1  # above threshold -> live
                else:
                    i[j][k] = 0  # below threshold -> dead

    # write final array
    finalArray = []

    print('for sub0')
    for i in range(len(sub0)):
        finalArray.append(sub0[i])
    print('for sub1')
    for i in range(len(sub1)):
        finalArray.append(sub1[i])
    print('for sub2')
    for i in range(len(sub2)):
        finalArray.append(sub2[i])
    print('for sub3')
    for i in range(len(sub3)):
        finalArray.append(sub3[i])
    print('for sub4')
    for i in range(len(sub4)):
        finalArray.append(sub4[i])
    print('finished')

    finalArray = sorted(finalArray, key=lambda x: x[0])
    RD.saveArray('final.csv', finalArray, ',')
Esempio n. 26
0
    files = 64

    # [result0, result1, ...]
    # where each element resultX is [1, 3, 4, 2, 9, 6, 5, 7, ...] for example
    finalResults = []

    # sum of test results
    sumTestResults = []

    # read file
    for i in range(files):
        testResult = np.array(RD.loadArray('test_output_' + str(i) +
                                           '.txt'))[:, :9].astype(float)

        if i == 0:
            sumTestResults = np.array(copy.deepcopy(list(testResult)))
        else:
            sumTestResults = sumTestResults + np.array(
                copy.deepcopy(list(testResult)))

        finalResult = getFinalResult(testResult)
        finalResults.append(finalResult)

    # save the sum of test results
    RD.saveArray('sumTestResults.txt', sumTestResults)

    # write final result
    # USE THE RIGHTMOST COLUMN OF to_submit.txt AS FINAL RESULT
    finalResults.append(getFinalResult(sumTestResults))
    RD.saveArray('to_submit.txt', np.array(finalResults).T)
Esempio n. 27
0
def makeData(delta, n, n_, size, limitLen, writeTestInput):
    """Turn flattened size*size boards into one window row per board cell.

    Reads 'train_input_sub_<delta-1>.txt', 'train_output_sub_<delta-1>.txt'
    and 'test_input_sub_<delta-1>.txt' with RD.loadArray. Every row is
    reshaped to (size, size) and padded with wrap-around borders; for each
    cell (row-major order) the surrounding window is flattened into one
    output row.

    Writes 'train_input_n_sub_<delta-1>.txt' and
    'train_output_n_sub_<delta-1>.txt' and, when writeTestInput is set,
    'test_input_n_sub_<delta-1>.txt' with RD.saveArray.

    Args:
        delta: the files for index delta - 1 are read and written.
        n: side length of the input window. Expected to be odd: the slice
            taken is 2 * int((n - 1) / 2) + 1 wide and is reshaped to n * n,
            which fails for an even n.
        n_: side length of the output window (same oddness expectation).
        size: side length of the square board stored in each data row.
        limitLen: maximum number of training rows to process.
        writeTestInput: flag; when true the test input is windowed and saved
            as well.
    """

    # window size (half width, i.e. padding on each side)
    ws = int((n - 1) / 2)  # for training/test input
    ws_ = int((n_ - 1) / 2)  # for training/test output

    suffix = str(delta - 1) + '.txt'

    # read data
    trainInput = RD.loadArray('train_input_sub_' + suffix)
    trainOutput = RD.loadArray('train_output_sub_' + suffix)
    testInput = RD.loadArray('test_input_sub_' + suffix)

    trainLen = min(len(trainInput), limitLen)
    testLen = len(testInput)

    # Convert to numeric type ONCE, and only the rows that will be used.
    # Doing this inside the per-row loop re-converted the whole array on
    # every iteration.
    trainInput = np.array(trainInput[:trainLen]).astype('float')
    trainOutput = np.array(trainOutput[:trainLen]).astype('float')

    # input / output data to make
    trainInputData = []
    trainOutputData = []

    # reshape training data
    for i in range(trainLen):
        if i % 10 == 0:
            print('makeData (training) : ' + str(i) + ' / ' + str(trainLen))

        trainInputData.extend(_makeDataWindows(trainInput[i], size, ws, n))
        trainOutputData.extend(_makeDataWindows(trainOutput[i], size, ws_, n_))

    # reshape test data
    if writeTestInput:
        testInputData = []

        # testInput as numeric type (converted once, before the loop)
        testInput = np.array(testInput).astype('float')

        for i in range(testLen):
            if i % 10 == 0:
                print('makeData (test) : ' + str(i) + ' / ' + str(testLen))

            testInputData.extend(_makeDataWindows(testInput[i], size, ws, n))

    # save as file
    # [ADDED] saveSize=10000
    RD.saveArray('train_input_n_sub_' + suffix,
                 trainInputData,
                 saveSize=10000)
    RD.saveArray('train_output_n_sub_' + suffix,
                 trainOutputData,
                 saveSize=10000)
    if writeTestInput:
        RD.saveArray('test_input_n_sub_' + suffix, testInputData)


def _makeDataWindows(row, size, halfWidth, side):
    # Return, for every cell of one flattened size*size board (row-major),
    # the wrap-padded window around it flattened to side*side values.
    padded = np.pad(
        np.array(row).reshape(size, size),
        ((halfWidth, halfWidth), (halfWidth, halfWidth)), 'wrap')
    span = 2 * halfWidth + 1

    return [
        list(padded[j:j + span, k:k + span].reshape(side * side))
        for j in range(size) for k in range(size)
    ]
Esempio n. 28
0
    # fill goals_rate_array: share of the first goal count in the pair total,
    # then pushed away from 0.5 by a piecewise-quadratic transformation
    for r in range(N):
        for c in range(N):
            goals = goals_array[r][c]
            total = sum(goals)

            # no goals at all -> neutral rate 0.5
            goals_rate_array[r][c] = goals[0] / total if total != 0 else 0.5

            # read the stored value back so the transformation works on
            # exactly what the container holds
            rate = goals_rate_array[r][c]

            # transformation: 1 - 2*(1 - x)^2 if x >= 0.5
            #                 2x^2 if x < 0.5
            if rate >= 0.5:
                goals_rate_array[r][c] = 1 - 2 * (1 - rate)**2
            else:
                goals_rate_array[r][c] = 2 * rate**2

    # prediction for 2016 season
    sample_submission = np.array(pd.read_csv('SampleSubmission.csv'))
    final_result = []

    for entry in sample_submission:
        # first column is split on '_' ; parts 1 and 2 are the two team ids
        parts = entry[0].split('_')
        teamA = int(parts[1])
        teamB = int(parts[2])

        # team ids are offset by 1101 to index into goals_rate_array
        final_result.append([goals_rate_array[teamA - 1101][teamB - 1101]])

    # write as file
    RD.saveArray('finalResult.txt', final_result, '\t', 500)
Esempio n. 29
0
        # Build raw_result with two rows per entry of all_array: one from each
        # side's point of view. Columns 2, 4, 6 and 0 of all_array are taken
        # as Wteam, Lteam, Wloc and season respectively (per the labels below;
        # the source layout of all_array is not visible here -- confirm).

        # for [Wteam, Lteam, 1, Wloc, season]
        for i in range(len(all_array)):
            raw_result.append([
                all_array[i][2], all_array[i][4], 1, all_array[i][6],
                all_array[i][0]
            ])

        # for [Lteam, Wteam, 0, Wloc, season]
        # (same rows with the two teams swapped and the flag set to 0;
        # the Wloc value itself is copied unchanged)
        for i in range(len(all_array)):
            raw_result.append([
                all_array[i][4], all_array[i][2], 0, all_array[i][6],
                all_array[i][0]
            ])

        RD.saveArray('raw_result.txt', raw_result, '\t', 500)

        # TEST DATA : raw_result_test.txt

        raw_result_test = []

        test_array = np.array(pd.read_csv('SampleSubmission.csv'))

        # the first column of each row is split on '_' ; parts 1 and 2 are
        # used as the two team ids (part 0 is ignored)
        for i in range(len(test_array)):
            IDsplit = test_array[i][0].split('_')

            team0_id = int(IDsplit[1])
            team1_id = int(IDsplit[2])

            # NOTE(review): the code that writes raw_result_test out is not
            # part of this excerpt
            raw_result_test.append([team0_id, team1_id])
Esempio n. 30
0
            # NOTE(review): TRO, TEI, TRI_array, model, early, lr_reduced,
            # epochs, VAL_rate, toTrainLimit, toTestLimit and i are all defined
            # by enclosing code that is not part of this excerpt.

            # load training output
            print('loading training output...')
            TRO_array = np.array(RD.loadArray(TRO, '\t', UTF8=False, type_='f'))

            # keep at most toTrainLimit rows
            if len(TRO_array) > toTrainLimit:
                TRO_array = TRO_array[:toTrainLimit]

            # load test input
            print('loading test input...')
            TEI_array = np.array(RD.loadArray(TEI, '\t', UTF8=False, type_='f'))

            # keep at most toTestLimit rows
            if len(TEI_array) > toTestLimit:
                TEI_array = TEI_array[:toTestLimit]

            # train and save the model
            model.fit(TRI_array, TRO_array, validation_split=VAL_rate, callbacks=[early, lr_reduced], epochs=epochs)
            model.summary()
            model.save('model_e_' + str(epochs))

            # load the model (the copy that was just saved is reloaded and
            # used for prediction instead of the in-memory one)
            loaded_model = tf.keras.models.load_model('model_e_' + str(epochs))

            # validation
            # (in both branches below the prediction is made on TEI_array)
            prediction = loaded_model.predict(TEI_array)

            # write result for validation
            # only the output file name depends on VAL_rate: 'valid_...' when a
            # validation split was used, 'test_...' otherwise
            if VAL_rate > 0:
                RD.saveArray('valid_prediction_' + str(i) + '.txt', prediction, '\t', 500)
            else:
                RD.saveArray('test_prediction_' + str(i) + '.txt', prediction, '\t', 500)