def extractTrainAndTest(testRate):

    # load the csv file including all the data (except for the first row)
    csvArray = RD.loadArray('train_test.csv', splitter=',')
    csvArray = csvArray[1:]
    numOfRows = len(csvArray)

    # write first row
    firstRow = ['label']
    imgSize = 32

    for i in range(imgSize):
        for j in range(imgSize):
            firstRow.append(str(i) + '_' + str(j))

    # designate training data and test data
    trainingData = [firstRow]
    testData = [firstRow]

    for i in range(numOfRows):
        rand = random.random()

        if rand >= testRate:
            trainingData.append(csvArray[i])
        else:
            testData.append(csvArray[i])

    # save file (trainingData and testData)
    RD.saveArray('train.csv', trainingData, splitter=',', saveSize=1000)
    RD.saveArray('test.csv', testData, splitter=',', saveSize=1000)
def deepLearningQ_training(Q, deviceName, epoch, printed):

    # Q Table = [[[s0], [q00, q01, ...]], [[s1], [q10, q11, ...]], ...]
    # convert to input  = converted version of [[s0], [s1], ...]
    #            output = original version of  [[q00, q01, ...], [q10, q11, ...], ...]

    # input array (need to convert original array [s0])
    inputData = []
    for i in range(len(Q)):
        inputData.append(stateTo1dArray(Q[i][0]))

    # output array (as original)
    outputData = []
    for i in range(len(Q)):
        outputData.append(Q[i][1])

    # save input and output array as file
    if len(inputData) > 0:
        RD.saveArray('Q_input.txt', inputData)
    if len(outputData) > 0:
        RD.saveArray('Q_output.txt', outputData)

    # train using deep learning and save the model (testInputFile and testOutputFile are None)
    # needs: modelConfig.txt
    # NO NEED TO APPLY SIGMOID to the training output data, because DLmain.deepLearning applies it
    try:
        DLmain.deepLearning('Q_input.txt', 'Q_output.txt', None, None, None, None, 0.0, None,
                            'modelConfig.txt', deviceName, epoch, printed, 'deepQ_model')
    except:
        print('Q_input.txt or Q_output.txt does not exist.')
def lightGBM(TRI_array, TRO_array, TEI_array, count):

    # create Pandas DataFrame
    # tv_input  : test / validation input
    # tv_output : test / validation output
    (train_input, train_output, tv_input) = create_dataframe(TRI_array, TRO_array, TEI_array)

    # convert to lightgbm dataset
    train_ds = lgb.Dataset(train_input, label=train_output)

    # set parameters
    # refer to https://www.kaggle.com/hiro5299834/tps-apr-2021-pseudo-labeling-voting-ensemble (0.81722)
    params = {'metric': 'binary_logloss',
              'objective': 'binary',
              'random_state': 2021 + count, # SEED = 2021...
              'learning_rate': 0.01,
              'min_child_samples': 150,
              'reg_alpha': 3e-5,
              'reg_lambda': 9e-2,
              'num_leaves': 20,
              'max_depth': 16,
              'colsample_bytree': 0.8,
              'subsample': 0.8,
              'subsample_freq': 2,
              'max_bin': 240}

    # create model
    model = lgb.train(params, train_ds, 2000, train_ds,
                      verbose_eval=20, early_stopping_rounds=200)

    # predict
    predict_tv = model.predict(tv_input)
    predictions = len(predict_tv)

    RD.saveArray('lightGBM_tv_predict_' + str(count) + '.txt', np.array([predict_tv]).T)
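# create_dataframe() is defined elsewhere in this project. The sketch below is only an
# assumption about its behavior (wrapping the numpy arrays into the Pandas objects that
# lightgbm / sklearn expect), not the actual implementation.
import numpy as np
import pandas as pd

def create_dataframe(TRI_array, TRO_array, TEI_array, TEO_array=None):

    # training input / output
    train_input = pd.DataFrame(np.array(TRI_array).astype(float))
    train_output = pd.Series(np.array(TRO_array).astype(float).flatten())

    # test / validation input (and output, if given)
    tv_input = pd.DataFrame(np.array(TEI_array).astype(float))

    if TEO_array is None:
        return (train_input, train_output, tv_input)

    tv_output = pd.Series(np.array(TEO_array).astype(float).flatten())
    return (train_input, train_output, tv_input, tv_output)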
def mergeTestResult(original, IDcol, files):

    merged = []

    for i in range(files):

        # load test result array from result file
        result = RD.loadArray('result_split_' + str(i) + '.csv', ',')
        result = result[1:] # remove the first row (column titles)

        # read corresponding ID
        test = RD.loadArray(original + '_sub_' + str(i) + '.txt')
        testIDs_ = np.array(test)[:, IDcol]

        # testIDs_ = [0, 1, 2, ...] -> testIDs = [[0], [1], [2], ...]
        testIDs = []
        for j in range(len(testIDs_)):
            testIDs.append([testIDs_[j]])

        # merge result and test array
        IDsAndResult = np.concatenate([np.array(testIDs), np.array(result)], axis=1)

        print('\n <<< IDs-Result for file ' + str(i) + ' >>>')
        print(np.array(IDsAndResult))

        # append this to array 'merged'
        for j in range(len(IDsAndResult)):
            merged.append([IDsAndResult[j][0], IDsAndResult[j][1]])

    # sort the result array 'merged'
    merged = sorted(merged, key=lambda x: x[0])

    # write to file
    RD.saveArray('result_split_final.txt', merged)
def lightGBM(TRI_array, TRO_array, TEI_array, TEO_array, count, valid):

    # create Pandas DataFrame and convert to lightgbm dataset
    # tv_input  : test / validation input
    # tv_output : test / validation output
    if valid == True:
        (train_input, train_output, tv_input, tv_output) = create_dataframe(TRI_array, TRO_array, TEI_array, TEO_array)
        train_ds = lgb.Dataset(train_input, label=train_output)
        valid_ds = lgb.Dataset(tv_input, label=tv_output)
    else:
        (train_input, train_output, tv_input) = create_dataframe(TRI_array, TRO_array, TEI_array)
        train_ds = lgb.Dataset(train_input, label=train_output)

    # set parameters
    params = {'metric': 'AUC',
              'objective': 'regression',
              'random_state': 2021 + count, # SEED = 2021...
              'learning_rate': 0.005,
              'min_child_samples': 256,
              'reg_alpha': 3e-5,
              'reg_lambda': 9e-2,
              'num_leaves': 32,
              'max_depth': 32,
              'colsample_bytree': 0.8,
              'subsample': 0.8,
              'subsample_freq': 2,
              'max_bin': 1024}

    # create model
    if valid == True:
        model = lgb.train(params, train_ds, 4000, valid_ds,
                          verbose_eval=30, early_stopping_rounds=200)
    else:
        model = lgb.train(params, train_ds, 4000, train_ds,
                          verbose_eval=30, early_stopping_rounds=200)

    # predict
    predict_tv = model.predict(tv_input)
    predictions = len(predict_tv)

    if valid == True:
        RD.saveArray('lightGBM_tv_valid_' + str(count) + '.txt', np.array([predict_tv]).T)
    else:
        RD.saveArray('lightGBM_tv_predict_' + str(count) + '.txt', np.array([predict_tv]).T)
def readResult(pred, real, num, count):

    # assertion
    assert(len(pred) == len(real))

    # extract values
    vals = []

    result = [['thrs', 'TP', 'TN', 'FP', 'FN', 'accu', 'correl', 'roc-auc'],
              ['-', '-', '-', '-', '-', '-',
               round(np.corrcoef(pred, real)[0][1], 4),
               round(roc_auc_score(real, pred), 4)]]

    for i in range(len(pred)):
        vals.append([pred[i], real[i]])

    for i in range(1, 250):
        threshold = count * round(1 - pow(0.95, i), 6)

        TP = 0
        TN = 0
        FP = 0
        FN = 0

        pred_binary = []
        real_binary = []

        for j in range(len(vals)):
            if vals[j][0] >= threshold and vals[j][1] == 1:
                TP += 1
                pred_binary.append(1)
                real_binary.append(1)
            elif vals[j][0] < threshold and vals[j][1] == 0:
                TN += 1
                pred_binary.append(0)
                real_binary.append(0)
            elif vals[j][0] >= threshold and vals[j][1] == 0:
                FP += 1
                pred_binary.append(1)
                real_binary.append(0)
            elif vals[j][0] < threshold and vals[j][1] == 1:
                FN += 1
                pred_binary.append(0)
                real_binary.append(1)

        # compute correlation
        try:
            corr = round(np.corrcoef(pred_binary, real_binary)[0][1], 4)
        except:
            corr = '-'

        # record result
        result.append([round(threshold, 4), TP, TN, FP, FN,
                       round(float((TP + TN) / (TP + TN + FP + FN)), 4), corr,
                       round(roc_auc_score(real_binary, pred_binary), 4)])

    # write result
    print(result)
    RD.saveArray('bert_val_report_' + str(num) + '.txt', result)
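# The threshold schedule above sweeps count * (1 - 0.95 ** i) for i = 1..249, so the
# thresholds start near 0 and approach 'count', with finer steps near 0. For example:
for i in [1, 14, 45]:
    print(i, round(1 - 0.95 ** i, 2)) # 1 0.05 / 14 0.51 / 45 0.9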
def makeCsv():

    # write CSV file: for example, (10x10 images with label 0, 1, ..., or 7)
    # label 0,0 0,1 0,2 0,3 ... 0,9 1,0 ... 1,9 2,0 ... 9,9
    #     4   0   0   0   0 ...   0   0 ... 255 255 ...   0
    #     1   0 255 204  51 ...   0  51 ... 255   0 ...   0
    #     5   0   0 102 153 ...   0   0 ...   0   0 ... 135
    #     0   0   0   0   0 ...  65 173 ...   0   0 ...   0
    #     1   0   0   0   0 ... 102 200 ...  12   0 ...   0
    #   ... ... ... ... ... ... ... ... ... ... ... ... ...
    #     7   0   0   0 102 ... 102 225 ... 195   0 ...   0

    # initialize array
    csvArray = []

    # write first row
    firstRow = ['label']
    imgWidth = 192
    imgHeight = 128

    for i in range(imgHeight):
        for j in range(imgWidth):
            firstRow.append(str(i) + '_' + str(j))

    csvArray.append(firstRow)

    # convert each image into a row for the CSV file
    files = os.listdir('images')
    count = 0

    for file in files:

        # for count
        if count % 25 == 0:
            print(count, len(files))
        count += 1

        # open each image file
        img = Image.open('images/' + file)
        pixel = img.load()

        # add label
        OFEC_list = {'train_car': 100, 'train_bus': 101, 'test_car': 102, 'test_bus': 103}
        label = OFEC_list[file.split(')')[0]]

        thisRow = [label]

        # add each pixel
        for i in range(imgHeight):
            for j in range(imgWidth):
                thisRow.append(int(sum(pixel[j, i]) / 3))

        csvArray.append(thisRow)

    # save into file
    RD.saveArray('train_test.csv', csvArray, splitter=',', saveSize=50)
def mergeTrain(TRI, TRO, TRIO):

    # read array
    TI = np.array(RD.loadArray(TRI, '\t'))
    TO = np.array(RD.loadArray(TRO, '\t'))

    # concatenate arrays
    TIO = np.concatenate((TI, TO), axis=1)

    # write array
    RD.saveArray(TRIO, TIO, '\t', 500)
def useAdvancedModels():

    count_lightGBM = 1
    count_DecisionTree = 0
    count_XGBoost = 1
    count_deepLearning = 1

    pred_array = read_val_report.getPredAndRealArray(count_lightGBM, count_DecisionTree,
                                                     count_XGBoost, count_deepLearning,
                                                     None, False)

    RD.saveArray('final_test_output.txt', np.array([pred_array]).T)
def convertToNumeric():

    # number of train rows and test rows
    trainRows = 12000 # out of 60000
    testRows = 2000   # out of 10000

    # check if converted data files exist
    try:
        _ = open('mnist_train_input.txt', 'r')
        _.close()
        _ = open('mnist_train_output.txt', 'r')
        _.close()
        _ = open('mnist_test_input.txt', 'r')
        _.close()
        _ = open('mnist_test_output.txt', 'r')
        _.close()
        return
    except:
        pass

    # TRAINING DATA
    # read columns 1, 2, ... and rows 1, ..., trainRows of the CSV file
    train_input = readCSV('mnist_train.csv', [1, None], [1, trainRows + 1])
    train_output = readCSV('mnist_train.csv', [0, 1], [1, trainRows + 1])

    # TEST DATA
    test_input = readCSV('mnist_test.csv', [1, None], [1, testRows + 1])
    test_output = readCSV('mnist_test.csv', [0, 1], [1, testRows + 1])

    # make training data numeric (inverting the color)
    for i in range(trainRows):
        if i % 1000 == 0:
            print(i)
        for j in range(len(train_input[0])):
            train_input[i][j] = 1.0 - int(train_input[i][j]) / 255.0

    # make test data numeric (inverting the color)
    for i in range(testRows):
        if i % 1000 == 0:
            print(i)
        for j in range(len(test_input[0])):
            test_input[i][j] = 1.0 - int(test_input[i][j]) / 255.0

    # make output one-hot
    train_output = list(train_output)
    test_output = list(test_output)

    train_output = one_hot(train_output)
    test_output = one_hot(test_output)

    # save
    RD.saveArray('mnist_train_input.txt', train_input)
    RD.saveArray('mnist_train_output.txt', train_output)
    RD.saveArray('mnist_test_input.txt', test_input)
    RD.saveArray('mnist_test_output.txt', test_output)
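# readCSV() and one_hot() are helpers defined elsewhere; the sketches below are only
# assumptions about their behavior: readCSV() slices a CSV file by [colStart, colEnd) and
# [rowStart, rowEnd) (None = to the end), and one_hot() turns digit labels into
# 10-dimensional one-hot vectors.
import csv

def readCSV(fn, cols, rows):
    result = []
    with open(fn, 'r') as f:
        for i, line in enumerate(csv.reader(f)):
            if i < rows[0]:
                continue
            if rows[1] is not None and i >= rows[1]:
                break
            result.append(line[cols[0]:cols[1]])
    return result

def one_hot(labels, numClasses=10):
    # labels = [['5'], ['0'], ...] -> [[0, ..., 1, ..., 0], [1, 0, ..., 0], ...]
    result = []
    for label in labels:
        row = [0] * numClasses
        row[int(label[0])] = 1
        result.append(row)
    return result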
def writeResult(predict_tv, tv_output, VAL_rate, num, modelName):

    # write result
    result = []
    for i in range(len(predict_tv)):
        result.append([predict_tv[i]])

    if VAL_rate > 0:
        RD.saveArray(modelName + '_val_result_' + str(num) + '.txt', result, '\t', 500)
    else:
        RD.saveArray(modelName + '_test_result_' + str(num) + '.txt', result, '\t', 500)

    # validation mode -> compute RMSLE error
    # (the saved results are still NORMALIZED values)
    if VAL_rate > 0:

        # convert prediction
        tv_output_ = []
        predict_tv_ = []

        # for tv_output
        for i in range(len(tv_output[0])):
            if num == 0: # formation_energy_ev_natom
                tv_output_.append(float(tv_output[0][i]) * 0.104078 + 0.187614)
            else: # bandgap_energy_ev
                tv_output_.append(float(tv_output[0][i]) * 1.006635 + 2.077205)

        # for predict_tv
        for i in range(len(predict_tv)):
            if num == 0: # formation_energy_ev_natom
                predict_tv_.append(float(predict_tv[i]) * 0.104078 + 0.187614)
            else: # bandgap_energy_ev
                predict_tv_.append(float(predict_tv[i]) * 1.006635 + 2.077205)

        # compute RMSLE and return
        print('\n\n ====[ ' + modelName + ' / valid=True ]====')
        return rmsle(tv_output_, predict_tv_)

    return 0
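# rmsle() is assumed to be the usual Root Mean Squared Logarithmic Error; the project's
# own implementation is not shown here.
import numpy as np

def rmsle(real, pred):
    real = np.array(real, dtype=float)
    pred = np.array(pred, dtype=float)
    return float(np.sqrt(np.mean((np.log1p(pred) - np.log1p(real)) ** 2)))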
def deepLearningQ_training(Q, deviceName, epoch, printed):

    model = defineModel()

    # Q : [state, action_reward, i (UAV/cluster index), k (device index)]

    # Q Table = [[[s0], [Q00, Q01, ...]], [[s1], [Q10, Q11, ...]], ...]
    # convert to input  = converted version of [[s0], [s1], ...]
    #            output = original version of  [[Q00, Q01, ...], [Q10, Q11, ...], ...]
    # where s = [q[n][l], {a[n][l][k_l]}, {R[n][k_l]}]
    # and   Q = reward

    # input array (need to convert original array [s0])
    inputData = []
    for i in range(len(Q)):

        # convert into 1d array (valid if not converted yet)
        try:
            inputData.append(stateTo1dArray(Q[i][0], Q[i][3]))

        # executed if already converted to 1d array
        except:
            inputData.append(Q[i][0])

    # output array (as original)
    outputData = []
    for i in range(len(Q)):
        outputData.append(Q[i][1])

    # save input and output array as file
    if len(inputData) > 0:
        RD.saveArray('Q_input.txt', inputData)
    if len(outputData) > 0:
        RD.saveArray('Q_output.txt', outputData)

    # save normalized data
    if len(inputData) > 0:
        normalizedInputData = normalize(inputData, False, 'input' + str(len(inputData)), True)
        normalizedOutputData = normalize(outputData, False, 'output' + str(len(inputData)), True)

        RD.saveArray('Q_input_normalized.txt', normalizedInputData)
        RD.saveArray('Q_output_normalized.txt', normalizedOutputData)

    # train using deep learning and save the model (testInputFile and testOutputFile are None)
    # needs: modelConfig.txt
    # NO NEED TO APPLY SIGMOID to the training output data, because DLmain.deepLearning applies it
    try:
        Q_input_normalized = np.array(RD.loadArray('Q_input_normalized.txt', '\t')).astype(float)
        Q_output_normalized = np.array(RD.loadArray('Q_output_normalized.txt', '\t')).astype(float)

        trainDataWithModel(Q_input_normalized, Q_output_normalized, model, 15)
    except:
        print('[train] Q_input_normalized.txt or Q_output_normalized.txt does not exist.')
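# stateTo1dArray() is defined elsewhere; this sketch only illustrates the assumed idea
# (flattening the nested state [q[n][l], {a[n][l][k_l]}, {R[n][k_l]}] plus the device
# index k into a flat list of floats), not the project's actual conversion.
def stateTo1dArray(state, k):
    flattened = []

    def flatten(value):
        if isinstance(value, (list, tuple)):
            for v in value:
                flatten(v)
        else:
            flattened.append(float(value))

    flatten(state)
    flattened.append(float(k))
    return flattened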
def useTestOutput(fn, threshold):

    # read file
    testResult = RD.loadArray(fn)

    # write final result
    finalResult = []

    for i in range(len(testResult)):
        value = float(testResult[i][0])

        if threshold is None:
            finalResult.append([value])
        else:
            if value < threshold:
                finalResult.append([0])
            else:
                finalResult.append([1])

    # write file
    RD.saveArray('to_submit.txt', finalResult)
def DecisionTree(TRI_array, TRO_array, TEI_array, count):

    # create Pandas DataFrame
    # tv_input  : test / validation input
    # tv_output : test / validation output
    (train_input, train_output, tv_input) = create_dataframe(TRI_array, TRO_array, TEI_array)

    # set parameters and create model
    # refer to https://www.kaggle.com/hiro5299834/tps-apr-2021-pseudo-labeling-voting-ensemble (0.81722)
    #          https://www.kaggle.com/remekkinas/ensemble-learning-meta-classifier-for-stacking (0.81692)
    model = DecisionTreeClassifier(max_depth=12 + count % 2,
                                   min_samples_leaf=6 + count // 2,
                                   random_state=2021 + count)
    model.fit(train_input, train_output)

    # predict
    predict_tv = model.predict(tv_input)

    RD.saveArray('DecisionTree_tv_predict_' + str(count) + '.txt', np.array([predict_tv]).T)
def XGBoost(TRI_array, TRO_array, TEI_array, count):

    # create Pandas DataFrame
    # tv_input  : test / validation input
    # tv_output : test / validation output
    (train_input, train_output, tv_input) = create_dataframe(TRI_array, TRO_array, TEI_array)

    # set parameters and create model
    # refer to https://www.datacamp.com/community/tutorials/xgboost-in-python
    params = {'objective': 'reg:linear', 'colsample_bytree': 0.3, 'learning_rate': 0.1,
              'max_depth': 5, 'alpha': 10}

    model = xgb.XGBRegressor(objective='reg:linear', colsample_bytree=0.3, learning_rate=0.1,
                             max_depth=5, alpha=10, n_estimators=10)
    model.fit(train_input, train_output)

    # predict
    predict_tv = model.predict(tv_input)

    RD.saveArray('XGBoost_tv_predict_' + str(count) + '.txt', np.array([predict_tv]).T)
def extractTrainAndTest():

    # load the csv file including all the data (except for the first row)
    csvArray = RD.loadArray('train_test.csv', splitter=',')
    csvArray = csvArray[1:]
    numOfRows = len(csvArray)

    # load label list
    labelList = RD.loadArray('label_list.csv')

    # write first row
    firstRow = ['label']
    imgWidth = 64
    imgHeight = 64

    for i in range(imgHeight):
        for j in range(imgWidth):
            firstRow.append(str(i) + '_' + str(j))

    # designate training data and test data
    trainingData = [firstRow]
    testData = [firstRow]
    trainingLabel = []
    testLabel = []

    for i in range(numOfRows):

        # train or test
        if int(labelList[i][0]) >= 120:
            train = False
        else:
            train = True

        # car: 0, bus: 1
        csvArray[i][0] = str(int(csvArray[i][0]) % 2)

        # append to training/test data
        if train == True:
            trainingData.append(csvArray[i])
            trainingLabel.append(labelList[i])
        else:
            testData.append(csvArray[i])
            testLabel.append(labelList[i])

    # save file (trainingData and testData)
    RD.saveArray('train.csv', trainingData, splitter=',', saveSize=500)
    RD.saveArray('test.csv', testData, splitter=',', saveSize=500)
    RD.saveArray('trainLabels.csv', trainingLabel)
    RD.saveArray('testLabels.csv', testLabel)
def writeFinalOutput(reviews):

    train_review = np.array(pd.read_csv('yelp_training_set_review.csv'))
    train_output = []

    for i in range(reviews):

        # for example, 'funny': 0, 'useful': 0, 'cool': 0
        votes = train_review[i][1].split('{')[1].split('}')[0]
        useful = int(votes.split(',')[1].split(': ')[1])

        train_output.append(useful)

    # average and stddev of train_output
    avg = np.mean(train_output)
    stddev = np.std(train_output)

    # normalize train_output
    for i in range(reviews):
        train_output[i] = [(train_output[i] - avg) / stddev]

    # save
    RD.saveArray('train_output.txt', train_output)
    RD.saveArray('train_output_avg_and_std.txt', [[avg, stddev]])
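# A quick illustration of the votes parsing above, assuming the votes column stores a
# string such as "{'funny': 0, 'useful': 2, 'cool': 1}" (key order: funny, useful, cool).
sample = "{'funny': 0, 'useful': 2, 'cool': 1}"
votes = sample.split('{')[1].split('}')[0]       # "'funny': 0, 'useful': 2, 'cool': 1"
useful = int(votes.split(',')[1].split(': ')[1]) # second entry -> 2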
def writeOutput(valid, isValid):

    trainData = np.array(pd.read_csv('train.csv'))
    train_output = []

    if valid == True:
        valid_output = []

    print(trainData)

    for i in range(len(trainData)):
        if isValid[i] == True:
            valid_output.append([trainData[i][1]]) # index of 'Survived' is 1
        else:
            train_output.append([trainData[i][1]])

    # save
    if valid == True:
        RD.saveArray('train_train_output.txt', train_output)
        RD.saveArray('train_valid_output.txt', valid_output)
    else:
        RD.saveArray('train_output.txt', train_output)
def makeCsv():

    # write CSV file: for example, (10x10 images with label 0, 1, ..., or 7)
    # label 0,0 0,1 0,2 0,3 ... 0,9 1,0 ... 1,9 2,0 ... 9,9
    #     4   0   0   0   0 ...   0   0 ... 255 255 ...   0
    #     1   0 255 204  51 ...   0  51 ... 255   0 ...   0
    #     5   0   0 102 153 ...   0   0 ...   0   0 ... 135
    #     0   0   0   0   0 ...  65 173 ...   0   0 ...   0
    #     1   0   0   0   0 ... 102 200 ...  12   0 ...   0
    #   ... ... ... ... ... ... ... ... ... ... ... ... ...
    #     7   0   0   0 102 ... 102 225 ... 195   0 ...   0

    # initialize array
    csvArray = []

    # write first row
    firstRow = ['label']
    imgWidth = 64
    imgHeight = 64

    for i in range(imgHeight):
        for j in range(imgWidth):
            firstRow.append(str(i) + '_' + str(j))

    csvArray.append(firstRow)

    # convert each image into a row for the CSV file
    files = os.listdir('images')
    count = 0
    labelList = []

    for file in files:

        # for count
        if count % 25 == 0:
            print(count, len(files))
        count += 1

        # open each image file
        img = Image.open('images/' + file)
        pixel = img.load()

        # add label
        OFEC_list = {'NF_train': 100, 'NL_train': 101, 'NM_train': 102, 'NR_train': 103,
                     'VF_train': 110, 'VL_train': 111, 'VM_train': 112, 'VR_train': 113,
                     'NF_test': 120, 'NL_test': 121, 'NM_test': 122, 'NR_test': 123,
                     'VF_test': 130, 'VL_test': 131, 'VM_test': 132, 'VR_test': 133}

        label = OFEC_list[file.split(')')[0]]

        if label % 20 >= 10:
            thisRow = [1] # a vehicle
        else:
            thisRow = [0] # not a vehicle

        labelList.append([label])

        # add each pixel
        for i in range(imgHeight):
            for j in range(imgWidth):
                thisRow.append(int(sum(pixel[j, i]) / 3))

        csvArray.append(thisRow)

    # save into file
    RD.saveArray('train_test.csv', csvArray, splitter=',', saveSize=50)
    RD.saveArray('label_list.csv', labelList)
def readAllSubs(size):

    # read train and test data
    train = RD.loadArray('train.csv', ',')
    test = RD.loadArray('test.csv', ',')

    # write id-delta, input and output of training data
    # write id-delta and input of test data
    try:
        _ = open('train_id.txt', 'r')
        _.close()
        _ = open('train_input.txt', 'r')
        _.close()
        _ = open('train_output.txt', 'r')
        _.close()
        _ = open('test_id.txt', 'r')
        _.close()
        _ = open('test_input.txt', 'r')
        _.close()

    except:
        # train.txt -> id, delta, start1~625, stop1~625 (if size=25)
        #   -> train_id.txt     : extract id and delta
        #   -> train_input.txt  : extract delta and stop1~625  (if size=25)
        #   -> train_output.txt : extract delta and start1~625 (if size=25)
        RD.saveArray('train_id.txt', np.array(train)[:, 0:2])
        RD.saveArray('train_input.txt',
                     np.concatenate([np.array(train)[:, 1:2],
                                     np.array(train)[:, size * size + 2:2 * size * size + 2]],
                                    axis=1))
        RD.saveArray('train_output.txt', np.array(train)[:, 1:size * size + 2])

        # test.txt -> id, delta, stop1~625 (if size=25)
        #   -> test_id.txt    : extract id and delta
        #   -> test_input.txt : extract delta and stop1~625 (if size=25)
        RD.saveArray('test_id.txt', np.array(test)[:, 0:2])
        RD.saveArray('test_input.txt', np.array(test)[:, 1:size * size + 2])

    # split train and test data into files
    try:
        # try to read file
        for i in range(5):
            _ = open('train_id_sub_' + str(i) + '.txt', 'r')
            _.close()
            _ = open('train_input_sub_' + str(i) + '.txt', 'r')
            _.close()
            _ = open('train_output_sub_' + str(i) + '.txt', 'r')
            _.close()
            _ = open('test_id_sub_' + str(i) + '.txt', 'r')
            _.close()
            _ = open('test_input_sub_' + str(i) + '.txt', 'r')
            _.close()

    except:
        # write train_id, train_input, train_output, test_id and test_input files
        deltaOrder = [[1], [2], [3], [4], [5]] # order of delta (1, 2, 3, 4, 5)

        # train_id_sub_X.txt     : id of training data with delta X
        # train_input_sub_X.txt  : input (stop) of training data with delta X
        # train_output_sub_X.txt : output (start) of training data with delta X
        # test_id_sub_X.txt      : id of test data with delta X
        # test_input_sub_X.txt   : input (stop) of test data with delta X
        RD.splitArray('train_id.txt', [1], deltaOrder, True)
        RD.splitArray('train_input.txt', [0], deltaOrder, True)
        RD.splitArray('train_output.txt', [0], deltaOrder, True)
        RD.splitArray('test_id.txt', [1], deltaOrder, True)
        RD.splitArray('test_input.txt', [0], deltaOrder, True)
import sys
sys.path.insert(0, '../../AI_BASE')

import readData as RD
import numpy as np

if __name__ == '__main__':

    # read file
    testResult = RD.loadArray('test_output.txt')

    # write final result
    finalResult = []

    for i in range(len(testResult)):
        finalResult.append([float(testResult[i][0]) + 8.0])

    # write file
    RD.saveArray('to_submit.txt', finalResult)
    final_train_output0.append([(float(train_output0[i][0]) - 0.187614) / 0.104078])
    final_train_output1.append([(float(train_output1[i][0]) - 2.077205) / 1.006635])

result += 'train input:\n'
result += str(np.array(final_train_input)) + '\n'
result += 'train output 0:\n'
result += str(np.array(final_train_output0)) + '\n'
result += 'train output 1:\n'
result += str(np.array(final_train_output1)) + '\n'

# save training data
RD.saveArray('train_input.txt', final_train_input, '\t', 500)
RD.saveArray('train_output_0.txt', final_train_output0, '\t', 500)
RD.saveArray('train_output_1.txt', final_train_output1, '\t', 500)

# TEST
test_data = RD.loadArray('test_converted.csv', ',')
test_input = np.array(test_data)[1:, 1:]

result += 'test input:\n'
result += str(np.array(test_input)) + '\n'

# make final test input
final_test_input = []

for i in range(test_rows):
    if i % 1000 == 0:
        print(i)
    max_lengths_train.append(convertForBert(data_to_train[i], None, print_interval, tokenizer, None))
    max_lengths_test.append(convertForBert(data_to_test[i], None, print_interval, tokenizer, None))

    max_lengths.append(max(max_lengths_train[i], max_lengths_test[i]))

print('max length')
print(max_lengths_train)
print(max_lengths_test)
print(max_lengths)

# save max lengths
RD.saveArray('bert_max_lengths_train.txt', [max_lengths_train], '\t', 500)
RD.saveArray('bert_max_lengths_test.txt', [max_lengths_test], '\t', 500)
RD.saveArray('bert_max_lengths.txt', [max_lengths], '\t', 500)

# model 0 : train_title   -> train_approved
# model 1 : train_essay1  -> train_approved
# model 2 : train_essay2  -> train_approved
# model 3 : train_essay3  -> train_approved
# model 4 : train_essay4  -> train_approved
# model 5 : train_summary -> train_approved
for i in range(6):

    input_data = data_to_train[i]
    output_data = train_approved

    rows = len(input_data)
train_input = RD.loadArray(train_input_fn)
train_output = RD.loadArray(train_output_fn)
train_rows = len(train_input)

print(' ==== before augmentation ====')
print(np.shape(train_input))
print(np.array(train_input))
print('')
print(np.shape(train_output))
print(np.array(train_output))

# augment training input and output
for i in range(train_rows):
    train_input.append(train_input[i][::-1])

    if augment_test == True:
        train_output.append(train_output[i][::-1])
    else:
        train_output.append(train_output[i])

print('\n ==== after augmentation ====')
print(np.shape(train_input))
print(np.array(train_input))
print('')
print(np.shape(train_output))
print(np.array(train_output))

# save
RD.saveArray('train_input_augmented.txt', train_input, '\t', 1)
RD.saveArray('train_output_augmented.txt', train_output, '\t', 1)
        for k in range(1, len(i[j])):
            value = i[j][k]

            if float(i[j][k]) >= threshold:
                i[j][k] = 1 # above threshold -> live
            else:
                i[j][k] = 0 # below threshold -> dead

# write final array
finalArray = []

print('for sub0')
for i in range(len(sub0)):
    finalArray.append(sub0[i])

print('for sub1')
for i in range(len(sub1)):
    finalArray.append(sub1[i])

print('for sub2')
for i in range(len(sub2)):
    finalArray.append(sub2[i])

print('for sub3')
for i in range(len(sub3)):
    finalArray.append(sub3[i])

print('for sub4')
for i in range(len(sub4)):
    finalArray.append(sub4[i])

print('finished')

finalArray = sorted(finalArray, key=lambda x: x[0])
RD.saveArray('final.csv', finalArray, ',')
files = 64

# [result0, result1, ...]
# where each element resultX is [1, 3, 4, 2, 9, 6, 5, 7, ...] for example
finalResults = []

# sum of test results
sumTestResults = []

# read file
for i in range(files):
    testResult = np.array(RD.loadArray('test_output_' + str(i) + '.txt'))[:, :9].astype(float)

    if i == 0:
        sumTestResults = np.array(copy.deepcopy(list(testResult)))
    else:
        sumTestResults = sumTestResults + np.array(copy.deepcopy(list(testResult)))

    finalResult = getFinalResult(testResult)
    finalResults.append(finalResult)

# save the sum of test results
RD.saveArray('sumTestResults.txt', sumTestResults)

# write final result
# USE THE RIGHTMOST COLUMN OF to_submit.txt AS THE FINAL RESULT
finalResults.append(getFinalResult(sumTestResults))
RD.saveArray('to_submit.txt', np.array(finalResults).T)
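# getFinalResult() is defined elsewhere; the sketch below is only an assumption: pick,
# for each row of the (N x 9) score array, the column with the largest score. Whether
# the final label is that column index itself or index + 1 depends on the label mapping,
# which is not shown here.
import numpy as np

def getFinalResult(testResult):
    scores = np.array(testResult, dtype=float)
    return list(np.argmax(scores, axis=1) + 1) # assuming labels 1..9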
def makeData(delta, n, n_, size, limitLen, writeTestInput):

    # window size
    ws = int((n - 1) / 2)   # for training/test input
    ws_ = int((n_ - 1) / 2) # for training/test output

    # read data
    trainInput = RD.loadArray('train_input_sub_' + str(delta - 1) + '.txt')
    trainOutput = RD.loadArray('train_output_sub_' + str(delta - 1) + '.txt')
    testInput = RD.loadArray('test_input_sub_' + str(delta - 1) + '.txt')

    trainLen = min(len(trainInput), limitLen)
    testLen = len(testInput)

    # input data to make
    trainInputData = []

    # output data to make
    trainOutputData = []

    # test input data to make
    if writeTestInput == True:
        testInputData = []

    # reshape training data
    for i in range(trainLen):
        if i % 10 == 0:
            print('makeData (training) : ' + str(i) + ' / ' + str(trainLen))

        # trainInput and trainOutput as numeric type
        trainInput = np.array(trainInput).astype('float')
        trainOutput = np.array(trainOutput).astype('float')

        # reshape to derive n*n training data (with ws-sized padding)
        trainInputReshaped = np.pad(np.array(trainInput[i]).reshape(size, size),
                                    ((ws, ws), (ws, ws)), 'wrap')
        trainOutputReshaped = np.pad(np.array(trainOutput[i]).reshape(size, size),
                                     ((ws_, ws_), (ws_, ws_)), 'wrap')

        # save training data into arrays trainInputData and trainOutputData
        for j in range(size):
            for k in range(size):
                trainInputData.append(list(trainInputReshaped[j:j + 2 * ws + 1,
                                                              k:k + 2 * ws + 1].reshape(n * n)))
                trainOutputData.append(list(trainOutputReshaped[j:j + 2 * ws_ + 1,
                                                                k:k + 2 * ws_ + 1].reshape(n_ * n_)))

    # reshape test data
    if writeTestInput == True:
        for i in range(testLen):
            if i % 10 == 0:
                print('makeData (test) : ' + str(i) + ' / ' + str(testLen))

            # testInput as numeric type
            testInput = np.array(testInput).astype('float')

            # reshape to derive n*n test data (with ws-sized padding)
            testInputReshaped = np.pad(np.array(testInput[i]).reshape(size, size),
                                       ((ws, ws), (ws, ws)), 'wrap')

            # save test data into array testInputData
            for j in range(size):
                for k in range(size):
                    testInputData.append(list(testInputReshaped[j:j + 2 * ws + 1,
                                                                k:k + 2 * ws + 1].reshape(n * n)))

    # save as file
    # [ADDED] saveSize=10000
    RD.saveArray('train_input_n_sub_' + str(delta - 1) + '.txt', trainInputData, saveSize=10000)
    RD.saveArray('train_output_n_sub_' + str(delta - 1) + '.txt', trainOutputData, saveSize=10000)

    if writeTestInput == True:
        RD.saveArray('test_input_n_sub_' + str(delta - 1) + '.txt', testInputData)
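# A small illustration of the 'wrap' padding used above: the board is padded with copies
# of the opposite edge, so every cell has a full n*n neighbourhood even at the border
# (here size=3 and ws=1, i.e. n=3).
import numpy as np

board = np.arange(9).reshape(3, 3)
padded = np.pad(board, ((1, 1), (1, 1)), 'wrap')
# padded =
# [[8 6 7 8 6]
#  [2 0 1 2 0]
#  [5 3 4 5 3]
#  [8 6 7 8 6]
#  [2 0 1 2 0]]
window = padded[0:3, 0:3] # 3x3 window centred on cell (0, 0) of the original board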
for i in range(N):
    for j in range(N):
        if sum(goals_array[i][j]) == 0:
            goals_rate_array[i][j] = 0.5
        else:
            goals_rate_array[i][j] = goals_array[i][j][0] / sum(goals_array[i][j])

        # transformation: 1 - 2*(1 - x)^2 if x >= 0.5
        #                 2*x^2           if x <  0.5
        if goals_rate_array[i][j] >= 0.5:
            goals_rate_array[i][j] = 1 - 2 * (1 - goals_rate_array[i][j]) ** 2
        else:
            goals_rate_array[i][j] = 2 * (goals_rate_array[i][j]) ** 2

# prediction for 2016 season
sample_submission = np.array(pd.read_csv('SampleSubmission.csv'))
final_result = []

for i in range(len(sample_submission)):
    info = sample_submission[i][0].split('_')
    teamA = int(info[1])
    teamB = int(info[2])

    final_result.append([goals_rate_array[teamA - 1101][teamB - 1101]])

# write as file
RD.saveArray('finalResult.txt', final_result, '\t', 500)
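# Quick sanity check of the transformation above (it pushes win rates away from 0.5):
for x in [0.0, 0.3, 0.5, 0.7, 1.0]:
    y = 1 - 2 * (1 - x) ** 2 if x >= 0.5 else 2 * x ** 2
    print(x, round(y, 2)) # 0.0 0.0 / 0.3 0.18 / 0.5 0.5 / 0.7 0.82 / 1.0 1.0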
# for [Wteam, Lteam, 1, Wloc, season]
for i in range(len(all_array)):
    raw_result.append([all_array[i][2], all_array[i][4], 1, all_array[i][6], all_array[i][0]])

# for [Lteam, Wteam, 0, Wloc, season]
for i in range(len(all_array)):
    raw_result.append([all_array[i][4], all_array[i][2], 0, all_array[i][6], all_array[i][0]])

RD.saveArray('raw_result.txt', raw_result, '\t', 500)

# TEST DATA : raw_result_test.txt
raw_result_test = []
test_array = np.array(pd.read_csv('SampleSubmission.csv'))

for i in range(len(test_array)):
    IDsplit = test_array[i][0].split('_')

    team0_id = int(IDsplit[1])
    team1_id = int(IDsplit[2])

    raw_result_test.append([team0_id, team1_id])
# load training output
print('loading training output...')
TRO_array = np.array(RD.loadArray(TRO, '\t', UTF8=False, type_='f'))
if len(TRO_array) > toTrainLimit:
    TRO_array = TRO_array[:toTrainLimit]

# load test input
print('loading test input...')
TEI_array = np.array(RD.loadArray(TEI, '\t', UTF8=False, type_='f'))
if len(TEI_array) > toTestLimit:
    TEI_array = TEI_array[:toTestLimit]

# train and save the model
model.fit(TRI_array, TRO_array,
          validation_split=VAL_rate,
          callbacks=[early, lr_reduced],
          epochs=epochs)

model.summary()
model.save('model_e_' + str(epochs))

# load the model
loaded_model = tf.keras.models.load_model('model_e_' + str(epochs))

# validation / test prediction
prediction = loaded_model.predict(TEI_array)

# write the result (validation or test, depending on VAL_rate)
if VAL_rate > 0:
    RD.saveArray('valid_prediction_' + str(i) + '.txt', prediction, '\t', 500)
else:
    RD.saveArray('test_prediction_' + str(i) + '.txt', prediction, '\t', 500)
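# The callbacks 'early' and 'lr_reduced' are defined elsewhere; given their names, they
# are assumed to be Keras EarlyStopping / ReduceLROnPlateau callbacks roughly like the
# sketch below (the exact monitor, patience and factor values are guesses).
import tensorflow as tf

early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5,
                                         restore_best_weights=True)
lr_reduced = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5,
                                                  patience=3, min_lr=1e-6)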