def mergeTestResult(original, IDcol, files):
    """Merge the per-split result files with their IDs into one sorted file.

    For every split index ``i`` below ``files`` this reads
    ``result_split_<i>.csv`` (its first row is dropped as a column title)
    and ``<original>_sub_<i>.txt``, puts column ``IDcol`` of the latter next
    to the result rows, prints the pairing, and collects the first two
    columns of each paired row. The collected pairs are sorted by their
    first element and written to ``result_split_final.txt``.

    Args:
        original: file-name prefix of the ``<original>_sub_<i>.txt`` files.
        IDcol: index of the ID column inside those files.
        files: number of splits to merge.
    """
    collected = []

    for split in range(files):
        suffix = str(split)

        # result rows of this split, without the column-title row
        predictions = RD.loadArray('result_split_' + suffix + '.csv', ',')[1:]

        # ID column of the matching split: [a, b, ...] -> [[a], [b], ...]
        sourceRows = RD.loadArray(original + '_sub_' + suffix + '.txt')
        idColumn = [[value] for value in np.array(sourceRows)[:, IDcol]]

        # IDs on the left, results on the right
        paired = np.concatenate(
            [np.array(idColumn), np.array(predictions)], axis=1)

        print('\n <<< IDs-Result for file ' + suffix + ' >>>')
        print(np.array(paired))

        collected.extend([row[0], row[1]] for row in paired)

    # order by ID (comparison semantics are those of the stored ID values)
    collected.sort(key=lambda pair: pair[0])

    RD.saveArray('result_split_final.txt', collected)
def mergeTrain(TRI, TRO, TRIO):
    """Join a training-input file and a training-output file column-wise.

    Both files are read as tab-separated arrays, concatenated along axis 1
    (input columns first), and written tab-separated to ``TRIO``.

    Args:
        TRI: name of the training input file.
        TRO: name of the training output file.
        TRIO: name of the merged file to write.
    """
    inputs, outputs = (np.array(RD.loadArray(name, '\t')) for name in (TRI, TRO))
    RD.saveArray(TRIO, np.concatenate((inputs, outputs), axis=1), '\t', 500)
def deepLearningQ_training(Q, deviceName, epoch, printed):
    """Train the model returned by ``defineModel()`` on the entries of ``Q``.

    Each entry of ``Q`` is read as ``[state, action_reward, i, k]``:
    element 0 becomes the model input (converted with
    ``stateTo1dArray(entry[0], entry[3])`` when that works, used as-is
    otherwise) and element 1 becomes the model output.

    Files written: ``Q_input.txt`` / ``Q_output.txt`` (raw data) and
    ``Q_input_normalized.txt`` / ``Q_output_normalized.txt`` (result of
    ``normalize``), all only when ``Q`` is non-empty. The normalized files
    are then read back and passed to ``trainDataWithModel``.

    Args:
        Q: list of Q-table entries as described above.
        deviceName: not used by this function.
        epoch: not used by this function.
        printed: not used by this function.
    """
    model = defineModel()

    # input array: the state, flattened when it is not flat already
    inputData = []
    for entry in Q:
        try:
            inputData.append(stateTo1dArray(entry[0], entry[3]))
        except Exception:
            # conversion failed -> the state is taken to be converted already
            inputData.append(entry[0])

    # output array (as original)
    outputData = [entry[1] for entry in Q]

    # save input and output array as file
    if len(inputData) > 0:
        RD.saveArray('Q_input.txt', inputData)
    if len(outputData) > 0:
        RD.saveArray('Q_output.txt', outputData)

    # save normalized data
    if len(inputData) > 0:
        sizeTag = str(len(inputData))
        normalizedInputData = normalize(inputData, False, 'input' + sizeTag, True)
        normalizedOutputData = normalize(outputData, False, 'output' + sizeTag, True)

        RD.saveArray('Q_input_normalized.txt', normalizedInputData)
        RD.saveArray('Q_output_normalized.txt', normalizedOutputData)

    # train using the normalized files written above (or left over from an
    # earlier call when Q is empty).
    # The original comment states that sigmoid is applied downstream, so it
    # is not applied to the training output here.
    # NOTE(review): any failure inside training is also reported with the
    # "does not exist" message below - confirm whether that is intended.
    try:
        normalizedInput = np.array(
            RD.loadArray('Q_input_normalized.txt', '\t')).astype(float)
        normalizedOutput = np.array(
            RD.loadArray('Q_output_normalized.txt', '\t')).astype(float)
        trainDataWithModel(normalizedInput, normalizedOutput, model, 15)
    except Exception:
        print('[train] Q_input_normalized.txt or Q_output_normalized.txt does not exist.')
def extractTrainAndTest(testRate):
    """Randomly split ``train_test.csv`` into ``train.csv`` and ``test.csv``.

    The first row of ``train_test.csv`` is skipped. Every remaining row
    draws one ``random.random()`` value: a draw at or above ``testRate``
    sends the row to the training file, anything else to the test file.
    Both output files start with the header ``label, 0_0, 0_1, ..., 31_31``.

    Args:
        testRate: threshold compared against each random draw.
    """
    # all data rows (the first row is dropped)
    samples = RD.loadArray('train_test.csv', splitter=',')[1:]

    # header: 'label' followed by one '<row>_<col>' name per pixel
    imgSize = 32
    header = ['label'] + [str(r) + '_' + str(c)
                          for r in range(imgSize)
                          for c in range(imgSize)]

    trainingData = [header]
    testData = [header]

    # one random draw per row decides where it goes
    for sample in samples:
        destination = trainingData if random.random() >= testRate else testData
        destination.append(sample)

    RD.saveArray('train.csv', trainingData, splitter=',', saveSize=1000)
    RD.saveArray('test.csv', testData, splitter=',', saveSize=1000)
def readCSV(fn, colRange, rowRange, delimiter=','):
    """Load ``fn`` and return the requested rectangular part as an array.

    Args:
        fn: name of the file to load.
        colRange: ``[start, end]`` column bounds; ``end`` may be ``None``
            for "up to the last column".
        rowRange: ``[start, end]`` row bounds; ``end`` may be ``None`` for
            "up to the last row".
        delimiter: column separator handed to ``RD.loadArray``.

    Returns:
        ``np.array(data)[rowStart:rowEnd, colStart:colEnd]``.
    """
    table = np.array(RD.loadArray(fn, delimiter))

    colStart, colEnd = colRange[0], colRange[1]
    rowStart, rowEnd = rowRange[0], rowRange[1]

    # a None end bound leaves that side of the slice open
    return table[rowStart:rowEnd, colStart:colEnd]
def extractTrainAndTest():
    """Split ``train_test.csv`` into train/test files using ``label_list.csv``.

    Row ``i`` of the data (the first row of ``train_test.csv`` is skipped)
    goes to the test set when ``int(labelList[i][0]) >= 120`` and to the
    training set otherwise. Before a row is stored, its first column is
    replaced by ``str(int(value) % 2)`` (the original comment reads
    "car: 0, bus: 1").

    Files written: ``train.csv`` and ``test.csv`` (each with the header
    ``label, 0_0, ..., 63_63``) and ``trainLabels.csv`` /
    ``testLabels.csv`` holding the matching rows of ``label_list.csv``.
    """
    # all data rows (the first row is dropped) and their label rows
    samples = RD.loadArray('train_test.csv', splitter=',')[1:]
    labelList = RD.loadArray('label_list.csv')

    # header: 'label' followed by one '<row>_<col>' name per pixel
    imgWidth = 64
    imgHeight = 64
    header = ['label'] + [str(r) + '_' + str(c)
                          for r in range(imgHeight)
                          for c in range(imgWidth)]

    trainingData, testData = [header], [header]
    trainingLabel, testLabel = [], []

    for position, sample in enumerate(samples):
        label = labelList[position]
        goesToTest = int(label[0]) >= 120

        # car: 0, bus: 1
        sample[0] = str(int(sample[0]) % 2)

        if goesToTest:
            testData.append(sample)
            testLabel.append(label)
        else:
            trainingData.append(sample)
            trainingLabel.append(label)

    RD.saveArray('train.csv', trainingData, splitter=',', saveSize=500)
    RD.saveArray('test.csv', testData, splitter=',', saveSize=500)
    RD.saveArray('trainLabels.csv', trainingLabel)
    RD.saveArray('testLabels.csv', testLabel)
def readMNISTData():
    """Load the four MNIST text files, print them, and return them.

    Returns:
        Tuple ``(train_input, train_output, test_input, test_output)`` of
        arrays built from ``mnist_train_input.txt``,
        ``mnist_train_output.txt``, ``mnist_test_input.txt`` and
        ``mnist_test_output.txt``.
    """
    def loadAsArray(name):
        return np.array(RD.loadArray(name))

    print('reading MNIST data...')

    # training data
    train_input = loadAsArray('mnist_train_input.txt')
    train_output = loadAsArray('mnist_train_output.txt')
    for shown in (train_input, train_output):
        print(shown)

    # test data
    test_input = loadAsArray('mnist_test_input.txt')
    test_output = loadAsArray('mnist_test_output.txt')
    for shown in (test_input, test_output):
        print(shown)

    return (train_input, train_output, test_input, test_output)
def useTestOutput(fn, threshold):
    """Turn a test-output file into the submission file ``to_submit.txt``.

    The first column of every row in ``fn`` is read as a float. Without a
    threshold the value is written unchanged; with one, values below the
    threshold become ``0`` and all others become ``1``.

    Args:
        fn: name of the test output file to read.
        threshold: cut-off for binarising, or ``None`` to keep raw values.
    """
    values = [float(row[0]) for row in RD.loadArray(fn)]

    if threshold is None:
        finalResult = [[value] for value in values]
    else:
        finalResult = [[0] if value < threshold else [1] for value in values]

    RD.saveArray('to_submit.txt', finalResult)
def compare(finalResult, validName, validationCol, trainValid_validRows):
    """Compare predictions with validation values and report MAE / MSE.

    Prediction ``i`` is compared with column ``validationCol`` of row
    ``trainValid_validRows[i]`` in the file ``validName``. The per-row
    values and the two error figures are printed and written to
    ``result_valid.txt``.

    The validation values are converted to Python floats up front. Writing
    floats back into a fixed-width NumPy string array, as the previous
    version did, can truncate the text and corrupt the value.

    Args:
        finalResult: predictions; every element is converted to ``float``
            in place, which callers can observe.
        validName: name of the file holding the validation data.
        validationCol: index of the column holding the true value.
        trainValid_validRows: row index in the validation file for each
            prediction; must have the same length as ``finalResult``.

    Raises:
        AssertionError: if the two sequences differ in length.
    """
    dataLen = len(finalResult)
    assert (dataLen == len(trainValid_validRows))  # one row index per prediction

    # true values, as plain floats
    validData = RD.loadArray(validName)
    validResult = [float(validData[rowIndex][validationCol])
                   for rowIndex in trainValid_validRows]

    reportLines = []
    absErrorSum = 0.0
    squaredErrorSum = 0.0

    for i in range(dataLen):
        predicted = float(finalResult[i])
        finalResult[i] = predicted  # in-place conversion kept for callers

        error = predicted - validResult[i]
        absErrorSum += abs(error)
        squaredErrorSum += pow(error, 2)

        reportLines.append('pred=' + str(predicted) +
                           ' val=' + str(validResult[i]) + '\n')

    # no data -> the averages are undefined; report NaN instead of crashing
    MAE = absErrorSum / dataLen if dataLen > 0 else float('nan')
    MSE = squaredErrorSum / dataLen if dataLen > 0 else float('nan')

    # print and write result
    print('\n **** validation result ****')
    print('MAE=' + str(MAE))
    print('MSE=' + str(MSE))

    with open('result_valid.txt', 'w') as f:
        f.write(''.join(reportLines) +
                '\nMAE = ' + str(MAE) + ', MSE = ' + str(MSE))
import sys
sys.path.insert(0, '../../AI_BASE')
import math
import numpy as np
import readData as RD

if __name__ == '__main__':

    # number of 'bert_valid_result_count_<i>.txt' files to average
    count = 1

    # running sum of column 5 (kept 2-D via the 5:6 slice) over all files
    answer_array = None
    for fileIndex in range(count):
        fileName = 'bert_valid_result_count_' + str(fileIndex) + '.txt'
        column = np.array(RD.loadArray(fileName))[:, 5:6].astype(float)

        if answer_array is None:
            answer_array = column
        else:
            answer_array += column

    # sum -> mean
    answer_array /= count

    RD.saveArray('final_answer.txt', answer_array)
# convert : 0 ~ 255 -> -1 ~ 2 imgArr = np.round_(3 * (imgArr / 255) - 1, 2) inputArr.append(imgArr) if __name__ == '__main__': # SUBMISSION # prediction of 'survived' column # rows train_rows = 61578 test_rows = 79975 valid_rate = float(RD.loadArray('val_rate.txt')[0][0]) # directories train_input_dir = 'images_training_rev1/images_training_rev1' train_output_file = 'training_solutions_rev1/training_solutions_rev1.csv' test_input_dir = 'images_test_rev1/images_test_rev1' # set data to validate (array 'isValid') isValid = [] for i in range(train_rows): isValid.append(False) count = 0 while count < train_rows * valid_rate: rand = random.randint(0, train_rows - 1)
text_models.append(text_model) # train / test result array train_result = [[0 for j in range(6)] for i in range(rows_to_train)] test_result = [[0 for j in range(6)] for i in range(rows_to_test)] # train / test max length # for donorschoose-application-screening, # max_length_train = 132, 2183, 818, 387, 224, 234 # max_length_test = 159, 2583, 993, 245, 163, 107 # max_length = 159, 2583, 993, 387, 224, 234 try: max_lengths_train = RD.loadArray('bert_max_lengths_train.txt')[0] max_lengths_test = RD.loadArray('bert_max_lengths_test.txt')[0] max_lengths = RD.loadArray('bert_max_lengths.txt')[0] except: max_lengths_train = [] max_lengths_test = [] max_lengths = [] for i in range(6): print('checking max length : ' + str(i)) max_lengths_train.append( convertForBert(data_to_train[i], None, print_interval, tokenizer, None)) max_lengths_test.append(
# train_converted.csv -> train_input.txt, train_output_0.txt, and train_output_1.txt # test_converted.csv -> test_input.txt # -> my_extract_result.txt if __name__ == '__main__': train_rows = 2400 test_rows = 600 input_cols_cat = 1 input_cols_cont = 29 result = '' # TRAIN train_data = RD.loadArray('train_converted.csv', ',') train_input = np.array(train_data)[1:, 1:31] train_output0 = np.array(train_data)[1:, 31:32] # formation_energy_ev_natom train_output1 = np.array(train_data)[1:, 32:33] # bandgap_energy_ev # AVG and STD for each column of CONTINUOUS training input cont_avgs = [] cont_stds = [] # categorical for i in range(input_cols_cat): cont_avgs.append(-1) cont_stds.append(-1) # continuous
final.close() if __name__ == '__main__': use_n_sub = True # use n_sub mode size = 25 # the number of rows/columns in each input data inputLength = size * size # length of input vector # read array from final_X.csv # when using n_sub mode if use_n_sub == True: try: sub0 = RD.loadArray('final_0.csv', ',') sub1 = RD.loadArray('final_1.csv', ',') sub2 = RD.loadArray('final_2.csv', ',') sub3 = RD.loadArray('final_3.csv', ',') sub4 = RD.loadArray('final_4.csv', ',') except: weight = [1] readTestOutput('test_id_sub_0.txt', inputLength, 'test_output_n_sub_0.txt', 'final_0.csv', weight) readTestOutput('test_id_sub_1.txt', inputLength, 'test_output_n_sub_1.txt', 'final_1.csv', weight) readTestOutput('test_id_sub_2.txt', inputLength, 'test_output_n_sub_2.txt', 'final_2.csv', weight) readTestOutput('test_id_sub_3.txt', inputLength, 'test_output_n_sub_3.txt', 'final_3.csv', weight) readTestOutput('test_id_sub_4.txt', inputLength,
import sys sys.path.insert(0, '../../AI_BASE') import readData as RD import numpy as np if __name__ == '__main__': train_input_fn = 'train_input.txt' train_output_fn = 'train_output.txt' augment_test = False # read training input and output train_input = RD.loadArray(train_input_fn) train_output = RD.loadArray(train_output_fn) train_rows = len(train_input) print(' ==== before augmentation ====') print(np.shape(train_input)) print(np.array(train_input)) print('') print(np.shape(train_output)) print(np.array(train_output)) # augment training input and output for i in range(train_rows): train_input.append(train_input[i][::-1]) if augment_test == True: train_output.append(train_output[i][::-1]) else: train_output.append(train_output[i])
def _filesReadable(names):
    """Return True when every file in ``names`` can be opened for reading."""
    for name in names:
        try:
            with open(name, 'r'):
                pass
        except OSError:
            return False
    return True


def readAllSubs(size):
    """Create the id/input/output files and their per-delta splits.

    ``train.csv`` and ``test.csv`` are always loaded. Two steps follow, and
    each is skipped when all of its result files can already be opened:

    1. Column extraction. According to the original comments the training
       file is laid out as ``id, delta, start1..start<size*size>,
       stop1..stop<size*size>`` and the test file as
       ``id, delta, stop1..stop<size*size>``:
         * ``train_id.txt``     : columns 0-1 (id, delta)
         * ``train_input.txt``  : delta + the ``stop`` columns
         * ``train_output.txt`` : delta + the ``start`` columns
         * ``test_id.txt``      : columns 0-1 (id, delta)
         * ``test_input.txt``   : delta + the ``stop`` columns
    2. Splitting each of those five files with ``RD.splitArray`` on the
       delta column (1 for the id files, 0 for the others), using the
       delta order 1..5, which yields the ``*_sub_0..4.txt`` files.

    Args:
        size: number of rows/columns of one board; a board has
            ``size * size`` cells.
    """
    # read train and test data
    train = RD.loadArray('train.csv', ',')
    test = RD.loadArray('test.csv', ',')

    cells = size * size

    # step 1: id / input / output files
    baseFiles = ['train_id.txt', 'train_input.txt', 'train_output.txt',
                 'test_id.txt', 'test_input.txt']

    if not _filesReadable(baseFiles):
        # convert once instead of once per slice
        trainArray = np.array(train)
        testArray = np.array(test)

        RD.saveArray('train_id.txt', trainArray[:, 0:2])
        RD.saveArray(
            'train_input.txt',
            np.concatenate([trainArray[:, 1:2],
                            trainArray[:, cells + 2:2 * cells + 2]],
                           axis=1))
        RD.saveArray('train_output.txt', trainArray[:, 1:cells + 2])

        RD.saveArray('test_id.txt', testArray[:, 0:2])
        RD.saveArray('test_input.txt', testArray[:, 1:cells + 2])

    # step 2: per-delta split files
    prefixes = ['train_id_sub_', 'train_input_sub_', 'train_output_sub_',
                'test_id_sub_', 'test_input_sub_']
    subFiles = [prefix + str(i) + '.txt'
                for i in range(5)
                for prefix in prefixes]

    if not _filesReadable(subFiles):
        deltaOrder = [[1], [2], [3], [4], [5]]  # order of delta (1, 2, 3, 4, 5)

        # <name>_sub_X.txt holds the rows of <name>.txt for one delta value
        RD.splitArray('train_id.txt', [1], deltaOrder, True)
        RD.splitArray('train_input.txt', [0], deltaOrder, True)
        RD.splitArray('train_output.txt', [0], deltaOrder, True)
        RD.splitArray('test_id.txt', [1], deltaOrder, True)
        RD.splitArray('test_input.txt', [0], deltaOrder, True)
# meta info TRI = 'train_input.txt' TRO = 'train_output.txt' TEI = 'test_input.txt' TEO = ['test_output.txt'] TE_real = None TE_report = 'report_test.txt' VAL_rate = 0.0 VAL_report = 'report_val.txt' modelConfig = 'model_config.txt' # load array print('loading training input...') TRI_array = RD.loadArray(TRI, '\t', UTF8=False, type_='f') print('loading training output...') TRO_array = RD.loadArray(TRO, '\t', UTF8=False, type_='f') print('loading test input...') TEI_array = RD.loadArray(TEI, '\t', UTF8=False, type_='f') # user data deviceName = input('device name (for example, cpu:0 or gpu:0)') epoch = int(input('epoch')) printed = int(input('printed? (0 -> do not print)')) # print mode if VAL_rate > 0.0: print('VALIDATION mode')
import used_model if __name__ == '__main__': import warnings warnings.filterwarnings('ignore') warnings.filterwarnings('always') # training input and test input: # NORMALIZED using avg and stddev of each training input column # meta info TE_real = None TE_report = 'report_test.txt' VAL_rate = float(RD.loadArray('val_rate.txt')[0][0]) VAL_report = 'report_val.txt' modelConfig = 'model_config.txt' augmented = False if augmented == True: TRI = 'train_input_augmented.txt' TRO = 'train_output_augmented.txt' else: TRI = 'train_input.txt' TRO = 'train_output.txt' TEI = 'test_input.txt' TEO = 'test_predict.txt' # user data deviceName = input('device name (for example, cpu:0 or gpu:0)')
# open sample_submission.csv refer to # https://stackoverflow.com/questions/53410490/i-am-getting-an-error-as-name-while-opening-csv-file-in-excel-2016 import numpy as np import sys sys.path.insert(0, '../../AI_BASE') import readData as RD if __name__ == '__main__': # final result finalResult = 'Id,Votes\n' sampleSub = RD.loadArray('sample_submission.csv', ',') results = 22956 times = 16 algorithm = 'deepLearning' # avg and std for votes avgs_and_stds = RD.loadArray('train_output_avg_and_std.txt') print(avgs_and_stds) # sum of prediction of useful votes for each review final_sum = [] # read file for count in range(times): print('count = ' + str(count))
def _dropColumn(df_array, fcols, originalFcols, name, label):
    """Remove column ``name`` from ``fcols`` (in place) and from ``df_array``.

    Failures of either step are reported on stdout with ``label`` as the
    message prefix and otherwise ignored. Returns the (possibly new) array.
    """
    try:
        fcols.remove(name)
    except Exception:
        print(label + ' remove error (0) : ' + str(name))

    # NOTE(review): the position is looked up in the ORIGINAL column list
    # while df_array may already have lost columns - confirm that getIndex
    # accounts for this, otherwise later removals hit the wrong column.
    try:
        df_array = np.delete(df_array, getIndex(name, originalFcols), 1)
    except Exception:
        print(label + ' remove error (1) : ' + str(name))

    return df_array


def _asFloatIfPossible(frame):
    """Return ``frame`` converted to float, or unchanged if that fails."""
    try:
        return frame.astype(float)
    except Exception:
        return frame


def makeDataFrame(fn, validExcept, rows, ftype, fcols, isTrain, target,
                  exceptCols, useLog, logConstant):
    """Load a data file into a DataFrame of data columns (+ target column).

    For ``ftype == 'txt'`` the file is read with ``RD.loadArray`` and the
    columns named in ``validExcept``, ``exceptCols`` and (for test data)
    ``target`` are removed before the frame is built; ``fcols`` is modified
    in place by these removals. For ``'json'`` / ``'csv'`` the file is
    loaded as-is (the original marks both paths as untested).

    Changes from the previous version: ``useLog=True`` no longer fails with
    UnboundLocalError on the undefined ``frequentWords`` (the 'CT_' columns
    derived from it were skipped by the log step anyway), ``fcols=None`` is
    accepted, and an unknown ``ftype`` is rejected explicitly.

    Args:
        fn: name of the file to load.
        validExcept: column names to drop (txt only), or ``None``.
        rows: positions of the rows to keep (``iloc``), or ``None`` for all.
        ftype: ``'json'``, ``'csv'`` or ``'txt'``.
        fcols: column names of the txt file, or ``None`` to generate
            ``col0, col1, ...``.
        isTrain: whether the data contains the target column.
        target: name of the target column.
        exceptCols: column names to leave out.
        useLog: apply ``log2(max(0, x) + logConstant)`` to every extracted
            column except the target and columns starting with ``'CT_'``.
        logConstant: constant added before taking the logarithm.

    Returns:
        ``(dataSetDF, targetCol)``. ``targetCol`` is the position of the
        target column inside ``dataSetDF`` once it was found, the target
        name if training data lacks it, and ``-1`` for test data.

    Raises:
        ValueError: if ``ftype`` is not one of the supported types.
    """
    print('')
    print('+============================+')
    print('| Function : makeDataFrame |')
    print('+============================+')

    # column names before any removal (fcols itself is edited below)
    originalFcols = [] if fcols is None else list(fcols)

    if ftype == 'json':
        # NOT TESTED FOR THIS CASE - SO THERE MAY BE SOME ERRORS
        with open(fn, 'r') as jf:
            df_loaded = json.load(jf)
        df_data = pd.DataFrame(df_loaded)

    elif ftype == 'csv':
        # NOT TESTED FOR THIS CASE - SO THERE MAY BE SOME ERRORS
        df_data = pd.read_csv(fn)

    elif ftype == 'txt':
        df_array = RD.loadArray(fn)

        print('\n<<< [before] fcols >>>')
        print(fcols)
        print('\n<<< [before] data array [0:5] >>>')
        print(df_array[:5])

        # columns used in training but not in validation
        if validExcept is not None:
            for name in validExcept:
                df_array = _dropColumn(df_array, fcols, originalFcols,
                                       name, 'validExcept')

        # test data carries no target column
        if isTrain == False:
            df_array = _dropColumn(df_array, fcols, originalFcols,
                                   target, 'target')

        # explicitly excluded columns
        for exceptCol in exceptCols:
            df_array = _dropColumn(df_array, fcols, originalFcols,
                                   exceptCol, 'exceptCol')

        print('\n<<< [after] fcols >>>')
        print(fcols)
        print('\n<<< [after] data array [0:5] >>>')
        print(df_array[:5])

        # make dataframe using df_array
        if fcols is not None:
            df_data = pd.DataFrame(data=df_array, columns=fcols)
        else:
            cols = ['col' + str(i) for i in range(len(df_array[0]))]
            df_data = pd.DataFrame(data=df_array, columns=cols)

    else:
        raise ValueError('unsupported ftype: ' + str(ftype))

    targetCol = -1  # index of target column

    # rebuild the frame from its raw values, keeping the column names
    dataCols = np.array(df_data.columns)
    df_data = pd.DataFrame(df_data.to_numpy(), columns=dataCols)

    print('\n<<< [0] df_data.shape >>>')
    print('columns : ' + str(df_data.columns))
    print('shape : ' + str(df_data.shape))

    if isTrain == True:
        targetCol = target  # target column name until its index is known

    dataPart = []        # data columns
    extractCols = []     # extracted columns = data columns + target column
    extractColInfo = []  # 'data' or 'target' for each extracted column

    for col in dataCols:
        if col in exceptCols:
            continue

        if isTrain == True and col == targetCol:
            extractCols.append(col)
            extractColInfo.append('target')
            # from here on targetCol is the index of the target column
            targetCol = len(extractCols) - 1
        else:
            dataPart.append(col)
            extractCols.append(col)
            extractColInfo.append('data')

    print('\n<<< [1] dataPart and extractCols >>>')
    for name in dataPart:
        print(name)
    print('')
    for name, info in zip(extractCols, extractColInfo):
        print(name + ' : ' + info)

    dataSetDF = _asFloatIfPossible(df_data[extractCols])

    print('\n<<< [2] dataSetDF >>>')
    print(dataSetDF)

    # using log: x -> log2(max(0, x) + logConstant)
    if useLog == True:
        for col in extractCols:
            if col == target or col[:3] == 'CT_':
                continue  # except for target column and CT_ columns
            for i in range(len(dataSetDF)):
                dataSetDF.at[i, col] = math.log(
                    max(0, dataSetDF.at[i, col]) + logConstant, 2)

        print('\n<<< [2-3] dataSetDF log applied >>>')
        print(dataSetDF)

    dataSetDF = _asFloatIfPossible(dataSetDF)

    print('\n<<< [2-4] dataSetDF original >>>')
    print(dataSetDF)

    # remove rows not included in 'rows'
    if rows is not None:
        dataSetDF = dataSetDF.iloc[rows]

    print('\n<<< [2-5] dataSetDF after row extraction >>>')
    print(dataSetDF)

    print('')
    print('+========================+')
    print('| Exit : makeDataFrame |')
    print('+========================+')

    return (dataSetDF, targetCol)
def valid(fn, thresholdList, size, n, modelName, validRate, use_n_sub):
    """Validate a model by applying it repeatedly, for every delta.

    Steps:
      1. Read the IDs to validate for delta=1 from the validation report
         ``fn``. Every line except the last 9 is expected to start with
         ``[<id>]``.
      2. For delta=2..5 pick the same number of distinct rows at random.
         Global IDs are ``row + 625000 * (delta - 1)``.
      3. Run ``DL.getTestResult(modelName, ...)`` on each selected input
         and undo the sigmoid with ``helper.invSigmoid``.
      4. For each threshold, count the cells where the thresholded output
         disagrees with the training output (0/1) and average the count.
      5. Write ``<fn stem>_repeatDelta.txt`` (loss per threshold) and
         ``<fn stem>_repeatDelta_compare.txt`` (one line per validated row:
         delta, row id, model output, training output).

    Fixes over the previous version, which could not run: the undefined
    ``idToValidate`` list, the selection mask filled into the wrong list,
    ``copy.deepCopy``, the delta offset added twice to random IDs (and
    used to index the per-delta data), ``range(thresholdList)``, training
    outputs compared without numeric conversion, and compare records
    written without a line break.

    Args:
        fn: name of the validation report to read.
        thresholdList: thresholds to evaluate.
        size: not used by this function.
        n: not used by this function.
        modelName: name of the model passed to ``DL.getTestResult``.
        validRate: not used by this function.
        use_n_sub: read the ``*_n_sub_*`` files instead of ``*_sub_*``.

    Raises:
        ValueError: if more rows are requested per delta than exist.
    """
    ### read ID to validate, from the validation report
    with open(fn, 'r') as report:
        reportRows = report.readlines()

    validCount = len(reportRows) - 9  # the last 9 lines hold no IDs
    id0ToValidate = [int(reportRows[i].split(']')[0][1:])
                     for i in range(validCount)]

    # load ID and training input/output file
    print('<00> loading ID files...')
    print('use_n_sub:', use_n_sub)

    # NOTE(review): the ID files are loaded but never read afterwards;
    # kept so the function still fails early when one is missing.
    trainIds = []
    for k in range(5):
        trainIds.append(RD.loadArray('train_id_sub_' + str(k) + '.txt'))
        print('id ' + str(k) + ' finished')

    print('<01> loading training input/output files...')
    mode = 'n_sub' if use_n_sub == True else 'sub'

    trainInputData = []
    for k in range(5):
        trainInputData.append(
            RD.loadArray('train_input_' + mode + '_' + str(k) + '.txt'))
        print(mode + ' ' + str(k) + ' input finished')

    trainOutputData = []
    for k in range(5):
        trainOutputData.append(
            RD.loadArray('train_output_' + mode + '_' + str(k) + '.txt'))
        print(mode + ' ' + str(k) + ' output finished')

    # delta=1: rows named by the report
    print('<02> extracting data to validate, from training input when delta=1...')
    idToValidate = list(id0ToValidate)  # global IDs of all rows to validate
    inputDataToValidate = [trainInputData[0][rowId] for rowId in idToValidate]

    # delta=2..5: the same number of distinct random rows each
    print('<03> randomly select data to validate, from training input when delta=2~5...')
    totalCount = 25 * 25 * 1000  # rows per delta

    if validCount > totalCount:
        raise ValueError('cannot select ' + str(validCount) +
                         ' distinct rows out of ' + str(totalCount))

    for i in range(1, 5):
        count = 0
        alreadyChosen = [False] * totalCount

        while count < validCount:
            if count % 100 == 0:
                print(count, '/', validCount)

            rand = random.randint(i * totalCount, (i + 1) * totalCount - 1)
            localRow = rand % totalCount

            if alreadyChosen[localRow] == False:
                alreadyChosen[localRow] = True

                # rand already carries the delta offset.
                # assumes every per-delta file holds totalCount rows - TODO confirm
                idToValidate.append(rand)
                inputDataToValidate.append(trainInputData[i][localRow])
                count += 1

    ### validate (get output for validation input) using model of modelName
    print('<04> validate when delta=1~5...')
    inputDataToValidate = np.array(inputDataToValidate).astype('float')
    rows = len(inputDataToValidate)

    validOutputs = []
    for i in range(rows):
        if i % 1000 == 0:
            print(i, '/', rows)

        # NOTE(review): this is 0 for delta=1 rows, so the model is not
        # applied to them at all - confirm whether delta + 1 passes were
        # intended.
        delta = idToValidate[i] // totalCount

        # start from the input and feed each result back in
        validOutput = copy.deepcopy(inputDataToValidate[i])

        for _ in range(delta):
            validOutput = DL.getTestResult(modelName, validOutput, 0)

            # inverse sigmoid.
            # NOTE(review): placed inside the pass so the next pass gets
            # un-squashed values; treats the result as 2-D - confirm.
            for r in range(len(validOutput)):
                for c in range(len(validOutput[0])):
                    validOutput[r][c] = helper.invSigmoid(validOutput[r][c])

        validOutputs.append(validOutput)

    # (length of idToValidate)
    # = (length of inputDataToValidate)
    # = (length of validOutputs)

    ### compute loss (binary)
    print('<05> comparing the result with corresponding training output data...')
    avgLoss = []
    elementsInEachRow = len(validOutputs[0])

    for thr in thresholdList:
        print('threshold =', thr)

        sumLossForThr = 0
        for i in range(rows):
            delta = idToValidate[i] // totalCount + 1
            ID = idToValidate[i] % totalCount
            TO = trainOutputData[delta - 1][ID]

            for j in range(elementsInEachRow):
                expected = float(TO[j])  # file values may be text
                if validOutputs[i][j] >= thr and expected == 0:
                    sumLossForThr += 1
                elif validOutputs[i][j] < thr and expected == 1:
                    sumLossForThr += 1

        avgLoss.append(sumLossForThr / (rows * elementsInEachRow))

    ### write validation report (report.txt -> report_repeatDelta.txt)
    print('<06> writing validation report...')
    report_fn = fn.split('.')[0] + '_repeatDelta.txt'
    with open(report_fn, 'w') as rf:
        rf.write(''.join(
            '[thr = ' + str(thr) + '] loss=' + str(loss) + '\n'
            for thr, loss in zip(thresholdList, avgLoss)))

    ### write (validation output) + (actual training output)
    print('<07> writing validation and actual training output...')
    compare_fn = fn.split('.')[0] + '_repeatDelta_compare.txt'
    compareLines = []
    for i in range(rows):
        delta = idToValidate[i] // totalCount + 1
        ID = idToValidate[i] % totalCount
        compareLines.append(
            str(delta) + '\t' + str(ID) + '\t' +
            str(np.array(validOutputs[i])) + '|\t' +
            str(np.array(trainOutputData[delta - 1][ID])) + '\n')

    with open(compare_fn, 'w') as cf:
        cf.write(''.join(compareLines))
i) + '.txt' # file to make modelConfig = 'model_n_sub_' + str(i) + '.txt' # file to make modelName = 'model_n_sub_' + str(i) # model name else: # do not use n-sub mode ( -> use normal mode) trainIName = 'train_input_sub_' + str(i) + '.txt' trainOName = 'train_output_sub_' + str(i) + '.txt' testIName = 'test_input_sub_' + str(i) + '.txt' testOName = 'test_output_sub_' + str(i) + '.txt' # file to make testReport = 'test_report_sub_' + str(i) + '.txt' # file to make validReport = 'valid_report_sub_' + str(i) + '.txt' # file to make modelConfig = 'model_sub_' + str(i) + '.txt' # file to make modelName = 'model_sub_' + str(i) # model name # load arrays (no difference between normal and n-sub mode) train_id_list = RD.loadArray('train_id_sub_' + str(i) + '.txt') trainI_array = RD.loadArray(trainIName) trainO_array = RD.loadArray(trainOName) if validRate == 0.0: # for test mode test_id_list = RD.loadArray('test_id_sub_' + str(i) + '.txt') testI_array = RD.loadArray(testIName) # print training and test array (consider n-sub mode) if verbose == True: for j in range(5): trainI_ = np.array(trainI_array)[j] trainO_ = np.array(trainO_array)[j] if validRate == 0.0: testI_ = np.array(testI_array)[j] # for test mode
import sys
sys.path.insert(0, '../../AI_BASE')
import readData as RD
import numpy as np

if __name__ == '__main__':

    # first column of every row in test_output.txt, shifted by +8.0
    shifted = [[float(row[0]) + 8.0] for row in RD.loadArray('test_output.txt')]

    # write file
    RD.saveArray('to_submit.txt', shifted)
import sys sys.path.insert(0, '../../AI_BASE') import readData as RD import numpy as np if __name__ == '__main__': train_rows = 300000 test_rows = 200000 input_cols_cat = 19 input_cols_cont = 11 result = '' # TRAIN train_data = RD.loadArray('train.csv', ',') train_input = np.array(train_data)[1:, 1:31] train_output = np.array(train_data)[1:, 31:] # AVG and STD for each column of CONTINUOUS training input cont_avgs = [] cont_stds = [] # categorical for i in range(input_cols_cat): cont_avgs.append(-1) cont_stds.append(-1) # continuous for i in range(input_cols_cat, input_cols_cat + input_cols_cont): thisCol = train_input[:, i].astype(float)
if __name__ == '__main__': files = 64 # [result0, result1, ...] # where each element resultX is [1, 3, 4, 2, 9, 6, 5, 7, ...] for example finalResults = [] # sum of test results sumTestResults = [] # read file for i in range(files): testResult = np.array(RD.loadArray('test_output_' + str(i) + '.txt'))[:, :9].astype(float) if i == 0: sumTestResults = np.array(copy.deepcopy(list(testResult))) else: sumTestResults = sumTestResults + np.array( copy.deepcopy(list(testResult))) finalResult = getFinalResult(testResult) finalResults.append(finalResult) # save the sum of test results RD.saveArray('sumTestResults.txt', sumTestResults) # write final result # USE THE RIGHTMOST COLUMN OF to_submit.txt AS FINAL RESULT
def _firstRowByKey(table, limit):
    """Map column 0 of the first ``limit`` rows of ``table`` to its row.

    When a key occurs more than once, the first row wins.
    """
    lookup = {}
    for row in table[:max(limit, 0)]:
        lookup.setdefault(row[0], row)
    return lookup


def writeFinalInput(trainTest, rows):
    """Join review, user, business and check-in data into one input file.

    Reads ``yelp_<trainTest>_set_{business,checkin,review,user}.txt`` and
    builds one output row per review, in this column order:

      * review columns 2-4 (per the original comment: text, stars, date)
      * user columns 1-2 (review_count, average_stars), or ``0, 0``
      * business columns 1-5 (review_count, longitude, stars, latitude,
        open), or five zeros
      * check-in column 1 (checkin_info), or ``0``

    The zero fill-ins are used when the review's user / business ID is not
    found. Lookups use dictionaries built once per table, replacing a full
    scan of every table for every review. A duplicated ID now contributes
    once (first occurrence), so every row has the same width.

    The result goes to ``train_input.txt`` for ``trainTest == 'training'``
    and to ``test_input.txt`` for ``'test'``; nothing is written otherwise.

    Args:
        trainTest: ``'training'`` or ``'test'``; part of the file names.
        rows: number of rows to use from each table, in the order
            ``[business, checkin, review, user]``.
    """
    prefix = 'yelp_' + str(trainTest) + '_set_'
    business = RD.loadArray(prefix + 'business.txt')
    checkin = RD.loadArray(prefix + 'checkin.txt')
    review = RD.loadArray(prefix + 'review.txt')
    user = RD.loadArray(prefix + 'user.txt')

    businessById = _firstRowByKey(business, rows[0])
    checkinById = _firstRowByKey(checkin, rows[1])
    userById = _firstRowByKey(user, rows[3])

    finalInput = []

    for i in range(rows[2]):
        if i % 500 == 0:
            print('row : ' + str(i))

        thisReview = review[i]

        # review info
        # columns : ['user_id', 'business_id', 'text', 'stars', 'date']
        thisRow = [thisReview[j] for j in range(2, 5)]

        # user info
        # columns : ['user_id', 'review_count', 'average_stars']
        userRow = userById.get(thisReview[0])
        if userRow is None:
            thisRow.extend([0] * 2)
        else:
            thisRow.extend(userRow[k] for k in range(1, 3))

        # business info
        # columns : ['business_id', 'review_count', 'longitude', 'stars', 'latitude', 'open']
        businessid = thisReview[1]
        businessRow = businessById.get(businessid)
        if businessRow is None:
            thisRow.extend([0] * 5)
        else:
            thisRow.extend(businessRow[k] for k in range(1, 6))

        # checkin info
        # columns : ['business_id', 'checkin_info']
        checkinRow = checkinById.get(businessid)
        thisRow.append(0 if checkinRow is None else checkinRow[1])

        finalInput.append(thisRow)

    if trainTest == 'training':
        RD.saveArray('train_input.txt', finalInput)
    elif trainTest == 'test':
        RD.saveArray('test_input.txt', finalInput)
def makeData(delta, n, n_, size, limitLen, writeTestInput):
    """Turn every size x size grid into one flattened window per cell.

    Reads 'train_input_sub_<delta-1>.txt', 'train_output_sub_<delta-1>.txt'
    and 'test_input_sub_<delta-1>.txt'. Each row of these files is reshaped to
    a (size, size) grid and padded with wrap-around on all four sides. For
    every cell (j, k) of the original grid, the window that starts at (j, k)
    in the padded grid is flattened and stored as one row:

        * input windows are reshaped to n * n values,
        * output windows are reshaped to n_ * n_ values.

    Results are written to 'train_input_n_sub_<delta-1>.txt',
    'train_output_n_sub_<delta-1>.txt' and, when requested,
    'test_input_n_sub_<delta-1>.txt'.

    Args:
        delta: index of the sub-dataset; files use the suffix delta - 1.
        n: side length of the input window.
        n_: side length of the output window.
        size: side length of the grid stored in each data row.
        limitLen: maximum number of training rows to process.
        writeTestInput: when equal to True, the test input file is built too.

    Notes:
        The window taken is 2 * int((n - 1) / 2) + 1 cells wide, which equals
        n only for odd n; an even n (or n_) makes the reshape fail.
        The float conversion of the loaded arrays is done once before each
        loop instead of once per row; the produced data is unchanged.
    """
    # window size (number of padding cells on each side of the grid)
    ws = int((n - 1) / 2)  # for training/test input
    ws_ = int((n_ - 1) / 2)  # for training/test output

    def cellWindows(row, half, side):
        """Return the flattened window of every cell of one grid, row-major."""
        padded = np.pad(np.array(row).reshape(size, size),
                        ((half, half), (half, half)), 'wrap')
        span = 2 * half + 1
        return [list(padded[j:j + span, k:k + span].reshape(side * side))
                for j in range(size)
                for k in range(size)]

    suffix = '_sub_' + str(delta - 1) + '.txt'
    wantTest = (writeTestInput == True)  # same comparison as before, evaluated once

    # read data
    trainInput = RD.loadArray('train_input' + suffix)
    trainOutput = RD.loadArray('train_output' + suffix)
    testInput = RD.loadArray('test_input' + suffix)

    trainLen = min(len(trainInput), limitLen)
    testLen = len(testInput)

    # input data to make
    trainInputData = []

    # output data to make
    trainOutputData = []

    # test input data to make
    if wantTest:
        testInputData = []

    # trainInput and trainOutput as numeric type
    # (loop-invariant, so converted once; skipped when no row is processed)
    if trainLen > 0:
        trainInput = np.array(trainInput).astype('float')
        trainOutput = np.array(trainOutput).astype('float')

    # reshape training data
    for i in range(trainLen):
        if i % 10 == 0:
            print('makeData (training) : ' + str(i) + ' / ' + str(trainLen))

        # n*n input windows and n_*n_ output windows of this grid
        trainInputData.extend(cellWindows(trainInput[i], ws, n))
        trainOutputData.extend(cellWindows(trainOutput[i], ws_, n_))

    # reshape test data
    if wantTest:
        # testInput as numeric type (converted once, only if there are rows)
        if testLen > 0:
            testInput = np.array(testInput).astype('float')

        for i in range(testLen):
            if i % 10 == 0:
                print('makeData (test) : ' + str(i) + ' / ' + str(testLen))

            # n*n input windows of this grid
            testInputData.extend(cellWindows(testInput[i], ws, n))

    # save as file
    # [ADDED] saveSize=10000
    RD.saveArray('train_input_n' + suffix, trainInputData, saveSize=10000)
    RD.saveArray('train_output_n' + suffix, trainOutputData, saveSize=10000)

    if wantTest:
        RD.saveArray('test_input_n' + suffix, testInputData)
else: _ = open('final_input.txt', 'r') _.close() _ = open('final_output.txt', 'r') _.close() _ = open('final_input_test.txt', 'r') _.close() except: # using PCA if usePCA == True: team_info_pca = RD.loadArray('team_info_pca.txt', '\t') # TRAINING INPUT : using raw_result final_input = [] final_output = [] for i in range(len(raw_result)): team0 = raw_result[i][0] team1 = raw_result[i][1] season = raw_result[i][4] team0_ = team0 - N_start team1_ = team1 - N_start
# TRAIN AND TEST USING LIGHTGBM # execute deep learning TRI = 'final_input.txt' TRO = 'final_output.txt' TEI = 'final_input_test.txt' TEO = ['final_output_test.txt'] TE_real = None TE_report = 'report_test.txt' VAL_rate = 0.0 VAL_report = 'report_val.txt' # load array TRI_array = RD.loadArray(TRI, '\t') TRO_array = RD.loadArray(TRO, '\t') TEI_array = RD.loadArray(TEI, '\t') # create Pandas DataFrame # tv_input : test / validation input # tv_output : test / validation output (train_input, train_output, tv_input, tv_output) = create_dataframe(TRI_array, TRO_array, TEI_array, TEO, TE_report, VAL_rate, VAL_report) # convert to lightgbm dataset train_ds = lgb.Dataset(train_input, label=train_output) test_ds = lgb.Dataset(tv_input, label=tv_output) # set parameters
TEI = 'test_input.txt' # merge train input and output try: _ = open(TRIO, 'r') _.close() except: mergeTrain(TRI, TRO, TRIO) # K-means clustering finalResult = None trainName = 'train_IO.txt' testName = 'test_input.txt' ftype = 'txt' TRIO_array = RD.loadArray(TRIO, '\t') TEI_array = RD.loadArray(TEI, '\t') dfTrain = pd.DataFrame(TRIO_array) dfTest = pd.DataFrame(TEI_array) dfTestWeight = None caseWeight = False targetCol = 14 targetIndex = 14 k = 100 useAverage = True # execute algorithm AIBASE_KNN.kNN(dfTrain, dfTest, dfTestWeight, caseWeight,