def split_mode(args):
    # Pair up sorted audio files with their sorted timecode files and split each.
    # Assumes module-level: import os (and a split(audio, timecode, out_dir) helper
    # defined elsewhere in this module).
    audio_files = sorted(os.listdir(args.dir + '/audio'))
    time_files = sorted(os.listdir(args.dir + '/time'))
    for audio, timecode in zip(audio_files, time_files):
        abs_audio = os.path.abspath(args.dir + '/audio/' + audio)
        abs_timecode = os.path.abspath(args.dir + '/time/' + timecode)
        if os.path.exists(abs_audio) and os.path.exists(abs_timecode):
            split(abs_audio, abs_timecode, os.path.abspath(args.dir))
    print('************************')
    print('* Splitting completed! *')
    print('************************')
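# --- Usage sketch (hypothetical) ----------------------------------------------
# split_mode expects a directory with parallel audio/ and time/ subfolders whose
# sorted filenames pair up, e.g. ./recordings/audio/ep01.wav <-> ./recordings/time/ep01.txt.
# The split() used here is the audio/timecode splitter from this module, distinct
# from the dataset split() used by the KMM entry points below. The default value
# is an assumption; the real parser for this entry point is defined elsewhere.
def _split_mode_example():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dir', type=str, default='./recordings',
                        help='directory containing audio/ and time/ subfolders')
    split_mode(parser.parse_args())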
def main():
    # Assumed module-level imports for the KMM entry points in this section:
    # argparse, csv, os, pickle, time, numpy as np; `sc` is a SparkContext
    # created at module scope.
    parser = argparse.ArgumentParser()
    # parser.add_argument('-b', "--bagging", type=int, choices=[1,2,3,4], default=1, help="bagging strategy")
    parser.add_argument("-t", "--training", type=int, default=12000,
                        help="size of training data")
    parser.add_argument("-r", "--reverse", action="store_true",
                        help="set -t as the size of test data")
    # parser.add_argument("-s", "--tr_bsize", type=int, help="the sample size of train set")
    # parser.add_argument("-x", "--te_bsize", type=int, help="the sample size of test set")
    # parser.add_argument("-m", "--train_samples", type=int, help="number of samples from training")
    # parser.add_argument("-n", "--test_samples", type=int, help="number of samples from test")
    # parser.add_argument("-i", "--input", type=str, default='./dataset/powersupply.arff', help="default input file")
    parser.add_argument("-i", "--input", type=str,
                        default='/home/wzyCode/scalablelearning/dataset/kdd.arff',
                        help="default input file")
    # parser.add_argument("-o", "--output", type=str, default='/home/wzyCode/scalablelearning/nmseKMM.txt', help="default output file")
    # parser.add_argument("-o1", "--output1", type=str, default='/home/wzyCode/scalablelearning/nmseCenKmm.txt', help="default output file")
    # parser.add_argument("-o2", "--output2", type=str, default='/home/wzyCode/scalablelearning/nmseEnsKmm.txt', help="default output file")
    # parser.add_argument("-v", "--verbose", action="store_true", help="verbose mode")
    args = parser.parse_args()

    training_size = args.training  # training set size (small training set)
    reverse = args.reverse         # flip training to test (small test set)
    input_file = args.input        # input file path
    # sc = SparkContext()

    # Step 1: Generate biased train and test sets, as well as the original beta
    # for the train set, and persist them for later runs.
    start = time.time()
    train, train_beta, test = split(input_file, training_size, reverse)
    # trianBroad = sc.broadcast(train)
    # train_data = np.array(trianBroad.value)
    train_data = np.array(train)
    # testBoard = sc.broadcast(test)
    # test_data = np.array(testBoard.value)
    test_data = np.array(test)
    orig_beta_data = np.array(train_beta)
    np.savetxt("train_data.txt", train_data)
    np.savetxt("test_data.txt", test_data)
    np.savetxt("orig_beta_data.txt", orig_beta_data)
    end = time.time()
    split_time = end - start
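# --- Reload sketch --------------------------------------------------------------
# The np.savetxt calls above persist the split as plain-text matrices; a later run
# can reload them with the mirroring np.loadtxt and skip the timed split step.
def _reload_split_example():
    train_data = np.loadtxt("train_data.txt")
    test_data = np.loadtxt("test_data.txt")
    orig_beta_data = np.loadtxt("orig_beta_data.txt")
    return train_data, test_data, orig_beta_data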
def cenKmmProcess():
    parser = argparse.ArgumentParser()
    # Note: -i is joined with the dataset directory and '.arff' below, so it must
    # be a bare dataset name such as 'powersupply', not a path.
    parser.add_argument("-i", "--input", type=str, default='powersupply',
                        help="dataset name (without the .arff extension)")
    parser.add_argument("-t", "--training", type=int, default=12000,
                        help="size of training data")
    # parser.add_argument("-o", "--output", type=str, default='/home/wzyCode/scalablelearning/nmseCenKMM.txt', help="default output file")
    args = parser.parse_args()

    file_name = args.input
    training_size = args.training
    input_file = '/home/wzyCode/scalablelearning/dataset/' + file_name + '.arff'  # input file path
    # trainFileName = '/home/wzyCode/scalablelearning/input/' + file_name + '/' + str(training_size) + '_train.txt'
    # testFileName = '/home/wzyCode/scalablelearning/input/' + file_name + '/' + str(training_size) + '_test.txt'
    # betaFileName = '/home/wzyCode/scalablelearning/input/' + file_name + '/' + str(training_size) + '_beta.txt'
    # with open(trainFileName, 'rb') as f:
    #     train = pickle.load(f)
    # with open(testFileName, 'rb') as f:
    #     test = pickle.load(f)
    # with open(betaFileName, 'rb') as f:
    #     train_beta = pickle.load(f)
    train, train_beta, test = split(input_file, training_size)
    train_data = np.array(train)
    test_data = np.array(test)
    orig_beta_data = np.array(train_beta)

    # Step 1: Compute the estimated beta from CenKMM
    start = time.time()
    maxFeature = train_data.shape[1]
    gammab = computeKernelWidth(train_data)
    res = cenKmm(train_data, test_data, gammab, maxFeature)
    est_Cenbeta = res[0]
    end = time.time()
    compute_time_Cen = end - start

    # Step 2: Compute the NMSE between est_beta and orig_beta for CenKMM
    start = time.time()
    final_result_Cen = computeNMSE(est_Cenbeta, orig_beta_data)
    end = time.time()
    evaluateCen_time = end - start

    # Step 3: Statistics
    statisticsCen = "In CenKMM method, train_size=%i, test_size=%i\n" % \
                    (len(train_data), len(test_data))
    total_time = compute_time_Cen + evaluateCen_time
    time_info_Cen = "compute_time=%s, evaluate_time=%s, total_time=%s\n" % \
                    (compute_time_Cen, evaluateCen_time, total_time)
    print statisticsCen
    print time_info_Cen
    messageCen = "The final NMSE for CenKMM is : %s \n" % final_result_Cen
    print messageCen
    print "---------------------------------------------------------------------------------------------"

    output_file = '/home/wzyCode/scalablelearning/output/CenKMM/CenKMM_' + file_name \
                  + '_trainSize' + str(training_size) + '.txt'
    with open(output_file, 'a') as out:
        out.write(statisticsCen)
        out.write(time_info_Cen)
        out.write(messageCen)

    # Write to the CSV file; emit the header once, on the first (smallest) run of
    # the train-size sweep.
    csvFile = '/home/wzyCode/scalablelearning/output/CenKMM/CenKMM_' + file_name + '.csv'
    with open(csvFile, 'a+') as csvwrite:
        writer = csv.writer(csvwrite)
        if training_size == 100:
            writer.writerow(['train_size', 'accuracy', 'compute_time'])
        writer.writerow([training_size, final_result_Cen, compute_time_Cen])
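# --- NMSE sketch (assumption) ----------------------------------------------------
# computeNMSE is defined elsewhere in the repo. A plausible stand-in: mean squared
# error between estimated and true beta, normalized to be scale-free. The exact
# normalization used by the repo's implementation may differ.
def _computeNMSE_sketch(est_beta, orig_beta):
    est = np.asarray(est_beta, dtype=float)
    orig = np.asarray(orig_beta, dtype=float)
    return np.mean((est - orig) ** 2) / np.mean(orig ** 2)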
def ensKmmProcess():
    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--training", type=int, default=12000,
                        help="size of training data")
    # parser.add_argument("-x", "--te_bsize", type=int, help="the sample size of test set")
    parser.add_argument("-n", "--test_samples", type=int,
                        help="number of samples from test")
    # parser.add_argument("-o", "--operate", type=int, help="which experiment")
    parser.add_argument("-c", "--core", type=int, help="the number of cores")
    # Note: -i is joined with the dataset directory and '.arff' below, so it must
    # be a bare dataset name such as 'powersupply', not a path.
    parser.add_argument("-i", "--input", type=str, default='powersupply',
                        help="dataset name (without the .arff extension)")
    # parser.add_argument("-o", "--output", type=str, default='/home/wzyCode/scalablelearning/nmseEnsKmm.txt', help="default output file")
    args = parser.parse_args()

    training_size = args.training
    # tr_bsize = args.tr_bsize  # train bag size is dynamic by default; fixed if specified
    # te_bsize = 0              # test bag size is dynamic by default; fixed if specified
    # m = args.train_samples    # take m samples from training
    # o = args.operate
    n = args.test_samples       # take n samples from test
    # eta = args.eta            # eta value
    file_name = args.input
    numOfCores = args.core
    input_file = '/home/wzyCode/scalablelearning/dataset/' + file_name + '.arff'  # input file path
    base_output_file = '/home/wzyCode/scalablelearning/output/EnsKMM/3/' + file_name
    # trainFileName = '/home/wzyCode/scalablelearning/input/' + file_name + '/' + str(training_size) + '_train.txt'
    # testFileName = '/home/wzyCode/scalablelearning/input/' + file_name + '/' + str(training_size) + '_test.txt'
    # betaFileName = '/home/wzyCode/scalablelearning/input/' + file_name + '/' + str(training_size) + '_beta.txt'
    # with open(trainFileName, 'rb') as f:
    #     train = pickle.load(f)
    # with open(testFileName, 'rb') as f:
    #     test = pickle.load(f)
    # with open(betaFileName, 'rb') as f:
    #     train_beta = pickle.load(f)
    train, train_beta, test = split(input_file, training_size)
    train_data = np.array(train)
    test_data = np.array(test)
    orig_beta_data = np.array(train_beta)

    # m, tr_bsize = get_train_info(train_data, n, eta)
    training_size = len(train_data)
    testDataLength = len(test_data)
    te_bag_size = testDataLength / n
    te_bsizeValue = sc.broadcast(te_bag_size)

    # 1. Bagging: pair the single full train bag with each test partition
    start = time.time()
    tr_bag_size_ens = len(train_data)
    tr_bag_no_ens = 1
    te_bag_size_ens, te_bag_no_ens = get_size_no(test_data, 0, n)
    tr_n_ens = partition(train_data, part_size=tr_bag_size_ens, part_no=tr_bag_no_ens)
    te_n_ens = partition(test_data, part_size=te_bag_size_ens, part_no=te_bag_no_ens)
    # Broadcast the data so tasks ship only index arrays
    train_data_broad = sc.broadcast(train_data)
    test_data_broad = sc.broadcast(test_data)
    # bags_ens = cartesian(train_data, test_data, tr_n_ens, te_n_ens)
    bags_ens = cartesianVFKMM(tr_n_ens, te_n_ens)
    # numOfMaps = min(numOfCores, len(tr_n_ens) * len(te_n_ens))
    # rddEns = sc.parallelize(bags_ens, numOfMaps)
    rddEns = sc.parallelize(bags_ens, numOfCores)
    # print("Number of splits: ", rddEns.getNumPartitions())
    end = time.time()
    ens_bagging_time = end - start

    # 2. Compute beta
    start = time.time()
    # rddEns = rddEns.map(lambda (idx, tr, te): (len(idx), len(tr), len(te)))
    # print "rddEns", rddEns.take(5)
    # print "te_bsizeValue", te_bsizeValue.value
    # rddEns = rddEns.map(lambda (idx, tr, te): getEnsKmmBeta(idx, tr, te, te_bsizeValue.value)).flatMap(lambda x: x)
    # rddEns = rddEns.map(lambda (idx, tr, te): getEnsKmmBeta(idx, train_data_broad.value[tr], test_data_broad.value[te], te_bsizeValue.value)).flatMap(lambda x: x)
    rddEns = rddEns.map(lambda (idx, tr, te): computeBeta(
        idx, train_data_broad.value[tr], test_data_broad.value[te])).flatMap(
            lambda x: x)
    # Per-index mean of the beta estimates: fold into (sum, count), divide at the end
    rddEns = rddEns.aggregateByKey((0, 0),
                                   lambda a, b: (a[0] + b, a[1] + 1),
                                   lambda a, b: (a[0] + b[0], a[1] + b[1]))
    est_Ensbeta_map = rddEns.mapValues(lambda v: v[0] / v[1]).collectAsMap()
    est_Ensbeta_idx = est_Ensbeta_map.keys()
    end = time.time()
    compute_time_Ens = end - start

    # 3. Compute the NMSE between est_beta and orig_beta
    start = time.time()
    est_Ensbeta = [est_Ensbeta_map[x] for x in est_Ensbeta_idx]
    orig_beta = orig_beta_data[est_Ensbeta_idx]
    final_result_Ens = computeNMSE(est_Ensbeta, orig_beta)
    end = time.time()
    evaluateEns_time = end - start

    # 4. Statistics
    statisticsEns = "In EnsKMM method, train_size=%i, test_size=%i, tr_bag_size=%i, m=%i, te_bag_size=%i, n=%i\n" % \
                    (len(train_data), len(test_data), tr_bag_size_ens, tr_bag_no_ens,
                     te_bag_size_ens, te_bag_no_ens)
    total_time = ens_bagging_time + compute_time_Ens + evaluateEns_time
    time_info_Ens = "bagging_time=%s, compute_time=%s, evaluate_time=%s, total_time=%s\n" % \
                    (ens_bagging_time, compute_time_Ens, evaluateEns_time, total_time)
    print statisticsEns
    print time_info_Ens
    messageEns = "The final NMSE for EnsKMM is : %s \n" % final_result_Ens
    print messageEns

    # Write to the text file
    output_file = base_output_file + 'EnsKMM_K=' + str(n) + '_trainSize=' + str(training_size) + '.txt'
    ori_beta_val = [[i, orig_beta_data[i]] for i in est_Ensbeta_idx]
    est_beta_val = [[i, est_Ensbeta_map[i]] for i in est_Ensbeta_idx]
    with open(output_file, 'a') as out:
        out.write(statisticsEns)
        out.write(time_info_Ens)
        out.write(messageEns)
        # out.write("The ori beta value is:\n" + str(ori_beta_val) + '\n')
        # out.write("The est beta value is:\n" + str(est_beta_val) + '\n')

    # Write to the CSV file; emit the header once, on the first run of the cores sweep.
    csvFile = base_output_file + 'EnsKMM_trainSize&&K_1000&10.csv'
    with open(csvFile, 'a+') as csvwrite:
        writer = csv.writer(csvwrite)
        if numOfCores == 20:
            writer.writerow(['numOfCores', 'accuracy', 'bagging_time', 'compute_time'])
        writer.writerow([numOfCores, final_result_Ens, ens_bagging_time, compute_time_Ens])
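# --- aggregateByKey pattern -------------------------------------------------------
# The aggregateByKey step above computes a per-index mean: each bag contributes a
# beta estimate that is folded into a (sum, count) pair, and mapValues divides at
# the end. A minimal self-contained illustration of the same pattern:
def _per_key_mean_example(sc):
    pairs = sc.parallelize([(0, 1.0), (0, 3.0), (1, 2.0)])
    sums = pairs.aggregateByKey(
        (0.0, 0),                                  # zero value: (running sum, count)
        lambda acc, v: (acc[0] + v, acc[1] + 1),   # fold one value within a partition
        lambda a, b: (a[0] + b[0], a[1] + b[1]))   # merge accumulators across partitions
    return sums.mapValues(lambda s: s[0] / s[1]).collectAsMap()  # {0: 2.0, 1: 2.0}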
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-b', "--bagging", type=int, choices=[1, 2, 3, 4], default=1,
                        help="bagging strategy")
    parser.add_argument("-t", "--training", type=int, default=12000,
                        help="size of training data")
    parser.add_argument("-r", "--reverse", action="store_true",
                        help="set -t as the size of test data")
    parser.add_argument("-s", "--tr_bsize", type=int, help="the sample size of train set")
    parser.add_argument("-x", "--te_bsize", type=int, help="the sample size of test set")
    parser.add_argument("-m", "--train_samples", type=int, help="number of samples from training")
    parser.add_argument("-n", "--test_samples", type=int, help="number of samples from test")
    parser.add_argument("-i", "--input", type=str, default='./dataset/powersupply.arff',
                        help="default input file")
    parser.add_argument("-o", "--output", type=str, default='./nmse.txt',
                        help="default output file")
    parser.add_argument("-v", "--verbose", action="store_true", help="verbose mode")
    args = parser.parse_args()

    mode = args.bagging            # bagging strategy
    training_size = args.training  # training set size (small training set)
    reverse = args.reverse         # flip training to test (small test set)
    tr_bsize = args.tr_bsize       # train bag size is dynamic by default; fixed if specified
    te_bsize = args.te_bsize       # test bag size is dynamic by default; fixed if specified
    m = args.train_samples         # take m samples from training
    n = args.test_samples          # take n samples from test
    input_file = args.input        # input file path
    output_file = args.output      # output file path

    # Step 1: Generate biased train and test sets, as well as the original beta for the train set
    start = time.time()
    train, train_beta, test = split(input_file, training_size, reverse)
    train_data = np.array(train)
    test_data = np.array(test)
    orig_beta_data = np.array(train_beta)
    end = time.time()
    split_time = end - start

    # Step 2: Generate the bagging index using different bagging strategies
    start = time.time()
    # Bag the train and test data from the sampled index
    tr_bag_size, tr_bag_no = get_size_no(train_data, tr_bsize, m)
    te_bag_size, te_bag_no = get_size_no(test_data, te_bsize, n)
    if mode == 1:
        # if test is too big, provide x or n to partition the test set
        tr_n = bag(train_data, size=tr_bag_size, sample_no=tr_bag_no)
        te_n = partition(test_data, part_size=te_bag_size, part_no=te_bag_no)
    elif mode == 2:
        # if train is too big, provide s or m to partition the train set
        tr_n = partition(train_data, part_size=tr_bag_size, part_no=tr_bag_no)
        te_n = bag(test_data, size=te_bag_size, sample_no=te_bag_no)
    else:
        # random sample, no partition
        tr_n = bag(train_data, size=tr_bag_size, sample_no=tr_bag_no)
        te_n = bag(test_data, size=te_bag_size, sample_no=te_bag_no)
    if mode < 4:
        bags = cartesian(train_data, test_data, tr_n, te_n)
    else:
        bags = pair(train_data, test_data, tr_n, te_n,
                    sample_no=min(tr_bag_no, te_bag_no))
    rdd = sc.parallelize(bags)
    end = time.time()
    bagging_time = end - start

    # Step 3: Compute the estimated beta
    start = time.time()
    res = rdd.map(lambda (idx, tr, te): computeBeta(idx, tr, te)).flatMap(lambda x: x)
    # Per-index mean of the beta estimates: fold into (sum, count), divide at the end
    rdd1 = res.aggregateByKey((0, 0),
                              lambda a, b: (a[0] + b, a[1] + 1),
                              lambda a, b: (a[0] + b[0], a[1] + b[1]))
    est_beta_map = rdd1.mapValues(lambda v: v[0] / v[1]).collectAsMap()
    est_beta_idx = est_beta_map.keys()
    end = time.time()
    compute_time = end - start

    # Step 4: Compute the NMSE between est_beta and orig_beta
    start = time.time()
    est_beta = [est_beta_map[x] for x in est_beta_idx]
    orig_beta = orig_beta_data[est_beta_idx]
    final_result = computeNMSE(est_beta, orig_beta)
    end = time.time()
    evaluate_time = end - start

    # Statistics
    statistics = "mode=%s, train_size=%i, test_size=%i, tr_bag_size=%i, m=%i, te_bag_size=%i, n=%i\n" % \
                 (mode, len(train_data), len(test_data), tr_bag_size, tr_bag_no,
                  te_bag_size, te_bag_no)
    total_time = split_time + bagging_time + compute_time + evaluate_time
    time_info = "split_time=%s, bagging_time=%s, compute_time=%s, evaluate_time=%s, total_time=%s\n" % \
                (split_time, bagging_time, compute_time, evaluate_time, total_time)
    print statistics
    print time_info

    # Save the result into a text file
    message = "The final NMSE is : %s \n" % final_result
    print message
    with open(output_file, 'a') as out:
        out.write(statistics)
        out.write(time_info)
        out.write(message)
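# --- Entry point / invocation sketch ----------------------------------------------
# The script presumably ends with the standard guard; the spark-submit line is a
# hypothetical example (script name and master URL are assumptions):
#   spark-submit --master local[8] main.py -b 1 -t 12000 -n 10 \
#       -i ./dataset/powersupply.arff -o ./nmse.txt
# if __name__ == '__main__':
#     main()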
def main():
    parser = argparse.ArgumentParser()
    # parser.add_argument('-b', "--bagging", type=int, choices=[1,2,3,4], default=1, help="bagging strategy")
    parser.add_argument("-t", "--training", type=int, default=12000,
                        help="size of training data")
    parser.add_argument("-r", "--reverse", action="store_true",
                        help="set -t as the size of test data")
    # parser.add_argument("-s", "--tr_bsize", type=int, help="the sample size of train set")
    # parser.add_argument("-x", "--te_bsize", type=int, help="the sample size of test set")
    # parser.add_argument("-m", "--train_samples", type=int, help="number of samples from training")
    # parser.add_argument("-n", "--test_samples", type=int, help="number of samples from test")
    # parser.add_argument("-i", "--input", type=str, default='./dataset/powersupply.arff', help="default input file")
    # Note: -i is joined with the dataset directory and '.arff' below, so it must
    # be a bare dataset name such as 'kdd', not a full path.
    parser.add_argument("-i", "--input", type=str, default='kdd',
                        help="dataset name (without the .arff extension)")
    args = parser.parse_args()

    training_size = args.training  # training set size (small training set)
    reverse = args.reverse         # flip training to test (small test set)
    file_name = args.input
    input_file = '/home/wzyCode/scalablelearning/dataset/' + file_name + '.arff'  # input file path
    # print type(input_file)
    # sc = SparkContext()

    # Step 1: Generate biased train and test sets, as well as the original beta
    # for the train set, then pickle them so later runs can reload the split
    # instead of recomputing it.
    start = time.time()
    train, train_beta, test = split(input_file, training_size, reverse)
    # trianBroad = sc.broadcast(train)
    # train_data = np.array(trianBroad.value)
    # train_data = np.array(train)
    # testBoard = sc.broadcast(test)
    # test_data = np.array(testBoard.value)
    # test_data = np.array(test)
    # orig_beta_data = np.array(train_beta)
    fileName = '/home/wzyCode/scalablelearning/input/' + file_name + '/' + str(training_size) + '_train.txt'
    with open(fileName, 'wb') as f:
        pickle.dump(train, f)
    with open('/home/wzyCode/scalablelearning/input/' + file_name + '/'
              + str(training_size) + '_test.txt', 'wb') as f:
        pickle.dump(test, f)
    with open('/home/wzyCode/scalablelearning/input/' + file_name + '/'
              + str(training_size) + '_beta.txt', 'wb') as f:
        pickle.dump(train_beta, f)
    # np.savetxt('/home/wzyCode/scalablelearning/input/' + file_name + '/' + str(training_size) + '_train.txt', train_data)
    # np.savetxt('/home/wzyCode/scalablelearning/input/' + file_name + '/' + str(training_size) + '_test.txt', test_data)
    # np.savetxt('/home/wzyCode/scalablelearning/input/' + file_name + '/' + str(training_size) + '_beta.txt', orig_beta_data)
    end = time.time()
    split_time = end - start
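# --- Pickle reload sketch ----------------------------------------------------------
# The pickled splits mirror the commented-out loading code in cenKmmProcess and
# kmmProcess; reading one split back follows the same path pattern:
def _reload_pickled_split(file_name='kdd', training_size=12000):
    base = '/home/wzyCode/scalablelearning/input/' + file_name + '/' + str(training_size)
    with open(base + '_train.txt', 'rb') as f:
        train = pickle.load(f)
    with open(base + '_test.txt', 'rb') as f:
        test = pickle.load(f)
    with open(base + '_beta.txt', 'rb') as f:
        train_beta = pickle.load(f)
    return train, train_beta, test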
def kmmProcess():
    parser = argparse.ArgumentParser()
    parser.add_argument('-b', "--bagging", type=int, choices=[1, 2, 3, 4], default=1,
                        help="bagging strategy")
    parser.add_argument("-t", "--training", type=int, default=12000,
                        help="size of training data")
    # parser.add_argument("-s", "--tr_bsize", type=int, help="the sample size of train set")
    parser.add_argument("-x", "--te_bsize", type=int, help="the sample size of test set")
    # parser.add_argument("-m", "--train_samples", type=int, help="number of samples from training")
    parser.add_argument("-n", "--test_samples", type=int, help="number of samples from test")
    parser.add_argument("-e", "--eta", type=float, help="the eta value")
    # Note: -i is joined with the dataset directory and '.arff' below, so it must
    # be a bare dataset name such as 'kdd', not a full path.
    parser.add_argument("-i", "--input", type=str, default='kdd',
                        help="dataset name (without the .arff extension)")
    parser.add_argument("-o", "--operate", type=int, help="which experiment")
    parser.add_argument("-c", "--core", type=int, help="the number of cores")
    # parser.add_argument("-o", "--output", type=str, default='/home/wzyCode/scalablelearning/dataset/bag.txt', help="default output file")
    args = parser.parse_args()

    mode = args.bagging            # bagging strategy
    training_size = args.training  # training set size (small training set)
    # tr_bsize = args.tr_bsize     # train bag size is dynamic by default; fixed if specified
    te_bsize = args.te_bsize       # test bag size is dynamic by default; fixed if specified
    # m = args.train_samples       # take m samples from training
    n = args.test_samples          # take n samples from test
    eta = args.eta                 # eta value
    o = args.operate
    numOfCores = args.core
    file_name = args.input
    # trainFileName = '/home/wzyCode/scalablelearning/input/' + file_name + '/' + str(training_size) + '_train.txt'
    # testFileName = '/home/wzyCode/scalablelearning/input/' + file_name + '/' + str(training_size) + '_test.txt'
    # betaFileName = '/home/wzyCode/scalablelearning/input/' + file_name + '/' + str(training_size) + '_beta.txt'
    base_output_file = '/home/wzyCode/scalablelearning/output/' + str(o) + '/VFKMM_' + file_name + '_'

    # Step 1: Generate biased train and test sets, as well as the original beta for the train set
    # start = time.time()
    input_file = '/home/wzyCode/scalablelearning/dataset/' + file_name + '.arff'  # input file path
    train, train_beta, test = split(input_file, training_size)
    # end = time.time()
    # split_time = end - start
    train_data = np.array(train)
    test_data = np.array(test)
    orig_beta_data = np.array(train_beta)

    # 1. Bagging process
    # train_data = np.loadtxt('dataset/' + file_name + '_train.txt')
    # test_data = np.loadtxt('dataset/' + file_name + '_test.txt')
    # orig_beta_data = 'dataset/' + file_name + '_beta.txt'
    tr_bsize, m = get_train_info(train_data, n, eta)
    start = time.time()
    # Bag the train and test data from the sampled index
    tr_bag_size, tr_bag_no = get_size_no(train_data, tr_bsize, m)
    te_bag_size, te_bag_no = get_size_no(test_data, te_bsize, n)
    print "tr_bag_size", tr_bag_size
    print "tr_bag_no", tr_bag_no
    print "te_bag_size", te_bag_size
    print "te_bag_no", te_bag_no
    bags = []
    if mode == 1:
        # if test is too big, provide x or n to partition the test set
        tr_n = bag(train_data, size=tr_bag_size, sample_no=tr_bag_no)
        te_n = partition(test_data, part_size=te_bag_size, part_no=te_bag_no)
    elif mode == 2:
        # if train is too big, provide s or m to partition the train set
        tr_n = partition(train_data, part_size=tr_bag_size, part_no=tr_bag_no)
        te_n = bag(test_data, size=te_bag_size, sample_no=te_bag_no)
    else:
        # randomly sample train bags; note the test set is still partitioned here
        tr_n = bag(train_data, size=tr_bag_size, sample_no=tr_bag_no)
        te_n = partition(test_data, part_size=te_bag_size, part_no=te_bag_no)

    # Broadcast the data so tasks ship only index arrays
    train_data_broad = sc.broadcast(train_data)
    test_data_broad = sc.broadcast(test_data)
    train_index = sc.broadcast(tr_n)
    test_index = sc.broadcast(te_n)
    print "tr_n", len(tr_n)
    print "te_n", len(te_n)
    if mode < 4:
        bags = cartesianVFKMM(tr_n, te_n)
    else:
        bags = pair(train_data, test_data, tr_n, te_n,
                    sample_no=min(tr_bag_no, te_bag_no))
    # numOfMaps = min(numOfCores, len(tr_n) * len(te_n))
    rdd = sc.parallelize(bags, numOfCores)
    print("Number of splits: ", rdd.getNumPartitions())
    end = time.time()
    bagging_time = end - start

    # 2. Compute beta
    # train_data = train_data_broad.value
    # test_data = test_data_broad.value
    start = time.time()
    res = rdd.map(lambda (idx, tr, te): computeBeta(
        idx, train_data_broad.value[tr], test_data_broad.value[te])).flatMap(
            lambda x: x)
    # res = rdd.map(lambda (idx, tr, te): computeBeta(idx, tr, te)).flatMap(lambda x: x)
    # Per-index mean of the beta estimates: fold into (sum, count), divide at the end
    rdd1 = res.aggregateByKey((0, 0),
                              lambda a, b: (a[0] + b, a[1] + 1),
                              lambda a, b: (a[0] + b[0], a[1] + b[1]))
    est_beta_map = rdd1.mapValues(lambda v: v[0] / v[1]).collectAsMap()
    est_beta_idx = est_beta_map.keys()
    end = time.time()
    compute_time = end - start

    # 3. Compute the NMSE between est_beta and orig_beta
    start = time.time()
    est_beta = [est_beta_map[x] for x in est_beta_idx]
    orig_beta = orig_beta_data[est_beta_idx]
    final_result = computeNMSE(est_beta, orig_beta)
    end = time.time()
    evaluate_time = end - start

    # 4. Statistics
    statistics = "In KMM method, mode=%s, train_size=%i, test_size=%i, size_of_train_samples=%i, number_of_train_samples=%i, size_of_test_samples=%i, K=%i, eta=%s\n" % \
                 (mode, len(train_data), len(test_data), tr_bag_size, tr_bag_no,
                  te_bag_size, te_bag_no, eta)
    total_time = bagging_time + compute_time + evaluate_time
    time_info = "bagging_time=%s, compute_time=%s, evaluate_time=%s, total_time=%s\n" % \
                (bagging_time, compute_time, evaluate_time, total_time)
    print statistics
    print time_info
    message = "The final NMSE for KMM is : %s \n" % final_result
    print message
    print "---------------------------------------------------------------------------------------------"

    txt_output_file = base_output_file + '_K=' + str(n) + '_trainSize=' + str(training_size) \
                      + '_eta' + str(eta) + '_tr_bag_no' + str(tr_bag_no)
    ori_beta_val = [[i, orig_beta_data[i]] for i in est_beta_idx]
    est_beta_val = [[i, est_beta_map[i]] for i in est_beta_idx]

    # Write to the text file
    textFile = txt_output_file + '.txt'
    print textFile
    with open(textFile, 'a') as out:
        out.write(statistics)
        out.write(time_info)
        out.write(message)
        out.write("The ori beta value is:")
        out.write('\n')
        out.write(str(ori_beta_val))
        out.write('\n')
        out.write("The est beta value is:")
        out.write('\n')
        out.write(str(est_beta_val))

    # Write to the CSV file for the selected experiment; each branch emits its
    # header once, on the first run of the corresponding parameter sweep.
    if o == 1:
        csvFile = base_output_file + '_K=' + str(n) + '_eta=' + str(eta) + '.csv'
        with open(csvFile, 'a+') as csvwrite:
            writer = csv.writer(csvwrite)
            if training_size == 100:
                writer.writerow(['train_size', 'accuracy', 'bagging_time', 'compute_time'])
            writer.writerow([training_size, final_result, bagging_time, compute_time])
    if o == 2:
        csvFile = base_output_file + '_trainSize=' + str(training_size) + '_eta=' + str(eta) + '.csv'
        with open(csvFile, 'a+') as csvwrite:
            writer = csv.writer(csvwrite)
            if n == 5:
                writer.writerow(['k_value', 'accuracy', 'bagging_time', 'compute_time'])
            writer.writerow([n, final_result, bagging_time, compute_time])
    if o == 3:
        csvFile = base_output_file + '_trainSize=' + str(training_size) + '_K=' + str(n) + '.csv'
        with open(csvFile, 'a+') as csvwrite:
            writer = csv.writer(csvwrite)
            if eta == 0.1:
                writer.writerow(['eta_value', 'accuracy', 'bagging_time', 'compute_time'])
            writer.writerow([eta, final_result, bagging_time, compute_time])
    if o == 4:
        csvFile = base_output_file + '_trainSize=' + str(training_size) + '_K=' + str(n) \
                  + '_eta=' + str(eta) + '.csv'
        with open(csvFile, 'a+') as csvwrite:
            writer = csv.writer(csvwrite)
            if numOfCores == 20:
                writer.writerow(['numOfCores', 'accuracy', 'bagging_time', 'compute_time'])
            writer.writerow([numOfCores, final_result, bagging_time, compute_time])
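# --- Invocation sketch ---------------------------------------------------------------
# The -o/--operate flag selects which parameter sweep the CSV output tracks
# (1: train size, 2: K, 3: eta, 4: cores). A hypothetical run of experiment 2
# (the script name is an assumption):
#   spark-submit vfkmm.py -b 1 -t 12000 -n 10 -e 0.5 -i kdd -o 2 -c 20
# With -o 2, each run appends a row [n, NMSE, bagging_time, compute_time] to the
# CSV, writing the header row once, when n == 5.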