Example #1
def split_mode(args):
    audio_files = sorted(os.listdir(args.dir + '/audio'))
    time_files = sorted(os.listdir(args.dir + '/time'))

    for audio, timecode in zip(audio_files, time_files):
        abs_audio = os.path.abspath(args.dir + '/audio/' + audio)
        abs_timecode = os.path.abspath(args.dir + '/time/' + timecode)

        if os.path.exists(abs_audio) and os.path.exists(abs_timecode):
            split(abs_audio, abs_timecode, os.path.abspath(args.dir))

    print('************************')
    print('* Splitting completed! *')
    print('************************')


def main():
    parser = argparse.ArgumentParser()
    #parser.add_argument('-b', "--bagging", type=int, choices=[1,2,3,4], default=1, help="bagging strategy")
    parser.add_argument("-t", "--training", type=int, default=12000, help="size of training data")
    parser.add_argument("-r", "--reverse", action="store_true", help="set -t as the size of test data")
    #parser.add_argument("-s", "--tr_bsize", type=int, help="the sample size of train set")
    #parser.add_argument("-x", "--te_bsize", type=int, help="the sample size of test set")
    #parser.add_argument("-m", "--train_samples", type=int, help="number of samples from training")
    #parser.add_argument("-n", "--test_samples", type=int, help="number of samples from test")
    #parser.add_argument("-i", "--input", type=str, default='./dataset/powersupply.arff', help="default input file")
    parser.add_argument("-i", "--input", type=str, default='/home/wzyCode/scalablelearning/dataset/kdd.arff', help="default input file")
    #parser.add_argument("-o", "--output", type=str, default='/home/wzyCode/scalablelearning/nmseKMM.txt', help="default output file")
    #parser.add_argument("-o1", "--output1", type=str, default='/home/wzyCode/scalablelearning/nmseCenKmm.txt', help="default output file")
    #parser.add_argument("-o2", "--output2", type=str, default='/home/wzyCode/scalablelearning/nmseEnsKmm.txt', help="default output file")
    #parser.add_argument("-v", "--verbose", action="store_true", help="verbose mode")
    args = parser.parse_args()

    training_size = args.training # training set size (small training set)
    reverse = args.reverse # flip training to test (small test set)
    input_file = args.input # input file path
    
    #sc = SparkContext()
    
    # Step 1: Generate biased train and test set, as well as the original beta for train set
    start = time.time()

    train, train_beta, test = split(input_file, training_size, reverse)
    #trianBroad = sc.broadcast(train)
    #train_data = np.array(trianBroad.value)
    train_data = np.array(train)
    #testBoard = sc.broadcast(test)
    #test_data = np.array(testBoard.value)
    test_data = np.array(test)
    orig_beta_data = np.array(train_beta)
    
    np.savetxt("train_data.txt",train_data);
    np.savetxt("test_data.txt",test_data);
    np.savetxt("orig_beta_data.txt",orig_beta_data);
    
    end = time.time()
    split_time = end - start
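
# A small companion sketch (not part of the original example): the arrays
# written with np.savetxt above can be read back in a later run with
# np.loadtxt, so the biased split does not have to be regenerated each time.
def load_split_outputs():
    import numpy as np
    train_data = np.loadtxt("train_data.txt")
    test_data = np.loadtxt("test_data.txt")
    orig_beta_data = np.loadtxt("orig_beta_data.txt")
    return train_data, test_data, orig_beta_data
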
Example #3
def cenKmmProcess():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i",
                        "--input",
                        type=str,
                        default='./dataset/powersupply.arff',
                        help="default input file")
    parser.add_argument("-t",
                        "--training",
                        type=int,
                        default=12000,
                        help="size of training data")
    # parser.add_argument("-o", "--output", type=str, default='/home/wzyCode/scalablelearning/nmseCenKMM.txt', help="default output file")
    args = parser.parse_args()
    file_name = args.input
    training_size = args.training

    input_file = '/home/wzyCode/scalablelearning/dataset/' + file_name + '.arff'  # input file path
    #trainFileName = '/home/wzyCode/scalablelearning/input/' + file_name + '/' + str(training_size) + '_train.txt'  # input file path
    #testFileName = '/home/wzyCode/scalablelearning/input/' + file_name + '/' + str(training_size) + '_test.txt'  # input file path
    #betaFileName = '/home/wzyCode/scalablelearning/input/' + file_name + '/' + str(training_size) + '_beta.txt'  # input file path

    #with open(trainFileName, 'rb') as f:
    #    train = pickle.load(f)
    #with open(testFileName, 'rb') as f:
    #    test = pickle.load(f)
    #with open(betaFileName, 'rb') as f:
    #    train_beta = pickle.load(f)

    train, train_beta, test = split(input_file, training_size)

    train_data = np.array(train)
    test_data = np.array(test)
    orig_beta_data = np.array(train_beta)

    # Step 1: Compute the estimated beta from cenKMM
    start = time.time()
    maxFeature = train_data.shape[1]
    gammab = computeKernelWidth(train_data)
    res = cenKmm(train_data, test_data, gammab, maxFeature)
    est_Cenbeta = res[0]

    end = time.time()
    compute_time_Cen = end - start

    # Step 2: Compute the NMSE between the est_beta and orig_beta through CenKMM
    start = time.time()
    final_result_Cen = computeNMSE(est_Cenbeta, orig_beta_data)
    end = time.time()
    evaluateCen_time = end - start

    # Step 3: statistics
    statisticsCen = "In CenKMM method, train_size=%i, test_size=%i\n" % \
                    (len(train_data), len(test_data))
    total_time = compute_time_Cen + evaluateCen_time
    time_info_Cen = "compute_time=%s, evaluate_time=%s, total_time=%s\n" % \
                    (compute_time_Cen, evaluateCen_time, total_time)
    print statisticsCen
    print time_info_Cen

    messageCen = "The final NMSE for CenKMM is : %s \n" % final_result_Cen
    print messageCen

    print "---------------------------------------------------------------------------------------------"

    output_file = '/home/wzyCode/scalablelearning/output/CenKMM/CenKMM_' + file_name + '_trainSize' + str(
        training_size) + '.txt'

    with open(output_file, 'a') as output_file:
        output_file.write(statisticsCen)
        output_file.write(time_info_Cen)
        output_file.write(messageCen)

    # write to csv file
    csvFile = '/home/wzyCode/scalablelearning/output/CenKMM/CenKMM_' + file_name + '.csv'
    with open(csvFile, 'a+') as csvwrite:
        writer = csv.writer(csvwrite)
        if training_size == 100:
            writer.writerow(['train_size', 'accuracy', 'compute_time'])
        writer.writerow([training_size, final_result_Cen, compute_time_Cen])
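

# computeKernelWidth is not defined in this example. A minimal sketch, assuming
# the common "median heuristic" (median pairwise Euclidean distance over the
# training data); the project's actual computeKernelWidth may use another rule.
def compute_kernel_width_sketch(data):
    import numpy as np
    from scipy.spatial.distance import pdist  # condensed pairwise distance vector
    return float(np.median(pdist(data)))
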
def ensKmmProcess():
    parser = argparse.ArgumentParser()
    parser.add_argument("-t",
                        "--training",
                        type=int,
                        default=12000,
                        help="size of training data")
    #parser.add_argument("-x", "--te_bsize", type=int, help="the sample size of test set")
    parser.add_argument("-n",
                        "--test_samples",
                        type=int,
                        help="number of samples from test")
    #parser.add_argument("-o", "--operate", type=int, help="which experiment")
    parser.add_argument("-c", "--core", type=int, help="the number of cores")
    parser.add_argument("-i",
                        "--input",
                        type=str,
                        default='./dataset/powersupply.arff',
                        help="default input file")
    # parser.add_argument("-o", "--output", type=str, default='/home/wzyCode/scalablelearning/nmseEnsKmm.txt', help="default output file")
    args = parser.parse_args()

    training_size = args.training
    # tr_bsize = args.tr_bsize # By default, the train bag size is dynamic, if specified, the train bag size will fix
    #te_bsize = 0  # By default, the test bag size is dynamic, if specified, the test bag size will fix
    # m = args.train_samples # take m samples from training
    #o = args.operate
    n = args.test_samples  # take n samples from test
    #eta = args.eta  # take eta value from
    file_name = args.input
    numOfCores = args.core

    input_file = '/home/wzyCode/scalablelearning/dataset/' + file_name + '.arff'  # input file path
    base_output_file = '/home/wzyCode/scalablelearning/output/EnsKMM/3/' + file_name

    #trainFileName = '/home/wzyCode/scalablelearning/input/' + file_name + '/' + str(training_size) + '_train.txt'  # input file path
    #testFileName = '/home/wzyCode/scalablelearning/input/' + file_name + '/' + str(training_size) + '_test.txt'  # input file path
    #betaFileName = '/home/wzyCode/scalablelearning/input/' + file_name + '/' + str(training_size) + '_beta.txt'  # input file path

    #with open(trainFileName, 'rb') as f:
    #    train = pickle.load(f)
    #with open(testFileName, 'rb') as f:
    #    test = pickle.load(f)
    #with open(betaFileName, 'rb') as f:
    #    train_beta = pickle.load(f)

    train, train_beta, test = split(input_file, training_size)

    train_data = np.array(train)
    test_data = np.array(test)
    orig_beta_data = np.array(train_beta)

    # m, tr_bsize = get_train_info(train_data, n, eta)
    training_size = len(train_data)
    testDataLength = len(test_data)
    te_bag_size = testDataLength / n
    te_bsizeValue = sc.broadcast(te_bag_size)

    # Bagging the train and test data from the sampled index
    start = time.time()
    tr_bag_size_ens = len(train_data)
    tr_bag_no_ens = 1
    te_bag_size_ens, te_bag_no_ens = get_size_no(test_data, 0, n)
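    # EnsKMM keeps the whole training set as a single bag (tr_bag_no_ens = 1)
    # and only partitions the test set into n pieces, so every task pairs the
    # full train set with one test partition.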

    tr_n_ens = partition(train_data,
                         part_size=tr_bag_size_ens,
                         part_no=tr_bag_no_ens)
    te_n_ens = partition(test_data,
                         part_size=te_bag_size_ens,
                         part_no=te_bag_no_ens)

    # set the train and test data as broadcast values
    train_data_broad = sc.broadcast(train_data)
    test_data_broad = sc.broadcast(test_data)

    #bags_ens = cartesian(train_data, test_data, tr_n_ens, te_n_ens)
    bags_ens = cartesianVFKMM(tr_n_ens, te_n_ens)

    #numOfMaps = min(numOfCores, len(tr_n_ens) * len(te_n_ens))
    #rddEns = sc.parallelize(bags_ens, numOfMaps)
    rddEns = sc.parallelize(bags_ens, numOfCores)
    #print("Number of splits: ", rddEns.getNumPartitions())

    end = time.time()
    ens_bagging_time = end - start

    # 2. Compute Beta Process
    start = time.time()
    # rddEns = rddEns.map(lambda (idx, tr, te): (len(idx), len(tr), len(te)))
    # print "rddEns",rddEns.take(5)
    # print "te_bsizeValue",te_bsizeValue.value
    #rddEns = rddEns.map(lambda (idx, tr, te): getEnsKmmBeta(idx, tr, te, te_bsizeValue.value)).flatMap(lambda x: x)
    #rddEns = rddEns.map(lambda (idx, tr, te): getEnsKmmBeta(idx, train_data_broad.value[tr], test_data_broad.value[te], te_bsizeValue.value)).flatMap(lambda x: x)
    rddEns = rddEns.map(lambda (idx, tr, te): computeBeta(
        idx, train_data_broad.value[tr], test_data_broad.value[te])).flatMap(
            lambda x: x)

    rddEns = rddEns.aggregateByKey((0, 0), lambda a, b: (a[0] + b, a[1] + 1),
                                   lambda a, b: (a[0] + b[0], a[1] + b[1]))

    est_Ensbeta_map = rddEns.mapValues(lambda v: v[0] / v[1]).collectAsMap()
    est_Ensbeta_idx = est_Ensbeta_map.keys()
    end = time.time()
    compute_time_Ens = end - start

    # 3. Compute the NMSE between the est_beta and orig_beta through KMM
    start = time.time()

    est_Ensbeta = [est_Ensbeta_map[x] for x in est_Ensbeta_idx]
    orig_beta = orig_beta_data[est_Ensbeta_idx]
    final_result_Ens = computeNMSE(est_Ensbeta, orig_beta)

    end = time.time()
    evaluateEns_time = end - start

    # 4. statistics
    statisticsEns = "In EnsKMM method, train_size=%i, test_size=%i, tr_bag_size=%i, m=%i, te_bag_size=%i, n=%i\n" % \
                    (len(train_data), len(test_data), tr_bag_size_ens, tr_bag_no_ens, te_bag_size_ens, te_bag_no_ens)
    total_time = ens_bagging_time + compute_time_Ens + evaluateEns_time
    time_info_Ens = "bagging_time=%s, compute_time=%s, evaluate_time=%s, total_time=%s\n" % \
                    (ens_bagging_time, compute_time_Ens, evaluateEns_time, total_time)
    print statisticsEns
    print time_info_Ens

    messageEns = "The final NMSE for EnsKMM is : %s \n" % final_result_Ens
    print messageEns

    # write to a txt file
    output_file = base_output_file + 'EnsKMM_K=' + str(
        n) + '_trainSize=' + str(training_size) + '.txt'

    ori_beta_val = []
    for i in est_Ensbeta_idx:
        ori_beta_val.append([i, orig_beta_data[i]])

    est_beta_val = []
    for i in est_Ensbeta_idx:
        est_beta_val.append([i, est_Ensbeta_map[i]])

    with open(output_file, 'a') as output_file:
        output_file.write(statisticsEns)
        output_file.write(time_info_Ens)
        output_file.write(messageEns)

        # output_file.write("The ori beta value is:")
        # output_file.write('\n')
        # output_file.write(str(ori_beta_val))
        #
        # output_file.write('\n')
        #
        # output_file.write("The est beta value is:")
        # output_file.write('\n')
        # output_file.write(str(est_beta_val))

    # write to csv file
    csvFile = base_output_file + 'EnsKMM_trainSize&&K_1000&10.csv'
    with open(csvFile, 'a+') as csvwrite:
        writer = csv.writer(csvwrite)
        if numOfCores == 20:
            writer.writerow(['numOfCores', 'accuracy', 'bagging_time', 'compute_time'])
        writer.writerow([numOfCores, final_result_Ens, ens_bagging_time, compute_time_Ens])

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-b',
                        "--bagging",
                        type=int,
                        choices=[1, 2, 3, 4],
                        default=1,
                        help="bagging strategy")
    parser.add_argument("-t",
                        "--training",
                        type=int,
                        default=12000,
                        help="size of training data")
    parser.add_argument("-r",
                        "--reverse",
                        action="store_true",
                        help="set -t as the size of test data")
    parser.add_argument("-s",
                        "--tr_bsize",
                        type=int,
                        help="the sample size of train set")
    parser.add_argument("-x",
                        "--te_bsize",
                        type=int,
                        help="the sample size of test set")
    parser.add_argument("-m",
                        "--train_samples",
                        type=int,
                        help="number of samples from training")
    parser.add_argument("-n",
                        "--test_samples",
                        type=int,
                        help="number of samples from test")
    parser.add_argument("-i",
                        "--input",
                        type=str,
                        default='./dataset/powersupply.arff',
                        help="default input file")
    parser.add_argument("-o",
                        "--output",
                        type=str,
                        default='./nmse.txt',
                        help="default output file")
    parser.add_argument("-v",
                        "--verbose",
                        action="store_true",
                        help="verbose mode")
    args = parser.parse_args()

    mode = args.bagging  # bagging strategy
    training_size = args.training  # training set size (small training set)
    reverse = args.reverse  # flip training to test (small test set)
    tr_bsize = args.tr_bsize  # By default the train bag size is dynamic; if specified, it is fixed
    te_bsize = args.te_bsize  # By default the test bag size is dynamic; if specified, it is fixed
    m = args.train_samples  # take m samples from training
    n = args.test_samples  # take n samples from test
    input_file = args.input  # input file path
    output_file = args.output  # output file path

    # Step 1: Generate biased train and test set, as well as the original beta for train set
    start = time.time()

    train, train_beta, test = split(input_file, training_size, reverse)
    train_data = np.array(train)
    test_data = np.array(test)
    orig_beta_data = np.array(train_beta)

    end = time.time()
    split_time = end - start

    # Step 2: Generate the bagging index using different bagging strategies
    start = time.time()

    # Bagging the train and test data from the sampled index
    tr_bag_size, tr_bag_no = get_size_no(train_data, tr_bsize, m)
    te_bag_size, te_bag_no = get_size_no(test_data, te_bsize, n)

    if mode == 1:  # if test is too big, provide x or n to partition test set
        tr_n = bag(train_data, size=tr_bag_size, sample_no=tr_bag_no)
        te_n = partition(test_data, part_size=te_bag_size, part_no=te_bag_no)
    elif mode == 2:  # if train is too big, provide s or m to partition train set
        tr_n = partition(train_data, part_size=tr_bag_size, part_no=tr_bag_no)
        te_n = bag(test_data, size=te_bag_size, sample_no=te_bag_no)
    else:  # random sample, no partition
        tr_n = bag(train_data, size=tr_bag_size, sample_no=tr_bag_no)
        te_n = bag(test_data, size=te_bag_size, sample_no=te_bag_no)

    if mode < 4:
        bags = cartesian(train_data, test_data, tr_n, te_n)
    else:
        bags = pair(train_data,
                    test_data,
                    tr_n,
                    te_n,
                    sample_no=min(tr_bag_no, te_bag_no))

    rdd = sc.parallelize(bags)

    end = time.time()
    bagging_time = end - start

    # Step 3: Compute the estimated beta
    start = time.time()

    res = rdd.map(lambda (idx, tr, te): computeBeta(idx, tr, te)).flatMap(
        lambda x: x)

    rdd1 = res.aggregateByKey((0, 0), lambda a, b: (a[0] + b, a[1] + 1),
                              lambda a, b: (a[0] + b[0], a[1] + b[1]))

    est_beta_map = rdd1.mapValues(lambda v: v[0] / v[1]).collectAsMap()
    est_beta_idx = est_beta_map.keys()

    end = time.time()
    compute_time = end - start

    # Step 4: Compute the NMSE between the est_beta and orig_beta
    start = time.time()

    est_beta = [est_beta_map[x] for x in est_beta_idx]
    orig_beta = orig_beta_data[est_beta_idx]
    final_result = computeNMSE(est_beta, orig_beta)

    end = time.time()
    evaluate_time = end - start

    # statistics
    statistics = "mode=%s, train_size=%i, test_size=%i, tr_bag_size=%i, m=%i, te_bag_size=%i, n=%i\n" % \
                 (mode, len(train_data), len(test_data), tr_bag_size, tr_bag_no, te_bag_size, te_bag_no)
    total_time = split_time + bagging_time + compute_time + evaluate_time
    time_info = "split_time=%s, bagging_time=%s, compute_time=%s, evaluate_time=%s, total_time=%s\n" % \
                (split_time, bagging_time, compute_time, evaluate_time, total_time)
    print statistics
    print time_info

    # Save the result into a text file
    with open(output_file, 'a') as output_file:
        message = "The final NMSE is : %s \n" % final_result
        print message
        output_file.write(statistics)
        output_file.write(time_info)
        output_file.write(message)
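

# computeNMSE is not shown in these examples. A rough sketch, assuming the
# normalized mean squared error is the MSE between estimated and original
# betas divided by the product of their means (one common convention); the
# project's actual definition may differ.
def compute_nmse_sketch(est_beta, orig_beta):
    import numpy as np
    est = np.asarray(est_beta, dtype=float)
    orig = np.asarray(orig_beta, dtype=float)
    return np.mean((est - orig) ** 2) / (np.mean(est) * np.mean(orig))
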
Example #6
def main():
    parser = argparse.ArgumentParser()
    # parser.add_argument('-b', "--bagging", type=int, choices=[1,2,3,4], default=1, help="bagging strategy")
    parser.add_argument("-t",
                        "--training",
                        type=int,
                        default=12000,
                        help="size of training data")
    parser.add_argument("-r",
                        "--reverse",
                        action="store_true",
                        help="set -t as the size of test data")
    # parser.add_argument("-s", "--tr_bsize", type=int, help="the sample size of train set")
    # parser.add_argument("-x", "--te_bsize", type=int, help="the sample size of test set")
    # parser.add_argument("-m", "--train_samples", type=int, help="number of samples from training")
    # parser.add_argument("-n", "--test_samples", type=int, help="number of samples from test")
    # parser.add_argument("-i", "--input", type=str, default='./dataset/powersupply.arff', help="default input file")
    parser.add_argument(
        "-i",
        "--input",
        type=str,
        default='/home/wzyCode/scalablelearning/dataset/kdd.arff',
        help="default input file")

    args = parser.parse_args()

    training_size = args.training  # training set size (small training set)
    reverse = args.reverse  # flip training to test (small test set)
    file_name = args.input
    input_file = '/home/wzyCode/scalablelearning/dataset/' + args.input + '.arff'  # input file path
    # print type(input_file)
    # sc = SparkContext()

    # Step 1: Generate biased train and test set, as well as the original beta for train set
    start = time.time()

    train, train_beta, test = split(input_file, training_size, reverse)

    # trianBroad = sc.broadcast(train)
    # train_data = np.array(trianBroad.value)
    # train_data = np.array(train)
    # testBoard = sc.broadcast(test)
    # test_data = np.array(testBoard.value)
    # test_data = np.array(test)
    # orig_beta_data = np.array(train_beta)

    fileName = '/home/wzyCode/scalablelearning/input/' + file_name + '/' + str(
        training_size) + '_train.txt'

    with open(fileName, 'wb') as f:
        pickle.dump(train, f)
    with open(
            '/home/wzyCode/scalablelearning/input/' + file_name + '/' +
            str(training_size) + '_test.txt', 'wb') as f:
        pickle.dump(test, f)
    with open(
            '/home/wzyCode/scalablelearning/input/' + file_name + '/' +
            str(training_size) + '_beta.txt', 'wb') as f:
        pickle.dump(train_beta, f)


    # np.savetxt('/home/wzyCode/scalablelearning/input/'+file_name + '/'+str(training_size)+'_train.txt', train_data)
    # np.savetxt('/home/wzyCode/scalablelearning/input/'+file_name + '/'+str(training_size)+'_test.txt', test_data)
    # np.savetxt('/home/wzyCode/scalablelearning/input/'+file_name + '/'+str(training_size)+'_beta.txt', orig_beta_data)

    end = time.time()
    split_time = end - start
Example #7
def kmmProcess():
    parser = argparse.ArgumentParser()
    parser.add_argument('-b',
                        "--bagging",
                        type=int,
                        choices=[1, 2, 3, 4],
                        default=1,
                        help="bagging strategy")
    parser.add_argument("-t",
                        "--training",
                        type=int,
                        default=12000,
                        help="size of training data")
    # parser.add_argument("-s", "--tr_bsize", type=int, help="the sample size of train set")
    parser.add_argument("-x",
                        "--te_bsize",
                        type=int,
                        help="the sample size of test set")
    # parser.add_argument("-m", "--train_samples", type=int, help="number of samples from training")
    parser.add_argument("-n",
                        "--test_samples",
                        type=int,
                        help="number of samples from test")
    parser.add_argument("-e", "--eta", type=float, help="the eta value")
    parser.add_argument(
        "-i",
        "--input",
        type=str,
        default='/home/wzyCode/scalablelearning/dataset/kdd.arff',
        help="default input file")
    parser.add_argument("-o", "--operate", type=int, help="which experiment")
    parser.add_argument("-c", "--core", type=int, help="the number of cores")
    # parser.add_argument("-o", "--output", type=str, default='/home/wzyCode/scalablelearning/dataset/bag.txt',
    #                    help="default output file")
    args = parser.parse_args()

    mode = args.bagging  # bagging strategy
    training_size = args.training  # training set size (small training set)
    # tr_bsize = args.tr_bsize  # By default, the train bag size is dynamic, if specified, the train bag size will fix
    te_bsize = args.te_bsize  # By default the test bag size is dynamic; if specified, it is fixed
    # m = args.train_samples  # take m samples from training
    n = args.test_samples  # take n samples from test
    eta = args.eta  # the eta value
    o = args.operate
    numOfCores = args.core

    file_name = args.input
    #trainFileName = '/home/wzyCode/scalablelearning/input/' + file_name + '/' + str(training_size) + '_train.txt'  # input file path
    #testFileName = '/home/wzyCode/scalablelearning/input/' + file_name + '/' + str(training_size) + '_test.txt'  # input file path
    #betaFileName = '/home/wzyCode/scalablelearning/input/' + file_name + '/' + str(training_size) + '_beta.txt'  # input file path
    base_output_file = '/home/wzyCode/scalablelearning/output/' + str(
        o) + '/VFKMM_' + file_name + '_'

    # Step 1: Generate biased train and test set, as well as the original beta for train set
    #start = time.time()

    input_file = '/home/wzyCode/scalablelearning/dataset/' + file_name + '.arff'  # input file path

    train, train_beta, test = split(input_file, training_size)

    #end = time.time()
    #split_time = end - start

    train_data = np.array(train)
    test_data = np.array(test)
    orig_beta_data = np.array(train_beta)
    # 1.Bagging process
    # train_data = np.loadtxt('dataset/'+file_name+'_train.txt')
    # test_data = np.loadtxt('dataset/'+file_name+'_test.txt')
    # orig_beta_data = ('dataset/'+file_name+'_beta.txt')

    tr_bsize, m = get_train_info(train_data, n, eta)
    start = time.time()

    # Bagging the train and test data from the sampled index
    tr_bag_size, tr_bag_no = get_size_no(train_data, tr_bsize, m)
    te_bag_size, te_bag_no = get_size_no(test_data, te_bsize, n)
    print "tr_bag_size", tr_bag_size
    print "tr_bag_no", tr_bag_no
    print "te_bag_size", te_bag_size
    print "te_bag_no", te_bag_no

    bags = []

    if mode == 1:  # if test is too big, provide x or n to partition test set
        tr_n = bag(train_data, size=tr_bag_size, sample_no=tr_bag_no)
        te_n = partition(test_data, part_size=te_bag_size, part_no=te_bag_no)
    elif mode == 2:  # if train is too big, provide s or m to partition train set
        tr_n = partition(train_data, part_size=tr_bag_size, part_no=tr_bag_no)
        te_n = bag(test_data, size=te_bag_size, sample_no=te_bag_no)
    else:  # default: sample the train set and partition the test set (same as mode 1)
        tr_n = bag(train_data, size=tr_bag_size, sample_no=tr_bag_no)
        te_n = partition(test_data, part_size=te_bag_size, part_no=te_bag_no)

    # broadcast the relevant values:
    train_data_broad = sc.broadcast(train_data)
    test_data_broad = sc.broadcast(test_data)
    train_index = sc.broadcast(tr_n)
    test_index = sc.broadcast(te_n)
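
    # The full train/test matrices are shipped to the workers once as broadcast
    # variables; the RDD itself only carries index bags, and each map task
    # slices train_data_broad.value / test_data_broad.value with those indices.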

    print "tr_n", len(tr_n)
    print "te_n", len(te_n)

    if mode < 4:
        bags = cartesianVFKMM(tr_n, te_n)

    else:
        bags = pair(train_data,
                    test_data,
                    tr_n,
                    te_n,
                    sample_no=min(tr_bag_no, te_bag_no))

    #numOfMaps = min(numOfCores, len(tr_n) * len(te_n))
    rdd = sc.parallelize(bags, numOfCores)
    print("Number of splits: ", rdd.getNumPartitions())
    end = time.time()
    bagging_time = end - start

    # 2. Compute Beta Process
    # train_data = train_data_broad.value
    # test_data = test_data_broad.value

    start = time.time()
    res = rdd.map(lambda (idx, tr, te): computeBeta(
        idx, train_data_broad.value[tr], test_data_broad.value[te])).flatMap(
            lambda x: x)
    # res = rdd.map(lambda (idx, tr, te): computeBeta(idx, tr, te)).flatMap(lambda x: x)

    rdd1 = res.aggregateByKey((0, 0), lambda a, b: (a[0] + b, a[1] + 1),
                              lambda a, b: (a[0] + b[0], a[1] + b[1]))

    est_beta_map = rdd1.mapValues(lambda v: v[0] / v[1]).collectAsMap()
    est_beta_idx = est_beta_map.keys()

    end = time.time()
    compute_time = end - start


    # 3. Compute the NMSE between the est_beta and orig_beta through KMM
    start = time.time()

    est_beta = [est_beta_map[x] for x in est_beta_idx]
    orig_beta = orig_beta_data[est_beta_idx]
    final_result = computeNMSE(est_beta, orig_beta)

    end = time.time()
    evaluate_time = end - start

    # 4. statistics
    statistics = "In KMM method, mode=%s, train_size=%i, test_size=%i, size_of_train_samples=%i, number_of_train_samples=%i, size_of_test_samples=%i, K=%i, eta=%s\n" % \
                 (mode, len(train_data), len(test_data), tr_bag_size, tr_bag_no, te_bag_size, te_bag_no, eta)
    total_time = bagging_time + compute_time + evaluate_time
    time_info = "bagging_time=%s, compute_time=%s, evaluate_time=%s, total_time=%s\n" % \
                (bagging_time, compute_time, evaluate_time, total_time)
    print statistics
    print time_info

    message = "The final NMSE for KMM is : %s \n" % final_result
    print message

    print "---------------------------------------------------------------------------------------------"

    txt_output_file = base_output_file + '_K=' + str(n) + '_trainSize=' + str(
        training_size) + '_eta' + str(eta) + '_tr_bag_no' + str(tr_bag_no)

    ori_beta_val = []
    for i in est_beta_idx:
        ori_beta_val.append([i, orig_beta_data[i]])

    est_beta_val = []
    for i in est_beta_idx:
        est_beta_val.append([i, est_beta_map[i]])

    # write to a text file
    textFile = txt_output_file + '.txt'
    print textFile
    with open(textFile, 'a') as textFile:

        textFile.write(statistics)
        textFile.write(time_info)
        textFile.write(message)

        textFile.write("The ori beta value is:")
        textFile.write('\n')
        textFile.write(str(ori_beta_val))

        textFile.write('\n')

        textFile.write("The est beta value is:")
        textFile.write('\n')
        textFile.write(str(est_beta_val))

    # write to csv file
    if o == 1:
        csvFile = base_output_file + '_K=' + str(n) + '_eta=' + str(eta) + '.csv'
        with open(csvFile, 'a+') as csvwrite:
            writer = csv.writer(csvwrite)
            if training_size == 100:
                writer.writerow(['train_size', 'accuracy', 'bagging_time', 'compute_time'])
            writer.writerow([training_size, final_result, bagging_time, compute_time])

    if o == 2:
        csvFile = base_output_file + '_trainSize=' + str(training_size) + '_eta=' + str(eta) + '.csv'
        with open(csvFile, 'a+') as csvwrite:
            writer = csv.writer(csvwrite)
            if n == 5:
                writer.writerow(['k_value', 'accuracy', 'bagging_time', 'compute_time'])
            writer.writerow([n, final_result, bagging_time, compute_time])

    if o == 3:
        csvFile = base_output_file + '_trainSize=' + str(training_size) + '_K=' + str(n) + '.csv'
        with open(csvFile, 'a+') as csvwrite:
            writer = csv.writer(csvwrite)
            if eta == 0.1:
                writer.writerow(['eta_value', 'accuracy', 'bagging_time', 'compute_time'])
            writer.writerow([eta, final_result, bagging_time, compute_time])

    if o == 4:
        csvFile = base_output_file + '_trainSize=' + str(training_size) + '_K=' + str(n) + '_eta=' + str(eta) + '.csv'
        with open(csvFile, 'a+') as csvwrite:
            writer = csv.writer(csvwrite)
            if numOfCores == 20:
                writer.writerow(['numOfCores', 'accuracy', 'bagging_time', 'compute_time'])
            writer.writerow([numOfCores, final_result, bagging_time, compute_time])

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-b', "--bagging", type=int, choices=[1,2,3,4], default=1, help="bagging strategy")
    parser.add_argument("-t", "--training", type=int, default=12000, help="size of training data")
    parser.add_argument("-r", "--reverse", action="store_true", help="set -t as the size of test data")
    parser.add_argument("-s", "--tr_bsize", type=int, help="the sample size of train set")
    parser.add_argument("-x", "--te_bsize", type=int, help="the sample size of test set")
    parser.add_argument("-m", "--train_samples", type=int, help="number of samples from training")
    parser.add_argument("-n", "--test_samples", type=int, help="number of samples from test")
    parser.add_argument("-i", "--input", type=str, default='./dataset/powersupply.arff', help="default input file")
    parser.add_argument("-o", "--output", type=str, default='./nmse.txt', help="default output file")
    parser.add_argument("-v", "--verbose", action="store_true", help="verbose mode")
    args = parser.parse_args()

    mode = args.bagging # bagging strategy
    training_size = args.training # training set size (small training set)
    reverse = args.reverse # flip training to test (small test set)
    tr_bsize = args.tr_bsize # By default the train bag size is dynamic; if specified, it is fixed
    te_bsize = args.te_bsize # By default the test bag size is dynamic; if specified, it is fixed
    m = args.train_samples # take m samples from training
    n = args.test_samples # take n samples from test
    input_file = args.input # input file path
    output_file = args.output # output file path

    # Step 1: Generate biased train and test set, as well as the original beta for train set
    start = time.time()

    train, train_beta, test = split(input_file, training_size, reverse)
    train_data = np.array(train)
    test_data = np.array(test)
    orig_beta_data = np.array(train_beta)

    end = time.time()
    split_time = end - start

    # Step 2: Generate the bagging index using different bagging strategies
    start = time.time()

    # Bagging the train and test data from the sampled index
    tr_bag_size, tr_bag_no = get_size_no(train_data, tr_bsize, m)
    te_bag_size, te_bag_no = get_size_no(test_data, te_bsize, n)

    if mode == 1:  # if test is too big, provide x or n to partition test set
        tr_n = bag(train_data, size=tr_bag_size, sample_no=tr_bag_no)
        te_n = partition(test_data, part_size=te_bag_size, part_no=te_bag_no)
    elif mode == 2:  # if train is too big, provide s or m to partition train set
        tr_n = partition(train_data, part_size=tr_bag_size, part_no=tr_bag_no)
        te_n = bag(test_data, size=te_bag_size, sample_no=te_bag_no)
    else: # random sample, no partition
        tr_n = bag(train_data, size=tr_bag_size, sample_no=tr_bag_no)
        te_n = bag(test_data, size=te_bag_size, sample_no=te_bag_no)

    if mode < 4:
        bags = cartesian(train_data, test_data, tr_n, te_n)
    else:
        bags = pair(train_data, test_data, tr_n, te_n, sample_no=min(tr_bag_no, te_bag_no))

    rdd = sc.parallelize(bags)

    end = time.time()
    bagging_time = end - start

    # Step 3: Compute the estimated beta
    start = time.time()

    res = rdd.map(lambda (idx, tr, te): computeBeta(idx, tr, te)).flatMap(lambda x: x)

    rdd1 = res.aggregateByKey((0,0), lambda a,b: (a[0] + b, a[1] + 1),
                              lambda a,b: (a[0] + b[0], a[1] + b[1]))

    est_beta_map = rdd1.mapValues(lambda v: v[0]/v[1]).collectAsMap()
    est_beta_idx = est_beta_map.keys()

    end = time.time()
    compute_time = end - start

    # Step 4: Compute the NMSE between the est_beta and orig_beta
    start = time.time()

    est_beta = [est_beta_map[x] for x in est_beta_idx]
    orig_beta = orig_beta_data[est_beta_idx]
    final_result = computeNMSE(est_beta, orig_beta)

    end = time.time()
    evaluate_time = end - start

    # statistics
    statistics = "mode=%s, train_size=%i, test_size=%i, tr_bag_size=%i, m=%i, te_bag_size=%i, n=%i\n" % \
                 (mode, len(train_data), len(test_data), tr_bag_size, tr_bag_no, te_bag_size, te_bag_no)
    total_time = split_time + bagging_time + compute_time + evaluate_time
    time_info = "split_time=%s, bagging_time=%s, compute_time=%s, evaluate_time=%s, total_time=%s\n" % \
                (split_time, bagging_time, compute_time, evaluate_time, total_time)
    print statistics
    print time_info

    # Save the result into a text file
    with open(output_file, 'a') as output_file:
        message = "The final NMSE is : %s \n" % final_result
        print message
        output_file.write(statistics)
        output_file.write(time_info)
        output_file.write(message)
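

# bag() and partition() are not shown in these examples. The sketches below
# only illustrate the assumed contract (each returns a list of row-index
# arrays that are later used to slice the train/test matrices); the real
# helpers may differ in their sampling details.
import numpy as np

def bag_sketch(data, size, sample_no):
    # draw sample_no random bags of `size` row indices (without replacement
    # inside a bag, independently across bags)
    return [np.random.choice(len(data), size=size, replace=False)
            for _ in range(sample_no)]

def partition_sketch(data, part_size, part_no):
    # shuffle the row indices once and cut them into part_no chunks of part_size
    idx = np.random.permutation(len(data))
    return [idx[i * part_size:(i + 1) * part_size] for i in range(part_no)]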