Example #1
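    # Method of a CascadeSVM-style class (note: not the dislib estimator used
    # in the later examples); assumes `os` and `time.time` are imported and
    # that `barrier` comes from pycompss.api.api.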
    def fit(self,
            path=None,
            data_format="csv",
            n_features=None,
            cascade_arity=2,
            n_partitions=4,
            cascade_iterations=5,
            tol=10**-3,
            C=1.0,
            kernel="rbf",
            gamma="auto"):

        try:
            self._kernel_f = getattr(self, CascadeSVM.name_to_kernel[kernel])
        except (AttributeError, KeyError):
            # An unknown kernel name raises KeyError from the dict lookup;
            # fall back to the RBF kernel in that case.
            self._kernel_f = getattr(self, CascadeSVM.name_to_kernel["rbf"])

        assert gamma == "auto" or isinstance(float(gamma), float), \
            "Gamma is not a valid float"
        assert kernel is None or kernel in self.name_to_kernel, \
            "Incorrect kernel value [%s], available kernels are %s" % (
                kernel, list(self.name_to_kernel))
        assert C is None or isinstance(float(C), float), \
            "Incorrect C type [%s], type: %s" % (C, type(C))
        assert cascade_arity > 1, "Cascade arity must be greater than 1"
        assert cascade_iterations > 0, "Max iterations must be greater than 0"

        self._cascade_arity = cascade_arity
        self._max_iterations = cascade_iterations
        self._npartitions = n_partitions
        self._tol = tol
        self._last_W = 0
        self._clf = None
        self._clf_params = {"gamma": gamma, "C": C, "kernel": kernel}

        self.read_time = time()
        self.total_time = time()

        assert data_format != "libsvm" or (n_features is not None
                                           and n_features > 0), \
            "Number of features is required when using libsvm format"
        files = os.listdir(path)

        if not n_features:
            n_features = self._count_features(os.path.join(path, files[0]))

        partitions = self._read_dir(files, path, data_format, n_features)

        # Uncomment to measure read time
        # barrier()
        self.read_time = time() - self.read_time
        self.fit_time = time()

        self._cascade_fit(partitions)

        barrier()

        self.fit_time = time() - self.fit_time

        self.total_time = time() - self.total_time
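
A hedged usage sketch for this method; the instance and the partition directory below are placeholders, not part of the original snippet:

csvm = CascadeSVM()
# "/data/train_parts" is a hypothetical directory of CSV partition files.
csvm.fit(path="/data/train_parts", data_format="csv",
         cascade_arity=2, cascade_iterations=5, kernel="rbf")
print(csvm.read_time, csvm.fit_time, csvm.total_time)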
Example #2
# Imports required by this snippet; the dislib and PyCOMPSs module paths
# are assumed from their public APIs.
import argparse
import time

import numpy as np

import dislib as ds
from dislib.classification import RandomForestClassifier
from pycompss.api.api import barrier, compss_wait_on


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--svmlight",
                        help="read files in SVMLight format",
                        action="store_true")
    parser.add_argument("-dt",
                        "--detailed_times",
                        help="get detailed execution times (read and fit)",
                        action="store_true")
    parser.add_argument("-e",
                        "--estimators",
                        metavar="N_ESTIMATORS",
                        type=int,
                        help="default is 10",
                        default=10)
    parser.add_argument("-b",
                        "--block_size",
                        metavar="BLOCK_SIZE",
                        type=str,
                        help="two comma separated ints that represent the "
                        "size of the blocks in which to divide the input "
                        "data (default is 100,100)",
                        default="100,100")
    parser.add_argument("-md",
                        "--max_depth",
                        metavar="MAX_DEPTH",
                        type=int,
                        help="default is np.inf",
                        required=False)
    parser.add_argument("-dd",
                        "--dist_depth",
                        metavar="DIST_DEPTH",
                        type=int,
                        help="default is auto",
                        required=False)
    parser.add_argument("-f",
                        "--features",
                        metavar="N_FEATURES",
                        help="number of features of the input data "
                        "(only for SVMLight files)",
                        type=int,
                        default=None,
                        required=False)
    parser.add_argument("--dense",
                        help="use dense data structures",
                        action="store_true")
    parser.add_argument("-t",
                        "--test-file",
                        metavar="TEST_FILE_PATH",
                        help="test file path",
                        type=str,
                        required=False)
    parser.add_argument("train_data",
                        help="input file in CSV or SVMLight format",
                        type=str)
    args = parser.parse_args()

    train_data = args.train_data

    s_time = time.time()
    read_time = 0

    sparse = not args.dense

    bsize = args.block_size.split(",")
    block_size = (int(bsize[0]), int(bsize[1]))

    if args.svmlight:
        x, y = ds.load_svmlight_file(train_data, block_size, args.features,
                                     sparse)
    else:
        x = ds.load_txt_file(train_data, block_size)
        # The label is the last column (consistent with the test-file branch
        # below); the remaining columns are features.
        y = x[:, x.shape[1] - 1:x.shape[1]]
        x = x[:, :x.shape[1] - 1]

    if args.detailed_times:
        barrier()
        read_time = time.time() - s_time
        s_time = time.time()

    if args.dist_depth:
        dist_depth = args.dist_depth
    else:
        dist_depth = "auto"

    if args.max_depth:
        max_depth = args.max_depth
    else:
        max_depth = np.inf

    forest = RandomForestClassifier(n_estimators=args.estimators,
                                    max_depth=max_depth,
                                    distr_depth=dist_depth)
    forest.fit(x, y)

    barrier()
    fit_time = time.time() - s_time

    out = [
        forest.n_estimators, forest.distr_depth, forest.max_depth, read_time,
        fit_time
    ]

    if args.test_file:
        if args.svmlight:
            x_test, y_test = ds.load_svmlight_file(args.test_file, block_size,
                                                   args.features, sparse)
        else:
            x_test = ds.load_txt_file(args.test_file, block_size)
            y_test = x_test[:, x_test.shape[1] - 1:x_test.shape[1]]
            x_test = x_test[:, :x_test.shape[1] - 1]

        out.append(compss_wait_on(forest.score(x_test, y_test)))

    print(out)
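
For reference, a minimal sketch of the dislib API used above on small in-memory arrays instead of files (shapes, block sizes and hyperparameters are illustrative, not prescriptive):

import numpy as np
import dislib as ds
from dislib.classification import RandomForestClassifier
from pycompss.api.api import compss_wait_on

# Small synthetic dataset split into dislib blocks.
x = ds.array(np.random.rand(100, 4), block_size=(50, 2))
y = ds.array(np.random.randint(0, 2, size=(100, 1)), block_size=(50, 1))

forest = RandomForestClassifier(n_estimators=5)
forest.fit(x, y)
# score() returns a future under PyCOMPSs; wait for the concrete value.
print(compss_wait_on(forest.score(x, y)))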
Example #3
# Imports required by this snippet; the dislib and PyCOMPSs module paths
# are assumed from their public APIs.
import argparse
import csv
import os
import time

import dislib as ds
from dislib.classification import CascadeSVM
from dislib.utils import shuffle
from pycompss.api.api import barrier, compss_wait_on


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--svmlight", help="read files in SVMLight format",
                        action="store_true")
    parser.add_argument("-dt", "--detailed_times",
                        help="get detailed execution times (read and fit)",
                        action="store_true")
    parser.add_argument("-k", "--kernel", metavar="KERNEL", type=str,
                        help="linear or rbf (default is rbf)",
                        choices=["linear", "rbf"], default="rbf")
    parser.add_argument("-a", "--arity", metavar="CASCADE_ARITY", type=int,
                        help="default is 2", default=2)
    parser.add_argument("-b", "--block_size", metavar="BLOCK_SIZE", type=str,
                        help="two comma separated ints that represent the "
                             "size of the blocks in which to divide the input "
                             "data (default is 100,100)",
                        default="100,100")
    parser.add_argument("-i", "--iteration", metavar="MAX_ITERATIONS",
                        type=int, help="default is 5", default=5)
    parser.add_argument("-g", "--gamma", metavar="GAMMA", type=float,
                        help="(only for rbf kernel) default is 1 / n_features",
                        default=None)
    parser.add_argument("-c", metavar="C", type=float, default=1,
                        help="Penalty parameter C of the error term. "
                             "Default:1")
    parser.add_argument("-f", "--features", metavar="N_FEATURES",
                        help="number of features of the input data "
                             "(only for SVMLight files)",
                        type=int, default=None, required=False)
    parser.add_argument("-t", "--test-file", metavar="TEST_FILE_PATH",
                        help="test file path", type=str, required=False)
    parser.add_argument("-o", "--output_file", metavar="OUTPUT_FILE_PATH",
                        help="output file path", type=str, required=False)
    parser.add_argument("--convergence", help="check for convergence",
                        action="store_true")
    parser.add_argument("--dense", help="store data in dense format (only "
                                        "for SVMLight files)",
                        action="store_true")
    parser.add_argument("train_data",
                        help="input file in CSV or SVMLight format", type=str)
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-s", "--shuffle", help="shuffle input data",
                        action="store_true")
    args = parser.parse_args()

    train_data = args.train_data

    s_time = time.time()
    read_time = 0

    if not args.gamma:
        gamma = "auto"
    else:
        gamma = args.gamma

    sparse = not args.dense

    bsize = args.block_size.split(",")
    block_size = (int(bsize[0]), int(bsize[1]))

    if args.svmlight:
        x, y = ds.load_svmlight_file(train_data, block_size, args.features,
                                     sparse)
    else:
        x = ds.load_txt_file(train_data, block_size)
        # The label is the last column (consistent with the test-file branch
        # below); the remaining columns are features.
        y = x[:, x.shape[1] - 1:x.shape[1]]
        x = x[:, :x.shape[1] - 1]

    if args.shuffle:
        x, y = shuffle(x, y)

    if args.detailed_times:
        barrier()
        read_time = time.time() - s_time
        s_time = time.time()

    csvm = CascadeSVM(cascade_arity=args.arity, max_iter=args.iteration,
                      kernel=args.kernel, c=args.c, gamma=gamma,
                      check_convergence=args.convergence, verbose=args.verbose)

    csvm.fit(x, y)

    barrier()
    fit_time = time.time() - s_time

    out = [args.kernel, args.arity, args.block_size, csvm._clf_params["gamma"],
           args.c, csvm.iterations, csvm.converged, read_time, fit_time]

    if os.path.isdir(train_data):
        n_files = os.listdir(train_data)
        out.append(len(n_files))

    if args.test_file:
        if args.svmlight:
            x_test, y_test = ds.load_svmlight_file(args.test_file, block_size,
                                                   args.features,
                                                   sparse)
        else:
            x_test = ds.load_txt_file(args.test_file, block_size)
            y_test = x_test[:, x_test.shape[1] - 1: x_test.shape[1]]
            x_test = x_test[:, :x_test.shape[1] - 1]

        out.append(compss_wait_on(csvm.score(x_test, y_test)))

    if args.output_file:
        with open(args.output_file, "a", newline="") as f:
            wr = csv.writer(f)
            wr.writerow(out)
    else:
        print(out)
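
Likewise, a minimal sketch of calling dislib's CascadeSVM directly on in-memory data (all parameter values are illustrative):

import numpy as np
import dislib as ds
from dislib.classification import CascadeSVM

x = ds.array(np.random.rand(100, 4), block_size=(50, 2))
# Two-class labels as a column vector.
y = ds.array(np.random.choice([-1, 1], size=(100, 1)), block_size=(50, 1))

csvm = CascadeSVM(cascade_arity=2, max_iter=5, c=1.0, gamma=0.1,
                  check_convergence=True)
csvm.fit(x, y)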
Example #4
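    # Method of a pipeline class; assumes `barrier` comes from pycompss.api.api
    # and that bam_merge_2 ... bam_merge_10 are task methods on the same class.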
    def bam_merge(self, in_bam_job_files):
        """
        Wrapper task taking any number of bam files and merging them into a
        single bam file.

        Parameters
        ----------
        in_bam_job_files : list
            Locations of the separate bam files to be merged. The first file
            in the list is used to derive the output file name.
        """
        merge_round = -1

        bam_job_files = list(in_bam_job_files)  # work on a copy; consumed below
        while True:
            merge_round += 1
            if len(bam_job_files) > 1:
                tmp_alignments = []

                if bam_job_files:
                    while len(bam_job_files) >= 10:
                        current_list_len = len(bam_job_files)
                        for i in range(0, current_list_len - 9, 10):  # pylint: disable=unused-variable
                            bam_out = bam_job_files[0] + "_merge_" + str(
                                merge_round) + ".bam"
                            tmp_alignments.append(bam_out)

                            self.bam_merge_10(bam_job_files.pop(0),
                                              bam_job_files.pop(0),
                                              bam_job_files.pop(0),
                                              bam_job_files.pop(0),
                                              bam_job_files.pop(0),
                                              bam_job_files.pop(0),
                                              bam_job_files.pop(0),
                                              bam_job_files.pop(0),
                                              bam_job_files.pop(0),
                                              bam_job_files.pop(0), bam_out)

                    bam_out = bam_job_files[0] + "_merge_" + str(
                        merge_round) + ".bam"
                    if len(bam_job_files) >= 5:
                        tmp_alignments.append(bam_out)
                        self.bam_merge_5(bam_job_files.pop(0),
                                         bam_job_files.pop(0),
                                         bam_job_files.pop(0),
                                         bam_job_files.pop(0),
                                         bam_job_files.pop(0), bam_out)
                        bam_out = bam_job_files[0] + "_merge_" + str(
                            merge_round) + ".bam"

                    if len(bam_job_files) == 4:
                        tmp_alignments.append(bam_out)
                        self.bam_merge_4(bam_job_files.pop(0),
                                         bam_job_files.pop(0),
                                         bam_job_files.pop(0),
                                         bam_job_files.pop(0), bam_out)
                    elif len(bam_job_files) == 3:
                        tmp_alignments.append(bam_out)
                        self.bam_merge_3(bam_job_files.pop(0),
                                         bam_job_files.pop(0),
                                         bam_job_files.pop(0), bam_out)
                    elif len(bam_job_files) == 2:
                        tmp_alignments.append(bam_out)
                        self.bam_merge_2(bam_job_files.pop(0),
                                         bam_job_files.pop(0), bam_out)
                    else:
                        tmp_alignments.append(bam_job_files[0])

                barrier()

                bam_job_files = list(tmp_alignments)

            else:
                break

        return bam_job_files[0]
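
A hedged usage sketch of the merge cascade; `aligner` and the file names are placeholders for an instance of the enclosing class and real BAM paths:

# Hypothetical call; four inputs collapse to one merged file in a single round.
merged_bam = aligner.bam_merge(["sample_0.bam", "sample_1.bam",
                                "sample_2.bam", "sample_3.bam"])
print(merged_bam)  # e.g. "sample_0.bam_merge_0.bam"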
Example #5
# Imports required by this snippet; the dislib and PyCOMPSs module paths
# are assumed from their public APIs.
import argparse
import time

import dislib as ds
from dislib.cluster import KMeans
from pycompss.api.api import barrier


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--svmlight", help="read files in SVMLight format",
                        action="store_true")
    parser.add_argument("-dt", "--detailed_times",
                        help="get detailed execution times (read and fit)",
                        action="store_true")
    parser.add_argument("-a", "--arity", metavar="CASCADE_ARITY", type=int,
                        help="default is 50", default=50)
    parser.add_argument("-c", "--centers", metavar="N_CENTERS", type=int,
                        help="default is 2", default=2)
    parser.add_argument("-b", "--block_size", metavar="BLOCK_SIZE", type=str,
                        help="two comma separated ints that represent the "
                             "size of the blocks in which to divide the input "
                             "data (default is 100,100)",
                        default="100,100")
    parser.add_argument("-i", "--iteration", metavar="MAX_ITERATIONS",
                        type=int, help="default is 5", default=5)
    parser.add_argument("-f", "--features", metavar="N_FEATURES",
                        help="number of features of the input data "
                             "(only for SVMLight files)",
                        type=int, default=None, required=False)
    parser.add_argument("--dense", help="store data in dense format (only "
                                        "for SVMLight files)",
                        action="store_true")
    parser.add_argument("--labeled", help="the last column of the input file "
                                          "represents labels (only for text "
                                          "files)",
                        action="store_true")
    parser.add_argument("train_data",
                        help="input file in CSV or SVMLight format", type=str)
    args = parser.parse_args()

    train_data = args.train_data

    s_time = time.time()
    read_time = 0

    sparse = not args.dense

    bsize = args.block_size.split(",")
    block_size = (int(bsize[0]), int(bsize[1]))

    if args.svmlight:
        x, y = ds.load_svmlight_file(train_data, block_size, args.features,
                                     sparse)
    else:
        x = ds.load_txt_file(train_data, block_size)

    n_features = x.shape[1]

    if args.labeled and not args.svmlight:
        x = x[:, :n_features - 1]

    if args.detailed_times:
        barrier()
        read_time = time.time() - s_time
        s_time = time.time()

    kmeans = KMeans(n_clusters=args.centers, max_iter=args.iteration,
                    arity=args.arity, verbose=True)
    kmeans.fit(x)

    barrier()
    fit_time = time.time() - s_time

    out = [args.centers, args.arity, args.block_size, read_time, fit_time]

    print(out)
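
And a minimal sketch of dislib's KMeans on in-memory data (shapes and parameters are illustrative):

import numpy as np
import dislib as ds
from dislib.cluster import KMeans

x = ds.array(np.random.rand(100, 4), block_size=(50, 2))
kmeans = KMeans(n_clusters=3, max_iter=5, arity=50)
kmeans.fit(x)
# The fitted centres are then available on the estimator (kmeans.centers in
# dislib; attribute name assumed).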
Example #6
def main():
    import errno
    import os
    import sys
    import time
    from functools import reduce  # reduce is not a builtin in Python 3

    from pycompss.api.api import barrier, compss_wait_on

    # usage (NUM_BUCKETS is optional and defaults to 2 * NUM_PROCESSES)
    if len(sys.argv) not in (8, 9):
        print("Usage: {} BWA_DB_FILE CONTIG_FILE REFERENCE_FILE "
              "REFERENCE_INDEX_FILE INPUT_DIR WORK_DIR NUM_PROCESSES "
              "[NUM_BUCKETS=2*NUM_PROCESSES]\n\n"
              "Program name must be called with an absolute path "
              "(starting with '/').".format(sys.argv[0]))
        return 1

    # find program directory and basenames
    cmd_dir = os.path.dirname(sys.argv[0])
    if cmd_dir == "" or cmd_dir[0] != '/':
        print(
            "Program must be called with an absolute path (starting with '/')")
        return 1
    prog_basename = os.path.basename(os.path.splitext(sys.argv[0])[0])

    # read inputs
    bwa_db_file = sys.argv[1]
    contig_file = sys.argv[2]
    ref_file = sys.argv[3]
    ref_idx_file = sys.argv[4]
    in_dir_prefix = sys.argv[5]
    work_dir = sys.argv[6]
    num_processes = int(sys.argv[7])
    # NUM_BUCKETS is optional; default to twice the number of processes.
    num_buckets = int(sys.argv[8]) if len(sys.argv) > 8 else 2 * num_processes

    # setup directories
    in_dirs = [in_dir_prefix + '/' + str(x) for x in range(num_processes)]

    out_dir = "{}/{}_OUT".format(work_dir, prog_basename)
    try:
        os.makedirs(out_dir, mode=0o700)
    except OSError as e:
        if e.errno != errno.EEXIST:
            print("Failed to create Directory[{}].\n".format(out_dir))
            raise

    start_time = time.time()

    # mapping & merge
    inputs = []
    for in_dir in in_dirs:
        exts = {os.path.splitext(file1)[1] for file1 in os.listdir(in_dir)}
        for ext in exts:
            elem = [
                in_dir + '/' + f for f in os.listdir(in_dir) if f.endswith(ext)
            ]
            elem.sort()
            inputs.append(elem)
    # inputs = [[in_dir+'part_1.'+i, in_dir+'part_2.'+i] for i in range(num_processes)]
    #        ~ [[part_1.0, part_2.0], [part_1.1, part_2.1], ...]

    print("Inputs: ", str(inputs))  # dbg
    contigs = reduce(
        lambda e1, e2: mapping_merge(e1, cmd_dir, bwa_db_file, contig_file, e2),
        inputs, {})
    print("before compss_wait_on")  # dbg
    contigs = compss_wait_on(contigs)
    print("after compss_wait_on")  # dbg
    with open('output.dict', 'w') as f:  # dbg
        f.write(str(contigs))  # dbg
    buckets = split(num_buckets, contigs)

    # rm_dup & analyze
    tar_file = init_tar(out_dir)
    reduce(
        lambda tar_file1, bucket: rmdup_analyze_tar(
            cmd_dir, ref_idx_file, ref_file, bucket, tar_file1), buckets,
        tar_file)
    print("before barrier")  # dbg
    barrier()
    print("after barrier")  # dbg

    print("NGSA-mini-py with {} processes. Ellapsed Time {} (s)".format(
        num_processes,
        time.time() - start_time))
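
Both the mapping/merge and the rm_dup/analyze stages above rely on the same functools.reduce pattern: thread an accumulator through a list of task invocations. A minimal self-contained sketch of that pattern, with a plain function standing in for the COMPSs task:

from functools import reduce

def merge(acc, item):
    # Stand-in for mapping_merge: fold one element into the accumulator.
    acc = dict(acc)
    acc[item] = acc.get(item, 0) + 1
    return acc

result = reduce(merge, ["chr1", "chr2", "chr1"], {})
print(result)  # {'chr1': 2, 'chr2': 1}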