Beispiel #1
0
    def test_azure_p2p_multiproc(self):
        """Check that map tasks run through LocalMultiProc can load a file saved via AzureP2P.

        A small text file is saved to the store, the directory returned by
        ``storage.file_share.local_lambda()[1]`` is deleted, and then four
        map tasks each assert that the file still loads with its original
        content.
        """
        from pysnptools.util.mapreduce1.mapreduce import map_reduce

        logging.info("test_azure_p2p_multiproc")
        # When debugging, Local() or LocalMultiThread(3, just_one_process=False)
        # can be swapped in here.
        runner = LocalMultiProc(3, just_one_process=False)

        storage = AzureP2P("/flstor/testazurep2p/multiproc",
                           local_lambda=ip_address_pid_local)
        storage.rmtree()
        storage.save("a/b/c.txt", "Hello")
        # Remove the main file share copy (presumably so the loads below
        # cannot be served from it -- confirm against AzureP2P).
        shutil.rmtree(storage.file_share.local_lambda()[1])

        def mapper_closure(task_id):
            # The input value itself is not needed; every task performs the same check.
            assert storage.load("a/b/c.txt") == "Hello"
            return True

        result = map_reduce(range(4), mapper=mapper_closure, runner=runner)

        logging.info(result)
        logging.info("done with test")
Beispiel #2
0
    def too_slow_test_peertopeer(self):
        """Run ``single_snp_scale`` twice against PeerToPeer storage and compare each result.

        The test SNPs are first written as a DistributedBed inside the
        storage. The scan is then run once after the 'cache' area has been
        removed and once more without clearing it; both results are compared
        against the "old" reference.
        """
        logging.info("test_peertopeer")

        results_path = self.file_name("peertopeer")

        def id_and_path_function():
            from pysnptools.util.filecache import ip_address_pid
            unique_id = ip_address_pid()
            # The 'cache_top' has to be spelled out here explicitly.
            return unique_id, 'peertopeer/{0}'.format(unique_id)

        storage = PeerToPeer(common_directory='peertopeer/common',
                             id_and_path_function=id_and_path_function)
        snps_store = storage.join('test_snps')
        snps_store.rmtree()
        test_snps = DistributedBed.write(snps_store,
                                         self.bed,
                                         piece_per_chrom_count=2)

        # Run on 5 additional Python processes
        runner = LocalMultiProc(taskcount=5)

        def scan_and_compare():
            # One full scan followed by the comparison with the stored reference.
            results_df = single_snp_scale(test_snps=test_snps,
                                          pheno=self.phen_fn,
                                          covar=self.cov_fn,
                                          cache=storage.join('cache'),
                                          output_file_name=results_path,
                                          runner=runner)
            self.compare_files(results_df, "old")

        # First pass: start from a removed cache.
        storage.join('cache').rmtree()
        scan_and_compare()
        # Second pass: the cache is left exactly as the first pass left it.
        scan_and_compare()
Beispiel #3
0
    def test_localinmultiproc_with_weights(self):
        """Check that LocalMultiProc honours ``weights`` and ``taskindex_to_environ``.

        Every mapper call returns the TEST_ENVIRON value of the process it
        runs in, and the reducer adds its own TEST_ENVIRON value to the sum.
        With 100 inputs and weights [1, 97, 1, 1] the expected total is
        0*1 + 1*97 + 2*1 + 3*1 plus 4 for the reducer.
        """
        from pysnptools.util.mapreduce1 import map_reduce
        from pysnptools.util.mapreduce1.runner import LocalMultiProc

        def run_job(item_count, runner):
            def env_mapper(_item):
                return int(os.environ['TEST_ENVIRON'])

            def env_reducer(values):
                return sum(values) + int(os.environ['TEST_ENVIRON'])

            return map_reduce(range(item_count),
                              mapper=env_mapper,
                              reducer=env_reducer,
                              runner=runner)

        weights = [1, 97, 1, 1]

        def taskindex_to_environ(taskindex):
            return {'TEST_ENVIRON': str(taskindex)}

        runner = LocalMultiProc(4,
                                weights=weights,
                                taskindex_to_environ=taskindex_to_environ)
        # The variable must only exist inside the worker environments.
        assert 'TEST_ENVIRON' not in os.environ
        result = run_job(100, runner)
        # Mapper contributions (taskindex * weight) plus the reducer's own value, 4.
        expected = sum(taskindex * weight
                       for taskindex, weight in enumerate(weights)) + 4
        assert result == expected
    def too_slow_test_notebook(self):
        """Run ``single_snp_all_plus_select`` on the chromosome-5 SNPs and compare with the "notebook" reference.

        The whole synthetic Bed file is passed as ``G`` while only the SNPs
        whose first ``pos`` column equals 5 are tested. No covariate file is
        passed to the call.
        """
        do_plot = False
        runner = LocalMultiProc(multiprocessing.cpu_count(), mkl_num_threads=2)
        output_file_name = self.file_name("notebook")

        logging.info("TestSingleSnpAllPlusSelect test_notebook")
        # define file names
        snp_reader = Bed(self.pythonpath + "/tests/datasets/synth/all.bed",
                         count_A1=False)
        pheno_fn = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"

        # find the chr5 SNPs
        test_snps = snp_reader[:, snp_reader.pos[:, 0] == 5]

        #select the 2nd kernel and run GWAS
        results = single_snp_all_plus_select(test_snps=test_snps,
                                             G=snp_reader,
                                             pheno=pheno_fn,
                                             GB_goal=2,
                                             do_plot=do_plot,
                                             output_file_name=output_file_name,
                                             runner=runner,
                                             count_A1=False)

        self.compare_files(results, "notebook")
    def too_slowtest_two(self):
        """Run ``single_snp_all_plus_select`` on chromosome 5 with and without a ``GB_goal``.

        This is a rather big test case. The same analysis is run twice,
        first with ``GB_goal=None`` and then with ``GB_goal=2``, and each
        result is compared against the "two" reference.
        """
        from pysnptools.util.mapreduce1.runner import LocalMultiProc
        logging.info("TestSingleSnpAllPlusSelect test_two")
        do_plot = False

        bed_fn = self.pythonpath + "/tests/datasets/synth/all.bed"
        pheno_fn = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"
        cov_fn = self.pythonpath + "/tests/datasets/synth/cov.txt"

        # partition snps on chr5 vs rest
        test_chr = 5
        snp_reader = Bed(bed_fn, count_A1=False)
        test_snps = snp_reader[:, snp_reader.pos[:, 0] == test_chr]
        runner = LocalMultiProc(multiprocessing.cpu_count(), mkl_num_threads=2)

        output_file_name = self.file_name("two")
        for GB_goal in [None, 2]:
            results = single_snp_all_plus_select(
                test_snps=test_snps,
                G=bed_fn,
                pheno=pheno_fn,
                covar=cov_fn,
                # Seven candidate sizes spaced logarithmically (base 2) from 2**0 to 2**7.
                k_list=[int(k) for k in np.logspace(0, 7, base=2, num=7)],
                n_folds=7,
                seed=42,
                do_plot=do_plot,
                GB_goal=GB_goal,
                output_file_name=output_file_name,
                runner=runner,
                count_A1=False)
            logging.info(results.head())
            self.compare_files(results, "two")
Beispiel #6
0
    def test_mapreduce1_runner(self):
        """Run ``single_snp_scale`` through a LocalMultiProc runner and compare with the "old" reference.

        The runner is created with ``taskcount=4`` and ``just_one_process=True``.
        """
        logging.info("test_mapreduce1_runner")

        results_path = self.file_name("mapreduce1_runner")
        four_task_runner = LocalMultiProc(taskcount=4, just_one_process=True)
        scan_results = single_snp_scale(
            test_snps=self.bed,
            pheno=self.phen_fn,
            covar=self.cov_fn,
            output_file_name=results_path,
            runner=four_task_runner,
        )
        self.compare_files(scan_results, "old")
Beispiel #7
0
    def test_localmultiproc(self):
        """Sum the squares of 0..99 with ``map_reduce`` on ``LocalMultiProc(4)``."""
        from pysnptools.util.mapreduce1 import map_reduce
        from pysnptools.util.mapreduce1.runner import LocalMultiProc

        def sum_of_squares(count, runner):
            def square(value):
                return value * value

            def add_up(values):
                return sum(values)

            return map_reduce(range(count),
                              mapper=square,
                              reducer=add_up,
                              runner=runner)

        total = sum_of_squares(100, LocalMultiProc(4))
        # 0**2 + 1**2 + ... + 99**2
        assert total == 328350
    def too_slow_test_three(self):
        """Run ``single_snp_all_plus_select`` on the whole synthetic Bed file and compare with the "three" reference.

        Unlike the chromosome-restricted variants, every SNP in the file is
        tested and no separate ``G`` is passed.
        """
        logging.info("TestSingleSnpAllPlusSelect test_three")

        dataset_dir = self.pythonpath + "/tests/datasets/synth"
        all_snps = Bed(dataset_dir + "/all.bed", count_A1=False)
        pheno_path = dataset_dir + "/pheno_10_causals.txt"
        covar_path = dataset_dir + "/cov.txt"
        runner = LocalMultiProc(multiprocessing.cpu_count(), mkl_num_threads=2)

        results_path = self.file_name("three")
        # Seven candidate sizes spaced logarithmically (base 2) from 2**0 to 2**7.
        candidate_ks = [int(k) for k in np.logspace(0, 7, base=2, num=7)]
        results = single_snp_all_plus_select(
            test_snps=all_snps,
            pheno=pheno_path,
            covar=covar_path,
            k_list=candidate_ks,
            n_folds=7,
            seed=42,
            do_plot=False,
            GB_goal=2,
            output_file_name=results_path,
            runner=runner,
            count_A1=False)
        logging.info(results)
        self.compare_files(results, "three")
Beispiel #9
0
                pairs = _Pairs(synbed_part_i) if i==j else _Pairs(synbed_part_i,synbed_part_j)
                #print(pairs.iid)
                print('{:,}'.format(pairs.sid_count))
                #print(pairs.sid)
                #print(pairs.pos)
                #print(pairs.row_property)
                snpdata = pairs.read()#
                #print(snpdata.val)

    import datetime
    from pysnptools.kernelreader import SnpKernel
    from pysnptools.standardizer import Unit
    from pysnptools.util.mapreduce1.runner import LocalMultiProc
    from pysnptools.util.mapreduce1 import map_reduce
    #runner=None
    runner = LocalMultiProc(1,just_one_process=False)

    part_pair_count = (part_count*part_count+part_count)//2
    part_pair_index = -1
    print("part_pair_count={0:,}".format(part_pair_count))

    K0 = SnpKernel(synbed,standardizer=Unit()).read() #Precompute the similarity

    start_time = datetime.datetime.now()
    for i,part_i in enumerate(part_list):
        def mapper1(j):
            #from fastlmm.association import single_snp
            #from pysnptools.snpreader import Pairs
            #print('Z')
            #part_j = part_list[j]
            #print('A')
Beispiel #10
0
        fastlmm.association.tests.testepistasis.getTestSuite(),
        fastlmm.association.tests.test_heritability_spatial_correction.
        getTestSuite(),
        fastlmm.util.test.getTestSuite(),
        fastlmm.inference.tests.test.getTestSuite(),
        fastlmm.association.tests.test_single_snp.getTestSuite(),
        fastlmm.association.tests.test_single_snp_linreg.getTestSuite(),
    ])

    if True:  #Standard test run
        r = unittest.TextTestRunner(failfast=False)
        r.run(suites)
    else:  #Cluster test run
        #Because both pysnptools and fastlmm contain a tests folder, to run on cluster must have fastlmm listed first in the PYTHONPATH

        runner = Local()
        runner = LocalMultiProc(taskcount=6,
                                mkl_num_threads=5,
                                just_one_process=False)
        #runner = LocalInParts(1,2,mkl_num_threads=1) # For debugging the cluster runs
        distributable_test = DistributableTest(suites, "temp_test")
        runner.run(distributable_test)

    debian_count = len(os.listdir('expected-debian'))
    if debian_count > 0:
        logging.warning(
            "The tests contain {0} expected-results files that differ between Windows and Debian"
            .format(debian_count))

    logging.info("done with testing")
Beispiel #11
0
def mf_to_runner_function(mf):
    """Map a short configuration name to a function that builds a runner.

    :param mf: Name of the runner configuration. Accepted values are
        "debug", "local", "local1", "lmp", "lmt", "lmtl", "lmp4", "lmpl",
        "nodeP", "nodeP99", "nodeL99", "socketP", "coreP", "coreP99",
        "coreAz", "nodeE", "50tasks", "coreE", "nodeA", "socketA", "coreA",
        "nodeH" and "coreH".
    :return: A one-argument callable. It is called with the task count and
        returns a new runner. The "debug", "local", "local1", "lmp", "lmt",
        "lmtl" and "lmp4" configurations ignore the argument, "50tasks"
        always asks for 50 tasks, and the remaining cluster configurations
        cap the task count at a per-configuration limit.
    :raises ValueError: If ``mf`` is not one of the accepted names.

    The runner classes are only looked up when the returned callable is
    invoked, not when this function is called.
    """
    # Names of cluster nodes the HPC runners should avoid; none at present.
    excluded_nodes = []
    rr1_share = r"\\GCR\Scratch\RR1\escience"
    remote_python_parent = r"\\GCR\Scratch\RR1\escience\carlk\data\carlk\pythonpath10262016"
    clean_up = False

    def gcr_hpc(task_limit, unit, **options):
        # Builds a factory for an HPC runner on the 'GCR' cluster that uses
        # the RR1 share; unit is one of 'core', 'socket' or 'node'.
        return lambda taskcount: HPC(
            min(taskcount, task_limit),
            'GCR',
            rr1_share,
            remote_python_parent=remote_python_parent,
            unit=unit,
            update_remote_python_parent=True,
            clean_up=clean_up,
            **options)

    def hadoop(mapmemory, mkl_num_threads):
        # Builds a factory for a Hadoop2 runner; only the mapper memory and
        # the MKL thread count differ between the Hadoop configurations.
        return lambda taskcount: Hadoop2(min(taskcount, 100000),
                                         mapmemory=mapmemory,
                                         reducememory=8 * 1024,
                                         min_alloc=2048,
                                         xmx=3072,
                                         mkl_num_threads=mkl_num_threads,
                                         queue="shared",
                                         skipdatacheck=True,
                                         skipsourcecheck=True)

    # Option groups shared by several HPC configurations. Runtime strings
    # use the day:hour:min format.
    preemptable = dict(template="Preemptable",
                       priority="Lowest",
                       excluded_nodes=excluded_nodes,
                       runtime="0:11:0")
    express = dict(template="ExpressQ", priority="Normal", runtime="0:4:0")
    admin = dict(template="Admin Template")

    factories = {
        # Local runners
        "debug": lambda ignore: LocalInParts(
            215,
            215,
            mkl_num_threads=20,
            result_file="result.p",
            run_dir=r"C:\deldir\test\outputx"),
        "local": lambda ignore: Local(),
        "local1": lambda ignore: Local(1),
        "lmp": lambda ignore: LocalMultiProc(22, 5),
        "lmt": lambda ignore: LocalMultiThread(22, 5),
        "lmtl": lambda ignore: LocalMultiThread(
            22, 5, just_one_process=True),
        "lmp4": lambda ignore: LocalMultiProc(4, 5),
        "lmpl": lambda taskcount: LocalMultiProc(
            taskcount, taskcount, just_one_process=True),
        # HPC, "Preemptable" and "LongRunQ" templates at lowest priority
        "nodeP": gcr_hpc(30100, 'node',
                         nodegroups="Preemptable",
                         **preemptable),
        "nodeP99": gcr_hpc(30100, 'node',
                           nodegroups="Preemptable,B99",
                           **preemptable),
        "nodeL99": gcr_hpc(30100, 'node',
                           template="LongRunQ",
                           priority="Lowest",
                           excluded_nodes=excluded_nodes,
                           nodegroups="LongRunQ,B99",
                           runtime="11:0:0"),
        "socketP": gcr_hpc(30100, 'socket',
                           mkl_num_threads=10,
                           nodegroups="Preemptable",
                           **preemptable),
        "coreP": gcr_hpc(1000, 'core',
                         mkl_num_threads=1,
                         nodegroups="Preemptable",
                         min=1,
                         max=200 * 20,
                         **preemptable),
        "coreP99": gcr_hpc(1000, 'core',
                           mkl_num_threads=1,
                           nodegroups="Preemptable,B99",
                           min=1,
                           max=200 * 20,
                           **preemptable),
        # HPC on the AZ-USCentral share, which has its own python parent
        "coreAz": lambda taskcount: HPC(
            min(taskcount, 1000),
            'GCR',
            r"\\GCR\Scratch\AZ-USCentral\escience",
            remote_python_parent=
            r"\\GCR\Scratch\AZ-USCentral\escience\carlk\data\carlk\pythonpath",
            unit='core',
            update_remote_python_parent=True,
            template="Azure IaaS USCentral",
            mkl_num_threads=1,
            runtime="0:8:0",
            clean_up=clean_up,
        ),
        # HPC, "ExpressQ" template
        "nodeE": gcr_hpc(10100, 'node', **express),
        # Always 50 tasks, regardless of the task count passed in.
        "50tasks": lambda taskcount: HPC(
            50,
            'GCR',
            rr1_share,
            remote_python_parent=remote_python_parent,
            unit='node',
            update_remote_python_parent=True,
            template="ExpressQ",
            priority="Normal",
            runtime="0:4:0",
            clean_up=clean_up,
        ),
        "coreE": gcr_hpc(10100, 'core', mkl_num_threads=1, **express),
        # HPC, "Admin Template"
        "nodeA": gcr_hpc(30100, 'node', **admin),
        "socketA": gcr_hpc(30100, 'socket', **admin),
        "coreA": gcr_hpc(30100, 'core', **admin),
        # Hadoop
        "nodeH": hadoop(mapmemory=58 * 1024, mkl_num_threads=14),
        "coreH": hadoop(mapmemory=8 * 1024, mkl_num_threads=1),
    }

    try:
        runner_function = factories.get(mf)
    except TypeError:
        # An unhashable selector cannot name any configuration.
        runner_function = None
    if runner_function is None:
        # format() rather than "+" so that a non-string selector still
        # produces this error instead of a TypeError from concatenation.
        raise ValueError("don't find mf={0}".format(mf))
    return runner_function
Beispiel #12
0
def one_experiment(
    test_snps,
    K0_goal,
    seed,
    pheno,
    covar,
    leave_out_one_chrom,
    use_gpu,
    proc_count,
    GB_goal,
    just_one_process=False,
    cpu_weight=1,
    gpu_weight=1,
    gpu_count=1,
    test_case="?",
    force=None,
    two_ks=False,
):
    """Time one ``single_snp`` run and return a dict describing the run.

    :param test_snps: SNP reader to test; also the source of the K0 SNPs.
    :param K0_goal: Approximate number of SNPs to use for K0, chosen by
        taking every ``sid_count // K0_goal``-th SNP. ``None`` passes
        ``K0=None`` to ``single_snp``.
    :param seed: Only recorded in the result.
    :param pheno: Passed through to ``single_snp``.
    :param covar: Passed through to ``single_snp``; its ``col_count`` is recorded.
    :param leave_out_one_chrom: Passed through to ``single_snp``.
    :param use_gpu: 0 selects "numpy"; otherwise "cupy" is requested.
    :param proc_count: 1 runs without a runner; any other value runs on a
        ``LocalMultiProc`` runner with that many tasks.
    :param GB_goal: Passed through to ``single_snp``.
    :param just_one_process: Passed to ``LocalMultiProc``.
    :param cpu_weight: Weight of each CPU task when GPU tasks are mixed in.
    :param gpu_weight: Weight of each GPU task.
    :param gpu_count: Number of tasks (the lowest task indexes) that get the
        "cupy" environment; must not exceed ``proc_count``.
    :param test_case: Free-form label, only recorded in the result.
    :param force: ``None``, "full_rank" or "low_rank".
    :param two_ks: If true, ``test_snps`` is also passed as ``K1``.
    :return: dict of run settings plus the elapsed wall time under "time (s)".
    :raises ValueError: If ``force`` is not an accepted value, or if
        ``gpu_count`` exceeds ``proc_count`` on the multi-process GPU path.
    """
    # Validate the cheap argument first so a bad value fails before any
    # import or computation. A raise (rather than an assert) also keeps the
    # check alive under "python -O".
    if force not in (None, "full_rank", "low_rank"):
        raise ValueError(
            "force must be None, 'full_rank' or 'low_rank', not {0!r}".format(
                force))
    force_full_rank = force == "full_rank"
    force_low_rank = force == "low_rank"

    import numpy as np
    from pysnptools.util.mapreduce1.runner import LocalMultiProc
    from fastlmm.association import single_snp

    if K0_goal is not None:
        # Clamp the stride to 1: a goal larger than the number of SNPs would
        # otherwise give a slice step of 0, which is always an error.
        step = max(1, test_snps.sid_count // K0_goal)
        K0 = test_snps[:, ::step]
    else:
        K0 = None

    if proc_count == 1:
        runner = None
        xp = "cupy" if use_gpu > 0 else "numpy"
    else:
        if use_gpu == 0:
            runner = LocalMultiProc(proc_count,
                                    just_one_process=just_one_process)
            xp = "numpy"
        else:
            if gpu_count > proc_count:
                raise ValueError(
                    "gpu_count ({0}) must not exceed proc_count ({1})".format(
                        gpu_count, proc_count))
            # The first gpu_count tasks are GPU tasks, the rest CPU tasks.
            weights = [gpu_weight] * gpu_count + [cpu_weight
                                                  ] * (proc_count - gpu_count)

            def taskindex_to_environ(taskindex):
                if taskindex < gpu_count:
                    return {
                        "ARRAY_MODULE": "cupy",
                        "GPU_INDEX": str(taskindex)
                    }
                else:
                    return {"ARRAY_MODULE": "numpy"}

            xp = "cupy"
            runner = LocalMultiProc(
                proc_count,
                weights=weights,
                taskindex_to_environ=taskindex_to_environ,
                just_one_process=just_one_process,
            )

    start_time = time.time()

    # The returned dataframe is not used; only the elapsed time is reported.
    single_snp(
        K0=K0,
        K1=test_snps if two_ks else None,
        test_snps=test_snps,
        pheno=pheno,
        covar=covar,
        leave_out_one_chrom=leave_out_one_chrom,
        count_A1=False,
        GB_goal=GB_goal,
        runner=runner,
        xp=xp,
        force_full_rank=force_full_rank,
        force_low_rank=force_low_rank,
    )
    delta_time = time.time() - start_time

    K0_count = test_snps.iid_count if K0 is None else K0.sid_count

    perf_result = {
        "computer_name": os.environ.get("COMPUTERNAME", "<unknown>"),
        "cpu_count": multiprocessing.cpu_count(),
        # NOTE(review): placeholder string; the gpu_count argument is not
        # recorded here -- confirm whether that is intended.
        "gpu_count": "1?",
        "test_case": test_case,
        "linked": "MKL?/OpenBLAS?",
        "test_snps": str(test_snps),
        "iid_count": test_snps.iid_count,
        "test_sid_count": test_snps.sid_count,
        "low_rank": 0 if test_snps.iid_count == K0_count else 1,
        "K0_count": K0_count,
        "seed": seed,
        "chrom_count": len(np.unique(test_snps.pos[:, 0])),
        "covar_count": covar.col_count,
        "leave_out_one_chrom": 1 if leave_out_one_chrom else 0,
        "num_threads": os.environ.get("MKL_NUM_THREADS", "<none>"),
        "use_gpu": use_gpu,
        "cpu_weight": cpu_weight,
        "gpu_weight": gpu_weight,
        "proc_count": proc_count,
        "just_one_process": 1 if just_one_process else 0,
        "GB_goal": GB_goal,
        "time (s)": delta_time,
        "force": force if force is not None else "<none>",
        "two_ks": 1 if two_ks else 0,
    }
    return perf_result
Beispiel #13
0
def getTestSuite():
    """
    Build and return the composite suite holding every TestSingleSnpScale test
    """
    loader = unittest.TestLoader()
    scale_tests = loader.loadTestsFromTestCase(
        TestSingleSnpScale)  #Tests Ludicrous Speed GWAS
    composite = unittest.TestSuite()
    composite.addTests(scale_tests)
    return composite


if __name__ == '__main__':
    # Root logger at WARNING keeps the console quiet; switch to
    # logging.basicConfig(level=logging.INFO) for more detail.
    logging.getLogger().setLevel(logging.WARNING)
    suites = getTestSuite()

    if True:  # standard in-process run; change to False for the runner-based run below
        outcome = unittest.TextTestRunner(failfast=False).run(suites)
        assert outcome.wasSuccessful()
    else:  #runner test run
        logging.basicConfig(level=logging.INFO)

        from pysnptools.util.mapreduce1.distributabletest import DistributableTest
        runner = LocalMultiProc(taskcount=22,
                                mkl_num_threads=5,
                                just_one_process=False)
        print(runner.run(DistributableTest(suites, "temp_test")))

    logging.info("done")