def test_azure_p2p_multiproc(self):
    from pysnptools.util.mapreduce1.mapreduce import map_reduce
    import threading
    logging.info("test_azure_p2p_multiproc")
    runner = LocalMultiProc(3, just_one_process=False)  # Local()
    #runner = LocalMultiThread(3, just_one_process=False)
    storage = AzureP2P("/flstor/testazurep2p/multiproc", local_lambda=ip_address_pid_local)
    #storage = AzureStorage("test/multiproc", default_shared_dir_lambda=closure)
    #storage = PeerToPeer(directory=AzureStorage("test/multiproc/directory", default_shared_dir_lambda=lambda: closure()+"/azure"),
    #                     storage_lambda=lambda: closure()+"/storage",
    #                     unique_name=lambda: "{0}.{1}".format(os.environ['COMPUTERNAME'], os.getpid()))
    storage.rmtree()
    storage.save("a/b/c.txt", "Hello")
    shutil.rmtree(storage.file_share.local_lambda()[1])  # Remove the main file-share copy

    def mapper_closure(id):
        assert storage.load("a/b/c.txt") == "Hello"
        return True

    result = map_reduce(range(4), mapper=mapper_closure, runner=runner)
    logging.info(result)
    logging.info("done with test")
def too_slow_test_peertopeer(self):
    logging.info("test_peertopeer")
    output_file = self.file_name("peertopeer")

    def id_and_path_function():
        from pysnptools.util.filecache import ip_address_pid
        ip_pid = ip_address_pid()
        # Need to put the 'cache_top' here explicitly.
        return ip_pid, 'peertopeer/{0}'.format(ip_pid)

    storage = PeerToPeer(common_directory='peertopeer/common',
                         id_and_path_function=id_and_path_function)
    test_snps_cache = storage.join('test_snps')
    test_snps_cache.rmtree()
    test_snps = DistributedBed.write(test_snps_cache, self.bed, piece_per_chrom_count=2)
    runner = LocalMultiProc(taskcount=5)  # Run on 5 additional Python processes
    for clear_cache in (True, False):
        if clear_cache:
            storage.join('cache').rmtree()
        results_df = single_snp_scale(test_snps=test_snps, pheno=self.phen_fn,
                                      covar=self.cov_fn, cache=storage.join('cache'),
                                      output_file_name=output_file, runner=runner)
        self.compare_files(results_df, "old")
def test_localinmultiproc_with_weights(self):
    from pysnptools.util.mapreduce1 import map_reduce
    from pysnptools.util.mapreduce1.runner import LocalMultiProc

    def holder1(n, runner):
        def mapper1(x):
            return int(os.environ['TEST_ENVIRON'])

        def reducer1(sequence):
            return sum(sequence) + int(os.environ['TEST_ENVIRON'])

        return map_reduce(range(n), mapper=mapper1, reducer=reducer1, runner=runner)

    weights = [1, 97, 1, 1]

    def taskindex_to_environ(taskindex):
        return {'TEST_ENVIRON': str(taskindex)}

    runner = LocalMultiProc(4, weights=weights, taskindex_to_environ=taskindex_to_environ)
    assert 'TEST_ENVIRON' not in os.environ
    result = holder1(100, runner)
    # Tasks 0, 2, and 3 each map one input; task 1 maps 97 inputs; the reducer
    # runs as task index 4 and adds its own TEST_ENVIRON value.
    assert result == 0 * 1 + 1 * 97 + 2 * 1 + 3 * 1 + 4
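# The assertion above depends on how the 100 inputs are divided among the four
# weighted tasks. Below is a minimal standalone sketch of that division; it
# assumes LocalMultiProc splits inputs contiguously in proportion to `weights`,
# and the helper `split_by_weights` is hypothetical (not part of pysnptools).
def split_by_weights(n, weights):
    """Number of inputs assigned to each task when n inputs are split in
    proportion to weights."""
    total = sum(weights)
    bounds = [round(n * sum(weights[:i + 1]) / total) for i in range(len(weights))]
    return [bounds[0]] + [b - a for a, b in zip(bounds, bounds[1:])]

# With weights [1, 97, 1, 1], tasks 0..3 map 1, 97, 1, and 1 inputs. Each mapper
# returns its task index, so the per-task sums are 0, 97, 2, and 3, and the
# reducer (running as task index 4) adds 4 -- matching the assertion above.
assert split_by_weights(100, [1, 97, 1, 1]) == [1, 97, 1, 1]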
def too_slow_test_notebook(self):
    do_plot = False
    runner = LocalMultiProc(multiprocessing.cpu_count(), mkl_num_threads=2)
    output_file_name = self.file_name("notebook")
    logging.info("TestSingleSnpAllPlusSelect test_notebook")

    # define file names
    snp_reader = Bed(self.pythonpath + "/tests/datasets/synth/all.bed", count_A1=False)
    pheno_fn = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"
    cov_fn = self.pythonpath + "/tests/datasets/synth/cov.txt"

    # find the chr5 SNPs
    test_snps = snp_reader[:, snp_reader.pos[:, 0] == 5]

    # select the 2nd kernel and run GWAS
    results = single_snp_all_plus_select(test_snps=test_snps, G=snp_reader,
                                         pheno=pheno_fn, GB_goal=2, do_plot=do_plot,
                                         output_file_name=output_file_name,
                                         runner=runner, count_A1=False)
    self.compare_files(results, "notebook")
def too_slow_test_two(self):  #!!! rather a big test case
    from pysnptools.util.mapreduce1.runner import Local, LocalMultiProc
    logging.info("TestSingleSnpAllPlusSelect test_two")
    do_plot = False

    bed_fn = self.pythonpath + "/tests/datasets/synth/all.bed"
    pheno_fn = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"
    cov_fn = self.pythonpath + "/tests/datasets/synth/cov.txt"

    # partition snps on chr5 vs rest
    test_chr = 5
    snp_reader = Bed(bed_fn, count_A1=False)
    test_snps = snp_reader[:, snp_reader.pos[:, 0] == test_chr]

    runner = LocalMultiProc(multiprocessing.cpu_count(), mkl_num_threads=2)
    output_file_name = self.file_name("two")
    for GB_goal in [None, 2]:
        results = single_snp_all_plus_select(
            test_snps=test_snps, G=bed_fn, pheno=pheno_fn, covar=cov_fn,
            k_list=[int(k) for k in np.logspace(0, 7, base=2, num=7)],
            n_folds=7, seed=42, do_plot=do_plot, GB_goal=GB_goal,
            output_file_name=output_file_name, runner=runner, count_A1=False)
        logging.info(results.head())
        self.compare_files(results, "two")
def test_mapreduce1_runner(self):
    logging.info("test_mapreduce1_runner")
    output_file = self.file_name("mapreduce1_runner")
    runner = LocalMultiProc(taskcount=4, just_one_process=True)
    results_df = single_snp_scale(test_snps=self.bed, pheno=self.phen_fn,
                                  covar=self.cov_fn, output_file_name=output_file,
                                  runner=runner)
    self.compare_files(results_df, "old")
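# A quick sanity sketch of the just_one_process flag used above. Assumption:
# with just_one_process=True, LocalMultiProc runs its tasks one after another
# in the current process (no pickling out to worker processes), which keeps a
# run debugger-friendly while producing the same results as a parallel run.
from pysnptools.util.mapreduce1 import map_reduce
from pysnptools.util.mapreduce1.runner import LocalMultiProc

sequential_sum = map_reduce(range(10),
                            mapper=lambda x: x + 1,
                            reducer=sum,
                            runner=LocalMultiProc(taskcount=4, just_one_process=True))
assert sequential_sum == 55  # 1 + 2 + ... + 10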
def test_localmultiproc(self):
    from pysnptools.util.mapreduce1 import map_reduce
    from pysnptools.util.mapreduce1.runner import LocalMultiProc

    def holder1(n, runner):
        def mapper1(x):
            return x * x

        def reducer1(sequence):
            return sum(sequence)

        return map_reduce(range(n), mapper=mapper1, reducer=reducer1, runner=runner)

    # sum of squares of 0..99 == 328350
    assert 328350 == holder1(100, LocalMultiProc(4))
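# For comparison, a minimal sketch of the same sum-of-squares job run serially.
# Assumption: Local() executes map_reduce in the current process, a convenient
# baseline to check before scaling out to LocalMultiProc.
from pysnptools.util.mapreduce1 import map_reduce
from pysnptools.util.mapreduce1.runner import Local

serial_result = map_reduce(range(100),
                           mapper=lambda x: x * x,  # same mapper as in holder1
                           reducer=sum,
                           runner=Local())
assert serial_result == 328350  # sum of squares of 0..99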
def too_slow_test_three(self):
    logging.info("TestSingleSnpAllPlusSelect test_three")

    snp_reader = Bed(self.pythonpath + "/tests/datasets/synth/all.bed", count_A1=False)
    pheno_fn = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"
    cov_fn = self.pythonpath + "/tests/datasets/synth/cov.txt"

    runner = LocalMultiProc(multiprocessing.cpu_count(), mkl_num_threads=2)
    output_file_name = self.file_name("three")
    results = single_snp_all_plus_select(
        test_snps=snp_reader, pheno=pheno_fn, covar=cov_fn,
        k_list=[int(k) for k in np.logspace(0, 7, base=2, num=7)],
        n_folds=7, seed=42, do_plot=False, GB_goal=2,
        output_file_name=output_file_name, runner=runner, count_A1=False)
    logging.info(results)
    self.compare_files(results, "three")
pairs = _Pairs(synbed_part_i) if i == j else _Pairs(synbed_part_i, synbed_part_j)
#print(pairs.iid)
print('{:,}'.format(pairs.sid_count))
#print(pairs.sid)
#print(pairs.pos)
#print(pairs.row_property)
snpdata = pairs.read()
#print(snpdata.val)

import datetime
from pysnptools.kernelreader import SnpKernel
from pysnptools.standardizer import Unit
from pysnptools.util.mapreduce1.runner import LocalMultiProc
from pysnptools.util.mapreduce1 import map_reduce

#runner = None
runner = LocalMultiProc(1, just_one_process=False)

part_pair_count = (part_count * part_count + part_count) // 2
part_pair_index = -1
print("part_pair_count={0:,}".format(part_pair_count))

K0 = SnpKernel(synbed, standardizer=Unit()).read()  # Precompute the similarity

start_time = datetime.datetime.now()
for i, part_i in enumerate(part_list):

    def mapper1(j):
        #from fastlmm.association import single_snp
        #from pysnptools.snpreader import Pairs
        #print('Z')
        #part_j = part_list[j]
        #print('A')
    fastlmm.association.tests.testepistasis.getTestSuite(),
    fastlmm.association.tests.test_heritability_spatial_correction.getTestSuite(),
    fastlmm.util.test.getTestSuite(),
    fastlmm.inference.tests.test.getTestSuite(),
    fastlmm.association.tests.test_single_snp.getTestSuite(),
    fastlmm.association.tests.test_single_snp_linreg.getTestSuite(),
])

if True:  # Standard test run
    r = unittest.TextTestRunner(failfast=False)
    r.run(suites)
else:  # Cluster test run
    # Because both pysnptools and fastlmm contain a "tests" folder, fastlmm must
    # be listed first in the PYTHONPATH when running on a cluster.
    #runner = Local()
    runner = LocalMultiProc(taskcount=6, mkl_num_threads=5, just_one_process=False)
    #runner = LocalInParts(1, 2, mkl_num_threads=1)  # For debugging the cluster runs
    distributable_test = DistributableTest(suites, "temp_test")
    runner.run(distributable_test)

debian_count = len(os.listdir('expected-debian'))
if debian_count > 0:
    logging.warning(
        "The tests contain {0} expected-results files that differ between "
        "Windows and Debian".format(debian_count))
logging.info("done with testing")
def mf_to_runner_function(mf):
    # Nodes excluded in past runs, kept for reference:
    # 'GCRCM07B20','GCRCM11B05','GCRCM10B06','GCRCM02B07','GCRCM02B11','GCRCM03B07',
    # 'GCRCM22B06','GCRCN0383','GCRCN0179','GCRCM37B13','GCRCN0376','GCRCN0456','gcrcn0231',
    # "MSR-HDP-DN0316","MSR-HDP-DN0321","MSR-HDP-DN0336","MSR-HDP-DN0377","MSR-HDP-DN0378",
    # "MSR-HDP-DN0314","MSR-HDP-DN0335","MSRQC073","MSRQC002","MSRQC015"
    excluded_nodes = []
    remote_python_parent = r"\\GCR\Scratch\RR1\escience\carlk\data\carlk\pythonpath10262016"
    clean_up = False

    if mf == "debug":
        runner_function = lambda ignore: LocalInParts(
            215, 215, mkl_num_threads=20, result_file="result.p",
            run_dir=r"C:\deldir\test\outputx")
    elif mf == "local":
        runner_function = lambda ignore: Local()
    elif mf == "local1":
        runner_function = lambda ignore: Local(1)
    elif mf == "lmp":
        runner_function = lambda ignore: LocalMultiProc(22, 5)
    elif mf == "lmt":
        runner_function = lambda ignore: LocalMultiThread(22, 5)
    elif mf == "lmtl":
        runner_function = lambda ignore: LocalMultiThread(22, 5, just_one_process=True)
    elif mf == "lmp4":
        runner_function = lambda ignore: LocalMultiProc(4, 5)
    elif mf == "lmpl":
        runner_function = lambda taskcount: LocalMultiProc(
            taskcount, taskcount, just_one_process=True)
    elif mf == "nodeP":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 30100), 'GCR', r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='node',  # core, socket, node
            update_remote_python_parent=True,
            template="Preemptable",
            priority="Lowest",
            excluded_nodes=excluded_nodes,
            #mkl_num_threads=20,
            nodegroups="Preemptable",
            runtime="0:11:0",  # day:hour:min
            #min=10,  #max(1, min(taskcount, 110)//20)
            #max=min(taskcount, 500),
            clean_up=clean_up,
        )
    elif mf == "nodeP99":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 30100), 'GCR', r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='node',
            update_remote_python_parent=True,
            template="Preemptable",
            priority="Lowest",
            excluded_nodes=excluded_nodes,
            #mkl_num_threads=20,
            nodegroups="Preemptable,B99",
            runtime="0:11:0",  # day:hour:min
            clean_up=clean_up,
        )
    elif mf == "nodeL99":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 30100), 'GCR', r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='node',
            update_remote_python_parent=True,
            template="LongRunQ",
            priority="Lowest",
            excluded_nodes=excluded_nodes,
            #mkl_num_threads=20,
            nodegroups="LongRunQ,B99",
            runtime="11:0:0",  # day:hour:min
            clean_up=clean_up,
        )
    elif mf == "socketP":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 30100), 'GCR', r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='socket',
            update_remote_python_parent=True,
            template="Preemptable",
            priority="Lowest",
            excluded_nodes=excluded_nodes,
            mkl_num_threads=10,
            nodegroups="Preemptable",
            runtime="0:11:0",  # day:hour:min
            #min=max(1, min(taskcount, 110)//20),
            clean_up=clean_up,
        )
    elif mf == "coreP":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 1000), 'GCR', r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='core',
            update_remote_python_parent=True,
            template="Preemptable",
            priority="Lowest",
            excluded_nodes=excluded_nodes,
            mkl_num_threads=1,
            runtime="0:11:0",  # day:hour:min
            nodegroups="Preemptable",
            #min=min(taskcount, 1100),
            min=1,
            max=200 * 20,
            clean_up=clean_up,
        )
    elif mf == "coreP99":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 1000), 'GCR', r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='core',
            update_remote_python_parent=True,
            template="Preemptable",
            priority="Lowest",
            excluded_nodes=excluded_nodes,
            mkl_num_threads=1,
            runtime="0:11:0",  # day:hour:min
            nodegroups="Preemptable,B99",
            min=1,
            max=200 * 20,
            clean_up=clean_up,
        )
    elif mf == "coreAz":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 1000), 'GCR', r"\\GCR\Scratch\AZ-USCentral\escience",
            remote_python_parent=r"\\GCR\Scratch\AZ-USCentral\escience\carlk\data\carlk\pythonpath",
            unit='core',
            update_remote_python_parent=True,
            template="Azure IaaS USCentral",
            mkl_num_threads=1,
            runtime="0:8:0",  # day:hour:min
            clean_up=clean_up,
        )
    elif mf == "nodeE":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 10100), 'GCR', r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='node',
            update_remote_python_parent=True,
            template="ExpressQ",
            priority="Normal",
            #node_local=False,
            #mkl_num_threads=20,
            runtime="0:4:0",  # day:hour:min
            #min=min(taskcount, 100),
            clean_up=clean_up,
        )
    elif mf == "50tasks":
        runner_function = lambda taskcount: HPC(
            50, 'GCR', r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='node',
            update_remote_python_parent=True,
            template="ExpressQ",
            priority="Normal",
            #mkl_num_threads=20,
            runtime="0:4:0",  # day:hour:min
            clean_up=clean_up,
        )
    elif mf == "coreE":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 10100), 'GCR', r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='core',
            update_remote_python_parent=True,
            template="ExpressQ",
            priority="Normal",
            mkl_num_threads=1,
            runtime="0:4:0",  # day:hour:min
            clean_up=clean_up,
        )
    elif mf == "nodeA":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 30100), 'GCR', r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='node',
            update_remote_python_parent=True,
            template="Admin Template",
            clean_up=clean_up,
        )
    elif mf == "socketA":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 30100), 'GCR', r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='socket',
            update_remote_python_parent=True,
            template="Admin Template",
            clean_up=clean_up,
        )
    elif mf == "coreA":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 30100), 'GCR', r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='core',
            update_remote_python_parent=True,
            template="Admin Template",
            clean_up=clean_up,
        )
    elif mf == "nodeH":
        runner_function = lambda taskcount: Hadoop2(
            min(taskcount, 100000), mapmemory=58 * 1024, reducememory=8 * 1024,
            min_alloc=2048, xmx=3072, mkl_num_threads=14, queue="shared",
            skipdatacheck=True, skipsourcecheck=True)
    elif mf == "coreH":
        runner_function = lambda taskcount: Hadoop2(
            min(taskcount, 100000), mapmemory=8 * 1024, reducememory=8 * 1024,
            min_alloc=2048, xmx=3072, mkl_num_threads=1, queue="shared",
            skipdatacheck=True, skipsourcecheck=True)
    else:
        raise Exception("unknown mf=" + mf)
    return runner_function
def one_experiment(
    test_snps,
    K0_goal,
    seed,
    pheno,
    covar,
    leave_out_one_chrom,
    use_gpu,
    proc_count,
    GB_goal,
    just_one_process=False,
    cpu_weight=1,
    gpu_weight=1,
    gpu_count=1,
    test_case="?",
    force=None,
    two_ks=False,
):
    import numpy as np
    from pysnptools.util.mapreduce1.runner import LocalMultiProc
    from unittest.mock import patch
    from fastlmm.association import single_snp

    if K0_goal is not None:
        K0 = test_snps[:, ::test_snps.sid_count // K0_goal]
    else:
        K0 = None

    if proc_count == 1:
        runner = None
        xp = "cupy" if use_gpu > 0 else "numpy"
    else:
        if use_gpu == 0:
            runner = LocalMultiProc(proc_count, just_one_process=just_one_process)
            xp = "numpy"
        else:
            assert gpu_count <= proc_count
            weights = [gpu_weight] * gpu_count + [cpu_weight] * (proc_count - gpu_count)

            def taskindex_to_environ(taskindex):
                if taskindex < gpu_count:
                    return {"ARRAY_MODULE": "cupy", "GPU_INDEX": str(taskindex)}
                else:
                    return {"ARRAY_MODULE": "numpy"}

            xp = "cupy"
            runner = LocalMultiProc(
                proc_count,
                weights=weights,
                taskindex_to_environ=taskindex_to_environ,
                just_one_process=just_one_process,
            )

    force_full_rank = False
    force_low_rank = False
    if force == "full_rank":
        force_full_rank = True
    elif force == "low_rank":
        force_low_rank = True
    elif force is None:
        pass
    else:
        assert False, "unexpected 'force' value: {0}".format(force)

    start_time = time.time()
    results_dataframe = single_snp(
        K0=K0,
        K1=test_snps if two_ks else None,
        test_snps=test_snps,
        pheno=pheno,
        covar=covar,
        leave_out_one_chrom=leave_out_one_chrom,
        count_A1=False,
        GB_goal=GB_goal,
        runner=runner,
        xp=xp,
        force_full_rank=force_full_rank,
        force_low_rank=force_low_rank,
    )
    delta_time = time.time() - start_time

    K0_count = test_snps.iid_count if K0 is None else K0.sid_count
    perf_result = {
        "computer_name": os.environ.get("COMPUTERNAME", "<unknown>"),
        "cpu_count": multiprocessing.cpu_count(),
        "gpu_count": "1?",
        "test_case": test_case,
        "linked": "MKL?/OpenBLAS?",
        "test_snps": str(test_snps),
        "iid_count": test_snps.iid_count,
        "test_sid_count": test_snps.sid_count,
        "low_rank": 0 if test_snps.iid_count == K0_count else 1,
        "K0_count": K0_count,
        "seed": seed,
        "chrom_count": len(np.unique(test_snps.pos[:, 0])),
        "covar_count": covar.col_count,
        "leave_out_one_chrom": 1 if leave_out_one_chrom else 0,
        "num_threads": os.environ.get("MKL_NUM_THREADS", "<none>"),
        "use_gpu": use_gpu,
        "cpu_weight": cpu_weight,
        "gpu_weight": gpu_weight,
        "proc_count": proc_count,
        "just_one_process": 1 if just_one_process else 0,
        "GB_goal": GB_goal,
        "time (s)": delta_time,
        "force": force if force is not None else "<none>",
        "two_ks": 1 if two_ks else 0,
    }
    return perf_result
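# The CPU/GPU task-weighting logic above, isolated as a runnable sketch for
# clarity (the counts and weights are arbitrary example values): the first
# `gpu_count` task indices get the cupy environment, the rest fall back to numpy.
gpu_count, proc_count = 1, 4
cpu_weight, gpu_weight = 1, 3
weights = [gpu_weight] * gpu_count + [cpu_weight] * (proc_count - gpu_count)
assert weights == [3, 1, 1, 1]  # the GPU task gets 3x the work of each CPU task

def taskindex_to_environ(taskindex):
    if taskindex < gpu_count:
        return {"ARRAY_MODULE": "cupy", "GPU_INDEX": str(taskindex)}
    return {"ARRAY_MODULE": "numpy"}

assert taskindex_to_environ(0) == {"ARRAY_MODULE": "cupy", "GPU_INDEX": "0"}
assert taskindex_to_environ(2) == {"ARRAY_MODULE": "numpy"}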
def getTestSuite():
    """
    set up composite test suite
    """
    test_suite = unittest.TestSuite([])
    test_suite.addTests(unittest.TestLoader().loadTestsFromTestCase(
        TestSingleSnpScale))  # Tests Ludicrous-Speed GWAS
    return test_suite


if __name__ == '__main__':
    #logging.basicConfig(level=logging.INFO)
    logging.getLogger().setLevel(logging.WARN)
    suites = getTestSuite()

    if True:
        r = unittest.TextTestRunner(failfast=False)
        ret = r.run(suites)
        assert ret.wasSuccessful()
    else:  # runner test run
        logging.basicConfig(level=logging.INFO)
        from pysnptools.util.mapreduce1.distributabletest import DistributableTest
        runner = LocalMultiProc(taskcount=22, mkl_num_threads=5, just_one_process=False)
        distributable_test = DistributableTest(suites, "temp_test")
        print(runner.run(distributable_test))
    logging.info("done")