def pyarrow_cpu_count(cpu_count=None):
    """Temporarily override pyarrow's CPU (thread-pool) count.

    Parameters
    ----------
    cpu_count : int, optional
        Number of CPUs to set for the duration of the ``with`` block.
        Defaults to the *current* ``pyarrow.cpu_count()`` at call time.

    Notes
    -----
    The original default, ``cpu_count=pyarrow.cpu_count()``, was evaluated
    once at function-definition time, freezing whatever value pyarrow
    reported at import — a classic evaluated-default pitfall.  Using a
    ``None`` sentinel resolves the default on every call instead, which is
    backward compatible for all callers.

    NOTE(review): this is a generator used as a context manager; it must be
    wrapped with ``@contextlib.contextmanager`` — presumably the decorator
    sits on the line above this chunk, confirm.
    """
    if cpu_count is None:
        cpu_count = pyarrow.cpu_count()
    old_cpu_count = pyarrow.cpu_count()
    pyarrow.set_cpu_count(cpu_count)
    try:
        yield
    finally:
        # Always restore the previous setting, even if the body raises.
        pyarrow.set_cpu_count(old_cpu_count)
def test_cpu_count():
    """Setting the pyarrow CPU count is observable and restorable."""
    original = pa.cpu_count()
    assert original > 0
    try:
        # Bump the pool size and confirm the new value is reported back.
        pa.set_cpu_count(original + 5)
        assert pa.cpu_count() == original + 5
    finally:
        # Restore so other tests see the default.
        pa.set_cpu_count(original)
def test_cpu_count():
    """Round-trip pa.set_cpu_count / pa.cpu_count."""
    saved = pa.cpu_count()
    assert saved > 0
    bumped = saved + 5
    try:
        pa.set_cpu_count(bumped)
        assert pa.cpu_count() == bumped
    finally:
        pa.set_cpu_count(saved)
def all_parsers(request):
    """
    Fixture all of the CSV parsers.
    """
    param = request.param
    if param.engine == "pyarrow":
        # Skip unless a sufficiently recent pyarrow is installed.
        pytest.importorskip("pyarrow", VERSIONS["pyarrow"])
        import pyarrow

        # Restrict pyarrow to a single CPU in the hope of avoiding hangs.
        pyarrow.set_cpu_count(1)
    return param
def all_parsers(request):
    """
    Fixture all of the CSV parsers.
    """
    parser = request.param()
    if parser.engine != "pyarrow":
        return parser
    # Skip unless a sufficiently recent pyarrow is installed.
    pytest.importorskip("pyarrow", VERSIONS["pyarrow"])
    # Try finding a way to disable threads all together
    # for more stable CI runs
    import pyarrow

    pyarrow.set_cpu_count(1)
    return parser
def all_parsers(request):
    """
    Fixture all of the CSV parsers.
    """
    parser = request.param()
    if parser.engine == "pyarrow":
        pytest.importorskip("pyarrow", VERSIONS["pyarrow"])
        import pyarrow

        # Single-CPU pyarrow to avoid hangs on Azure MacOS/Windows builds —
        # better yet, find a way to disable threads entirely.
        # TODO(GH#44584) pytest.mark.single these tests
        pyarrow.set_cpu_count(1)
    return parser
def run_benchmarks(num_threads, what='read'):
    """Benchmark every configured dataset and collect the results.

    Parameters
    ----------
    num_threads : int
        Thread count handed to ``pa.set_cpu_count`` before benchmarking.
    what : str
        Either ``'read'`` or ``'write'``; selects which benchmark to run.

    Returns
    -------
    pandas.DataFrame
        Per-file results concatenated, with a ``'dataset'`` column added.

    Raises
    ------
    ValueError
        If *what* is neither ``'read'`` nor ``'write'``.
    """
    pa.set_cpu_count(num_threads)
    all_results = []
    # `files` is a module-level mapping of dataset name -> benchmark info.
    for name, info in files.items():
        benchmarker = Benchmarker(info)
        if what == 'read':
            print("Benchmarking reads")
            file_results = benchmarker.bench_read()
        elif what == 'write':
            print("Benchmarking writes")
            file_results = benchmarker.bench_write()
        else:
            raise ValueError(what)
        file_results['dataset'] = name
        all_results.append(file_results)
    # Removed a leftover debug `print(all_results)` that dumped the raw
    # result list to stdout just before returning.
    return pd.concat(all_results, ignore_index=True)
class Benchmark(conbench.runner.Benchmark):
    """Conbench benchmark wrapper that records run/arrow metadata and can
    pin pyarrow's CPU count per benchmark invocation."""

    def __init__(self):
        self.conbench = conbench.runner.Conbench()
        # run_info derives from arrow_info, so compute arrow_info first.
        self.arrow_info = self._arrow_info()
        self.run_info = self._run_info(self.arrow_info)
        self.r_info = None

    def benchmark(self, f, extra_tags, options, case=None):
        """Run *f* under conbench, publish the result, and return it.

        If ``options['cpu_count']`` is set, pyarrow's thread pool is
        resized before the run.
        """
        requested_cpus = options.get("cpu_count")
        if requested_cpus is not None:
            pyarrow.set_cpu_count(requested_cpus)
        tags, context = self._get_tags_and_context(case, extra_tags)
        result, output = self.conbench.benchmark(
            f, self.name, tags, context, self.run_info, options
        )
        self.conbench.publish(result)
        return result, output
    print('failed')
# NOTE(review): the `else:` below pairs with a block that begins before this
# chunk — the fragment is incomplete from this view.
else:
    print('done')

quit()

import boto3
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from joblib import Parallel, delayed

# Set the number of CPU cores  (translated from Korean)
ncpu = pa.cpu_count()
print('{} cpu core'.format(ncpu))
pa.set_cpu_count(ncpu)
pa.set_io_thread_count(ncpu)

# S3 location of the VitalDB parquet files and the tracks to extract.
bucket_name = 'vitaldb-parquets'
prefix = 'vitaldb2017/1608/D1'
track_names = ['Solar8000/HR', 'SNUADC/ECG_II']

# Local output directory; created if missing.
odir = 'vital_files'
if not os.path.exists(odir):
    os.mkdir(odir)

# Parallel processing  (translated from Korean)
s3 = boto3.resource('s3')

# NOTE(review): function body continues beyond this chunk.
def save_file(uri, track_names):
# def get_count(self): # return self.count # cs = CountedSlows.remote() # Note how actor construction works # futures = [cs.slow.remote(r) for r in records] # while len(futures) > 0: # finished, rest = ray.wait(futures) # value = ray.get(finished[0]) # print(value) # futures = rest # count_future_id = cs.get_count.remote() # ray.get(count_future_id) pa.set_cpu_count(2) ray.init(num_cpus=5) # mols = [m for m in Chem.SDMolSupplier('cdk2.sdf')] # for m in mols: # molid = m.GetProp('id') # m.SetProp('_Name', molid) #_Name prop is required for align with shape-it # ref = Chem.Mol(mols[0]) chunksize = 1048576/10000 chunks = 10 # ref_smiles = 'CCC1=CC(Cl)=C(OC)C(C(NC[C@H]2C[C@H](OC)CN2CC)=O)=C1O' include_columns = ['SMILES', 'Name'] table_list = csv_chunk_extractor(chunks, chunksize, include_columns) ray.shutdown() print('finished alignment')
# def get_count(self): # return self.count # cs = CountedSlows.remote() # Note how actor construction works # futures = [cs.slow.remote(r) for r in records] # while len(futures) > 0: # finished, rest = ray.wait(futures) # value = ray.get(finished[0]) # print(value) # futures = rest # count_future_id = cs.get_count.remote() # ray.get(count_future_id) pa.set_cpu_count(1) ray.init(num_cpus=4) # mols = [m for m in Chem.SDMolSupplier('cdk2.sdf')] # for m in mols: # molid = m.GetProp('id') # m.SetProp('_Name', molid) #_Name prop is required for align with shape-it # ref = Chem.Mol(mols[0]) chunksize = 1048576 / 10000 chunks = 10 suppl = [ m for m in AllChem.SDMolSupplier( '/Users/tom/code_test_repository/arrow_testing/cdk2.sdf', removeHs=False)
def set_nbr_pyarrow_threads(nbr_threads=None):
    """Set the number of threads PyArrow uses when converting a Feather
    database to a pandas dataframe.

    Any falsy value (``None`` — and, as written, also ``0``) falls back to
    ``PyArrowThreads.pyarrow_threads``.
    """
    # `or` reproduces the original truthiness-based fallback exactly.
    count = nbr_threads or PyArrowThreads.pyarrow_threads
    pa.set_cpu_count(count)