Example #1
import contextlib
import pyarrow

@contextlib.contextmanager
def pyarrow_cpu_count(cpu_count=pyarrow.cpu_count()):
    # Temporarily override PyArrow's CPU count; restore it on exit,
    # even if the body raises.
    old_cpu_count = pyarrow.cpu_count()
    pyarrow.set_cpu_count(cpu_count)
    try:
        yield
    finally:
        pyarrow.set_cpu_count(old_cpu_count)
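A minimal usage sketch (the input file is hypothetical): the lower thread count applies only inside the with block.

import pyarrow.csv

with pyarrow_cpu_count(1):
    # CSV parsing runs single-threaded inside the block.
    table = pyarrow.csv.read_csv("data.csv")  # hypothetical input file
print(pyarrow.cpu_count())  # restored to the original value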
Example #2
import pyarrow as pa

def test_cpu_count():
    # Verify that set_cpu_count round-trips, restoring the original value.
    n = pa.cpu_count()
    assert n > 0
    try:
        pa.set_cpu_count(n + 5)
        assert pa.cpu_count() == n + 5
    finally:
        pa.set_cpu_count(n)
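PyArrow exposes a parallel pair of functions for its separate I/O thread pool (used in Example #8 below); a quick check mirroring the test above:

import pyarrow as pa

def test_io_thread_count():
    # Same round-trip check for the I/O thread pool.
    n = pa.io_thread_count()
    assert n > 0
    try:
        pa.set_io_thread_count(n + 5)
        assert pa.io_thread_count() == n + 5
    finally:
        pa.set_io_thread_count(n)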
Example #3
def all_parsers(request):
    """
    Fixture for all of the CSV parsers.
    """
    if request.param.engine == "pyarrow":
        pytest.importorskip("pyarrow", VERSIONS["pyarrow"])
        # Try setting num cpus to 1 to avoid hangs?
        import pyarrow

        pyarrow.set_cpu_count(1)
    return request.param
Example #4
def all_parsers(request):
    """
    Fixture for all of the CSV parsers.
    """
    parser = request.param()
    if parser.engine == "pyarrow":
        pytest.importorskip("pyarrow", VERSIONS["pyarrow"])
        # Try finding a way to disable threads altogether
        # for more stable CI runs
        import pyarrow

        pyarrow.set_cpu_count(1)
    return parser
Example #5
def all_parsers(request):
    """
    Fixture for all of the CSV parsers.
    """
    parser = request.param()
    if parser.engine == "pyarrow":
        pytest.importorskip("pyarrow", VERSIONS["pyarrow"])
        # Try setting num cpus to 1 to avoid hangs on Azure MacOS/Windows builds
        # or better yet find a way to disable threads
        # TODO(GH#44584) pytest.mark.single these tests
        import pyarrow

        pyarrow.set_cpu_count(1)
    return parser
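The three fixture snippets above omit the fixture registration; a hedged sketch of how such a parametrized fixture is typically wired up in pytest. The parser classes here are hypothetical stand-ins, not pandas' real ones.

import pytest

class CParser:           # hypothetical stand-in for a real parser wrapper
    engine = "c"

class PyArrowParser:     # hypothetical stand-in
    engine = "pyarrow"

@pytest.fixture(params=[CParser, PyArrowParser], ids=["c", "pyarrow"])
def all_parsers(request):
    parser = request.param()
    if parser.engine == "pyarrow":
        pytest.importorskip("pyarrow")
        import pyarrow
        # Limit PyArrow to one CPU for more stable CI runs.
        pyarrow.set_cpu_count(1)
    return parser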
Example #6
import pandas as pd
import pyarrow as pa

def run_benchmarks(num_threads, what='read'):
    # `files` and `Benchmarker` are defined elsewhere in the source script.
    pa.set_cpu_count(num_threads)

    all_results = []
    for name, info in files.items():
        benchmarker = Benchmarker(info)
        if what == 'read':
            print("Benchmarking reads")
            file_results = benchmarker.bench_read()
        elif what == 'write':
            print("Benchmarking writes")
            file_results = benchmarker.bench_write()
        else:
            raise ValueError(what)
        file_results['dataset'] = name
        all_results.append(file_results)

    print(all_results)
    return pd.concat(all_results, ignore_index=True)
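A hedged usage sketch, assuming files and Benchmarker from the source script are in scope: sweep the PyArrow thread count and compare the resulting frames.

for n in (1, 2, 4):
    results = run_benchmarks(n, what='read')
    print(n, 'threads:', len(results), 'rows')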
Example #7
import conbench.runner
import pyarrow

class Benchmark(conbench.runner.Benchmark):
    def __init__(self):
        self.conbench = conbench.runner.Conbench()
        self.arrow_info = self._arrow_info()
        self.run_info = self._run_info(self.arrow_info)
        self.r_info = None

    def benchmark(self, f, extra_tags, options, case=None):
        cpu_count = options.get("cpu_count", None)
        if cpu_count is not None:
            pyarrow.set_cpu_count(cpu_count)
        tags, context = self._get_tags_and_context(case, extra_tags)
        benchmark, output = self.conbench.benchmark(f, self.name, tags,
                                                    context, self.run_info,
                                                    options)
        self.conbench.publish(benchmark)
        return benchmark, output
Example #8
            print('failed')
        else:
            print('done')

quit()

import os

import boto3
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from joblib import Parallel, delayed

# Set the number of CPU cores
ncpu = pa.cpu_count()
print('{} cpu core'.format(ncpu))
pa.set_cpu_count(ncpu)
pa.set_io_thread_count(ncpu)

bucket_name = 'vitaldb-parquets'
prefix = 'vitaldb2017/1608/D1'
track_names = ['Solar8000/HR', 'SNUADC/ECG_II']

odir = 'vital_files'
if not os.path.exists(odir):
    os.mkdir(odir)

# Parallel processing
s3 = boto3.resource('s3')


def save_file(uri, track_names):
Example #9
#     def get_count(self):
#         return self.count
    
# cs = CountedSlows.remote() # Note how actor construction works
# futures = [cs.slow.remote(r) for r in records]

# while len(futures) > 0:
#      finished, rest = ray.wait(futures)
#      value = ray.get(finished[0])
#      print(value)
#      futures = rest

# count_future_id = cs.get_count.remote()
# ray.get(count_future_id)

import pyarrow as pa
import ray

# Give PyArrow 2 CPU threads and let Ray schedule up to 5 workers.
pa.set_cpu_count(2)
ray.init(num_cpus=5)

# mols = [m for m in Chem.SDMolSupplier('cdk2.sdf')]
# for m in mols:
#     molid = m.GetProp('id')
#     m.SetProp('_Name', molid) #_Name prop is required for align with shape-it
# ref = Chem.Mol(mols[0])

chunksize = 1048576/10000
chunks = 10
# ref_smiles = 'CCC1=CC(Cl)=C(OC)C(C(NC[C@H]2C[C@H](OC)CN2CC)=O)=C1O'
include_columns = ['SMILES', 'Name']
table_list = csv_chunk_extractor(chunks, chunksize, include_columns)
ray.shutdown()
print('finished alignment')
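csv_chunk_extractor is defined elsewhere in the source script; a minimal sketch of what a chunked CSV reader can look like with PyArrow's streaming API, purely as an assumption about its shape (the path is hypothetical):

import pyarrow as pa
import pyarrow.csv

def csv_chunk_extractor_sketch(path, include_columns, block_size):
    # Stream the CSV in fixed-size blocks instead of loading it whole;
    # each record batch becomes its own Table.
    reader = pa.csv.open_csv(
        path,
        read_options=pa.csv.ReadOptions(block_size=int(block_size)),
        convert_options=pa.csv.ConvertOptions(include_columns=include_columns),
    )
    return [pa.Table.from_batches([batch]) for batch in reader]

# table_list = csv_chunk_extractor_sketch('data.csv', ['SMILES', 'Name'], 1048576)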
Example #10
#     def get_count(self):
#         return self.count

# cs = CountedSlows.remote() # Note how actor construction works
# futures = [cs.slow.remote(r) for r in records]

# while len(futures) > 0:
#      finished, rest = ray.wait(futures)
#      value = ray.get(finished[0])
#      print(value)
#      futures = rest

# count_future_id = cs.get_count.remote()
# ray.get(count_future_id)

import pyarrow as pa
import ray

# One PyArrow thread; Ray handles the parallelism across 4 workers.
pa.set_cpu_count(1)
ray.init(num_cpus=4)

# mols = [m for m in Chem.SDMolSupplier('cdk2.sdf')]
# for m in mols:
#     molid = m.GetProp('id')
#     m.SetProp('_Name', molid) #_Name prop is required for align with shape-it
# ref = Chem.Mol(mols[0])

chunksize = 1048576 / 10000
chunks = 10

suppl = [
    m for m in AllChem.SDMolSupplier(
        '/Users/tom/code_test_repository/arrow_testing/cdk2.sdf',
        removeHs=False)
Example #11
import pyarrow as pa

def set_nbr_pyarrow_threads(nbr_threads=None):
    # Set the number of threads PyArrow uses when converting a Feather
    # file to a pandas DataFrame; fall back to the class-level default.
    pa.set_cpu_count(
        nbr_threads if nbr_threads else PyArrowThreads.pyarrow_threads)
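PyArrowThreads lives elsewhere in the source module; a hypothetical stand-in, just to make the fallback concrete:

class PyArrowThreads:
    # Hypothetical container for the module-wide default thread count.
    pyarrow_threads = 4

set_nbr_pyarrow_threads()   # falls back to PyArrowThreads.pyarrow_threads
set_nbr_pyarrow_threads(2)  # explicit override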