def test_make_chunks_splits_with_num_zero(self):
    """Chunking zero rows yields an empty list of chunks."""
    chunks = make_chunks(0)
    self.assertEqual([], chunks)
def get_thor_file(connection, thor_file, max_workers=10, chunk_size='auto',
                  max_attempts=3, max_sleep=60, dtype=None):
    """
    Return a thor file as a pandas.DataFrame.

    Note: Ordering of the resulting DataFrame is not deterministic and may not
    be the same as on the HPCC cluster.

    Parameters
    ----------
    connection: hpycc.Connection
        HPCC Connection instance, see also `Connection`.
    thor_file: str
        Name of thor file to be downloaded.
    max_workers: int, optional
        Number of concurrent threads to use when downloading file.
        Warning: too many may cause instability! 10 by default.
    chunk_size: int or 'auto', optional
        Size of chunks to use when downloading file. If 'auto' the
        suggested size is rows / workers: when that is below 10,000 the
        whole file is fetched as a single chunk, and it is capped at
        325,000 rows per chunk. If given explicitly then no limits are
        enforced. 'auto' by default.
    max_attempts: int, optional
        Maximum number of times a chunk should attempt to be downloaded in
        the case of an exception being raised. 3 by default.
    max_sleep: int, optional
        Maximum time, in seconds, to sleep between attempts. The true sleep
        time is a random int between `max_sleep` and `max_sleep` * 0.75.
        60 by default.
    dtype: type name or dict of col -> type, optional
        Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}.
        If converters are specified, they will be applied INSTEAD of dtype
        conversion. If None, or columns are missing from the provided dict,
        they will be converted to one of bool, str or int based on the HPCC
        datatype. None by default.

    Returns
    -------
    df: pandas.DataFrame
        Thor file as a pandas.DataFrame. An empty DataFrame (columns only)
        is returned when the file reports no rows.

    Raises
    ------
    KeyError, TypeError
        If the schema or row total cannot be located in the response
        returned for the first row of the file.

    See Also
    --------
    save_thor_file

    Examples
    --------
    >>> import hpycc
    >>> import pandas
    >>> conn = hpycc.Connection("user")
    >>> df = pandas.DataFrame({"col1": [1, 2, 3]})
    >>> df.to_csv("example.csv", index=False)
    >>> hpycc.spray_file(conn,"example.csv","example")
    >>> hpycc.get_thor_file(conn, "example")
        col1
    0     1
    1     2
    2     3

    >>> import hpycc
    >>> import pandas
    >>> conn = hpycc.Connection("user")
    >>> df = pandas.DataFrame({"col1": [1, 2, 3]})
    >>> df.to_csv("example.csv", index=False)
    >>> hpycc.spray_file(conn,"example.csv","example")
    >>> hpycc.get_thor_file(conn, "example", dtype=str)
        col1
    0     '1'
    1     '2'
    2     '3'

    """
    # Fetch a single row first: the response carries the schema and the
    # total row count needed to plan the download.
    resp = connection.get_chunk_from_hpcc(thor_file, 0, 1, max_attempts,
                                          max_sleep)
    try:
        wuresultresponse = resp["WUResultResponse"]
        schema_str = wuresultresponse["Result"]["XmlSchema"]["xml"]
        schema = parse_schema_from_xml(schema_str)
        schema = apply_custom_dtypes(schema, dtype)
        num_rows = wuresultresponse["Total"]
    except (KeyError, TypeError) as exc:
        msg = "Can't find schema in returned json: {}".format(resp)
        raise type(exc)(msg) from exc

    # Nothing to download. This must be checked before the chunk size is
    # derived, because a missing (None) total cannot be divided.
    if not num_rows:
        return pd.DataFrame(columns=schema.keys())

    if chunk_size == 'auto':
        # Automagically optimise. TODO: we could use width too.
        suggested_size = ceil(num_rows / max_workers)
        if suggested_size < 10000:
            # Don't chunk small stuff.
            chunk_size = num_rows
        elif suggested_size > 325000:
            # More chunks than workers for big stuff.
            chunk_size = 325000
        else:
            chunk_size = suggested_size

    chunks = filechunker.make_chunks(num_rows, chunk_size)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(connection.get_logical_file_chunk, thor_file,
                            start_row, n_rows, max_attempts, max_sleep)
            for start_row, n_rows in chunks
        ]

        # Accumulate column-wise in completion order, which is why the row
        # order of the final frame is not deterministic.
        results = {key: [] for key in schema.keys()}
        for future in as_completed(futures):
            chunk = future.result()
            for key, values in results.items():
                values.extend(chunk[key])
            # Drop the reference so the chunk can be freed before the next
            # one is processed.
            del chunk

    results = pd.DataFrame(results)

    for name in schema.keys():
        column = schema[name]
        typ = column['type']
        if column['is_a_set']:
            # TODO: Nested DF are also caught here. Open issue to fix
            results[name] = results[name].map(
                lambda x: [typ(i) for i in x["Item"]])
        else:
            try:
                results[name] = results[name].astype(typ)
            except OverflowError:
                # An int that is horrifically long cannot be converted
                # properly. Use float instead
                results[name] = results[name].astype('float')

    return results
def test_make_chunks_chunksize_equal_zero(self):
    """A chunk size of zero makes make_chunks raise ZeroDivisionError."""
    self.assertRaises(ZeroDivisionError, make_chunks, 10, 0)
def test_make_chunks_uses_10000_as_default_chunksize(self):
    """With no chunk size given, 10000 rows come back as a single chunk."""
    self.assertEqual([(0, 10000)], make_chunks(10000))
def test_make_chunks_chunks_num_greater_than_chunksize(self):
    """Ten rows in chunks of three: three full chunks and a remainder of one."""
    wanted = [
        (0, 3),
        (3, 3),
        (6, 3),
        (9, 1),
    ]
    actual = make_chunks(10, 3)
    self.assertEqual(wanted, actual)
def test_make_chunks_chunks_num_less_than_chunksize(self):
    """Fewer rows than the chunk size gives one chunk holding every row."""
    actual = make_chunks(3, 10)
    self.assertEqual([(0, 3)], actual)
def test_make_chunks_chunks_sum_correctly(self):
    """The per-chunk row counts of 500 rows in chunks of 3 add up to 500."""
    total_rows = 0
    for chunk in make_chunks(500, 3):
        # Element 1 of each chunk is its row count.
        total_rows += chunk[1]
    self.assertEqual(total_rows, 500)
def test_make_chunks_splits_with_two_full_chunks(self):
    """Twenty rows in chunks of ten split into exactly two full chunks."""
    wanted = [(0, 10), (10, 10)]
    self.assertEqual(wanted, make_chunks(20, 10))
def spray_file(connection, source_file, logical_file, overwrite=False,
               expire=None, chunk_size=100000, max_workers=5,
               delete_workunit=True):
    """
    Spray a file to a HPCC logical file, bypassing the landing zone.

    The data is split into chunks, each chunk is sprayed to a temporary
    logical file, the temporary files are concatenated into `logical_file`
    and the temporary files are then deleted (also when the concatenation
    step fails).

    Parameters
    ----------
    connection: `Connection`
        HPCC Connection instance, see also `Connection`.
    source_file: str, pd.DataFrame
        A pandas DataFrame or the path to a csv.
    logical_file: str
        Logical file name on THOR.
    overwrite: bool, optional
        Should the file overwrite any pre-existing logical file.
        False by default.
    expire: int, optional
        How long (days) until the produced logical file expires? None
        (ie no expiry) by default.
    chunk_size: int, optional
        Size of chunks to use when spraying file. 100000 by default.
    max_workers: int, optional
        Number of concurrent threads to use when spraying.
        Warning: too many will likely cause either your machine or
        your cluster to crash! 5 by default.
    delete_workunit: bool, optional
        Delete workunit once completed. True by default.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If `source_file` is neither a pandas DataFrame nor a str.

    Warns
    -----
    SyntaxWarning
        If `logical_file` does not start with ``~``.

    """
    # Function-scope import keeps this block self-contained.
    import warnings

    if isinstance(source_file, pd.DataFrame):
        df = source_file
    elif isinstance(source_file, str):
        df = pd.read_csv(source_file, encoding='latin')
    else:
        raise TypeError(
            "source_file must be a pandas DataFrame or a path to a csv, "
            "got {}".format(type(source_file).__name__))

    if not logical_file.startswith('~'):
        # The warning has to be issued through warnings.warn; merely
        # constructing a SyntaxWarning instance does nothing.
        warnings.warn(
            "Your Logical file name (%s) did not start with ~ so may not "
            "be sprayed to root" % logical_file,
            SyntaxWarning, stacklevel=2)

    record_set = _make_record_set(df)

    chunks = make_chunks(len(df), chunk_size=chunk_size)
    print('Any unicode characters will be converted to ASCII, not saying you '
          'have any, just warning you! If you are getting odd errors you may '
          'want to deal with your UTF before spraying.')

    # Stringify lazily so only the chunks currently being submitted are
    # materialised as text.
    stringified_rows = (_stringify_rows(df, start_row, num_rows)
                        for start_row, num_rows in chunks)

    target_names = ["~TEMPHPYCC::{}from{}to{}".format(
        logical_file.replace("~", ""), start_row, start_row + num_rows)
        for start_row, num_rows in chunks]

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(_spray_stringified_data, connection, row,
                            record_set, name, overwrite, delete_workunit)
            for row, name in zip(stringified_rows, target_names)]

        wait(futures)
        # Calling result() re-raises any exception raised inside a worker.
        for future in futures:
            future.result()

    try:
        _concatenate_logical_files(connection, target_names, logical_file,
                                   record_set, overwrite, expire,
                                   delete_workunit)
    finally:
        # Every temporary file exists at this point, so remove them whether
        # or not the concatenation succeeded.
        for tmp in target_names:
            delete_logical_file(connection, tmp, delete_workunit)