from decimal import Decimal

import numpy as np


def _numpy_and_codec_from_arrow_type(field_type):
    """Map a pyarrow DataType to the numpy type used for the unischema field."""
    from pyarrow import types

    if types.is_int8(field_type):
        np_type = np.int8
    elif types.is_int16(field_type):
        np_type = np.int16
    elif types.is_int32(field_type):
        np_type = np.int32
    elif types.is_int64(field_type):
        np_type = np.int64
    elif types.is_string(field_type):
        np_type = np.unicode_
    elif types.is_boolean(field_type):
        np_type = np.bool_
    elif types.is_float32(field_type):
        np_type = np.float32
    elif types.is_float64(field_type):
        np_type = np.float64
    elif types.is_decimal(field_type):
        np_type = Decimal
    elif types.is_binary(field_type):
        np_type = np.string_
    elif types.is_fixed_size_binary(field_type):
        np_type = np.string_
    elif types.is_date(field_type):
        np_type = np.datetime64
    elif types.is_timestamp(field_type):
        np_type = np.datetime64
    elif types.is_list(field_type):
        # List columns map to the numpy type of their value type.
        np_type = _numpy_and_codec_from_arrow_type(field_type.value_type)
    else:
        raise ValueError('Cannot auto-create unischema due to unsupported column type {}'.format(field_type))
    return np_type
import pyarrow as pa
from pyarrow import types


def convertPyArrowTypeToGlueType(pyarrowType: pa.DataType) -> str:
    """Map a pyarrow DataType to the equivalent AWS Glue column type name."""
    if (types.is_string(pyarrowType) or types.is_unicode(pyarrowType)
            or types.is_large_string(pyarrowType) or types.is_large_unicode(pyarrowType)):
        return 'string'
    if types.is_int64(pyarrowType) or types.is_uint64(pyarrowType):
        return 'bigint'
    if types.is_binary(pyarrowType):
        return 'binary'
    if types.is_boolean(pyarrowType):
        return 'boolean'
    if types.is_date(pyarrowType) or types.is_date32(pyarrowType) or types.is_date64(pyarrowType):
        return 'date'
    if types.is_decimal(pyarrowType):
        return 'decimal(16,2)'
    if types.is_float64(pyarrowType):
        return 'double'
    if types.is_float16(pyarrowType) or types.is_float32(pyarrowType):
        return 'float'
    if (types.is_int16(pyarrowType) or types.is_int32(pyarrowType)
            or types.is_uint16(pyarrowType) or types.is_uint32(pyarrowType)):
        return 'int'
    if types.is_map(pyarrowType):
        return 'map'
    if types.is_struct(pyarrowType):
        return 'struct'
    if types.is_timestamp(pyarrowType):
        return 'timestamp'
    if types.is_union(pyarrowType):
        return 'union'
    # Fall back to the pyarrow type's string representation.
    return str(pyarrowType)
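# Usage sketch (hypothetical, not from the original source): building Glue column
# definitions from a pyarrow schema with convertPyArrowTypeToGlueType. The helper
# name _glue_columns_from_schema is an assumption for illustration only.
def _glue_columns_from_schema(schema: pa.Schema) -> list:
    """Return [{'Name': ..., 'Type': ...}, ...] suitable for a Glue table definition."""
    return [{'Name': field.name, 'Type': convertPyArrowTypeToGlueType(field.type)}
            for field in schema]

# Example:
# schema = pa.schema([('id', pa.int64()), ('price', pa.float64()), ('ts', pa.timestamp('us'))])
# _glue_columns_from_schema(schema)
# -> [{'Name': 'id', 'Type': 'bigint'}, {'Name': 'price', 'Type': 'double'},
#     {'Name': 'ts', 'Type': 'timestamp'}]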
from typing import Any, Union

import numpy as np
import pyarrow as pa
from pyarrow.types import is_float32, is_float64


def _cast_float(val: Any, dtype: pa.DataType) -> Union[np.float32, np.float64]:
    """Cast a scalar to the numpy float type matching the given pyarrow float type."""
    if is_float32(dtype):
        casted = np.float32(val)
    elif is_float64(dtype):
        casted = np.float64(val)
    else:
        raise NotImplementedError
    return casted
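# Usage sketch (hypothetical): the cast follows the pyarrow dtype, not the input's type.
# _cast_float(1, pa.float32())      -> np.float32(1.0)
# _cast_float("2.5", pa.float64())  -> np.float64(2.5)
# _cast_float(1, pa.int32())        -> raises NotImplementedError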
from pyspark.sql.types import (
    ArrayType, BinaryType, BooleanType, ByteType, DateType, DecimalType, DoubleType,
    FloatType, IntegerType, LongType, MapType, NullType, ShortType, StringType,
    StructField, StructType, TimestampType,
)


def from_arrow_type(at):
    """ Convert pyarrow type to Spark data type. """
    from distutils.version import LooseVersion
    import pyarrow as pa
    import pyarrow.types as types

    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_map(at):
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError("MapType is only supported with pyarrow 2.0.0 and above")
        if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = MapType(from_arrow_type(at.key_type), from_arrow_type(at.item_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError("Nested StructType not supported in conversion from Arrow: " + str(at))
        return StructType(
            [StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)
             for field in at])
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    elif types.is_null(at):
        spark_type = NullType()
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
    return spark_type
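# Usage sketch (hypothetical; assumes a pyspark environment is available):
# import pyarrow as pa
# arrow_schema = pa.schema([('name', pa.string()), ('scores', pa.list_(pa.float64()))])
# spark_schema = StructType(
#     [StructField(f.name, from_arrow_type(f.type), nullable=f.nullable)
#      for f in arrow_schema])
# -> a StructType with a StringType field 'name' and an ArrayType(DoubleType()) field 'scores'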
from decimal import Decimal

import numpy as np
from petastorm.codecs import ScalarCodec
from pyspark.sql.types import (
    BooleanType, ByteType, DateType, DecimalType, DoubleType, FloatType, IntegerType,
    LongType, ShortType, StringType, TimestampType,
)


def _numpy_and_codec_from_arrow_type(field_type):
    """Return the (petastorm codec, numpy type) pair matching a pyarrow DataType."""
    from pyarrow import types

    if types.is_int8(field_type):
        np_type = np.int8
        codec = ScalarCodec(ByteType())
    elif types.is_int16(field_type):
        np_type = np.int16
        codec = ScalarCodec(ShortType())
    elif types.is_int32(field_type):
        np_type = np.int32
        codec = ScalarCodec(IntegerType())
    elif types.is_int64(field_type):
        np_type = np.int64
        codec = ScalarCodec(LongType())
    elif types.is_string(field_type):
        np_type = np.unicode_
        codec = ScalarCodec(StringType())
    elif types.is_boolean(field_type):
        np_type = np.bool_
        codec = ScalarCodec(BooleanType())
    elif types.is_float32(field_type):
        np_type = np.float32
        codec = ScalarCodec(FloatType())
    elif types.is_float64(field_type):
        np_type = np.float64
        codec = ScalarCodec(DoubleType())
    elif types.is_decimal(field_type):
        np_type = Decimal
        codec = ScalarCodec(DecimalType(field_type.precision, field_type.scale))
    elif types.is_binary(field_type):
        codec = ScalarCodec(StringType())
        np_type = np.string_
    elif types.is_fixed_size_binary(field_type):
        codec = ScalarCodec(StringType())
        np_type = np.string_
    elif types.is_date(field_type):
        np_type = np.datetime64
        codec = ScalarCodec(DateType())
    elif types.is_timestamp(field_type):
        np_type = np.datetime64
        codec = ScalarCodec(TimestampType())
    elif types.is_list(field_type):
        # For list columns, reuse the numpy type of the value type; there is no scalar codec.
        _, np_type = _numpy_and_codec_from_arrow_type(field_type.value_type)
        codec = None
    else:
        raise ValueError(
            'Cannot auto-create unischema due to unsupported column type {}'.format(field_type))
    return codec, np_type
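# Usage sketch (hypothetical; assumes pyarrow, petastorm, and pyspark are installed):
# import pyarrow as pa
# codec, np_type = _numpy_and_codec_from_arrow_type(pa.float32())
# -> a ScalarCodec wrapping FloatType(), and np.float32
# codec, np_type = _numpy_and_codec_from_arrow_type(pa.list_(pa.int64()))
# -> (None, np.int64): list columns get no scalar codec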
from pyspark.sql.types import (
    ArrayType, BinaryType, BooleanType, ByteType, DateType, DecimalType, DoubleType,
    FloatType, IntegerType, LongType, ShortType, StringType, StructField, StructType,
    TimestampType,
)


def from_arrow_type(at):
    """ Convert pyarrow type to Spark data type. """
    import pyarrow.types as types

    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError(
                "Nested StructType not supported in conversion from Arrow: " + str(at))
        return StructType([
            StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)
            for field in at
        ])
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
    return spark_type
# NOTE: relies on module-level imports/helpers from the surrounding module
# (sys, os, gc, logging, queue, threading, Path, numpy as np, pandas as pd,
#  pyarrow as pa, pyarrow.compute as pac, pyarrow.types as pat, pyarrow.dataset as ds,
#  tqdm, the argparse `parser`, and the project's xp / read_lookup / enable_cupy /
#  reimport_numerical_libs helpers).
def main(args=None):
    """Main method for postprocessing the raw outputs from an MC run."""
    if args is None:
        args = sys.argv[1:]
    args = parser.parse_args(args)

    # Start parsing args
    quantiles = args.quantiles
    verbose = args.verbose
    prefix = args.prefix
    use_gpu = args.gpu

    if verbose:
        logging.info(args)

    # File Management
    top_output_dir = args.output

    # Check if it exists, make if not
    if not os.path.exists(top_output_dir):
        os.makedirs(top_output_dir)

    # Use lookup, add prefix
    # TODO need to handle lookup weights
    if args.lookup is not None:
        lookup_df = read_lookup(args.lookup)
        if prefix is None:
            prefix = Path(args.lookup).stem
    # TODO if args.lookup we need to check it for weights

    # Create subfolder for this run using UUID of run
    uuid = args.file.split("/")[-2]
    if prefix is not None:
        uuid = prefix + "_" + uuid

    # Create directory if it doesn't exist
    output_dir = os.path.join(top_output_dir, uuid)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    data_dir = os.path.join(args.file, "data/")
    metadata_dir = os.path.join(args.file, "metadata/")

    adm_mapping = pd.read_csv(os.path.join(metadata_dir, "adm_mapping.csv"))
    dates = pd.read_csv(os.path.join(metadata_dir, "dates.csv"))
    dates = dates["date"].to_numpy()

    n_adm2 = len(adm_mapping)
    adm2_sorted_ind = xp.argsort(xp.array(adm_mapping["adm2"].to_numpy()))

    if use_gpu:
        enable_cupy(optimize=True)
        reimport_numerical_libs("postprocess")

    per_capita_cols = [
        "cumulative_reported_cases",
        "cumulative_deaths",
        "current_hospitalizations",
        "daily_reported_cases",
        "daily_deaths",
        "vacc_dose1",
        "vacc_dose2",
        "immune",
    ]
    pop_weighted_cols = [
        "case_reporting_rate",
        "R_eff",
        "frac_vacc_dose1",
        "frac_vacc_dose2",
        "frac_vacc_dose1_65",
        "frac_vacc_dose2_65",
        "frac_immune",
        "frac_immune_65",
        "state_phase",
    ]

    adm_mapping["adm0"] = 1
    adm_map = adm_mapping.to_dict(orient="list")
    adm_map = {k: xp.array(v)[adm2_sorted_ind] for k, v in adm_map.items()}
    adm_array_map = {k: xp.unique(v, return_inverse=True)[1] for k, v in adm_map.items()}
    adm_sizes = {k: xp.to_cpu(xp.max(v) + 1).item() for k, v in adm_array_map.items()}
    adm_level_values = {k: xp.to_cpu(xp.unique(v)) for k, v in adm_map.items()}
    adm_level_values["adm0"] = np.array(["US"])

    if args.lookup is not None and "weight" in lookup_df.columns:
        weight_series = lookup_df.set_index("adm2")["weight"].reindex(adm_mapping["adm2"], fill_value=0.0)
        weights = np.array(weight_series.to_numpy(), dtype=np.float32)
        # TODO we should ignore all the adm2 not in weights rather than just 0ing them (it'll go a lot faster)
    else:
        weights = np.ones_like(adm2_sorted_ind, dtype=np.float32)

    write_queue = queue.Queue()

    def _writer():
        """Write thread that will pull from a queue."""
        # Call write_queue.get() until it returns None
        file_tables = {}
        for fname, q_dict in iter(write_queue.get, None):
            df = pd.DataFrame(q_dict)
            id_col = df.columns[df.columns.str.contains("adm.")].values[0]
            df = df.set_index([id_col, "date", "quantile"])
            df = df.reindex(sorted(df.columns), axis=1)
            if fname in file_tables:
                tmp = pa.table(q_dict)
                file_tables[fname] = pa.concat_tables([file_tables[fname], tmp])
            else:
                file_tables[fname] = pa.table(q_dict)
            write_queue.task_done()

        # dump tables to disk
        for fname in tqdm.tqdm(file_tables):
            df = file_tables[fname].to_pandas()
            id_col = df.columns[df.columns.str.contains("adm.")].values[0]
            df = df.set_index([id_col, "date", "quantile"])
            df = df.reindex(sorted(df.columns), axis=1)
            df.to_csv(fname, header=True, mode="w")
        write_queue.task_done()

    write_thread = threading.Thread(target=_writer)
    write_thread.start()

    # TODO this depends on out of scope vars, need to clean that up
    def pa_array_quantiles(array, level):
        """Calculate the quantiles of a pyarrow array after shipping it to the GPU."""
        data = array.to_numpy().reshape(-1, n_adm2)
        data = data[:, adm2_sorted_ind]
        data_gpu = xp.array(data.T)

        if adm_sizes[level] == 1:
            # TODO need switching here b/c cupy handles xp.percentile weird with a size 1 dim :(
            if use_gpu:
                level_data_gpu = xp.sum(data_gpu, axis=0)  # need this if cupy
            else:
                level_data_gpu = xp.sum(data_gpu, axis=0, keepdims=True).T  # for numpy
            q_data_gpu = xp.empty((len(percentiles), adm_sizes[level]), dtype=level_data_gpu.dtype)
            # It appears there's a cupy bug when the 1st axis of the array passed to percentiles has size 1
            xp.percentile(level_data_gpu, q=percentiles, axis=0, out=q_data_gpu)
        else:
            level_data_gpu = xp.zeros((adm_sizes[level], data_gpu.shape[1]), dtype=data_gpu.dtype)
            xp.scatter_add(level_data_gpu, adm_array_map[level], data_gpu)
            q_data_gpu = xp.empty((len(percentiles), adm_sizes[level]), dtype=level_data_gpu.dtype)
            xp.percentile(level_data_gpu, q=percentiles, axis=1, out=q_data_gpu)

        return q_data_gpu

    try:
        percentiles = xp.array(quantiles, dtype=np.float64) * 100.0
        quantiles = np.array(quantiles)

        for date_i, date in enumerate(tqdm.tqdm(dates)):
            dataset = ds.dataset(data_dir, format="parquet", partitioning=["date"])
            table = dataset.to_table(filter=ds.field("date") == "date=" + str(date_i))
            table = table.drop(("date", "rid", "adm2_id"))  # we don't need these b/c metadata

            pop_weight_table = table.select(pop_weighted_cols)
            table = table.drop(pop_weighted_cols)

            # Apply the adm2 weights to every column, matching each column's float width
            w = np.ravel(np.broadcast_to(weights, (table.shape[0] // weights.shape[0], weights.shape[0])))
            for i, col in enumerate(table.column_names):
                if pat.is_float64(table.column(i).type):
                    typed_w = w.astype(np.float64)
                else:
                    typed_w = w.astype(np.float32)
                tmp = pac.multiply_checked(table.column(i), typed_w)
                table = table.set_column(i, col, tmp)

            # Population-weighted columns are scaled by total_population before aggregation
            for col in pop_weighted_cols:
                if pat.is_float64(pop_weight_table[col].type):
                    typed_w = table["total_population"].to_numpy().astype(np.float64)
                else:
                    typed_w = table["total_population"].to_numpy().astype(np.float32)
                tmp = pac.multiply_checked(pop_weight_table[col], typed_w)
                table = table.append_column(col, tmp)

            for level in args.levels:
                all_q_data = {}
                for col in table.column_names:
                    # TODO can we do all at once since we dropped date?
                    all_q_data[col] = pa_array_quantiles(table[col], level)
                # all_q_data = {col: pa_array_quantiles(table[col]) for col in table.column_names}

                # we could do this outside the date loop and cache for each adm level...
                out_shape = (len(percentiles),) + adm_level_values[level].shape
                all_q_data[level] = np.broadcast_to(adm_level_values[level], out_shape)
                all_q_data["date"] = np.broadcast_to(date, out_shape)
                all_q_data["quantile"] = np.broadcast_to(quantiles[..., None], out_shape)

                for col in per_capita_cols:
                    all_q_data[col + "_per_100k"] = 100000.0 * all_q_data[col] / all_q_data["total_population"]
                for col in pop_weighted_cols:
                    all_q_data[col] = all_q_data[col] / all_q_data["total_population"]

                for col in all_q_data:
                    all_q_data[col] = xp.to_cpu(all_q_data[col].T.ravel())

                write_queue.put((os.path.join(output_dir, level + "_quantiles.csv"), all_q_data))

            del dataset
            gc.collect()

    except (KeyboardInterrupt, SystemExit):
        logging.warning("Caught SIGINT, cleaning up")
        write_queue.put(None)  # send signal to term loop
        write_thread.join()  # join the write_thread
    finally:
        write_queue.put(None)  # send signal to term loop
        write_thread.join()  # join the write_thread
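# Minimal standalone sketch (not from the original source) of the writer-thread pattern
# used in main() above: a background thread drains a queue.Queue until it receives a
# None sentinel, and the producer joins the thread in a `finally` block so buffered
# output is flushed even when an error interrupts the main loop. Names here
# (_writer_sketch, _producer_sketch) are hypothetical.
import queue
import threading


def _writer_sketch(work_queue):
    # iter(get, None) keeps pulling items until the sentinel None arrives.
    for fname, payload in iter(work_queue.get, None):
        print("would write", len(payload), "columns to", fname)
        work_queue.task_done()


def _producer_sketch():
    work_queue = queue.Queue()
    writer = threading.Thread(target=_writer_sketch, args=(work_queue,))
    writer.start()
    try:
        for i in range(3):
            work_queue.put(("out_%d.csv" % i, {"a": [i], "b": [i * 2]}))
    finally:
        work_queue.put(None)  # sentinel: tell the writer to exit its loop
        writer.join()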