def partitioning(field_names=None, flavor=None):
    """
    Specify a partitioning scheme.

    The supported schemes include:

    - "DirectoryPartitioning": this scheme expects one segment in the file
      path for each field in the specified schema (all fields are required
      to be present). For example given schema<year:int16, month:int8> the
      path "/2009/11" would be parsed to ("year"_ == 2009 and "month"_ == 11).
    - "HivePartitioning": a scheme for "/$key=$value/" nested directories
      as found in Apache Hive. This is a multi-level, directory based
      partitioning scheme. Data is partitioned by static values of a
      particular column in the schema. Partition keys are represented in the
      form $key=$value in directory names. Field order is ignored, as are
      missing or unrecognized field names. For example, given
      schema<year:int16, month:int8, day:int8>, a possible path would be
      "/year=2009/month=11/day=15" (but the field order does not need to
      match).

    Parameters
    ----------
    field_names : pyarrow.Schema or list of str
        The schema that describes the partitions present in the file path.
        If a list of strings (field names) is passed, the schema's types are
        inferred from the file paths (only valid for DirectoryPartitioning).
    flavor : str, default None
        The default is DirectoryPartitioning. Specify ``flavor="hive"`` for
        a HivePartitioning.

    Returns
    -------
    Partitioning or PartitioningFactory

    Examples
    --------

    Specify the Schema for paths like "/2009/June":

    >>> partitioning(pa.schema([("year", pa.int16()), ("month", pa.string())]))

    or let the types be inferred by only specifying the field names:

    >>> partitioning(["year", "month"])

    For paths like "/2009/June", the year will be inferred as int32 while month
    will be inferred as string.

    Create a Hive scheme for a path like "/year=2009/month=11":

    >>> partitioning(
    ...     pa.schema([("year", pa.int16()), ("month", pa.int8())]),
    ...     flavor="hive")

    A Hive scheme can also be discovered from the directory structure (and
    types will be inferred):

    >>> partitioning(flavor="hive")
    """
    if flavor == 'hive':
        # Hive flavor: a concrete Schema gives a fixed partitioning,
        # None triggers discovery from the directory structure.
        if isinstance(field_names, pa.Schema):
            return HivePartitioning(field_names)
        if field_names is None:
            return HivePartitioning.discover()
        raise ValueError(
            "Expected Schema or None for 'field_names', got {0}".format(
                type(field_names)))

    if flavor is None:
        # Default (directory) flavor: a Schema gives a fixed partitioning,
        # a list of names triggers type inference from the file paths.
        if isinstance(field_names, pa.Schema):
            return DirectoryPartitioning(field_names)
        if isinstance(field_names, list):
            return DirectoryPartitioning.discover(field_names)
        if field_names is None:
            raise ValueError(
                "For the default directory flavor, need to specify "
                "'field_names' as Schema or list of field names")
        raise ValueError(
            "Expected Schema or list of field names, got {0}".format(
                type(field_names)))

    raise ValueError("Unsupported flavor")
def partitioning(schema=None, field_names=None, flavor=None,
                 dictionaries=None):
    """
    Specify a partitioning scheme.

    The supported schemes include:

    - "DirectoryPartitioning": this scheme expects one segment in the file
      path for each field in the specified schema (all fields are required
      to be present). For example given schema<year:int16, month:int8> the
      path "/2009/11" would be parsed to ("year"_ == 2009 and "month"_ == 11).
    - "HivePartitioning": a scheme for "/$key=$value/" nested directories
      as found in Apache Hive. This is a multi-level, directory based
      partitioning scheme. Data is partitioned by static values of a
      particular column in the schema. Partition keys are represented in the
      form $key=$value in directory names. Field order is ignored, as are
      missing or unrecognized field names. For example, given
      schema<year:int16, month:int8, day:int8>, a possible path would be
      "/year=2009/month=11/day=15" (but the field order does not need to
      match).

    Parameters
    ----------
    schema : pyarrow.Schema, default None
        The schema that describes the partitions present in the file path.
        If not specified, and `field_names` and/or `flavor` are specified,
        the schema will be inferred from the file path (and a
        PartitioningFactory is returned).
    field_names :  list of str, default None
        A list of strings (field names). If specified, the schema's types are
        inferred from the file paths (only valid for DirectoryPartitioning).
    flavor : str, default None
        The default is DirectoryPartitioning. Specify ``flavor="hive"`` for
        a HivePartitioning.
    dictionaries : Dict[str, Array]
        If the type of any field of `schema` is a dictionary type, the
        corresponding entry of `dictionaries` must be an array containing
        every value which may be taken by the corresponding column or an
        error will be raised in parsing. Alternatively, pass `infer` to have
        Arrow discover the dictionary values, in which case a
        PartitioningFactory is returned.

    Returns
    -------
    Partitioning or PartitioningFactory

    Raises
    ------
    ValueError
        If an invalid combination of arguments is passed (e.g. both
        `schema` and `field_names`, a non-Schema `schema`, or an
        unsupported `flavor`).

    Examples
    --------

    Specify the Schema for paths like "/2009/June":

    >>> partitioning(pa.schema([("year", pa.int16()), ("month", pa.string())]))

    or let the types be inferred by only specifying the field names:

    >>> partitioning(field_names=["year", "month"])

    For paths like "/2009/June", the year will be inferred as int32 while month
    will be inferred as string.

    Specify a Schema with dictionary encoding, providing dictionary values:

    >>> partitioning(
    ...     pa.schema([
    ...         ("year", pa.int16()),
    ...         ("month", pa.dictionary(pa.int8(), pa.string()))
    ...     ]),
    ...     dictionaries={
    ...         "month": pa.array(["January", "February", "March"]),
    ...     })

    Alternatively, specify a Schema with dictionary encoding, but have Arrow
    infer the dictionary values:

    >>> partitioning(
    ...     pa.schema([
    ...         ("year", pa.int16()),
    ...         ("month", pa.dictionary(pa.int8(), pa.string()))
    ...     ]),
    ...     dictionaries="infer")

    Create a Hive scheme for a path like "/year=2009/month=11":

    >>> partitioning(
    ...     pa.schema([("year", pa.int16()), ("month", pa.int8())]),
    ...     flavor="hive")

    A Hive scheme can also be discovered from the directory structure (and
    types will be inferred):

    >>> partitioning(flavor="hive")
    """
    if flavor is None:
        # default flavor
        if schema is not None:
            if field_names is not None:
                raise ValueError(
                    "Cannot specify both 'schema' and 'field_names'")
            # Validate the schema type here, mirroring the 'hive' branch,
            # instead of letting DirectoryPartitioning fail opaquely.
            if not isinstance(schema, pa.Schema):
                raise ValueError("Expected Schema for 'schema', got {}".format(
                    type(schema)))
            if dictionaries == 'infer':
                # Dictionary values must be discovered from the paths, so a
                # factory is returned rather than a concrete Partitioning.
                return DirectoryPartitioning.discover(schema=schema)
            return DirectoryPartitioning(schema, dictionaries)
        elif field_names is not None:
            if isinstance(field_names, list):
                return DirectoryPartitioning.discover(field_names)
            else:
                raise ValueError("Expected list of field names, got {}".format(
                    type(field_names)))
        else:
            raise ValueError(
                "For the default directory flavor, need to specify "
                "a Schema or a list of field names")
    elif flavor == 'hive':
        if field_names is not None:
            raise ValueError("Cannot specify 'field_names' for flavor 'hive'")
        elif schema is not None:
            if isinstance(schema, pa.Schema):
                if dictionaries == 'infer':
                    return HivePartitioning.discover(schema=schema)
                return HivePartitioning(schema, dictionaries)
            else:
                raise ValueError("Expected Schema for 'schema', got {}".format(
                    type(schema)))
        else:
            return HivePartitioning.discover()
    else:
        raise ValueError("Unsupported flavor")