def __init__(self,
             inputs: Union[str, List[str]],
             ioctx: Optional[IOContext] = None):
    """Initializes a JsonReader instance.

    Args:
        inputs: Either a glob expression for files, e.g.,
            "/tmp/**/*.json", or a list of single file paths or URIs,
            e.g., ["s3://bucket/file.json", "s3://bucket/file2.json"].
        ioctx: Current IO context object or None.
    """
    self.ioctx = ioctx or IOContext()
    if isinstance(inputs, str):
        # Normalize local paths (expand "~", make absolute).
        inputs = os.path.abspath(os.path.expanduser(inputs))
        if os.path.isdir(inputs):
            inputs = os.path.join(inputs, "*.json")
            logger.warning(
                "Treating input directory as glob pattern: {}".format(
                    inputs))
        # Only local paths (empty scheme, or a Windows drive letter
        # mis-parsed as a scheme) can be globbed over; anything else
        # (s3://, hdfs://, ...) must be given as an explicit file list.
        if urlparse(inputs).scheme not in [""] + WINDOWS_DRIVES:
            raise ValueError(
                "Don't know how to glob over `{}`, ".format(inputs) +
                "please specify a list of files to read instead.")
        else:
            self.files = glob.glob(inputs)
    # isinstance (not `type(...) is list`) so list subclasses also work.
    elif isinstance(inputs, list):
        self.files = inputs
    else:
        raise ValueError(
            "type of inputs must be list or str, not {}".format(inputs))
    if self.files:
        logger.info("Found {} input files.".format(len(self.files)))
    else:
        raise ValueError("No files found matching {}".format(inputs))
    # Currently open input file handle (opened lazily on first read).
    self.cur_file = None
def __init__(self,
             path: str,
             ioctx: Optional[IOContext] = None,
             max_file_size: int = 64 * 1024 * 1024,
             compress_columns: List[str] = frozenset(["obs", "new_obs"])):
    """Initializes a JsonWriter instance.

    Args:
        path: a path/URI of the output directory to save files in.
        ioctx: current IO context object.
        max_file_size: max size of single files before rolling over.
        compress_columns: list of sample batch columns to compress.
    """
    self.path = path
    self.ioctx = ioctx or IOContext()
    self.max_file_size = max_file_size
    self.compress_columns = compress_columns
    if urlparse(path).scheme:
        # Remote URI (e.g. s3://...): nothing to create locally.
        self.path_is_uri = True
    else:
        # Local path: create output dirs if they don't exist yet.
        # `exist_ok=True` tolerates an existing directory but still
        # surfaces real failures (e.g. permission errors), unlike the
        # previous blanket `except OSError: pass`.
        os.makedirs(path, exist_ok=True)
        assert os.path.exists(path), "Failed to create {}".format(path)
        self.path_is_uri = False
    self.file_index = 0
    self.bytes_written = 0
    self.cur_file = None
def __init__(self,
             path: str,
             ioctx: IOContext = None,
             max_file_size: int = 64 * 1024 * 1024,
             compress_columns: List[str] = frozenset(["obs", "new_obs"])):
    """Initializes a JsonWriter instance.

    Args:
        path: a path/URI of the output directory to save files in.
        ioctx: current IO context object.
        max_file_size: max size of single files before rolling over.
        compress_columns: list of sample batch columns to compress.
    """
    self.ioctx = ioctx or IOContext()
    self.max_file_size = max_file_size
    self.compress_columns = compress_columns
    # A path is local if it has no URI scheme, or its "scheme" is really
    # a Windows drive letter (e.g. "c:"); everything else is remote.
    is_local = urlparse(path).scheme in [""] + WINDOWS_DRIVES
    self.path_is_uri = not is_local
    if is_local:
        path = os.path.abspath(os.path.expanduser(path))
        # Create the output directory if it is not there yet.
        try:
            os.makedirs(path)
        except OSError:
            pass  # already exists
        assert os.path.exists(path), "Failed to create {}".format(path)
    self.path = path
    self.file_index = 0
    self.bytes_written = 0
    self.cur_file = None
def __init__(self,
             ioctx: IOContext = None,
             compress_columns: List[str] = frozenset(["obs", "new_obs"])):
    """Initializes a DatasetWriter instance.

    Examples:
        config = {
            "output"="dataset",
            "output_config"={
                "format": "json",
                "path": "/tmp/test_samples/",
                "max_num_samples_per_file": 100000,
            }
        }

    Args:
        ioctx: current IO context object.
        compress_columns: list of sample batch columns to compress.
    """
    self.ioctx = ioctx or IOContext()
    # Read from self.ioctx (never the raw `ioctx` arg): the argument may
    # be None, which previously raised AttributeError right here.
    output_config: Dict = self.ioctx.output_config
    assert "format" in output_config, (
        "output_config.format must be specified when using Dataset output.")
    assert "path" in output_config, (
        "output_config.path must be specified when using Dataset output.")

    self.format = output_config["format"]
    self.path = os.path.abspath(os.path.expanduser(output_config["path"]))
    # Roll over to a new output file after this many samples.
    self.max_num_samples_per_file = output_config.get(
        "max_num_samples_per_file", 100000)
    self.compress_columns = compress_columns
    self.samples = []
def __init__(self,
             inputs: Union[str, List[str]],
             ioctx: Optional[IOContext] = None):
    """Initializes a JsonReader instance.

    Args:
        inputs: Either a glob expression for files, e.g.
            `/tmp/**/*.json`, or a list of single file paths or URIs, e.g.,
            ["s3://bucket/file.json", "s3://bucket/file2.json"].
        ioctx: Current IO context object or None.
    """
    logger.info("You are using JSONReader. It is recommended to use " +
                "DatasetReader instead for better sharding support.")
    self.ioctx = ioctx or IOContext()
    self.default_policy = self.policy_map = None
    worker = self.ioctx.worker
    if worker is not None:
        self.policy_map = worker.policy_map
        self.default_policy = self.policy_map.get(DEFAULT_POLICY_ID)

    if isinstance(inputs, str):
        inputs = os.path.abspath(os.path.expanduser(inputs))
        if os.path.isdir(inputs):
            # A directory means: read every json/zip file inside it.
            inputs = [
                os.path.join(inputs, "*.json"),
                os.path.join(inputs, "*.zip")
            ]
            logger.warning(
                f"Treating input directory as glob patterns: {inputs}")
        else:
            inputs = [inputs]

        # Globbing only works for local paths (empty scheme or a Windows
        # drive letter that urlparse mistakes for a scheme).
        local_schemes = [""] + WINDOWS_DRIVES
        if any(urlparse(pattern).scheme not in local_schemes
               for pattern in inputs):
            raise ValueError(
                "Don't know how to glob over `{}`, ".format(inputs) +
                "please specify a list of files to read instead.")
        matched = []
        for pattern in inputs:
            matched.extend(glob.glob(pattern))
        self.files = matched
    elif isinstance(inputs, (list, tuple)):
        self.files = list(inputs)
    else:
        raise ValueError(
            "type of inputs must be list or str, not {}".format(inputs))

    if not self.files:
        raise ValueError("No files found matching {}".format(inputs))
    logger.info("Found {} input files.".format(len(self.files)))
    self.cur_file = None
def __init__(self,
             inputs: Union[str, List[str]],
             ioctx: Optional[IOContext] = None):
    """Initialize a JsonReader.

    Args:
        inputs: Either a glob expression for files, e.g.,
            "/tmp/**/*.json", or a list of single file paths or URIs,
            e.g., ["s3://bucket/file.json", "s3://bucket/file2.json"].
        ioctx: Current IO context object.
    """
    self.ioctx = ioctx or IOContext()
    self.default_policy = None
    if self.ioctx.worker is not None:
        self.default_policy = \
            self.ioctx.worker.policy_map.get(DEFAULT_POLICY_ID)
    if isinstance(inputs, str):
        inputs = os.path.abspath(os.path.expanduser(inputs))
        if os.path.isdir(inputs):
            # A directory means: read every .json and .zip file inside.
            inputs = [
                os.path.join(inputs, "*.json"),
                os.path.join(inputs, "*.zip")
            ]
            logger.warning(
                f"Treating input directory as glob patterns: {inputs}")
        else:
            inputs = [inputs]

        # Globbing only works over local paths (empty scheme or a
        # Windows drive letter that urlparse mistakes for a scheme).
        if any(
                urlparse(i).scheme not in [""] + WINDOWS_DRIVES
                for i in inputs):
            raise ValueError(
                "Don't know how to glob over `{}`, ".format(inputs) +
                "please specify a list of files to read instead.")
        else:
            self.files = []
            for i in inputs:
                self.files.extend(glob.glob(i))
    # isinstance (not `type(...) is list`) so list subclasses also work.
    elif isinstance(inputs, list):
        self.files = inputs
    else:
        raise ValueError(
            "type of inputs must be list or str, not {}".format(inputs))
    if self.files:
        logger.info("Found {} input files.".format(len(self.files)))
    else:
        raise ValueError("No files found matching {}".format(inputs))
    self.cur_file = None
def __init__(self, dist: Dict[JsonReader, float], ioctx: IOContext):
    """Initialize a MixedInput.

    Args:
        dist (dict): dict mapping JSONReader paths or "sampler" to
            probabilities. The probabilities must sum to 1.0.
        ioctx (IOContext): current IO context object.
    """
    if sum(dist.values()) != 1.0:
        raise ValueError("Values must sum to 1.0: {}".format(dist))
    self.choices = []
    self.p = []
    for k, v in dist.items():
        if k == "sampler":
            # Fall back to the environment sampler input.
            self.choices.append(ioctx.default_sampler_input())
        else:
            # Forward the IO context so the child reader shares worker
            # state (previously `ioctx` was dropped, leaving the reader
            # with a fresh default IOContext).
            self.choices.append(JsonReader(k, ioctx))
        self.p.append(v)
def __init__(self, dist: Dict[JsonReader, float], ioctx: IOContext):
    """Initialize a MixedInput.

    Args:
        dist (dict): dict mapping JSONReader paths or "sampler" to
            probabilities. The probabilities must sum to 1.0.
        ioctx (IOContext): current IO context object.
    """
    # Compare with a small tolerance: exact float equality wrongly
    # rejects valid distributions such as ten 0.1 entries (whose float
    # sum is 0.9999999999999999, not 1.0).
    if abs(sum(dist.values()) - 1.0) > 1e-6:
        raise ValueError("Values must sum to 1.0: {}".format(dist))
    self.choices = []
    self.p = []
    for k, v in dist.items():
        if k == "sampler":
            # Fall back to the environment sampler input.
            self.choices.append(ioctx.default_sampler_input())
        elif isinstance(k, FunctionType):
            # A callable input creator: invoke it with the IO context.
            self.choices.append(k(ioctx))
        elif isinstance(k, str) and registry_contains_input(k):
            # A registered input name: resolve via the input registry.
            input_creator = registry_get_input(k)
            self.choices.append(input_creator(ioctx))
        else:
            # Otherwise assume a path/glob readable by JsonReader.
            self.choices.append(JsonReader(k, ioctx))
        self.p.append(v)