def read_text(self, path):
    """
    Creates a DataSet of the Strings obtained by reading the given file line by line.

    The file will be read with the system's default character set.

    :param path: The path of the file, as a URI
                 (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
    :return: A DataSet that represents the data read from the given file as text lines.
    """
    source_info = OperationInfo()
    # The DataSet wraps the same info object that is filled in below.
    text_set = DataSet(self, source_info)
    source_info.identifier = _Identifier.SOURCE_TEXT
    source_info.path = path
    # Record the source description in this object's list of sources.
    self._sources.append(source_info)
    return text_set
def generate_sequence(self, frm, to):
    """
    Creates a new data set that contains the given sequence of numbers.

    :param frm: The start number for the sequence.
    :param to: The end number for the sequence.
    :return: A DataSet representing the given sequence of numbers.
    """
    source_info = OperationInfo()
    # The DataSet wraps the same info object that is filled in below.
    sequence_set = DataSet(self, source_info)
    source_info.identifier = _Identifier.SOURCE_SEQ
    source_info.frm = frm
    source_info.to = to
    # Record the source description in this object's list of sources.
    self._sources.append(source_info)
    return sequence_set
def read_custom(self, path, filter, splits, format):
    """
    Creates a DataSet using a custom input format that is executed directly in the
    Python process.

    :param path: Stored as the ``path`` of the source description.
    :param filter: Stored as the ``filter`` of the source description.
    :param splits: Stored as the ``computeSplits`` of the source description.
    :param format: The custom input format; a deep copy of it is stored as the
                   ``operator`` of the source description.
    :return: A DataSet backed by the newly registered custom source.
    """
    source_info = OperationInfo()
    # The DataSet wraps the same info object that is filled in below.
    custom_set = DataSet(self, source_info)
    source_info.identifier = _Identifier.SOURCE_CUSTOM
    source_info.name = "PythonInputFormat"
    source_info.path = path
    source_info.filter = filter
    source_info.computeSplits = splits
    # Deep-copy so the stored operator is independent of the caller's object.
    source_info.operator = copy.deepcopy(format)
    source_info.types = _createArrayTypeInfo()
    # Record the source description in this object's list of sources.
    self._sources.append(source_info)
    return custom_set
def from_elements(self, *elements):
    """
    Creates a new data set that contains the given elements.

    The elements must all be of the same type, for example, all of the String or Integer.
    The sequence of elements must not be empty.
    Neither requirement is checked by this method itself.

    :param elements: The elements to make up the data set, passed as positional arguments.
    :return: A DataSet representing the given list of elements.
    """
    source_info = OperationInfo()
    # The DataSet wraps the same info object that is filled in below.
    element_set = DataSet(self, source_info)
    source_info.identifier = _Identifier.SOURCE_VALUE
    # `elements` is the tuple of positional arguments, stored as-is.
    source_info.values = elements
    # Record the source description in this object's list of sources.
    self._sources.append(source_info)
    return element_set
def read_csv(self, path, types, line_delimiter="\n", field_delimiter=','):
    """
    Creates a DataSet that represents the tuples produced by reading the given CSV file.

    :param path: The path of the CSV file.
    :param types: Specifies the types for the CSV fields.
    :param line_delimiter: The line delimiter stored for this source (default "\\n").
    :param field_delimiter: The field delimiter stored for this source (default ",").
    :return: A DataSet backed by the newly registered CSV source.
    """
    # NOTE(review): this file contains a second `read_csv` definition; if both live
    # in the same scope the later one replaces this one -- confirm which is intended.
    source_info = OperationInfo()
    # The DataSet wraps the same info object that is filled in below.
    csv_set = DataSet(self, source_info)
    source_info.identifier = _Identifier.SOURCE_CSV
    source_info.delimiter_line = line_delimiter
    source_info.delimiter_field = field_delimiter
    source_info.path = path
    source_info.types = types
    # Record the source description in this object's list of sources.
    self._sources.append(source_info)
    return csv_set
def read_csv(self, path, types, line_delimiter="\n", field_delimiter=','):
    """
    Creates a DataSet that represents the tuples produced by reading the given CSV file.

    :param path: The path of the CSV file.
    :param types: Specifies the types for the CSV fields.
    :param line_delimiter: The line delimiter stored for this source (default "\\n").
    :param field_delimiter: The field delimiter stored for this source (default ",").
    :return: A DataSet backed by the newly registered CSV source.
    """
    # NOTE(review): this file contains an earlier `read_csv` definition with the same
    # signature; if both live in the same scope this one replaces it. This variant
    # describes the source with a plain dict keyed by `_Fields` rather than an
    # `OperationInfo` object -- confirm which representation is intended.
    source_info = {}
    # The DataSet wraps the same dict object that is filled in below.
    csv_set = DataSet(self, source_info)
    source_info.update({
        _Fields.IDENTIFIER: _Identifier.SOURCE_CSV,
        _Fields.DELIMITER_LINE: line_delimiter,
        _Fields.DELIMITER_FIELD: field_delimiter,
        _Fields.PATH: path,
        _Fields.TYPES: types,
    })
    # Record the source description in this object's list of sources.
    self._sources.append(source_info)
    return csv_set