Beispiel #1
0
    def read_text(self, path):
        """
        Registers a text file as a source and exposes its lines as a DataSet of Strings.

        The file will be read with the system's default character set.

        :param path: The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
        :return: A DataSet that represents the data read from the given file as text lines.
        """
        source_info = OperationInfo()
        # The DataSet wraps the info object first; the source details are filled in afterwards.
        text_lines = DataSet(self, source_info)
        source_info.identifier = _Identifier.SOURCE_TEXT
        source_info.path = path
        # Remember the source in this object's list of sources.
        self._sources.append(source_info)
        return text_lines
Beispiel #2
0
    def generate_sequence(self, frm, to):
        """
        Registers a number-sequence source and returns the DataSet backed by it.

        :param frm: The start number for the sequence.
        :param to: The end number for the sequence.
        :return: A DataSet representing the given sequence of numbers.
        """
        source_info = OperationInfo()
        # The DataSet wraps the info object first; the sequence bounds are filled in afterwards.
        sequence = DataSet(self, source_info)
        source_info.identifier = _Identifier.SOURCE_SEQ
        source_info.frm = frm
        source_info.to = to
        # Remember the source in this object's list of sources.
        self._sources.append(source_info)
        return sequence
Beispiel #3
0
 def read_custom(self, path, filter, splits, format):
     """
     Creates a DataSet using a custom input format that is executed directly in the Python process.

     :param path: Stored on the source description as its ``path``.
     :param filter: Stored on the source description as its ``filter``.
     :param splits: Stored on the source description as ``computeSplits``.
     :param format: The custom input format; a deep copy of it is stored as the source's ``operator``.
     :return: The DataSet backed by the newly registered custom source.
     """
     source_info = OperationInfo()
     # The DataSet wraps the info object first; the source details are filled in afterwards.
     custom_set = DataSet(self, source_info)
     source_info.identifier = _Identifier.SOURCE_CUSTOM
     source_info.name = "PythonInputFormat"
     source_info.path = path
     source_info.filter = filter
     source_info.computeSplits = splits
     # Deep-copy so the stored operator is independent of the caller's format object.
     source_info.operator = copy.deepcopy(format)
     source_info.types = _createArrayTypeInfo()
     # Remember the source in this object's list of sources.
     self._sources.append(source_info)
     return custom_set
Beispiel #4
0
    def from_elements(self, *elements):
        """
        Registers a source made of the given elements and returns the DataSet backed by it.

        The elements must all be of the same type, for example, all of the String or Integer.
        The sequence of elements must not be empty (this method itself does not check that).

        :param elements: The elements to make up the data set; stored as the tuple of positional arguments.
        :return: A DataSet representing the given list of elements.
        """
        source_info = OperationInfo()
        # The DataSet wraps the info object first; the values are filled in afterwards.
        element_set = DataSet(self, source_info)
        source_info.identifier = _Identifier.SOURCE_VALUE
        source_info.values = elements
        # Remember the source in this object's list of sources.
        self._sources.append(source_info)
        return element_set
Beispiel #5
0
    def read_csv(self, path, types, line_delimiter="\n", field_delimiter=','):
        """
        Registers a CSV file as a source and returns the DataSet of tuples read from it.

        :param path: The path of the CSV file.
        :param types: Specifies the types for the CSV fields.
        :param line_delimiter: The delimiter between lines; defaults to a newline.
        :param field_delimiter: The delimiter between fields; defaults to a comma.
        :return: A DataSet backed by the newly registered CSV source.
        """
        source_info = OperationInfo()
        # The DataSet wraps the info object first; the CSV settings are filled in afterwards.
        csv_set = DataSet(self, source_info)
        source_info.identifier = _Identifier.SOURCE_CSV
        source_info.delimiter_line = line_delimiter
        source_info.delimiter_field = field_delimiter
        source_info.path = path
        source_info.types = types
        # Remember the source in this object's list of sources.
        self._sources.append(source_info)
        return csv_set
Beispiel #6
0
    def read_csv(self, path, types, line_delimiter="\n", field_delimiter=','):
        """
        Registers a CSV file as a source and returns the DataSet of tuples read from it.

        The source is described by a plain dictionary keyed by ``_Fields`` constants.

        :param path: The path of the CSV file.
        :param types: Specifies the types for the CSV fields.
        :param line_delimiter: The delimiter between lines; defaults to a newline.
        :param field_delimiter: The delimiter between fields; defaults to a comma.
        :return: A DataSet backed by the newly registered CSV source.
        """
        source_info = {}
        # The DataSet wraps the still-empty dictionary; the entries are added afterwards.
        csv_set = DataSet(self, source_info)
        source_info[_Fields.IDENTIFIER] = _Identifier.SOURCE_CSV
        source_info[_Fields.DELIMITER_LINE] = line_delimiter
        source_info[_Fields.DELIMITER_FIELD] = field_delimiter
        source_info[_Fields.PATH] = path
        source_info[_Fields.TYPES] = types
        # Remember the source in this object's list of sources.
        self._sources.append(source_info)
        return csv_set