Example #1
0
    def initialize(self):
        """Prepare the formula callable and build the output field list.

        A string formula is compiled once into a code object and evaluated
        per-record later; any non-string formula is assumed to be callable.
        The output fields are the input fields plus one derived field.
        """
        # Python 2 idiom: basestring covers both str and unicode.
        if isinstance(self.formula, basestring):
            self._expression = compile(self.formula, "DeriveNode expression", "eval")
            self._formula_callable = self._eval_expression
        else:
            self._formula_callable = self.formula

        output = FieldList()
        for input_field in self.input.fields:
            output.append(input_field)

        derived = Field(self.field_name,
                        analytical_type=self.analytical_type,
                        storage_type=self.storage_type)
        output.append(derived)
        self._output_fields = output
Example #2
0
 def __init__(self, read_header=False, dialect=None, encoding=None,
              detect_header=False, sample_size=200, skip_rows=None,
              empty_as_null=True, **reader_args):
     """Creates a CSV data source stream.

     :Attributes:
         * resource: file name, URL or a file handle with CSV data
         * read_header: flag determining whether first line contains header
           or not. ``False`` by default.
         * encoding: source character encoding, by default no conversion is
           performed.
         * detect_header: try to determine whether data source has headers
           in first row or not
         * sample_size: maximum bytes to be read when detecting encoding
           and headers in file. By default it is set to 200 bytes to
           prevent loading huge CSV files at once.
         * skip_rows: number of rows to be skipped. Default: ``None``
         * empty_as_null: treat empty strings as ``Null`` values

     Note: avoid auto-detection when you are reading from remote URL
     stream.
     """
     # BUG FIX: the docstring used to appear *after* the super() call below,
     # which made it a discarded string expression instead of the method
     # docstring. It now comes first, where Python actually treats it as one.
     super(CSVDataSource, self).__init__(**reader_args)

     self.read_header = read_header
     self.encoding = encoding
     self.detect_header = detect_header
     self.empty_as_null = empty_as_null

     self.sample_size = sample_size
     self.reader_args = reader_args
     self.reader = None        # created in initialize()
     self.dialect = dialect

     self.close_file = False   # set by open_resource() in initialize()
     self.skip_rows = skip_rows
     # 'delim' and 'fields' are extracted from the generic reader arguments.
     self.delim = self.getVar('delim', reader_args)
     self.fieldlist = self.getVar('fields', reader_args)
     self.fields = FieldList(self.fieldlist)
Example #3
0
class DeriveNode(dp.DataProcess):
    """Node that derives a new field for each record using a formula.

    The formula may be a callable accepting the record fields as keyword
    arguments, or a string with a Python expression that is evaluated with
    the record fields visible as local names.
    """

    # NOTE: a second, conflicting ``node_info`` assignment used to precede
    # this one (it described a select/discard node, apparently copied from a
    # sibling class). It was dead code — this assignment overwrote it — so it
    # has been removed.
    node_info = {
        "label": "Derive Node",
        "description": "Derive a new field using an expression.",
        "attributes": [
            {"name": "field_name", "description": "Derived field name", "default": "new_field"},
            {
                "name": "formula",
                "description": "Callable or a string with python expression that will evaluate to " "new field value",
            },
            {"name": "analytical_type", "description": "Analytical type of the new field", "default": "unknown"},
            {"name": "storage_type", "description": "Storage type of the new field", "default": "unknown"},
        ],
    }

    @staticmethod
    def getTypeName():
        return "derive"

    def __init__(
        self, formula=None, field_name="new_field", analytical_type="unknown", storage_type="unknown", **kwargs
    ):
        """Creates and initializes selection node.

        :param formula: callable or Python expression string producing the
            derived field's value
        :param field_name: name of the derived field
        :param analytical_type: analytical type of the new field
        :param storage_type: storage type of the new field
        """
        super(DeriveNode, self).__init__(**kwargs)
        self.formula = formula
        self.field_name = field_name
        self.analytical_type = analytical_type
        self.storage_type = storage_type
        self._output_fields = None

    def _initMembersDict(self):
        # Register editable members with their types and defaults.
        super(DeriveNode, self)._initMembersDict()

        self._members["formula"] = [str, ""]
        self._members["field_name"] = [str, ""]

    def updateMembers(self):
        # Push current attribute values into the member store.
        super(DeriveNode, self).updateMembers()

        self.setMember("formula", self.formula)
        self.setMember("field_name", self.field_name)

    @property
    def output_fields(self):
        """Input fields plus the derived field (available after initialize())."""
        return self._output_fields

    def initialize(self):
        """Compile the formula (if a string) and build the output field list."""
        # Python 2 idiom: basestring covers both str and unicode.
        if isinstance(self.formula, basestring):
            self._expression = compile(self.formula, "DeriveNode expression", "eval")
            self._formula_callable = self._eval_expression
        else:
            self._formula_callable = self.formula

        self._output_fields = FieldList()

        for field in self.input.fields:
            self._output_fields.append(field)

        new_field = Field(self.field_name, analytical_type=self.analytical_type, storage_type=self.storage_type)
        self._output_fields.append(new_field)

    def _eval_expression(self, **record):
        # SECURITY: eval() of a user-supplied expression — formulas must come
        # from a trusted source, never from untrusted input.
        return eval(self._expression, None, record)

    def run(self):
        """Stream records, attaching the derived field to each one."""
        for record in self.input.records():
            if self._formula_callable:
                record[self.field_name] = self._formula_callable(**record)
            else:
                # No formula configured: the derived field is present but null.
                record[self.field_name] = None

            self.put_record(record)
Example #4
0
    def initialize(self):
        """Initialize CSV source stream:

        #. perform autodetection if required:
            #. detect encoding from a sample data (if requested)
            #. detect whether CSV has headers from a sample data (if
            requested)
        #.  create CSV reader object
        #.  read CSV headers if requested and initialize stream fields

        If fields are explicitly set prior to initialization, and header
        reading is requested, then the header row is just skipped and fields
        that were set before are used. Do not set fields if you want to read
        the header.

        All fields are set to `storage_type` = ``string`` and
        `analytical_type` = ``unknown``.
        """
        # CLEANUP: removed an unused local (`handle = None`) and the stale
        # commented-out csv.reader(...) call that referenced it.
        self.file, self.close_file = open_resource(self.resource)

        if self.detect_header:
            sample = self.file.read(self.sample_size)

            # NOTE(review): sample is encoded to UTF-8 because csv.Sniffer
            # expects a byte string on Python 2 — confirm the sample is
            # unicode at this point.
            sample = sample.encode('utf-8')
            sniffer = csv.Sniffer()
            self.read_header = sniffer.has_header(sample)

            # Rewind so the reader sees the sampled rows again.
            self.file.seek(0)

        if self.dialect:
            # A string names a registered dialect; anything else is assumed
            # to already be a dialect object.
            if isinstance(self.dialect, str):
                dialect = csv.get_dialect(self.dialect)
            else:
                dialect = self.dialect

            self.reader_args["dialect"] = dialect

        self.reader = UnicodeReader(self.file, encoding=self.encoding,
                                    empty_as_null=self.empty_as_null,
                                    delim=self.delim)

        if self.skip_rows:
            for i in range(0, self.skip_rows):
                self.reader.next()

        # Initialize field list
        if self.read_header:
            field_names = self.reader.next()

            # Fields set explicitly take priority over what is read from the
            # header. (Issue #17 might be somehow related)
            if not self.fields:
                fields = [(name, "string", "default") for name in field_names]
                self.fields = FieldList(fields)

        if not self.fields:
            raise RuntimeError("Fields are not initialized. "
                               "Either read fields from CSV header or "
                               "set them manually")

        # Normalize whatever we have into a FieldList.
        self.fields = FieldList(self.fields)

        self.reader.set_fields(self.fields)
Example #5
0
class CSVDataSource(DataSource):
    """Data source stream that reads records from a CSV resource."""

    @staticmethod
    def getTypeName():
        return "csv"

    def __init__(self, read_header=False, dialect=None, encoding=None,
                 detect_header=False, sample_size=200, skip_rows=None,
                 empty_as_null=True, **reader_args):
        """Creates a CSV data source stream.

        :Attributes:
            * resource: file name, URL or a file handle with CSV data
            * read_header: flag determining whether first line contains header
              or not. ``False`` by default.
            * encoding: source character encoding, by default no conversion is
              performed.
            * detect_header: try to determine whether data source has headers
              in first row or not
            * sample_size: maximum bytes to be read when detecting encoding
              and headers in file. By default it is set to 200 bytes to
              prevent loading huge CSV files at once.
            * skip_rows: number of rows to be skipped. Default: ``None``
            * empty_as_null: treat empty strings as ``Null`` values

        Note: avoid auto-detection when you are reading from remote URL
        stream.
        """
        # BUG FIX: the docstring used to appear *after* the super() call,
        # making it a discarded string expression rather than the docstring.
        super(CSVDataSource, self).__init__(**reader_args)

        self.read_header = read_header
        self.encoding = encoding
        self.detect_header = detect_header
        self.empty_as_null = empty_as_null

        self.sample_size = sample_size
        self.reader_args = reader_args
        self.reader = None        # created in initialize()
        self.dialect = dialect

        self.close_file = False   # set by open_resource() in initialize()
        self.skip_rows = skip_rows
        # 'delim' and 'fields' are extracted from the generic reader args.
        self.delim = self.getVar('delim', reader_args)
        self.fieldlist = self.getVar('fields', reader_args)
        self.fields = FieldList(self.fieldlist)

    def _initMembersDict(self):
        # Register editable members with their types and defaults.
        super(CSVDataSource, self)._initMembersDict()
        self._members['fields'] = [str, [
          ['id', 'integer'], ['pos_x', 'float'], ['pos_y','float'], ['pos_z', 'float'], 
          ['euler_x', 'float'], ['euler_y', 'float'], ['euler_z', 'float']]
          ]
        self._members['delim'] = [str, ","]

    def updateMembers(self):
        # Push current attribute values into the member store.
        super(CSVDataSource, self).updateMembers()
        self.setMember('fields', self.fieldlist)
        self.setMember('delim', self.delim)

    def initialize(self):
        """Initialize CSV source stream:

        #. perform autodetection if required:
            #. detect encoding from a sample data (if requested)
            #. detect whether CSV has headers from a sample data (if
            requested)
        #.  create CSV reader object
        #.  read CSV headers if requested and initialize stream fields

        If fields are explicitly set prior to initialization, and header
        reading is requested, then the header row is just skipped and fields
        that were set before are used. Do not set fields if you want to read
        the header.

        All fields are set to `storage_type` = ``string`` and
        `analytical_type` = ``unknown``.
        """
        # CLEANUP: removed an unused local (`handle = None`) and the stale
        # commented-out csv.reader(...) call that referenced it.
        self.file, self.close_file = open_resource(self.resource)

        if self.detect_header:
            sample = self.file.read(self.sample_size)

            # NOTE(review): sample is encoded to UTF-8 because csv.Sniffer
            # expects a byte string on Python 2 — confirm the sample is
            # unicode at this point.
            sample = sample.encode('utf-8')
            sniffer = csv.Sniffer()
            self.read_header = sniffer.has_header(sample)

            # Rewind so the reader sees the sampled rows again.
            self.file.seek(0)

        if self.dialect:
            # A string names a registered dialect; anything else is assumed
            # to already be a dialect object.
            if isinstance(self.dialect, str):
                dialect = csv.get_dialect(self.dialect)
            else:
                dialect = self.dialect

            self.reader_args["dialect"] = dialect

        self.reader = UnicodeReader(self.file, encoding=self.encoding,
                                    empty_as_null=self.empty_as_null,
                                    delim=self.delim)

        if self.skip_rows:
            for i in range(0, self.skip_rows):
                self.reader.next()

        # Initialize field list
        if self.read_header:
            field_names = self.reader.next()

            # Fields set explicitly take priority over what is read from the
            # header. (Issue #17 might be somehow related)
            if not self.fields:
                fields = [(name, "string", "default") for name in field_names]
                self.fields = FieldList(fields)

        if not self.fields:
            raise RuntimeError("Fields are not initialized. "
                               "Either read fields from CSV header or "
                               "set them manually")

        # Normalize whatever we have into a FieldList.
        self.fields = FieldList(self.fields)

        self.reader.set_fields(self.fields)

    def finalize(self):
        """Close the underlying file if this stream opened it."""
        if self.file and self.close_file:
            self.file.close()

    def rows(self):
        """Return an iterator over raw rows; stream must be initialized."""
        if not self.reader:
            raise RuntimeError("Stream is not initialized")
        if not self.fields:
            raise RuntimeError("Fields are not initialized")
        return self.reader

    def records(self):
        """Yield each row as a dict keyed by field name."""
        fields = self.fields.names()
        for row in self.reader:
            yield dict(zip(fields, row))