def initialize(self):
    """Prepare the formula callable and build this node's output field list."""
    # A string formula is compiled once here and evaluated per record;
    # anything else is assumed to already be a callable.
    if isinstance(self.formula, basestring):
        self._expression = compile(self.formula, "DeriveNode expression", "eval")
        self._formula_callable = self._eval_expression
    else:
        self._formula_callable = self.formula

    # Output fields = every input field, followed by the newly derived field.
    self._output_fields = FieldList()
    for input_field in self.input.fields:
        self._output_fields.append(input_field)
    derived = Field(self.field_name,
                    analytical_type=self.analytical_type,
                    storage_type=self.storage_type)
    self._output_fields.append(derived)
def __init__(self, read_header=False, dialect=None, encoding=None,
             detect_header=False, sample_size=200, skip_rows=None,
             empty_as_null=True, **reader_args):
    """Creates a CSV data source stream.

    :Attributes:
        * resource: file name, URL or a file handle with CSV data
        * read_header: flag determining whether first line contains
          header or not. ``False`` by default.
        * encoding: source character encoding, by default no conversion
          is performed.
        * detect_header: try to determine whether data source has
          headers in first row or not
        * sample_size: maximum bytes to be read when detecting encoding
          and headers in file. By default it is set to 200 bytes to
          prevent loading huge CSV files at once.
        * skip_rows: number of rows to be skipped. Default: ``None``
        * empty_as_null: treat empty strings as ``Null`` values

    Note: avoid auto-detection when you are reading from remote URL
    stream.
    """
    # BUG FIX: the docstring previously followed the super() call, which
    # made it a discarded string statement instead of the method __doc__.
    super(CSVDataSource, self).__init__(**reader_args)

    self.read_header = read_header
    self.encoding = encoding
    self.detect_header = detect_header
    self.empty_as_null = empty_as_null
    self.sample_size = sample_size
    self.reader_args = reader_args
    self.reader = None
    self.dialect = dialect
    self.close_file = False
    self.skip_rows = skip_rows
    # 'delim' and 'fields' are pulled out of reader_args via the
    # project-level getVar helper; fields become a FieldList immediately.
    self.delim = self.getVar('delim', reader_args)
    self.fieldlist = self.getVar('fields', reader_args)
    self.fields = FieldList(self.fieldlist)
class DeriveNode(dp.DataProcess):
    """Stream node that appends one derived field to every record.

    The new field's value is produced either by a Python expression
    string (compiled once, evaluated per record with record fields as
    local names) or by an arbitrary callable receiving record fields as
    keyword arguments.
    """

    # BUG FIX: a stale duplicate node_info dict (copied from a
    # select/filter node, describing "condition"/"discard" attributes)
    # previously preceded this one; it was dead code shadowed by this
    # assignment and has been removed.
    node_info = {
        "label": "Derive Node",
        "description": "Derive a new field using an expression.",
        "attributes": [
            {"name": "field_name",
             "description": "Derived field name",
             "default": "new_field"},
            {"name": "formula",
             "description": "Callable or a string with python expression that will evaluate to "
                            "new field value"},
            {"name": "analytical_type",
             "description": "Analytical type of the new field",
             "default": "unknown"},
            {"name": "storage_type",
             "description": "Storage type of the new field",
             "default": "unknown"},
        ],
    }

    @staticmethod
    def getTypeName():
        return "derive"

    def __init__(self, formula=None, field_name="new_field",
                 analytical_type="unknown", storage_type="unknown",
                 **kwargs):
        """Creates and initializes a derive node.

        :Parameters:
            * formula: callable or python expression string producing
              the derived value (``None`` yields ``None`` per record)
            * field_name: name of the new field, default ``new_field``
            * analytical_type: analytical type of the new field
            * storage_type: storage type of the new field
        """
        # BUG FIX: docstring previously said "selection node".
        super(DeriveNode, self).__init__(**kwargs)
        self.formula = formula
        self.field_name = field_name
        self.analytical_type = analytical_type
        self.storage_type = storage_type
        self._output_fields = None

    def _initMembersDict(self):
        super(DeriveNode, self)._initMembersDict()
        self._members["formula"] = [str, ""]
        self._members["field_name"] = [str, ""]

    def updateMembers(self):
        super(DeriveNode, self).updateMembers()
        self.setMember("formula", self.formula)
        self.setMember("field_name", self.field_name)

    @property
    def output_fields(self):
        # Populated by initialize(); None until the node is initialized.
        return self._output_fields

    def initialize(self):
        """Compile a string formula and build the output field list."""
        if isinstance(self.formula, basestring):
            self._expression = compile(self.formula, "DeriveNode expression", "eval")
            self._formula_callable = self._eval_expression
        else:
            self._formula_callable = self.formula

        # Output fields: all input fields plus the derived one.
        self._output_fields = FieldList()
        for field in self.input.fields:
            self._output_fields.append(field)
        new_field = Field(self.field_name,
                          analytical_type=self.analytical_type,
                          storage_type=self.storage_type)
        self._output_fields.append(new_field)

    def _eval_expression(self, **record):
        # SECURITY NOTE: eval() of the configured formula string —
        # formulas must come from trusted configuration only.
        return eval(self._expression, None, record)

    def run(self):
        """Copy each input record, setting the derived field on it."""
        for record in self.input.records():
            if self._formula_callable:
                record[self.field_name] = self._formula_callable(**record)
            else:
                # No formula configured: the derived field is null.
                record[self.field_name] = None
            self.put_record(record)
def initialize(self):
    """Initialize CSV source stream:

    #. perform autodetection if required:
        #. detect encoding from a sample data (if requested)
        #. detect whether CSV has headers from a sample data (if requested)
    #. create CSV reader object
    #. read CSV headers if requested and initialize stream fields

    If fields are explicitly set prior to initialization, and header
    reading is requested, then the header row is just skipped and fields
    that were set before are used. Do not set fields if you want to read
    the header.

    All fields are set to `storage_type` = ``string`` and
    `analytical_type` = ``unknown``.
    """
    self.file, self.close_file = open_resource(self.resource)
    # NOTE(review): 'handle' is never used below — looks like a leftover
    # from the commented-out csv.reader implementation.
    handle = None
    if self.detect_header:
        sample = self.file.read(self.sample_size)
        # Encoding test
        sample = sample.encode('utf-8')
        sniffer = csv.Sniffer()
        self.read_header = sniffer.has_header(sample)
        # Rewind so the reader below starts from the first row again.
        self.file.seek(0)
    if self.dialect:
        # A string names a registered csv dialect; anything else is
        # assumed to be a dialect object and is used as-is.
        if type(self.dialect) == str:
            dialect = csv.get_dialect(self.dialect)
        else:
            dialect = self.dialect
        self.reader_args["dialect"] = dialect
    # self.reader = csv.reader(handle, **self.reader_args)
    self.reader = UnicodeReader(self.file, encoding=self.encoding,
                                empty_as_null=self.empty_as_null,
                                delim=self.delim)
    # Skip leading rows before any header handling.
    if self.skip_rows:
        for i in range(0, self.skip_rows):
            self.reader.next()
    # Initialize field list
    if self.read_header:
        field_names = self.reader.next()
        # Fields set explicitly take priority over what is read from the
        # header. (Issue #17 might be somehow related)
        if not self.fields:
            fields = [ (name, "string", "default") for name in field_names]
            self.fields = FieldList(fields)
    if not self.fields:
        raise RuntimeError("Fields are not initialized. "
                           "Either read fields from CSV header or "
                           "set them manually")
    else:
        # NOTE(review): re-wraps self.fields in FieldList even when it is
        # already a FieldList — presumably FieldList accepts one; verify.
        self.fields = FieldList(self.fields)
    self.reader.set_fields(self.fields)
class CSVDataSource(DataSource):
    """Data source that streams rows from a CSV file, URL or file handle.

    Supports optional header detection (via csv.Sniffer on a small
    sample), explicit or header-derived field lists, row skipping and
    empty-string-as-null handling.
    """

    @staticmethod
    def getTypeName():
        return "csv"

    def __init__(self, read_header=False, dialect=None, encoding=None,
                 detect_header=False, sample_size=200, skip_rows=None,
                 empty_as_null=True, **reader_args):
        """Creates a CSV data source stream.

        :Attributes:
            * resource: file name, URL or a file handle with CSV data
            * read_header: flag determining whether first line contains
              header or not. ``False`` by default.
            * encoding: source character encoding, by default no
              conversion is performed.
            * detect_header: try to determine whether data source has
              headers in first row or not
            * sample_size: maximum bytes to be read when detecting
              encoding and headers in file. By default it is set to 200
              bytes to prevent loading huge CSV files at once.
            * skip_rows: number of rows to be skipped. Default: ``None``
            * empty_as_null: treat empty strings as ``Null`` values

        Note: avoid auto-detection when you are reading from remote URL
        stream.
        """
        # BUG FIX: the docstring previously followed the super() call,
        # which made it a discarded string statement, not __doc__.
        super(CSVDataSource, self).__init__(**reader_args)

        self.read_header = read_header
        self.encoding = encoding
        self.detect_header = detect_header
        self.empty_as_null = empty_as_null
        self.sample_size = sample_size
        self.reader_args = reader_args
        self.reader = None
        self.dialect = dialect
        self.close_file = False
        self.skip_rows = skip_rows
        # 'delim' and 'fields' come out of reader_args via the
        # project-level getVar helper.
        self.delim = self.getVar('delim', reader_args)
        self.fieldlist = self.getVar('fields', reader_args)
        self.fields = FieldList(self.fieldlist)

    def _initMembersDict(self):
        super(CSVDataSource, self)._initMembersDict()
        # Default field schema: id plus position/orientation columns.
        self._members['fields'] = [str, [
            ['id', 'integer'],
            ['pos_x', 'float'],
            ['pos_y', 'float'],
            ['pos_z', 'float'],
            ['euler_x', 'float'],
            ['euler_y', 'float'],
            ['euler_z', 'float']]
        ]
        self._members['delim'] = [str, ","]

    def updateMembers(self):
        super(CSVDataSource, self).updateMembers()
        self.setMember('fields', self.fieldlist)
        self.setMember('delim', self.delim)

    def initialize(self):
        """Initialize CSV source stream:

        #. perform autodetection if required:
            #. detect encoding from a sample data (if requested)
            #. detect whether CSV has headers from a sample data
               (if requested)
        #. create CSV reader object
        #. read CSV headers if requested and initialize stream fields

        If fields are explicitly set prior to initialization, and header
        reading is requested, then the header row is just skipped and
        fields that were set before are used. Do not set fields if you
        want to read the header.

        All fields are set to `storage_type` = ``string`` and
        `analytical_type` = ``unknown``.
        """
        self.file, self.close_file = open_resource(self.resource)
        # CLEANUP: removed unused 'handle = None' and the dead
        # commented-out csv.reader(...) line.

        if self.detect_header:
            sample = self.file.read(self.sample_size)
            # Encoding test
            sample = sample.encode('utf-8')
            sniffer = csv.Sniffer()
            self.read_header = sniffer.has_header(sample)
            # Rewind so the reader starts at the first row again.
            self.file.seek(0)

        if self.dialect:
            # A string names a registered csv dialect; anything else is
            # assumed to be a dialect object and is used as-is.
            if isinstance(self.dialect, str):
                dialect = csv.get_dialect(self.dialect)
            else:
                dialect = self.dialect
            self.reader_args["dialect"] = dialect

        self.reader = UnicodeReader(self.file, encoding=self.encoding,
                                    empty_as_null=self.empty_as_null,
                                    delim=self.delim)

        # Skip leading rows before any header handling.
        if self.skip_rows:
            for _ in range(self.skip_rows):
                self.reader.next()

        # Initialize field list
        if self.read_header:
            field_names = self.reader.next()
            # Fields set explicitly take priority over what is read from
            # the header. (Issue #17 might be somehow related)
            if not self.fields:
                fields = [(name, "string", "default")
                          for name in field_names]
                self.fields = FieldList(fields)

        # Guard clause replaces the previous if/raise/else pyramid.
        if not self.fields:
            raise RuntimeError("Fields are not initialized. "
                               "Either read fields from CSV header or "
                               "set them manually")
        self.fields = FieldList(self.fields)
        self.reader.set_fields(self.fields)

    def finalize(self):
        """Close the underlying file, but only if we opened it ourselves."""
        if self.file and self.close_file:
            self.file.close()

    def rows(self):
        """Return the row iterator; requires initialize() to have run."""
        if not self.reader:
            raise RuntimeError("Stream is not initialized")
        if not self.fields:
            raise RuntimeError("Fields are not initialized")
        return self.reader

    def records(self):
        """Yield each row as a dict keyed by field name."""
        fields = self.fields.names()
        for row in self.reader:
            yield dict(zip(fields, row))