def typed_prop_to_binary(prop_val, prop_type):
    """Pack a single CSV property value into its binary wire representation.

    The result always begins with an unsigned char carrying the Type enum
    value, followed by the encoded payload. A whitespace-only value is
    emitted as a NULL property regardless of prop_type.

    Raises:
        SchemaError: if the value cannot be represented as prop_type.
    """
    # All format strings start with an unsigned char to represent our prop_type enum.
    format_str = "=B"

    # Remove leading and trailing whitespace.
    prop_val = prop_val.strip()

    if prop_val == "":
        # An empty string indicates a NULL property.
        # TODO This is not allowed in Cypher, consider how to handle it here rather than in-module.
        return struct.pack(format_str, 0)

    # TODO allow ID type specification
    if prop_type == Type.LONG:
        try:
            return struct.pack(format_str + "q", Type.LONG.value, int(prop_val))
        except (ValueError, struct.error):
            # int() failed, or the value is outside the signed 64-bit range.
            raise SchemaError("Could not parse '%s' as a long" % prop_val)
    elif prop_type == Type.DOUBLE:
        try:
            numeric_prop = float(prop_val)
        except ValueError:
            raise SchemaError("Could not parse '%s' as a double" % prop_val)
        # Don't accept non-finite values; nan/inf falls through to the
        # generic error at the end of the function (preserving original behavior).
        if not math.isnan(numeric_prop) and not math.isinf(numeric_prop):
            return struct.pack(format_str + "d", Type.DOUBLE.value, numeric_prop)
    elif prop_type == Type.BOOL:
        # If field is 'false' or 'true', it is a boolean.
        lowered = prop_val.lower()
        if lowered == 'false':
            return struct.pack(format_str + '?', Type.BOOL.value, False)
        elif lowered == 'true':
            return struct.pack(format_str + '?', Type.BOOL.value, True)
        raise SchemaError("Could not parse '%s' as a boolean" % prop_val)
    elif prop_type == Type.ID or prop_type == Type.STRING:
        # struct.pack requires bytes objects as arguments.
        encoded_str = str.encode(prop_val)
        # Encoding len+1 adds a null terminator to the string.
        format_str += "%ds" % (len(encoded_str) + 1)
        # NOTE: IDs are serialized on the wire as strings.
        return struct.pack(format_str, Type.STRING.value, encoded_str)
    elif prop_type == Type.ARRAY:
        # prop_val is guaranteed non-empty here (empty handled above).
        if prop_val[0] != '[' or prop_val[-1] != ']':
            raise SchemaError("Could not parse '%s' as an array" % prop_val)
        return array_prop_to_binary(format_str, prop_val)

    # If it hasn't returned by this point, it is trying to set it to a type that it can't adopt.
    raise SchemaError("unable to parse [" + prop_val + "] with type [" + repr(prop_type) + "]")
def convert_header_with_schema(self, header):
    """Parse a typed header row, populating self.types and self.column_names.

    Each field is expected to look like "name:TYPE"; ID-like and IGNORE
    columns may omit the name.

    Raises:
        CSVError: if a field does not contain exactly one colon.
        SchemaError: if a named property is required but no name was given.
    """
    self.types = [None] * self.column_count  # Value type of every column.
    for idx, field in enumerate(header):
        pair = field.split(':')

        # Exactly one colon is expected in every typed column.
        # BUGFIX: previously only len(pair) > 2 was checked (a colon-less
        # field crashed with IndexError below), and the message reported
        # len(field) — the string length — instead of the colon count.
        # TODO might need to check for backtick escapes
        if len(pair) != 2:
            raise CSVError("%s: Field '%s' had %d colons" %
                           (self.infile.name, field, len(pair) - 1))

        # Convert the column type.
        col_type = convert_schema_type(pair[1].upper().strip())

        # If the column did not have a name but the type requires one, emit an error.
        if len(pair[0]) == 0 and col_type not in (Type.ID, Type.START_ID, Type.END_ID, Type.IGNORE):
            raise SchemaError("%s: Each property in the header should be a colon-separated pair" %
                              (self.infile.name))

        # Only store the name if the column's values should be added as properties.
        if len(pair[0]) > 0 and col_type not in (Type.START_ID, Type.END_ID, Type.IGNORE):
            self.column_names[idx] = pair[0].strip()

        # Store the column type.
        self.types[idx] = col_type
def post_process_header_with_schema(self, header):
    """Record the offsets of the START_ID and END_ID columns and, when the
    header declares them in parentheses, their ID namespaces."""
    # A relation file must contain exactly one of each endpoint column.
    if self.types.count(Type.START_ID) != 1:
        raise SchemaError("Relation file '%s' should have exactly one START_ID column." % (self.infile.name))
    if self.types.count(Type.END_ID) != 1:
        raise SchemaError("Relation file '%s' should have exactly one END_ID column." % (self.infile.name))

    self.start_id = self.types.index(Type.START_ID)
    self.end_id = self.types.index(Type.END_ID)

    # A namespace looks like "START_ID(Namespace)"; capture the inner token.
    namespace_pattern = re.compile(r"\((\w+)\)")
    found = namespace_pattern.search(header[self.start_id])
    if found is not None:
        self.start_namespace = found.group(1)
    found = namespace_pattern.search(header[self.end_id])
    if found is not None:
        self.end_namespace = found.group(1)
def process_entities(self):
    """Stream every relation row from the input file into the query buffer.

    For each row: resolve the source and destination node identifiers
    (prefixed with their namespaces when present), pack the edge plus its
    properties into binary form, and flush the buffer whenever adding the
    row would exceed the configured token or buffer size limits.
    """
    entities_created = 0
    with click.progressbar(self.reader, length=self.entities_count, label=self.entity_str) as reader:
        for row in reader:
            self.validate_row(row)
            try:
                # Namespaced identifiers are stored as "namespace.id".
                start_id = row[self.start_id]
                if self.start_namespace:
                    start_id = self.start_namespace + '.' + str(start_id)
                end_id = row[self.end_id]
                if self.end_namespace:
                    end_id = self.end_namespace + '.' + str(end_id)
                # Look up the internal node IDs registered by the node loader.
                src = self.query_buffer.nodes[start_id]
                dest = self.query_buffer.nodes[end_id]
            except KeyError as e:
                # One endpoint was never registered as a node.
                # NOTE(review): line_num - 1 presumably accounts for the header row — confirm.
                print(
                    "%s:%d Relationship specified a non-existent identifier. src: %s; dest: %s"
                    % (self.infile.name, self.reader.line_num - 1, row[self.start_id], row[self.end_id]))
                if self.config.skip_invalid_edges is False:
                    raise e
                # skip_invalid_edges: warn (above) and move on to the next row.
                continue
            fmt = "=QQ"  # 8-byte unsigned ints for src and dest
            try:
                row_binary = struct.pack(fmt, src, dest) + self.pack_props(row)
            except SchemaError as e:
                # Re-raise with file/line context prepended.
                raise SchemaError("%s:%d %s" % (self.infile.name, self.reader.line_num, str(e)))
            row_binary_len = len(row_binary)
            # If the addition of this entity will make the binary token grow too large,
            # send the buffer now.
            added_size = self.binary_size + row_binary_len
            if added_size >= self.config.max_token_size or self.query_buffer.buffer_size + added_size >= self.config.max_buffer_size:
                self.query_buffer.reltypes.append(self.to_binary())
                self.query_buffer.send_buffer()
                self.reset_partial_binary()
                # Push the reltype onto the query buffer again, as there are more entities to process.
                self.query_buffer.reltypes.append(self.to_binary())
            self.query_buffer.relation_count += 1
            entities_created += 1
            self.binary_size += row_binary_len
            self.binary_entities.append(row_binary)
    # Flush whatever accumulated after the last forced send.
    self.query_buffer.reltypes.append(self.to_binary())
    self.infile.close()
    print("%d relations created for type '%s'" % (entities_created, self.entity_str))
def convert_schema_type(in_type):
    """Map a header type token (e.g. "LONG", "ID(User)") to a Type member.

    Raises SchemaError when the token names no known type.
    """
    try:
        return Type[in_type]
    except KeyError:
        # Handling for ID namespaces, e.g. "ID(User)" or "START_ID(Person)".
        # TODO think of better alternatives
        prefix_table = (
            ('ID(', Type.ID),
            ('START_ID(', Type.START_ID),
            ('END_ID(', Type.END_ID),
        )
        for prefix, schema_type in prefix_table:
            if in_type.startswith(prefix):
                return schema_type
        raise SchemaError("Encountered invalid field type '%s'" % in_type)
def post_process_header_with_schema(self, header):
    """Track the offset of the node's ID column and its optional namespace."""
    # No ID field is required if we're only inserting nodes.
    if self.config.store_node_identifiers is False:
        return

    # Verify that exactly one field is labeled ID.
    if self.types.count(Type.ID) != 1:
        raise SchemaError("Node file '%s' should have exactly one ID column." % (self.infile.name))

    # Track the offset containing the node ID.
    self.id = self.types.index(Type.ID)

    # A header like "val:ID(NAMESPACE)" declares a namespace in parentheses.
    namespace = re.search(r"\((\w+)\)", header[self.id])
    if namespace is not None:
        self.id_namespace = namespace.group(1)
def process_entities(self):
    """Stream every node row from the input file into the query buffer.

    For each row: optionally register the node's identifier (with its
    namespace) in the shared node dictionary, pack the row's properties
    into binary form, and flush the buffer whenever adding the row would
    exceed the configured token or buffer size limits.
    """
    entities_created = 0
    with click.progressbar(self.reader, length=self.entities_count, label=self.entity_str) as reader:
        for row in reader:
            self.validate_row(row)
            # Update the node identifier dictionary if necessary
            if self.config.store_node_identifiers:
                id_field = row[self.id]
                if self.id_namespace is not None:
                    # Namespaced identifiers are stored as "namespace.id".
                    id_field = self.id_namespace + '.' + str(id_field)
                self.update_node_dictionary(id_field)
            try:
                row_binary = self.pack_props(row)
            except SchemaError as e:
                # Re-raise with file/line context prepended.
                # TODO why is line_num off by one?
                raise SchemaError("%s:%d %s" % (self.infile.name, self.reader.line_num - 1, str(e)))
            row_binary_len = len(row_binary)
            # If the addition of this entity will make the binary token grow too large,
            # send the buffer now.
            # TODO how much of this can be made uniform w/ relations and moved to Querybuffer?
            added_size = self.binary_size + row_binary_len
            if added_size >= self.config.max_token_size or self.query_buffer.buffer_size + added_size >= self.config.max_buffer_size:
                self.query_buffer.labels.append(self.to_binary())
                self.query_buffer.send_buffer()
                self.reset_partial_binary()
                # Push the label onto the query buffer again, as there are more entities to process.
                self.query_buffer.labels.append(self.to_binary())
            self.query_buffer.node_count += 1
            entities_created += 1
            self.binary_size += row_binary_len
            self.binary_entities.append(row_binary)
    # Flush whatever accumulated after the last forced send.
    self.query_buffer.labels.append(self.to_binary())
    self.infile.close()
    print("%d nodes created with label '%s'" % (entities_created, self.entity_str))
def check_schema(cls, schema):
    """Validate *schema* against this validator class's meta-schema.

    Raises SchemaError for the first meta-schema violation encountered;
    returns None when the schema is valid.
    """
    meta_validator = cls(cls.META_SCHEMA)
    for validation_error in meta_validator.iter_errors(schema):
        raise SchemaError.create_from(validation_error)