def validate_csv(
    rows: Sequence[MutableMapping], key_field: str, overwrite: bool
) -> None:
    """Perform any necessary CSV validation, raising ValidationFailed on any errors."""
    if not rows:
        raise ValidationFailed([MissingBody()])

    if is_edge_table(rows):
        validate_edge_table(rows)
    elif is_node_table(rows, key_field):
        validate_node_table(rows, key_field, overwrite)
    else:
        raise ValidationFailed([UnsupportedTable()])
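
# Illustrative sketch (not from the source): driving validate_csv with rows parsed
# from raw CSV text. The sample data is made up; validate_csv and ValidationFailed
# are the names used above.
import csv
from io import StringIO

sample_body = "_key,name\nn1,alice\nn2,bob\n"
sample_rows = list(csv.DictReader(StringIO(sample_body)))
try:
    # With no _from/_to columns, this presumably dispatches to validate_node_table.
    validate_csv(sample_rows, key_field="_key", overwrite=False)
except ValidationFailed as err:
    print(err)
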
def validate_edge_table(self, edge_table: str) -> EdgeTableProperties:
    """
    Validate that an edge table is suitable for use in a graph.

    If validation is successful, the edge table properties are returned.
    Otherwise, a ValidationFailed error is raised.
    """
    loaded_edge_table = self.table(edge_table)
    edge_table_properties = loaded_edge_table.edge_properties()
    referenced_tables = edge_table_properties["table_keys"]

    errors: List[ValidationFailure] = []
    for table, keys in referenced_tables.items():
        if not self.has_table(table):
            errors.append(UndefinedTable(table=table))
        else:
            table_keys = set(self.table(table).keys())
            undefined = keys - table_keys

            if undefined:
                errors.append(UndefinedKeys(table=table, keys=list(undefined)))

    if errors:
        raise ValidationFailed(errors)

    return edge_table_properties
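
# Illustrative sketch (an assumption, inferred from the set arithmetic above):
# edge_properties() appears to expose a "table_keys" entry mapping each referenced
# table name to the set of document keys the edge table points at, e.g.:
example_edge_table_properties = {
    "table_keys": {
        "members": {"alice", "bob"},
        "clubs": {"chess"},
    },
}
# Validation then checks that each referenced table exists in the workspace and
# that every referenced key is present in that table.
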
def validate_newick(tree: List[newick.Node]) -> None:
    """Validate newick tree."""
    data_errors: List[ValidationFailure] = []
    unique_keys: Set[str] = set()
    unique_edges: Set[Tuple[str, str, float]] = set()

    def read_tree(parent: Optional[str], node: newick.Node) -> None:
        key = node.name or uuid.uuid4().hex

        if key in unique_keys:
            data_errors.append(DuplicateKey(key=key))
        else:
            unique_keys.add(key)

        for desc in node.descendants:
            read_tree(key, desc)

        if parent:
            unique = (parent, key, node.length)

            if unique in unique_edges:
                data_errors.append(
                    DuplicateEdge(
                        _from=f"table/{parent}",
                        _to=f"table/{key}",
                        length=node.length,
                    )
                )
            else:
                unique_edges.add(unique)

    read_tree(None, tree[0])

    if len(data_errors) > 0:
        raise ValidationFailed(data_errors)
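
# Illustrative sketch: validate_newick expects the node list produced by the
# python-newick package. The tree string below is made up for demonstration.
import newick

parsed = newick.loads("((A:1.0,B:1.0)C:2.0,D:3.0)root;")
validate_newick(parsed)  # raises ValidationFailed on duplicate keys or edges
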
def upload(workspace: str, graph: str) -> Any:
    """
    Store a d3 json-encoded graph into the database, with node and edge tables.

    `workspace` - the target workspace
    `graph` - the target graph
    `data` - the json data, passed in the request body. The json data should
             contain nodes: [] and links: []
    """
    loaded_workspace = Workspace(workspace)
    if loaded_workspace.has_graph(graph):
        raise AlreadyExists("graph", graph)

    # Get data from the request and load it as json
    body = decode_data(request.data)
    data = json.load(StringIO(body), object_pairs_hook=OrderedDict)

    # Check file structure
    errors = validate_d3_json(data)
    if len(errors) > 0:
        raise ValidationFailed(errors)

    node_table_name = f"{graph}_nodes"
    edge_table_name = f"{graph}_links"

    # Change column names from the d3 format to the arango format
    nodes = data["nodes"]
    for node in nodes:
        node["_key"] = str(node["id"])
        del node["id"]

    links = data["links"]
    for link in links:
        link["_from"] = f"{node_table_name}/{link['source']}"
        link["_to"] = f"{node_table_name}/{link['target']}"
        del link["source"]
        del link["target"]

    # Create or retrieve the node and edge tables
    if loaded_workspace.has_table(node_table_name):
        node_table = loaded_workspace.table(node_table_name)
    else:
        node_table = loaded_workspace.create_table(node_table_name, edge=False)

    if loaded_workspace.has_table(edge_table_name):
        edge_table = loaded_workspace.table(edge_table_name)
    else:
        edge_table = loaded_workspace.create_table(edge_table_name, edge=True)

    # Insert data
    node_table.insert(nodes)
    edge_table.insert(links)

    loaded_workspace.create_graph(graph, edge_table_name)

    return {"nodecount": len(nodes), "edgecount": len(links)}
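
# Illustrative sketch of the payload this endpoint expects (made-up values):
# a d3-style document with "nodes" keyed by "id" and "links" keyed by
# "source"/"target". The loops above rewrite these into ArangoDB's
# _key/_from/_to convention before insertion.
example_d3_payload = {
    "nodes": [{"id": 0, "name": "alice"}, {"id": 1, "name": "bob"}],
    "links": [{"source": 0, "target": 1, "weight": 2}],
}
# After renaming, the first link would read:
#   {"_from": "<graph>_nodes/0", "_to": "<graph>_nodes/1", "weight": 2}
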
def validate_node_table(
    rows: Sequence[MutableMapping], key_field: str, overwrite: bool
) -> None:
    """Validate that the given table is a valid node table."""
    fieldnames = rows[0].keys()
    data_errors: List[ValidationFailure] = []

    if key_field != "_key" and key_field not in fieldnames:
        data_errors.append(KeyFieldDoesNotExist(key=key_field))
        raise ValidationFailed(data_errors)

    if "_key" in fieldnames and key_field != "_key" and not overwrite:
        data_errors.append(KeyFieldAlreadyExists(key=key_field))
        raise ValidationFailed(data_errors)

    keys = (row[key_field] for row in rows)
    unique_keys: Set[str] = set()
    for key in keys:
        if key in unique_keys:
            data_errors.append(DuplicateKey(key=key))
        else:
            unique_keys.add(key)

    if len(data_errors) > 0:
        raise ValidationFailed(data_errors)
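
# Illustrative sketch (made-up rows): a node table keyed on a custom column.
# Because "name" is used as key_field and appears twice, the call below would
# raise ValidationFailed containing a DuplicateKey failure.
duplicate_rows = [
    {"name": "alice", "age": "34"},
    {"name": "alice", "age": "35"},
]
validate_node_table(duplicate_rows, key_field="name", overwrite=False)
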
def create_aql_table(self, table: str, aql_query: str) -> Table:
    """Create a table in this workspace from an aql query."""
    if self.has_table(table):
        raise AlreadyExists("table", table)

    # In the future, the result of this validation can be
    # used to determine dependencies in virtual tables
    rows = list(self.run_query(aql_query))
    errors = validate_csv(rows, "_key", False)
    if errors:
        raise ValidationFailed(errors=errors)

    loaded_table = self.create_table(table, False)
    loaded_table.insert(rows)

    return loaded_table
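
# Illustrative sketch (names are assumptions): materializing a table from an AQL
# query over an existing table in the same workspace. "members" and the filter
# are made up; only Workspace and create_aql_table come from the code above.
example_workspace = Workspace("example_workspace")
adults = example_workspace.create_aql_table(
    "adults",
    "FOR doc IN members FILTER doc.age >= 18 RETURN doc",
)
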
def validate_edge_table(rows: Sequence[MutableMapping]) -> None:
    """Validate that the given table is a valid edge table."""
    data_errors: List[ValidationFailure] = []

    # Checks that a cell has the form table_name/key
    valid_cell = re.compile("[^/]+/[^/]+")

    for i, row in enumerate(rows):
        fields: List[str] = []
        if not valid_cell.match(row["_from"]):
            fields.append("_from")
        if not valid_cell.match(row["_to"]):
            fields.append("_to")

        if fields:
            # i+2 -> +1 for index offset, +1 due to header row
            data_errors.append(InvalidRow(columns=fields, row=i + 2))

    if len(data_errors) > 0:
        raise ValidationFailed(data_errors)
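
# Illustrative sketch (made-up rows): each _from/_to cell must look like
# "table_name/key" to satisfy the regex above. The second row's bare "bob"
# would be reported as InvalidRow(columns=["_to"], row=3).
edge_rows = [
    {"_from": "members/alice", "_to": "members/bob", "weight": "1"},
    {"_from": "members/alice", "_to": "bob", "weight": "2"},
]
validate_edge_table(edge_rows)  # raises ValidationFailed
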
def upload(
    workspace: str,
    table: str,
    key: str = "_key",
    overwrite: bool = False,
    metadata: Optional[str] = None,
) -> Any:
    """
    Store a CSV file into the database as a node or edge table.

    `workspace` - the target workspace
    `table` - the target table
    `data` - the CSV data, passed in the request body. If the CSV data contains
             `_from` and `_to` fields, it will be treated as an edge table.
    """
    loaded_workspace = Workspace(workspace)
    if loaded_workspace.has_table(table):
        raise AlreadyExists("table", table)

    app.logger.info("Bulk Loading")

    # Read the request body into CSV format
    body = decode_data(request.data)

    try:
        # Type to a Dict rather than an OrderedDict
        csv_rows: List[UnprocessedTableRow] = list(csv.DictReader(StringIO(body)))
    except csv.Error:
        raise CSVReadError()

    # TODO: This temporarily needs to be done here, so that validation of the metadata
    # can be done before the table is actually created. Once the API is updated, this
    # will change.
    # https://github.com/multinet-app/multinet-server/issues/493
    metadata_dict = {}
    if metadata:
        try:
            metadata_dict = json.loads(metadata)
        except json.decoder.JSONDecodeError:
            raise BadQueryArgument("metadata", metadata)

    table_metadata = table_metadata_from_dict(metadata_dict)
    rows, metadata_validation_errors = process_rows(csv_rows, table_metadata.columns)

    # Perform validation.
    csv_validation_errors = validate_csv(rows, key, overwrite)
    validation_errors = [*metadata_validation_errors, *csv_validation_errors]
    if len(validation_errors):
        raise ValidationFailed(errors=validation_errors)

    # Once we reach here, we know that the specified key field must be present,
    # and either:
    #   key == "_key"  # noqa: E800
    #   or key != "_key" and the "_key" field is not present
    #   or key != "_key" and "_key" is present, but overwrite = True
    if key != "_key":
        rows = set_table_key(rows, key)

    # Create table and insert the data
    loaded_table = loaded_workspace.create_table(table, edge=is_edge_table(rows))

    # Set table metadata
    loaded_table.set_metadata(metadata_dict)

    results = loaded_table.insert(rows)

    return {"count": len(results)}
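
# Illustrative sketch (not from the source): a request body this endpoint would
# treat as an edge table, since it contains _from and _to columns. The route path
# and the column-metadata format are defined elsewhere and are not assumed here.
example_csv_body = (
    "_from,_to,weight\n"
    "members/alice,members/bob,3\n"
    "members/bob,members/carol,5\n"
)
# The body is parsed with csv.DictReader, validated via validate_csv, and then
# inserted into a newly created edge table named after the `table` argument.
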