Example 1
def validate_csv(rows: Sequence[MutableMapping], key_field: str,
                 overwrite: bool) -> None:
    """Perform any necessary CSV validation, and return appropriate errors."""
    if not rows:
        raise ValidationFailed([MissingBody()])

    if is_edge_table(rows):
        validate_edge_table(rows)
    elif is_node_table(rows, key_field):
        validate_node_table(rows, key_field, overwrite)
    else:
        raise ValidationFailed([UnsupportedTable()])
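validate_csv dispatches through two helpers defined elsewhere in the codebase. A minimal sketch of what they might look like, inferred from the upload docstring in Example 8 ("if the CSV data contains `_from` and `_to` fields, it will be treated as an edge table"); the bodies here are assumptions, not the project's actual implementation.

from typing import MutableMapping, Sequence


def is_edge_table(rows: Sequence[MutableMapping]) -> bool:
    # Assumption: an edge table is recognized by its link columns.
    fieldnames = rows[0].keys()
    return "_from" in fieldnames and "_to" in fieldnames


def is_node_table(rows: Sequence[MutableMapping], key_field: str) -> bool:
    # Assumption: anything that is not an edge table is a candidate node
    # table; key_field is kept to match the call site, and the detailed
    # key checks happen in validate_node_table (Example 5).
    return not is_edge_table(rows)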
Example 2
    def validate_edge_table(self, edge_table: str) -> EdgeTableProperties:
        """
        Validate that an edge table is suitable for use in a graph.

        If validation is successful, the edge table properties are returned.
        Otherwise, a ValidationFailed error is raised.
        """
        loaded_edge_table = self.table(edge_table)
        edge_table_properties = loaded_edge_table.edge_properties()

        referenced_tables = edge_table_properties["table_keys"]

        errors: List[ValidationFailure] = []
        for table, keys in referenced_tables.items():
            if not self.has_table(table):
                errors.append(UndefinedTable(table=table))
            else:
                table_keys = set(self.table(table).keys())
                undefined = keys - table_keys

                if undefined:
                    errors.append(
                        UndefinedKeys(table=table, keys=list(undefined)))

        if errors:
            raise ValidationFailed(errors)

        return edge_table_properties
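The core of the reference check is plain set arithmetic; a standalone illustration of the `keys - table_keys` difference used above:

# Keys the edge table references in another table, versus the keys that
# table actually defines; the difference is what gets reported.
referenced_keys = {"alice", "bob", "carol"}
defined_keys = {"alice", "bob"}

undefined = referenced_keys - defined_keys
print(sorted(undefined))  # ['carol']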
Example 3
def validate_newick(tree: List[newick.Node]) -> None:
    """Validate newick tree."""
    data_errors: List[ValidationFailure] = []
    unique_keys: Set[str] = set()
    unique_edges: Set[Tuple[str, str, float]] = set()

    def read_tree(parent: Optional[str], node: newick.Node) -> None:
        key = node.name or uuid.uuid4().hex

        if key in unique_keys:
            data_errors.append(DuplicateKey(key=key))
        else:
            unique_keys.add(key)

        for desc in node.descendants:
            read_tree(key, desc)

        if parent:
            unique = (parent, key, node.length)
            if unique in unique_edges:
                data_errors.append(
                    DuplicateEdge(
                        _from=f"table/{parent}", _to=f"table/{key}", length=node.length
                    )
                )
            else:
                unique_edges.add(unique)

    # Only the first tree in the list is validated.
    read_tree(None, tree[0])

    if len(data_errors) > 0:
        raise ValidationFailed(data_errors)
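A hedged usage sketch: `newick.loads` (from the python-newick package) produces the `List[newick.Node]` this validator expects; the import path for `ValidationFailed` is a project detail assumed here.

import newick

# Two leaves share the name "B", so validate_newick should raise
# ValidationFailed carrying a DuplicateKey failure.
trees = newick.loads("(A:1,(B:1,B:2)C:1)root;")

try:
    validate_newick(trees)
except ValidationFailed:
    print("tree rejected")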
Example 4
def upload(workspace: str, graph: str) -> Any:
    """Store a d3 json-encoded graph into the database, with node and edge tables.

    `workspace` - the target workspace
    `graph` - the target graph
    `data` - the json data, passed in the request body. The json data should contain
    nodes: [] and links: []
    """
    loaded_workspace = Workspace(workspace)
    if loaded_workspace.has_graph(graph):
        raise AlreadyExists("graph", graph)

    # Get data from the request and load it as json
    body = decode_data(request.data)
    data = json.load(StringIO(body), object_pairs_hook=OrderedDict)

    # Check file structure
    errors = validate_d3_json(data)
    if len(errors) > 0:
        raise ValidationFailed(errors)

    node_table_name = f"{graph}_nodes"
    edge_table_name = f"{graph}_links"

    # Change column names from the d3 format to the arango format
    nodes = data["nodes"]
    for node in nodes:
        node["_key"] = str(node["id"])
        del node["id"]

    links = data["links"]
    for link in links:
        link["_from"] = f"{node_table_name}/{link['source']}"
        link["_to"] = f"{node_table_name}/{link['target']}"
        del link["source"]
        del link["target"]

    # Create or retrieve the node and edge tables
    if loaded_workspace.has_table(node_table_name):
        node_table = loaded_workspace.table(node_table_name)
    else:
        node_table = loaded_workspace.create_table(node_table_name, edge=False)

    if loaded_workspace.has_table(edge_table_name):
        edge_table = loaded_workspace.table(edge_table_name)
    else:
        edge_table = loaded_workspace.create_table(edge_table_name, edge=True)

    # Insert data
    node_table.insert(nodes)
    edge_table.insert(links)

    loaded_workspace.create_graph(graph, edge_table_name)

    return {"nodecount": len(nodes), "edgecount": len(links)}
Example 5
def validate_node_table(rows: Sequence[MutableMapping], key_field: str,
                        overwrite: bool) -> None:
    """Validate that the given table is a valid node table."""
    fieldnames = rows[0].keys()
    data_errors: List[ValidationFailure] = []

    if key_field != "_key" and key_field not in fieldnames:
        data_errors.append(KeyFieldDoesNotExist(key=key_field))
        raise ValidationFailed(data_errors)

    if "_key" in fieldnames and key_field != "_key" and not overwrite:
        data_errors.append(KeyFieldAlreadyExists(key=key_field))
        raise ValidationFailed(data_errors)

    keys = (row[key_field] for row in rows)
    unique_keys: Set[str] = set()
    for key in keys:
        if key in unique_keys:
            data_errors.append(DuplicateKey(key=key))
        else:
            unique_keys.add(key)

    if len(data_errors) > 0:
        raise ValidationFailed(data_errors)
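A hedged usage sketch of the preconditions, assuming `validate_node_table` and the project's `ValidationFailed` are importable:

# Passes: "name" exists and there is no "_key" column to collide with.
validate_node_table([{"name": "alice"}, {"name": "bob"}],
                    key_field="name", overwrite=False)

# Raises ValidationFailed carrying KeyFieldAlreadyExists: "_key" is
# already present, key_field is not "_key", and overwrite is False.
try:
    validate_node_table([{"_key": "1", "name": "alice"}],
                        key_field="name", overwrite=False)
except ValidationFailed:
    print("rejected")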
Example 6
    def create_aql_table(self, table: str, aql_query: str) -> Table:
        """Create a table in this workspace from an aql query."""
        if self.has_table(table):
            raise AlreadyExists("table", table)

        # In the future, the result of this validation can be
        # used to determine dependencies in virtual tables
        rows = list(self.run_query(aql_query))

        errors = validate_csv(rows, "_key", False)
        if errors:
            raise ValidationFailed(errors=errors)

        loaded_table = self.create_table(table, False)
        loaded_table.insert(rows)

        return loaded_table
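A hedged usage sketch; the workspace and table names are hypothetical, and the query is ordinary AQL:

ws = Workspace("my_workspace")  # hypothetical workspace name
members = ws.create_aql_table(
    "active_members",
    "FOR doc IN people FILTER doc.active RETURN doc",
)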
Example 7
def validate_edge_table(rows: Sequence[MutableMapping]) -> None:
    """Validate that the given table is a valid edge table."""
    data_errors: List[ValidationFailure] = []

    # Check that a cell has the exact form table_name/key; the pattern is
    # anchored so that values with extra slashes are rejected.
    valid_cell = re.compile(r"^[^/]+/[^/]+$")

    for i, row in enumerate(rows):
        fields: List[str] = []
        if not valid_cell.match(row["_from"]):
            fields.append("_from")
        if not valid_cell.match(row["_to"]):
            fields.append("_to")

        if fields:
            # i+2 -> +1 for index offset, +1 due to header row
            data_errors.append(InvalidRow(columns=fields, row=i + 2))

    if len(data_errors) > 0:
        raise ValidationFailed(data_errors)
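A standalone check of the anchored pattern above; without the anchors, `match` would accept values with extra path segments via a prefix match:

import re

valid_cell = re.compile(r"^[^/]+/[^/]+$")

print(bool(valid_cell.match("people/alice")))    # True
print(bool(valid_cell.match("alice")))           # False: no table name
print(bool(valid_cell.match("people/alice/x")))  # False: extra slash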
Example 8
def upload(
    workspace: str,
    table: str,
    key: str = "_key",
    overwrite: bool = False,
    metadata: Optional[str] = None,
) -> Any:
    """
    Store a CSV file into the database as a node or edge table.

    `workspace` - the target workspace
    `table` - the target table
    `data` - the CSV data, passed in the request body. If the CSV data contains
             `_from` and `_to` fields, it will be treated as an edge table.
    """
    loaded_workspace = Workspace(workspace)

    if loaded_workspace.has_table(table):
        raise AlreadyExists("table", table)

    app.logger.info("Bulk Loading")

    # Read the request body into CSV format
    body = decode_data(request.data)

    try:
        # Type to a Dict rather than an OrderedDict
        csv_rows: List[UnprocessedTableRow] = list(csv.DictReader(StringIO(body)))
    except csv.Error:
        raise CSVReadError()

    # TODO: This temporarily needs to be done here, so that validation of the metadata
    # can be done before the table is actually created. Once the API is updated, this
    # will change.
    # https://github.com/multinet-app/multinet-server/issues/493
    metadata_dict = {}
    if metadata:
        try:
            metadata_dict = json.loads(metadata)
        except json.decoder.JSONDecodeError:
            raise BadQueryArgument("metadata", metadata)

    table_metadata = table_metadata_from_dict(metadata_dict)
    rows, metadata_validation_errors = process_rows(csv_rows, table_metadata.columns)

    # Perform validation.
    csv_validation_errors = validate_csv(rows, key, overwrite)

    validation_errors = [*metadata_validation_errors, *csv_validation_errors]
    if validation_errors:
        raise ValidationFailed(errors=validation_errors)

    # Once we reach here, we know that the specified key field must be present,
    # and either:
    #   key == "_key"   # noqa: E800
    #   or key != "_key" and the "_key" field is not present
    #   or key != "_key" and "_key" is present, but overwrite = True
    if key != "_key":
        rows = set_table_key(rows, key)

    # Create table and insert the data
    loaded_table = loaded_workspace.create_table(table, edge=is_edge_table(rows))

    # Set table metadata
    loaded_table.set_metadata(metadata_dict)

    results = loaded_table.insert(rows)
    return {"count": len(results)}