def _consumer_thread(self) -> None:
    """Consume events from the queue until the stop sentinel is received.

    Each event taken from ``self._queue`` is passed to ``self.transform``;
    any nodes it returns are appended to ``self.nodes``. An exception raised
    by ``self.transform`` is logged, recorded in ``self.errors`` under the
    current thread, and the event is skipped.

    The loop returns when ``_SENTINEL`` is read from the queue. The sentinel
    is not counted as a processed event and ``task_done`` is not called for it.
    """
    processed = 0
    while True:
        event = self._queue.get()

        if event is _SENTINEL:
            # The sentinel is a stop signal, not an event, so it must not
            # inflate the processed count reported below.
            logger.debug(
                f"Consumer Thread {current_thread().name} finished after processing {processed} events"
            )
            return

        processed += 1
        try:
            nodes = self.transform(event)
        except Exception as e:
            logger.warning(f"Error when parsing event, recieved exception {e}")
            logger.debug(event)
            self.errors[current_thread()].append(e)
        else:
            if nodes:
                self.nodes += nodes
        finally:
            # Always acknowledge the event, even if handling it blew up in an
            # unexpected way, so a join() on the queue cannot wait forever
            # on an item that will never be marked done.
            self._queue.task_done()
def _setup_params(form: dict, schema: dict, is_external: bool) -> dict:
    """Collect the datasource parameters supplied with the request.

    Parameters
    ----------
    form : dict
        The submitted form fields. Only read when ``is_external`` is True.
    schema : dict
        The datasource schema entry; its ``"params"`` list names the
        parameters to collect.
    is_external : bool
        True when the parameters are plain form values, False when they are
        uploaded files (read from ``request.files``).

    Returns
    -------
    dict
        For external datasources, parameter name -> form value. Otherwise
        parameter name -> an open ``tempfile.NamedTemporaryFile`` holding the
        uploaded file, rewound to the start. The temporary files are returned
        open; this function does not close them.
    """
    logger.debug("Setting up parameters")
    params: Dict[str, Any] = {}

    if is_external:
        # External parameters are plain values in the form. Read them from the
        # `form` argument rather than the global request so the function
        # honours what the caller passed in.
        params = {
            param["name"]: form[param["name"]]
            for param in schema["params"]
            if param["name"] in form
        }
        logger.info(f"ExternalDataSource params received {params}")
    else:
        for param in schema["params"]:
            name = param["name"]
            # Save the files, keep track of which parameter they represent
            if name in request.files:
                upload = tempfile.NamedTemporaryFile()
                request.files[name].save(upload.name)
                upload.seek(0)
                params[name] = upload
        logger.info(f"Saved uploaded files {params}")

    logger.debug("Set up parameters")
    return params
def _make_edges(self, source_graph: nx.Graph) -> None:
    """Insert every edge of ``source_graph`` into Neo4J, grouped by edge type.

    Edges are grouped by the ``"edge_name"`` entry of their data dict, turned
    into Cypher literals with ``self._edge_as_cypher`` and written in batches
    of ``self.batch_size`` edges, one write transaction per batch.

    Parameters
    ----------
    source_graph : nx.Graph
        Graph whose edges are iterated with ``edges(data=True, keys=True)``.
    """
    logger.info("Grouping Edges by type")

    def edge_name(edge):
        return edge[3]["edge_name"]

    # groupby only merges adjacent items, so sort on the same key first.
    sorted_edges = sorted(source_graph.edges(data=True, keys=True), key=edge_name)

    for edge_type, edges in itertools.groupby(sorted_edges, key=edge_name):
        # Remove white spaces
        edge_type = edge_type.replace(" ", "_")

        cypher_edges = list(map(self._edge_as_cypher, edges))
        total = len(cypher_edges)

        logger.debug(f"Inserting {total} {edge_type} edges into Neo4J")

        batch_starts = range(0, total, self.batch_size)
        for batch_number, start in enumerate(batch_starts, start=1):
            # Clamp so the log reports the real upper bound of the last batch.
            end = min(start + self.batch_size, total)

            cypher = (
                f"UNWIND [{', '.join(cypher_edges[start:end])}] as row\n"
                "MATCH (src {_key: row.src}), (dst {_key: row.dst})"
                f" CREATE (src)-[:`{edge_type}`]->(dst)"
            )

            with self.neo4j.session() as session:
                session.write_transaction(lambda tx: tx.run(cypher))

            # Report the 1-based batch index, not the element offset.
            logger.debug(f"Finished batch {batch_number} ({start} -> {end})")
def _add_to_exiting_graph(
    existing_backend: Backend,
    datasource_cls: Type[DataSource],
    transformer_cls: Type[Transformer],
    params: Dict[str, Any],
    is_external: bool,
) -> Tuple[dict, bool]:
    """Run a datasource/transformer pair and add the resulting nodes to a backend.

    Parameters
    ----------
    existing_backend : Backend
        Backend instance the new nodes are added to via ``add_nodes``.
    datasource_cls : Type[DataSource]
        Datasource class, instantiated with the prepared parameters.
    transformer_cls : Type[Transformer]
        Transformer class handed to ``datasource.to_transformer``.
    params : Dict[str, Any]
        Datasource parameters. When ``is_external`` is False the values are
        temporary-file objects: their ``.name`` is passed to the datasource
        and they are closed before this function returns.
    is_external : bool
        True when ``params`` holds plain values rather than temporary files.

    Returns
    -------
    Tuple[dict, bool]
        ``({"graph": ..., "backend": existing_backend}, True)`` on success,
        otherwise ``({"message": ...}, False)``.
    """
    try:
        # Set up parameters for datasource class
        datasource_params = (
            # Use filenames if we are referencing a temporary file
            {param_name: temp_file.name for param_name, temp_file in params.items()}
            if not is_external
            else params
        )

        # Create the datasource
        datasource = datasource_cls(**datasource_params)  # type: ignore
        # Create transformer
        transformer = datasource.to_transformer(transformer_cls)
        # Create the nodes
        nodes = transformer.run()
        # Create the backend
        G = existing_backend.add_nodes(nodes)

    except Exception as e:
        logger.critical(f"Failure to generate graph {e}")

        import traceback

        logger.debug(f"{traceback.format_exc()}")

        if not is_external:
            # Clean up temporary files
            try:
                for _tempfile in params.values():
                    _tempfile.close()
            # Use a distinct name: re-using `e` here would unbind the outer
            # exception when this handler exits and break the return below.
            except Exception as cleanup_error:
                logger.critical(
                    f"Failure to clean up temporary files after error {cleanup_error}"
                )

        return {"message": str(e)}, False

    logger.info("Cleaning up tempfiles")

    if not is_external:
        # Clean up temporary files
        for _tempfile in params.values():
            _tempfile.close()

    logger.info("Finished generating graph")

    # Check if we even had a graph.
    if existing_backend.is_empty():
        return {"message": "Graph generation resulted in 0 nodes."}, False

    return {"graph": G, "backend": existing_backend}, True
def _producer_thread(self) -> None:
    """Push every event yielded by ``self.datasource.events()`` onto ``self._queue``.

    Each ``put`` is blocking. Once the event generator is exhausted, the
    number of events queued is logged at debug level.
    """
    queued = 0
    for queued, event in enumerate(self.datasource.events(), start=1):
        self._queue.put(event, block=True)

    logger.debug(
        f"Producer Thread {current_thread().name} finished after {queued} events"
    )
def setup_schema(self) -> None:
    """Build the DGraph schema from the node classes and apply it.

    Every non-abstract ``Node`` subclass found in the ``beagle.nodes`` module
    is inspected. Each of its annotated attributes, except ``key_fields``,
    becomes one schema entry named ``{node_type}.{field}``:

    * if the annotation has the same type as ``Union``, its first type
      argument is used in place of the annotation;
    * ``int`` annotations are typed ``int``;
    * annotations that have the same type as ``DefaultDict`` and whose second
      type argument is an ``Edge`` subclass are skipped;
    * everything else is typed ``string @index(exact)``.

    A ``<type>`` entry is appended last, and the schema is sent through
    ``self.dgraph.alter``.

    An example output schema::

        process.process_image: string @index(exact) .
        process.process_id: int .
        <type>: string @index(exact) .
    """

    def is_concrete_node(candidate) -> bool:
        return (
            inspect.isclass(candidate)
            and not inspect.isabstract(candidate)
            and issubclass(candidate, Node)
            and candidate != Node
        )

    node_classes = inspect.getmembers(sys.modules["beagle.nodes"], is_concrete_node)

    entries = []
    for _, node_class in node_classes:
        # Remove spaces, lowercase
        prefix = node_class.__name__.lower().replace(" ", "_")

        for attr, annotation in node_class.__annotations__.items():
            if attr == "key_fields":
                continue

            # https://github.com/python/typing/issues/528#issuecomment-357751667
            if type(annotation) == type(Union):
                annotation = annotation.__args__[0]

            if annotation == int:
                dgraph_type = "int"
            elif type(annotation) == type(DefaultDict) and issubclass(
                annotation.__args__[1], Edge
            ):
                # Don't need this, get built automatically
                continue
            else:
                dgraph_type = "string @index(exact)"

            entries.append(f"{prefix}.{attr}: {dgraph_type} .\n")

    entries.append("<type>: string @index(exact) .\n")
    schema = "".join(entries)

    logger.debug(schema)
    self.dgraph.alter(pydgraph.Operation(schema=schema))
def run(self) -> List[Node]:
    """Generate the list of nodes from the datasource.

    Starts one producer thread (``self._producer_thread``) and at least one
    consumer thread (``self._consumer_thread``). After the producer has
    finished and the queue has been fully processed, one ``_SENTINEL`` per
    consumer is queued to stop the consumers, and all threads are joined.

    Each thread gets an (initially empty) entry in ``self.errors``; if any
    entry is non-empty when processing ends, a warning is logged.

    Returns
    -------
    List[Node]
        ``self.nodes`` once all threads have finished.
    """
    logger.debug("Launching transformer")

    threads: List[Thread] = []

    producer_thread = Thread(target=self._producer_thread)
    # Register the error list before starting, as is done for consumers.
    self.errors[producer_thread] = []
    producer_thread.start()
    threads.append(producer_thread)

    logger.debug("Started producer thread")

    # One thread is taken by the producer. Always keep at least one consumer:
    # with none, the queue join below would never return.
    consumer_count = max(_THREAD_COUNT - 1, 1)

    for _ in range(consumer_count):
        t = Thread(target=self._consumer_thread)
        # The consumer appends to this list, so it must exist before start().
        self.errors[t] = []
        t.start()
        threads.append(t)

    # Report the number of consumers actually started.
    logger.debug(f"Started {consumer_count} consumer threads")

    # Wait for the producer to finish
    producer_thread.join()

    # Wait until every queued event has been marked done.
    self._queue.join()

    # Stop the threads
    for _ in range(consumer_count):
        self._queue.put(_SENTINEL)

    for thread in threads:
        thread.join()

    logger.info(f"Finished processing of events, created {len(self.nodes)} nodes.")

    if any(len(x) > 0 for x in self.errors.values()):
        logger.warning("Parsing finished with errors.")
        logger.debug(self.errors)

    return self.nodes
def _make_nodes(self, source_graph: nx.Graph) -> None:
    """Insert every node of ``source_graph`` into Neo4J, grouped by node type.

    The ``"data"`` entry of each graph node is grouped by its ``__name__``
    attribute. For each group a ``_key`` constraint is created via
    ``self._create_constraint``, the nodes are turned into Cypher literals
    with ``self._node_as_cypher`` and written in batches of
    ``self.batch_size`` nodes, one write transaction per batch.

    Parameters
    ----------
    source_graph : nx.Graph
        Graph whose nodes are iterated with ``nodes(data=True)``.
    """
    logger.info("Grouping Nodes by type")

    def type_name(node):
        return node.__name__

    # Group nodes by class. groupby only merges adjacent items, so sort on
    # the same key first.
    sorted_nodes = sorted(
        [node["data"] for _, node in source_graph.nodes(data=True)],
        key=type_name,
        reverse=True,
    )

    for node_type, nodes in itertools.groupby(sorted_nodes, key=type_name):
        # remove whitespaces
        node_type = node_type.replace(" ", "_")

        self._create_constraint(node_type)

        cypher_nodes = list(map(self._node_as_cypher, nodes))
        total = len(cypher_nodes)

        logger.debug(f"Inserting {total} {node_type} nodes into Neo4J")

        batch_starts = range(0, total, self.batch_size)
        for batch_number, start in enumerate(batch_starts, start=1):
            # Clamp so the log reports the real upper bound of the last batch.
            end = min(start + self.batch_size, total)

            cypher = (
                f"UNWIND [{', '.join(cypher_nodes[start:end])}] as row\n"
                f"CREATE (node:{node_type} {{_key: row._key}}) SET node = row"
            )

            with self.neo4j.session() as session:
                session.write_transaction(lambda tx: tx.run(cypher))

            # Report the 1-based batch index, not the element offset.
            logger.debug(f"Finished batch {batch_number} ({start} -> {end})")
def _validate_params(form: dict, files: dict) -> Tuple[dict, bool]:
    """Validate the parameters of a graph-creation request.

    Tests for the following:

    1. ``datasource``, ``transformer`` and ``comment`` are all present in the
       form (``backend`` is optional and defaults to ``"NetworkX"``).
    2. The requested datasource has an entry in ``SCHEMA["datasources"]``.
    3. The requested transformer and backend are known.
    4. Every parameter the datasource marks as required is present, in
       ``form`` for external datasources and in ``files`` otherwise.

    Parameters
    ----------
    form : dict
        The HTTP form sent
    files : dict
        The files sent along the form, if any

    Returns
    -------
    Tuple[dict, bool]
        ``({"message": ...}, False)`` if not valid, otherwise ``(config, True)``
        where ``config`` has the keys ``datasource``, ``transformer``,
        ``backend``, ``schema`` and ``required_params``.
    """
    # Verify we have the basic parameters.
    missing_params = [
        req_param
        for req_param in ["datasource", "transformer", "comment"]
        if req_param not in form
    ]

    if missing_params:
        logger.debug(f"Request to /new missing parameters: {missing_params}")
        return ({"message": f"Missing parameters {missing_params}"}, False)

    # Pull out the requested datasource/transformer.
    requested_datasource = form["datasource"]
    requested_transformer = form["transformer"]

    # Backend is optional
    requested_backend = form.get("backend", "NetworkX")

    datasource_schema = next(
        filter(lambda entry: entry["id"] == requested_datasource, SCHEMA["datasources"]), None
    )

    if datasource_schema is None:
        logger.debug(f"User requested a non-existent data source {requested_datasource}")
        resp = {
            "message": f"Requested datasource '{requested_datasource}' is invalid, "
            + "please use /api/datasources to find a list of valid datasources"
        }
        return (resp, False)

    datasource_cls = DATASOURCES[requested_datasource]

    # An unknown transformer or backend is a client error: report it the same
    # way as an unknown datasource instead of letting a KeyError escape.
    try:
        transformer_cls = TRANSFORMERS[requested_transformer]
    except KeyError:
        logger.debug(f"User requested a non-existent transformer {requested_transformer}")
        return ({"message": f"Requested transformer '{requested_transformer}' is invalid"}, False)

    try:
        backend_cls = BACKENDS[requested_backend]
    except KeyError:
        logger.debug(f"User requested a non-existent backend {requested_backend}")
        return ({"message": f"Requested backend '{requested_backend}' is invalid"}, False)

    required_params: List[Dict[str, Any]] = datasource_schema["params"]

    is_external = issubclass(datasource_cls, ExternalDataSource)

    # External datasources take their parameters from the form, all others
    # take uploaded files.
    supplied = form if is_external else files
    source_name = "form" if is_external else "files"

    # Make sure the user provided all required parameters for the datasource.
    datasource_missing_params = []
    for param in required_params:
        # Skip optional parameters
        if param["required"] is False:
            continue
        if param["name"] not in supplied:
            datasource_missing_params.append(param["name"])

    if datasource_missing_params:
        message = f"Missing datasource {source_name} params {datasource_missing_params}"
        logger.debug(message)
        return ({"message": message}, False)

    return (
        {
            "datasource": datasource_cls,
            "transformer": transformer_cls,
            "backend": backend_cls,
            "schema": datasource_schema,
            "required_params": required_params,
        },
        True,
    )
def new():
    """Generate a new graph from the DataSource, Transformer and parameters in the request.

    The request form must contain:

    1. datasource
    2. transformer
    3. comment

    ``backend`` may also be supplied. In addition, the parameters the
    datasource marks as required must be present (see ``_validate_params``);
    the /api/datasources endpoint lists them.

    If validation fails, the validation error body is returned with a 400
    status code. If ``_create_graph`` reports a failure, its error body is
    returned, also with a 400 status code.

    On success with the ``NetworkX`` backend, the graph is stored through
    ``_save_graph_to_db`` and the result of that call is returned as JSON.
    For any other backend, the value produced by the backend is returned as
    ``{"resp": ...}``.

    Returns
    -------
    flask response
        JSON response as described above.
    """
    # _validate_params hands back (payload, ok); on failure the payload is the error body.
    validated, is_valid = _validate_params(form=request.form, files=request.files)
    if not is_valid:
        return make_response(jsonify(validated), 400)

    datasource_cls: Type[DataSource] = validated["datasource"]
    transformer_cls: Type[Transformer] = validated["transformer"]
    backend_cls: Type[Backend] = validated["backend"]
    datasource_schema = validated["schema"]

    # Subclasses of ExternalDataSource take string parameters from the form
    # rather than uploaded files.
    is_external = issubclass(datasource_cls, ExternalDataSource)

    logger.info(
        f"Recieved upload request for datasource=<{datasource_cls.__name__}>, "
        f"transformer=<{transformer_cls.__name__}>, backend=<{backend_cls.__name__}>"
    )
    logger.info("Transforming data to a graph.")

    params = _setup_params(form=request.form, schema=datasource_schema, is_external=is_external)

    result, created = _create_graph(
        datasource_cls=datasource_cls,
        transformer_cls=transformer_cls,
        backend_cls=backend_cls,
        params=params,
        is_external=is_external,
    )
    if not created:
        return make_response(jsonify(result), 400)

    G = result["graph"]

    # Only NetworkX graphs are saved; for other backends, hand back whatever
    # the backend produced.
    if backend_cls.__name__ != "NetworkX":
        logger.debug(G)
        return jsonify({"resp": G})

    saved = _save_graph_to_db(backend=result["backend"], category=datasource_cls.category)
    return jsonify(saved)
def new():
    """Generate a new graph using the supplied DataSource, Transformer, and the
    parameters passed to the DataSource.

    At minimum, the user must supply the following form parameters:

    1. datasource
    2. transformer
    3. comment

    ``backend`` is optional and defaults to ``"NetworkX"``.

    Outside of that, the user must supply at **minimum** the parameters marked
    by the datasource as required (use the /api/datasources endpoint to see
    which ones these are). They are read from the form when the datasource
    subclasses ``ExternalDataSource`` and from the uploaded files otherwise.

    Failure to supply either the minimum three or the required parameters for
    that datasource returns a 400 status code with the missing parameters in
    the 'message' field. A graph with no nodes also returns a 400.

    If any part of the graph creation raises, a 500 HTTP code is returned with
    the python exception as a string in the 'message' field.

    If the graph is successfully created with the NetworkX backend, the user is
    returned a dictionary with the ID of the graph and the URI path to viewing
    it, for example::

        {
            id: 1,
            self: /fireeye_hx/1
        }

    For any other backend, ``{"resp": <backend graph() result>}`` is returned.

    Returns
    -------
    flask response
        JSON response as described above.
    """
    # Verify we have the basic parameters.
    missing_params = [
        param for param in ["datasource", "transformer", "comment"] if param not in request.form
    ]

    if len(missing_params) > 0:
        logger.debug(f"Request to /new missing parameters: {missing_params}")
        return make_response(jsonify({"message": f"Missing parameters {missing_params}"}), 400)

    # Get the requested datasource, transformer and (optional) backend.
    requested_datasource = request.form["datasource"]
    requested_transformer = request.form["transformer"]
    requested_backend = request.form.get("backend", "NetworkX")

    datasource_schema = next(
        filter(lambda entry: entry["id"] == requested_datasource, SCHEMA["datasources"]), None
    )

    if datasource_schema is None:
        logger.debug(f"User requested a non-existent data source {requested_datasource}")
        return make_response(
            jsonify(
                {
                    "message": f"Requested datasource '{requested_datasource}' is invalid, "
                    + "please use /api/datasources to find a list of valid datasources"
                }
            ),
            400,
        )

    logger.info(
        f"Recieved upload request for datasource=<{requested_datasource}>, "
        + f"transformer=<{requested_transformer}>, backend=<{requested_backend}>"
    )

    datasource_cls = DATASOURCES[requested_datasource]
    transformer_cls = TRANSFORMERS[requested_transformer]
    backend_class = BACKENDS[requested_backend]
    required_parameters = datasource_schema["params"]

    # If this class extends the ExternalDataSource class, we know that the parameters
    # represent strings, and not files.
    is_external = issubclass(datasource_cls, ExternalDataSource)

    # Make sure the user provided all required parameters for the datasource.
    datasource_missing_params = []
    for param in required_parameters:
        # Skip optional parameters
        if param["required"] is False:
            continue
        if is_external and param["name"] not in request.form:
            datasource_missing_params.append(param["name"])
        if not is_external and param["name"] not in request.files:
            datasource_missing_params.append(param["name"])

    if len(datasource_missing_params) > 0:
        logger.debug(
            f"Missing datasource {'form' if is_external else 'files'} params {datasource_missing_params}"
        )
        return make_response(
            jsonify(
                {
                    "message": f"Missing datasource {'form' if is_external else 'files'} params {datasource_missing_params}"
                }
            ),
            400,
        )

    logger.info("Transforming data to a graph.")
    logger.debug("Setting up parameters")

    params = {}

    if is_external:
        # External parameters are in the form
        for param in datasource_schema["params"]:
            if param["name"] in request.form:
                params[param["name"]] = request.form[param["name"]]

        logger.info(f"ExternalDataSource params received {params}")
    else:
        for param in datasource_schema["params"]:
            # Save the files, keep track of which parameter they represent
            if param["name"] in request.files:
                params[param["name"]] = tempfile.NamedTemporaryFile()
                request.files[param["name"]].save(params[param["name"]].name)
                params[param["name"]].seek(0)

        logger.info(f"Saved uploaded files {params}")

    logger.debug("Set up parameters")

    try:
        # Give file paths instead of file-like objects when not external source.
        datasource_params = (
            {param_name: temp_file.name for param_name, temp_file in params.items()}
            if not is_external
            else params
        )

        # Create the datasource
        datasource = datasource_cls(**datasource_params)
        transformer = datasource.to_transformer(transformer_cls)
        graph = backend_class(
            metadata=datasource.metadata(), nodes=transformer.run(), consolidate_edges=True
        )

        # Make the graph
        G = graph.graph()

    except Exception as e:
        logger.critical(f"Failure to generate graph {e}")

        import traceback

        logger.debug(f"{traceback.format_exc()}")

        if not is_external:
            # Clean up temporary files
            try:
                for _tempfile in params.values():
                    _tempfile.close()
            # Use a distinct name: re-using `e` here would unbind the outer
            # exception when this handler exits and break the response below.
            except Exception as cleanup_error:
                logger.critical(
                    f"Failure to clean up temporary files after error {cleanup_error}"
                )

        response = make_response(jsonify({"message": str(e)}), 500)
        response.headers.add("Access-Control-Allow-Origin", "*")
        return response

    logger.info("Cleaning up tempfiles")

    if not is_external:
        # Clean up temporary files
        for _tempfile in params.values():
            _tempfile.close()

    logger.info("Finished generating graph")

    # Check if we even had a graph.
    if graph.is_empty():
        return make_response(
            jsonify({"message": "Graph generation resulted in 0 nodes. "}), 400
        )

    # If the backend is NetworkX, save the graph.
    # Otherwise, return whatever the backend produced.
    if backend_class.__name__ == "NetworkX":
        # Take the SHA256 of the contents of the graph.
        contents_hash = hashlib.sha256(
            json.dumps(graph.to_json(), sort_keys=True).encode("utf-8")
        ).hexdigest()

        # See if we have previously generated this *exact* graph.
        existing = Graph.query.filter_by(meta=graph.metadata, sha256=contents_hash).first()

        if existing:
            logger.info(f"Graph previously generated with id {existing.id}")
            response = jsonify(
                {"id": existing.id, "self": f"/{existing.category}/{existing.id}"}
            )
            response.headers.add("Access-Control-Allow-Origin", "*")
            return response

        dest_folder = datasource_cls.category.replace(" ", "_").lower()

        # Set up the storage directory.
        dest_path = f"{Config.get('storage', 'dir')}/{dest_folder}/{contents_hash}.json"
        os.makedirs(f"{Config.get('storage', 'dir')}/{dest_folder}", exist_ok=True)

        db_entry = Graph(
            sha256=contents_hash,
            meta=graph.metadata,
            comment=request.form.get("comment", None),
            category=dest_folder,  # Categories use the lower name!
            file_path=f"{contents_hash}.json",
        )

        db.session.add(db_entry)
        db.session.commit()

        logger.info(f"Added graph to database with id={db_entry.id}")

        # Use a context manager so the file handle is flushed and closed
        # deterministically instead of being left to the garbage collector.
        with open(dest_path, "w") as graph_file:
            json.dump(graph.to_json(), graph_file)

        logger.info(f"Saved graph to {dest_path}")

        response = jsonify({"id": db_entry.id, "self": f"/{dest_folder}/{db_entry.id}"})
    else:
        logger.debug(G)
        response = jsonify({"resp": G})

    response.headers.add("Access-Control-Allow-Origin", "*")
    return response
def _create_constraint(self, node_type: str) -> None:
    """Create a uniqueness constraint on ``_key`` for nodes labelled ``node_type``.

    Parameters
    ----------
    node_type : str
        Label inserted verbatim into the constraint statement.
    """
    logger.debug(f"Creating _key constraint for {node_type}")

    statement = f"CREATE CONSTRAINT ON (n:{node_type}) ASSERT n._key is UNIQUE"

    with self.neo4j.session() as session:
        session.run(statement)