def test_set_transform_results(self, test_transform_results, dynamo_test_environment):
    """Exercise LeechDriver.set_transform_results and inspect the Dynamo call.

    Verifies the call metadata, the serialized source-vertex properties
    written under ':v', and the serialized potentials written under ':ps'.
    """
    function_name = 'set_transform_results'
    driver = LeechDriver(table_name=blank_table_name)
    source_vertex = test_transform_results[0]
    potentials = test_transform_results[1]
    driver.set_transform_results(
        source_vertex, potentials,
        identifier_stem=source_vertex.identifier_stem,
        id_value=source_vertex.id_value)
    # an object with no potentials skips assimilation and goes straight to graphing
    disposition = 'graphing' if not potentials else 'working'
    self._assert_dynamo_call(
        function_name, source_vertex.id_value, source_vertex.identifier_stem,
        dynamo_test_environment,
        stage_name='transformation',
        disposition=disposition,
        internal_id=source_vertex.internal_id)
    boto_kwargs = dynamo_test_environment.call_args_list[0][0][1]
    attribute_values = boto_kwargs['ExpressionAttributeValues']
    self._assert_object_properties_creation(source_vertex.object_type, attribute_values[':v'])
    self._assert_potentials_creation(attribute_values[':ps'])
def __init__(self, metal_order, **kwargs):
    """Unpack an assimilation order and prepare a Dynamo driver.

    :param metal_order: order carrying the source vertex, the potential
        vertex, the rule entry and the extracted data for assimilation.
    """
    self._assimilate_order = metal_order
    # convenience aliases onto the order's payload
    for attr_name in ('source_vertex', 'potential_vertex', 'rule_entry', 'extracted_data'):
        setattr(self, '_' + attr_name, getattr(metal_order, attr_name))
    self._dynamo_driver = LeechDriver()
def __init__(self, metal_order, **kwargs):
    """Unpack an extraction order.

    :param metal_order: order carrying the extractor function name, its
        properties and the schema entry for the object being extracted.
    :keyword transform_queue: optional pre-built queue; when absent a
        transform ForgeQueue is created from the remaining kwargs.
    """
    self._extraction_order = metal_order
    self._extraction_function_name = metal_order.extraction_function_name
    self._extraction_properties = metal_order.extraction_properties
    self._schema_entry = metal_order.schema_entry
    self._dynamo_driver = LeechDriver()
    # fix: dict.get evaluated the ForgeQueue default eagerly, constructing a
    # queue even when the caller supplied one; build it only when needed
    transform_queue = kwargs.get('transform_queue')
    if transform_queue is None:
        transform_queue = ForgeQueue.get_for_transform_queue(**kwargs)
    self._transform_queue = transform_queue
def test_mark_object_as_graphed(self, test_id, dynamo_test_environment):
    """mark_object_as_graphed should record the 'graphing' stage in Dynamo."""
    identifier_stem = test_id[0]
    id_value = test_id[1]
    driver = LeechDriver(table_name=blank_table_name)
    driver.mark_object_as_graphed(identifier_stem=identifier_stem, id_value=id_value)
    self._assert_dynamo_call(
        'mark_object_as_graphed', id_value, identifier_stem,
        dynamo_test_environment, stage_name='graphing')
def __init__(self, metal_order, **kwargs):
    """Unpack a transform order.

    :param metal_order: order carrying the extracted data (including the
        'source' entry) and the schema entry for the transform step.
    :keyword assimilate_queue: optional pre-built queue; when absent an
        assimilation ForgeQueue is created from the remaining kwargs.
    """
    self._transform_order = metal_order
    # fix: dict.get evaluated the ForgeQueue default eagerly, constructing a
    # queue even when the caller supplied one; build it only when needed
    assimilation_queue = kwargs.get('assimilate_queue')
    if assimilation_queue is None:
        assimilation_queue = ForgeQueue.get_for_assimilation_queue(**kwargs)
    self._assimilation_queue = assimilation_queue
    self._extracted_data = metal_order.extracted_data
    self._schema_entry = metal_order.schema_entry
    self._source_vertex_data = metal_order.extracted_data['source']
    self._dynamo_driver = LeechDriver()
def __init__(self, **kwargs):
    """Wire up the scanner, driver and the SQS queues used to re-drive work.

    :keyword index_name: Dynamo index to scan for stalled objects
        (defaults to 'stalled').
    """
    self._scanner = DynamoScanner(kwargs.get('index_name', 'stalled'))
    self._driver = LeechDriver()
    self._load_graph_orders = []
    self._graph_counter = 0
    default_load_url = 'https://sqs.us-east-1.amazonaws.com/803040539655/load'
    default_extract_url = 'https://sqs.us-east-1.amazonaws.com/803040539655/extract'
    # environment variables override the hard-coded queue URLs
    self._load_queue_url = os.getenv('LOAD_URL', default_load_url)
    self._extract_queue_url = os.getenv('EXTRACT_URL', default_extract_url)
    self._load_queue = boto3.resource('sqs').Queue(self._load_queue_url)
    self._extraction_queue = ForgeQueue.get_for_extraction_queue()
def test_set_stub_assimilated_vertex(self, stub_potential_vertex, dynamo_test_environment):
    """Stub vertexes are written with a stub identifier stem and whatever
    identifiers the potential vertex actually carries (None otherwise)."""
    function_name = 'set_assimilated_vertex'
    dynamo_driver = LeechDriver(table_name=blank_table_name)
    # fix: the return value was bound to an unused local; drop it
    dynamo_driver.set_assimilated_vertex(stub_potential_vertex, True)
    internal_id = None
    id_value = None
    identifier_stem = IdentifierStem.for_stub(stub_potential_vertex)
    if stub_potential_vertex.is_internal_id_set:
        internal_id = stub_potential_vertex.internal_id
    if stub_potential_vertex.is_id_value_set:
        id_value = stub_potential_vertex.id_value
    # NOTE(review): id_value is passed both positionally and as a keyword;
    # this only works if _assert_dynamo_call's second positional parameter
    # is not itself named id_value - confirm against the helper's signature
    self._assert_dynamo_call(
        function_name, id_value, identifier_stem, dynamo_test_environment,
        stage_name='assimilation',
        internal_id=internal_id,
        id_value=id_value,
        object_type=stub_potential_vertex.object_type
    )
class Dentist:
    """Runs a single extraction order: pulls the remote data, records it in
    Dynamo and queues the follow-on transform order."""

    def __init__(self, metal_order, **kwargs):
        """Unpack an extraction order.

        :param metal_order: order carrying the extractor function name, its
            properties and the schema entry.
        :keyword transform_queue: optional pre-built queue; when absent a
            transform ForgeQueue is created from the remaining kwargs.
        """
        self._extraction_order = metal_order
        self._extraction_function_name = metal_order.extraction_function_name
        self._extraction_properties = metal_order.extraction_properties
        self._schema_entry = metal_order.schema_entry
        self._dynamo_driver = LeechDriver()
        # fix: dict.get evaluated the ForgeQueue default eagerly, constructing
        # a queue even when the caller supplied one; build it only when needed
        transform_queue = kwargs.get('transform_queue')
        if transform_queue is None:
            transform_queue = ForgeQueue.get_for_transform_queue(**kwargs)
        self._transform_queue = transform_queue

    @classmethod
    def extract_bulk(cls, metal_orders):
        """Run extract() once per order and collect the per-order results.

        Note: extract() returns None on the successful path, so the
        collected results are only meaningful for blank-marked objects.
        """
        results = []
        for metal_order in metal_orders:
            dentist = cls(metal_order)
            results.append(dentist.extract())
        return results

    def extract(self):
        """Extract the remote object named by the order.

        :raises InvalidExtractionMultipleSourceException: when the remote
            returns more than one 'source' entry.
        :returns: the blank-marking result when the remote had no data,
            otherwise None after queueing the transform order and storing
            the extraction results.
        """
        extracted_data = StageManager.run_extraction(
            self._extraction_function_name, self._extraction_properties)
        source_data = extracted_data['source']
        if len(source_data) > 1:
            raise InvalidExtractionMultipleSourceException(
                self._extraction_function_name, self._extraction_order)
        if not source_data:
            return self._mark_object_blank()
        # exactly one entry remains; collapse the container onto the entry
        entry = next(iter(source_data))
        if not entry:
            return self._mark_object_blank()
        extracted_data['source'] = entry
        transform_order = TransformObjectOrder(
            self._extraction_order.identifier_stem,
            self._extraction_order.id_value,
            extracted_data, self._schema_entry)
        self._transform_queue.add_order(transform_order)
        self._dynamo_driver.set_extraction_results(
            extracted_data,
            identifier_stem=self._extraction_order.identifier_stem,
            id_value=self._extraction_order.id_value)
        self._transform_queue.push_orders()

    def _mark_object_blank(self):
        """Record in Dynamo that the remote had no data for this object."""
        return self._dynamo_driver.mark_object_as_blank(
            identifier_stem=self._extraction_order.identifier_stem,
            id_value=self._extraction_order.id_value)
def set_assimilation_result(self, test_assimilation_results, dynamo_test_environment, counter):
    """Drive set_assimilation_results and verify the identified-vertex payload."""
    function_name = 'set_assimilation_results'
    driver = LeechDriver(table_name=blank_table_name)
    source_vertex = test_assimilation_results[0]
    edge_type = test_assimilation_results[1]
    assimilation_result = test_assimilation_results[2]
    driver.set_assimilation_results(
        edge_type, assimilation_result,
        identifier_stem=source_vertex.identifier_stem,
        id_value=source_vertex.id_value
    )
    self._assert_dynamo_call(
        function_name, source_vertex.id_value, source_vertex.identifier_stem,
        dynamo_test_environment, stage_name='assimilation',
        edge_type=edge_type, counter=counter)
    boto_kwargs = dynamo_test_environment.call_args[0][1]
    identified_vertexes = boto_kwargs['ExpressionAttributeValues'][':iv']
    self._assert_identified_vertexes_creation(identified_vertexes, test_assimilation_results)
class DisguisedRobot:
    """Transforms extracted data into a source vertex plus one assimilation
    order per (potential vertex, rule) pair produced by the rule arbiter."""

    def __init__(self, metal_order, **kwargs):
        """Unpack a transform order.

        :param metal_order: order carrying the extracted data (including the
            'source' entry) and the schema entry.
        :keyword assimilate_queue: optional pre-built queue; when absent an
            assimilation ForgeQueue is created from the remaining kwargs.
        """
        self._transform_order = metal_order
        # fix: dict.get evaluated the ForgeQueue default eagerly, constructing
        # a queue even when the caller supplied one; build it only when needed
        assimilation_queue = kwargs.get('assimilate_queue')
        if assimilation_queue is None:
            assimilation_queue = ForgeQueue.get_for_assimilation_queue(**kwargs)
        self._assimilation_queue = assimilation_queue
        self._extracted_data = metal_order.extracted_data
        self._schema_entry = metal_order.schema_entry
        self._source_vertex_data = metal_order.extracted_data['source']
        self._dynamo_driver = LeechDriver()

    def transform(self):
        """Generate the source vertex, run the rule arbiter, queue one
        assimilation order per potential, and persist the transform results."""
        regulator = VertexRegulator(self._schema_entry)
        source_vertex = regulator.create_potential_vertex(
            self._source_vertex_data)
        logging.info(
            'generated source vertex in transform step, source_vertex: %s' % source_vertex.to_json)
        extracted_data = self._extracted_data
        arbiter = RuleArbiter(source_vertex, self._schema_entry)
        potentials = arbiter.process_rules(self._extracted_data)
        assimilate_orders = [
            AssimilateObjectOrder(source_vertex, potential[0], potential[1], extracted_data)
            for potential in potentials
        ]
        self._assimilation_queue.add_orders(assimilate_orders)
        self._write_results(source_vertex, potentials)
        self._assimilation_queue.push_orders()

    def _write_results(self, vertex, potentials):
        """Persist the transform results; a conditional-check failure means
        another worker already wrote them, which is benign."""
        try:
            self._dynamo_driver.set_transform_results(
                vertex, potentials,
                identifier_stem=vertex.identifier_stem, id_value=vertex.id_value)
        except ClientError as e:
            if e.response['Error']['Code'] != 'ConditionalCheckFailedException':
                raise e
            # fix (consistency): every sibling writer logs this benign case;
            # previously the exception was swallowed silently here
            logging.warning(
                'attempted to set transform results for vertex: %s, '
                'but it appears this step has already happened, '
                'no changes to be made' % vertex.to_json)
def load(*args, **kwargs):
    """Graph a previously assimilated object and mark it as graphed.

    :param kwargs: must contain 'task_args' with a 'keys' entry holding the
        object's 'identifier_stem' and 'sid_value'.
    :returns: the graph results produced by the OGM.
    """
    logging.info('starting a load task with args/kwargs: %s/%s' % (args, kwargs))
    task_args = kwargs['task_args']
    dynamo_driver = LeechDriver(**task_args)
    key_fields = task_args['keys']
    keys = {
        'identifier_stem': key_fields['identifier_stem'],
        'id_value': key_fields['sid_value']
    }
    potential_object = dynamo_driver.get_object(**keys)
    ogm = Ogm(**task_args)
    graph_results = ogm.graph_object(potential_object)
    try:
        dynamo_driver.mark_object_as_graphed(
            identifier_stem=potential_object['source'].identifier_stem,
            id_value=potential_object['source'].id_value)
    except ClientError as e:
        if e.response['Error']['Code'] != 'ConditionalCheckFailedException':
            raise e
        # fix: the original format string had no %s placeholder, so applying
        # '%' to it raised TypeError instead of logging the warning
        logging.warning(
            'attempted to mark a vertex: %s as graphed, '
            'but it appears this step has already happened, '
            'no changes to be made' % potential_object)
    return graph_results
def test_mark_ids_as_working(self, test_working_ids, dynamo_test_environment):
    """Every id in the range should produce one UpdateItem call shaped like
    the mark_ids_as_working update expression."""
    function_name = 'mark_ids_as_working'
    driver = LeechDriver(table_name=blank_table_name)
    identifier_stem = test_working_ids[0]
    id_range = test_working_ids[1]
    results = driver.mark_ids_as_working(id_range, identifier_stem=identifier_stem)
    # no ids were already working, so all of them come back as newly marked
    assert results == ([], list(id_range))
    assert dynamo_test_environment.called is True
    assert dynamo_test_environment.call_count == len(id_range)
    for boto_call in dynamo_test_environment.call_args_list:
        dynamo_commands = boto_call[0]
        dynamo_args = dynamo_commands[0]
        dynamo_kwargs = dynamo_commands[1]
        assert dynamo_args == 'UpdateItem'
        assert dynamo_kwargs['Key']['identifier_stem'] == str(identifier_stem)
        assert int(dynamo_kwargs['Key']['sid_value']) in id_range
        self._assert_update_expression_creation(
            function_name, dynamo_kwargs['UpdateExpression'])
        self._assert_attribute_names_creation(
            function_name, dynamo_kwargs['ExpressionAttributeNames'])
        self._assert_attribute_values_creation(
            function_name, dynamo_kwargs['ExpressionAttributeValues'],
            id_value_range=id_range,
            object_type=identifier_stem.object_type,
            stage_name='assimilation')
class Fixer:
    """Re-drives objects that stalled partway through the leech pipeline by
    dispatching each one to the handler for its next stage."""

    def __init__(self, **kwargs):
        """Wire up the scanner, driver and SQS queues.

        :keyword index_name: Dynamo index to scan (defaults to 'stalled').
        """
        self._scanner = DynamoScanner(kwargs.get('index_name', 'stalled'))
        self._driver = LeechDriver()
        self._load_graph_orders = []
        self._graph_counter = 0
        self._load_queue_url = os.getenv(
            'LOAD_URL', 'https://sqs.us-east-1.amazonaws.com/803040539655/load')
        self._extract_queue_url = os.getenv(
            'EXTRACT_URL', 'https://sqs.us-east-1.amazonaws.com/803040539655/extract')
        self._load_queue = boto3.resource('sqs').Queue(self._load_queue_url)
        self._extraction_queue = ForgeQueue.get_for_extraction_queue()

    def fix(self):
        """Scan for stalled objects and dispatch each to its stage handler."""
        handlers = {
            'assimilation': self._load,
            'transformation': self._assimilate,
            'monitoring': self._extract,
            'extraction': self._transform,
            'graphing': self._process,
        }
        for stalled_object in self._scanner.scan_stalled_objects():
            stalled_stage = stalled_object['last_stage_seen']
            handler = handlers.get(stalled_stage)
            if handler is None:
                raise NotImplementedError(
                    f'stalled stage: {stalled_stage} is not registered with the system'
                )
            handler(stalled_object)
        self._clean_up()

    def _clean_up(self):
        """Flush any batched load messages and queued extraction orders."""
        if self._load_graph_orders:
            self._load_queue.send_messages(Entries=self._load_graph_orders)
        self._extraction_queue.push_orders()

    def _load(self, stalled_object):
        """Batch a load task for the stalled object onto the load queue."""
        object_type = stalled_object['object_type']
        # a trailing underscore on the object type marks it as skippable here
        if object_type[-1:] == '_':
            return
        stalled_key = {
            'sid_value': {'S': stalled_object['sid_value']},
            'identifier_stem': {'S': stalled_object['identifier_stem']}
        }
        # SQS batches cap at 10 entries; flush before adding an 11th
        if len(self._load_graph_orders) >= 10:
            self._load_queue.send_messages(Entries=self._load_graph_orders)
            self._load_graph_orders = []
        self._load_graph_orders.append({
            'Id': str(self._graph_counter),
            'MessageBody': json.dumps({
                'task_name': 'load',
                'task_args': {'keys': stalled_key}
            })
        })
        self._graph_counter += 1

    def _assimilate(self, stalled_object):
        """Objects already dispositioned for graphing just need loading."""
        if stalled_object['disposition'] == 'graphing':
            return self._load(stalled_object)
        raise NotImplementedError()

    def _extract(self, stalled_object):
        """Rebuild and queue the extraction order for a stalled object."""
        identifier_stem = IdentifierStem.from_raw(stalled_object['identifier_stem'])
        extractor_names = self._driver.get_extractor_function_names(identifier_stem)
        schema_entry = SchemaVertexEntry.retrieve(stalled_object['object_type'])
        schema_extraction_properties = schema_entry.extract[extractor_names['type']]
        extraction_properties = identifier_stem.for_extractor
        extraction_properties.update(schema_extraction_properties.extraction_properties)
        extraction_order = ExtractObjectOrder(
            identifier_stem, stalled_object['id_value'],
            extractor_names['extraction'], extraction_properties, schema_entry)
        self._extraction_queue.add_order(extraction_order)

    def _transform(self, stalled_object):
        # not yet implemented for stalled transform-stage objects
        pass

    def _process(self, stalled_object):
        # not yet implemented for stalled graphing-stage objects
        pass
class SevenOfNine:
    """Assimilates one potential edge: resolves the potential vertex against
    Dynamo, derives the edge(s) and persists vertexes plus edge results."""

    def __init__(self, metal_order, **kwargs):
        """Unpack an assimilation order and prepare a Dynamo driver."""
        self._assimilate_order = metal_order
        self._source_vertex = metal_order.source_vertex
        self._potential_vertex = metal_order.potential_vertex
        self._rule_entry = metal_order.rule_entry
        self._extracted_data = metal_order.extracted_data
        self._dynamo_driver = LeechDriver()

    def assimilate(self):
        """Resolve the potential vertex, build one edge per identified
        vertex, and write both the vertexes and the edge results."""
        edge_regulator = EdgeRegulator.get_for_object_type(self._rule_entry.edge_type)
        identified_vertexes, exist = self._derive_vertexes()
        assimilation_results = [
            {'edge': self._derive_edge(edge_regulator, vertex), 'vertex': vertex}
            for vertex in identified_vertexes
        ]
        self._write_assimilation_results(identified_vertexes, exist, assimilation_results)

    def _derive_vertexes(self):
        """Return (vertexes, exist).

        exist is True when matches were found in Dynamo, False when the
        potential vertex itself is used (complete or stub-allowed), and
        None when nothing could be resolved.
        """
        if self._potential_vertex.is_properties_complete and self._potential_vertex.is_identifiable:
            return [self._potential_vertex], False
        found_vertexes = self._dynamo_driver.find_potential_vertexes(
            self._potential_vertex.object_type, self._potential_vertex.object_properties)
        if found_vertexes:
            return found_vertexes, True
        if self._rule_entry.is_stub:
            return [self._potential_vertex], False
        return [], None

    def _write_assimilation_results(self, identified_vertexes, exist, assimilation_results):
        """Persist new vertexes (when needed) and then the edge results."""
        self._set_new_vertexes(identified_vertexes, exist)
        self._set_assimilation_results(assimilation_results)

    def _set_new_vertexes(self, identified_vertexes, exist):
        """Write vertexes per the rule's missing-vertex policy; vertexes
        that already exist in Dynamo are left untouched."""
        if exist:
            return
        if self._rule_entry.is_stub:
            return self._write_vertexes(identified_vertexes, True)
        if self._rule_entry.is_create:
            return self._write_vertexes(identified_vertexes)
        if self._rule_entry.is_pass:
            return
        raise NotImplementedError('do not know what to do with potential_vertex: %s for rule_type %s' % (
            self._potential_vertex, self._rule_entry.if_missing))

    def _derive_edge(self, edge_regulator, potential_vertex):
        """Build the potential edge between source and potential vertex."""
        return edge_regulator.generate_potential_edge(
            self._source_vertex, potential_vertex, self._extracted_data, self._rule_entry.inbound)

    def _set_assimilation_results(self, assimilation_results):
        """Persist the edge results; a conditional-check failure means this
        step already ran, which is benign."""
        try:
            self._dynamo_driver.set_assimilation_results(
                self._rule_entry.edge_type, assimilation_results,
                identifier_stem=self._source_vertex.identifier_stem,
                id_value=self._source_vertex.id_value)
        except ClientError as e:
            if e.response['Error']['Code'] != 'ConditionalCheckFailedException':
                raise e
            logging.warning(
                'attempted to set assimilation results for edge_type: %s, '
                'but it appears this step has already happened, no changes to be made' % self._rule_entry.edge_type)

    def _write_vertexes(self, vertexes, is_stub=False):
        """Write each vertex in turn."""
        for vertex in vertexes:
            self._write_vertex(vertex, is_stub)

    def _write_vertex(self, vertex, is_stub):
        """Write one vertex; stubs carry no identifiers yet."""
        try:
            if is_stub:
                return self._dynamo_driver.set_assimilated_vertex(
                    vertex, is_stub, identifier_stem=None, id_value=None)
            return self._dynamo_driver.set_assimilated_vertex(
                vertex, is_stub, identifier_stem=vertex.identifier_stem, id_value=vertex.id_value)
        except ClientError as e:
            if e.response['Error']['Code'] != 'ConditionalCheckFailedException':
                raise e
            logging.warning(
                'attempted to write a new vertex: %s, '
                'but it appears this step has already happened, no changes to be made' % vertex.to_json)