def test_set_transform_results(self, test_transform_results, dynamo_test_environment):
     function_name = 'set_transform_results'
     dynamo_driver = LeechDriver(table_name=blank_table_name)
     test_source_vertex = test_transform_results[0]
     test_potentials = test_transform_results[1]
     test_id_value = test_source_vertex.id_value
     test_identifier_stem = test_source_vertex.identifier_stem
     test_internal_id = test_source_vertex.internal_id
     dynamo_driver.set_transform_results(
         test_source_vertex, test_potentials,
         identifier_stem=test_identifier_stem,
         id_value=test_id_value)
     disposition = 'working'
     if not test_potentials:
         disposition = 'graphing'
     test_args = (function_name, test_id_value, test_identifier_stem, dynamo_test_environment)
     test_kwargs = {
         'stage_name': 'transformation',
         'disposition': disposition,
         'internal_id': test_internal_id
     }
     self._assert_dynamo_call(*test_args, **test_kwargs)
     attribute_values = dynamo_test_environment.call_args_list[0][0][1]['ExpressionAttributeValues']
     self._assert_object_properties_creation(test_source_vertex.object_type, attribute_values[':v'])
     self._assert_potentials_creation(attribute_values[':ps'])
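The dynamo_test_environment fixture used throughout these tests is not shown on this page. Below is a minimal sketch of what it might look like, assuming the tests patch botocore's low-level _make_api_call so every DynamoDB request is captured instead of sent; the table name value is a placeholder.

import pytest
from unittest.mock import patch

blank_table_name = 'leech_test_table'  # placeholder; the real value is defined elsewhere in the test module

@pytest.fixture
def dynamo_test_environment():
    # Intercept the low-level botocore call so no real DynamoDB request goes out;
    # each UpdateItem shows up in call_args_list as ('UpdateItem', {<request parameters>}).
    with patch('botocore.client.BaseClient._make_api_call') as mock_api_call:
        yield mock_api_call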
Example n. 2
 def __init__(self, metal_order, **kwargs):
     self._assimilate_order = metal_order
     self._source_vertex = metal_order.source_vertex
     self._potential_vertex = metal_order.potential_vertex
     self._rule_entry = metal_order.rule_entry
     self._extracted_data = metal_order.extracted_data
     self._dynamo_driver = LeechDriver()
Example n. 3
 def __init__(self, metal_order, **kwargs):
     self._extraction_order = metal_order
     self._extraction_function_name = metal_order.extraction_function_name
     self._extraction_properties = metal_order.extraction_properties
     self._schema_entry = metal_order.schema_entry
     self._dynamo_driver = LeechDriver()
     self._transform_queue = kwargs.get(
         'transform_queue', ForgeQueue.get_for_transform_queue(**kwargs))
Example n. 4
 def test_mark_object_as_graphed(self, test_id, dynamo_test_environment):
     function_name = 'mark_object_as_graphed'
     test_identifier_stem = test_id[0]
     test_id_value = test_id[1]
     dynamo_driver = LeechDriver(table_name=blank_table_name)
     dynamo_driver.mark_object_as_graphed(identifier_stem=test_identifier_stem, id_value=test_id_value)
     self._assert_dynamo_call(function_name, test_id_value, test_identifier_stem, dynamo_test_environment,
                              stage_name='graphing')
Example n. 5
 def __init__(self, metal_order, **kwargs):
     self._transform_order = metal_order
     self._assimilation_queue = kwargs.get(
         'assimilate_queue',
         ForgeQueue.get_for_assimilation_queue(**kwargs))
     self._extracted_data = metal_order.extracted_data
     self._schema_entry = metal_order.schema_entry
     self._source_vertex_data = metal_order.extracted_data['source']
     self._dynamo_driver = LeechDriver()
Example n. 6
 def __init__(self, **kwargs):
     self._scanner = DynamoScanner(kwargs.get('index_name', 'stalled'))
     self._driver = LeechDriver()
     self._load_graph_orders = []
     self._graph_counter = 0
     self._load_queue_url = os.getenv(
         'LOAD_URL',
         'https://sqs.us-east-1.amazonaws.com/803040539655/load')
     self._extract_queue_url = os.getenv(
         'EXTRACT_URL',
         'https://sqs.us-east-1.amazonaws.com/803040539655/extract')
     self._load_queue = boto3.resource('sqs').Queue(self._load_queue_url)
     self._extraction_queue = ForgeQueue.get_for_extraction_queue()
Example n. 7
 def test_set_stub_assimilated_vertex(self, stub_potential_vertex, dynamo_test_environment):
     function_name = 'set_assimilated_vertex'
     dynamo_driver = LeechDriver(table_name=blank_table_name)
     results = dynamo_driver.set_assimilated_vertex(stub_potential_vertex, True)
     internal_id = None
     id_value = None
     identifier_stem = IdentifierStem.for_stub(stub_potential_vertex)
     if stub_potential_vertex.is_internal_id_set:
         internal_id = stub_potential_vertex.internal_id
     if stub_potential_vertex.is_id_value_set:
         id_value = stub_potential_vertex.id_value
     self._assert_dynamo_call(
         function_name, id_value, identifier_stem, dynamo_test_environment, stage_name='assimilation',
         internal_id=internal_id, id_value=id_value, object_type=stub_potential_vertex.object_type
     )
Example n. 8
class Dentist:
    def __init__(self, metal_order, **kwargs):
        self._extraction_order = metal_order
        self._extraction_function_name = metal_order.extraction_function_name
        self._extraction_properties = metal_order.extraction_properties
        self._schema_entry = metal_order.schema_entry
        self._dynamo_driver = LeechDriver()
        self._transform_queue = kwargs.get(
            'transform_queue', ForgeQueue.get_for_transform_queue(**kwargs))

    @classmethod
    def extract_bulk(cls, metal_orders):
        results = []
        for metal_order in metal_orders:
            dentist = cls(metal_order)
            result = dentist.extract()
            results.append(result)
        return results

    def extract(self):
        extracted_data = StageManager.run_extraction(
            self._extraction_function_name, self._extraction_properties)
        source_data = extracted_data['source']
        if len(source_data) > 1:
            raise InvalidExtractionMultipleSourceException(
                self._extraction_function_name, self._extraction_order)
        if not source_data:
            return self._mark_object_blank()
        for entry in source_data:
            if not entry:
                return self._mark_object_blank()
            extracted_data['source'] = entry
            break
        transform_order = TransformObjectOrder(
            self._extraction_order.identifier_stem,
            self._extraction_order.id_value, extracted_data,
            self._schema_entry)
        self._transform_queue.add_order(transform_order)
        self._dynamo_driver.set_extraction_results(
            extracted_data,
            identifier_stem=self._extraction_order.identifier_stem,
            id_value=self._extraction_order.id_value)
        self._transform_queue.push_orders()

    def _mark_object_blank(self):
        return self._dynamo_driver.mark_object_as_blank(
            identifier_stem=self._extraction_order.identifier_stem,
            id_value=self._extraction_order.id_value)
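A hedged usage sketch for the Dentist class, assuming an ExtractObjectOrder built the same way the Fixer example further down builds one; the object type, id value, extractor name and extraction properties are placeholders, and the project classes are imported from their own modules (paths omitted).

# Hypothetical usage sketch; all concrete values below are placeholders.
schema_entry = SchemaVertexEntry.retrieve('ExternalId')
identifier_stem = IdentifierStem.from_raw('#vertex#ExternalId#')
extraction_order = ExtractObjectOrder(
    identifier_stem, 1001, 'GenericExtraction', {'id_source': 'placeholder'}, schema_entry)
dentist = Dentist(extraction_order)
dentist.extract()  # queues a TransformObjectOrder and records the extraction results in DynamoDB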
Example n. 9
 def test_set_assimilation_results(self, test_assimilation_results, dynamo_test_environment, counter):
     function_name = 'set_assimilation_results'
     dynamo_driver = LeechDriver(table_name=blank_table_name)
     test_assimilation_result = test_assimilation_results[2]
     test_edge_type = test_assimilation_results[1]
     test_source_vertex = test_assimilation_results[0]
     test_identifier_stem = test_source_vertex.identifier_stem
     test_id_value = test_source_vertex.id_value
     dynamo_driver.set_assimilation_results(
         test_edge_type, test_assimilation_result,
         identifier_stem=test_identifier_stem,
         id_value=test_id_value
     )
     self._assert_dynamo_call(function_name, test_id_value, test_identifier_stem, dynamo_test_environment,
                              stage_name='assimilation', edge_type=test_edge_type, counter=counter)
     identified_vertexes = dynamo_test_environment.call_args[0][1]['ExpressionAttributeValues'][':iv']
     self._assert_identified_vertexes_creation(identified_vertexes, test_assimilation_results)
Example n. 10
class DisguisedRobot:
    def __init__(self, metal_order, **kwargs):
        self._transform_order = metal_order
        self._assimilation_queue = kwargs.get(
            'assimilate_queue',
            ForgeQueue.get_for_assimilation_queue(**kwargs))
        self._extracted_data = metal_order.extracted_data
        self._schema_entry = metal_order.schema_entry
        self._source_vertex_data = metal_order.extracted_data['source']
        self._dynamo_driver = LeechDriver()

    def transform(self):
        regulator = VertexRegulator(self._schema_entry)
        source_vertex = regulator.create_potential_vertex(
            self._source_vertex_data)
        logging.info(
            'generated source vertex in transform step, source_vertex: %s' %
            source_vertex.to_json)
        extracted_data = self._extracted_data
        assimilate_orders = []
        arbiter = RuleArbiter(source_vertex, self._schema_entry)
        potentials = arbiter.process_rules(self._extracted_data)
        for potential in potentials:
            potential_vertex = potential[0]
            rule_entry = potential[1]
            assimilate_order = AssimilateObjectOrder(source_vertex,
                                                     potential_vertex,
                                                     rule_entry,
                                                     extracted_data)
            assimilate_orders.append(assimilate_order)
        self._assimilation_queue.add_orders(assimilate_orders)
        self._write_results(source_vertex, potentials)
        self._assimilation_queue.push_orders()

    def _write_results(self, vertex, potentials):
        try:
            self._dynamo_driver.set_transform_results(
                vertex,
                potentials,
                identifier_stem=vertex.identifier_stem,
                id_value=vertex.id_value)
        except ClientError as e:
            if e.response['Error']['Code'] != 'ConditionalCheckFailedException':
                raise e
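A hedged sketch of driving the transform step, assuming a TransformObjectOrder shaped like the one Dentist.extract builds above and reusing the identifier_stem and schema_entry placeholders from the Dentist sketch; the extracted payload is a stand-in for real extraction output.

# Hypothetical driver for the transform step; extracted_data is a placeholder payload.
extracted_data = {'source': {'id_value': 1001, 'detail': 'placeholder'}}
transform_order = TransformObjectOrder(
    identifier_stem, 1001, extracted_data, schema_entry)
robot = DisguisedRobot(transform_order)
robot.transform()  # writes the transform results and queues AssimilateObjectOrders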
Example n. 11
def load(*args, **kwargs):
    logging.info('starting a load task with args/kwargs: %s/%s' % (args, kwargs))
    task_args = kwargs['task_args']
    dynamo_driver = LeechDriver(**task_args)
    key_fields = task_args['keys']
    keys = {
        'identifier_stem': key_fields['identifier_stem'],
        'id_value': key_fields['sid_value']
    }
    potential_object = dynamo_driver.get_object(**keys)
    ogm = Ogm(**task_args)
    graph_results = ogm.graph_object(potential_object)
    try:
        dynamo_driver.mark_object_as_graphed(
            identifier_stem=potential_object['source'].identifier_stem,
            id_value=potential_object['source'].id_value)
    except ClientError as e:
        if e.response['Error']['Code'] != 'ConditionalCheckFailedException':
            raise e
        logging.warning(
            'attempted to mark an object as graphed: %s, '
            'but it appears this step has already happened, no changes to be made' % potential_object)
    return graph_results
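For context, a sketch of the message a worker might hand to this load task, matching the body that Fixer._load (further down) puts on the load queue; the key values and the dispatch wiring are assumptions.

import json

# A load-queue message shaped like the one Fixer._load produces; the values are placeholders.
message_body = json.dumps({
    'task_name': 'load',
    'task_args': {
        'keys': {
            'sid_value': {'S': '1001'},
            'identifier_stem': {'S': '#vertex#ExternalId#'}
        }
    }
})

# A queue worker might dispatch on task_name roughly like this:
payload = json.loads(message_body)
if payload['task_name'] == 'load':
    graph_results = load(task_args=payload['task_args'])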
Example n. 12
 def test_mark_ids_as_working(self, test_working_ids, dynamo_test_environment):
     function_name = 'mark_ids_as_working'
     dynamo_driver = LeechDriver(table_name=blank_table_name)
     test_id_range = test_working_ids[1]
     test_identifier_stem = test_working_ids[0]
     results = dynamo_driver.mark_ids_as_working(test_id_range, identifier_stem=test_identifier_stem)
     assert results == ([], list(test_id_range))
     assert dynamo_test_environment.called is True
     assert dynamo_test_environment.call_count == len(test_id_range)
     for boto_call in dynamo_test_environment.call_args_list:
         dynamo_commands = boto_call[0]
         dynamo_args = dynamo_commands[0]
         dynamo_kwargs = dynamo_commands[1]
         assert dynamo_args == 'UpdateItem'
         assert dynamo_kwargs['Key']['identifier_stem'] == str(test_identifier_stem)
         assert int(dynamo_kwargs['Key']['sid_value']) in test_id_range
         update_expression = dynamo_kwargs['UpdateExpression']
         update_names = dynamo_kwargs['ExpressionAttributeNames']
         update_values = dynamo_kwargs['ExpressionAttributeValues']
         self._assert_update_expression_creation(function_name, update_expression)
         self._assert_attribute_names_creation(function_name, update_names)
         self._assert_attribute_values_creation(function_name, update_values, id_value_range=test_id_range,
                                                object_type=test_identifier_stem.object_type,
                                                stage_name='assimilation')
Example n. 13
class Fixer:
    def __init__(self, **kwargs):
        self._scanner = DynamoScanner(kwargs.get('index_name', 'stalled'))
        self._driver = LeechDriver()
        self._load_graph_orders = []
        self._graph_counter = 0
        self._load_queue_url = os.getenv(
            'LOAD_URL',
            'https://sqs.us-east-1.amazonaws.com/803040539655/load')
        self._extract_queue_url = os.getenv(
            'EXTRACT_URL',
            'https://sqs.us-east-1.amazonaws.com/803040539655/extract')
        self._load_queue = boto3.resource('sqs').Queue(self._load_queue_url)
        self._extraction_queue = ForgeQueue.get_for_extraction_queue()

    def fix(self):
        stalled_objects = self._scanner.scan_stalled_objects()
        for stalled_object in stalled_objects:
            stalled_stage = stalled_object['last_stage_seen']
            if stalled_stage == 'assimilation':
                self._load(stalled_object)
                continue
            if stalled_stage == 'transformation':
                self._assimilate(stalled_object)
                continue
            if stalled_stage == 'monitoring':
                self._extract(stalled_object)
                continue
            if stalled_stage == 'extraction':
                self._transform(stalled_object)
                continue
            if stalled_stage == 'graphing':
                self._process(stalled_object)
                continue
            raise NotImplementedError(
                f'stalled stage: {stalled_stage} is not registered with the system'
            )
        self._clean_up()

    def _clean_up(self):
        if self._load_graph_orders:
            self._load_queue.send_messages(Entries=self._load_graph_orders)
        self._extraction_queue.push_orders()

    def _load(self, stalled_object):
        object_type = stalled_object['object_type']
        if object_type[-1:] == '_':
            return
        stalled_key = {
            'sid_value': {
                'S': stalled_object['sid_value']
            },
            'identifier_stem': {
                'S': stalled_object['identifier_stem']
            }
        }
        if len(self._load_graph_orders) >= 10:
            self._load_queue.send_messages(Entries=self._load_graph_orders)
            self._load_graph_orders = []
        self._load_graph_orders.append({
            'Id':
            str(self._graph_counter),
            'MessageBody':
            json.dumps({
                'task_name': 'load',
                'task_args': {
                    'keys': stalled_key
                }
            })
        })
        self._graph_counter += 1

    def _assimilate(self, stalled_object):
        disposition = stalled_object['disposition']
        if disposition == 'graphing':
            return self._load(stalled_object)
        raise NotImplementedError()

    def _extract(self, stalled_object):
        identifier_stem = stalled_object['identifier_stem']
        identifier_stem = IdentifierStem.from_raw(identifier_stem)
        extractor_names = self._driver.get_extractor_function_names(
            identifier_stem)
        schema_entry = SchemaVertexEntry.retrieve(
            stalled_object['object_type'])
        schema_extraction_properties = schema_entry.extract[
            extractor_names['type']]
        extraction_properties = identifier_stem.for_extractor
        extraction_properties.update(
            schema_extraction_properties.extraction_properties)
        extractor_name = extractor_names['extraction']
        extraction_order = ExtractObjectOrder(identifier_stem,
                                              stalled_object['id_value'],
                                              extractor_name,
                                              extraction_properties,
                                              schema_entry)
        self._extraction_queue.add_order(extraction_order)

    def _transform(self, stalled_object):
        pass

    def _process(self, stalled_object):
        pass
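A minimal sketch of how the Fixer might be scheduled, for example as a Lambda-style handler; the handler name, event shape, and return value are assumptions.

# Hypothetical entry point for a scheduled repair run.
def fix_stalled_objects(event=None, context=None):
    fixer = Fixer(index_name='stalled')   # 'stalled' is also the default index name
    fixer.fix()                           # re-drives every stalled object into its next stage
    return {'status': 'complete'}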
Example n. 14
class SevenOfNine:
    def __init__(self, metal_order, **kwargs):
        self._assimilate_order = metal_order
        self._source_vertex = metal_order.source_vertex
        self._potential_vertex = metal_order.potential_vertex
        self._rule_entry = metal_order.rule_entry
        self._extracted_data = metal_order.extracted_data
        self._dynamo_driver = LeechDriver()

    def assimilate(self):
        assimilation_results = []
        edge_regulator = EdgeRegulator.get_for_object_type(self._rule_entry.edge_type)
        identified_vertexes, exist = self._derive_vertexes()
        for vertex in identified_vertexes:
            edge = self._derive_edge(edge_regulator, vertex)
            assimilation_results.append({
                'edge': edge,
                'vertex': vertex
            })
        self._write_assimilation_results(identified_vertexes, exist, assimilation_results)

    def _derive_vertexes(self):
        if self._potential_vertex.is_properties_complete and self._potential_vertex.is_identifiable:
            return [self._potential_vertex], False
        found_vertexes = self._dynamo_driver.find_potential_vertexes(
            self._potential_vertex.object_type, self._potential_vertex.object_properties)
        if found_vertexes:
            return found_vertexes, True
        if self._rule_entry.is_stub:
            return [self._potential_vertex], False
        return [], None

    def _write_assimilation_results(self, identified_vertexes, exist, assimilation_results):
        self._set_new_vertexes(identified_vertexes, exist)
        self._set_assimilation_results(assimilation_results)

    def _set_new_vertexes(self, identified_vertexes, exist):
        if exist:
            return
        if self._rule_entry.is_stub:
            return self._write_vertexes(identified_vertexes, True)
        if self._rule_entry.is_create:
            return self._write_vertexes(identified_vertexes)
        if self._rule_entry.is_pass:
            return
        raise NotImplementedError('do not know what to do with potential_vertex: %s for rule_type %s' % (
            self._potential_vertex, self._rule_entry.if_missing))

    def _derive_edge(self, edge_regulator, potential_vertex):
        return edge_regulator.generate_potential_edge(
            self._source_vertex, potential_vertex, self._extracted_data, self._rule_entry.inbound)

    def _set_assimilation_results(self, assimilation_results):
        try:
            self._dynamo_driver.set_assimilation_results(
                self._rule_entry.edge_type, assimilation_results,
                identifier_stem=self._source_vertex.identifier_stem,
                id_value=self._source_vertex.id_value)
        except ClientError as e:
            if e.response['Error']['Code'] != 'ConditionalCheckFailedException':
                raise e
            logging.warning(
                'attempted to set assimilation results for edge_type: %s, '
                'but it appears this step has already happened, no changes to be made' % self._rule_entry.edge_type)

    def _write_vertexes(self, vertexes, is_stub=False):
        for vertex in vertexes:
            self._write_vertex(vertex, is_stub)

    def _write_vertex(self, vertex, is_stub):
        try:
            if is_stub:
                return self._dynamo_driver.set_assimilated_vertex(vertex, is_stub, identifier_stem=None, id_value=None)
            return self._dynamo_driver.set_assimilated_vertex(
                vertex, is_stub, identifier_stem=vertex.identifier_stem, id_value=vertex.id_value)
        except ClientError as e:
            if e.response['Error']['Code'] != 'ConditionalCheckFailedException':
                raise e
            logging.warning(
                'attempted to write a new vertex: %s, '
                'but it appears this step has already happened, no changes to be made' % vertex.to_json)
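A hedged sketch of driving the assimilation step, assuming an AssimilateObjectOrder built the way DisguisedRobot.transform does above; the source vertex, potential vertex, rule entry and extracted data are placeholders produced earlier in the pipeline.

# Hypothetical driver for the assimilation step; source_vertex, potential_vertex,
# rule_entry and extracted_data would come out of the transform step shown earlier.
assimilate_order = AssimilateObjectOrder(
    source_vertex, potential_vertex, rule_entry, extracted_data)
assimilator = SevenOfNine(assimilate_order)
assimilator.assimilate()  # resolves the potential vertex and writes the edge and vertex results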