Example #1
    def get_context_data(self, **kwargs):
        context = super(AggregatorExportView, self).get_context_data(**kwargs)
        context.update({
            'sd_prefix': settings.TRIPLE_DATABASE['PREFIXES']['sdv1'],
            'sparql_endpoint': get_virtuoso_endpoint(),
            'mastergraph_host': settings.TRIPLE_DATABASE_MASTER['HOST'],
            'mastergraph_port':
                settings.TRIPLE_DATABASE_MASTER['KWARGS']['rexpro_port'],
            'mastergraph_graphname':
                settings.TRIPLE_DATABASE_MASTER['KWARGS']['graph'],
            'resource_namespace':
                settings.TRIPLE_DATABASE_MASTER['PREFIXES']['sdres'],
        })
        return context
Example #2
    def get_context_data(self, **kwargs):
        context = super(AggregatorExportView, self).get_context_data(**kwargs)
        context.update({
            'sd_prefix':
                settings.TRIPLE_DATABASE['PREFIXES']['sdv1'],
            'sparql_endpoint':
                get_virtuoso_endpoint(),
            'mastergraph_host':
                settings.TRIPLE_DATABASE_MASTER['HOST'],
            'mastergraph_port':
                settings.TRIPLE_DATABASE_MASTER['KWARGS']['rexpro_port'],
            'mastergraph_graphname':
                settings.TRIPLE_DATABASE_MASTER['KWARGS']['graph'],
            'resource_namespace':
                settings.TRIPLE_DATABASE_MASTER['PREFIXES']['sdres'],
        })
        return context
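
The method shown in Examples #1 and #2 normally lives inside a Django TemplateView subclass that renders the Silk project file from these context values. Below is a minimal sketch of that assumed surrounding class; the base class, template path, and content type are illustrative guesses, and only the get_context_data() override itself comes from the examples above.

# Minimal sketch of the assumed enclosing view (hypothetical template path
# and content type); get_context_data() is the override from Example #1.
from django.conf import settings
from django.views.generic import TemplateView

from webui.cnmain.utils import get_virtuoso_endpoint


class AggregatorExportView(TemplateView):
    template_name = 'controller/aggregator/silk_project.xml'  # hypothetical
    content_type = 'application/xml'  # hypothetical

    def get_context_data(self, **kwargs):
        context = super(AggregatorExportView, self).get_context_data(**kwargs)
        context.update({
            'sd_prefix': settings.TRIPLE_DATABASE['PREFIXES']['sdv1'],
            'sparql_endpoint': get_virtuoso_endpoint(),
            # ... remaining mastergraph_* / resource_namespace keys
            #     exactly as in Example #1 ...
        })
        return context
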
Example #3
def process_aggregator(aggregator, force=False):
    """ execute the aggregator workflow: run silk on every archive item
     associated to the aggregator.
    """
    from tempfile import mkdtemp
    from webui.cnmain.utils import get_virtuoso_endpoint

    logger_name = process_aggregator.request.id
    loggy = get_redis_logger(logger_name)
    local_manager.cleanup()
    local.logger = loggy
    tmpdir = mkdtemp()
    scheduler = Scheduler.objects.create(
        content_type=ContentType.objects.get_for_model(aggregator),
        object_id=aggregator.pk,
        status=Scheduler.RUNNING,
        logger_name=logger_name,
    )

    try:
        loggy.info("Processing aggregator %s", unicode(aggregator))
        loggy.debug("Working dir: %s", tmpdir)

        context = {
            'aggregator':
            aggregator,
            'sd_prefix':
            settings.TRIPLE_DATABASE['PREFIXES']['sdv1'],
            'sparql_endpoint':
            get_virtuoso_endpoint(),
            'mastergraph_host':
            settings.TRIPLE_DATABASE_MASTER['HOST'],
            'mastergraph_port':
            settings.TRIPLE_DATABASE_MASTER['KWARGS']['rexpro_port'],
            'mastergraph_graphname':
            settings.TRIPLE_DATABASE_MASTER['KWARGS']['graph'],
            'resource_namespace':
            settings.TRIPLE_DATABASE_MASTER['PREFIXES']['sdres'],
        }

        loggy.info("Connecting to virtuoso")

        aggregator_archiveitems = aggregator.aggregatorarchiveitem_set\
            .all().order_by('first_workflow_success')

        if not force:
            res = []
            for aggregator_archiveitem in aggregator_archiveitems:
                if aggregator_archiveitem.needs_update():
                    res.append(aggregator_archiveitem)
                else:
                    loggy.info('Skipped archiveitem %s',
                               unicode(aggregator_archiveitem.archiveitem))

            aggregator_archiveitems = res

        _aggregator_process_archiveitems(aggregator_archiveitems, scheduler,
                                         tmpdir, context)

        loggy.info('Workflow completed')
    except Exception, e:
        loggy.exception('Generic exception in the workflow')
        scheduler.status = Scheduler.FAIL
        scheduler.error = e.message or str(e)
        # send the exception to sentry
        raise
Example #4
def process_aggregator(aggregator, force=False):
    """ execute the aggregator workflow: run silk on every archive item
     associated to the aggregator.
    """
    from tempfile import mkdtemp
    from webui.cnmain.utils import get_virtuoso_endpoint

    logger_name = process_aggregator.request.id
    loggy = get_redis_logger(logger_name)
    local_manager.cleanup()
    local.logger = loggy
    tmpdir = mkdtemp()
    scheduler = Scheduler.objects.create(
        content_type=ContentType.objects.get_for_model(aggregator),
        object_id=aggregator.pk,
        status=Scheduler.RUNNING,
        logger_name=logger_name,
    )

    try:
        loggy.info("Processing aggregator %s", unicode(aggregator))
        loggy.debug("Working dir: %s", tmpdir)

        context = {
            'aggregator': aggregator,
            'sd_prefix': settings.TRIPLE_DATABASE['PREFIXES']['sdv1'],
            'sparql_endpoint': get_virtuoso_endpoint(),
            'mastergraph_host': settings.TRIPLE_DATABASE_MASTER['HOST'],
            'mastergraph_port':
            settings.TRIPLE_DATABASE_MASTER['KWARGS']['rexpro_port'],
            'mastergraph_graphname':
            settings.TRIPLE_DATABASE_MASTER['KWARGS']['graph'],
            'resource_namespace':
            settings.TRIPLE_DATABASE_MASTER['PREFIXES']['sdres'],
        }

        loggy.info("Connecting to virtuoso")

        aggregator_archiveitems = aggregator.aggregatorarchiveitem_set\
            .all().order_by('first_workflow_success')

        if not force:
            res = []
            for aggregator_archiveitem in aggregator_archiveitems:
                if aggregator_archiveitem.needs_update():
                    res.append(aggregator_archiveitem)
                else:
                    loggy.info('Skipped archiveitem %s',
                               unicode(aggregator_archiveitem.archiveitem))

            aggregator_archiveitems = res

        _aggregator_process_archiveitems(
            aggregator_archiveitems, scheduler, tmpdir, context
        )

        loggy.info('Workflow completed')
    except Exception, e:
        loggy.exception('Generic exception in the workflow')
        scheduler.status = Scheduler.FAIL
        scheduler.error = e.message or str(e)
        # send the exception to sentry
        raise
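
Both versions of process_aggregator read process_aggregator.request.id, which suggests the function is registered as a Celery task; the decorator and the call site are not part of the examples. The sketch below shows the assumed registration and a typical invocation. The shared_task decorator, the serializer remark, and the call site are guesses, not taken from the code above.

# Hypothetical registration sketch: process_aggregator as a Celery task,
# matching the Python 2 / early-Celery style of the examples above.
from celery import shared_task


@shared_task  # makes process_aggregator.request.id available in the body
def process_aggregator(aggregator, force=False):
    # body exactly as in Examples #3 / #4
    pass


# Typical call site, e.g. from a view or a management command:
#     process_aggregator.delay(aggregator, force=True)
# Passing a model instance assumes a pickle task serializer; with JSON
# serialization one would pass aggregator.pk and re-fetch it in the task.
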
Example #5
    def test_silk_project_file_is_valid(self):
        import xml.etree.ElementTree as ET

        self.client_login('admin')

        item1 = ArchiveItemFactory()
        item2 = ArchiveItemFactory()
        for item in (item1, item2):
            AggregatorArchiveItem.objects.create(
                aggregator=self.aggregator,
                archiveitem=item
            )

        response = self.client.get(self.export_url)
        tree = ET.fromstring(response.content)

        self.assertIn(
            (settings.TRIPLE_DATABASE['PREFIXES']['sdv1'], 'sd'),
            [(x.get('namespace'), x.get('id'))
             for x in tree.findall('.//Prefix')]
        )

        # check datasources
        datasources = tree.findall('.//DataSource')
        self.assertEqual(len(datasources), 3)
        self.assertEqual(datasources[0].get('id'), 'master-graph')

        mastergraph = datasources[0]
        datasources = datasources[1:]

        # check datasources endpoints
        self.assertEqual(
            mastergraph.find('Param[@name="host"]').get('value'),
            settings.TRIPLE_DATABASE_MASTER['HOST']
        )
        self.assertEqual(
            [get_virtuoso_endpoint()] * 2,
            [x.find('Param[@name="endpointURI"]').get("value")
             for x in datasources]
        )

        # check datasources graph names
        self.assertEqual(
            mastergraph.find('Param[@name="graph"]').get('value'),
            settings.TRIPLE_DATABASE_MASTER["KWARGS"]["graph"]
        )
        self.assertEqual(
            [item1.datagraph_mapped_name, item2.datagraph_mapped_name],
            [x.find('Param[@name="graph"]').get("value")
             for x in datasources]
        )

        # check tasks
        datasource_ids = [x.get('id') for x in datasources]
        tasks = tree.findall('.//LinkingTask')
        self.assertEqual(len(tasks), 2)
        self.assertEqual(
            datasource_ids,
            [x.find('.//Interlink').get('id') for x in tasks]
        )

        # check task parameters
        for datasource_id, task in zip(datasource_ids, tasks):
            self.assertEqual(
                task.find('.//SourceDataset').get('dataSource'),
                datasource_id
            )
            self.assertEqual(
                task.find('.//TargetDataset').get('dataSource'),
                'master-graph'
            )
            self.assertEqual(
                task.find('.//SourceDataset').find('RestrictTo').text.strip(),
                '?a rdf:type <{}> .'.format(self.aggregator.entity_type)
            )
            self.assertEqual(
                task.find('.//TargetDataset').find('RestrictTo').text.strip(),
                'b -> {}'.format(self.aggregator.vertex_selector)
            )
            self.assertIsNone(task.find('.//LinkageRule').text)
            self.assertIsNone(task.find('.//Filter').text)
            self.assertIsNone(task.find('.//Outputs').text)
            self.assertIsNone(task.find('.//PositiveEntities').text)
            self.assertIsNone(task.find('.//NegativeEntities').text)
            self.assertIsNone(
                task.find('.//Alignment/')
                    .find('{}Alignment'.format('{http://knowledgeweb.'
                                               'semanticweb.org'
                                               '/heterogeneity/alignment#}')
                          ).text
            )
Example #6
    def test_silk_project_file_is_valid(self):
        import xml.etree.ElementTree as ET

        self.client_login('admin')

        item1 = ArchiveItemFactory()
        item2 = ArchiveItemFactory()
        for item in (item1, item2):
            AggregatorArchiveItem.objects.create(aggregator=self.aggregator,
                                                 archiveitem=item)

        response = self.client.get(self.export_url)
        tree = ET.fromstring(response.content)

        self.assertIn((settings.TRIPLE_DATABASE['PREFIXES']['sdv1'], 'sd'),
                      [(x.get('namespace'), x.get('id'))
                       for x in tree.findall('.//Prefix')])

        # check datasources
        datasources = tree.findall('.//DataSource')
        self.assertEqual(len(datasources), 3)
        self.assertEqual(datasources[0].get('id'), 'master-graph')

        mastergraph = datasources[0]
        datasources = datasources[1:]

        # check datasources endpoints
        self.assertEqual(
            mastergraph.find('Param[@name="host"]').get('value'),
            settings.TRIPLE_DATABASE_MASTER['HOST'])
        self.assertEqual([get_virtuoso_endpoint()] * 2, [
            x.find('Param[@name="endpointURI"]').get("value")
            for x in datasources
        ])

        # check datasources graph names
        self.assertEqual(
            mastergraph.find('Param[@name="graph"]').get('value'),
            settings.TRIPLE_DATABASE_MASTER["KWARGS"]["graph"])
        self.assertEqual(
            [item1.datagraph_mapped_name, item2.datagraph_mapped_name],
            [x.find('Param[@name="graph"]').get("value") for x in datasources])

        # check tasks
        datasource_ids = [x.get('id') for x in datasources]
        tasks = tree.findall('.//LinkingTask')
        self.assertEqual(len(tasks), 2)
        self.assertEqual(datasource_ids,
                         [x.find('.//Interlink').get('id') for x in tasks])

        # check task parameters
        for datasource_id, task in zip(datasource_ids, tasks):
            self.assertEqual(
                task.find('.//SourceDataset').get('dataSource'), datasource_id)
            self.assertEqual(
                task.find('.//TargetDataset').get('dataSource'),
                'master-graph')
            self.assertEqual(
                task.find('.//SourceDataset').find('RestrictTo').text.strip(),
                '?a rdf:type <{}> .'.format(self.aggregator.entity_type))
            self.assertEqual(
                task.find('.//TargetDataset').find('RestrictTo').text.strip(),
                'b -> {}'.format(self.aggregator.vertex_selector))
            self.assertIsNone(task.find('.//LinkageRule').text)
            self.assertIsNone(task.find('.//Filter').text)
            self.assertIsNone(task.find('.//Outputs').text)
            self.assertIsNone(task.find('.//PositiveEntities').text)
            self.assertIsNone(task.find('.//NegativeEntities').text)
            self.assertIsNone(
                task.find('.//Alignment/').find('{}Alignment'.format(
                    '{http://knowledgeweb.'
                    'semanticweb.org'
                    '/heterogeneity/alignment#}')).text)
Example #7
    def test_can_silk_rules_file_is_valid(self):
        import xml.etree.ElementTree as ET
        from django.template.loader import render_to_string

        archive_item = ArchiveItemFactory()
        AggregatorArchiveItem.objects.create(
            aggregator=self.aggregator,
            archiveitem=archive_item
        )

        self.aggregator.silk_rule = \
            '<LinkageRule><smart data="now" /></LinkageRule>'
        self.aggregator.save()
        output_filename = 'a_really_cool_filename.thm'

        context = {
            'aggregator': self.aggregator,
            'sd_prefix': settings.TRIPLE_DATABASE['PREFIXES']['sdv1'],
            'sparql_endpoint': get_virtuoso_endpoint(),
            'archive_item': archive_item,
            'output_filename': output_filename,
            'mastergraph_host': settings.TRIPLE_DATABASE_MASTER['HOST'],
            'mastergraph_port':
            settings.TRIPLE_DATABASE_MASTER['KWARGS']['rexpro_port'],
            'mastergraph_graphname':
            settings.TRIPLE_DATABASE_MASTER['KWARGS']['graph'],
            'resource_namespace':
            settings.TRIPLE_DATABASE_MASTER['PREFIXES']['sdres'],
        }

        tree = ET.fromstring(render_to_string(
            'controller/aggregator/silk_rules.xml', context
        ))

        self.assertIn(
            (settings.TRIPLE_DATABASE['PREFIXES']['sdv1'], 'sd'),
            [(x.get('namespace'), x.get('id'))
             for x in tree.findall('.//Prefix')]
        )

        # check datasources
        datasources_dom = tree.findall('.//DataSource')
        self.assertEqual(len(datasources_dom), 2)
        self.assertEqual(datasources_dom[0].get('id'), 'master-graph')

        mastergraph, datasource = datasources_dom

        # check datasource endpoints
        self.assertEqual(
            get_virtuoso_endpoint(),
            datasource.find('Param[@name="endpointURI"]').get("value"),
        )

        # check datasources graph names
        self.assertEqual(
            mastergraph.find('Param[@name="graph"]').get('value'),
            settings.TRIPLE_DATABASE_MASTER["KWARGS"]["graph"]
        )
        self.assertEqual(
            archive_item.datagraph_mapped_name,
            datasource.find('Param[@name="graph"]').get("value")
        )

        # check tasks
        datasource_id = datasource.get('id')
        rules = tree.findall('.//Interlink')
        self.assertEqual(len(rules), 1)
        self.assertEqual(datasource_id, rules[0].get('id'))

        # check rules parameters
        rule = rules[0]
        self.assertEqual(
            rule.find('.//SourceDataset').get('dataSource'),
            datasource_id
        )
        self.assertEqual(
            rule.find('.//TargetDataset').get('dataSource'),
            'master-graph'
        )
        self.assertEqual(
            ET.tostring(rule.find('.//LinkageRule')).strip(),
            self.aggregator.silk_rule
        )
        self.assertEqual(
            rule.find('.//SourceDataset').find('RestrictTo').text.strip(),
            '?a rdf:type <{}> .'.format(self.aggregator.entity_type)
        )
        self.assertEqual(
            rule.find('.//TargetDataset').find('RestrictTo').text.strip(),
            'b -> {}'.format(self.aggregator.vertex_selector)
        )
        self.assertIsNone(rule.find('.//Filter').text)

        output = rule.find('.//Outputs').find('Output')
        self.assertEqual(output.get('type'), 'file')
        self.assertEqual(output.findall('Param')[0].get('name'), 'file')
        self.assertEqual(
            output.findall('Param')[0].get('value'), output_filename)
        self.assertEqual(output.findall('Param')[1].get('name'), 'format')
        self.assertEqual(output.findall('Param')[1].get('value'), 'ntriples')
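
Read together, the assertions in Example #7 (plus the host parameter checked in Examples #5 and #6) pin down the overall shape of the rendered Silk XML. The skeleton below is reconstructed from those assertions only: element and attribute names come from the tests, while every value, the root element name, and the exact nesting are placeholders. It is parsed with ElementTree merely to show that the implied structure is well-formed.

# XML skeleton implied by the test assertions above; all values, the root
# element, and the nesting depth are placeholders, not real template output.
import xml.etree.ElementTree as ET

SKELETON = """
<Silk>
  <Prefixes>
    <Prefix id="sd" namespace="http://example.org/sd/v1#"/>
  </Prefixes>
  <DataSources>
    <DataSource id="master-graph">
      <Param name="host" value="mastergraph-host"/>
      <Param name="graph" value="master-graph-name"/>
    </DataSource>
    <DataSource id="archiveitem-1">
      <Param name="endpointURI" value="http://example.org/sparql"/>
      <Param name="graph" value="archiveitem-1-graph"/>
    </DataSource>
  </DataSources>
  <Interlink id="archiveitem-1">
    <SourceDataset dataSource="archiveitem-1">
      <RestrictTo>?a rdf:type &lt;http://example.org/EntityType&gt; .</RestrictTo>
    </SourceDataset>
    <TargetDataset dataSource="master-graph">
      <RestrictTo>b -&gt; some_vertex_selector</RestrictTo>
    </TargetDataset>
    <LinkageRule><smart data="now" /></LinkageRule>
    <Filter />
    <Outputs>
      <Output type="file">
        <Param name="file" value="a_really_cool_filename.thm"/>
        <Param name="format" value="ntriples"/>
      </Output>
    </Outputs>
  </Interlink>
</Silk>
"""

tree = ET.fromstring(SKELETON)
assert tree.find('.//DataSource').get('id') == 'master-graph'
assert tree.find('.//Interlink/TargetDataset').get('dataSource') == 'master-graph'
assert tree.find('.//Output').get('type') == 'file'
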