Example #1
def create_from_json(records, apply_async=True):
    """Create a workflow object per record in the dump and start the 'articles_upload' workflow for it."""
    current_app.logger.info('Loading dump...')

    for i, record in enumerate(records['records']):
        # A fresh workflow engine for this record, bound to the
        # 'articles_upload' workflow.
        engine = WorkflowEngine.with_name("articles_upload")
        engine.save()
        obj = workflow_object_class.create(data=record)
        obj.id_workflow = str(engine.uuid)
        extra_data = {}
        record_extra = record.pop('extra_data', {})
        if record_extra:
            extra_data['record_extra'] = record_extra

        # Keep a pristine copy of the record and its extra data so the
        # workflow can later be restarted from the original source data.
        obj.extra_data['source_data'] = {
            'data': copy.deepcopy(record),
            'extra_data': copy.deepcopy(extra_data),
        }
        obj.extra_data.update(extra_data)

        obj.data_type = current_app.config['CRAWLER_DATA_TYPE']
        obj.save()
        db.session.commit()

        job_id = uuid1()

        # Link the new workflow object to the freshly generated crawler job id.
        crawler_object = CrawlerWorkflowObject(job_id=job_id, object_id=obj.id)
        db.session.add(crawler_object)
        queue = current_app.config['CRAWLER_CELERY_QUEUE']

        # Either enqueue the workflow on the crawler Celery queue or run it
        # synchronously in-process.
        if apply_async:
            start.apply_async(
                kwargs={
                    'workflow_name': "articles_upload",
                    'object_id': obj.id,
                },
                queue=queue,
            )
        else:
            start(workflow_name="articles_upload", object_id=obj.id)

        current_app.logger.info('Parsed record {}.'.format(i))
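
A minimal usage sketch for the function above, assuming the dump is a JSON file whose top-level object holds a 'records' list; the file name records_dump.json and the record contents are hypothetical:

import json

# Hypothetical dump; the expected shape is {"records": [{...}, {...}]},
# where each record may optionally carry its own "extra_data" dict.
with open('records_dump.json') as dump_file:
    records = json.load(dump_file)

# Enqueue one "articles_upload" workflow per record on the Celery queue...
create_from_json(records, apply_async=True)

# ...or, e.g. in tests, run each workflow synchronously in-process.
create_from_json(records, apply_async=False)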
Example #2
def submit_results(job_id, errors, log_file, results_uri, results_data=None):
    """Receive the submission of the results of a crawl job.

    Then it spawns the appropiate workflow according to whichever workflow
    the crawl job specifies.

    :param job_id: Id of the crawler job.
    :param errors: Errors that happened, if any (seems ambiguous)
    :param log_file: Path to the log file of the crawler job.
    :param results_uri: URI to the file containing the results of the crawl
       job, namely the records extracted.
    :param results_data: Optional data payload with the results list, to skip
        retrieving them from the `results_uri`, useful for slow or unreliable
        storages.
    """
    results_path = urlparse(results_uri).path
    job = CrawlerJob.get_by_job(job_id)
    job.logs = log_file
    job.results = results_uri

    # If the crawl itself failed, mark the job as errored and bail out.
    if errors:
        job.status = JobStatus.ERROR
        job.save()
        db.session.commit()
        raise CrawlerJobError(str(errors))

    # Fall back to reading the results from the file referenced by results_uri.
    if results_data is None:
        results_data = _extract_results_data(results_path)

    for crawl_result in results_data:
        crawl_result = copy.deepcopy(crawl_result)
        # A malformed crawl result (missing 'record' or 'errors') is converted
        # into an error result instead of aborting the whole submission.
        try:
            _check_crawl_result_format(crawl_result)
        except KeyError as e:
            crawl_result = _crawl_result_from_exception(e, crawl_result)

        record = crawl_result.pop('record')
        crawl_errors = crawl_result['errors']

        current_app.logger.debug('Parsing record: {}'.format(record))
        engine = WorkflowEngine.with_name(job.workflow)
        engine.save()
        obj = workflow_object_class.create(data=record)
        obj.id_workflow = str(engine.uuid)
        # Records whose crawl reported errors are stored in ERROR state and
        # never started; the full crawl result is kept for later inspection.
        if crawl_errors:
            obj.status = ObjectStatus.ERROR
            obj.extra_data['crawl_errors'] = crawl_result
        else:
            extra_data = {
                'crawler_job_id': job_id,
                'crawler_results_path': results_path,
            }
            record_extra = record.pop('extra_data', {})
            if record_extra:
                extra_data['record_extra'] = record_extra

            obj.extra_data['source_data'] = {
                'data': copy.deepcopy(record),
                'extra_data': copy.deepcopy(extra_data),
            }
            obj.extra_data.update(extra_data)

        obj.data_type = current_app.config['CRAWLER_DATA_TYPE']
        obj.save()
        db.session.commit()

        crawler_object = CrawlerWorkflowObject(job_id=job_id, object_id=obj.id)
        db.session.add(crawler_object)
        queue = current_app.config['CRAWLER_CELERY_QUEUE']

        # Only start the workflow for records that were crawled cleanly.
        if not crawl_errors:
            start.apply_async(
                kwargs={
                    'workflow_name': job.workflow,
                    'object_id': obj.id,
                },
                queue=queue,
            )

    current_app.logger.info('Parsed {} records.'.format(len(results_data)))

    job.status = JobStatus.FINISHED
    job.save()
    db.session.commit()
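
A minimal call sketch for submit_results, assuming a CrawlerJob row with the given id already exists; every literal below (job id, paths, record content) is hypothetical, and results_data follows the shape the loop above expects (a list of dicts with 'record' and 'errors' keys):

submit_results(
    job_id='some-crawler-job-id',                     # hypothetical job id
    errors=None,                                      # no crawl-level errors
    log_file='/var/log/crawler/some-crawler-job.log',
    results_uri='file:///data/crawls/some-crawler-job/results.jsonl',
    results_data=[
        {
            'record': {'titles': [{'title': 'A crawled article'}]},
            'errors': [],
        },
    ],
)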