Example 1
def handle_fault_execution(request, schedule_time, exception):
    unique_id = request["unique_id"]
    job_url = request["url"]
    project_name = request["project_name"]
    job_name = request["job_name"]
    user_id = request["user_id"]
    crawler_name = request["crawler_name"]
    schedule_category = request["schedule_category"]

    # build the failed-job record for the MongoDB jobs collection (no task_id generated, status FAILED)
    update_data = u'{ "unique_id": "' + unique_id + '", "url": "' + job_url + '", "project_name": "' \
                  + project_name + '", "job_name": "' + job_name + '", "user_id": "' + user_id \
                  + '", "crawler_name": "' + crawler_name \
                  + '", "task_id": "Not Generated", "status": "FAILED" }'

    data_item = json.loads(update_data)
    data_item['schedule_category'] = schedule_category
    data_item['schedule_time'] = schedule_time

    mongo_connection = MongoConnection()
    if schedule_category == "Instant":
        # update the relevant MongoDB entry in the jobs collection with task_id and status
        query = {'user_id': user_id, 'url': job_url, 'project_name': project_name, 'job_name': job_name,
                 'crawler_name': crawler_name}
        mongo_connection.upsert_item(query, data_item, "jobs")
    else:
        # store job records in MongoDB database
        mongo_connection.insert_item(data_item, "jobs")
    logger.exception("Cannot schedule the job with an invalid date or time format. "
                     "Hence, job execution failed. " + str(exception))
Example 2
def project_create(request):
    # read project parameters from the client request body.
    try:
        json_data = json.loads(request.body)
        user_id = json_data['user_id']
        project_name = json_data['project_name']
        if not user_id:
            return JsonResponse(
                {'Error': 'Request payload does not contain user_id'},
                status=400)

        if not project_name:
            return JsonResponse(
                {'Error': 'Request payload does not contain project_name'},
                status=400)

    except (JSONDecodeError, KeyError):
        return JsonResponse(
            {
                'Error':
                'Request payload is missing required parameters or is empty'
            },
            status=400)

    # user Authorization
    token_header = request.headers.get('Token')
    auth = FirebaseAuth(token_header, user_id)

    if not auth:
        return JsonResponse(
            {
                'Error':
                'User authentication failed. Please try again with a valid user login'
            },
            status=400)

    try:
        mongo_connection = MongoConnection()
        data_item = dict(json_data)
        query = {
            'user_id': data_item['user_id'],
            'project_name': data_item['project_name']
        }
        mongo_connection.upsert_item(query, data_item, "projects")
    except Exception as e:
        return JsonResponse(
            {
                'Error':
                'Error while connecting to the MongoDB database, ' + str(e)
            },
            status=400)

    return JsonResponse({
        'status':
        "SUCCESS",
        'Message':
        'Project: ' + project_name + ' created successfully'
    })
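
Both views above, and the examples that follow, persist their records through a MongoConnection helper exposing upsert_item(query, item, collection) and insert_item(item, collection). Its implementation is not part of these excerpts; a minimal sketch using pymongo, with the host, port, and database name as assumptions, could look like this:

# Minimal sketch of the MongoConnection helper assumed by these examples.
# Host, port, and database name are placeholders, not values from the project.
from pymongo import MongoClient


class MongoConnection:
    def __init__(self, host="localhost", port=27017, db_name="crawlerx_db"):
        self.client = MongoClient(host, port)
        self.db = self.client[db_name]

    def insert_item(self, item, collection):
        # store a new record in the given collection
        self.db[collection].insert_one(item)

    def upsert_item(self, query, item, collection):
        # update the matching record, or create it if none exists
        self.db[collection].update_one(query, {"$set": item}, upsert=True)
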
Example 3
def crawl_new_job(request):
    # Post requests are for new crawling tasks
    if request.method == 'POST':

        # read crawl job parameters from the client request body.
        try:
            json_data = json.loads(request.body)
            url_data = json_data['urls']
            job_name = json_data['job_name']
            project_name = json_data['project_name']
            user_id = json_data['user_id']
            crawler_name = json_data['crawler_name']
            schedule_type = json_data['schedule_type']
            schedule_data = json_data['schedule_data']
        except (JSONDecodeError, KeyError) as e:
            return JsonResponse({'Error': 'Missing or empty fields in the request payload, ' + str(e)}, status=400)

        if not user_id:
            return JsonResponse({'Error': 'Missing user_id key in the request payload'}, status=400)

        if not job_name:
            return JsonResponse({'Error': 'Missing job_name key in the request payload'}, status=400)

        if not url_data:
            return JsonResponse({'Error': 'Missing urls key in the request payload'}, status=400)

        if not project_name:
            return JsonResponse({'Error': 'Missing project_name key in the request payload'}, status=400)

        if not crawler_name:
            return JsonResponse({'Error': 'Missing crawler_name key in the request payload'}, status=400)

        if not schedule_type:
            return JsonResponse({'Error': 'Missing schedule_type key in the request payload'}, status=400)

        if schedule_type not in (SCHEDULE_TASK_TYPE, INTERVAL_TASK_TYPE, HOT_TASK_TYPE):
            return JsonResponse({'Error': 'Requested schedule_type:' + schedule_type + ' is not a valid type'},
                                status=400)

        publish_url_ids = []
        for url in url_data:
            if not is_valid_url(url):
                return JsonResponse({'Error': url + ' URL is invalid'}, status=400)

            unique_id = str(uuid4())  # create a unique ID.
            publish_data = u'{ "unique_id": "' + unique_id + '", "job_name": "' + job_name \
                           + '", "url": "' + url + '", "project_name": "' \
                           + project_name + '", "user_id": "' + user_id + '", "crawler_name": "' + crawler_name \
                           + '", "task_id":"" }'

            publish_data = json.loads(publish_data)
            try:
                # schedule data with celery task scheduler
                if schedule_type == SCHEDULE_TASK_TYPE:
                    publish_data['schedule_data'] = schedule_data
                    publish_data['schedule_category'] = CRON
                    publish_data['status'] = "RUNNING"
                    celery_task = schedule_job_with_cron_tab(publish_data)
                elif schedule_type == INTERVAL_TASK_TYPE:
                    publish_data['schedule_data'] = schedule_data
                    publish_data['schedule_category'] = INTERVAL
                    publish_data['status'] = "RUNNING"
                    celery_task = schedule_job_with_interval(publish_data)
                else:
                    publish_data['schedule_category'] = INSTANT
                    publish_data['status'] = "PENDING"
                    celery_task = schedule_cron_job.delay(kwargs=json.dumps(publish_data))

                if isinstance(celery_task, JsonResponse):
                    return celery_task

                publish_url_ids.append(unique_id)
                publish_data['celery_task_name'] = celery_task.name
                try:
                    # store job records in MongoDB database
                    query = {'user_id': user_id, 'job_name': job_name, 'url': url,
                             'project_name': project_name, 'crawler_name': crawler_name}
                    mongo_connection = MongoConnection()
                    mongo_connection.upsert_item(query, publish_data, "jobs")
                except Exception as e:
                    return JsonResponse({'Error': 'Error while connecting to the MongoDB database, ' + str(e)},
                                        status=400)
            except Exception as e:
                return JsonResponse({'Status': "400 BAD",
                                     'Error': 'Error occurred while scheduling the data with the Celery executor, '
                                              + str(e)}, status=400)

        return JsonResponse({'status': "SUCCESS", 'Message': "Crawl job scheduled successfully.\n job_ids:"
                                                             + str(publish_url_ids)})
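
As a usage reference, a client request to the view above carries the fields read at the top of the function. The values below, the endpoint path, and the crawler name are illustrative placeholders; the accepted schedule_type strings are the SCHEDULE_TASK_TYPE, INTERVAL_TASK_TYPE, and HOT_TASK_TYPE constants defined elsewhere in the project.

# Illustrative request to crawl_new_job; field names come from the view above,
# while every value and the endpoint path are assumptions made for this example.
import requests

payload = {
    "user_id": "user-123",
    "project_name": "my_project",
    "job_name": "news_crawl",
    "urls": ["https://example.com/articles"],
    "crawler_name": "link_crawler",   # hypothetical spider name
    "schedule_type": "instant",       # must equal one of the *_TASK_TYPE constants (values not shown here)
    "schedule_data": {},              # cron or interval settings for scheduled runs
}
response = requests.post("http://localhost:8000/crawl/", json=payload)  # placeholder URL
print(response.status_code, response.json())
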
Example 4
def schedule_cron_job(self, **kwargs):
    json_body = ""
    schedule_time = str(time.time())
    try:
        json_body = kwargs
        if "kwargs" in json_body:
            json_body = json.loads(json_body['kwargs'])
        unique_id = json_body['unique_id']
        job_url = json_body["url"]
        project_name = json_body["project_name"]
        job_name = json_body["job_name"]
        user_id = json_body["user_id"]
        status = json_body["status"]
        crawler_name = json_body["crawler_name"]
        schedule_category = json_body["schedule_category"]

        if not unique_id or not job_url or not project_name or not user_id or not status \
                or not crawler_name or not job_name:
            raise Exception('Required parameters are missing in the consumed message')

        if check_url_gives_response(job_url):
            job_domain = urlparse(job_url).netloc
            try:
                settings = {
                    'unique_id': unique_id,
                    'user_id': user_id,
                    'job_name': job_name,
                    'project_name': project_name,
                    'schedule_time': schedule_time,
                    'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
                }

                # to list available spiders in the project
                # print(scrapy_daemon.list_spiders("crawlerx_project"))

                # Schedule a crawl job with project and a specific spider
                task_id = \
                    scrapy_daemon.schedule("crawlerx_project", crawler_name, settings=settings,
                                           url=job_url, domain=job_domain)

                mongo_connection = MongoConnection()

                job_data = u'{ "unique_id": "' + unique_id + '", "url": "' + job_url + '", "project_name": "' \
                           + project_name + '", "job_name": "' + job_name + '", "user_id": "' + user_id \
                           + '", "crawler_name": "' + crawler_name \
                           + '", "task_id": "' + task_id + '", "status": "RUNNING" }'
                data_item = json.loads(job_data)
                data_item['schedule_category'] = schedule_category
                data_item['schedule_time'] = schedule_time

                if schedule_category == "Instant":
                    # update the relevant MongoDB entry in the jobs collection with task_id and status
                    query = {'user_id': user_id, 'url': job_url, 'project_name': project_name, 'job_name': job_name,
                             'crawler_name': crawler_name}
                    mongo_connection.upsert_item(query, data_item, "jobs")
                else:
                    # store job records in MongoDB database
                    mongo_connection.insert_item(data_item, "jobs")

                # task id of the crawl job
                logger.info("Crawling job has been started with ID - " + task_id)
            except Exception as e:
                handle_fault_execution(json_body, schedule_time, e)
        else:
            handle_fault_execution(json_body, schedule_time,
                                   Exception("Current job URL does not seem to be available. Hence, job execution failed."))
    except Exception as e:
        handle_fault_execution(json_body, schedule_time, e)
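
Example 3 dispatches this function with schedule_cron_job.delay(kwargs=json.dumps(publish_data)), and the self parameter above suggests it is registered as a bound Celery task. A minimal sketch of that wiring, with the application name and broker URL as assumptions, might be:

# Sketch of the Celery wiring implied by Examples 3 and 4.
# The application name and broker URL are placeholders, not values from the project.
from celery import Celery

app = Celery("crawlerx", broker="redis://localhost:6379/0")


@app.task(bind=True)
def schedule_cron_job(self, **kwargs):
    ...  # body as shown in Example 4


# Instant jobs are then queued as in Example 3:
# schedule_cron_job.delay(kwargs=json.dumps(publish_data))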