import json
import logging
import time
from json import JSONDecodeError
from urllib.parse import urlparse
from uuid import uuid4

from django.http import JsonResponse
from django_celery_beat.models import PeriodicTask

# Project-local helpers and constants referenced below (import paths assumed,
# as they are not shown in this file): MongoConnection, FirebaseAuth,
# scrapy_daemon, delete_schedule_job, schedule_job_with_cron_tab,
# schedule_job_with_interval, is_valid_url, check_url_gives_response,
# INSTANT, INTERVAL, CRON, SCHEDULE_TASK_TYPE, INTERVAL_TASK_TYPE, HOT_TASK_TYPE.

logger = logging.getLogger(__name__)


def delete_crawl_job(request, job_id):
    if request.method == 'DELETE':
        try:
            mongo_connection = MongoConnection()
            json_data = mongo_connection.get_items("jobs", {'unique_id': job_id})
            if len(json_data) == 0:
                return JsonResponse({'Error': 'Requested job id ' + str(job_id) + ' does not exist'},
                                    status=400)
            # more than one record for the same unique_id means this is an
            # interval or cron job
            celery_task_name = ""
            if len(json_data) > 1:
                for obj in json_data:
                    if 'celery_task_name' in obj:
                        celery_task_name = obj['celery_task_name']
                        break
            delete_count = 0
            if json_data[0]['schedule_category'] == INSTANT:
                delete_count = mongo_connection.delete_items("jobs", {'unique_id': job_id})
            else:
                # delete the scheduled task from django-celery-beat first
                if not celery_task_name:
                    celery_task_name = json_data[0]['celery_task_name']
                delete_schedule_job(celery_task_name)
                delete_count = mongo_connection.delete_items("jobs", {'unique_id': job_id})
            if delete_count == 0:
                return JsonResponse({'Error': 'Delete action failed for the job_id: ' + str(job_id)},
                                    status=400)
        except Exception as e:
            return JsonResponse({'Error': 'Error while deleting the job from the database, ' + str(e)},
                                status=400)
        return JsonResponse({'Status': "SUCCESS", 'Message': 'Crawl job deleted successfully'})
def get_projects(request):
    try:
        json_data = json.loads(request.body)
        user_id = json_data['user_id']
        if not user_id:
            return JsonResponse({'Error': 'Request payload does not contain user_id'}, status=400)
    except (JSONDecodeError, KeyError):
        return JsonResponse({'Error': 'Request payload does not contain the required parameters or is empty'},
                            status=400)
    try:
        mongo_connection = MongoConnection()
        json_data = mongo_connection.get_items("projects", {'user_id': user_id})
    except Exception as e:
        return JsonResponse({'Error': 'Error while getting project details from the database, ' + str(e)},
                            status=400)
    return JsonResponse({'Status': "SUCCESS", 'data': json_data})
def get_jobs_by_project(request):
    try:
        json_data = json.loads(request.body)
        user_id = json_data['user_id']
        project_name = json_data['project_name']
    except (JSONDecodeError, KeyError) as e:
        return JsonResponse({'Error': 'Request payload is missing required parameters or is empty, ' + str(e)},
                            status=400)
    if not user_id:
        return JsonResponse({'Error': 'Request payload does not contain user_id'}, status=400)
    if not project_name:
        return JsonResponse({'Error': 'Request payload does not contain project_name'}, status=400)
    # fetch the jobs that belong to the given user and project
    try:
        mongo_connection = MongoConnection()
        json_data = mongo_connection.get_items("jobs", {'user_id': user_id,
                                                        'project_name': project_name})
    except Exception as e:
        return JsonResponse({'Error': 'Error while getting job details from the database, ' + str(e)},
                            status=400)
    return JsonResponse({'Status': "SUCCESS", 'data': json_data})
def project_create(request):
    # read the project details sent by the client
    try:
        json_data = json.loads(request.body)
        user_id = json_data['user_id']
        project_name = json_data['project_name']
        if not user_id:
            return JsonResponse({'Error': 'Request payload does not contain user_id'}, status=400)
        if not project_name:
            return JsonResponse({'Error': 'Request payload does not contain project_name'}, status=400)
    except (JSONDecodeError, KeyError):
        return JsonResponse({'Error': 'Request payload does not contain the required parameters or is empty'},
                            status=400)
    # user authorization
    token_header = request.headers.get('Token')
    auth = FirebaseAuth(token_header, user_id)
    if not auth:
        return JsonResponse({'Error': 'User authentication failed. Please try again with a valid user login'},
                            status=400)
    try:
        mongo_connection = MongoConnection()
        data_item = dict(json_data)
        query = {'user_id': data_item['user_id'], 'project_name': data_item['project_name']}
        mongo_connection.upsert_item(query, data_item, "projects")
    except Exception as e:
        return JsonResponse({'Error': 'Error while connecting to the MongoDB database, ' + str(e)},
                            status=400)
    return JsonResponse({'Status': "SUCCESS",
                         'Message': 'Project: ' + project_name + ' created successfully'})
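# For reference, a project_create request might look like the following. This is a
# minimal sketch: the endpoint path and field values are illustrative assumptions
# (the URL configuration is not shown in this file); the 'Token' header name comes
# from the request.headers.get('Token') call above.
#
#   POST /api/project_create
#   Token: <firebase-id-token>
#
#   {"user_id": "user-123", "project_name": "my-project"}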
def delete_crawl_task(request, task_id):
    if request.method == 'DELETE':
        try:
            mongo_connection = MongoConnection()
            delete_count = mongo_connection.delete_items("jobs", {'task_id': task_id})
            if delete_count == 0:
                return JsonResponse({'Error': 'Delete action failed for the task_id: ' + str(task_id)},
                                    status=400)
        except Exception as e:
            return JsonResponse({'Error': 'Error while deleting the job from the database, ' + str(e)},
                                status=400)
        return JsonResponse({'Status': "SUCCESS", 'Message': 'Crawl task deleted successfully'})
def get_job_data(request):
    # read the job lookup keys sent by the client
    try:
        json_data = json.loads(request.body)
    except JSONDecodeError as e:
        return JsonResponse({'Error': 'Request payload is missing or empty, ' + str(e)}, status=400)
    if "user_id" not in json_data:
        return JsonResponse({'Error': 'Missing user_id key in the request payload'}, status=400)
    if ("task_id" not in json_data) and ("unique_id" not in json_data):
        return JsonResponse({'Error': 'Missing unique_id or task_id key in the request payload'}, status=400)
    try:
        mongo_connection = MongoConnection()
        json_data = mongo_connection.get_items("jobs", json_data)
    except Exception as e:
        return JsonResponse({'Error': 'Error while getting job details from the database, ' + str(e)},
                            status=400)
    return JsonResponse({'Status': "SUCCESS", 'data': json_data})
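# Note: get_job_data passes the validated payload straight through as the MongoDB
# query filter, so a lookup can be as narrow as (illustrative values):
#
#   {"user_id": "user-123", "unique_id": "<uuid generated by crawl_new_job>"}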
def disable_schedule_job(request):
    if request.method == 'POST':
        task_name = ""
        try:
            json_data = json.loads(request.body)
            task_name = json_data['celery_task_name']
            is_enabled = json_data['is_enabled']
            if not isinstance(is_enabled, bool):
                return JsonResponse({'Error': 'is_enabled: ' + str(is_enabled)
                                              + ' is not a valid boolean parameter'}, status=400)
            # PeriodicTask.objects.get raises DoesNotExist for an unknown name,
            # which is handled by the except clause below
            task = PeriodicTask.objects.get(name=task_name)
            task.enabled = is_enabled
            task.save()
            mongo_connection = MongoConnection()
            if is_enabled:
                mongo_connection.update_item({"celery_task_name": task_name, "status": "DISABLED"},
                                             {'$set': {"status": "RUNNING"}}, "jobs")
                value = "enabled"
            else:
                mongo_connection.update_item({"celery_task_name": task_name, "status": "RUNNING"},
                                             {'$set': {"status": "DISABLED"}}, "jobs")
                value = "disabled"
            return JsonResponse({'Status': "SUCCESS",
                                 'Message': 'Successfully ' + value + ' the scheduled task_name: ' + task_name})
        except PeriodicTask.DoesNotExist:
            return JsonResponse({'Error': 'Scheduled task_name: ' + task_name
                                          + ' is invalid or does not exist'}, status=400)
        except Exception as e:
            return JsonResponse({'Status': "400 BAD",
                                 'Error': 'Error occurred while toggling the scheduled task task_name: '
                                          + task_name + ". " + str(e)}, status=400)
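# For reference, a disable_schedule_job request body might look like this. The
# task-name value is illustrative; 'is_enabled' must be a JSON boolean to pass
# the isinstance check above.
#
#   {"celery_task_name": "<name returned by the scheduler>", "is_enabled": false}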
def handle_fault_execution(request, schedule_time, exception):
    unique_id = request["unique_id"]
    job_url = request["url"]
    project_name = request["project_name"]
    job_name = request["job_name"]
    user_id = request["user_id"]
    crawler_name = request["crawler_name"]
    schedule_category = request["schedule_category"]
    # build the failed-job record to be written into the jobs collection
    data_item = {
        'unique_id': unique_id,
        'url': job_url,
        'project_name': project_name,
        'job_name': job_name,
        'user_id': user_id,
        'crawler_name': crawler_name,
        'task_id': 'Not Generated',
        'status': 'FAILED',
        'schedule_category': schedule_category,
        'schedule_time': schedule_time,
    }
    mongo_connection = MongoConnection()
    if schedule_category == "Instant":
        # update the relevant MongoDB entry in the jobs collection with task_id and status
        query = {'user_id': user_id, 'url': job_url, 'project_name': project_name,
                 'job_name': job_name, 'crawler_name': crawler_name}
        mongo_connection.upsert_item(query, data_item, "jobs")
    else:
        # store the job record in the MongoDB database
        mongo_connection.insert_item(data_item, "jobs")
    logger.exception("Job could not be scheduled. Hence, job execution failed. " + str(exception))
def get_jobs(request):
    try:
        json_data = json.loads(request.body)
    except JSONDecodeError as e:
        return JsonResponse({'Error': 'Request payload is missing or empty, ' + str(e)}, status=400)
    # use the client-supplied fields directly as the MongoDB query filter
    try:
        mongo_connection = MongoConnection()
        json_data = mongo_connection.get_items("jobs", json_data)
    except Exception as e:
        return JsonResponse({'Error': 'Error while getting job details from the database, ' + str(e)},
                            status=400)
    return JsonResponse({'Status': "SUCCESS", 'data': json_data})
def crawl_new_job(request):
    # POST requests create new crawling tasks
    if request.method == 'POST':
        # read the job definition sent by the client
        try:
            json_data = json.loads(request.body)
            url_data = json_data['urls']
            job_name = json_data['job_name']
            project_name = json_data['project_name']
            user_id = json_data['user_id']
            crawler_name = json_data['crawler_name']
            schedule_type = json_data['schedule_type']
            schedule_data = json_data['schedule_data']
        except (JSONDecodeError, KeyError) as e:
            return JsonResponse({'Error': 'Missing fields in the request payload or empty, ' + str(e)},
                                status=400)
        if not user_id:
            return JsonResponse({'Error': 'Missing user_id key in the request payload'}, status=400)
        if not job_name:
            return JsonResponse({'Error': 'Missing job_name key in the request payload'}, status=400)
        if not url_data:
            return JsonResponse({'Error': 'Missing urls key in the request payload'}, status=400)
        if not project_name:
            return JsonResponse({'Error': 'Missing project_name key in the request payload'}, status=400)
        if not crawler_name:
            return JsonResponse({'Error': 'Missing crawler_name key in the request payload'}, status=400)
        if not schedule_type:
            return JsonResponse({'Error': 'Missing schedule_type key in the request payload'}, status=400)
        if schedule_type not in (SCHEDULE_TASK_TYPE, INTERVAL_TASK_TYPE, HOT_TASK_TYPE):
            return JsonResponse({'Error': 'Requested schedule_type: ' + schedule_type
                                          + ' is not a valid type'}, status=400)
        publish_url_ids = []
        for url in url_data:
            if not is_valid_url(url):
                return JsonResponse({'Error': url + ' URL is invalid'}, status=400)
            unique_id = str(uuid4())  # create a unique ID per URL
            publish_data = {
                'unique_id': unique_id,
                'job_name': job_name,
                'url': url,
                'project_name': project_name,
                'user_id': user_id,
                'crawler_name': crawler_name,
                'task_id': '',
            }
            try:
                # schedule the job with the Celery task scheduler
                if schedule_type == SCHEDULE_TASK_TYPE:
                    publish_data['schedule_data'] = schedule_data
                    publish_data['schedule_category'] = CRON
                    publish_data['status'] = "RUNNING"
                    celery_task = schedule_job_with_cron_tab(publish_data)
                elif schedule_type == INTERVAL_TASK_TYPE:
                    publish_data['schedule_data'] = schedule_data
                    publish_data['schedule_category'] = INTERVAL
                    publish_data['status'] = "RUNNING"
                    celery_task = schedule_job_with_interval(publish_data)
                else:
                    publish_data['schedule_category'] = INSTANT
                    publish_data['status'] = "PENDING"
                    celery_task = schedule_cron_job.delay(kwargs=json.dumps(publish_data))
                # the schedulers return a JsonResponse on failure; propagate it
                if isinstance(celery_task, JsonResponse):
                    return celery_task
                publish_url_ids.append(unique_id)
                publish_data['celery_task_name'] = celery_task.name
                try:
                    # store the job record in the MongoDB database
                    query = {'user_id': user_id, 'job_name': job_name, 'url': url,
                             'project_name': project_name, 'crawler_name': crawler_name}
                    mongo_connection = MongoConnection()
                    mongo_connection.upsert_item(query, publish_data, "jobs")
                except Exception as e:
                    return JsonResponse({'Error': 'Error while connecting to the MongoDB database, ' + str(e)},
                                        status=400)
            except Exception as e:
                return JsonResponse({'Status': "400 BAD",
                                     'Error': 'Error occurred while scheduling the data with the '
                                              'Celery executor, ' + str(e)}, status=400)
        return JsonResponse({'Status': "SUCCESS",
                             'Message': "Crawl job scheduled successfully.\n job_ids:" + str(publish_url_ids)})
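# For reference, a crawl_new_job payload might look like this. The concrete string
# values of SCHEDULE_TASK_TYPE / INTERVAL_TASK_TYPE / HOT_TASK_TYPE and the shape
# of schedule_data are assumptions inferred from the branches above, not confirmed
# in this file.
#
#   {
#       "urls": ["https://example.com/products"],
#       "job_name": "product-crawl",
#       "project_name": "my-project",
#       "user_id": "user-123",
#       "crawler_name": "crawlerx",
#       "schedule_type": "<one of SCHEDULE_TASK_TYPE / INTERVAL_TASK_TYPE / HOT_TASK_TYPE>",
#       "schedule_data": {"every": 10, "period": "minutes"}
#   }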
# Celery task entry point: in the full source this function is registered as a
# bound Celery task (a bind=True decorator is assumed here, since crawl_new_job
# invokes schedule_cron_job.delay and the first parameter is self).
def schedule_cron_job(self, **kwargs):
    json_body = ""
    schedule_time = str(time.time())
    try:
        json_body = kwargs
        if "kwargs" in json_body:
            json_body = json.loads(json_body['kwargs'])
        unique_id = json_body['unique_id']
        job_url = json_body["url"]
        project_name = json_body["project_name"]
        job_name = json_body["job_name"]
        user_id = json_body["user_id"]
        status = json_body["status"]
        crawler_name = json_body["crawler_name"]
        schedule_category = json_body["schedule_category"]
        if not unique_id or not job_url or not project_name or not user_id or not status \
                or not crawler_name or not job_name:
            raise Exception('Required parameters are missing in the consumed message')
        if check_url_gives_response(job_url):
            job_domain = urlparse(job_url).netloc
            try:
                settings = {
                    'unique_id': unique_id,
                    'user_id': user_id,
                    'job_name': job_name,
                    'project_name': project_name,
                    'schedule_time': schedule_time,
                    'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
                }
                # to list the available spiders in the project:
                # print(scrapy_daemon.list_spiders("crawlerx_project"))
                # schedule a crawl job with the project and a specific spider
                task_id = scrapy_daemon.schedule("crawlerx_project", crawler_name,
                                                 settings=settings, url=job_url, domain=job_domain)
                mongo_connection = MongoConnection()
                data_item = {
                    'unique_id': unique_id,
                    'url': job_url,
                    'project_name': project_name,
                    'job_name': job_name,
                    'user_id': user_id,
                    'crawler_name': crawler_name,
                    'task_id': task_id,
                    'status': 'RUNNING',
                    'schedule_category': schedule_category,
                    'schedule_time': schedule_time,
                }
                if schedule_category == "Instant":
                    # update the relevant MongoDB entry in the jobs collection with task_id and status
                    query = {'user_id': user_id, 'url': job_url, 'project_name': project_name,
                             'job_name': job_name, 'crawler_name': crawler_name}
                    mongo_connection.upsert_item(query, data_item, "jobs")
                else:
                    # store the job record in the MongoDB database
                    mongo_connection.insert_item(data_item, "jobs")
                # task id of the crawl job
                logger.info("Crawling job has been started with ID - " + task_id)
            except Exception as e:
                handle_fault_execution(json_body, schedule_time, e)
        else:
            handle_fault_execution(json_body, schedule_time,
                                   Exception("Current job URL does not seem to be available. "
                                             "Hence, job execution failed."))
    except Exception as e:
        handle_fault_execution(json_body, schedule_time, e)