def start_broad_crawl_job(workspace_id, num_to_fetch, broadness, broad_crawler_provider, broad_crawler_sources, crawl_type):
    # check there is trained data
    categorized_urls = get_seeds_urls_categorized(workspace_id)
    if 'relevant' not in categorized_urls or len(categorized_urls['relevant']) == 0:
        raise InvalidUsage("No trained URLS!", status_code=409)

    job_id = save_job(workspace_id,
                      num_to_fetch=int(num_to_fetch),
                      broad_crawler_provider=broad_crawler_provider,
                      broad_crawler_sources=broad_crawler_sources,
                      crawl_type=crawl_type)
    job_id = str(job_id)

    queue_broad_crawl(workspace_id,
                      job_id=job_id,
                      num_to_fetch=int(num_to_fetch),
                      broadness=broadness,
                      broad_crawler_provider=broad_crawler_provider,
                      broad_crawler_sources=broad_crawler_sources)
    return job_id
def broad_crawl_publication_api(workspace_id):
    try:
        num_to_fetch = request.json['nResults']
        broad_crawler_provider = request.json['crawlProvider']
        broad_crawler_sources = request.json['crawlSources']
        logging.info("Going to fetch %s urls with broad crawl", num_to_fetch)
        # 'broadness' is a required argument of start_broad_crawl_job;
        # fall back to None when the client does not send it
        job_id = start_broad_crawl_job(workspace_id,
                                       num_to_fetch=int(num_to_fetch),
                                       broadness=request.json.get('broadness'),
                                       broad_crawler_provider=broad_crawler_provider,
                                       broad_crawler_sources=broad_crawler_sources,
                                       crawl_type="BROADCRAWL")
        if job_id is None:
            return Response(json.dumps({"errorMessage": "No keywords provided", "error": 2002}),
                            mimetype="application/json")
        return Response(json.dumps({"jobId": job_id}), mimetype="application/json")
    except NameError as e:
        raise InvalidUsage(str(e), status_code=409)
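# Illustrative request body for broad_crawl_publication_api above. This is a
# sketch only: the field names (nResults, crawlProvider, crawlSources) come
# from the handler, while the values are hypothetical and "HH-JOOGLE" is just
# the provider constant used elsewhere in this module.
#
#     {
#         "nResults": 100,
#         "crawlProvider": "HH-JOOGLE",
#         "crawlSources": ["..."]
#     }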
def add_workspace_api():
    try:
        name = request.data
        add_workspace(name)
        in_doc = list_workspace()
        out_doc = JSONEncoder().encode(in_doc)
        return Response(out_doc, mimetype="application/json")
    except AddingWorkspaceError:
        raise InvalidUsage('A workspace with that name already exists', status_code=409)
def edit_api(id):
    roles = None
    active = None
    if 'roles' in request.json:
        roles = request.json['roles']
    if 'active' in request.json:
        active = bool(request.json['active'])
    if active is None and roles is None:
        raise InvalidUsage("no update provided", status_code=409)
    update_user(id, active, roles)
    return Response("{}", mimetype="application/json")
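# Illustrative request body for edit_api above (a sketch; the role value is
# hypothetical, the field names come from the handler). Either field may be
# omitted, but at least one of them must be present:
#
#     {
#         "roles": ["admin"],
#         "active": true
#     }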
def start_smart_crawl_job(workspace_id, num_to_fetch, broadness):
    urls = __get_urls(workspace_id)
    if len(urls) == 0:
        raise InvalidUsage("No trained URLS!", status_code=409)

    job_id = save_smart_crawl_job(workspace_id,
                                  num_to_fetch=int(num_to_fetch),
                                  broadness=broadness)
    page_model = __get_page_model(workspace_id)
    __queue_smart_crawl_start(workspace_id,
                              job_id=job_id,
                              page_limit=int(num_to_fetch),
                              broadness=broadness,
                              urls=urls,
                              page_model=page_model)
    return job_id
def create_account(username):
    password = request.json['password']
    encrypted_password = utils.encrypt_password(password)
    try:
        Singleton.getInstance().user_datastore.create_user(
            email=username,
            password=encrypted_password,
            roles=[],
            active=True,
            login_count=0)
    except NotUniqueError:
        raise InvalidUsage('A user with that email already exists', status_code=409)
    return Response("{}", mimetype="application/json")
def schedule_spider_searchengine_api(workspace_id):
    num_to_fetch = request.json['nResults']
    broad_crawler_provider = request.json['crawlProvider']
    broad_crawler_sources = request.json['crawlSources']
    try:
        job_id = schedule_spider_searchengine(workspace_id,
                                              num_to_fetch=int(num_to_fetch),
                                              broad_crawler_provider=broad_crawler_provider,
                                              broad_crawler_sources=broad_crawler_sources)
        return Response(json.dumps({"jobId": job_id}), mimetype="application/json")
    except NameError as e:
        raise InvalidUsage(str(e), status_code=409)
def smart_crawl_publication_api(workspace_id):
    try:
        num_to_fetch = request.json['nResults']
        broadness = request.json['broadness']
        job_id = start_smart_crawl_job(workspace_id,
                                       num_to_fetch=int(num_to_fetch),
                                       broadness=broadness)
        if job_id is None:
            return Response(json.dumps({"errorMessage": "Job failed to start", "error": 2002}),
                            mimetype="application/json")
        return Response(json.dumps({"jobId": job_id}), mimetype="application/json")
    except NameError as e:
        raise InvalidUsage(str(e), status_code=409)
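# Illustrative request body for smart_crawl_publication_api above (a sketch;
# the values are placeholders, only the field names come from the handler):
#
#     {
#         "nResults": 100,
#         "broadness": "..."
#     }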
def start_deep_crawl_job(workspace_id, num_to_fetch, selection):
    broad_crawler_provider = "HH-JOOGLE"
    crawl_type = "DEEPCRAWL"

    # keep only the sources that were fully or partially selected
    broad_crawler_sources = []
    for key, value in selection.iteritems():
        if value["allSelected"] or len(value["selected"]) > 0:
            broad_crawler_sources.append(key)

    urls = __get_seeds_url_by_selection(workspace_id, selection)
    domains = extract_domains_from_urls(urls)
    if len(urls) == 0:
        raise InvalidUsage("No Seed URLS were selected!", status_code=409)

    job_id = save_job(workspace_id,
                      num_to_fetch=int(num_to_fetch),
                      crawler_provider=broad_crawler_provider,
                      crawler_sources=broad_crawler_sources,
                      crawl_type=crawl_type,
                      status="STARTED")

    # rename credential fields to the format expected by the deep crawler
    login_credentials = get_successful_logins(workspace_id, domains)
    for doc in login_credentials:
        if "keyValues" in doc:
            doc["key_values"] = doc["keyValues"]
            doc.pop('keyValues', None)
        doc["id"] = doc["_id"]
        doc.pop('_id', None)

    queue_deep_crawl_start(workspace_id,
                           job_id=job_id,
                           num_to_fetch=num_to_fetch,
                           urls=urls,
                           login_credentials=login_credentials)
    return job_id
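# Illustrative shape of the `selection` argument consumed by
# start_deep_crawl_job above. This is a sketch: the source key is
# hypothetical, and showing seed URLs inside "selected" is an assumption based
# on the call to __get_seeds_url_by_selection. A source is kept when
# "allSelected" is true or "selected" is non-empty:
#
#     {
#         "GOOGLE": {
#             "allSelected": False,
#             "selected": ["http://example.com"]
#         }
#     }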