def schedule_spider_searchengine(workspace_id, num_to_fetch, crawler_provider,
                                 crawler_sources, keyword_source_type):
    keywords = dao_get_keywords_by_relevance(workspace_id)
    categorized_urls = get_seeds_urls_categorized(workspace_id)
    job_id = save_job(workspace_id,
                      num_to_fetch=int(num_to_fetch),
                      crawler_provider=crawler_provider,
                      crawler_sources=crawler_sources,
                      crawl_type="KEYWORDS",
                      keyword_source_type=keyword_source_type)
    message = {
        'workspace': workspace_id,
        'jobId': job_id,
        'crawlProvider': crawler_provider,
        'crawlSources': crawler_sources,
        'strTimestamp': strftime("%Y-%m-%d %H:%M:%S", gmtime()),
        'keywordSourceType': keyword_source_type,
        'included': keywords['included'],
        'excluded': keywords['excluded'],
        'relevantUrl': categorized_urls['relevant'],
        'irrelevantUrl': categorized_urls['irrelevant'],
        'nResults': int(num_to_fetch),
        'existentUrl': get_seeds_urls_url(workspace_id)
    }
    Singleton.getInstance().broker_service.add_message_to_googlecrawler(message)
    return job_id
def get_modeler_progress(workspace_id):
    collection = Singleton.getInstance().mongo_instance.get_workspace_collection()
    cursor = collection.find({'_id': ObjectId(workspace_id)}, {
        "page_model.model": 1,
        "page_model.quality": 1,
        "page_model.percentage_done": 1
    })
    docs = list(cursor)
    page_model_progress = {}
    for doc in docs:
        model = False
        quality = []
        percentage_done = 0
        if "page_model" in doc and "percentage_done" in doc["page_model"]:
            percentage_done = doc["page_model"]["percentage_done"]
            # the model is considered ready once progress passes 99%
            if percentage_done > 99:
                model = True
                result = Singleton.getInstance().es_client.get_modeler_model_results(workspace_id)
                quality = json.loads(result["quality"])
        page_model_progress["model"] = model
        page_model_progress["quality"] = quality
        page_model_progress["percentageDone"] = percentage_done
    return page_model_progress
def publish_to_import_url_queue(workspace_id, url):
    # NOTE: a second publish_to_import_url_queue appears below; if both live
    # in one module, the later definition shadows this one
    metadata = build_metadata(workspace_id)
    metadata["keywordSourceType"] = "MANUAL"
    message = {'url': url, 'isRelevant': True, 'metadata': metadata}
    Singleton.getInstance().broker_service.add_message_to_import_url(message)
def schedule_spider_searchengine(workspace_id, num_to_fetch, broad_crawler_provider,
                                 broad_crawler_sources):
    # NOTE: same name as the keyword_source_type variant above; if both are
    # defined in one module, this later definition shadows the earlier one
    keywords = dao_get_keywords_by_relevance(workspace_id)
    categorized_urls = get_seeds_urls_categorized(workspace_id)
    job_id = save_job(workspace_id,
                      num_to_fetch=int(num_to_fetch),
                      broad_crawler_provider=broad_crawler_provider,
                      broad_crawler_sources=broad_crawler_sources,
                      crawl_type="KEYWORDS")
    message = {
        'included': keywords['included'],
        'excluded': keywords['excluded'],
        'relevantUrl': categorized_urls['relevant'],
        'irrelevantUrl': categorized_urls['irrelevant'],
        'nResults': int(num_to_fetch),
        'existentUrl': get_seeds_urls_url(workspace_id),
        'workspace': workspace_id,
        'jobId': job_id,
        'crawlProvider': broad_crawler_provider,
        'crawlSources': broad_crawler_sources
    }
    Singleton.getInstance().broker_service.add_message_to_googlecrawler(message)
    return job_id
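# Hedged usage sketch: how a caller might schedule a search-engine crawl via
# the broad-crawler variant above. The workspace id and provider/source values
# are hypothetical; the accepted values depend on the broker consumers.
def _example_schedule_searchengine_crawl():
    """Illustrative only; not part of the original API."""
    job_id = schedule_spider_searchengine(
        workspace_id="58a3f2c0e4b0a1b2c3d4e5f6",  # hypothetical ObjectId string
        num_to_fetch=100,
        broad_crawler_provider="GOOGLE",
        broad_crawler_sources=["GOOGLE", "BING"])
    return job_id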
def dao_delete_keywords(workspace_id, word_hash):
    # $unset removes the single entry keyed by the word's hash from the
    # workspace's "words" sub-document
    operation = {'$unset': {"words." + word_hash: ""}}
    ws = Singleton.getInstance().mongo_instance.get_workspace_by_id(workspace_id)
    if ws is None:
        Singleton.getInstance().mongo_instance.workspace_collection.update(
            {"_id": "_default"}, operation)
    else:
        Singleton.getInstance().mongo_instance.workspace_collection.update(
            {"_id": ObjectId(ws["_id"])}, operation)
def dao_update_user_account_status(user_id, is_active):
    update_object = {}
    if is_active is not None:
        update_object['active'] = is_active
    Singleton.getInstance().mongo_instance.get_user_collection().update(
        {'_id': ObjectId(user_id)}, {'$set': update_object})
def publish_to_login_output_queue(workspace_id, job_id, url, key_values):
    output_queue = "dd-login-output"
    message = {
        'workspaceId': workspace_id,
        'job_id': job_id,
        'url': url,
        'key_values': key_values
    }
    Singleton.getInstance().broker_service.post_to_queue_no_extra_headers(
        message, output_queue)
def publish_to_events_queue(workspace_id, event_type, action, arguments):
    message = {
        'workspaceId': workspace_id,
        'timestamp': time.time(),
        'event': event_type,
        'action': action,
        'arguments': json.dumps(arguments)
    }
    Singleton.getInstance().broker_service.add_message_to_events(message)
def publish_to_import_url_queue(workspace_id, url, is_relevant=True):
    # variant that takes the relevance flag and delegates metadata construction
    # to the broker service; if both versions live in one module, this later
    # definition shadows the earlier one
    message = {
        'url': url,
        'isRelevant': is_relevant,
        'metadata': Singleton.getInstance().broker_service.get_metadata(workspace_id)
    }
    Singleton.getInstance().broker_service.add_message_to_import_url(message)
def publish_to_events_queue(workspace_id, event_type, action):
    # variant without timestamp/arguments; metadata comes from the broker
    # service. If both versions live in one module, this later definition
    # shadows the earlier one.
    message = {
        'event': event_type,
        'action': action,
        'metadata': Singleton.getInstance().broker_service.get_metadata(workspace_id)
    }
    Singleton.getInstance().broker_service.add_message_to_events(message)
def dao_save_blur_level(level):
    # module-level function like the other dao_* helpers; upsert against the
    # "_default" workspace when none is selected, otherwise update the
    # current workspace document
    ws = Singleton.getInstance().mongo_instance.get_current_workspace()
    if ws is None:
        Singleton.getInstance().mongo_instance.workspace_collection.upsert(
            {'_id': '_default'}, {'$set': {'blur_level': level}})
    else:
        Singleton.getInstance().mongo_instance.workspace_collection.update(
            {'_id': ObjectId(ws['_id'])}, {'$set': {'blur_level': level}})
def dao_add_workspace(name, ts):
    ws_doc = Singleton.getInstance().mongo_instance.workspace_collection.find_one(
        {'name': name})
    if ws_doc is None:
        Singleton.getInstance().mongo_instance.workspace_collection.save({
            'name': name,
            'created': ts
        })
    else:
        raise AddingWorkspaceError('The name already exists')
def get_seeds_urls(workspace_id, categories, last_id, limit,
                   _source_exclude=("result.crawlResultDto.image",
                                    "result.crawlResultDto.html")):
    # a tuple default avoids the shared-mutable-default-argument pitfall
    categories_search_condition = {}
    if categories is not None:
        categories_search_condition = {'userDefinedCategory': {'$in': categories}}
    page_search_object = {}
    if last_id is not None:
        # page forward from the last ObjectId the caller has already seen
        page_search_object = {"_id": {"$gt": ObjectId(last_id)}}
    deleted_search_object = {'deleted': None}
    workspace_search_object = {'workspaceId': workspace_id}
    field_names_to_include = [
        '_id', 'host', 'desc', 'crawlEntityType', 'url', 'words', 'title',
        'categories', 'language', 'relevant', 'userDefinedCategories'
    ]
    collection = Singleton.getInstance().mongo_instance.get_seed_urls_collection()
    res = collection \
        .find({'$and': [categories_search_condition, page_search_object,
                        deleted_search_object, workspace_search_object]},
              field_names_to_include) \
        .sort('_id', pymongo.ASCENDING) \
        .limit(limit)
    docs = list(res)
    for item in docs:
        try:
            es_result = Singleton.getInstance().es_client.get_open_crawled_index_results(
                item['url'], _source_exclude)
            item['desc'] = es_result["text"]
            if "crawlResultDto" in es_result and "html" in es_result["crawlResultDto"]:
                item['html'] = es_result["crawlResultDto"]["html"]
        except Exception:
            logging.info("failed to enrich item from Elasticsearch: %s",
                         item.get('url'))
    return docs
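# Hedged usage sketch: get_seeds_urls pages by ObjectId, so a caller can walk
# a workspace's seed URLs by passing the last _id it has seen. The batch size
# of 50 is a hypothetical choice.
def _example_iterate_seed_urls(workspace_id):
    """Yield every matching seed-URL document in _id order, 50 at a time."""
    last_id = None
    while True:
        batch = get_seeds_urls(workspace_id, categories=None,
                               last_id=last_id, limit=50)
        if not batch:
            break
        for doc in batch:
            yield doc
        last_id = str(batch[-1]['_id'])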
def queue_labels(workspace_id):
    docs = dao_get_labels(workspace_id)
    for doc in docs:
        location = build_html_location(doc["url"])
        doc["html_location"] = location
    message = {
        'workspace_id': workspace_id,
        'pages': docs,
    }
    Singleton.getInstance().broker_service.add_message_to_dd_modeler_input(message)
def dao_update_keywords(workspace_id, word, score):
    # adler32 (masked to 32 bits) gives a short, stable dictionary key for the
    # word; a sha224 hexdigest was used previously
    word_hash = str(adler32(word.encode('utf_8')) & 0xffffffff)
    scored_word = {"word": word, "score": score}
    operation = {'$set': {"words." + word_hash: scored_word}}
    ws = Singleton.getInstance().mongo_instance.get_workspace_by_id(workspace_id)
    if ws is None:
        Singleton.getInstance().mongo_instance.workspace_collection.upsert(
            {"_id": "_default"}, operation)
    else:
        Singleton.getInstance().mongo_instance.workspace_collection.update(
            {"_id": ObjectId(ws["_id"])}, operation)
    return word_hash
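# Hedged sketch: reproducing the dictionary key that dao_update_keywords
# derives for a word (and that dao_delete_keywords expects). Uses only
# zlib.adler32, which dao_update_keywords already relies on.
def _example_keyword_hash(word):
    """Return the "words.<hash>" key component for `word`; illustrative only."""
    from zlib import adler32
    # the mask keeps the checksum non-negative and 32-bit on every platform
    return str(adler32(word.encode('utf_8')) & 0xffffffff)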
def dao_update_user(user_id, active, role_ids):
    update_object = {}
    if active is not None:
        update_object['active'] = active
    if role_ids is not None:
        update_object['roles'] = [ObjectId(role) for role in role_ids]
    Singleton.getInstance().mongo_instance.get_user_collection().update(
        {'_id': ObjectId(user_id)}, {'$set': update_object})
def dao_aggregated_labels_urls(workspace_id):
    collection = Singleton.getInstance().mongo_instance.get_seed_urls_collection()
    source_search_conditions = []
    workspace_search_object = {'workspaceId': workspace_id}
    delete_search_object = {'deleted': {'$exists': False}}
    source_search_conditions.append(workspace_search_object)
    source_search_conditions.append(delete_search_object)
    source_search_object = {'$and': source_search_conditions}
    try:
        # count how many pages carry each user-defined category (tag)
        res = collection.aggregate([
            {'$match': source_search_object},
            {'$project': {'_id': 0, 'userDefinedCategories': 1}},
            {'$unwind': "$userDefinedCategories"},
            {'$group': {'_id': "$userDefinedCategories", 'tags': {'$sum': 1}}},
            {'$project': {'_id': 0, 'userDefinedCategories': "$_id", 'tags': 1}},
            {'$sort': {'userDefinedCategories': -1}}
        ])
    except Exception as e:
        logging.error(e)
        return []
    return res["result"]
def dao_get_keywords_by_relevance(workspace_id):
    keywords = {}
    included = []
    excluded = []
    related = []
    ws = Singleton.getInstance().mongo_instance.get_workspace_by_id(workspace_id)
    if ws is None or "words" not in ws or ws["words"] is None:
        logging.info("no keywords defined")
    else:
        for key, value in ws["words"].items():
            # score > 3 marks a keyword as included, < 3 as excluded,
            # exactly 3 as related
            if value['score'] > 3:
                included.append(value['word'])
            elif value['score'] < 3:
                excluded.append(value['word'])
            else:
                related.append(value['word'])
    if len(included) == 0:
        raise NameError('No keywords were defined.')
    keywords['included'] = included
    keywords['excluded'] = excluded
    keywords['related'] = related
    return keywords
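# Illustrative sketch of the score buckets dao_get_keywords_by_relevance
# applies: score > 3 -> included, score < 3 -> excluded, score == 3 -> related.
# The mapping mirrors the {hash: {"word": ..., "score": ...}} shape stored on
# the workspace document; the sample words and scores are hypothetical.
def _example_bucket_keywords():
    words = {
        "1111": {"word": "forum", "score": 5},    # included (> 3)
        "2222": {"word": "weather", "score": 1},  # excluded (< 3)
        "3333": {"word": "market", "score": 3},   # related (== 3)
    }
    included = [v["word"] for v in words.values() if v["score"] > 3]
    excluded = [v["word"] for v in words.values() if v["score"] < 3]
    related = [v["word"] for v in words.values() if v["score"] == 3]
    return included, excluded, related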
def get_seeds_urls_to_deep_crawl_dao(workspace_id, page_size,
                                     keyword_source_type, last_id):
    and_condition_list = []
    if keyword_source_type:
        and_condition_list.append({"keywordSourceType": keyword_source_type})
    if last_id:
        last_id_search_object = {"_id": {"$gt": ObjectId(last_id)}}
        and_condition_list.append(last_id_search_object)
    deleted_search_object = {'deleted': {"$exists": False}}
    and_condition_list.append(deleted_search_object)
    workspace_search_object = {'workspaceId': workspace_id}
    and_condition_list.append(workspace_search_object)
    collection = Singleton.getInstance().mongo_instance.get_seed_urls_collection()
    res = collection.find({'$and': and_condition_list})\
        .sort('_id', pymongo.ASCENDING)\
        .limit(page_size)
    docs = list(res)
    return docs
def get_seeds_udcs_by_workspace_dao(workspace_id):
    # NOTE: per-source filtering on crawlEntityType (BING/GOOGLE, TWITTER, TOR,
    # MANUAL, DD) used to be applied here; see dao_reset_results for that mapping
    and_condition_list = []
    deleted_search_object = {'deleted': None}
    and_condition_list.append(deleted_search_object)
    workspace_search_object = {'workspaceId': workspace_id}
    and_condition_list.append(workspace_search_object)
    collection = Singleton.getInstance().mongo_instance.get_seed_urls_collection()
    res = collection\
        .find({'$and': and_condition_list})\
        .distinct("udc")
    docs = list(res)
    return sorted(docs)
def get_trainer_progress(workspace_id):
    collection = Singleton.getInstance().mongo_instance.get_workspace_collection()
    cursor = collection.find({'_id': ObjectId(workspace_id)})
    docs = list(cursor)
    trainer_progress = {}
    for doc in docs:
        progress = ""
        percentage_done = 0
        model = False
        trainer = doc.get("dd_trainer", {})
        if "trainer_progress" in trainer:
            progress = trainer["trainer_progress"]
        if "percentage_done" in trainer:
            percentage_done = trainer["percentage_done"]
        if "trainer_model" in trainer:
            model = trainer["trainer_model"]
        jobs = get_last_job_by_workspace_dao(workspace_id, 'DD-TRAINER')
        trainer_progress["progress"] = progress
        trainer_progress["percentageDone"] = percentage_done
        trainer_progress["model"] = model
        trainer_progress["jobs"] = jobs
    return trainer_progress
def get_user_input_forms(workspace_id, last_id):
    page_search_object = {}
    if last_id:
        page_search_object = {"_id": {"$gt": ObjectId(last_id)}}
    # match forms whose 'completed' flag is missing or not True
    complete_search_object = {
        '$or': [
            {'completed': {'$exists': False}},
            {'completed': {'$ne': True}}
        ]
    }
    workspace_search_object = {'workspaceId': workspace_id}
    collection = Singleton.getInstance().mongo_instance.get_login_input_collection()
    res = collection\
        .find({'$and': [page_search_object, complete_search_object,
                        workspace_search_object]})\
        .sort('_id', pymongo.ASCENDING)\
        .limit(5)
    docs = list(res)
    return docs
def get_all_progress(workspace_id):
    collection = Singleton.getInstance().mongo_instance.get_workspace_collection()
    cursor = collection.find({'_id': ObjectId(workspace_id)}, {
        "page_model.quality": 1,
        "dd_trainer.trainer_progress": 1,
        "dd_crawler.crawler_progress": 1,
        "dd_broadcrawler.broadcrawler_progress": 1
    })
    docs = list(cursor)
    progress = {}
    progress["model"] = []
    progress["trainer"] = []
    progress["crawler"] = ""
    progress["broadcrawler"] = ""
    for doc in docs:
        if "page_model" in doc and "quality" in doc["page_model"]:
            progress_as_string = doc["page_model"]["quality"]
            progress["model"] = json.loads(progress_as_string)
        if "dd_trainer" in doc and "trainer_progress" in doc["dd_trainer"]:
            progress["trainer"] = doc["dd_trainer"]["trainer_progress"]
        if "dd_crawler" in doc and "crawler_progress" in doc["dd_crawler"]:
            progress["crawler"] = doc["dd_crawler"]["crawler_progress"]
        # FIXME this should exist; the broadcrawler is called just "crawler"
        # if "dd_broadcrawler" in doc and "broadcrawler_progress" in doc["dd_broadcrawler"]:
        #     progress["broadcrawler"] = doc["dd_broadcrawler"]["broadcrawler_progress"]
    return progress
def dao_reset_results(workspace_id, source):
    collection = Singleton.getInstance().mongo_instance.get_seed_urls_collection()
    source_search_conditions = []
    workspace_search_object = {'workspaceId': workspace_id}
    if source == "searchengine":
        source_search_conditions.append({'crawlEntityType': "BING"})
        source_search_conditions.append({'crawlEntityType': "GOOGLE"})
    elif source == "twitter":
        source_search_conditions.append({'crawlEntityType': "TWITTER"})
    elif source == "tor":
        source_search_conditions.append({'crawlEntityType': "TOR"})
    elif source == "imported":
        source_search_conditions.append({'crawlEntityType': "MANUAL"})
    elif source == "deepdeep":
        source_search_conditions.append({'crawlEntityType': "DD"})
    else:
        logging.error("no valid source was provided: %s", source)
        return
    source_search_object = {'$or': source_search_conditions}
    collection.remove({'$and': [workspace_search_object, source_search_object]})
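# A table-driven alternative to the if/elif chain above (sketch only, same
# behavior): the public source names mapped to their crawlEntityType values,
# exactly as dao_reset_results encodes them.
_SOURCE_TO_ENTITY_TYPES = {
    "searchengine": ["BING", "GOOGLE"],
    "twitter": ["TWITTER"],
    "tor": ["TOR"],
    "imported": ["MANUAL"],
    "deepdeep": ["DD"],
}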
def get_user_input_forms_stats(workspace_id):
    search_object = {'$and': [{'workspaceId': workspace_id}]}
    collection = Singleton.getInstance().mongo_instance.get_login_input_collection()
    res_hosts = collection.aggregate([
        {'$match': search_object},
        {'$project': {
            "_id": 0,
            # bucket each form as "completed" or "pending"
            "completed_label": {"$cond": ["$completed", "completed", "pending"]},
            # $cond trick to project the constant 1 ("completed": 1 would mean
            # field inclusion, not a literal), so the $sum below counts documents
            "completed": {"$cond": ["$completed", 1, 1]}
        }},
        {'$group': {
            "_id": "$completed_label",
            "count": {'$sum': "$completed"}
        }}
    ])
    return list(res_hosts["result"])
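# For reference, a hypothetical result shape from get_user_input_forms_stats,
# assuming a workspace with three completed and two pending login forms:
#
#     [{'_id': 'completed', 'count': 3}, {'_id': 'pending', 'count': 2}]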
def dao_aggregate_urls_to_deep_crawl(workspace_id):
    and_search_conditions = []
    workspace_search_object = {'workspaceId': workspace_id}
    and_search_conditions.append(workspace_search_object)
    delete_search_object = {'deleted': {'$exists': False}}
    and_search_conditions.append(delete_search_object)
    source_search_object = {'$and': and_search_conditions}
    collection = Singleton.getInstance().mongo_instance.get_seed_urls_collection()
    try:
        # count the workspace's live seed URLs per keywordSourceType
        res = collection.aggregate([
            {'$match': source_search_object},
            {'$group': {
                '_id': {'keywordSourceType': '$keywordSourceType'},
                "count": {"$sum": 1}
            }}
        ])
    except Exception as e:
        logging.error(e)
        return []
    return res["result"]
def dao_delete_workspace(workspace_id):
    Singleton.getInstance().mongo_instance.workspace_collection.remove(
        {"_id": ObjectId(workspace_id)})
    Singleton.getInstance().mongo_instance.get_crawl_job_collection().remove(
        {"workspaceId": workspace_id})
    Singleton.getInstance().mongo_instance.get_seed_urls_collection().remove(
        {"workspaceId": workspace_id})
    Singleton.getInstance().mongo_instance.get_broad_crawler_collection().remove(
        {"workspaceId": workspace_id})
def dao_update_relevance(url, obj):
    update_object = {}
    update_object['relevant'] = obj['relevant']
    collection = Singleton.getInstance().mongo_instance.get_current_seed_urls_collection()
    logging.info("setting url %s to %s in collection %s",
                 url, str(obj['relevant']), collection)
    collection.update({"url": url}, {'$set': update_object}, upsert=True)
def dao_update_relevanceByid(workspace_id, doc_id, relevance, categories, udc):
    collection = Singleton.getInstance().mongo_instance.get_seed_urls_collection()
    update_object = {
        'relevant': relevance,
        'categories': categories,
        'udc': udc
    }
    collection.update({"_id": ObjectId(doc_id)}, {'$set': update_object},
                      upsert=True)
def get(workspace_id):
    ws = Singleton.getInstance().mongo_instance.get_workspace_by_id(workspace_id)
    if ws is None \
            or "userDefinedCategories" not in ws \
            or ws["userDefinedCategories"] is None:
        return []
    return ws["userDefinedCategories"]