def __queue_smart_crawl_start(workspace_id, job_id, page_limit, broadness, urls, page_model):
    '''
    Expected message shape:
    {
        "workspace_id": "workspace id",
        "id": "crawl id",
        "page_limit": 100,
        "broadness": "BROAD",  # valid codes are ["N10", "N100", "N1000", "N10000", "BROAD"]
        "urls": ["http://example.com", "http://example.com/2"],
        "page_model": "b64-encoded page classifier"
    }
    '''
    message = {
        'workspace_id': workspace_id,  # Singleton.getInstance().mongo_instance.get_current_workspace_name(),
        'id': job_id,
        'page_limit': page_limit,
        'broadness': broadness,
        'urls': urls,
        'page_model': page_model,
    }
    logging.info(message)
    Singleton.getInstance().broker_service.add_message_to_dd_crawler_input(message)
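# A minimal sketch (an assumption, not part of the original module) of how a caller
# could prepare the "page_model" field documented above: the page classifier bytes
# are base64-encoded before being placed on the queue message. The file path and
# helper name are hypothetical.
import base64

def _encode_page_model(model_path):
    with open(model_path, 'rb') as model_file:
        # b64encode returns bytes/str; decode to plain ASCII text for the JSON message
        return base64.b64encode(model_file.read()).decode('ascii')

# hypothetical usage:
# page_model = _encode_page_model('/tmp/page_classifier.model')
# __queue_smart_crawl_start(workspace_id, job_id, 100, 'BROAD', urls, page_model)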
def queue_broad_crawl(workspace_id, job_id, num_to_fetch, broad_crawler_provider, broad_crawler_sources):
    # keywords = dao_get_keywords()
    keywords = dao_get_keywords_by_relevance(workspace_id)
    categorized_urls = get_seeds_urls_categorized(workspace_id)
    existent_url = get_relevant_or_neutral_seeds_urls_url(workspace_id)
    # jobId = str(uuid.uuid1())
    logging.info("sending broad crawl message for %s urls with keywords %s"
                 % (str(num_to_fetch), str(keywords)))
    message = {
        'included': keywords['included'],
        'excluded': keywords['excluded'],
        'relevantUrl': categorized_urls['relevant'],
        'irrelevantUrl': categorized_urls['irrelevant'],
        'nResults': int(num_to_fetch),
        'existentUrl': existent_url,  # get_seeds_urls_url(),
        # 'appInstance': Singleton.getInstance().app_instance,
        'workspace': workspace_id,  # Singleton.getInstance().mongo_instance.get_current_workspace_name(),
        'jobId': job_id,
        'crawlProvider': broad_crawler_provider,
        'crawlSources': broad_crawler_sources
    }
    logging.info(message)
    Singleton.getInstance().broker_service.add_message_to_broadcrawler(message, broad_crawler_provider)
def get_screenshot(crawl_type, id):
    if crawl_type == "broadcrawl":
        collection = Singleton.getInstance().mongo_instance.get_broad_crawler_collection()
    elif crawl_type == "deepcrawl":
        collection = Singleton.getInstance().mongo_instance.get_deep_crawler_collection()
    elif crawl_type == "deepcrawl-domains":
        collection = Singleton.getInstance().mongo_instance.get_deep_crawler_domains_collection()
    elif crawl_type == "keywords":
        collection = Singleton.getInstance().mongo_instance.get_seed_urls_collection()
    else:
        logging.info("invalid crawl type: " + crawl_type)
        return ""  # bail out early so `collection` is never used undefined

    res = collection.find({"_id": ObjectId(id)})
    docs = list(res)
    screenshot = ""
    if len(docs) > 0:
        url = docs[0]["url"]
        bytes = Singleton.getInstance().es_client.get_screenshoot(url)
        if bytes:
            screenshot = base64.b64decode(bytes)
    return screenshot
def queue_deep_crawl_stop(workspace_id, job_id):
    logging.info("preparing stop deep-crawler")
    message = {
        'id': job_id,
        # 'workspace_id': workspace_id,
        'stop': True
    }
    logging.info(message)
    Singleton.getInstance().broker_service.add_message_to_deepcrawler(message)
def queue_login(workspace_id, job_id, credentials):
    message = {
        'workspace_id': workspace_id,
        'job_id': job_id,
        'id': credentials["_id"],
        'domain': credentials["domain"],
        'url': credentials["url"],
        'key_values': credentials["keyValues"]
    }
    Singleton.getInstance().broker_service.add_message_to_login(message)
def queue_deep_crawl_start(workspace_id, job_id, num_to_fetch, urls, login_credentials):
    logging.info("preparing deep crawl message")
    message = {
        'id': job_id,
        'workspace_id': workspace_id,
        'page_limit': int(num_to_fetch),
        'urls': urls,
        'login_credentials': login_credentials
    }
    logging.info(message)
    Singleton.getInstance().broker_service.add_message_to_deepcrawler(message)
def register_scraping_subscriber():
    # Callback
    def action1(args):
        logging.info("read_topic_from_scraping run with: " + str(args))

    # def save_urls(obj):
    #     url = obj['url']
    #     print("save_url with: " + url)
    #     print("save_url object with: " + str(obj))
    #     dao_update_url(url=url, obj=obj)
    # Singleton.getInstance().broker_service.read_topic_from_broadcrawler(callback=save_urls)

    Singleton.getInstance().broker_service.read_topic_from_scraping(callback=action1)
def get_screenshot(crawl_type, id):
    if crawl_type == "broadcrawl":
        collection = Singleton.getInstance().mongo_instance.get_broad_crawler_collection()
    elif crawl_type == "keywords":
        collection = Singleton.getInstance().mongo_instance.get_seed_urls_collection()
    else:
        logging.info("invalid crawl type: " + crawl_type)
        return ""  # bail out early so `collection` is never used undefined

    res = collection.find({"_id": ObjectId(id)})
    url = res[0]["url"]
    encoded_screenshot = Singleton.getInstance().es_client.get_screenshoot(url)
    return encoded_screenshot
def get_deep_crawl_domains_by_domain_name(workspace_id, job_id, domain_name, limit, last_id):
    collection = Singleton.getInstance().mongo_instance.get_deep_crawler_collection()

    and_source_conditions = []

    workspace_search_object = {'workspaceId': workspace_id}
    and_source_conditions.append(workspace_search_object)

    job_search_object = {'jobId': job_id}
    and_source_conditions.append(job_search_object)

    domain_name_search_object = {'domain': domain_name}
    and_source_conditions.append(domain_name_search_object)

    page_search_object = {}
    if last_id is not None:
        page_search_object = {"_id": {"$gt": ObjectId(last_id)}}
    and_source_conditions.append(page_search_object)

    query = {'$and': and_source_conditions}
    cursor = collection.find(query) \
        .sort('_id', pymongo.ASCENDING) \
        .limit(limit)
    docs = list(cursor)
    return docs
def get_logins(workspace_id, domains):
    collection = Singleton.getInstance().mongo_instance.get_login_collection()

    and_source_conditions = []

    workspace_search_object = {'workspaceId': workspace_id}
    and_source_conditions.append(workspace_search_object)

    domains_search_object = {'domain': {'$in': domains}}
    and_source_conditions.append(domains_search_object)

    key_values_search_object = {'keyValues': {"$exists": True}}
    and_source_conditions.append(key_values_search_object)

    query = {'$and': and_source_conditions}
    fields = {
        'url': 1,
        'domain': 1,
        'keysOrder': 1,
        'keyValues': 1,
        'result': 1,
        '_id': 1
    }
    cursor = collection.find(query, fields)
    docs = list(cursor)
    for doc in docs:
        doc["_id"] = str(doc["_id"])
    return docs
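# A minimal usage sketch (an assumption, not part of the original module) showing how
# the credential documents returned by get_logins() could feed the login_credentials
# argument of queue_deep_crawl_start(). The helper name and arguments are hypothetical.
def _start_deep_crawl_with_logins(workspace_id, job_id, urls, domains):
    # each credential doc carries url, domain, keysOrder, keyValues and a string _id
    login_credentials = get_logins(workspace_id, domains)
    queue_deep_crawl_start(workspace_id, job_id, len(urls), urls, login_credentials)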
def save_login(credentials):
    collection = Singleton.getInstance().mongo_instance.get_login_collection()
    login_search_object = {'_id': ObjectId(credentials["_id"])}
    operation = {'$set': {"keyValues": credentials["keyValues"]}}
    collection.update(login_search_object, operation)
def get_max_id(workspace_id, job_id):
    ws_object = {}
    ws_object["workspaceId"] = workspace_id

    search_object = {}
    if job_id:
        search_object = {"jobId": job_id}

    # res = Singleton.getInstance().mongo_instance.get_broad_crawler_output_collection() \
    # collection = Singleton.getInstance().mongo_instance.get_broad_crawler_output_collection_by_workspace_id(workspace_id)
    collection = Singleton.getInstance().mongo_instance.get_broad_crawler_collection()
    res = collection \
        .find({'$and': [ws_object, search_object]}, ['_id']) \
        .sort('_id', pymongo.DESCENDING) \
        .limit(1)

    res_list = list(res)
    if len(res_list) > 0:
        max_id = str(res_list[0]["_id"])
    else:
        max_id = None
    return max_id
def create_account(username):
    password = request.json['password']
    encrypted_password = utils.encrypt_password(password)
    try:
        Singleton.getInstance().user_datastore.create_user(
            email=username,
            password=encrypted_password,
            roles=[],
            active=True,
            login_count=0)
    except NotUniqueError as ex:
        raise InvalidUsage('A username with that email already exists', status_code=409)
    return Response("{}", mimetype="application/json")
def __init__(self):
    self.counter = 0
    self.sample_size = 50

    def action1(args):
        print("callback run with: " + str(args))
        print("current counter:" + str(self.counter))
        self.counter = self.counter + 1

    kafka_host_port = "9092"
    kafka_host_name = "localhost"
    # topic_group = "test-group"
    # topic = 'splash'
    # topic_input = topic + '-input'
    # topic_output = topic + '-input'  # + '-output'

    app_instance = str(uuid.uuid1())
    app_instance = 'app-instance'  # str(uuid.uuid1())
    instance = Singleton.getInstance()
    instance.app_instance = app_instance
    instance.mongo_instance = MongoInstance()

    kafka_connector = KafkaConnector(kafka_host_name, kafka_host_port)
    # kafka_connector.create_topic(topic_input)
    broker_service = BrokerService(app_instance, kafka_host_name, kafka_host_port)

    # subscribe
    # broker_service.read_from_queue(action1, topic_group, topic_output)
    broker_service.read_topic_from_splash(action1)
    time.sleep(1)

    # produce
    message = {}
    i = 0
    while i < self.sample_size:
        message['index'] = i
        message['text'] = 'test'
        # message['date'] = strftime("%Y-%m-%d %H:%M:%S", gmtime())
        message['splash_url_path'] = "/render.json?png=1&html=1&url=http://www.hyperiongray.com"
        # "splash_url_path" : "/render.json?png=1&html=1&url=http://www.hyperiongray.com",
        # broker_service.post_to_queue(message, topic_input, topic_output)
        broker_service.add_message_to_splash(message)
        print 'sending: ' + str(i)
        i = i + 1

    time.sleep(5)
    print "Finish all"
    print('Samples produced: ' + str(self.sample_size))
    print('Samples consumed: ' + str(self.counter))
    if self.sample_size == self.counter:
        print 'Test passed'
    else:
        print 'test failed!!!'
    time.sleep(1000000)
def get_metadata(self, workspace_id):
    metadata = {}
    metadata['workspace'] = workspace_id
    metadata['source'] = Singleton.getInstance().app_instance
    metadata['callbackQueue'] = "callback_queue_not_used"
    metadata['timestamp'] = time.time()
    metadata['strTimestamp'] = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    return metadata
def get_existing_categories_service(workspace_id):
    collection = Singleton.getInstance().mongo_instance.get_broad_crawler_collection()
    search_object = {}
    search_object["workspaceId"] = workspace_id
    categories = list(collection.find(search_object).distinct("categories"))
    languages = list(collection.find(search_object).distinct("language"))
    return categories, languages
def scraping_publication(url):
    def register_scraping_subscriber():
        logging.info('registering register_scraping_subscriber')

        # Callback
        def action1(args):
            logging.info("action1 callback run with: " + str(args))

    logging.info('publishing for scraping: ' + url)
    message = {
        'url': url,
        'nResults': 2,
        'workspace': Singleton.getInstance().mongo_instance.get_current_workspace_name()
    }
    Singleton.getInstance().broker_service.add_message_to_scraping(message)
    return 282828
def post_to_queue(self, message, input_queue, callback_queue):
    # add more headers to the message
    message['source'] = Singleton.getInstance().app_instance  # self.app_instance
    message['callbackQueue'] = callback_queue
    message['timestamp'] = time.time()
    message['strTimestamp'] = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    json_message = json.dumps(message)
    self.kafka_connector.send_message(input_queue, json_message)
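# The KafkaConnector used above is not shown in this listing; the sketch below is an
# assumption of what its send_message() could look like if it were built on the
# kafka-python package. Class name and host/port values are illustrative only.
from kafka import KafkaProducer

class KafkaConnectorSketch(object):
    def __init__(self, host_name, host_port):
        self.producer = KafkaProducer(
            bootstrap_servers='%s:%s' % (host_name, host_port))

    def send_message(self, topic, json_message):
        # kafka-python expects bytes for the message value
        self.producer.send(topic, json_message.encode('utf-8'))
        self.producer.flush()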
def pin_service(workspace_id, id, is_pinned):
    logging.info("PINNING: %s" % id + " as " + str(is_pinned))
    collection = Singleton.getInstance().mongo_instance.get_broad_crawler_collection()
    update_object = {"pinned": is_pinned}
    collection.update({"_id": ObjectId(id)}, {'$set': update_object}, True)
    publish_to_events_queue(workspace_id, "bookmark", "changed", {
        "id": id,
        "pinned": is_pinned
    })
def get_job_by_id(job_id):
    collection = Singleton.getInstance().mongo_instance.get_crawl_job_collection()
    and_source_conditions = []
    job_search_object = {'_id': ObjectId(job_id)}
    and_source_conditions.append(job_search_object)
    query = {'$and': and_source_conditions}
    cursor = collection.find(query)
    docs = list(cursor)
    return docs[0]
def get_domains_by_job_id(workspace_id, job_id):
    collection = Singleton.getInstance().mongo_instance.get_deep_crawler_domains_collection()
    and_source_conditions = []
    workspace_search_object = {'workspaceId': workspace_id}
    and_source_conditions.append(workspace_search_object)
    job_search_object = {'jobId': job_id}
    and_source_conditions.append(job_search_object)
    query = {'$and': and_source_conditions}
    cursor = collection.find(query)
    docs = list(cursor)
    return docs
def get_relevant_or_neutral_seeds_urls_url(workspace_id):
    # res = Singleton.getInstance().mongo_instance.get_broad_crawler_output_collection()\
    # collection = Singleton.getInstance().mongo_instance.get_broad_crawler_output_collection_by_workspace_id(workspace_id)
    collection = Singleton.getInstance().mongo_instance.get_seed_urls_collection()
    res = collection.find(
        {
            'relevant': {'$ne': False},
            "workspaceId": workspace_id
        },
        {'url': 1})
    docs = list(res)
    urls = []
    for doc in docs:
        # urls.append({"id": str(doc["_id"]), "url": doc["url"]})
        urls.append(doc["url"])
    return urls
def __get_urls(workspace_id):
    collection = Singleton.getInstance().mongo_instance.get_seed_urls_collection()
    relevant_urls_result = collection.find(
        {
            'workspaceId': workspace_id,
            'relevant': True,
            'deleted': {'$exists': False}
        },
        {
            '_id': 0,
            'url': 1
        })
    relevant_urls = []
    for url_doc in relevant_urls_result:
        if 'url' in url_doc:
            relevant_urls.append(url_doc['url'])
    return relevant_urls
def get_smart_crawler_results(workspace_id, page_size, input_search_query):
    and_conditions = []
    and_conditions.append({'workspaceId': workspace_id})
    and_conditions.append({'deleted': {'$exists': False}})
    and_conditions.append({'crawlEntityType': 'DD'})
    # and_conditions.append({'crawlType': 'SMARTCRAWL'})

    if 'job_id' in input_search_query and input_search_query['job_id'] is not None:
        job_search_object = {'jobId': input_search_query["job_id"]}
        and_conditions.append(job_search_object)

    '''
    last_id restricts the results to the documents after the last page returned,
    so pagination is not skewed by score.
    '''
    if 'last_id' in input_search_query and input_search_query['last_id'] is not None:
        last_id_search_object = {"_id": {"$gt": ObjectId(input_search_query['last_id'])}}
        and_conditions.append(last_id_search_object)

    if "search_text" in input_search_query:
        search_text = input_search_query['search_text']
        url_search_condition = {'url': {'$regex': search_text}}
        host_search_condition = {'host': {'$regex': search_text}}
        and_conditions.append({'$or': [url_search_condition, host_search_condition]})

    # and_source_conditions = {'$and': and_conditions}
    collection = Singleton.getInstance().mongo_instance.get_broad_crawler_collection()
    query = {'$and': and_conditions}
    cursor = collection.find(query) \
        .sort('_id', pymongo.ASCENDING) \
        .limit(page_size)
    docs = list(cursor)
    return docs
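# A minimal pagination sketch (an assumption, not part of the original module): pages
# of smart-crawl results are fetched by passing the _id of the last document seen as
# last_id, which matches the "$gt" ObjectId filter built above. The helper name and
# default page_size are hypothetical.
def _iterate_smart_crawler_results(workspace_id, job_id, page_size=50):
    search_query = {'job_id': job_id, 'last_id': None}
    while True:
        docs = get_smart_crawler_results(workspace_id, page_size, search_query)
        if not docs:
            break
        for doc in docs:
            yield doc
        # continue from the last _id of this page
        search_query['last_id'] = str(docs[-1]['_id'])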
def dao_aggregate_broadcrawl_results(workspace_id):
    source_search_conditions = []

    workspace_search_object = {'workspaceId': workspace_id}
    source_search_conditions.append(workspace_search_object)

    delete_search_object = {'deleted': {'$exists': False}}
    source_search_conditions.append(delete_search_object)

    source_search_object = {'$and': source_search_conditions}

    collection = Singleton.getInstance().mongo_instance.get_broad_crawler_collection()
    try:
        res = collection.aggregate([
            # '$group': {'_id': '$crawlEntityType', "count": {"$sum": 1}}
            {'$match': source_search_object},
            {'$group': {
                '_id': {'crawlEntityType': '$crawlEntityType'},
                "count": {"$sum": 1}
            }}
        ])
    except Exception as e:
        print e
        raise  # re-raise; otherwise `res` would be undefined below
    # PyMongo 2.x returns the aggregation output as a dict with a "result" key
    return res["result"]
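# A minimal sketch (an assumption, not part of the original module) of the same
# aggregation if the app were moved to PyMongo 3+, where Collection.aggregate()
# returns a CommandCursor instead of a dict with a "result" key. The function name
# is hypothetical and the collection handle is passed in for illustration.
def aggregate_broadcrawl_counts_pymongo3(collection, workspace_id):
    pipeline = [
        {'$match': {'$and': [
            {'workspaceId': workspace_id},
            {'deleted': {'$exists': False}}
        ]}},
        {'$group': {
            '_id': {'crawlEntityType': '$crawlEntityType'},
            'count': {'$sum': 1}
        }}
    ]
    # the CommandCursor is iterable; materialize it into a plain list of documents
    return list(collection.aggregate(pipeline))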
def __init__(self):
    app_instance = str(uuid.uuid1())
    app_instance = 'app-instance'  # str(uuid.uuid1())
    instance = Singleton.getInstance()
    instance.app_instance = app_instance
    instance.mongo_instance = MongoInstance()

    search_query = {}

    search_text = 'abellan'
    print ''
    print 'searching: ' + search_text
    search_query['search_text'] = search_text
    search_results = get_search_results(search_query)
    print(search_results)
    print 'searching complete for: ' + search_text

    # in host
    print ''
    search_text = 'Guingueta'
    print 'searching: ' + search_text
    search_query['search_text'] = search_text
    search_results = get_search_results(search_query)
    print(search_results)
    print 'searching complete for: ' + search_text

    # in words
    print ''
    search_text = 'restaurantes'
    print 'searching: ' + search_text
    search_query['search_text'] = search_text
    search_results = get_search_results(search_query)
    print(search_results)
    print 'searching complete for: ' + search_text

    print 'done!'
def __get_page_model(workspace_id):
    encoded_model = Singleton.getInstance().es_client.get_modeler_model(workspace_id)
    return encoded_model
def __queue_smart_crawler_stop(workspace_id, job_id):
    message = {'id': job_id, 'stop': True}
    logging.info(message)
    Singleton.getInstance().broker_service.add_message_to_dd_crawler_input(message)
def count_service(workspace_id):
    collection = Singleton.getInstance().mongo_instance.get_broad_crawler_collection()
    return collection.find({"workspaceId": workspace_id}).count()
def pin_service(workspace_id, id, is_pinned):
    logging.info("PINNING: %s" % id + " as " + str(is_pinned))
    collection = Singleton.getInstance().mongo_instance.get_broad_crawler_collection()
    update_object = {"pinned": is_pinned}
    collection.update({"_id": ObjectId(id)}, {'$set': update_object}, True)