def generateCrawler(self):
    """Spawn one SubCrawler thread per configured thread slot.

    For each crawler, ensure the per-user "extract" index exists in
    Elasticsearch: probe the mapping endpoint, and when the probe does
    not return 200 (index presumably missing — TODO confirm ES error
    semantics), create the index with a fixed shard/replica setting and
    the process_task mapping. All crawlers share one lock and the
    scenario's duration.
    """
    threadnum = self.getThreadnum()
    duration = self.getDuration()
    lock = threading.Lock()
    # Hoist the invariant parts of the URL out of the loop so the
    # address is built in exactly one place.
    es_base = "http://test-mhis-service.pingan.com.cn/elasticsearch"
    user_id = str(self.getUser()["userId"])
    for i in range(threadnum):
        cname = "corpuscollect-crawler" + str(i + 1)
        indice_url = es_base + "/u" + user_id + "_" + cname + "_extract"
        # Probe the mapping; a non-200 response is treated as
        # "index does not exist yet".
        response = Configure.get_es_client().request(
            "GET",
            indice_url + "/process_task/_mapping",
            body=None,
            headers={"Content-Type": "application/json"})
        if response.status != 200:
            data = {
                "settings": {
                    "index": {
                        "number_of_shards": 3,
                        "number_of_replicas": 1,
                    }
                },
                "mappings": {
                    "process_task": {
                        "properties": {
                            "scenarioId": {"type": "keyword"},
                            "href": {"type": "text"},
                            "sceneno": {"type": "integer"},
                            "gageno": {"type": "integer"},
                            "configuration": {"type": "object"},
                        }
                    }
                },
            }
            encoded_data = json.dumps(data).encode('utf-8')
            response = Configure.get_es_client().request(
                "PUT",
                indice_url,
                body=encoded_data,
                headers={"Content-Type": "application/json"})
            if response.status != 200:
                # Best-effort: log the failure and still start the
                # crawler, matching the original behavior.
                print("User extract task indice did not create.")
                print(response.data.decode('utf-8'))
        crawler = SubCrawler(cname, p_scenario=self, p_lock=lock,
                             p_duration=duration)
        crawler.start()
def writeextracttask(p_indice, p_type, p_scenarioid, p_sceneno, p_pageno, p_uri):
    """Index one extract-task document, keyed by the hash of its URI.

    The document records the scenario id, scene number, page number and
    the href to be processed; the write status is printed afterwards.
    """
    doc_id = Util.hash(p_uri)
    payload = {
        "scenarioId": p_scenarioid,
        "sceneno": p_sceneno,
        "pageno": p_pageno,
        "href": p_uri,
    }
    target = ("http://test-mhis-service.pingan.com.cn/elasticsearch/"
              + p_indice + "/" + p_type + "/" + doc_id)
    response = Configure.get_es_client().request(
        "PUT",
        target,
        body=json.dumps(payload).encode('utf-8'),
        headers={"Content-Type": "application/json"})
    print("write extract: " + str(response.status))
def getoneextracttask(p_indice, p_type):
    """Fetch a single extract task from the given index/type.

    Returns a ``(doc_id, source)`` tuple for the first hit of a
    match-all search, or ``(None, None)`` when the request fails or
    the index holds no documents.
    """
    query = {"from": 0, "size": 1, "query": {"match_all": {}}}
    target = ("http://test-mhis-service.pingan.com.cn/elasticsearch/"
              + p_indice + "/" + p_type + "/_search")
    print(target)
    response = Configure.get_es_client().request(
        "GET",
        target,
        body=json.dumps(query).encode('utf-8'),
        headers={"Content-Type": "application/json"})
    if response.status != 200:
        return (None, None)
    hits = json.loads(response.data.decode('utf-8'))["hits"]
    if hits["total"] > 0:
        first = hits["hits"][0]
        return (first["_id"], first["_source"])
    return (None, None)
def submit_user_specified_task_immediately(p_userid, p_scenarioid, p_href=None):
    """Execute every stored custom task matching *p_scenarioid* right away.

    Searches ``u<userid>_indice_base/custom_task`` for documents whose
    ``scenarioId`` matches, then runs an Executor on each hit's source.
    Nothing happens when the search fails or returns no hits.

    NOTE(review): ``p_href`` is never read here; it is kept only for
    interface compatibility with existing callers.
    """
    data = {"query": {"match": {"scenarioId": p_scenarioid}}}
    encoded_data = json.dumps(data).encode('utf-8')
    response = Configure.get_es_client().request(
        "GET",
        "http://test-mhis-service.pingan.com.cn/elasticsearch/u"
        + str(p_userid) + "_indice_base/custom_task/_search?pretty",
        body=encoded_data,
        headers={"Content-Type": "application/json"})
    # Guard clause: only proceed on a successful search response.
    if response.status != 200:
        return
    hits = json.loads(response.data.decode('utf-8'))["hits"]
    if hits["total"] > 0:
        # The index was never used, so iterate the hits directly
        # instead of enumerate().
        for doc in hits["hits"]:
            scenario = doc["_source"]
            print(scenario)
            executor = Executor(p_scenario=scenario)
            executor.action()
def deletedoc(p_indice, p_type, p_id):
    """Delete document *p_id* from index/type and return the HTTP status."""
    target = ("http://test-mhis-service.pingan.com.cn/elasticsearch/"
              + p_indice + "/" + p_type + "/" + p_id)
    response = Configure.get_es_client().request("DELETE", target)
    print(response.status)
    return response.status