def generateCrawler(self):
    threadnum = self.getThreadnum()
    duration = self.getDuration()
    lock = threading.Lock()
    for i in range(threadnum):
        cname = "corpuscollect-crawler" + str(i + 1)
        # Check whether the per-user extract index for this crawler already exists.
        response = Configure.get_es_client().request(
            "GET",
            "http://test-mhis-service.pingan.com.cn/elasticsearch/u"
            + str(self.getUser()["userId"]) + "_" + cname + "_extract/process_task/_mapping",
            body=None,
            headers={"Content-Type": "application/json"})
        if response.status != 200:
            # Index is missing: create it with the process_task mapping.
            data = {
                "settings": {
                    "index": {
                        "number_of_shards": 3,
                        "number_of_replicas": 1
                    }
                },
                "mappings": {
                    "process_task": {
                        "properties": {
                            "scenarioId": {"type": "keyword"},
                            "href": {"type": "text"},
                            "sceneno": {"type": "integer"},
                            "gageno": {"type": "integer"},
                            "configuration": {"type": "object"}
                        }
                    }
                }
            }
            encoded_data = json.dumps(data).encode('utf-8')
            response = Configure.get_es_client().request(
                "PUT",
                "http://test-mhis-service.pingan.com.cn/elasticsearch/u"
                + str(self.getUser()["userId"]) + "_" + cname + "_extract",
                body=encoded_data,
                headers={"Content-Type": "application/json"})
            if response.status != 200:
                print("Failed to create the user extract task index.")
                print(response.data.decode('utf-8'))
        crawler = SubCrawler(cname, p_scenario=self, p_lock=lock, p_duration=duration)
        crawler.start()
def __init__(self, p_scenario):
    Scenario.__init__(self, p_scenario=p_scenario)
    if self.getAutomation():
        print("this scene needs an automation driver!")
        self._driver = Configure.get_chrome_webdriver()
    else:
        self._driver = Configure.get_http_lowlevel_webdriver()
def __init__(self, p_name, p_tag, p_class, p_id):
    # Quote each locator argument for embedding into the JS call, or pass the
    # literal null when it is absent.
    self._name = "'" + p_name + "'" if p_name else "null"
    self._tag = "'" + p_tag + "'" if p_tag else "null"
    self._class = "'" + p_class + "'" if p_class else "null"
    self._id = "'" + p_id + "'" if p_id else "null"
    # Load the minified helper scripts once, closing the files afterwards.
    # read() is used instead of readline() so a multi-line file is still
    # picked up in full (a minified script is normally a single line anyway).
    with open(Configure.get_application_root_dir() + "/js/findelements_min.js") as find_script:
        self._find_scripts = find_script.read()
    with open(Configure.get_application_root_dir() + "/js/select_min.js") as select_script:
        self._select_scripts = select_script.read()
def do(self, p_location=None):
    driver = Configure.get_chrome_webdriver()
    if self._script:
        if not self._async:
            driver.execute_script(script=self._script)
        else:
            driver.execute_async_script(script=self._script)
def find(p_id, p_class, p_xpath, p_name, p_tag):
    driver = Configure.get_chrome_webdriver()
    # An explicit XPath wins over the other locator hints.
    if p_xpath:
        return driver.find_elements_by_xpath(p_xpath)
    # Otherwise build a contains()-based XPath from whichever hints are present.
    xpath = "//"
    express = ""
    if p_tag:
        xpath += p_tag
    else:
        xpath += "*"
    if p_id:
        express += "contains(@id, '" + p_id + "')"
    if p_name:
        if len(express) > 0:
            express += " and "
        express += "contains(@name, '" + p_name + "')"
    if p_class:
        if len(express) > 0:
            express += " and "
        express += "contains(@class, '" + p_class + "')"
    if len(express) > 0:
        express = "[" + express + "]"
    xpath += express
    return driver.find_elements_by_xpath(xpath)
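# A minimal usage sketch for find (assuming it is exposed as a static method on
# a Locator class in this module; the argument values are illustrative, not
# project fixtures). With these arguments the built XPath would be:
#   //div[contains(@id, 'result') and contains(@class, 'item')]
#
# elements = Locator.find(p_id="result", p_class="item", p_xpath=None,
#                         p_name=None, p_tag="div")
# for element in elements:
#     print(element.text)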
def collect(self, p_href, p_page, p_sceneno, p_pageno, p_delay=None):
    print("get page: " + p_href)
    pagebody = self._driver.get(p_href)
    if p_delay:
        print("after getting the page, sleep for a while: " + str(p_delay))
        time.sleep(p_delay)
    selector = Selector(text=pagebody)
    actors = p_page["actors"]
    for actidx, actor in enumerate(actors):
        acttype = actor["type"]
        properties = actor["properties"]
        recorder = None
        if acttype == 2:  # Recording
            recorder = PageCrawl(p_scenario=self, p_parameters=properties,
                                 p_sceneno=p_sceneno, p_pageno=p_pageno,
                                 p_location=p_href)
        else:
            raise Exception("Unsupported actor type")
        data = recorder.do(p_selector=selector)
        if data and "data" in data:
            out_dir = Configure.get_ouput_dir() + "/" + self.getId()
            filename = self.getTypename() + datetime.datetime.now().strftime('%Y%m%d%H%M%S') + ".json"
            pageComponentsData = data["data"]
            # Renamed the loop variable from "data" to avoid shadowing the outer result.
            for components in pageComponentsData:
                print(components)
                Storage.write_map_result(p_dir=out_dir,
                                         p_file_name=filename,
                                         p_contents=components)
def switch_frame(self):
    driver = Configure.get_chrome_webdriver()
    xpath = None
    if self._frame_index:
        xpath = "//iframe[" + str(self._frame_index) + "]"
    # Locator.find expects (p_id, p_class, p_xpath, p_name, p_tag) and returns a
    # list of elements, so pass the iframe tag explicitly and switch to the
    # first match.
    frames = Locator.find(self._frame_id, self._frame_class, xpath,
                          self._frame_name, "iframe")
    driver.switch_to.frame(frames[0])
    print(driver.page_source)
def init(self, kv=False):
    # Read the scenario template, closing the file when done.
    with open(Configure.get_application_root_dir() + "/template/" + self._configure_file) as conf_file:
        configures = "".join(conf_file.readlines())
    if not kv:
        self.populate_page_components(p_configure=configures)
    else:
        self.populate_page_kvcomponents(p_configure=configures)
    self.populate_pagination(p_configure=configures)
def download_file(self):
    elements = self.getComponent()
    file_path = Configure.get_ouput_dir() + "/download"
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    print(elements)
    for elmt in elements:
        # The download URL is either in a configured attribute or in the element text.
        if self._url_property:
            url = elmt.xpath("@" + self._url_property).extract_first()
        else:
            url = elmt.xpath("text()").extract_first()
        file_name = url.split("/")[-1]
        print("Download file to path: " + file_path + "/" + file_name)
        # urllib.urlretrieve is Python 2 only; on Python 3 it lives in urllib.request.
        urllib.request.urlretrieve(url, file_path + "/" + file_name)
def submit(self, uri, pid, template, tid):
    with open(Configure.get_application_root_dir() + "/task/task_" + pid + ".xml", "ab") as taskfile:
        content = """
<crawl>
    <task>
        <pid>%s</pid>
        <uri>%s</uri>
        <template>%s</template>
        <id>%s</id>
    </task>
</crawl>
""" % (pid, uri, template, tid)
        taskfile.write(bytes(content, encoding="utf8"))
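# A usage sketch for submit; the argument values are illustrative only. Note
# that the file is opened in append mode, so repeated calls append another
# <crawl> element to task_<pid>.xml rather than producing a single-root XML
# document.
#
# self.submit(uri="http://example.com/page1", pid="1001",
#             template="news_template", tid="t-0001")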
def capture_whole(self, p_file=None, p_url=None):
    #get_phantomjs_webdriver().get(p_url)
    os.chdir(Configure.get_ouput_dir())
    file_path = Configure.get_ouput_dir() + "/snapshot"
    if not os.path.exists(file_path):
        os.mkdir("snapshot")
    if p_file is None:
        p_file = "snapshot_" + str(uuid.uuid1()) + ".png"
    print(file_path + "/" + p_file)
    print(Configure.get_chrome_webdriver().get_window_size())
    # Get the web page's actual height.
    clientHeight = Configure.get_chrome_webdriver().execute_script("return document.body.clientHeight;")
    print(clientHeight)
    # Adjust the window's height to fit the page so one screenshot captures it all.
    cursize = Configure.get_chrome_webdriver().get_window_size()
    Configure.get_chrome_webdriver().set_window_size(cursize["width"], clientHeight)
    stored = Configure.get_chrome_webdriver().get_screenshot_as_file(file_path + "/" + p_file)
    return stored
def write_array_result(p_dir=None, p_file_name=None, p_contents=None,
                       p_prefix=None, p_suffix=None, p_seperator=None,
                       p_linenum=False):
    file_path = p_dir if p_dir else Configure.get_ouput_dir()
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    if p_file_name is None:
        p_file_name = "crawl_" + str(uuid.uuid1()) + ".data"
    print(file_path + "/" + p_file_name)
    with open(file_path + "/" + p_file_name, 'w', encoding="utf-8") as ofile:
        if p_contents:
            for idx, content in enumerate(p_contents):
                if p_linenum:
                    ofile.write(str(idx) + " ")
                if p_prefix:
                    ofile.write(p_prefix)
                if isinstance(content, list):
                    # Join list rows with the separator (or a space when none is
                    # given); a single space also precedes the first item.
                    for i, item in enumerate(content):
                        if i > 0 and p_seperator:
                            ofile.write(p_seperator)
                        else:
                            ofile.write(" ")
                        ofile.write(item)
                else:
                    ofile.write(content)
                if p_suffix:
                    ofile.write(p_suffix)
                ofile.write("\n")
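# A usage sketch for write_array_result (assuming it sits on the Storage class,
# as the callers above suggest; paths and values are illustrative). Each inner
# list becomes one line wrapped in the prefix/suffix with items joined by the
# separator, e.g.:
#   { "name":"foo","count":3}
#
# Storage.write_array_result(p_dir="/tmp/out", p_file_name="sample.json",
#                            p_contents=[['"name":"foo"', '"count":3']],
#                            p_prefix="{", p_suffix="}", p_seperator=",")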
def getoneextracttask(p_indice, p_type):
    # Fetch a single pending task document from the given index/type.
    data = {"from": 0, "size": 1, "query": {"match_all": {}}}
    encoded_data = json.dumps(data).encode('utf-8')
    print("http://test-mhis-service.pingan.com.cn/elasticsearch/" + p_indice + "/" + p_type + "/_search")
    response = Configure.get_es_client().request(
        "GET",
        "http://test-mhis-service.pingan.com.cn/elasticsearch/" + p_indice + "/" + p_type + "/_search",
        body=encoded_data,
        headers={"Content-Type": "application/json"})
    if response.status == 200:
        hits = json.loads(response.data.decode('utf-8'))["hits"]
        total = hits["total"]
        if total > 0:
            docs = hits["hits"]
            doc_id = docs[0]["_id"]  # renamed from "id" to avoid shadowing the builtin
            doc = docs[0]["_source"]
            return (doc_id, doc)
    return (None, None)
def writeextracttask(p_indice, p_type, p_scenarioid, p_sceneno, p_pageno, p_uri):
    # Use a hash of the URI as the document id so a duplicate URI overwrites
    # the existing task instead of creating a new one.
    doc_id = Util.hash(p_uri)
    data = {
        "scenarioId": p_scenarioid,
        "sceneno": p_sceneno,
        "pageno": p_pageno,
        "href": p_uri
    }
    encoded_data = json.dumps(data).encode('utf-8')
    response = Configure.get_es_client().request(
        "PUT",
        "http://test-mhis-service.pingan.com.cn/elasticsearch/" + p_indice + "/" + p_type + "/" + doc_id,
        body=encoded_data,
        headers={"Content-Type": "application/json"})
    print("write extract: " + str(response.status))
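# A round-trip sketch (index name, ids and URL below are illustrative):
# writeextracttask stores one pending URL per document, getoneextracttask pops
# an arbitrary one back for processing, and deletedoc (defined later in this
# module) removes it once handled.
#
# writeextracttask(p_indice="u1_corpuscollect-crawler1_extract",
#                  p_type="process_task", p_scenarioid="scn-1",
#                  p_sceneno=0, p_pageno=0,
#                  p_uri="http://example.com/list?page=2")
# doc_id, task = getoneextracttask(p_indice="u1_corpuscollect-crawler1_extract",
#                                  p_type="process_task")
# if doc_id:
#     deletedoc("u1_corpuscollect-crawler1_extract", "process_task", doc_id)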
def submit_user_specified_task_immediately(p_userid, p_scenarioid, p_href=None):
    data = {"query": {"match": {"scenarioId": p_scenarioid}}}
    encoded_data = json.dumps(data).encode('utf-8')
    response = Configure.get_es_client().request(
        "GET",
        "http://test-mhis-service.pingan.com.cn/elasticsearch/u" + str(p_userid) + "_indice_base/custom_task/_search?pretty",
        body=encoded_data,
        headers={"Content-Type": "application/json"})
    if response.status == 200:
        hits = json.loads(response.data.decode('utf-8'))["hits"]
        total = hits["total"]
        if total > 0:
            docs = hits["hits"]
            for idx, doc in enumerate(docs):
                scenario = doc["_source"]
                print(scenario)
                executor = Executor(p_scenario=scenario)
                executor.action()
def switch_window(self):
    driver = Configure.get_chrome_webdriver()
    handles = driver.window_handles
    # Prefer an explicit window handle when one is configured. The prints are
    # moved inside the checks so None values are never concatenated.
    if self._window:
        print("switch window: " + self._window)
        driver.switch_to.window(self._window)
        driver.implicitly_wait(30)
        driver.set_window_size(1920, 1080)
        return
    # Otherwise scan all handles for a window whose URL contains the target.
    if self._url:
        print("switch url: " + self._url)
        for handle in handles:
            print("switch to window: " + handle)
            driver.switch_to.window(handle)
            driver.implicitly_wait(30)
            driver.set_window_size(1920, 1080)
            cur_url = driver.current_url
            print("window url: " + cur_url)
            if cur_url.find(self._url) >= 0:
                break
def write_map_result(p_dir=None, p_file_name=None, p_contents=None):
    file_path = p_dir if p_dir else Configure.get_ouput_dir()
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    if p_file_name is None:
        p_file_name = "crawl_" + str(uuid.uuid1()) + ".data"
    print(file_path + "/" + p_file_name)
    with open(file_path + "/" + p_file_name, 'w', encoding="utf-8") as ofile:
        if p_contents:
            for lines in p_contents:
                # Serialize each row as one JSON object keyed by item_id.
                ofile.write('{ "items": {')
                for lidx, item in enumerate(lines):
                    if lidx > 0:
                        ofile.write(', ')
                    item_id = item["item_id"]  # renamed from "id" to avoid shadowing the builtin
                    ofile.write('"' + item_id + '": {')
                    val = None
                    for idx, key in enumerate(item.keys()):
                        if key == "item_id":
                            continue
                        if val is None:
                            val = '"' + key + '": "' + item[key] + '"'
                        else:
                            val += ', "' + key + '": "' + item[key] + '"'
                    ofile.write(val)
                    ofile.write('}')
                ofile.write(" } }\n")
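# write_map_result serializes each row of p_contents into one JSON-style line,
# e.g. for a row of two items (values illustrative):
#   { "items": {"i1": {"title": "foo"}, "i2": {"title": "bar"}} }
# Note the values are concatenated as raw strings, so an item value containing
# a double quote would produce an invalid line.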
def findPagination(self):
    driver = Configure.get_chrome_webdriver()
    # page_sources is not a WebDriver attribute; the property is page_source.
    selector = Selector(text=driver.page_source)
def find_elements_with_class(p_class):
    if p_class is None:
        return None
    driver = Configure.get_chrome_webdriver()
    return driver.find_elements_by_class_name(p_class)
def find_elements_with_tagname(p_tag):
    if p_tag is None:
        return None
    driver = Configure.get_chrome_webdriver()
    return driver.find_elements_by_tag_name(p_tag)
def find_elements_with_xpath(p_xpath):
    if p_xpath is None:
        return None
    driver = Configure.get_chrome_webdriver()
    return driver.find_elements_by_xpath(p_xpath)
def _init(p_usrid, p_udir, p_uout, p_ex, p_driver):
    # Ensure the per-user config, output and extract directories exist, then
    # wire them (plus the driver path and shared clients) into Configure.
    if not os.path.exists(p_udir):
        os.makedirs(p_udir)
    if not os.path.exists(p_uout):
        os.makedirs(p_uout)
    if not os.path.exists(p_ex):
        os.makedirs(p_ex)
    Configure.setconfig(p_dir=p_udir)
    Configure.setoutput(p_dir=p_uout)
    Configure.setextractdir(p_dir=p_ex)
    Configure.setdriver(p_dir=p_driver)
    Configure.set_es_client()
    Configure.set_crawl_controller()
    Configure.set_default_https_context()
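# A bootstrap sketch for _init; all paths below are illustrative, not project
# defaults.
#
# _init(p_usrid="1001",
#       p_udir="/data/crawler/u1001/conf",
#       p_uout="/data/crawler/u1001/out",
#       p_ex="/data/crawler/u1001/extract",
#       p_driver="/usr/local/bin/chromedriver")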
def find_elements_with_id(p_id):
    if p_id is None:
        return None
    driver = Configure.get_chrome_webdriver()
    return driver.find_elements_by_id(p_id)
def switch_parent_frame(self):
    Configure.get_chrome_webdriver().switch_to.parent_frame()
def input(self, p_value):
    # Build the JS call; the locator fields were quoted (or set to "null") in __init__.
    func = u"select('" + p_value + "'," + self._name + "," + self._tag + "," + self._class + "," + self._id + ");"
    print(func)
    # The helper scripts are read as text in __init__, so no decode() is needed here.
    Configure.get_chrome_webdriver().execute_script(
        self._find_scripts + " " + self._select_scripts + " " + func)
def collect(self, p_href, p_page, p_sceneno, p_pageno, p_delay=None):
    print("get page: " + p_href)
    self._driver.get(p_href)
    if p_delay:
        print("after getting the page, sleep for a while: " + str(p_delay))
        time.sleep(p_delay)
    selector = Selector(text=self._driver.page_source)
    # Debug dump of the fetched page and a screenshot.
    with open("/usr/local/var/source1.htm", 'w', encoding="utf-8") as ofile:
        ofile.write(self._driver.page_source)
    self._driver.save_screenshot("/usr/local/var/capture1.png")
    actors = p_page["actors"]
    for actidx, actor in enumerate(actors):
        acttype = actor["type"]
        properties = actor["properties"]
        if "indexname" in properties:
            indexname = properties["indexname"]
        else:
            indexname = "indice"
        recorder = None
        if acttype == 5:  # element
            recorder = Iterator(p_scenario=self, p_parameters=properties,
                                p_sceneno=p_sceneno, p_pageno=p_pageno)
        elif acttype == 10:  # Recordingkv
            recorder = PageKVCrawl(p_scenario=self, p_parameters=properties,
                                   p_sceneno=p_sceneno, p_pageno=p_pageno)
        else:
            raise Exception("Unsupported actor type")
        data = recorder.do(p_selector=selector, p_pageid=self.getId() + "_page0")
        prevalary = None
        iniprevalary = False
        keydata = data["keydata"] if "keydata" in data else None
        keylabel = data["keylabel"]
        if not keydata:
            keyvalue = data["keyvalue"] if "keyvalue" in data else "unknown key value"
        else:
            # Build the key column first; valary was missing its initialization
            # in the original, which raised a NameError here.
            valary = []
            for ikey in keydata:
                ikeyval = ikey["value"]
                kvpair = '"' + keylabel + '":"' + ikeyval + '"'
                valary.append(kvpair)
            npvalary = np.array(valary).reshape((len(valary), 1))
            if iniprevalary:
                prevalary = np.hstack((prevalary, npvalary))
            else:
                prevalary = npvalary
            iniprevalary = True
        values = data["values"]
        for valkey in values.keys():
            valcollect = values[valkey]
            valary = []
            for item in valcollect:
                itemval = item["value"]
                itemtype = properties["pageComponent"]["kvv-mapping"]["values"][valkey]["type"]
                if valkey == "timepoint":
                    # Normalize timestamps into an ISO-8601 form with a UTC marker.
                    fmt = properties["timepoint"]["format"]  # renamed from "format" to avoid shadowing the builtin
                    if fmt == "yyyy-mm-dd HH:mi:ss":
                        itemval = itemval[0:10] + "T" + itemval[11:] + ".000Z"
                    elif fmt == "HH:mi:ss":
                        itemval = datetime.datetime.now().strftime('%Y-%m-%dT') + itemval + ".000Z"
                if itemtype == "number" or itemtype == "boolean":
                    kvpair = '"' + valkey + '":' + itemval
                else:
                    kvpair = '"' + valkey + '":"' + itemval + '"'
                valary.append(kvpair)
            npvalary = np.array(valary).reshape((len(valary), 1))
            if iniprevalary:
                prevalary = np.hstack((prevalary, npvalary))
            else:
                prevalary = npvalary
            iniprevalary = True
        if not keydata:
            # Without per-row key data, repeat the single key value on every row.
            rows, cols = prevalary.shape
            print("rows, cols: " + str(rows) + ", " + str(cols))
            keysary = []
            for ii in range(rows):
                keysary.append('"' + keylabel + '":"' + keyvalue + '"')
            npkeysary = np.array(keysary).reshape((len(keysary), 1))
            prevalary = np.hstack((prevalary, npkeysary))
        out_dir = Configure.get_ouput_dir() + "/" + self.getId()
        filename = self.getTypename() + datetime.datetime.now().strftime('%Y%m%d%H%M%S') + ".json"
        Storage.write_array_result(p_dir=out_dir, p_file_name=filename,
                                   p_contents=prevalary.tolist(),
                                   p_prefix="{", p_suffix="}", p_seperator=",")
def __init__(self, p_scenario):
    Scenario.__init__(self, p_scenario=p_scenario)
    self._driver = Configure.get_chrome_webdriver()
def deletedoc(p_indice, p_type, p_id):
    response = Configure.get_es_client().request(
        "DELETE",
        "http://test-mhis-service.pingan.com.cn/elasticsearch/" + p_indice + "/" + p_type + "/" + p_id)
    print(response.status)
    return response.status