Example #1
    def generateCrawler(self):
        threadnum = self.getThreadnum()
        duration = self.getDuration()
        lock = threading.Lock()
        for i in range(threadnum):
            cname = "corpuscollect-crawler" + str(i + 1)
            #create user extract index
            response = Configure.get_es_client().request(
                "GET",
                "http://test-mhis-service.pingan.com.cn/elasticsearch/u" +
                str(self.getUser()["userId"]) + "_" + cname +
                "_extract/process_task/_mapping",
                body=None,
                headers={"Content-Type": "application/json"})

            if response.status != 200:
                data = {
                    "settings": {
                        "index": {
                            "number_of_shards": 3,
                            "number_of_replicas": 1
                        }
                    },
                    "mappings": {
                        "process_task": {
                            "properties": {
                                "scenarioId": {
                                    "type": "keyword"
                                },
                                "href": {
                                    "type": "text"
                                },
                                "sceneno": {
                                    "type": "integer"
                                },
                                "gageno": {
                                    "type": "integer"
                                },
                                "configuration": {
                                    "type": "object"
                                }
                            }
                        }
                    }
                }
                encoded_data = json.dumps(data).encode('utf-8')
                response = Configure.get_es_client().request(
                    "PUT",
                    "http://test-mhis-service.pingan.com.cn/elasticsearch/u" +
                    str(self.getUser()["userId"]) + "_" + cname + "_extract",
                    body=encoded_data,
                    headers={"Content-Type": "application/json"})
                if response.status != 200:
                    print("Failed to create the user extract task index.")
                print(response.data.decode('utf-8'))
            crawler = SubCrawler(cname,
                                 p_scenario=self,
                                 p_lock=lock,
                                 p_duration=duration)
            crawler.start()
Example #2
 def __init__(self, p_scenario):
     Scenario.__init__(self, p_scenario=p_scenario)
     if self.getAutomation():
         print("this scene need automation driver!")
         self._driver = Configure.get_chrome_webdriver()
     else:
         self._driver = Configure.get_http_lowlevel_webdriver()
Example #3
    def __init__(self, p_name, p_tag, p_class, p_id):
      self._name = "'"+p_name+"'" if p_name else "null"
      self._tag = "'"+p_tag+"'" if p_tag else "null"
      self._class = "'"+p_class+"'" if p_class else "null"
      self._id = "'"+p_id+"'" if p_id else "null"

      # Load the minified helper scripts once and close the files afterwards;
      # read the whole file rather than a single line so multi-line scripts also work.
      with open(Configure.get_application_root_dir()+"/js/findelements_min.js") as find_script:
        self._find_scripts = find_script.read()
      with open(Configure.get_application_root_dir()+"/js/select_min.js") as select_script:
        self._select_scripts = select_script.read()
Example #4
 def do(self, p_location=None):
     driver = Configure.get_chrome_webdriver()
     if self._script:
         if not self._async:
             driver.execute_script(script=self._script)
         else:
             driver.execute_async_script(script=self._script)
Example #5
    def find(p_id, p_class, p_xpath, p_name, p_tag):
        driver = Configure.get_chrome_webdriver()
        if p_xpath:
            return driver.find_elements_by_xpath(p_xpath)

        xpath = "//"
        express = ""
        if p_tag:
            xpath += p_tag
        else:
            xpath += "*"

        if p_id:
            express += "contains(@id, '" + p_id + "')"

        if p_name:
            if len(express) > 0:
                express += " and "
            express += "contains(@name, '" + p_name + "')"

        if p_class:
            if len(express) > 0:
                express += " and "
            express += "contains(@class, '" + p_class + "')"

        if len(express) > 0:
            express = "[" + express + "]"

        xpath += express
        return driver.find_elements_by_xpath(xpath)
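A minimal usage sketch of the find method above (invoked as Locator.find in the switch_frame example below); the import path, class name, and attribute values are assumptions for illustration only. With a tag and a class it builds an XPath such as //a[contains(@class, 'nav')]:

    # Hypothetical usage; the "locator" module path and the "nav" class are invented.
    from locator import Locator

    links = Locator.find(p_id=None, p_class="nav", p_xpath=None,
                         p_name=None, p_tag="a")
    for link in links:
        print(link.get_attribute("href"))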
Example #6
 def collect(self, p_href, p_page, p_sceneno, p_pageno, p_delay=None):
   #startaddr = p_scene["href"]
   print ("get page: "+p_href)
   #print (startpage)
   pagebody = self._driver.get(p_href)
   if p_delay:
     print("after getting the page, sleep for a while: " + str(p_delay))
     time.sleep(p_delay)
     
   selector = Selector(text=pagebody)
   actors = p_page["actors"]
   for actidx, actor in enumerate(actors):
     acttype = actor["type"]
     properties = actor["properties"]
     recorder = None
     if acttype == 2: #Recording
       recorder = PageCrawl(p_scenario=self, p_parameters=properties, p_sceneno=p_sceneno, p_pageno=p_pageno, p_location=p_href)    
   
     else:
       raise Exception("Unsupported actor type")    
     data = recorder.do(p_selector=selector)
     if data and "data" in data:
       dir = Configure.get_ouput_dir() + "/" + self.getId()
       filename = self.getTypename() + datetime.datetime.now().strftime('%Y%m%d%H%M%S') + ".json"     
       pageComponentsData = data["data"]
       for component in pageComponentsData:
         print(component)
         Storage.write_map_result(p_dir=dir, p_file_name=filename, p_contents=component)
Example #7
 def switch_frame(self):
     driver = Configure.get_chrome_webdriver()
     xpath = None
     if self._frame_index:
         xpath = "//iframe[" + str(self._frame_index) + "]"
     # Locator.find takes (id, class, xpath, name, tag) and returns a list of
     # matches, so pass None for the tag and switch to the first match found.
     frames = Locator.find(self._frame_id, self._frame_class, xpath,
                          self._frame_name, None)
     if frames:
         driver.switch_to.frame(frames[0])
     print(driver.page_source)
Example #8
    def init(self, kv=False):
        with open(Configure.get_application_root_dir() + "/template/" +
                  self._configure_file) as conf_file:
            configures = conf_file.read()
        if not kv:
            self.populate_page_components(p_configure=configures)
        else:
            self.populate_page_kvcomponents(p_configure=configures)

        self.populate_pagination(p_configure=configures)
Example #9
    def download_file(self):
        elements = self.getComponent()
        file_path = Configure.get_ouput_dir() + "/download"
        if not os.path.exists(file_path):
            os.mkdir(file_path)
        print(elements)
        for elmt in elements:
            if self._url_property:
                url = elmt.xpath("@" + self._url_property).extract_first()
            else:
                url = elmt.xpath("text()").extract_first()

            file_name = url.split("/")[-1]
            print("Download file to path: " + file_path + "/" + file_name)
            # Python 3: urlretrieve lives in urllib.request ("import urllib.request").
            urllib.request.urlretrieve(url, file_path + "/" + file_name)
Example #10
 def submit(self, uri, pid, template, tid):
     taskfile = open(
         Configure.get_application_root_dir() + "/task/task_" + pid +
         ".xml", "ab")
     content = """
                 <crawl>
                     <task>
                         <pid>%s</pid>
                         <uri>%s</uri>
                         <template>%s</template>
                         <id>%s</id>
                     </task>
                 </crawl>
                 """ % (pid, uri, template, tid)
     taskfile.write(bytes(content, encoding="utf8"))
     taskfile.close()
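As a rough illustration of what submit produces (the scheduler instance and every argument value below are invented), one call appends a single <crawl> fragment to task/task_<pid>.xml under the application root:

    # Hypothetical call; "scheduler" stands for an instance of the class defining submit.
    scheduler.submit(uri="http://example.com/list", pid="1001",
                     template="list_template", tid="t-42")
    # Appended to <application root>/task/task_1001.xml:
    # <crawl>
    #     <task>
    #         <pid>1001</pid>
    #         <uri>http://example.com/list</uri>
    #         <template>list_template</template>
    #         <id>t-42</id>
    #     </task>
    # </crawl>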
Example #11
  def capture_whole(self, p_file=None, p_url=None):
    #get_phantomjs_webdriver().get(p_url)
    os.chdir(Configure.get_ouput_dir())
    file_path = Configure.get_ouput_dir()+"/snapshot"
    if not os.path.exists(file_path):
      os.mkdir("snapshot")

    if p_file is None:
      p_file = "snapshot_"+str(uuid.uuid1())+".png"
    print (file_path+"/"+p_file)
    print (Configure.get_chrome_webdriver().get_window_size())
    
    #Get web page's actual height
    clientHeight = Configure.get_chrome_webdriver().execute_script("return document.body.clientHeight;")
    print (clientHeight)
    #Adjust the window height to fit the page's actual height
    cursize = Configure.get_chrome_webdriver().get_window_size()
    Configure.get_chrome_webdriver().set_window_size(cursize["width"], clientHeight)
    
    stored = Configure.get_chrome_webdriver().get_screenshot_as_file(file_path+"/"+p_file)
    return stored
Example #12
    def write_array_result(p_dir=None,
                           p_file_name=None,
                           p_contents=None,
                           p_prefix=None,
                           p_suffix=None,
                           p_seperator=None,
                           p_linenum=False):
        if p_dir:
            file_path = p_dir
        else:
            file_path = Configure.get_ouput_dir()
        if not os.path.exists(file_path):
            os.makedirs(file_path)

        if p_file_name is None:
            p_file_name = "crawl_" + str(uuid.uuid1()) + ".data"
        print(file_path + "/" + p_file_name)

        ofile = open(file_path + "/" + p_file_name, 'w', encoding="utf-8")
        if p_contents:
            for idx, content in enumerate(p_contents):
                if p_linenum:
                    ofile.write(str(idx) + "  ")

                if p_prefix:
                    ofile.write(p_prefix)

                if type(content) == list:
                    for i, item in enumerate(content):
                        if i > 0 and p_seperator:
                            ofile.write(p_seperator)
                        else:
                            ofile.write(" ")
                        ofile.write(item)
                else:
                    ofile.write(content)

                if p_suffix:
                    ofile.write(p_suffix)
                ofile.write("\n")

        ofile.close()
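A usage sketch for write_array_result, assuming it is exposed as a static method on a Storage class as in the later collect example; the module path, output directory, and row values are invented. With p_prefix="{", p_suffix="}" and p_seperator="," each row of p_contents becomes one JSON-like line:

    # Hypothetical usage; "storage" module path and sample rows are invented.
    from storage import Storage

    rows = [['"a":"1"', '"b":"2"'],
            ['"a":"3"', '"b":"4"']]
    Storage.write_array_result(p_dir="/tmp/crawl_out",
                               p_file_name="demo.data",
                               p_contents=rows,
                               p_prefix="{",
                               p_suffix="}",
                               p_seperator=",")
    # demo.data now contains two lines such as:  { "a":"1","b":"2"}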
Example #13
 def getoneextracttask(p_indice, p_type):
     data = {"from": 0, "size": 1, "query": {"match_all": {}}}
     encoded_data = json.dumps(data).encode('utf-8')
     print("http://test-mhis-service.pingan.com.cn/elasticsearch/" +
           p_indice + "/" + p_type + "/_search")
     response = Configure.get_es_client().request(
         "GET",
         "http://test-mhis-service.pingan.com.cn/elasticsearch/" +
         p_indice + "/" + p_type + "/_search",
         body=encoded_data,
         headers={"Content-Type": "application/json"})
     if response.status == 200:
         hits = json.loads(response.data.decode('utf-8'))["hits"]
         total = hits["total"]
         if total > 0:
             docs = hits["hits"]
             id = docs[0]["_id"]
             doc = docs[0]["_source"]
             return (id, doc)
     return (None, None)
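A sketch of how getoneextracttask might be used to claim a pending extract task; the enclosing class name ESWrapper, the module path, and the index name are placeholders, only the process_task type comes from the examples above:

    # Hypothetical usage; ESWrapper and "esutil" stand in for whatever defines the method.
    from esutil import ESWrapper

    task_id, task_doc = ESWrapper.getoneextracttask("u1001_crawler1_extract",
                                                    "process_task")
    if task_id is not None:
        print("picked task " + task_id + " for page " + task_doc["href"])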
Example #14
    def writeextracttask(p_indice, p_type, p_scenarioid, p_sceneno, p_pageno,
                         p_uri):
        id = Util.hash(p_uri)

        data = {
            "scenarioId": p_scenarioid,
            "sceneno": p_sceneno,
            "pageno": p_pageno,
            "href": p_uri
        }
        #print (data)
        #print ("http://test-mhis-service.pingan.com.cn/elasticsearch/"+p_indice+"/"+p_type+"/"+id)
        encoded_data = json.dumps(data).encode('utf-8')
        response = Configure.get_es_client().request(
            "PUT",
            "http://test-mhis-service.pingan.com.cn/elasticsearch/" +
            p_indice + "/" + p_type + "/" + id,
            body=encoded_data,
            headers={"Content-Type": "application/json"})
        print("write extract: " + str(response.status))
Example #15
 def submit_user_specified_task_immediately(p_userid,
                                            p_scenarioid,
                                            p_href=None):
     data = {"query": {"match": {"scenarioId": p_scenarioid}}}
     encoded_data = json.dumps(data).encode('utf-8')
     response = Configure.get_es_client().request(
         "GET",
         "http://test-mhis-service.pingan.com.cn/elasticsearch/u" +
         str(p_userid) + "_indice_base/custom_task/_search?pretty",
         body=encoded_data,
         headers={"Content-Type": "application/json"})
     if response.status == 200:
         hits = json.loads(response.data.decode('utf-8'))["hits"]
         total = hits["total"]
         if total > 0:
             docs = hits["hits"]
             for idx, doc in enumerate(docs):
                 scenario = doc["_source"]
                 print(scenario)
                 executor = Executor(p_scenario=scenario)
                 executor.action()
Example #16
    def switch_window(self):
        driver = Configure.get_chrome_webdriver()
        handles = driver.window_handles

        print("switch window: " + self._window)
        if self._window:
            driver.switch_to.window(self._window)
            driver.implicitly_wait(30)
            driver.set_window_size(1920, 1080)
            return

        print("switch url: " + self._url)
        if self._url:
            for handle in handles:
                print("switch to window: " + handle)
                driver.switch_to.window(handle)
                driver.implicitly_wait(30)
                driver.set_window_size(1920, 1080)
                cur_url = driver.current_url
                print("window url: " + cur_url)
                if cur_url.find(self._url) >= 0:
                    break
Example #17
    def write_map_result(p_dir=None, p_file_name=None, p_contents=None):
        if p_dir:
            file_path = p_dir
        else:
            file_path = Configure.get_ouput_dir()

        if not os.path.exists(file_path):
            os.mkdir(file_path)

        if p_file_name is None:
            p_file_name = "crawl_" + str(uuid.uuid1()) + ".data"
        print(file_path + "/" + p_file_name)

        ofile = open(file_path + "/" + p_file_name, 'w', encoding="utf-8")

        if p_contents:
            for lines in p_contents:
                #ofile.write("\n")
                ofile.write('{ "items": {')
                for lidx, item in enumerate(lines):
                    if lidx > 0:
                        ofile.write(', ')
                    id = item["item_id"]
                    ofile.write('"' + id + '": {')
                    val = None
                    for idx, key in enumerate(item.keys()):
                        if key == "item_id":
                            continue
                        if val == None:
                            val = '"' + key + '": "' + item[key] + '"'
                        else:
                            val += ', "' + key + '": "' + item[key] + '"'
                    ofile.write(val)
                    ofile.write('}')
                ofile.write(" } }\n")
        ofile.close()
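A usage sketch for write_map_result (module path, directory, file name, and item fields are invented); each inner list of p_contents is written as one line of the form { "items": {"<item_id>": {...}, ...} }, which is what the collect example above feeds it page by page:

    # Hypothetical usage; the sample items are invented for illustration.
    from storage import Storage

    page_items = [
        [{"item_id": "p1", "title": "foo", "price": "9.9"},
         {"item_id": "p2", "title": "bar", "price": "3.5"}],
    ]
    Storage.write_map_result(p_dir="/tmp/crawl_out",
                             p_file_name="items.json",
                             p_contents=page_items)
    # items.json now holds one line:
    # { "items": {"p1": {"title": "foo", "price": "9.9"}, "p2": {"title": "bar", "price": "3.5"} } }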
Example #18
 def findPagination(self):
     driver = Configure.get_chrome_webdriver()
     selector = Selector(text=driver.page_source)
Example #19
 def find_elements_with_class(p_class):
     if p_class is None:
         return None
     driver = Configure.get_chrome_webdriver()
     return driver.find_elements_by_class_name(p_class)
Example #20
 def find_elements_with_tagname(p_tag):
     if p_tag is None:
         return None
     driver = Configure.get_chrome_webdriver()
     return driver.find_elements_by_tag_name(p_tag)
Example #21
 def find_elements_with_xpath(p_xpath):
     if p_xpath is None:
         return None
     driver = Configure.get_chrome_webdriver()
     return driver.find_elements_by_xpath(p_xpath)
Example #22
    def _init(p_usrid, p_udir, p_uout, p_ex, p_driver):
        if not os.path.exists(p_udir):
            os.makedirs(p_udir)
        if not os.path.exists(p_uout):
            os.makedirs(p_uout)
        if not os.path.exists(p_ex):
            os.makedirs(p_ex)

        Configure.setconfig(p_dir=p_udir)
        Configure.setoutput(p_dir=p_uout)
        Configure.setextractdir(p_dir=p_ex)
        Configure.setdriver(p_dir=p_driver)
        Configure.set_es_client()
        Configure.set_crawl_controller()
        Configure.set_default_https_context()
Example #23
 def find_elements_with_id(p_id):
     if p_id is None:
         return None
     driver = Configure.get_chrome_webdriver()
     return driver.find_elements_by_id(p_id)
Example #24
 def switch_parent_frame(self):
     Configure.get_chrome_webdriver().switch_to.parent_frame()
Example #25
 def input(self, p_value):
   func = u"select('"+p_value+"',"+self._name+","+self._tag+","+self._class+","+self._id+");"
   print (func)
   Configure.get_chrome_webdriver().execute_script(self._find_scripts.decode("utf-8")+" "+self._select_scripts.decode("utf-8")+" "+func);
Example #26
 def collect(self, p_href, p_page, p_sceneno, p_pageno, p_delay=None):
   #startaddr = p_scene["href"]
   print ("get page: "+p_href)
   #print (startpage)
   self._driver.get(p_href)
   if p_delay:
     print("after getting the page, sleep for a while: " + str(p_delay))
     time.sleep(p_delay)
     
   selector = Selector(text=self._driver.page_source)
   ofile = open("/usr/local/var/source1.htm",'w', encoding="utf-8")
   ofile.write(self._driver.page_source)
   ofile.close()
   self._driver.save_screenshot("/usr/local/var/capture1.png")
   actors = p_page["actors"]
   for actidx, actor in enumerate(actors):
     acttype = actor["type"]
     properties = actor["properties"]
     if "indexname" in properties :
       indexname = properties["indexname"]   
     else :
       indexname = "indice"
       
     recorder = None
     if acttype == 5: #element
       recorder = Iterator(p_scenario=self, p_parameters=properties, p_sceneno=p_sceneno, p_pageno=p_pageno)    
     elif acttype == 10: #Recordingkv
       recorder = PageKVCrawl(p_scenario=self, p_parameters=properties, p_sceneno=p_sceneno, p_pageno=p_pageno)     
            
     else:
       raise Exception("Unsupported actor type")    
     data = recorder.do(p_selector=selector, p_pageid=self.getId()+"_page0")
     
     resultary = []
     prevalary = None
     iniprevalary = False
     
     keydata = data["keydata"] if "keydata" in data else None
     keylabel = data["keylabel"]
     if not keydata:
       keyvalue = data["keyvalue"] if "keyvalue" in data else "unknown key value"    
     else:
       valary = []  # start a fresh column for the key values
       for ikey in keydata:
         ikeyval = ikey["value"]
         kvpair = '"'+keylabel+'":"'+ikeyval+'"'
         valary.append(kvpair)
       npvalary = np.array(valary).reshape((len(valary),1))  
       if iniprevalary:
         prevalary = np.hstack((prevalary,npvalary))  
       else:
         prevalary = npvalary    
         iniprevalary = True
               
     values = data["values"]
     for valkey in values.keys():
       valcollect = values[valkey]
       valary = []
       for item in valcollect:
         itemval = item["value"]
         itemtype = properties["pageComponent"]["kvv-mapping"]["values"][valkey]["type"]
         if valkey == "timepoint":
           format = properties["timepoint"]["format"]
           if format == "yyyy-mm-dd HH:mi:ss" :
             itemval = itemval[0:10] +"T"+  itemval[11:]+".000Z"    
           elif format == "HH:mi:ss" :     
             itemval = datetime.datetime.now().strftime('%Y-%m-%dT') +  itemval +".000Z"
                 
         if itemtype == "number" or itemtype == "boolean" :
           kvpair = '"'+valkey+'":'+itemval  
         else:
           kvpair = '"'+valkey+'":"'+itemval+'"'
         valary.append(kvpair)
       npvalary = np.array(valary).reshape((len(valary),1))  
       if iniprevalary :
         prevalary = np.hstack((prevalary,npvalary))  
       else :
         prevalary = npvalary
         iniprevalary = True
     
     if not keydata:
       rows, cols = prevalary.shape
       print ("rows, cols, "+str(rows)+str(cols))
       keysary = []
       for ii in range(rows):
         keysary.append('"'+keylabel+'":"'+keyvalue+'"')     
       npkeysary = np.array(keysary).reshape((len(keysary),1))    
       prevalary = np.hstack((prevalary,npkeysary))  
       
     dir = Configure.get_ouput_dir() + "/" + self.getId()
     filename = self.getTypename()+datetime.datetime.now().strftime('%Y%m%d%H%M%S')+".json"   
     Storage.write_array_result(p_dir=dir, p_file_name=filename, p_contents=prevalary.tolist(), p_prefix="{", p_suffix="}", p_seperator=",")
Example #27
 def __init__(self, p_scenario):
   Scenario.__init__(self, p_scenario=p_scenario)
   self._driver = Configure.get_chrome_webdriver()
Example #28
 def deletedoc(p_indice, p_type, p_id):
     response = Configure.get_es_client().request(
         "DELETE", "http://test-mhis-service.pingan.com.cn/elasticsearch/" +
         p_indice + "/" + p_type + "/" + p_id)
     print(response.status)
     return response.status