def crawler(self):
    hasUrl = True
    while hasUrl:
        target = self.getTarget()
        # Check whether all targets have been consumed
        if target != "":
            # Pre-login handling
            self.crawler_run(target)
        else:
            hasUrl = False
            baseUtil.log("Crawler is over!")
def onStartBtnClick(self):
    photoId = self.picIdEdit.text().strip()
    outputPath = BaseUtil.ensureXlSuffix(self.outputDesEdit.text().strip())
    if photoId == "" or outputPath == "":
        self.mainWindow.popUpMessageBox(False, "Photo ID and output path must not be empty")
        return
    self.loading.show()
    self.setEnabled(False)
    self.ctrller.setPhotoId(photoId)
    self.ctrller.setOutputPath(outputPath)
    self.scrape_thread.start()
def start2Scrape(self):
    driver = webdriver.PhantomJS(executable_path=ScrapeUtil.phantomjsUrl)
    driver.get(ScrapeUtil.getPhotoPageUrl(self.photoId))
    try:
        element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.ID, "content")))
    except Exception:
        print("Oops!", sys.exc_info()[0], "occurred.")
    finally:
        pageSource = driver.page_source
        driver.close()
    bsObj = ScrapeUtil.getBSObjByHTML(pageSource)
    totalLikedNum = int(bsObj.find(class_="v2_new_fav with_count")
                        .find("span", {"class": "value"}).get_text())
    photoName = bsObj.find(id="content").h2.get_text()
    users = []
    if totalLikedNum > 0:
        numOfPages = math.ceil(totalLikedNum / self.sizeOfOnePage)
        print(totalLikedNum)
        print(numOfPages)
        self.addUsersPages(users, self.photoId, numOfPages, self.sizeOfOnePage)
        print("length is", len(users))
    """
    # Earlier CSV output, kept for reference:
    with open('./test.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['PhotoName', 'Num', 'NickNames', 'Links'])
        if totalLikedNum == 0:
            writer.writerow([photoName, totalLikedNum])
        else:
            for i in range(len(users)):
                if i == 0:
                    writer.writerow([photoName, totalLikedNum, users[i].nickName,
                                     ScrapeUtil.getUserPageUrlTpl(users[i].userId)])
                if i > 0:
                    writer.writerow(['', '', users[i].nickName,
                                     ScrapeUtil.getUserPageUrlTpl(users[i].userId)])
    """
    path = self.outputPath
    if os.path.exists(path):
        os.remove(path)
    mywb = openpyxl.Workbook()
    mysheet = mywb['Sheet']
    if totalLikedNum == 0:
        BaseUtil.writerow2xl(mysheet, 1, [photoName, totalLikedNum])
    else:
        for i in range(len(users)):
            j = i + 1
            if j == 1:
                BaseUtil.writerow2xl(mysheet, j, [photoName, totalLikedNum,
                                                  users[i].nickName,
                                                  ScrapeUtil.getUserPageUrlTpl(users[i].userId)])
            else:
                BaseUtil.writerow2xl(mysheet, j, ['', '', users[i].nickName,
                                                  ScrapeUtil.getUserPageUrlTpl(users[i].userId)])
            mysheet.cell(row=j, column=3).hyperlink = ScrapeUtil.getUserPageUrlTpl(users[i].userId)
    mywb.save(path)
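# The xlsx export above relies on BaseUtil.writerow2xl, which is not shown
# in this file. A minimal sketch of what such a helper could look like with
# openpyxl -- the row/column layout is an assumption inferred from how it
# is called above (one call writes one row, columns starting at 1):

def writerow2xl(sheet, row, values):
    """Write a list of values into `row` of `sheet`, one value per column."""
    for col, value in enumerate(values, start=1):
        sheet.cell(row=row, column=col, value=value)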
def sleep(seconds):
    BaseUtil.msg("sleep %d seconds" % seconds)
    DatetimeObj.sleep(seconds)
def analysisPageInfo(self, target, url):
    info_rule = target.get("data_rule").get("info_rule")
    baseUtil.log("Load url")
    try:
        baseUtil.driver.get(url)
    except Exception:
        baseUtil.log("Load timed out, will return!")
        return None
    baseUtil.log("Loaded url")
    time.sleep(6)

    # Comment scraping - load comment data START
    comment_rule = target.get("comment_rule")
    if int(comment_rule.get("need_comment")) == 1:
        if int(comment_rule.get("comment_page_type")) == 2:
            for i in xrange(int(comment_rule.get("max_comment_page"))):
                # If the "load more" button exists, click it; otherwise scroll
                try:
                    if comment_rule.get('comment_page_key') and len(
                            baseUtil.driver.find_elements_by_css_selector(
                                comment_rule.get("comment_page_key"))) == 0:
                        js = "var p = document.body.scrollTop=100000"
                        baseUtil.driver.execute_script(js)
                        baseUtil.log("Auto comment scrollTop")
                    else:
                        baseUtil.driver.find_element_by_css_selector(
                            comment_rule.get("comment_page_key")).click()
                        baseUtil.log("Click comment scrollTop")
                except Exception:
                    baseUtil.log("No click target found for " +
                                 comment_rule.get('comment_page_key'))
                time.sleep(2)
    else:
        baseUtil.log("Skip comments!")
    # Comment scraping - load comment data END

    html = baseUtil.driver.page_source
    doc = pq(html)
    save_data = {}
    save_data['href'] = url
    save_data['crawler_time'] = int(time.time())
    save_data['site_name'] = target.get('target_name')
    save_data['site_id'] = target.get('id')
    save_data['source_type'] = target.get('source_type')

    # Field extraction
    for k, v in info_rule.items():
        k_v = None
        k_v_find = doc.find(v.get("find"))
        if v.get("getType") == 'text':
            k_v = k_v_find.text()
        elif v.get("getType") == 'attr':
            if v.get("attr_key"):
                k_v = k_v_find.attr(v.get("attr_key"))
            else:
                baseUtil.log("[ERROR]: data rule is wrong, target is: " +
                             target.get("target_url"))
        elif v.get("getType") == 'html':
            k_v = k_v_find.html()
        if v.get("formart") == 'time_0' and k_v != '':
            k_v = time.mktime(time.strptime(k_v, "%Y-%m-%d %H:%M"))
        elif v.get("formart") == 'time_1' and k_v != '':
            k_v = time.mktime(time.strptime(k_v, "%Y-%m-%d %H:%M:%S"))
        elif v.get("formart") == 'time_2' and k_v != '':
            k_v = time.mktime(time.strptime(k_v, "%Y-%m-%d"))
        elif v.get("formart") == 'time_3' and k_v != '':
            k_v = int(k_v) / 1000
        elif v.get("formart") == 'time_4' and k_v != '':
            k_v = k_v.replace('年', '-').replace('月', '-').replace('日', '')
            k_v = time.mktime(time.strptime(k_v, "%Y-%m-%d %H:%M:%S"))
        elif v.get("formart") == 'time_5' and k_v != '':
            k_v = time.mktime(time.strptime(k_v, "%Y/%m/%d %H:%M"))
        elif v.get("formart") == 'time_6' and k_v != '':
            k_v = k_v.replace('年', '-').replace('月', '-').replace('日', '')
            k_v = time.mktime(time.strptime(k_v, "%Y-%m-%d%H:%M"))
        elif v.get("formart") == 'str_int' and k_v != '':
            k_v = filter(lambda ch: ch in '0123456789.', k_v)
        elif v.get("formart") == 'formart_url' and k_v != '':
            k_v = baseUtil.formartURL(target.get("target_url"), k_v)
        save_data[k] = k_v

    # Comment field extraction
    save_data['comments'] = []
    if int(comment_rule.get("need_comment")) == 1:
        comment_lines = doc.find(comment_rule.get('comment_line'))
        for comment_line in comment_lines.items():
            comment_line_data = {}
            for k, comment in comment_rule.get("info_rule").items():
                if k and comment.get("find"):
                    comment_find = comment_line.find(comment.get("find"))
                    c_v = None
                    if comment.get("getType") == 'text':
                        c_v = comment_find.text()
                    elif comment.get("getType") == 'attr':
                        if comment.get("attr_key"):
                            c_v = comment_find.attr(comment.get("attr_key"))
                        else:
                            baseUtil.log("[ERROR]: comment rule is wrong, target is: " +
                                         target.get("target_url"))
                    elif comment.get("getType") == 'html':
                        c_v = comment_find.html()
                    if comment.get("formart") == 'formart_url' and c_v != '':
                        c_v = baseUtil.formartURL(target.get("target_url"), c_v)
                    comment_line_data[k] = c_v
            comment_line_data['crawler_time'] = time.time()
            save_data['comments'].append(comment_line_data)

    # .get: a rule set without a 'content' field must not raise a KeyError
    if save_data.get('content'):
        save_data['create_time'] = save_data.get('create_time') and int(
            save_data.get('create_time')) or int(time.time())
        baseUtil.saveData(target, save_data)
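# For reference, analysisPageInfo expects `target` to carry nested rule
# dicts. A hypothetical minimal example -- all selectors and field names
# below are invented for illustration; real rules depend on the site:
#
# target = {
#     'id': 1,
#     'target_name': 'example-news',
#     'target_url': 'http://news.example.com',
#     'source_type': 'news',
#     'data_rule': {
#         'info_rule': {
#             # each field: a pyquery selector, how to read it, and an
#             # optional post-format ('time_0'..'time_6', 'str_int',
#             # 'formart_url')
#             'title':       {'find': 'h1.title',    'getType': 'text'},
#             'content':     {'find': 'div.article', 'getType': 'html'},
#             'create_time': {'find': 'span.date',   'getType': 'text',
#                             'formart': 'time_1'},
#         },
#     },
#     'comment_rule': {
#         'need_comment': '1',
#         'comment_page_type': '2',     # 2 = scroll/click to load more
#         'max_comment_page': '3',
#         'comment_page_key': 'a.load-more',
#         'comment_line': 'div.comment-item',
#         'info_rule': {
#             'author': {'find': 'span.user', 'getType': 'text'},
#             'text':   {'find': 'p.body',    'getType': 'text'},
#         },
#     },
# }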
def crawler_run(self, target):
    baseUtil.simulationLogin(target['headers'], target['login_info'])
    self.getAnalysisHtml(target)
def getAnalysisHtml(self, target):
    baseUtil.log("Analysis begin...")
    info_urls = []
    # Page types 1 and 2 load more items by scrolling/clicking; the page
    # source is extracted once after all pages have been loaded
    if int(target.get("page_type")) == 1 or int(target.get("page_type")) == 2:
        url_params_str = baseUtil.formartParamsToUrl(target.get('url_params'))
        print target.get("target_url")
        baseUtil.driver.get(target.get("target_url") + url_params_str)
        time.sleep(16)
        baseUtil.log("Get html...")
        try:
            for i in xrange(target.get('max_crawler_page')):
                # Page type 1 has no "load more" button, so just scroll
                if int(target.get("page_type")) == 1:
                    js = "var p = document.body.scrollTop=100000"
                    baseUtil.driver.execute_script(js)
                    baseUtil.log("Auto scrollTop")
                else:
                    # If the button exists, click it; otherwise scroll
                    if len(baseUtil.driver.find_elements_by_css_selector(
                            target.get("page_params_key"))) == 0:
                        js = "var p = document.body.scrollTop=100000"
                        baseUtil.driver.execute_script(js)
                        baseUtil.log("Auto scrollTop")
                    else:
                        baseUtil.driver.find_element_by_css_selector(
                            target.get("page_params_key")).click()
                        baseUtil.log("Click scrollTop")
                time.sleep(6)
        except Exception:
            baseUtil.log('[ERROR] scroll selector not found')
        # Extract the page source after pagination has been loaded
        html = baseUtil.driver.page_source
        doc = pq(html)
        if target.get('data_rule').get('get_info_address_type') == 'tag_href':
            datas_href_items = doc.find(
                target.get('data_rule').get('get_info_address_value'))
            for data_href in datas_href_items.items():
                info_urls.append(data_href.attr("href"))
    # Page type 0 paginates via a URL parameter, so each page is loaded
    # and its source extracted separately
    elif int(target.get("page_type")) == 0:
        url_params = target.get('url_params')
        for i in xrange(target.get('max_crawler_page')):
            url_params[target.get('page_params_key')] = i
            url_params_str = baseUtil.formartParamsToUrl(url_params)
            baseUtil.driver.get(target.get("target_url") + url_params_str)
            time.sleep(6)
            baseUtil.log("Get html...")
            html = baseUtil.driver.page_source
            doc = pq(html)
            if target.get('data_rule').get('get_info_address_type') == 'tag_href':
                datas_href_items = doc.find(
                    target.get('data_rule').get('get_info_address_value'))
                for data_href in datas_href_items.items():
                    info_urls.append(data_href.attr("href"))

    for i in xrange(0, len(info_urls)):
        info_url = info_urls[i]
        host = baseUtil.getHostInURL(target.get("target_url"))
        if "http" in info_url:
            info_url_host = baseUtil.getHostInURL(info_url)
            # Skip external links. The original compared info_url_host
            # against itself, which never skipped anything; comparing
            # against the target host restores the intended check.
            if info_url_host.find(host) == -1:
                continue
        elif info_url[:2] == '//':
            info_url = "http:" + info_url
        elif info_url[:1] == '/':
            info_url = "http://" + host + info_url
        else:
            info_url = "http://" + info_url
        baseUtil.log("target_url:" + info_url)
        if baseUtil.checkRepeat(info_url):
            baseUtil.log("Passed checkRepeat")
            self.analysisPageInfo(target, info_url)
            time.sleep(2)
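# getAnalysisHtml distinguishes the pagination styles via page_type. A
# hypothetical list-page config illustrating the fields it reads (the
# values are invented for illustration):
#
# target = {
#     'target_url': 'http://news.example.com/list',
#     'page_type': 0,                # 0 = paginate via a URL parameter
#     'page_params_key': 'page',     # for types 1/2 this is a CSS selector
#     'url_params': {'category': 'tech'},
#     'max_crawler_page': 5,
#     'data_rule': {
#         'get_info_address_type': 'tag_href',
#         'get_info_address_value': 'ul.list a.item',
#     },
# }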
#!/usr/bin/python
# To enable debug => set debug=True in BaseUtil
import os
import sys

import BaseUtil
from BaseUtil import msg
from BaseUtil import error
from BaseUtil import os_cmd
from AWS import AWSResourceManager
from AWS import add_sudoer

BaseUtil.set_debug(True)

mgr = AWSResourceManager('us-east-1')
instance = mgr.start_instance('i-ba73ff54')
ip_address = instance.ip_address

# Poll until the port answers. The original looped forever (while True),
# which made the failure branch below unreachable; a bounded retry count
# (the value 30 is an assumption) restores it.
for attempt in range(30):
    (s, o) = os_cmd("checkport.py " + ip_address + " 12345")
    if s == 0:
        break
    BaseUtil.sleep(20)

if s > 0:
    # The original message is truncated in the source at this point.
    msg("The amazon VM is not reachable - please make sure AWS instances are running and "
        "reachable before retrying.")
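# os_cmd is imported from BaseUtil but not defined here. Based on how it
# is used above -- returning a (status, output) tuple -- a plausible
# standard-library sketch (the signature is an assumption):

import subprocess

def os_cmd(cmd):
    """Run a shell command; return (exit_status, combined_output)."""
    proc = subprocess.Popen(cmd, shell=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)
    output, _ = proc.communicate()
    return (proc.returncode, output)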