def getHtml(self, url, referer="https://www.baidu.com/"):
    _result = ""
    try:
        my_dc = DesiredCapabilities.PHANTOMJS.copy()
        my_dc["browserName"] = "chrome"
        my_dc["platform"] = "mac"
        my_dc["version"] = "63.0.3239.84"
        my_dc["phantomjs.page.settings.loadImages"] = False
        my_dc["phantomjs.page.settings.userAgent"] = HtmlURLUtil.__USER_AGENT
        service_args = ["--load-images=false", "--disk-cache=false", "--ignore-ssl-errors=true"]
        # "--webdriver-logfile=webdriver.log", "--webdriver-loglevel=INFO"
        for head, value in self.headers.iteritems():
            my_dc["phantomjs.page.customHeaders.{}".format(head)] = value
        my_dc["phantomjs.page.customHeaders.Referer"] = referer
        self.driver = WebDriver(desired_capabilities=my_dc, service_args=service_args)
        self.driver.set_script_timeout(20)
        self.driver.set_page_load_timeout(30)
        self.driver.implicitly_wait(5)
        self.driver.set_window_size(2560, 1066)
        self.driver.get(url)
        # Save a screenshot of the page
        # self.driver.save_screenshot(md5_util.md5(url) + ".png")
        _result = self.driver.page_source
    except:
        log.getLogger().exception("HtmlURLUtil getHtml error...")
        # self.driver.close()
        self.driver.quit()
    return _result
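# A minimal usage sketch for getHtml above: fetch a page through the headless PhantomJS
# driver and release it afterwards. It assumes HtmlURLUtil is importable from
# crawler.util.html_util (as in the script entry point further below) and that
# closeWebDriver() quits the driver, as saveUrlTableByUrl relies on; the URL is
# only an illustration.
from crawler.util.html_util import HtmlURLUtil

html_util = HtmlURLUtil()
try:
    page_source = html_util.getHtml("http://example.com/", referer="https://www.baidu.com/")
    print html_util.driver.title  # the driver stays open, so the loaded page can still be inspected
finally:
    html_util.closeWebDriver()  # always release the PhantomJS process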
def run(self):
    while True:
        try:
            self.crawler.action()
            time.sleep(30)
        except:
            self.crawler.mysql.close()
            log.getLogger().exception("MyCrawlerThread exception...")
def action(self):
    try:
        os.system(self.action_str)
    except:
        log.getLogger().exception("action impl ....action_str:%s" % self.action_str)
        raise ConsumberException("consumer action exception action_str=%s" % self.action_str)
def close(self):
    """
    Close the database connection
    :return:
    """
    try:
        self.connection.close()
    except:
        log.getLogger().exception("close exception .....")
def excuteNotCommit(self, query, params=None):
    """
    Execute a statement without committing; commit/rollback is left to the caller
    :param query:
    :param params: same parameter convention as queryDict is recommended
    :return:
    """
    try:
        cursor = self.connection.cursor()
        cursor.execute(query, params)
    except:
        log.getLogger().exception("excute not commit exception .....")
        self.connection.rollback()
def excuteManyNotCommit(self, query, params):
    """
    Execute a batch statement without committing; commit/rollback is left to the caller
    :param query:
    :param params: same parameter convention as queryDict is recommended
    :return:
    """
    try:
        cursor = self.connection.cursor()
        cursor.executemany(query, params)
    except:
        self.connection.rollback()
        log.getLogger().exception("excute many not commit exception .....")
def getTLD(self, url):
    """
    Get the domain of a URL
    :param url:
    :return:
    """
    try:
        if not url:
            return None
        web = urllib.splitquery(url)[0]
        return tld.get_tld(web)
    except:
        log.getLogger().exception("getTLD ...%s" % url)
        return None
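# A small illustration of getTLD: it strips the query string with urllib.splitquery
# and hands the rest to tld.get_tld. The exact return value depends on the installed
# tld version (older releases return the registered domain, newer ones only the
# suffix); the URL below is just an example.
from crawler.util.html_util import HtmlURLUtil

html_util = HtmlURLUtil()
domain = html_util.getTLD("http://news.example.com/list?page=2&sort=desc")
print domain  # e.g. "example.com" on the older tld releases this code targets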
def queryNotClose(self, query, params=None, cursor_class=DictCursor):
    """
    Parameterized query that leaves the connection open
    :param query:
    :param params: a dict is recommended for best readability
    :param cursor_class: cursor class, defaults to the dict cursor
    :return:
    """
    result = None
    try:
        cursor = self.connection.cursor(cursorclass=cursor_class)
        cursor.execute(query, params)
        result = cursor.fetchall()
    except:
        log.getLogger().exception("query dict exception .....")
    return result
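# A usage sketch for queryNotClose with named parameters, matching the %(name)s
# placeholder style used in saveUrlTableByUrl below. The helper class holding these
# MySQL methods is not shown here, so `mysql` stands for an instance of it; the table
# and column names are illustrative.
rows = mysql.queryNotClose(
    "select * from web_url_table where used=%(used)s limit %(limit)s",
    {"used": "0", "limit": 20})
for row in rows or []:  # the DictCursor returns each row as a dict
    print row["url"], row["title"]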
def writeWebContentToFile(self, webcontent, filepath):
    if not webcontent:
        return
    reload(sys)
    sys.setdefaultencoding("utf-8")
    f = None
    try:
        _dir = os.path.dirname(filepath)
        if not os.path.exists(_dir):
            os.makedirs(_dir)
        f = open(filepath, "w")
        f.write(webcontent)
        f.flush()
    except:
        log.getLogger().exception("htmlutil writeWebContentToFile ...")
    finally:
        if f:
            f.close()
def excuteCommit(self, query, params=None):
    """
    Execute a statement and commit; rolls back on error, then closes the cursor and connection
    :param query:
    :param params: same parameter convention as queryDict is recommended
    :return:
    """
    cursor = None
    try:
        cursor = self.connection.cursor()
        cursor.execute(query, params)
        self.connection.commit()
    except:
        self.connection.rollback()
        log.getLogger().exception("excute commit exception .....")
    finally:
        if cursor:
            cursor.close()
        self.connection.close()
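# A sketch of the transaction pattern these helpers support, mirroring MyCrawler.action
# below: several excuteNotCommit / excuteManyNotCommit calls are batched and committed
# explicitly. excuteCommit, by contrast, is meant for standalone statements, since it
# commits and then closes the connection itself. `mysql` again stands for an instance of
# the helper class (not shown here) and the statements are illustrative.
mysql.excuteNotCommit(
    "update seed_url_table set crawler_num = crawler_num + 1 where id=%s", [42])
mysql.excuteManyNotCommit(
    "insert into web_url_table (`url`,`md5`) VALUES (%(url)s,%(md5)s) ON DUPLICATE KEY UPDATE md5=md5",
    [{"url": "http://example.com/a", "md5": "..."},
     {"url": "http://example.com/b", "md5": "..."}])
mysql.connection.commit()  # nothing is visible to other sessions until this commit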
def mvFile(source_file, des_file):
    """
    Move source_file to des_file
    :param source_file:
    :param des_file:
    :return:
    """
    try:
        _dir = os.path.dirname(des_file)
        if not os.path.exists(_dir):
            os.makedirs(_dir)
        shutil.move(source_file, des_file)
        return True
    except:
        log.getLogger().exception("mvFile exception ...%s %s" % (source_file, des_file))
        return False
def action(self):
    """
    Main crawler entry point: saves the URLs to crawl into the web_url_table table
    and moves them into the action_queue table for the producer/consumer to pick up.
    New links are only fetched from the provided top_url when web_url_table has no links left.
    This method should be called from a scheduled job or a dedicated thread.
    :return:
    """
    # Check the seed table first and crawl from the seeds
    sql = """
        select * from seed_url_table where status=0
        ORDER by update_time desc,crawler_num ASC limit 0,%s FOR UPDATE
    """
    try:
        result = self.mysql.queryNotClose(sql, [self.url_num])
        if result:
            # Seeds exist: move them into web_url_table
            for seed in result:
                self.saveUrlTableByUrl(seed["url"], seed["content_type"], seed["title"], seed["referer"])
                update_crawler_num = """
                    update seed_url_table set crawler_num = crawler_num + 1,update_time=%s where id=%s
                """
                self.mysql.excuteNotCommit(update_crawler_num, [util.now(), seed["id"]])
        # Pick url_num entries from web_url_table and push them onto the queue
        sql = """
            select * from web_url_table where used=0 ORDER BY create_time desc limit 0,%s FOR UPDATE
        """
        result = self.mysql.queryNotClose(sql, [self.url_num])
        if result:
            self.parseToActionQueue(result)
        self.mysql.connection.commit()
    except:
        log.getLogger().exception("mycrawler action ...")
    finally:
        # self.mysql.close()
        pass
def mergeFile(file_path, new_file, del_old=True):
    """
    Merge all files in the same directory into one file
    :param file_path: the directory to merge (or a file whose sibling files are merged); rw permission required
    :param new_file: the new file produced by the merge
    :param del_old: whether to delete the files that were merged
    :return:
    """
    try:
        if os.path.isfile(file_path):
            file_path = os.path.dirname(file_path)
        if not os.path.isdir(file_path):
            raise Exception("Not a directory: %s, please check that the path is a directory" % file_path)
        if not os.access(file_path, os.W_OK):
            raise Exception("Directory %s is not writable, please check its permissions" % file_path)
        if os.path.isdir(new_file):
            raise Exception("Merge target %s is a directory, cannot merge into it" % new_file)
        pdir = os.path.dirname(new_file)
        if not os.path.exists(pdir):
            os.makedirs(pdir)
        files = os.listdir(file_path)
        newfile = file(new_file, "a")
        try:
            while len(files) >= 1:
                f = files.pop()
                if f.startswith("."):  # skip hidden files
                    continue
                f = file_path + os.sep + f
                if not os.path.isfile(f):  # skip directories
                    continue
                if not os.access(f, os.W_OK):  # skip files that are not writable
                    continue
                try:
                    _f = file(f, "r")
                    newfile.write(_f.read())
                    if del_old:
                        os.remove(f)  # delete after merging
                    else:
                        os.chmod(f, stat.S_IRUSR)  # make read-only so an already-merged file is not merged again
                except:
                    log.getLogger().exception("mergeFile ...")
                finally:
                    _f.close()
        except:
            log.getLogger().exception("mergeFile ...")
            return False
        finally:
            newfile.close()
    except:
        log.getLogger().exception("mergeFile ...")
        return False
    return True
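# Example use of mergeFile, assuming it runs in the same module as the definition
# above: combine the crawled page fragments from the tmp directory (the
# CRAWLER_SAVE_PATH + "tmp" layout used elsewhere in this project) into a single
# file. The paths are illustrative; pass del_old=False to keep the merged source
# files as read-only instead of deleting them.
import os
import base.config.common_config as common_config

tmp_dir = common_config.CRAWLER_SAVE_PATH + os.sep + "tmp"
merged = common_config.CRAWLER_SAVE_PATH + os.sep + "merged" + os.sep + "pages.txt"
if mergeFile(tmp_dir, merged, del_old=False):
    print "merged into %s" % merged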
def fail(self):
    log.getLogger().info("fail......")

def success(self):
    log.getLogger().info("success......")

def action(self):
    log.getLogger().info("action......action=%s,params=%s" % (self.action_str, self.params))
    raise Exception(self.action_str)
import os
import sys

import base.config.common_config as common_config
import crawler.util.uuid_util as myuuid
import log.common_log as log
from base.exception.consumer_exception import ConsumberException
from crawler.mycrawler import MyCrawler
from crawler.util.html_util import HtmlURLUtil

if __name__ == "__main__":
    """
    Crawl-url action, invoked by the consumer (ConsumerAction)
    sys.argv = [py_file_path, url, title, referer]
    """
    reload(sys)
    sys.setdefaultencoding("utf-8")
    log.getLogger().info(sys.argv)
    try:
        if not sys.argv or len(sys.argv) <= 3:
            raise ConsumberException("Please provide the url, title and referer to crawl")
        html_util = HtmlURLUtil()
        # url, title, referer, save_path
        url = sys.argv[1]
        title = sys.argv[2]
        referer = sys.argv[3]
        html_result = html_util.getHtml(url)
        my_crawler = MyCrawler()
        file_path = common_config.CRAWLER_SAVE_PATH + os.sep + "tmp" + os.sep + myuuid.getUUID().__str__()
def saveUrlTableByUrl(self, url, charset="utf-8", title=None, referer=None):
    """
    Crawl the url, collect all <a> tags on the page and
    save the results into the web_url_table table
    :param url:
    :param charset:
    :param title:
    :param referer:
    :return:
    """
    sql = """
        insert into web_url_table (`url`,`title`,`content_type`,`referer`,
        `hostname`,`params`,`md5`,`url_type`,`used`,`create_time`,`update_time`)
        VALUES (%(url)s,%(title)s,%(content_type)s,%(referer)s,
        %(hostname)s,%(params)s,%(md5)s,%(url_type)s,%(used)s,
        %(create_time)s,%(update_time)s) ON DUPLICATE KEY UPDATE md5=md5
    """
    params = []
    now = util.now()
    _md5 = md5(url)
    self.html_util = html_util = HtmlURLUtil()
    try:
        params.append({
            "url": url,
            "title": title,
            "content_type": charset,
            "referer": referer,
            "hostname": html_util.getTLD(url),
            "params": html_util.getSortQS(url),
            "md5": _md5,
            "url_type": "0",
            "used": "0",
            "create_time": now,
            "update_time": now
        })
        douban = html_util.getHtml(url)
        # Write the page content to a file
        file_path = common_config.CRAWLER_SAVE_PATH + os.sep + "tmp" + os.sep + myuuid.getUUID().__str__()
        self.appendContentToFile(url, title, url, douban, file_path)
        # Find all <a> tags on the page
        eles = html_util.getElementsByTagName("a")
        hsn = html_util.getTLD(url)
        _charset = html_util.getCharset(douban)
        if eles:
            for el in eles:
                sub_url = el.get_attribute('href')
                if not sub_url or sub_url.count("javascript"):
                    continue
                sub_md5 = md5(sub_url)
                if not util.dictListContains(params, "md5", sub_md5):
                    params.append({
                        "url": sub_url,
                        "title": html_util.driver.title,
                        "content_type": _charset,
                        "referer": url,
                        "hostname": html_util.getTLD(sub_url),
                        "params": str(html_util.getSortQS(sub_url)),
                        "md5": sub_md5,
                        "url_type": 0 if hsn == html_util.getTLD(sub_url) else 1,
                        "used": "0",
                        "create_time": now,
                        "update_time": now
                    })
        self.mysql.excuteManyNotCommit(sql, params)
    except:
        log.getLogger().exception("mycrawler saveUrlTableByUrl ...")
    finally:
        html_util.closeWebDriver()
    return params