def _analyze_html(self, url, html):
    """Extract links from a crawled page and queue the new ones."""
    domain = self._site["domain"]
    d_config = self._site["config"]
    xpath = expath.XPath(url, html, code=d_config["default_code"])
    links = xpath.pick(link_config)
    if self._site["new_depth"] > d_config["max_depth"]:
        return "too deep:", self._site["new_depth"]
    link_infos = []
    check_list = []
    for link in links:
        if "href" not in link or not link["href"]:
            continue
        link = link["href"].lower()
        link_domain = link.split("/")[2]
        if d_config["only_insite"] and link_domain != domain:
            continue
        if self._filter_link(link):
            continue
        link = self.modify_link(link)
        type_name = self._get_page_type(link)
        page_type = CONFIG.G_SITE_COMMON.G_PAGETYPE[type_name]["type"]
        md5 = util.md5(link)
        link_infos.append({"md5": md5, "url": link, "type": page_type})
        check_list.append(md5)
    link_infos = util.del_duplicate(link_infos, "md5")
    if self._test:
        return link_infos
    count = self._insert2sql(link_infos, check_list)
    return "[NEWCount]:%d %s" % (count, url)
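# util.del_duplicate above is assumed to drop repeated links by their "md5"
# key. A minimal sketch of such a helper (hypothetical; the real pylib.util
# implementation may differ): keep the first dict seen for each key value,
# preserving order.
def del_duplicate(items, key):
    seen = set()
    result = []
    for item in items:
        value = item.get(key)
        if value in seen:
            continue
        seen.add(value)
        result.append(item)
    return result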
def md5_test():
    print "==== md5"
    test_file = "/var/log/syslog"
    fmd5 = util.file_md5(test_file)
    cmd = "echo -n `md5sum /var/log/syslog|awk '{print $1}'`"
    cmd_ret = commands.getstatusoutput(cmd)
    if fmd5 != cmd_ret[1]:
        print "file_md5:", fmd5, cmd_ret[1]
    md5 = util.md5("12345")
    if md5 != "827ccb0eea8a706c4c34a16891f84e7b":
        print "md5:", md5, "827ccb0eea8a706c4c34a16891f84e7b"
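# md5_test above checks util.md5 and util.file_md5 against md5sum and a known
# digest. Hypothetical hashlib-based stand-ins for those helpers (the real
# pylib.util implementation is not shown here and may differ):
import hashlib

def md5(text):
    # Assumes a byte string, as in the Python 2 code above.
    return hashlib.md5(text).hexdigest()

def file_md5(path, chunk_size=1024 * 1024):
    # Hash the file in chunks so large files do not have to fit in memory.
    digest = hashlib.md5()
    with open(path, "rb") as handle:
        while True:
            chunk = handle.read(chunk_size)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()

# Sanity check matching the expected value used in md5_test:
# md5("12345") == "827ccb0eea8a706c4c34a16891f84e7b"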
def init_url(url, sql_agent=None):
    """When the link table is empty, use this to add a base URL;
    the spider will start crawling from it.

    Usage: init_url.py 'base_url'
    """
    if sql_agent is None:
        sql_agent = sql.Sql(CONFIG.G_MYSQL, CONFIG.G_MAINDB)
    data = {}
    data["url"] = url
    data["md5"] = util.md5(url)
    data["depth"] = 0
    data["type"] = 0
    data["last_time"] = int(time.time())
    data["domain"] = url.split("/")[2]
    ret = sql_agent.insert(CONFIG.G_TABLE_LINK["name"], data)
    sql_agent.commit()
    return (sql_agent, ret)
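# A minimal command-line wrapper matching the docstring of init_url
# (sketch only; the real init_url.py entry point may look different):
import sys

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print "usage: init_url.py 'base_url'"
        sys.exit(1)
    agent, inserted = init_url(sys.argv[1])
    print "inserted rows:", inserted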
def _deal_pick_ret(self, ret, url, md5, table_cfg):
    # Check whether the extracted content is already in the database,
    # then insert or update accordingly.
    ret_count = 0
    insert_count = 0
    update_count = 0
    if isinstance(ret, list):
        # The extraction result is a list of records.
        ret_count = len(ret)
        check_list = []
        for i in range(0, len(ret)):
            ret[i]["md5"] = util.md5(ret[i]["url"])
            check_list.append(ret[i]["md5"])
        check_ret = self._db_had(check_list, table_cfg)
        for data in ret:
            if data["md5"] in check_ret and self._work_as == "update":
                self._data2redis_sql(data, table_cfg, "update")
                update_count += 1
            else:
                self._data2redis_sql(data, table_cfg, "insert")
                insert_count += 1
    elif isinstance(ret, dict):
        # The extraction result is a single record.
        ret_count = 1
        ret["url"] = url
        ret["md5"] = md5
        if self._db_had({"md5": md5}, table_cfg):
            if self._work_as == "update":
                self._data2redis_sql(ret, table_cfg, "update")
                update_count += 1
        else:
            self._data2redis_sql(ret, table_cfg, "insert")
            insert_count += 1
    if ret:
        self._pick_state(md5, CONFIG.G_STATE_PICKED, CONFIG.G_TABLE_LINK)
    if self._work_as == "update":
        return (update_count, ret_count)
    else:
        return (insert_count, ret_count)
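# _deal_pick_ret relies on self._db_had to report which md5s already exist.
# A hypothetical sketch of such a lookup, assuming a sql_agent object with a
# select(table, fields, where) call in the spirit of the insert() used in
# init_url (the real spiderlib implementation may differ): for a list of md5s
# it returns the subset already present, for a {"md5": ...} dict a boolean.
def db_had(sql_agent, keys, table_cfg):
    if isinstance(keys, dict):
        rows = sql_agent.select(table_cfg["name"], ["md5"], {"md5": keys["md5"]})
        return bool(rows)
    found = set()
    for key in keys:
        rows = sql_agent.select(table_cfg["name"], ["md5"], {"md5": key})
        if rows:
            found.add(key)
    return found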
#!/usr/bin/python
#coding=utf-8
import sys
import time

import config
from spiderlib import picker
from pylib import util

reload(sys)
sys.setdefaultencoding("utf-8")

#==============================================================
picker.CONFIG = config

if __name__ == "__main__":
    if len(sys.argv) > 1:
        if sys.argv[1] == "test":
            thread = picker.Picker(0, "test")
        else:
            thread = picker.Picker(0)
        #data = {"url": "http://zhidao.baidu.com/question/132366231.html"}
        data = {"url": "http://zhidao.baidu.com/question/1668804890558292187.html"}
        data["md5"] = util.md5(data["url"])
        data["type"] = 1
        print thread._run(data)
    else:
        for index in range(0, config.G_MAX_PICKER_THREAD):
            thread = picker.Picker(index)
            thread.start()
            time.sleep(1)
        time.sleep(1)
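# A join-based variant of the launcher loop above, assuming picker.Picker is
# a threading.Thread subclass (it exposes start()); startup is still
# staggered, but the main thread then waits for the workers to finish
# instead of sleeping a fixed time.
threads = []
for index in range(0, config.G_MAX_PICKER_THREAD):
    worker = picker.Picker(index)
    worker.start()
    threads.append(worker)
    time.sleep(1)
for worker in threads:
    worker.join()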