Example #1
    def _analyze_html(self, url, html):
        # Extract candidate links from the fetched page, filter and
        # normalize them, then queue the new ones for crawling.
        domain = self._site["domain"]
        d_config = self._site["config"]
        xpath = expath.XPath(url, html, code=d_config["default_code"])
        links = xpath.pick(link_config)  # link_config: module-level extraction rules
        if self._site["new_depth"] > d_config["max_depth"]:
            return "too deep: %d" % self._site["new_depth"]

        link_infos = []
        check_list = []
        for link in links:
            if not "href" in link or not link["href"]:
                continue
            link = link["href"].lower()
            link_domain = link.split("/")[2]
            if d_config["only_insite"] and link_domain != domain:
                continue
            if self._filter_link(link):
                continue
            link = self.modify_link(link)
            type_name = self._get_page_type(link)
            page_type = CONFIG.G_SITE_COMMON.G_PAGETYPE[type_name]["type"]
            md5 = util.md5(link)
            link_infos.append({"md5": md5, "url": link, "type": page_type})
            check_list.append(md5)

        link_infos = util.del_duplicate(link_infos, "md5")
        if self._test:
            return link_infos
        count = self._insert2sql(link_infos, check_list)
        return "[NEWCount]:%d %s" % (count, url)
Example #2
def md5_test():
    # Cross-check util's MD5 helpers against md5sum and a known digest.
    print "==== md5"
    test_file = "/var/log/syslog"
    fmd5 = util.file_md5(test_file)
    # md5sum prints "<hash>  <file>"; awk keeps only the hash field
    cmd = "echo -n `md5sum %s|awk '{print $1}'`" % test_file
    cmd_ret = commands.getstatusoutput(cmd)
    if fmd5 != cmd_ret[1]:
        print "file_md5:", fmd5, cmd_ret[1]
    md5 = util.md5("12345")
    if md5 != "827ccb0eea8a706c4c34a16891f84e7b":
        print "md5:", md5, "827ccb0eea8a706c4c34a16891f84e7b"
Example #3
def init_url(url, sql_agent=None):
	"""
		when the link table empty, you can't use this to add a base url
		the spider will start crawl by it
		init_url.py 'base_url'
	"""
	if sql_agent is None:
		sql_agent = sql.Sql(CONFIG.G_MYSQL, CONFIG.G_MAINDB)
	data = {}
	data["url"] = url
	data["md5"] = util.md5(url)
	data["depth"] = 0
	data["type"] = 0
	data["last_time"] = int(time.time())
	data["domain"] = url.split("/")[2]
	ret = sql_agent.insert(CONFIG.G_TABLE_LINK["name"], data)
	sql_agent.commit()
	return (sql_agent, ret)
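
A hypothetical usage sketch (the second URL is illustrative only): seed the crawl once, then reuse the returned agent to avoid reconnecting.

sql_agent, ret = init_url("http://zhidao.baidu.com/")
sql_agent, ret = init_url("http://example.com/start", sql_agent=sql_agent)
print "seeds inserted:", ret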
Example #4
	def _deal_pick_ret(self, ret, url, md5, table_cfg):
		# Check whether the extracted content is already in the DB,
		# then insert or update accordingly.
		ret_count = 0
		insert_count = 0
		update_count = 0

		if isinstance(ret, list):
			# extraction returned a list of records
			ret_count = len(ret)
			check_list = []
			for item in ret:
				item["md5"] = util.md5(item["url"])
				check_list.append(item["md5"])
			check_ret = self._db_had(check_list, table_cfg)
			for data in ret:
				if data["md5"] in check_ret and self._work_as == "update":
					self._data2redis_sql(data, table_cfg, "update")
					update_count += 1
				else:
					self._data2redis_sql(data, table_cfg, "insert")
					insert_count += 1
		elif isinstance(ret, dict):
			# extraction returned a single record
			ret_count = 1
			ret["url"] = url
			ret["md5"] = md5
			if self._db_had({"md5":md5}, table_cfg):
				if self._work_as == "update":
					self._data2redis_sql(ret, table_cfg, "update")
					update_count += 1
			else:
				self._data2redis_sql(ret, table_cfg, "insert")
				insert_count += 1
		if ret:
			self._pick_state(md5, CONFIG.G_STATE_PICKED, CONFIG.G_TABLE_LINK)
		if self._work_as == "update":
			return (update_count, ret_count)
		else:
			return (insert_count, ret_count)
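
_deal_pick_ret relies on a _db_had helper whose contract is only implied here: given a list of md5 keys it returns those already stored, and given a single {"md5": ...} dict it is truthy when that row exists. A hypothetical sketch under those assumptions; self._sql_agent.select is assumed, not confirmed by the source:

	def _db_had(self, check, table_cfg):
		# hypothetical reconstruction; the project's real query layer is not shown
		table = table_cfg["name"]
		if isinstance(check, dict):
			return bool(self._sql_agent.select(table, {"md5": check["md5"]}))
		found = set()
		for key in check:
			if self._sql_agent.select(table, {"md5": key}):
				found.add(key)
		return found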
Example #5
#!/usr/bin/python
#coding=utf-8
import sys
import time
import config
from spiderlib import picker
from pylib import util
# Python 2 hack: make UTF-8 the default string codec
reload(sys)
sys.setdefaultencoding("utf-8")

#==============================================================
picker.CONFIG = config
if __name__ == "__main__":
	if len(sys.argv) > 1:
		# single-shot mode: run one pick directly against a known page
		if sys.argv[1] == "test":
			thread = picker.Picker(0, "test")
		else:
			thread = picker.Picker(0)
		#data = {"url":"http://zhidao.baidu.com/question/132366231.html"}
		data = {"url": "http://zhidao.baidu.com/question/1668804890558292187.html"}
		data["md5"] = util.md5(data["url"])
		data["type"] = 1
		print thread._run(data)
	else:
		# daemon mode: spawn one Picker thread per configured slot
		for index in range(0, config.G_MAX_PICKER_THREAD):
			thread = picker.Picker(index)
			thread.start()
			time.sleep(1)
		time.sleep(1)
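
One quirk of the launcher above: it starts the Picker threads and then simply sleeps. If Picker subclasses threading.Thread, as the start() calls suggest, a variant that blocks until the workers finish could look like this:

threads = [picker.Picker(index) for index in range(config.G_MAX_PICKER_THREAD)]
for thread in threads:
	thread.start()
for thread in threads:
	thread.join()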