コード例 #1
0
ファイル: net.py プロジェクト: alei76/spiderlib
def get(url, heads=None, encode=False, timeout=30, use_proxy=0, d_config=None):
	"""Fetch ``url`` through ``_get`` and optionally post-process the body.

	The domain handed to ``_get`` is the third ``/``-separated piece of
	``url`` (the host in a ``scheme://host/path`` address).

	Args:
		url: Address to fetch.
		heads: Forwarded to ``_get`` unchanged.
		encode: When equal to ``True`` and ``d_config`` is truthy, the body
			returned by ``_get`` is passed through ``spider.html2utf8``
			together with ``d_config["default_code"]``.
		timeout: Forwarded to ``_get`` unchanged.
		use_proxy: Forwarded to ``_get`` unchanged.
		d_config: Optional mapping; forwarded to ``_get`` when truthy.

	Returns:
		``(-2, "URL error :" + url)`` when ``url`` has fewer than three
		``/``-separated pieces; otherwise the value returned by ``_get``,
		with its second element replaced by the ``html2utf8`` result when
		encoding was requested.
	"""
	pieces = url.split("/")
	# The domain is read from index 2, so anything shorter (no slash at all,
	# or e.g. "a/b") is reported as a URL error instead of raising IndexError.
	if len(pieces) < 3:
		return (-2, "URL error :" + url)
	domain = pieces[2]

	if not d_config:
		return _get(url, domain, heads, timeout, use_proxy)

	result = _get(url, domain, heads, timeout, use_proxy, d_config)
	# Equality with True is kept on purpose so that existing callers see the
	# same behaviour for non-boolean ``encode`` values.
	if encode == True:
		(info, html) = result
		result = (info, spider.html2utf8(html, d_config["default_code"]))
	return result
コード例 #2
0
ファイル: picker.py プロジェクト: astraylinux/spiderlib
	def _save_html(self, md5, html, d_config):
		"""Queue the page HTML for storage, keyed by ``md5``.

		Does nothing when ``CONFIG.G_IFSAVE_HTML`` equals ``False``.
		Otherwise ``html`` is passed through ``spider.html2utf8`` with the
		site's ``default_code`` and handed to ``sql.data2redis`` as an
		"insert" when ``self._db_had`` reports no row for this ``md5``, or
		as an "update" when it does.
		"""
		if CONFIG.G_IFSAVE_HTML == False:
			return

		converted = spider.html2utf8(html, d_config["config"]["default_code"])
		record = {"md5": md5, "html": converted}
		table_name = CONFIG.G_TABLE_HTML["name"]
		table_division = CONFIG.G_TABLE_HTML["division"]

		# Same queue call either way; only the operation name differs.
		already_stored = self._db_had(CONFIG.G_TABLE_HTML, {"md5": md5})
		operation = "update" if already_stored else "insert"
		sql.data2redis(
			self._redis, CONFIG.G_SQL_QUEUE, table_name,
			operation, record, "md5", table_division)
コード例 #3
0
ファイル: picker.py プロジェクト: alei76/spiderlib
    def _save_html(self, md5, html, d_config):
        """Queue the page HTML for storage, keyed by ``md5``.

        Returns immediately when ``CONFIG.G_IFSAVE_HTML`` equals ``False``.
        Otherwise the HTML goes through ``spider.html2utf8`` using
        ``d_config["config"]["default_code"]`` and is pushed via
        ``sql.data2redis``: "insert" if ``self._db_had`` finds no row with
        this ``md5``, "update" otherwise.
        """
        if CONFIG.G_IFSAVE_HTML == False:
            return

        source_code = d_config["config"]["default_code"]
        payload = {"md5": md5, "html": spider.html2utf8(html, source_code)}
        name = CONFIG.G_TABLE_HTML["name"]
        split_rule = CONFIG.G_TABLE_HTML["division"]

        # Pick the operation first so the queue call is written only once.
        if self._db_had(CONFIG.G_TABLE_HTML, {"md5": md5}):
            action = "update"
        else:
            action = "insert"
        sql.data2redis(
            self._redis, CONFIG.G_SQL_QUEUE, name,
            action, payload, "md5", split_rule)
コード例 #4
0
    def _save_html(self, md5, html):
        if self._test:
            return
        if CONFIG.G_IFSAVE_HTML == False:
            return
        if CONFIG.G_IFSAVE_PASS == False and self._site["task"]["type"] !=\
          CONFIG.G_SITE_COMMON.G_PAGETYPE["detail"]["type"]:
            return

        dcode = self._site["config"]["default_code"]
        html = spider.html2utf8(html, dcode)
        item = {"md5": md5, "html": html}
        table = CONFIG.G_TABLE_HTML["name"]
        division = CONFIG.G_TABLE_HTML["division"]
        if not self._db_had(CONFIG.G_TABLE_HTML, {"md5": md5}):
            sql.data2redis(self._redis, CONFIG.G_SQL_QUEUE, table,\
             "insert", item, "md5", division)
        else:
            sql.data2redis(self._redis, CONFIG.G_SQL_QUEUE, table,\
             "update", item, "md5", division)
コード例 #5
0
ファイル: crawler.py プロジェクト: alei76/spiderlib
	def _save_html(self, md5, html):
		if self._test:
			return
		if CONFIG.G_IFSAVE_HTML == False:
			return
		if CONFIG.G_IFSAVE_PASS == False and self._site["task"]["type"] !=\
				CONFIG.G_SITE_COMMON.G_PAGETYPE["detail"]["type"]:
			return

		dcode = self._site["config"]["default_code"]
		html = spider.html2utf8(html, dcode)
		item = {"md5":md5, "html":html}
		table = CONFIG.G_TABLE_HTML["name"]
		division = CONFIG.G_TABLE_HTML["division"]
		if not self._db_had(CONFIG.G_TABLE_HTML, {"md5":md5}):
			sql.data2redis(self._redis, CONFIG.G_SQL_QUEUE, table,\
				"insert", item, "md5", division)
		else:
			sql.data2redis(self._redis, CONFIG.G_SQL_QUEUE, table,\
				"update", item, "md5", division)