Example #1
0
 def _update_state(self, state, data):
     """Record ``state`` on ``data`` and queue it as an "update".

     Does nothing when ``self._test`` is truthy. Otherwise ``data`` is
     modified in place (its "crawl_state" key is set) and then passed to
     ``sql.data2redis`` together with the table name taken from
     ``CONFIG.G_TABLE_LINK``.

     NOTE(review): "md5" is presumably the key column for the queued
     update -- confirm against ``sql.data2redis``.
     """
     if not self._test:
         data["crawl_state"] = state
         link_table = CONFIG.G_TABLE_LINK["name"]
         sql.data2redis(
             self._redis, CONFIG.G_SQL_QUEUE, link_table, "update", data, "md5"
         )
Example #2
0
	def _update_state(self, state, data):
		"""Record ``state`` on ``data`` and queue it as an "update".

		Does nothing when ``self._test`` is truthy. Otherwise ``data`` is
		modified in place (its "crawl_state" key is set) and then passed to
		``sql.data2redis`` together with the table name taken from
		``CONFIG.G_TABLE_LINK``.

		NOTE(review): "md5" is presumably the key column for the queued
		update -- confirm against ``sql.data2redis``.
		"""
		if not self._test:
			data["crawl_state"] = state
			link_table = CONFIG.G_TABLE_LINK["name"]
			sql.data2redis(
				self._redis, CONFIG.G_SQL_QUEUE, link_table, "update", data, "md5"
			)
Example #3
0
	def _save_html(self, md5, html, d_config):
		"""Queue the page HTML for storage, as an insert or an update.

		Returns immediately when ``CONFIG.G_IFSAVE_HTML == False``. Otherwise
		``html`` is passed through ``spider.html2utf8`` with
		``d_config["config"]["default_code"]`` and queued via
		``sql.data2redis`` for the table described by ``CONFIG.G_TABLE_HTML``.
		The operation is "insert" when ``self._db_had`` returns a falsy value
		for this md5 and "update" otherwise.
		"""
		# Equality test kept on purpose: a value such as None does not compare
		# equal to False and therefore does not disable saving.
		if CONFIG.G_IFSAVE_HTML == False:
			return

		encoding = d_config["config"]["default_code"]
		converted_html = spider.html2utf8(html, encoding)
		record = {"md5": md5, "html": converted_html}

		table_cfg = CONFIG.G_TABLE_HTML
		table_name = table_cfg["name"]
		table_division = table_cfg["division"]

		already_stored = self._db_had(table_cfg, {"md5": md5})
		operation = "update" if already_stored else "insert"
		sql.data2redis(
			self._redis, CONFIG.G_SQL_QUEUE, table_name,
			operation, record, "md5", table_division
		)
Example #4
0
    def _save_html(self, md5, html, d_config):
        """Queue the page HTML for storage, as an insert or an update.

        Returns immediately when ``CONFIG.G_IFSAVE_HTML == False``. Otherwise
        ``html`` is passed through ``spider.html2utf8`` with
        ``d_config["config"]["default_code"]`` and queued via
        ``sql.data2redis`` for the table described by ``CONFIG.G_TABLE_HTML``.
        The operation is "insert" when ``self._db_had`` returns a falsy value
        for this md5 and "update" otherwise.
        """
        # Equality test kept on purpose: a value such as None does not compare
        # equal to False and therefore does not disable saving.
        if CONFIG.G_IFSAVE_HTML == False:
            return

        encoding = d_config["config"]["default_code"]
        converted_html = spider.html2utf8(html, encoding)
        record = {"md5": md5, "html": converted_html}

        table_cfg = CONFIG.G_TABLE_HTML
        table_name = table_cfg["name"]
        table_division = table_cfg["division"]

        already_stored = self._db_had(table_cfg, {"md5": md5})
        operation = "update" if already_stored else "insert"
        sql.data2redis(
            self._redis, CONFIG.G_SQL_QUEUE, table_name,
            operation, record, "md5", table_division
        )
Example #5
0
    def _save_html(self, md5, html):
        """Queue the page HTML for storage, as an insert or an update.

        Nothing is queued when any of these hold:
          * ``self._test`` is truthy;
          * ``CONFIG.G_IFSAVE_HTML == False``;
          * ``CONFIG.G_IFSAVE_PASS == False`` and the current task type
            (``self._site["task"]["type"]``) differs from the configured
            "detail" page type.

        Otherwise ``html`` is passed through ``spider.html2utf8`` with the
        site's ``default_code`` and queued via ``sql.data2redis`` for the
        table described by ``CONFIG.G_TABLE_HTML``: "insert" when
        ``self._db_had`` returns a falsy value for this md5, "update"
        otherwise.
        """
        # Equality tests against False are kept as-is: values such as None do
        # not compare equal to False and so do not trigger these guards.
        if self._test or CONFIG.G_IFSAVE_HTML == False:
            return
        if CONFIG.G_IFSAVE_PASS == False:
            page_type = self._site["task"]["type"]
            if page_type != CONFIG.G_SITE_COMMON.G_PAGETYPE["detail"]["type"]:
                return

        encoding = self._site["config"]["default_code"]
        converted_html = spider.html2utf8(html, encoding)
        record = {"md5": md5, "html": converted_html}

        table_cfg = CONFIG.G_TABLE_HTML
        table_name = table_cfg["name"]
        table_division = table_cfg["division"]

        already_stored = self._db_had(table_cfg, {"md5": md5})
        operation = "update" if already_stored else "insert"
        sql.data2redis(
            self._redis, CONFIG.G_SQL_QUEUE, table_name,
            operation, record, "md5", table_division
        )
Example #6
0
    def _insert2sql(self, links, check_list):
        """Queue an "insert" for every link whose md5 is not in ``_db_had``'s result.

        Args:
            links: sized iterable of dicts, each carrying an "md5" key. Every
                item that gets queued is modified in place: "depth", "domain"
                and "last_time" are set on it first.
            check_list: passed unchanged to ``self._db_had`` together with
                ``CONFIG.G_TABLE_LINK``.

        Returns:
            The number of links queued; 0 when ``self._test`` is truthy.
        """
        if self._test:
            return 0

        db_had = self._db_had(CONFIG.G_TABLE_LINK, check_list)
        # One timestamp is shared by every link queued in this call.
        last_time = time.time()
        table = CONFIG.G_TABLE_LINK["name"]
        division = CONFIG.G_TABLE_LINK["division"]

        # Single-argument parenthesised print: emits exactly what the former
        # `print "GET LINK: ", len(links)` statement did on Python 2 (note the
        # double space) and, unlike that statement, also parses on Python 3.
        print("GET LINK:  %d" % len(links))

        count = 0
        for item in links:
            if item["md5"] in db_had:
                continue

            item["depth"] = self._site["new_depth"]
            item["domain"] = self._site["domain"]
            item["last_time"] = last_time
            sql.data2redis(
                self._redis, CONFIG.G_SQL_QUEUE, table,
                "insert", item, "md5", division
            )
            count += 1
        return count
Example #7
0
	def _save_html(self, md5, html):
		"""Queue the page HTML for storage, as an insert or an update.

		Nothing is queued when any of these hold:
		  * ``self._test`` is truthy;
		  * ``CONFIG.G_IFSAVE_HTML == False``;
		  * ``CONFIG.G_IFSAVE_PASS == False`` and the current task type
		    (``self._site["task"]["type"]``) differs from the configured
		    "detail" page type.

		Otherwise ``html`` is passed through ``spider.html2utf8`` with the
		site's ``default_code`` and queued via ``sql.data2redis`` for the
		table described by ``CONFIG.G_TABLE_HTML``: "insert" when
		``self._db_had`` returns a falsy value for this md5, "update"
		otherwise.
		"""
		# Equality tests against False are kept as-is: values such as None do
		# not compare equal to False and so do not trigger these guards.
		if self._test or CONFIG.G_IFSAVE_HTML == False:
			return
		if CONFIG.G_IFSAVE_PASS == False:
			page_type = self._site["task"]["type"]
			if page_type != CONFIG.G_SITE_COMMON.G_PAGETYPE["detail"]["type"]:
				return

		encoding = self._site["config"]["default_code"]
		converted_html = spider.html2utf8(html, encoding)
		record = {"md5": md5, "html": converted_html}

		table_cfg = CONFIG.G_TABLE_HTML
		table_name = table_cfg["name"]
		table_division = table_cfg["division"]

		already_stored = self._db_had(table_cfg, {"md5": md5})
		operation = "update" if already_stored else "insert"
		sql.data2redis(
			self._redis, CONFIG.G_SQL_QUEUE, table_name,
			operation, record, "md5", table_division
		)
Example #8
0
	def _insert2sql(self, links, check_list):
		"""Queue an "insert" for every link whose md5 is not in ``_db_had``'s result.

		Args:
			links: sized iterable of dicts, each carrying an "md5" key. Every
				item that gets queued is modified in place: "depth", "domain"
				and "last_time" are set on it first.
			check_list: passed unchanged to ``self._db_had`` together with
				``CONFIG.G_TABLE_LINK``.

		Returns:
			The number of links queued; 0 when ``self._test`` is truthy.
		"""
		if self._test:
			return 0

		db_had = self._db_had(CONFIG.G_TABLE_LINK, check_list)
		# One timestamp is shared by every link queued in this call.
		last_time = time.time()
		table = CONFIG.G_TABLE_LINK["name"]
		division = CONFIG.G_TABLE_LINK["division"]

		# Single-argument parenthesised print: emits exactly what the former
		# `print "GET LINK: ", len(links)` statement did on Python 2 (note the
		# double space) and, unlike that statement, also parses on Python 3.
		print("GET LINK:  %d" % len(links))

		count = 0
		for item in links:
			if item["md5"] in db_had:
				continue

			item["depth"] = self._site["new_depth"]
			item["domain"] = self._site["domain"]
			item["last_time"] = last_time
			sql.data2redis(
				self._redis, CONFIG.G_SQL_QUEUE, table,
				"insert", item, "md5", division
			)
			count += 1
		return count
Example #9
0
	def _data2redis_sql(self, sqldata, table_cfg, op_type):
		"""Forward ``sqldata`` to ``sql.data2redis`` for the table in ``table_cfg``.

		``table_cfg`` must provide the keys "name" and "division"; ``op_type``
		is passed through unchanged as the operation argument.

		NOTE(review): "md5" is presumably the key column -- confirm against
		``sql.data2redis``.
		"""
		table_name, table_division = table_cfg["name"], table_cfg["division"]
		sql.data2redis(
			self._redis, CONFIG.G_SQL_QUEUE, table_name,
			op_type, sqldata, "md5", table_division
		)
Example #10
0
 def _data2redis_sql(self, sqldata, table_cfg, op_type):
     """Forward ``sqldata`` to ``sql.data2redis`` for the table in ``table_cfg``.

     ``table_cfg`` must provide the keys "name" and "division"; ``op_type``
     is passed through unchanged as the operation argument.

     NOTE(review): "md5" is presumably the key column -- confirm against
     ``sql.data2redis``.
     """
     table_name, table_division = table_cfg["name"], table_cfg["division"]
     sql.data2redis(
         self._redis, CONFIG.G_SQL_QUEUE, table_name,
         op_type, sqldata, "md5", table_division
     )