Ejemplo n.º 1
0
def getGuid(url, cursor=None):

	parsed_url = urlparse.urlparse(url)
	netloc = parsed_url.netloc
	if netloc in ["blog.naver.com", "m.blog.naver.com"] or netloc.endswith(".blog.me"):
		return  handleNBUrl(url)
	elif netloc == "blog.daum.net":
		return checkDaumPost(parsed_url[2])
	elif netloc.endswith(".tistory.com"):
		return handleTistory(url)
	else:
		if cursor == None:
			try:
				DBHOST = "10.35.50.116"  
				(db, n_cursor) = getDBCursor(host=DBHOST, user='******', passwd='blogcrawler', db='blogdb') 
			except Exception, msg:
				return None
		else:
Ejemplo n.º 2
0
def getSiteData(url):
	from test_util import getSiteId
	from common_func import getDBCursor

	DBHOST = "bbsdb-mst.s0.crawl.web.search"
	(db, cursor) = getDBCursor(host=DBHOST, user='******', passwd='bbscrawler', db='bbsdb') 

	id = getSiteId(url, cursor)


	if id:

		print id, url
		query = "SELECT type, page, param FROM url_patterns where site_id = %s"
		cursor.execute(query, id)
		results = cursor.fetchall()

		if results:
			return  makeURLPattern(results)

	return None, None
Ejemplo n.º 3
0
	def __init__(self, db_cursor=None):
		"""Initialise the lookup tables, then run initPatterns() and initIndieData().

		db_cursor -- optional open DB cursor, stored on self.db_cursor for the
		init helpers (presumably they query through it; confirm in those
		methods). When omitted, a temporary connection is opened from the
		conf.GUID_GEN_DB_* settings and both its cursor and the connection
		are closed before the constructor returns, including when one of the
		init helpers raises.

		NOTE: in the self-opened case self.db_cursor keeps referring to the
		closed cursor afterwards, exactly as before.
		"""
		self.db_dict = dict()
		self.db_cursor = None
		self.db_table = conf.INDIE_BLOG_CHANNEL_TABLE

		db_con = None
		is_my_cursor = db_cursor is None
		if is_my_cursor:
			db_con, db_cursor = getDBCursor(host=conf.GUID_GEN_DB_HOST, user=conf.GUID_GEN_DB_USER, passwd=conf.GUID_GEN_DB_PWD, db=conf.GUID_GEN_DB_NAME)
		self.db_cursor = db_cursor

		self.blog_fam_list = ["naver","daum","tistory","blogspot","aladin","interpark","dreamwiz","joins","kyobo","chosun","jinbo","ohmynews","moneta","yes24","donga","indie"]
		self.key_list = ["guid","gen","ourl","curl","vurl","cid","pid","fam","trackback"]
		self.pt_dic = None # pattern dic (ex. pt_dic["SITE_NAME"]["PATTERN_NAME"])
		self.indie_data_dic = dict()
		# Placeholder tokens for the channel id / post id.
		self.cid_str = "##CID##"
		self.pid_str = "##PID##"

		try:
			self.initPatterns()
			self.initIndieData()
		finally:
			# Only tear down what this constructor opened itself; a cursor
			# handed in by the caller stays open for the caller to manage.
			if is_my_cursor:
				self.db_cursor.close()
				db_con.close()
Ejemplo n.º 4
0
	def setRules(self, db_cursor=None):
		"""Load URL rules, parser ids and build patterns from the database.

		db_cursor -- optional open cursor to run the three queries on. When
		it is None, a connection to the domanager database is opened for the
		duration of the call and closed afterwards, even if a query fails.

		Side effects: passes the url_patterns/codes rows to self.makeRules(),
		fills self.parser_dict (urlpatternID -> parserID) and
		self.url_build_patterns (guid pattern -> {code name: build pattern}).
		"""
		owns_connection = db_cursor is None
		if owns_connection:
			db, cursor = getDBCursor(host="10.35.31.229", user="******",passwd="domanagerA!", db="domanager")
		else:
			cursor = db_cursor

		try:
			query = "SELECT a.idx, priority, urlType, scCode, inputPattern, guidPattern, b.name FROM url_patterns a, codes b where a.scCode = b.idx "
			cursor.execute(query)
			self.makeRules(cursor.fetchall())

			query = "SELECT urlpatternID, parserID FROM url_pattern_parser"
			cursor.execute(query)
			for idx, parser_id in cursor.fetchall():
				self.parser_dict[idx] = parser_id

			query = "SELECT a.guidPattern, a.buildPattern, b.name FROM url_build_patterns a, codes b where a.buildTypeCode = b.idx "
			cursor.execute(query)
			for guid_pattern, build_pattern, name in cursor.fetchall():
				# Rebuild the pattern from its parsed parts so the dictionary
				# key is whatever makeUrl() produces (presumably a normalised
				# form; confirm against getParsedUrl/makeUrl).
				o_netloc, o_key_netloc, o_path, o_params = getParsedUrl(guid_pattern)
				guid_pattern = makeUrl(o_netloc, o_key_netloc, o_path, o_params)

				if guid_pattern not in self.url_build_patterns:
					self.url_build_patterns[guid_pattern] = dict()
				self.url_build_patterns[guid_pattern][name] = build_pattern
		finally:
			# A cursor supplied by the caller is left open for the caller.
			if owns_connection:
				cursor.close()
				db.close()
Ejemplo n.º 5
0


if __name__ == "__main__":
	a = UrlFactory()		

	TEST_URLS = ["http://ilwar.com/asdf/123?name=asdf  ", "http://ilwar.com/asdf/123/page/123?name=asdf  ", "http://www.ilwar.com/asdf/page/1?name=asdf  ", "http://ilwar.com/asdf/123/page/1?name=asdf  ",  "http://ilwar.com/asdf", "http://www.ilbe.com/index.php?mid=ilbe&category=123&document_srl=123123",  "http://www.ilbe.com/asdfasd?mid=ilbe&category=123&document_srl=123123", "http://www.ilbe.com/123123", "http://abc.tistory.com/asdf/123123",  "http://abc.tistory.com/asdf", "http://abc.tistory.com/123", "http://naver.com/asdfa/asdfa/asdfa/board.php?bo_table=cm_lego&wr_id=1231&dasd=asdf", "http://todayhumor.co.kr/board/list.php?table=gomin", "http://todayhumor.co.kr/board/list.php?table=gomin&page=4" ]

	import sys

	if len(sys.argv) > 1:
		url = sys.argv[1]
		print url, a.getGuid(url)
	else:

		db, cursor = getDBCursor(host="10.35.31.229", user="******",passwd="domanagerA!", db="domanager")
		query = "SELECT down_url, guid FROM url_test "
		cursor.execute(query)
		results = cursor.fetchall()
		fail_count = 0
		for url, guid in results:
			res = a.getGuid(url)
			if res:
				(type, new_guid, other_dict) = res
				if guid != new_guid:
					print url, guid, new_guid , "SOMETHING WRONG"
					fail_count += 1
				else:
					print url, guid, "IT's OK!!"
			else:
				print url, "AAAAAAAA" 
Ejemplo n.º 6
0
	def makeRule(self, db_cursor=None):
		"""Build the per-host parser rules from pado.parsers / pado.parser_rules.

		db_cursor -- optional open cursor to query with. When it is missing
		(None or otherwise falsy) a connection to the domanager database is
		opened for this call; its cursor and the connection are closed as
		soon as both result sets are fetched, also when a query fails.

		Side effects: fills self.template_rules, self.rules,
		self.string_filter and self.id_dict. Returns nothing.
		"""
		owns_connection = not db_cursor
		if owns_connection:
			db, cursor = getDBCursor(host="10.35.31.229", user="******",passwd="domanagerA!", db="domanager")
		else:
			cursor = db_cursor

		try:
			query = "SELECT idx, name from codes where code_category = 4"
			cursor.execute(query)
			code_rows = cursor.fetchall()

			query = "SELECT a.idx, domainID, domain, activeYN, b.idx, field, ruleCode, ruleVal, b.parentRuleID FROM pado.parsers a, pado.parser_rules b where a.idx = b.parserID"
			cursor.execute(query)
			results = cursor.fetchall()
		finally:
			# Everything needed is fetched by now; release only what this
			# call opened and leave a caller-supplied cursor untouched.
			if owns_connection:
				cursor.close()
				db.close()

		# rule code -> rule type name
		code_dict = dict()
		for code_idx, name in code_rows:
			code_dict[code_idx] = name

		# http://clien.net/cs2/bbs/board.php?bo_table=park&wr_id=35767059
		# http://mlbpark.donga.com/mbs/articleVC.php?mbsC=bullpen2&mbsIdx=1954295
		# http://marumaru.in/b/free/76251
		# parsing rule type : meta, next_text, class, id, html, class_count

		# Child rules whose parent is not in the current host rule yet:
		# parent idx -> {child idx: NodeRule}.
		children_rules = dict()
		# Offsets that arrived before the rule they apply to: rule idx -> offset.
		offset_dict = dict()
		# Every NodeRule created so far, keyed by its row idx.
		all_rules = dict()

		for parser_id, domain_id, host, activeYN, idx, field, rule_code, value, parent_id in results:

			# In "service" mode only parsers flagged active are loaded.
			if self.mode == "service" and activeYN != "Y":
				continue
			rule_type = code_dict.get(rule_code, "")

			# Rule code 60 rows are recorded in template_rules and take no
			# part in the rule tree.
			if rule_code == 60:
				self.template_rules[host] = int(value), parser_id
				continue

			# Find or create the HostParserRule for this (host, parser) pair.
			if host not in self.rules:
				self.rules[host] = dict() 
			host_rules = self.rules[host]
			if parser_id in host_rules:
				current_rule = host_rules[parser_id]
			else:
				current_rule = HostParserRule()
				host_rules[parser_id] = current_rule

			if parent_id:
				parent_id = int(parent_id)
				if rule_type == "offset":
					# An offset row modifies its parent rule; if the parent
					# has not been seen yet, park the value until it is.
					if parent_id in current_rule.rules:
						current_rule.rules[parent_id].offset = value
					elif parent_id in all_rules:
						all_rules[parent_id].offset = value
					else:
						offset_dict[parent_id] = value
				elif parent_id in current_rule.rules:
					child = NodeRule(field, rule_type, value)
					current_rule.rules[parent_id].children_rules[idx] = child
					if idx in offset_dict:
						child.offset = offset_dict[idx]
					all_rules[idx] = child
				else:
					# Parent not attached yet: keep the child aside until the
					# parent row shows up.
					if parent_id not in children_rules:
						children_rules[parent_id] = dict()
					child = NodeRule(field, rule_type, value)
					children_rules[parent_id][idx] = child
					if idx in offset_dict:
						child.offset = offset_dict[idx]
					all_rules[idx] = child

			if rule_type == "delete":
				self.string_filter[parent_id] = value
			else:
				if not parent_id:
					# Top-level rule: register it directly on the host rule.
					t_rule = NodeRule(field, rule_type, value)
					all_rules[idx] = t_rule
					if idx in offset_dict:
						t_rule.offset = offset_dict[idx]
					current_rule.rules[idx] = t_rule
				if idx in children_rules:
					# Attach the children that were parked for this rule.
					for c_idx, child_rule in children_rules[idx].items():
						if child_rule.type == "offset":
							current_rule.rules[idx].offset = child_rule.value
						else:
							current_rule.rules[idx].children_rules[c_idx] = child_rule
			self.id_dict[parser_id] = current_rule
Ejemplo n.º 7
0
	def getCursor(self):
		"""Open a blogdb connection and store the pair on self.db / self.cursor."""
		blog_db_host = "10.35.50.116"
		handles = getDBCursor(host=blog_db_host, user='******', passwd='blogcrawler', db='blogdb')
		# getDBCursor hands back (connection, cursor), in that order.
		(self.db, self.cursor) = handles