Beispiel #1
0
class GuidChecker:
	"""Resolve a blog post URL to a GUID, dispatching on the URL's host.

	Known hosts (Naver Blog, Daum Blog, Tistory) are handed to their dedicated
	handlers.  For any other host the blog type is looked up with
	getBlogType() and remembered in self.tmp_dict, so each host is looked up
	at most once per instance.  When no handler applies, BlogUrlFactory is
	used as the fallback.
	"""

	# Hosts that are routed straight to handleNBUrl().
	NAVER_HOSTS = ("blog.naver.com", "m.blog.naver.com")

	def __init__(self):
		"""Open the database cursor and create the host cache and URL factory."""
		self.getCursor()
		# netloc -> (blog type, blog id); ("", "") marks a host with no known type.
		self.tmp_dict = dict()
		self.bf = BlogUrlFactory()

	def getCursor(self):
		"""Connect to the blog database and store the handles on self.db / self.cursor."""
		# NOTE(review): host and credentials are hard-coded here; consider
		# moving them to configuration.
		DBHOST = "10.35.50.116"
		(self.db, self.cursor) = getDBCursor(host=DBHOST, user='******', passwd='blogcrawler', db='blogdb')

	@staticmethod
	def _isNaverHost(netloc):
		"""Return True if netloc is one of NAVER_HOSTS or ends with ".blog.me"."""
		# The original compared the string to a list with "==", which is always
		# False; a membership test is what makes this branch reachable.
		return netloc in GuidChecker.NAVER_HOSTS or netloc.endswith(".blog.me")

	def getGuid(self, url):
		"""Return the GUID for url.

		Returns whatever the matching handler returns; when only the
		BlogUrlFactory fallback applies and it yields nothing, returns "".
		"""
		parsed_url = urlparse.urlparse(url)
		netloc = parsed_url.netloc

		if self._isNaverHost(netloc):
			return handleNBUrl(url)
		if netloc == "blog.daum.net":
			return checkDaumPost(parsed_url.path)
		if netloc.endswith(".tistory.com"):
			return handleTistory(url)

		# Unknown host: use the cached blog type, or look it up once.
		if netloc in self.tmp_dict:
			blog_type, blog_id = self.tmp_dict[netloc]
		else:
			blog_type, blog_id = getBlogType(netloc, self.cursor)

		if blog_type:
			self.tmp_dict[netloc] = blog_type, blog_id
			if blog_type == "tistory.com":
				return handleTistory(url, blog_id)
			if blog_type == "blog.naver.com":
				return handleNBUrl(url, blog_id)
		else:
			# Remember the miss so the host is not looked up again.
			self.tmp_dict[netloc] = "", ""

		# No dedicated handler applied; fall back to the generic factory.
		return self.bf.getGuid(url) or ""

	def getChannel(self, url):
		"""Return the channel URL that BlogUrlFactory derives from url."""
		return self.bf.getChannelUrl(url)

	def close(self):
		"""Close the cursor, then the database connection.

		The cursor is closed first because a cursor must not be used once its
		connection is closed; the connection is closed even if closing the
		cursor raises.
		"""
		try:
			self.cursor.close()
		finally:
			self.db.close()
Beispiel #2
0
	def __init__(self):
		"""Create the downloader, the HTTP parser, the URL factory and the lookup dicts."""
		self.downLoader = Downloader()
		# Marker strings keyed by attribute name.
		self.attr_dict = {
			"tid": """__addParam("author","****");""",
			"r_url": """    """,
		}
		# Example feed URL: http://cfs.tistory.com/custom/named/bw/bwell/rss.xml
		self.domain_match = {}
		self.http_parser = HttpParser()
		self.bf = BlogUrlFactory()
Beispiel #3
0
class BlogChecker:
	"""Download a domain's page and decide whether it is a Tistory or a Naver blog.

	checkDomain() is the entry point; it reports the result as a
	(blog type, blog id) pair.
	"""
	
	def __init__(self):
		"""Create the downloader, the HTTP parser, the URL factory and the lookup dicts."""
		self.downLoader = Downloader()
		# attr_dict and domain_match are not read anywhere in the code visible here.
		self.attr_dict = dict()
		self.attr_dict["tid"] = """__addParam("author","****");"""
		self.attr_dict["r_url"] = """    """
		# http://cfs.tistory.com/custom/named/bw/bwell/rss.xml
		self.domain_match = dict()
		self.http_parser = HttpParser()
		self.bf = BlogUrlFactory()

	def checkDomain(self, domain):
		"""Download domain and inspect its links to identify the blog platform.

		Returns ("tistory.com", tistory_id) or ("blog.naver.com", naver_id) as
		soon as one link identifies the platform, and (None, None) when the
		download fails.  In the code visible here, a page on which no link
		matches falls off the end of the loop and returns None implicitly.
		NOTE(review): feed_urls is filled but never read and there is no final
		return, so this listing may be truncated - confirm against the full file.
		"""

		#header, url, html = getDownloadData(domain, self.opener)
		url, header, html = self.downLoader.open(domain)
		print "download ", domain
		if url == "ERROR" or header == None:
			return None, None

		# parsing_result is not used below; presumably the call fills
		# self.http_parser.html_parser.links - TODO confirm.
		parsing_result = self.http_parser.plugParser(header, html, url)
		links = self.http_parser.html_parser.links
		isTistory = False
		naver_id = ""

		# Pages containing this script constant are treated as Tistory-hosted.
		if html.find("TOP_SSL_URL = 'https://www.tistory.com';") >= 0:
			isTistory = True
			

		feed_urls = set()

		# links maps a link URL to a link object exposing .inout, .tag and .url.
		for tt_url in links:
			try:
				link = links[tt_url]
				# Check 1: on a Tistory page, a same-domain link whose path is a
				# bare number is passed to getTistoryId() to obtain the blog id.
				if isTistory and tt_url.find(domain) >= 0:
					try:
						path = urlparse.urlparse(tt_url).path[1:]
						# int() raises for a non-numeric path, which the except
						# below swallows so the link is skipped; postno itself
						# is not used.
						postno = int(path)
						tistory_id = getTistoryId(tt_url)
						if tistory_id:
							return "tistory.com", tistory_id
						# No id could be extracted: stop trying this check on
						# the remaining links.
						isTistory = False
					except Exception, msg:
						pass

				# Check 2: follow feed-like links and classify by where the
				# request ends up.
				# NOTE(review): "> 0" only matches an "R" at index 1 or later in
				# link.inout; confirm whether ">= 0" was intended.
				if link.inout.find("R") > 0 and tt_url.find("/response") < 0 and tt_url.find("atom") < 0 and tt_url.find("comments") < 0:
					feed_urls.add(tt_url)
					status, response = self.downLoader.getResponse(tt_url)

					t_url = response.url
					# The response URL differs from the link and is a Tistory
					# ".../<id>/rss.xml": the id is the last path segment before
					# "/rss.xml".
					if t_url != link.url and t_url.find("tistory.com") >= 0 and t_url.endswith("/rss.xml") :
						end_cur = t_url.rfind("/")
						tistory_id = t_url[t_url[:end_cur].rfind("/")+1:end_cur]
						return "tistory.com", tistory_id

					# Naver RSS URL on a page with fewer than 5 links: the id is
					# what remains after removing the prefix and ".xml".
					if t_url.startswith("http://blog.rss.naver.com/") and  t_url.endswith(".xml") and len(links) < 5:
						return "blog.naver.com", t_url.replace("http://blog.rss.naver.com/","").replace(".xml", "")

				# Check 3: links tagged "REFRESH" (presumably a meta-refresh
				# redirect - TODO confirm).  A *.tistory.com target yields its
				# first host label as the id; any other target is checked
				# recursively from its host root.
				if link.tag == "REFRESH":
					netloc = urlparse.urlparse(tt_url).netloc
					if tt_url.find(".tistory.com/") >= 0:
						tistory_id = netloc[:netloc.find(".")]
						return "tistory.com", tistory_id
					else:
						return self.checkDomain("http://%s/"%netloc)
					
				# Check 4: a page with fewer than 3 links that points at
				# blog.naver.com is resolved through BlogUrlFactory.
				if len(links) < 3:
					if link.url.find("blog.naver.com") >= 0:     # http://baljak.com/
						try:
							ret_dict = self.bf.getAllDataDic(link.url)
							print link.url
							if ret_dict and ret_dict["gen"] == "blog.naver.com":
								naver_id = ret_dict["cid"]
								# "PostList" is rejected as a blog id.
								if naver_id not in ["PostList"]:
									return "blog.naver.com", naver_id
						except Exception, msg:
							print msg

			except Exception, msg:
				# Any failure while examining one link is logged and the loop
				# moves on to the next link.
				getLogger().error(msg)
Beispiel #4
0
	def __init__(self):
		"""Open the database cursor, then create an empty lookup dict and the URL factory."""
		self.getCursor()
		self.tmp_dict = {}
		self.bf = BlogUrlFactory()