Example no. 1
0
def getRssInfo(rss):
	downLoader = Downloader()
	try:
		(t_url, header, html) = downLoader.open(rss) 
		print "download ", rss
	except Exception, msg:
		getLogger().error("feed download error : %s %s", msg, rss)
		return None
Example no. 2
0
def getTistoryId(url):
	downLoader = Downloader()
	attr_dict = dict()
	attr_dict["tid"] = "livere_blogurl = '****.tistory.com';"
	attr_dict["tid2"] = """__addParam("author","****");"""
	try:
		(t_url, header, html) = downLoader.open(url) 
		print "download", url
	except Exception, msg:
		getLogger().error("feed download error : %s %s", msg, rss)
		return None
Example no. 3
0
class BlogChecker:
	"""Best-effort classifier mapping a blog domain to its hosting service.

	checkDomain() downloads the page at a domain and inspects its HTML and
	parsed links to decide whether it is hosted on tistory.com or
	blog.naver.com, returning the service name together with the blog id.
	"""

	def __init__(self):
		# HTTP fetcher shared by all checks.
		self.downLoader = Downloader()
		self.attr_dict = dict()
		# HTML markers searched for in downloaded pages; the "****" parts
		# look anonymized in this snippet -- TODO confirm real patterns.
		self.attr_dict["tid"] = """__addParam("author","****");"""
		self.attr_dict["r_url"] = """    """
		# http://cfs.tistory.com/custom/named/bw/bwell/rss.xml
		self.domain_match = dict()
		self.http_parser = HttpParser()
		self.bf = BlogUrlFactory()

	def checkDomain(self, domain):
		"""Identify the blog service behind *domain*.

		Returns ("tistory.com", blog_id) or ("blog.naver.com", blog_id)
		when a rule matches, (None, None) when the download fails, or an
		implicit None when no rule matches.
		"""
		#header, url, html = getDownloadData(domain, self.opener)
		url, header, html = self.downLoader.open(domain)
		print "download ", domain
		# The downloader signals failure with the sentinel URL "ERROR".
		if url == "ERROR" or header == None:
			return None, None

		parsing_result = self.http_parser.plugParser(header, html, url)
		links = self.http_parser.html_parser.links
		isTistory = False
		naver_id = ""

		# Marker string seen in pages served by Tistory -- presumably part
		# of its page template; verify against live pages.
		if html.find("TOP_SSL_URL = 'https://www.tistory.com';") >= 0:
			isTistory = True
			

		feed_urls = set()

		for tt_url in links:
			try:
				link = links[tt_url]
				# On Tistory pages, a same-domain link whose path is a bare
				# post number lets us resolve the author id directly.
				if isTistory and tt_url.find(domain) >= 0:
					try:
						path = urlparse.urlparse(tt_url).path[1:]
						# int() only validates that the path is a numeric
						# post id; the value itself is never used.
						postno = int(path)
						tistory_id = getTistoryId(tt_url)
						if tistory_id:
							return "tistory.com", tistory_id
						isTistory = False
					except Exception, msg:
						pass

				# Candidate feed links, excluding response/atom/comment
				# feeds. NOTE(review): find("R") > 0 skips an "R" at index
				# 0 -- confirm whether that is intended (>= 0 would match).
				if link.inout.find("R") > 0 and tt_url.find("/response") < 0 and tt_url.find("atom") < 0 and tt_url.find("comments") < 0:
					feed_urls.add(tt_url)
					status, response = self.downLoader.getResponse(tt_url)

					t_url = response.url
					# Redirected http://HOST/<blog_id>/rss.xml: the blog id
					# is the second-to-last path segment.
					if t_url != link.url and t_url.find("tistory.com") >= 0 and t_url.endswith("/rss.xml") :
						end_cur = t_url.rfind("/")
						tistory_id = t_url[t_url[:end_cur].rfind("/")+1:end_cur]
						return "tistory.com", tistory_id

					# Naver feed URLs embed the blog id as the .xml filename.
					if t_url.startswith("http://blog.rss.naver.com/") and  t_url.endswith(".xml") and len(links) < 5:
						return "blog.naver.com", t_url.replace("http://blog.rss.naver.com/","").replace(".xml", "")

				# Meta-refresh redirect: either read the Tistory subdomain
				# directly or re-run the whole check on the redirect target.
				if link.tag == "REFRESH":
					netloc = urlparse.urlparse(tt_url).netloc
					if tt_url.find(".tistory.com/") >= 0:
						tistory_id = netloc[:netloc.find(".")]
						return "tistory.com", tistory_id
					else:
						return self.checkDomain("http://%s/"%netloc)
					
				# Sparse pages (fewer than 3 links): fall back to resolving
				# the id from any blog.naver.com link via BlogUrlFactory.
				if len(links) < 3:
					if link.url.find("blog.naver.com") >= 0:     # http://baljak.com/
						try:
							ret_dict = self.bf.getAllDataDic(link.url)
							print link.url
							if ret_dict and ret_dict["gen"] == "blog.naver.com":
								naver_id = ret_dict["cid"]
								if naver_id not in ["PostList"]:
									return "blog.naver.com", naver_id
						except Exception, msg:
							print msg

			except Exception, msg:
				getLogger().error(msg)
Example no. 4
0

	# Ad-hoc smoke test: download a single URL and print its response
	# metadata. The enclosing block header is outside this view.
	uf = UrlFactory()

	# Take the URL from the command line when given; otherwise fall back
	# to a hard-coded sample. The first two assignments below are
	# immediately overwritten -- kept only as alternative samples.
	if len(sys.argv) >1:
		url = sys.argv[1]
	else:
		url = "http://mlbpark.donga.com/mbs/articleV.php?mbsC=bullpen2&mbsIdx=2203518"
		url = "http://mlbpark.donga.com/mbs/articleV.php?mbsC=bullpen2&mbsIdx=2203532"
		url = "http://mlbpark.donga.com/mbs/articleV.php?mbsC=bullpen2&mbsIdx=2203496"

	test_urls = ["http://mlbpark.donga.com/mbs/articleV.php?mbsC=bullpen2&mbsIdx=1954295"] 

	# NOTE(review): `uf` and `test_urls` are unused below -- presumably
	# leftovers from earlier experiments.
	opener = Downloader()
	real_URL, http_header, http_content = opener.open(url)


	print real_URL, http_header
	# Dead code kept for reference: the raw urllib2 equivalent of
	# Downloader.open().
	"""
	opener = urllib2.build_opener()
	req = urllib2.Request(url)
	req.add_header("User-agent", USER_AGENT)
	#req.add_header("User-agent", "wget")
	rs = opener.open(req)

	http_header = str(rs.info())
	http_content = rs.read()
	real_URL = rs.url
	"""