def getRssInfo(rss): downLoader = Downloader() try: (t_url, header, html) = downLoader.open(rss) print "download ", rss except Exception, msg: getLogger().error("feed download error : %s %s", msg, rss) return None
def getTistoryId(url): downLoader = Downloader() attr_dict = dict() attr_dict["tid"] = "livere_blogurl = '****.tistory.com';" attr_dict["tid2"] = """__addParam("author","****");""" try: (t_url, header, html) = downLoader.open(url) print "download", url except Exception, msg: getLogger().error("feed download error : %s %s", msg, rss) return None
class BlogChecker:
    # Heuristically classifies a blog domain by downloading its front
    # page and inspecting markup and links: returns a
    # (generator_domain, blog_id) pair such as ("tistory.com", id) or
    # ("blog.naver.com", id).

    def __init__(self):
        self.downLoader = Downloader()
        # Marker strings for recognizing a Tistory page ("****" stands
        # in for the blog id).
        self.attr_dict = dict()
        self.attr_dict["tid"] = """__addParam("author","****");"""
        self.attr_dict["r_url"] = """ """ # http://cfs.tistory.com/custom/named/bw/bwell/rss.xml
        self.domain_match = dict()
        self.http_parser = HttpParser()
        self.bf = BlogUrlFactory()

    def checkDomain(self, domain):
        """Download *domain* and try to identify its blog platform.

        Returns (generator_domain, blog_id) when a platform is
        recognized, (None, None) when the download fails, and None
        (implicit) when no heuristic matches.
        """
        #header, url, html = getDownloadData(domain, self.opener)
        url, header, html = self.downLoader.open(domain)
        print "download ", domain
        # Downloader signals failure via the sentinel "ERROR" url or a
        # missing header.
        if url == "ERROR" or header == None:
            return None, None
        # NOTE(review): parsing_result is unused; plugParser is called
        # for its side effect of populating html_parser.links.
        parsing_result = self.http_parser.plugParser(header, html, url)
        links = self.http_parser.html_parser.links
        isTistory = False
        naver_id = ""
        # Tistory pages embed this exact script constant.
        if html.find("TOP_SSL_URL = 'https://www.tistory.com';") >= 0:
            isTistory = True
        # Candidate feed urls; collected but never returned (NOTE: dead
        # except for the add() below -- TODO confirm).
        feed_urls = set()
        for tt_url in links:
            try:
                link = links[tt_url]
                # Same-domain link on a Tistory page: if its path is a
                # bare post number, ask getTistoryId for the blog id.
                if isTistory and tt_url.find(domain) >= 0:
                    try:
                        path = urlparse.urlparse(tt_url).path[1:]
                        # Raises ValueError unless the path is numeric,
                        # skipping non-post links via the except below.
                        postno = int(path)
                        tistory_id = getTistoryId(tt_url)
                        if tistory_id:
                            return "tistory.com", tistory_id
                        # One failed lookup disables further Tistory
                        # probing for this page.
                        isTistory = False
                    except Exception, msg:
                        pass
                # Feed-like link (inout contains "R"), excluding
                # response/atom/comment feeds: follow redirects and
                # inspect the final url.
                if link.inout.find("R") > 0 and tt_url.find("/response") < 0 and tt_url.find("atom") < 0 and tt_url.find("comments") < 0:
                    feed_urls.add(tt_url)
                    status, response = self.downLoader.getResponse(tt_url)
                    t_url = response.url
                    # Redirected to .../<id>/rss.xml on tistory.com:
                    # the id is the path segment before "rss.xml".
                    if t_url != link.url and t_url.find("tistory.com") >= 0 and t_url.endswith("/rss.xml") :
                        end_cur = t_url.rfind("/")
                        tistory_id = t_url[t_url[:end_cur].rfind("/")+1:end_cur]
                        return "tistory.com", tistory_id
                    # Naver feed url pattern: id is the xml file name.
                    # len(links) < 5 guards against big aggregator pages.
                    if t_url.startswith("http://blog.rss.naver.com/") and t_url.endswith(".xml") and len(links) < 5:
                        return "blog.naver.com", t_url.replace("http://blog.rss.naver.com/","").replace(".xml", "")
                # Meta-refresh redirect: either a direct Tistory
                # subdomain, or recurse into the redirect target.
                if link.tag == "REFRESH":
                    netloc = urlparse.urlparse(tt_url).netloc
                    if tt_url.find(".tistory.com/") >= 0:
                        tistory_id = netloc[:netloc.find(".")]
                        return "tistory.com", tistory_id
                    else:
                        return self.checkDomain("http://%s/"%netloc)
                # Very small pages that link to blog.naver.com: resolve
                # the Naver id from the link itself.
                if len(links) < 3:
                    if link.url.find("blog.naver.com") >= 0: # http://baljak.com/
                        try:
                            ret_dict = self.bf.getAllDataDic(link.url)
                            print link.url
                            if ret_dict and ret_dict["gen"] == "blog.naver.com":
                                naver_id = ret_dict["cid"]
                                # "PostList" is a generic page name, not
                                # a real user id.
                                if naver_id not in ["PostList"]:
                                    return "blog.naver.com", naver_id
                        except Exception, msg:
                            print msg
            except Exception, msg:
                # Per-link failures are logged and the loop continues.
                getLogger().error(msg)
uf = UrlFactory() if len(sys.argv) >1: url = sys.argv[1] else: url = "http://mlbpark.donga.com/mbs/articleV.php?mbsC=bullpen2&mbsIdx=2203518" url = "http://mlbpark.donga.com/mbs/articleV.php?mbsC=bullpen2&mbsIdx=2203532" url = "http://mlbpark.donga.com/mbs/articleV.php?mbsC=bullpen2&mbsIdx=2203496" test_urls = ["http://mlbpark.donga.com/mbs/articleV.php?mbsC=bullpen2&mbsIdx=1954295"] opener = Downloader() real_URL, http_header, http_content = opener.open(url) print real_URL, http_header """ opener = urllib2.build_opener() req = urllib2.Request(url) req.add_header("User-agent", USER_AGENT) #req.add_header("User-agent", "wget") rs = opener.open(req) http_header = str(rs.info()) http_content = rs.read() real_URL = rs.url """