class GuidChecker:
    """Resolve blog post URLs to GUIDs, dispatching on the URL's host.

    Well-known hosts (Naver, Daum, Tistory) are handed to their dedicated
    handlers.  Any other host is looked up with ``getBlogType`` using the
    database cursor, and the answer is remembered per host in
    ``self.tmp_dict`` so each host is looked up at most once per instance.
    """

    def __init__(self):
        """Open the database cursor and create the per-host cache."""
        self.getCursor()
        # netloc -> (blog type, blog id); ("", "") marks a host whose
        # lookup produced no blog type.
        self.tmp_dict = dict()
        self.bf = BlogUrlFactory()

    def getCursor(self):
        """Connect to the blog database and store ``self.db``/``self.cursor``."""
        # NOTE(review): host and credentials are hard-coded in source;
        # consider loading them from configuration instead.
        DBHOST = "10.35.50.116"
        (self.db, self.cursor) = getDBCursor(host=DBHOST, user='******', passwd='blogcrawler', db='blogdb')

    def getGuid(self, url):
        """Return the GUID for ``url``.

        Known hosts return whatever their handler returns.  For any other
        host, the ``BlogUrlFactory`` result is returned, or ``""`` when it
        is falsy.
        """
        parsed_url = urlparse.urlparse(url)
        netloc = parsed_url.netloc

        # The original compared netloc to a list with ``==``, which is
        # always False for a string, so Naver URLs never reached
        # handleNBUrl.  A membership test is what was intended.
        if netloc in ("blog.naver.com", "m.blog.naver.com") or netloc.endswith(".blog.me"):
            return handleNBUrl(url)
        if netloc == "blog.daum.net":
            # Index 2 of the parse result is the URL path.
            return checkDaumPost(parsed_url[2])
        if netloc.endswith(".tistory.com"):
            return handleTistory(url)

        # Custom domain: use the cached platform if present, else look it up.
        if netloc in self.tmp_dict:
            blog_type, blog_id = self.tmp_dict[netloc]
        else:
            blog_type, blog_id = getBlogType(netloc, self.cursor)

        if blog_type:
            self.tmp_dict[netloc] = blog_type, blog_id
            if blog_type == "tistory.com":
                return handleTistory(url, blog_id)
            if blog_type == "blog.naver.com":
                return handleNBUrl(url, blog_id)
        else:
            # Cache the miss too, so the host is not looked up again.
            self.tmp_dict[netloc] = "", ""

        # Fallback for unknown hosts and unrecognised blog types.
        guid = self.bf.getGuid(url)
        if guid:
            return guid
        return ""

    def getChannel(self, url):
        """Return the channel URL for ``url`` as computed by the URL factory."""
        return self.bf.getChannelUrl(url)

    def close(self):
        """Release the cursor, then the connection.

        The cursor is closed first because it belongs to the connection;
        the connection is closed even if closing the cursor raises.
        """
        try:
            self.cursor.close()
        finally:
            self.db.close()
def __init__(self):
    """Create the downloader, parser and URL-factory collaborators.

    Also initialises ``attr_dict`` with its two marker strings and an
    empty ``domain_match`` mapping.
    """
    self.downLoader = Downloader()
    self.attr_dict = {
        "tid": """__addParam("author","****");""",
        # http://cfs.tistory.com/custom/named/bw/bwell/rss.xml
        "r_url": """ """,
    }
    self.domain_match = {}
    self.http_parser = HttpParser()
    self.bf = BlogUrlFactory()
class BlogChecker:
    """Detect which blog platform serves a custom domain.

    ``checkDomain`` downloads the domain's page, inspects the links the
    HTML parser collected, and reports a ``(platform, blog id)`` pair for
    Tistory or Naver blogs.
    """

    # Maximum number of chained REFRESH hops checkDomain will follow.
    # Without a bound, pages that refresh to themselves (or to each other)
    # recurse until the interpreter's recursion limit is hit.
    MAX_REFRESH_DEPTH = 5

    def __init__(self):
        """Create the downloader, parser and URL-factory collaborators."""
        self.downLoader = Downloader()
        self.attr_dict = dict()
        self.attr_dict["tid"] = """__addParam("author","****");"""
        self.attr_dict["r_url"] = """ """
        # http://cfs.tistory.com/custom/named/bw/bwell/rss.xml
        self.domain_match = dict()
        self.http_parser = HttpParser()
        self.bf = BlogUrlFactory()

    def checkDomain(self, domain, _depth=0):
        """Return ``(platform, blog_id)`` for the blog behind ``domain``.

        Args:
            domain: URL to download and inspect.
            _depth: internal counter of REFRESH hops already followed;
                callers should leave it at its default.

        Returns:
            ``("tistory.com", id)`` or ``("blog.naver.com", id)`` when a
            platform is recognised; ``(None, None)`` when the download
            fails or the REFRESH hop limit is reached.
            NOTE(review): when no link matches, the function falls off the
            end and returns a bare ``None`` (not a pair), exactly as the
            original did -- confirm what callers expect before changing it.
        """
        url, header, html = self.downLoader.open(domain)
        # Single-argument print: same output as the Python 2 statement
        # ``print "download ", domain`` and also valid on Python 3.
        print(" ".join(("download ", domain)))
        if url == "ERROR" or header is None:
            return None, None

        # Return value is not used; presumably this call fills
        # html_parser.links -- confirm against HttpParser.
        self.http_parser.plugParser(header, html, url)
        links = self.http_parser.html_parser.links

        # Marker string used here to recognise a Tistory-hosted page.
        isTistory = html.find("TOP_SSL_URL = 'https://www.tistory.com';") >= 0

        for tt_url in links:
            try:
                link = links[tt_url]

                if isTistory and tt_url.find(domain) >= 0:
                    # A link whose path is purely numeric is treated as a
                    # post link; int() is called only to validate that.
                    try:
                        path = urlparse.urlparse(tt_url).path[1:]
                        int(path)
                        tistory_id = getTistoryId(tt_url)
                        if tistory_id:
                            return "tistory.com", tistory_id
                        isTistory = False
                    except Exception:
                        # Deliberate best effort: a non-numeric path or a
                        # failed id lookup just means "try the next check".
                        pass

                # NOTE(review): "> 0" ignores an "R" at index 0 of
                # link.inout; kept as in the original -- confirm intent.
                if (link.inout.find("R") > 0
                        and tt_url.find("/response") < 0
                        and tt_url.find("atom") < 0
                        and tt_url.find("comments") < 0):
                    _, response = self.downLoader.getResponse(tt_url)
                    t_url = response.url
                    if (t_url != link.url
                            and t_url.find("tistory.com") >= 0
                            and t_url.endswith("/rss.xml")):
                        # The id is the path segment right before /rss.xml.
                        end_cur = t_url.rfind("/")
                        tistory_id = t_url[t_url[:end_cur].rfind("/") + 1:end_cur]
                        return "tistory.com", tistory_id
                    if (t_url.startswith("http://blog.rss.naver.com/")
                            and t_url.endswith(".xml")
                            and len(links) < 5):
                        return "blog.naver.com", t_url.replace("http://blog.rss.naver.com/", "").replace(".xml", "")

                if link.tag == "REFRESH":
                    netloc = urlparse.urlparse(tt_url).netloc
                    if tt_url.find(".tistory.com/") >= 0:
                        # The first host label is the Tistory blog id.
                        return "tistory.com", netloc[:netloc.find(".")]
                    if _depth >= self.MAX_REFRESH_DEPTH:
                        # Refresh loop or overly long chain: give up with
                        # the same pair used for a failed download.
                        return None, None
                    return self.checkDomain("http://%s/" % netloc, _depth + 1)

                if len(links) < 3 and link.url.find("blog.naver.com") >= 0:
                    # http://baljak.com/
                    try:
                        ret_dict = self.bf.getAllDataDic(link.url)
                        print(link.url)
                        if ret_dict and ret_dict["gen"] == "blog.naver.com":
                            naver_id = ret_dict["cid"]
                            if naver_id != "PostList":
                                return "blog.naver.com", naver_id
                    except Exception as msg:
                        print(msg)
            except Exception as msg:
                getLogger().error(msg)
def __init__(self):
    """Obtain the database cursor, then set up the cache and URL factory."""
    self.getCursor()
    # Starts empty; filled by other methods of the owning class.
    self.tmp_dict = {}
    self.bf = BlogUrlFactory()