class CustomParser:
    def __init__(self, type="service"):
        self.prm = None
        self.parser = HttpParser()
        self.rss_parser = rssParser()
        self.df = DateFactory()
        self.field_transformer = FieldTransformer()
        self.type = type

    def setRules(self, cursor=None):
        self.prm = ParsingRuleManager(self.type)
        self.prm.makeRule(cursor)

    def getLinks(self, header, html, url):
        # Feed-style URLs go through the RSS parser; everything else through the HTML parser.
        if url.find("/feed/") > 0 or url.find("feeds.feedburner.com") > 0 \
                or url.endswith(".xml") or url.find("=rss") > 0 or url.find("/rss") > 0:
            ret_dict = self.rss_parser.parse(url, html)
        else:
            ret_dict = self.parser.plugParser(header, html, url)
        if "links" in ret_dict:
            return ret_dict["links"]
        return dict()

    def parse(self, header, html, url, parser_id=None):
        if self.prm is None:
            self.setRules()
        ret_dict = self.parser.plugParser(header, html, url)
        result_dict = dict()
        # An explicit parser_id takes precedence over any domain-based lookup.
        if parser_id is not None and parser_id in self.prm.id_dict:
            try:
                host_rule = self.prm.id_dict[parser_id]
                result_dict = self.getDataByRule(host_rule, ret_dict, url)
                result_dict["parser_id"] = parser_id
                return result_dict
            except Exception as msg:
                getLogger().error(msg)
        # Otherwise fall back to a template rule registered for the page's domain.
        domain = ret_dict["domain"]
        if domain in self.prm.template_rules:
            try:
                t_parser_id, org_parser_id = self.prm.template_rules[domain]
                host_rule = self.prm.id_dict[t_parser_id]
                result_dict = self.getDataByRule(host_rule, ret_dict, url)
                result_dict["parser_id"] = org_parser_id
                return result_dict
            except Exception as msg:
                getLogger().error(msg)
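# Usage sketch (not part of the original module): driving CustomParser end to
# end needs a downloaded page, so this assumes Downloader (used by BlogChecker
# below) is available here; the example URL is hypothetical.
def _demo_custom_parser():
    parser = CustomParser(type="service")
    parser.setRules()  # build rules without a DB cursor
    url, header, html = Downloader().open("http://example.com/feed/")
    # "/feed/" in the URL routes getLinks() through rssParser rather than HttpParser
    links = parser.getLinks(header, html, url)
    # parse() dispatches on an explicit parser_id first, then on template_rules by domain
    result = parser.parse(header, html, url)
    return links, result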
def readClient(self):
    # Qt socket slot: parse the incoming HTTP request and answer with a
    # static file, an ajax dispatch, or an error page.
    s = self.sender()
    headers = str(s.readAll())
    p = HttpParser()
    plen = p.execute(headers, len(headers))
    if p.get_method() == "GET":
        path = p.get_path()
        try:
            if path.startswith("/ajax"):
                # /ajax/<method>/<arg> dispatches to a handler method on self.
                code = 200
                ext = 'json'
                content = ""
                method, arg = path.split('/')[2:]
                getattr(self, method)(int(arg))
            else:
                # Static file serving rooted at self._root.
                try:
                    _path = os.path.join(self._root, path[1:])
                    if not os.path.exists(_path):
                        raise Error404
                    elif os.path.isdir(_path):
                        _path = os.path.join(_path, 'index.html')
                        if not os.path.exists(_path):
                            raise Error404
                    ext = os.path.splitext(_path)[1][1:].lower()
                    code = 200
                    with open(_path, 'rb') as f:
                        content = f.read()
                except Error404 as e:
                    code = 404
                    ext = 'html'
                    content = '<h1>404 - File Not Found ({0})</h1>'.format(path)
        except Exception as e:
            code = 500
            ext = 'html'
            content = '<h1>500 - Internal Error</h1>'
            print e
        _resp = {
            'code': code,
            'status': status_reasons[code],
            'content-type': ext2ct[ext],
            'content': content,
        }
        response = RESPONSE.format(**_resp)
    elif p.get_method() == 'POST':
        print "POST", headers
        response = ''
    else:
        response = ''
    s.writeData(response)
    s.waitForBytesWritten()
    s.close()
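# Sketch of the module-level tables readClient() indexes into (an assumption
# reconstructed from how they are used above; the real values and template
# may differ). RESPONSE is filled via str.format(**_resp), whose CPython
# implementation accepts the hyphenated 'content-type' key.
status_reasons = {200: 'OK', 404: 'Not Found', 500: 'Internal Server Error'}
ext2ct = {
    'html': 'text/html',
    'json': 'application/json',
    'css': 'text/css',
    'js': 'application/javascript',
}
RESPONSE = ('HTTP/1.1 {code} {status}\r\n'
            'Content-Type: {content-type}\r\n'
            'Connection: close\r\n'
            '\r\n'
            '{content}')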
class BlogChecker:
    def __init__(self):
        self.downLoader = Downloader()
        self.attr_dict = dict()
        self.attr_dict["tid"] = """__addParam("author","****");"""
        self.attr_dict["r_url"] = """ """  # http://cfs.tistory.com/custom/named/bw/bwell/rss.xml
        self.domain_match = dict()
        self.http_parser = HttpParser()
        self.bf = BlogUrlFactory()

    def checkDomain(self, domain):
        #header, url, html = getDownloadData(domain, self.opener)
        url, header, html = self.downLoader.open(domain)
        print "download ", domain
        if url == "ERROR" or header is None:
            return None, None
        parsing_result = self.http_parser.plugParser(header, html, url)
        links = self.http_parser.html_parser.links
        isTistory = False
        naver_id = ""
        if html.find("TOP_SSL_URL = 'https://www.tistory.com';") >= 0:
            isTistory = True
        feed_urls = set()
        for tt_url in links:
            try:
                link = links[tt_url]
                # Tistory pages expose numeric post paths; derive the blog id from one.
                if isTistory and tt_url.find(domain) >= 0:
                    try:
                        path = urlparse.urlparse(tt_url).path[1:]
                        postno = int(path)
                        tistory_id = getTistoryId(tt_url)
                        if tistory_id:
                            return "tistory.com", tistory_id
                        isTistory = False
                    except Exception as msg:
                        pass
                # Candidate RSS links: follow redirects to spot Tistory or Naver feeds.
                if link.inout.find("R") > 0 and tt_url.find("/response") < 0 \
                        and tt_url.find("atom") < 0 and tt_url.find("comments") < 0:
                    feed_urls.add(tt_url)
                    status, response = self.downLoader.getResponse(tt_url)
                    t_url = response.url
                    if t_url != link.url and t_url.find("tistory.com") >= 0 and t_url.endswith("/rss.xml"):
                        # Blog id is the path segment just before "/rss.xml".
                        end_cur = t_url.rfind("/")
                        tistory_id = t_url[t_url[:end_cur].rfind("/") + 1:end_cur]
                        return "tistory.com", tistory_id
                    if t_url.startswith("http://blog.rss.naver.com/") and t_url.endswith(".xml") and len(links) < 5:
                        return "blog.naver.com", t_url.replace("http://blog.rss.naver.com/", "").replace(".xml", "")
                # Meta-refresh redirects: re-check the target host.
                if link.tag == "REFRESH":
                    netloc = urlparse.urlparse(tt_url).netloc
                    if tt_url.find(".tistory.com/") >= 0:
                        tistory_id = netloc[:netloc.find(".")]
                        return "tistory.com", tistory_id
                    else:
                        return self.checkDomain("http://%s/" % netloc)
                # Sparse pages that link straight to a Naver blog.
                if len(links) < 3:
                    if link.url.find("blog.naver.com") >= 0:  # http://baljak.com/
                        try:
                            ret_dict = self.bf.getAllDataDic(link.url)
                            print link.url
                            if ret_dict and ret_dict["gen"] == "blog.naver.com":
                                naver_id = ret_dict["cid"]
                                if naver_id not in ["PostList"]:
                                    return "blog.naver.com", naver_id
                        except Exception as msg:
                            print msg
            except Exception as msg:
                getLogger().error(msg)
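# Usage sketch (an assumption for illustration; the domain is hypothetical).
# checkDomain() yields a (generator, blog_id) pair such as ("tistory.com",
# "bwell"), or (None, None) when the download fails; it may also return None
# implicitly if no link matches, so guard before unpacking.
def _demo_blog_checker():
    checker = BlogChecker()
    result = checker.checkDomain("http://bwell.tistory.com/")
    if result and result[0]:
        gen, blog_id = result
        print "matched:", gen, blog_id
    return result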