Example #1
class CustomParser:
	def __init__(self, type="service"):
		self.prm = None
		self.parser = HttpParser()
		self.rss_parser = rssParser()
		self.df = DateFactory()
		self.field_transformer = FieldTransformer()
		self.type = type

	def setRules(self, cursor=None):
		self.prm = ParsingRuleManager(self.type)
		self.prm.makeRule(cursor)

	def getLinks(self, header, html, url):
		# Feed-looking URLs go through the RSS parser; everything else
		# through the generic HTML parser.
		if ("/feed/" in url or "feeds.feedburner.com" in url
				or url.endswith(".xml") or "=rss" in url or "/rss" in url):
			ret_dict = self.rss_parser.parse(url, html)
		else:
			ret_dict = self.parser.plugParser(header, html, url)
		if "links" in ret_dict:
			return ret_dict["links"]
		return dict()

	def parse(self, header, html, url, parser_id=None):
		if self.prm is None:
			self.setRules()

		ret_dict = self.parser.plugParser(header, html, url)

		result_dict = dict()
		# First try the rule registered for the explicit parser id.
		if parser_id is not None and parser_id in self.prm.id_dict:
			try:
				host_rule = self.prm.id_dict[parser_id]
				result_dict = self.getDataByRule(host_rule, ret_dict, url)
				result_dict["parser_id"] = parser_id
				return result_dict
			except Exception as msg:
				getLogger().error(msg)

		# Fall back to a template rule keyed by the page's domain.
		domain = ret_dict["domain"]
		if domain in self.prm.template_rules:
			try:
				t_parser_id, org_parser_id = self.prm.template_rules[domain]
				host_rule = self.prm.id_dict[t_parser_id]
				result_dict = self.getDataByRule(host_rule, ret_dict, url)
				result_dict["parser_id"] = org_parser_id
				return result_dict
			except Exception as msg:
				getLogger().error(msg)
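
Example #1 is an excerpt, so parse is cut off before its final fallback. Purely as a hedged sketch, a caller might drive the class as below; the import path and the header/html inputs are assumptions for illustration, since the surrounding module is not shown.

# Hypothetical usage sketch -- the import path and inputs below are
# assumptions, not part of the original source.
from custom_parser import CustomParser  # hypothetical module path

header = {"Content-Type": "text/html"}    # placeholder HTTP response header
html = "<html><body>...</body></html>"    # placeholder page body

cp = CustomParser(type="service")
cp.setRules()  # builds the ParsingRuleManager rules (cursor defaults to None)
links = cp.getLinks(header, html, "http://example.com/feed/")  # routed to rssParser
result = cp.parse(header, html, "http://example.com/2024/01/post")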
Example #2
    def readClient(self):
        # Read the raw request bytes from the socket that signalled us.
        s = self.sender()
        headers = str(s.readAll())

        p = HttpParser()
        p.execute(headers, len(headers))

        if p.get_method() == "GET":
            path = p.get_path()

            try:
                if path.startswith("/ajax"):
                    # /ajax/<method>/<arg> dispatches to a handler on self.
                    code = 200
                    ext = 'json'
                    content = ""

                    method, arg = path.split('/')[2:]
                    getattr(self, method)(int(arg))

                else:
                    # Static file serving rooted at self._root.
                    try:
                        _path = os.path.join(self._root, path[1:])
                        if not os.path.exists(_path):
                            raise Error404
                        elif os.path.isdir(_path):
                            _path = os.path.join(_path, 'index.html')
                            if not os.path.exists(_path):
                                raise Error404

                        ext = os.path.splitext(_path)[1][1:].lower()
                        code = 200
                        with open(_path, 'rb') as f:
                            content = f.read()

                    except Error404:
                        code = 404
                        ext = 'html'
                        content = '<h1>404 - File Not Found ({0})</h1>'.format(path)

            except Exception as e:
                code = 500
                ext = 'html'
                content = '<h1>500 - Internal Error</h1>'

                print e

            _resp = {
                'code'         : code,
                'status'       : status_reasons[code],
                'content-type' : ext2ct[ext],
                'content'      : content
            }
            response = RESPONSE.format(**_resp)

        elif p.get_method() == 'POST':
            print "POST", headers
            response = ''

        else:
            response = ''

        s.writeData(response)
        s.waitForBytesWritten()
        s.close()
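
The HttpParser used above exposes execute(), get_method() and get_path(), which matches the interface of the http-parser package on PyPI. Assuming that is the class in play (an assumption, since the import is not shown), a minimal standalone parse looks like this sketch; the request line is just sample data.

# Minimal sketch assuming the http-parser package from PyPI
# (pip install http-parser); the request below is made-up sample data.
from http_parser.pyparser import HttpParser

raw = "GET /ajax/play/1 HTTP/1.1\r\nHost: localhost\r\n\r\n"
p = HttpParser()
nparsed = p.execute(raw, len(raw))

print p.get_method()        # GET
print p.get_path()          # /ajax/play/1
print nparsed == len(raw)   # True: the whole request was consumed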
Example #3
class BlogChecker:
	
	def __init__(self):
		self.downLoader = Downloader()
		self.attr_dict = dict()
		self.attr_dict["tid"] = """__addParam("author","****");"""
		self.attr_dict["r_url"] = """    """
		# http://cfs.tistory.com/custom/named/bw/bwell/rss.xml
		self.domain_match = dict()
		self.http_parser = HttpParser()
		self.bf = BlogUrlFactory()

	def checkDomain(self, domain):

		url, header, html = self.downLoader.open(domain)
		print "download ", domain
		if url == "ERROR" or header is None:
			return None, None

		parsing_result = self.http_parser.plugParser(header, html, url)
		links = self.http_parser.html_parser.links
		isTistory = False
		naver_id = ""

		# Tistory pages embed this constant in their boilerplate script.
		if html.find("TOP_SSL_URL = 'https://www.tistory.com';") >= 0:
			isTistory = True

		feed_urls = set()

		for tt_url in links:
			try:
				link = links[tt_url]
				if isTistory and domain in tt_url:
					try:
						# A purely numeric path means a Tistory post URL.
						path = urlparse.urlparse(tt_url).path[1:]
						postno = int(path)
						tistory_id = getTistoryId(tt_url)
						if tistory_id:
							return "tistory.com", tistory_id
						isTistory = False
					except Exception:
						pass

				if link.inout.find("R") > 0 and tt_url.find("/response") < 0 and tt_url.find("atom") < 0 and tt_url.find("comments") < 0:
					feed_urls.add(tt_url)
					status, response = self.downLoader.getResponse(tt_url)

					t_url = response.url
					if t_url != link.url and t_url.find("tistory.com") >= 0 and t_url.endswith("/rss.xml"):
						# The blog id is the path segment just before "/rss.xml".
						end_cur = t_url.rfind("/")
						tistory_id = t_url[t_url[:end_cur].rfind("/") + 1:end_cur]
						return "tistory.com", tistory_id

					if t_url.startswith("http://blog.rss.naver.com/") and t_url.endswith(".xml") and len(links) < 5:
						return "blog.naver.com", t_url.replace("http://blog.rss.naver.com/", "").replace(".xml", "")

				if link.tag == "REFRESH":
					netloc = urlparse.urlparse(tt_url).netloc
					if tt_url.find(".tistory.com/") >= 0:
						tistory_id = netloc[:netloc.find(".")]
						return "tistory.com", tistory_id
					else:
						# Follow the meta-refresh target and re-check that host.
						return self.checkDomain("http://%s/" % netloc)

				if len(links) < 3:
					if link.url.find("blog.naver.com") >= 0:     # http://baljak.com/
						try:
							ret_dict = self.bf.getAllDataDic(link.url)
							print link.url
							if ret_dict and ret_dict["gen"] == "blog.naver.com":
								naver_id = ret_dict["cid"]
								if naver_id not in ["PostList"]:
									return "blog.naver.com", naver_id
						except Exception as msg:
							print msg

			except Exception as msg:
				getLogger().error(msg)
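
The double-rfind slicing above, which pulls the blog id out of a redirected Tistory feed URL, is easy to misread, so here is a self-contained restatement of just that step. The function name is made up for illustration, and the sample URL is the one noted in the __init__ comment.

# Standalone restatement of the "/rss.xml" id extraction above; the
# function name is hypothetical, not part of the original class.
def tistoryIdFromFeedUrl(t_url):
	# Take the path segment immediately before the trailing "/rss.xml".
	end_cur = t_url.rfind("/")
	return t_url[t_url[:end_cur].rfind("/") + 1:end_cur]

print tistoryIdFromFeedUrl("http://cfs.tistory.com/custom/named/bw/bwell/rss.xml")  # bwell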