class CustomParser:
    def __init__(self, type="service"):
        self.prm = None
        self.parser = HttpParser()
        self.rss_parser = rssParser()
        self.df = DateFactory()
        self.field_transformer = FieldTransformer()
        self.type = type

    def setRules(self, cursor=None):
        self.prm = ParsingRuleManager(self.type)
        self.prm.makeRule(cursor)

    def getLinks(self, header, html, url):
        # Feed-style URLs go through the RSS parser; everything else through the HTML parser.
        if url.find("/feed/") > 0 or url.find("feeds.feedburner.com") > 0 \
                or url.endswith(".xml") or url.find("=rss") > 0 or url.find("/rss") > 0:
            ret_dict = self.rss_parser.parse(url, html)
        else:
            ret_dict = self.parser.plugParser(header, html, url)
        if "links" in ret_dict:
            return ret_dict["links"]
        return dict()

    def parse(self, header, html, url, parser_id=None):
        if self.prm is None:
            self.setRules()
        ret_dict = self.parser.plugParser(header, html, url)
        result_dict = dict()
        # An explicit parser_id takes precedence over any domain-based lookup.
        if parser_id is not None and parser_id in self.prm.id_dict:
            try:
                host_rule = self.prm.id_dict[parser_id]
                result_dict = self.getDataByRule(host_rule, ret_dict, url)
                result_dict["parser_id"] = parser_id
                return result_dict
            except Exception as msg:
                getLogger().error(msg)
        # Otherwise fall back to a template rule registered for the page's domain.
        domain = ret_dict["domain"]
        if domain in self.prm.template_rules:
            try:
                t_parser_id, org_parser_id = self.prm.template_rules[domain]
                host_rule = self.prm.id_dict[t_parser_id]
                result_dict = self.getDataByRule(host_rule, ret_dict, url)
                result_dict["parser_id"] = org_parser_id
                return result_dict
            except Exception as msg:
                getLogger().error(msg)
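# Usage sketch (not part of the original module): driving CustomParser end to
# end needs a downloaded page, so this assumes Downloader (used by BlogChecker
# below) is available here; the example URL is hypothetical.
def _demo_custom_parser():
    parser = CustomParser(type="service")
    parser.setRules()  # build rules without a DB cursor
    url, header, html = Downloader().open("http://example.com/feed/")
    # "/feed/" in the URL routes getLinks() through rssParser rather than HttpParser
    links = parser.getLinks(header, html, url)
    # parse() dispatches on an explicit parser_id first, then on template_rules by domain
    result = parser.parse(header, html, url)
    return links, result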
def readClient(self):
    # Qt socket slot: parse the incoming HTTP request and answer with a
    # static file, an ajax dispatch, or an error page.
    s = self.sender()
    headers = str(s.readAll())
    p = HttpParser()
    plen = p.execute(headers, len(headers))
    if p.get_method() == "GET":
        path = p.get_path()
        try:
            if path.startswith("/ajax"):
                # /ajax/<method>/<arg> dispatches to a handler method on self.
                code = 200
                ext = 'json'
                content = ""
                method, arg = path.split('/')[2:]
                getattr(self, method)(int(arg))
            else:
                # Static file serving rooted at self._root.
                try:
                    _path = os.path.join(self._root, path[1:])
                    if not os.path.exists(_path):
                        raise Error404
                    elif os.path.isdir(_path):
                        _path = os.path.join(_path, 'index.html')
                        if not os.path.exists(_path):
                            raise Error404
                    ext = os.path.splitext(_path)[1][1:].lower()
                    code = 200
                    with open(_path, 'rb') as f:
                        content = f.read()
                except Error404 as e:
                    code = 404
                    ext = 'html'
                    content = '<h1>404 - File Not Found ({0})</h1>'.format(path)
        except Exception as e:
            code = 500
            ext = 'html'
            content = '<h1>500 - Internal Error</h1>'
            print e
        _resp = {
            'code': code,
            'status': status_reasons[code],
            'content-type': ext2ct[ext],
            'content': content,
        }
        response = RESPONSE.format(**_resp)
    elif p.get_method() == 'POST':
        print "POST", headers
        response = ''
    else:
        response = ''
    s.writeData(response)
    s.waitForBytesWritten()
    s.close()
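# Sketch of the module-level tables readClient() indexes into (an assumption
# reconstructed from how they are used above; the real values and template
# may differ). RESPONSE is filled via str.format(**_resp), whose CPython
# implementation accepts the hyphenated 'content-type' key.
status_reasons = {200: 'OK', 404: 'Not Found', 500: 'Internal Server Error'}
ext2ct = {
    'html': 'text/html',
    'json': 'application/json',
    'css': 'text/css',
    'js': 'application/javascript',
}
RESPONSE = ('HTTP/1.1 {code} {status}\r\n'
            'Content-Type: {content-type}\r\n'
            'Connection: close\r\n'
            '\r\n'
            '{content}')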
class BlogChecker:
    def __init__(self):
        self.downLoader = Downloader()
        self.attr_dict = dict()
        self.attr_dict["tid"] = """__addParam("author","****");"""
        self.attr_dict["r_url"] = """ """  # http://cfs.tistory.com/custom/named/bw/bwell/rss.xml
        self.domain_match = dict()
        self.http_parser = HttpParser()
        self.bf = BlogUrlFactory()

    def checkDomain(self, domain):
        #header, url, html = getDownloadData(domain, self.opener)
        url, header, html = self.downLoader.open(domain)
        print "download ", domain
        if url == "ERROR" or header is None:
            return None, None
        parsing_result = self.http_parser.plugParser(header, html, url)
        links = self.http_parser.html_parser.links
        isTistory = False
        naver_id = ""
        if html.find("TOP_SSL_URL = 'https://www.tistory.com';") >= 0:
            isTistory = True
        feed_urls = set()
        for tt_url in links:
            try:
                link = links[tt_url]
                # Tistory pages expose numeric post paths; derive the blog id from one.
                if isTistory and tt_url.find(domain) >= 0:
                    try:
                        path = urlparse.urlparse(tt_url).path[1:]
                        postno = int(path)
                        tistory_id = getTistoryId(tt_url)
                        if tistory_id:
                            return "tistory.com", tistory_id
                        isTistory = False
                    except Exception as msg:
                        pass
                # Candidate RSS links: follow redirects to spot Tistory or Naver feeds.
                if link.inout.find("R") > 0 and tt_url.find("/response") < 0 \
                        and tt_url.find("atom") < 0 and tt_url.find("comments") < 0:
                    feed_urls.add(tt_url)
                    status, response = self.downLoader.getResponse(tt_url)
                    t_url = response.url
                    if t_url != link.url and t_url.find("tistory.com") >= 0 and t_url.endswith("/rss.xml"):
                        # Blog id is the path segment just before "/rss.xml".
                        end_cur = t_url.rfind("/")
                        tistory_id = t_url[t_url[:end_cur].rfind("/") + 1:end_cur]
                        return "tistory.com", tistory_id
                    if t_url.startswith("http://blog.rss.naver.com/") and t_url.endswith(".xml") and len(links) < 5:
                        return "blog.naver.com", t_url.replace("http://blog.rss.naver.com/", "").replace(".xml", "")
                # Meta-refresh redirects: re-check the target host.
                if link.tag == "REFRESH":
                    netloc = urlparse.urlparse(tt_url).netloc
                    if tt_url.find(".tistory.com/") >= 0:
                        tistory_id = netloc[:netloc.find(".")]
                        return "tistory.com", tistory_id
                    else:
                        return self.checkDomain("http://%s/" % netloc)
                # Sparse pages that link straight to a Naver blog.
                if len(links) < 3:
                    if link.url.find("blog.naver.com") >= 0:  # http://baljak.com/
                        try:
                            ret_dict = self.bf.getAllDataDic(link.url)
                            print link.url
                            if ret_dict and ret_dict["gen"] == "blog.naver.com":
                                naver_id = ret_dict["cid"]
                                if naver_id not in ["PostList"]:
                                    return "blog.naver.com", naver_id
                        except Exception as msg:
                            print msg
            except Exception as msg:
                getLogger().error(msg)
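# Usage sketch (an assumption for illustration; the domain is hypothetical).
# checkDomain() yields a (generator, blog_id) pair such as ("tistory.com",
# "bwell"), or (None, None) when the download fails; it may also return None
# implicitly if no link matches, so guard before unpacking.
def _demo_blog_checker():
    checker = BlogChecker()
    result = checker.checkDomain("http://bwell.tistory.com/")
    if result and result[0]:
        gen, blog_id = result
        print "matched:", gen, blog_id
    return result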