class JDHandler(object):
    """Service handler that parses job-description HTML into a ``JdRaw``."""

    def __init__(self):
        # Original note: initialise the parsers for the four websites.
        self.jd_parser = JdParser()

    def analyzeHtml(self, htmlContent=None, jdFrom=None):
        """Parse a job-description page and wrap the result in raw structs.

        Args:
            htmlContent: HTML source of the job-description page.
            jdFrom: Name of the site the page came from; the original note
                lists lagou, 51job, zhilian and liepin.

        Returns:
            ``JdRaw`` built from the parser output, whose ``jdInc`` and
            ``jdJob`` entries have been converted to ``JdIncRaw`` and
            ``JdJobRaw``.

        Raises:
            ValueError: If ``jdFrom`` is empty or ``None``.
            NamedError: If the parser raises; carries the text of the
                underlying exception.
        """
        if not jdFrom:
            raise ValueError("jdFrom invalid")

        try:
            # detail=False is the basic parse, detail=True the detailed one.
            result = self.jd_parser.parser(
                htmlContent, jdFrom=jdFrom, detail=True)
        except Exception as e:
            # str(e) rather than e.message: ``message`` is deprecated since
            # Python 2.6, is empty for multi-argument exceptions and does
            # not exist on Python 3.
            raise NamedError(str(e))

        # jdId and jdUrl are not set here; per the original note they are
        # filled in later, when they are needed.
        result["jdInc"] = JdIncRaw(**result["jdInc"])
        result["jdJob"] = JdJobRaw(**result["jdJob"])
        return JdRaw(**result)
class JDHandler(object):
    """Wraps a ``JdParser`` and returns its output as a ``JdRaw`` struct."""

    def __init__(self):
        # Original note: initialise the parsers for the four websites.
        self.jd_parser = JdParser()

    def analyzeHtml(self, htmlContent=None, jdFrom=None):
        """Run the detailed parse on one job-description page.

        Args:
            htmlContent: HTML source to parse.
            jdFrom: Source-site name; the original note lists lagou,
                51job, zhilian and liepin as the accepted values.

        Returns:
            A ``JdRaw`` constructed from the parser's result dict after its
            ``jdInc`` / ``jdJob`` entries are wrapped in ``JdIncRaw`` /
            ``JdJobRaw``.

        Raises:
            ValueError: When ``jdFrom`` is falsy.
            NamedError: When the parser raises any exception; the message
                is the string form of that exception.
        """
        if not jdFrom:
            raise ValueError("jdFrom invalid")

        try:
            # detail=True requests the detailed parse (False = basic parse).
            result = self.jd_parser.parser(
                htmlContent, jdFrom=jdFrom, detail=True)
        except Exception as e:
            # Use str(e): ``e.message`` is deprecated since Python 2.6,
            # empty when the exception has several args, and missing on
            # Python 3.
            raise NamedError(str(e))

        inc_fields = result["jdInc"]
        job_fields = result["jdJob"]
        # jdId and jdUrl are left untouched here; the original note says
        # they are filled in when needed.
        result["jdInc"] = JdIncRaw(**inc_fields)
        result["jdJob"] = JdJobRaw(**job_fields)
        return JdRaw(**result)
def __init__(self):
    """Create a ``JdParser`` and keep it on the instance as ``jd_parser``."""
    # Original note: initialise the parsers for the four websites.
    self.jd_parser = JdParser()
class JdStringHandler(tornado.web.RequestHandler):
    """Tornado handler that parses a job description sent as POST arguments."""

    def post(self):
        """Parse the posted ``htmlContent`` / ``jdFrom`` and write the result.

        Any exception raised while reading the arguments or parsing is
        caught and reported to the client as ``{"error": <message>}``.
        """
        try:
            htmlContent = self.get_argument("htmlContent", "")
            jdFrom = self.get_argument("jdFrom", "")
            # ``jdParser`` is the module-level instance created in the
            # ``__main__`` block below.
            # NOTE(review): nothing in this method assigns ``type``, so
            # unless a module-level ``type`` exists outside this view the
            # builtin ``type`` is passed here - confirm what the parser
            # expects for this argument.
            result = jdParser.parser(
                htmlContent=htmlContent, jdFrom=jdFrom, type=type)
        except Exception as e:
            result = {"error": str(e)}
        self.write(result)


if __name__ == "__main__":
    # Parser instance shared by the request handlers.
    jdParser = JdParser()
    tornado.options.parse_command_line()
    app = tornado.web.Application(
        handlers=[
            (r'/', JdParserHandler),
            (r'/jdparser', JdParserHandler),
            (r"/jdUrl", JdUrlHandler),
        ],
        template_path=os.path.join(os.path.dirname(__file__), "templates"),
        static_path='/home/deng/workplace/jd_data/test_cos/',
        debug=True,
        autoescape=None,
    )
    http_server = tornado.httpserver.HTTPServer(app)
    http_server.listen(options.port)
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("starting tornado at port %d..." % options.port)
    tornado.ioloop.IOLoop.instance().start()
# Counter of files handled by parser_single in the current process.
cnt = 1


def parser_single(fname):
    """Parse one saved job-description HTML file into a normalised dict.

    Reads ``fname`` as UTF-8, runs it through the module-level ``test``
    parser with ``jdFrom="jobui"``, normalises the keys with ``normal_key``
    and adds a ``jobCate`` entry (derived from ``jobPosition``) and a
    ``file_name`` entry (the part of ``fname`` after the last ``/``).

    Args:
        fname: Path of the HTML file to parse.

    Returns:
        The normalised result dict.
    """
    global cnt
    print("%s %s" % (fname, cnt))
    # Context manager so the file handle is closed even if read() fails.
    with codecs.open(fname, 'rb', 'utf-8') as fh:
        htmlContent = fh.read()
    result = test.parser(htmlContent, jdFrom="jobui")
    res = normal_key(result)
    res["jobCate"] = jobName2jobCate(res["jobPosition"].decode("utf-8"))
    res["file_name"] = fname.rsplit('/', 1)[-1]
    cnt += 1
    return res


if __name__ == "__main__":
    import multiprocessing
    import time

    # ``test`` must exist before the pool is created because parser_single
    # reads it as a module-level global.
    test = JdParser()
    path = '/home/jkmiao/Desktop/jobui_yy/'
    fnames = [path + fname for fname in os.listdir(path)]

    # Wall-clock time: time.clock() reports CPU time of this process on
    # Unix, which excludes the work done in the pool workers, and it was
    # removed in Python 3.8.
    start = time.time()
    pool = multiprocessing.Pool(8)
    try:
        data = pool.map(parser_single, fnames)
    finally:
        # Release the worker processes whether or not map() succeeded.
        pool.close()
        pool.join()

    data2xls(data, fname="./test_jds/jd_jobui_yy_new.xls")
    # NOTE: ``cnt`` here is this process's copy; increments made inside the
    # pool workers happen in their own processes and are not visible here.
    print("done %s %s" % (cnt, len(data)))
    print("time used %s" % (time.time() - start))