Esempio n. 1
0
    def __init__(self):
        """Create the parser instance held on ``self.jd_parser``."""
        # Initialize the parsers for the four sites.
        # NOTE(review): only one parser (JdParser) is constructed in the
        # visible excerpt -- confirm where the remaining ones are created.

        self.jd_parser = JdParser()
Esempio n. 2
0
class JdStringHandler(tornado.web.RequestHandler):
    """Request handler that parses a job description posted as an HTML string.

    Uses the module-global ``jdParser`` object to do the parsing.
    """

    def post(self):
        """Parse the posted HTML and write the outcome to the response.

        Request arguments (both default to the empty string):
            htmlContent: the HTML text to parse.
            jdFrom: forwarded unchanged to ``jdParser.parser``.

        Any exception raised while reading the arguments or parsing is
        reported to the client as ``{"error": <message>}`` instead of
        propagating.
        """
        try:
            htmlContent = self.get_argument("htmlContent", "")
            jdFrom = self.get_argument("jdFrom", "")
            # NOTE(review): there is no local or request-derived ``type``
            # here, so the *builtin* ``type`` is what gets passed. This looks
            # unintended -- confirm what ``jdParser.parser`` expects for
            # ``type`` before changing it.
            result = jdParser.parser(htmlContent=htmlContent,
                                     jdFrom=jdFrom,
                                     type=type)
        except Exception as e:  # request boundary: report instead of raising
            result = {"error": str(e)}
        self.write(result)


if __name__ == "__main__":
    # Script entry point: build the shared parser, configure the Tornado
    # application and serve until the IO loop is stopped.

    # Module-global parser instance; JdStringHandler.post looks it up by
    # this exact name.
    jdParser = JdParser()

    tornado.options.parse_command_line()
    app = tornado.web.Application(
        handlers=[(r'/', JdParserHandler), (r'/jdparser', JdParserHandler),
                  (r"/jdUrl", JdUrlHandler)],
        template_path=os.path.join(os.path.dirname(__file__), "templates"),
        # NOTE(review): machine-specific absolute path -- confirm it exists
        # on the deployment host.
        static_path='/home/deng/workplace/jd_data/test_cos/',
        debug=True,
        autoescape=None,
    )
    http_server = tornado.httpserver.HTTPServer(app)
    http_server.listen(options.port)
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("starting tornado at port %d..." % options.port)
    tornado.ioloop.IOLoop.instance().start()
Esempio n. 3
0
# Running count of files handled by parser_single; starts at 1 and is
# incremented after each file. It is a plain module global, so when
# parser_single runs inside multiprocessing worker processes each process
# updates only its own copy.
cnt = 1


def parser_single(fname):
    """Parse one saved job-description HTML file and return its record.

    Args:
        fname: path of a UTF-8 encoded HTML file.

    Returns:
        The dict produced by ``normal_key`` for the parsed content, extended
        with ``"jobCate"`` (derived from ``"jobPosition"``) and
        ``"file_name"`` (the last ``/``-separated component of ``fname``).

    Side effects:
        Prints ``fname`` and the current counter, then increments the
        module-global ``cnt``.
    """
    global cnt
    print("%s %s" % (fname, cnt))
    # Close the file deterministically; the handle used to be left open,
    # leaking one descriptor per parsed file.
    with codecs.open(fname, 'rb', 'utf-8') as html_file:
        htmlContent = html_file.read()
    result = test.parser(htmlContent, jdFrom="jobui")
    res = normal_key(result)
    res["jobCate"] = jobName2jobCate(res["jobPosition"].decode("utf-8"))
    res["file_name"] = fname.rsplit('/', 1)[-1]
    cnt += 1
    return res


if __name__ == "__main__":
    # Batch driver: parse every file in ``path`` with a pool of 8 worker
    # processes and export the collected records to an .xls file.
    import multiprocessing
    import time

    # parser_single reads this module-global by name, so it has to exist
    # before the worker pool is created.
    test = JdParser()

    path = '/home/jkmiao/Desktop/jobui_yy/'
    fnames = [os.path.join(path, fname) for fname in os.listdir(path)]
    # Wall-clock timing: time.clock() reported this process's CPU time on
    # Unix (excluding the workers) and no longer exists in Python 3.8+.
    start = time.time()
    pool = multiprocessing.Pool(8)
    try:
        data = pool.map(parser_single, fnames)
    finally:
        # Release the worker processes even if a task raised.
        pool.close()
        pool.join()
    data2xls(data, fname="./test_jds/jd_jobui_yy_new.xls")

    # NOTE(review): ``cnt`` is incremented inside the worker processes, so
    # the value printed here is this process's own copy; len(data) is the
    # reliable count of parsed files.
    print('done %s %s' % (cnt, len(data)))
    print('time used %s' % (time.time() - start))