Exemple #1
0
class JDHandler(object):
    """Wrap a ``JdParser`` and turn its dict output into ``JdRaw`` objects."""

    def __init__(self):
        # Initialize the parser used for the four supported sites.
        self.jd_parser = JdParser()

    def analyzeHtml(self, htmlContent=None, jdFrom=None):
        """Parse a job-description page and return it wrapped as ``JdRaw``.

        Args:
            htmlContent: HTML source of the page to parse.
            jdFrom: Name of the site the page came from; the original notes
                list lagou, 51job, zhilian and liepin.

        Returns:
            ``JdRaw(**result)``, where ``result`` is the parser's output with
            its "jdInc" and "jdJob" entries replaced by ``JdIncRaw`` and
            ``JdJobRaw`` instances built from the original entries.

        Raises:
            ValueError: If ``jdFrom`` is falsy (None, empty string, ...).
            NamedError: If the underlying parser raises any exception.
        """
        if not jdFrom:
            raise ValueError("jdFrom invalid")

        try:
            # detail=False requests basic parsing, detail=True detailed parsing.
            result = self.jd_parser.parser(htmlContent, jdFrom=jdFrom, detail=True)
        except Exception as e:
            # ``e.message`` does not exist on Python 3 and is empty for
            # exceptions built with several arguments; fall back to str(e).
            raise NamedError(getattr(e, "message", "") or str(e))

        # Read both sub-dicts before mutating ``result`` so a missing key
        # fails before anything has been replaced.
        result_inc = result["jdInc"]
        result_job = result["jdJob"]

        # jdId and jdUrl are left to be filled in later, when processing
        # requires them.
        result["jdInc"] = JdIncRaw(**result_inc)
        result["jdJob"] = JdJobRaw(**result_job)

        return JdRaw(**result)
class JDHandler(object):
    """Adapter that runs ``JdParser`` and packs the result into ``JdRaw``."""

    def __init__(self):
        # Initialize the parser used for the four supported sites.
        self.jd_parser = JdParser()

    def analyzeHtml(self, htmlContent=None, jdFrom=None):
        """Run the detailed parser on ``htmlContent`` and return a ``JdRaw``.

        Args:
            htmlContent: HTML source to parse.
            jdFrom: Source-site identifier; the original notes name lagou,
                51job, zhilian and liepin.

        Returns:
            A ``JdRaw`` built from the parser's result dict, after the
            "jdInc" and "jdJob" entries have been converted to ``JdIncRaw``
            and ``JdJobRaw`` respectively.

        Raises:
            ValueError: If ``jdFrom`` is falsy.
            NamedError: If the parser call raises any exception.
        """
        if not jdFrom:
            raise ValueError("jdFrom invalid")

        try:
            # detail=False is the basic parse, detail=True the detailed one.
            result = self.jd_parser.parser(
                htmlContent, jdFrom=jdFrom, detail=True)
        except Exception as e:
            # Prefer the legacy ``message`` attribute when it is present and
            # non-empty; otherwise use str(e), which also works on Python 3.
            raise NamedError(getattr(e, "message", "") or str(e))

        # Look up both entries first so a missing key raises before the
        # result dict is modified.
        result_inc = result["jdInc"]
        result_job = result["jdJob"]

        # jdId and jdUrl are intentionally not set here; they are filled in
        # later when processing needs them.
        result["jdInc"] = JdIncRaw(**result_inc)
        result["jdJob"] = JdJobRaw(**result_job)

        return JdRaw(**result)
Exemple #3
0
    def __init__(self):
        """Create the ``JdParser`` instance and keep it on ``self.jd_parser``."""
        # Initialize the parser used for the four supported sites.
        parser = JdParser()
        self.jd_parser = parser
Exemple #4
0
class JdStringHandler(tornado.web.RequestHandler):
    """Tornado handler that parses job-description HTML sent via POST."""

    def post(self):
        """Parse the posted HTML and write the outcome to the response.

        Reads the ``htmlContent`` and ``jdFrom`` request arguments (each
        defaulting to ""), hands them to the global ``jdParser`` and writes
        whatever it returns. Any exception is reported to the client as
        ``{"error": <message>}`` instead of propagating.
        """
        try:
            htmlContent = self.get_argument("htmlContent", "")
            jdFrom = self.get_argument("jdFrom", "")
            # NOTE(review): no local ``type`` is defined in this method, so
            # the builtin ``type`` is what gets passed here - confirm what
            # the parser expects for this argument.
            result = jdParser.parser(htmlContent=htmlContent,
                                     jdFrom=jdFrom,
                                     type=type)
        except Exception as e:  # "as" form is valid on Python 2.6+ and 3
            result = {"error": str(e)}
        self.write(result)


if __name__ == "__main__":
    # Global parser instance; JdStringHandler.post looks it up by this name.
    jdParser = JdParser()

    tornado.options.parse_command_line()
    # NOTE(review): JdStringHandler is not routed in the handler list below -
    # confirm whether that is intentional.
    # NOTE(review): static_path is a hard-coded absolute path and debug mode
    # is enabled; both look like development settings - confirm before deploy.
    app = tornado.web.Application(
        handlers=[(r'/', JdParserHandler), (r'/jdparser', JdParserHandler),
                  (r"/jdUrl", JdUrlHandler)],
        template_path=os.path.join(os.path.dirname(__file__), "templates"),
        static_path='/home/deng/workplace/jd_data/test_cos/',
        debug=True,
        autoescape=None,
    )
    http_server = tornado.httpserver.HTTPServer(app)
    http_server.listen(options.port)
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("starting tornado at port %d..." % options.port)
    tornado.ioloop.IOLoop.instance().start()
    def __init__(self):
        """Build a ``JdParser`` and store it as ``self.jd_parser``."""
        # Initialize the parser used for the four supported sites.
        parser = JdParser()
        self.jd_parser = parser
Exemple #6
0
# Running count of files handled by parser_single in the current process.
cnt = 1


def parser_single(fname):
    """Parse one saved "jobui" page and return its normalized record.

    Reads ``fname`` as UTF-8, runs the global ``test`` parser on it with
    ``jdFrom="jobui"``, normalizes the keys with ``normal_key`` and adds two
    fields: "jobCate" (from ``jobName2jobCate`` applied to the decoded
    "jobPosition" value) and "file_name" (the part of ``fname`` after the
    last '/').

    Side effects: prints ``fname`` and the current counter, then increments
    the module-level ``cnt``.
    """
    global cnt
    # Formatted single-argument print gives the same output on Python 2 and 3.
    print("%s %s" % (fname, cnt))
    # Use a context manager so the file handle is closed even if read() fails.
    with codecs.open(fname, 'rb', 'utf-8') as fin:
        htmlContent = fin.read()
    result = test.parser(htmlContent, jdFrom="jobui")
    res = normal_key(result)
    # presumably "jobPosition" is a UTF-8 byte string here - TODO confirm
    res["jobCate"] = jobName2jobCate(res["jobPosition"].decode("utf-8"))
    res["file_name"] = fname.rsplit('/', 1)[-1]
    cnt += 1
    return res


if __name__ == "__main__":
    import multiprocessing
    import time

    # Global parser used by parser_single.
    # NOTE(review): it is only assigned under this guard, so worker processes
    # must inherit it from the parent (fork start method) - confirm.
    test = JdParser()

    path = '/home/jkmiao/Desktop/jobui_yy/'
    fnames = [path + fname for fname in os.listdir(path)]
    # Wall-clock time: time.clock() reported only this process's CPU time on
    # Unix (excluding the pool workers) and was removed in Python 3.8.
    start = time.time()
    pool = multiprocessing.Pool(8)
    try:
        data = pool.map(parser_single, fnames)
    finally:
        # Shut the worker processes down instead of leaving them to exit.
        pool.close()
        pool.join()
    data2xls(data, fname="./test_jds/jd_jobui_yy_new.xls")

    # NOTE(review): ``cnt`` is incremented inside the worker processes, so
    # the value printed here is this process's own, unchanged copy.
    print("done %s %s" % (cnt, len(data)))
    print("time used %s" % (time.time() - start))