Ejemplo n.º 1
0
    def real_run(self, job):
        """Parse one crawled page, measure the resulting CV and store both.

        ``job`` is read with ``.get`` for the keys ``indexUrl``,
        ``pagecontent``, ``updateTime`` and ``contentSign``.  The method
        always returns ``None``; it stops early, without storing anything,
        when ``self.check_has_duplication`` returns a truthy value.
        """
        index_url = job.get('indexUrl')
        parsed = self.cv_parser.parser(htmlContent=job.get('pagecontent'),
                                       cvFrom=self.channel)

        # Stop here when the CV is reported as a duplicate.
        if self.check_has_duplication(index_url, parsed):
            return

        # Build the raw CV object from the parse result, then fill in the
        # fields that come from the job / this runner instead of the parser.
        raw_obj = constructCvRawObj(parsed)
        raw_obj.cvId = index_url
        raw_obj.cvFrom = self.channel
        raw_obj.updateTime = job.get('updateTime')
        raw_obj.contentSign = job.get('contentSign')

        raw = convertToCvRaw(raw_obj)
        measure = self.measure(raw)

        # Debug aid: dump the raw CV when measuring produced nothing.
        # Processing deliberately continues afterwards, as before.
        if not measure:
            print(raw)

        measured_obj = constructCvMeasureObj(measure)

        # NOTE(review): the meaning of the trailing ``False`` flag is defined
        # by ``self.store`` and is not visible here.
        self.store(index_url, raw_obj, measured_obj, False)
Ejemplo n.º 2
0
    def run(self, job):
        """Measure a CV job identified by ``cvId`` and store the result.

        ``job`` must contain the key ``cvId``; ``updateTime`` is read with
        ``.get``.  On success ``job`` additionally gets an ``indexUrl`` entry
        holding the same value as ``cvId``.  Returns ``None``.

        Raises:
            AssertionError: if ``job`` has no ``cvId`` key.  ``job`` is left
                unmodified in that case.
        """
        # Validate before touching ``job``: a rejected job must not end up
        # with a bogus ``indexUrl: None`` entry.  Raised explicitly (rather
        # than via ``assert``) so the check also runs under ``python -O``.
        if 'cvId' not in job:
            raise AssertionError("job must contain 'cvId'")

        cv_id = job.get('cvId')
        # Only for consistency with the other runners, which expect indexUrl.
        job['indexUrl'] = cv_id

        cvRaw_obj = constructCvRawObj(job)
        cvRaw_obj.cvId = cv_id
        cvRaw_obj.cvFrom = self.channel
        cvRaw_obj.updateTime = job.get('updateTime')

        cvRaw = convertToCvRaw(cvRaw_obj)
        cvMeasure = self.measure_runner.run(cvRaw)
        cvMeasured_obj = constructCvMeasureObj(cvMeasure)
        # The second argument is passed as None here; only the measured
        # object is handed to the store runner.
        self.store_runner.run(cvRaw.cvId, None, cvMeasured_obj)
Ejemplo n.º 3
0
    def run(self, job):
        """Parse a crawled page, skip duplicates, then measure and store it.

        ``job`` is read with ``.get`` for ``indexUrl`` and ``pagecontent`` and
        is also passed on to ``self.add_other_fields``.  Returns ``None``;
        nothing is measured or stored when ``self.check_dupli_runner.run``
        returns a truthy value.
        """
        index_url = job.get('indexUrl')
        parsed = self.parse_runner.run(job.get('pagecontent'))

        # Skip duplicate CVs.
        is_duplicate = self.check_dupli_runner.run(index_url, parsed)
        if is_duplicate:
            return

        raw_obj = constructCvRawObj(parsed)
        self.add_other_fields(raw_obj, self.channel, job)
        raw = convertToCvRaw(raw_obj)
        measure = self.measure_runner.run(raw)

        # Convert to the mongoengine model before handing over to storage.
        measured_obj = constructCvMeasureObj(measure)
        self.store_runner.run(index_url, raw_obj, measured_obj)