def real_run(self, job):
    """Parse, de-duplicate, measure and store the CV carried by ``job``.

    ``job`` is read with ``.get`` for the keys ``indexUrl``,
    ``pagecontent``, ``updateTime`` and ``contentSign``.  The page content
    is parsed with ``self.cv_parser``; if ``self.check_has_duplication``
    reports a duplicate the method returns immediately without storing
    anything.  Otherwise the parsed data is turned into a raw CV object,
    measured with ``self.measure`` and handed to ``self.store``.

    Returns:
        None.
    """
    index_url = job.get('indexUrl')
    page_html = job.get('pagecontent')
    parsed = self.cv_parser.parser(htmlContent=page_html, cvFrom=self.channel)

    # Check whether this CV is a duplicate; duplicates are skipped.
    if self.check_has_duplication(index_url, parsed):
        return

    raw_obj = constructCvRawObj(parsed)
    # Attach the job-level metadata to the raw CV object.
    metadata = (
        ('cvId', index_url),
        ('cvFrom', self.channel),
        ('updateTime', job.get('updateTime')),
        ('contentSign', job.get('contentSign')),
    )
    for field_name, field_value in metadata:
        setattr(raw_obj, field_name, field_value)

    raw_cv = convertToCvRaw(raw_obj)
    measured = self.measure(raw_cv)
    if not measured:
        # Debug aid: dump the raw CV when measuring produced nothing.
        # NOTE(review): execution still continues and stores the empty
        # measure result -- confirm this fall-through is intended.
        print(raw_cv)

    measured_obj = constructCvMeasureObj(measured)
    # NOTE(review): the meaning of the trailing ``False`` flag is defined by
    # ``self.store``, which is not visible here.
    self.store(index_url, raw_obj, measured_obj, False)
def run(self, job):
    """Measure and store a CV supplied as an already-structured ``job``.

    Unlike the page-based runners, no parsing or duplicate check happens
    here: ``job`` itself is passed to ``constructCvRawObj``.  The measured
    result is stored through ``self.store_runner`` with ``None`` in the
    raw-object position.

    Args:
        job: mapping that must contain ``cvId``; ``updateTime`` is read
            with ``.get``.  On success the mapping gains an ``indexUrl``
            key equal to its ``cvId``.

    Raises:
        AssertionError: if ``job`` has no ``cvId`` key.  The job is left
            unmodified in that case.
    """
    # Validate before touching the job, so a rejected job is not left with
    # a bogus ``indexUrl`` of None.  Raised explicitly (rather than with an
    # ``assert`` statement) so the check still runs under ``python -O``
    # while callers keep seeing the same exception type.
    if "cvId" not in job:
        raise AssertionError("job must contain a 'cvId' key")
    cv_id = job['cvId']

    # Only for consistency with the other runners, which expect the job to
    # carry an indexUrl.
    job['indexUrl'] = cv_id

    raw_obj = constructCvRawObj(job)
    raw_obj.cvId = cv_id
    raw_obj.cvFrom = self.channel
    raw_obj.updateTime = job.get('updateTime')

    raw_cv = convertToCvRaw(raw_obj)
    measured = self.measure_runner.run(raw_cv)
    measured_obj = constructCvMeasureObj(measured)
    self.store_runner.run(raw_cv.cvId, None, measured_obj)
def run(self, job):
    """Run the parse -> de-duplicate -> measure -> store pipeline for ``job``.

    ``job`` is read with ``.get`` for ``indexUrl`` and ``pagecontent`` and is
    also passed on to ``self.add_other_fields``.  Each stage is delegated to
    a runner held on ``self`` (``parse_runner``, ``check_dupli_runner``,
    ``measure_runner``, ``store_runner``).  When the duplicate check returns
    a truthy value the method returns early and nothing is stored.

    Returns:
        None.
    """
    index_url = job.get('indexUrl')
    page_html = job.get('pagecontent')
    parsed = self.parse_runner.run(page_html)

    # Exclude duplicate CVs.
    is_duplicate = self.check_dupli_runner.run(index_url, parsed)
    if is_duplicate:
        return

    raw_obj = constructCvRawObj(parsed)
    self.add_other_fields(raw_obj, self.channel, job)

    raw_cv = convertToCvRaw(raw_obj)
    measured = self.measure_runner.run(raw_cv)

    # Convert to the mongoengine model before storing.
    measured_obj = constructCvMeasureObj(measured)
    self.store_runner.run(index_url, raw_obj, measured_obj)