def process_item(self, item, spider):
    """Convert ``item`` into a ``LagouType`` document, save it, and return ``item``.

    Looks like a Scrapy-style item-pipeline hook (TODO confirm against the
    enclosing class); ``spider`` is accepted but not used here.

    Only a subset of the job fields is copied, each defaulting to an empty
    string when absent from ``item``. The document id is taken from
    ``item["url_object_id"]`` with a strict lookup, so a missing id fails
    instead of being replaced by a default.

    NOTE: job_city, work_years, degree_need, job_type, publish_time,
    job_advantage, company_name, company_url and tags are currently not
    copied onto the document by this method.
    """
    copied_fields = (
        "title",
        "url",
        "url_object_id",
        "salary",
        "job_desc",
        "job_addr",
    )

    document = LagouType()
    for field_name in copied_fields:
        setattr(document, field_name, item.get(field_name, ""))

    # Strict subscript on purpose: unlike the fields above there is no fallback id.
    document.meta.id = item["url_object_id"]
    document.save()
    return item
def save_to_es(self):
    """Copy this item's fields onto a ``LagouType`` document and save it.

    ``self`` is read with subscript access, so it must provide every key
    used below. After the document is saved, the ``"lagou_count"`` counter
    is incremented through ``redis_cli``. Returns ``None``.
    """
    document = LagouType()
    document.title = self["title"]
    # The document calls this field create_date; the item calls it crawl_time.
    document.create_date = self["crawl_time"]

    same_named_fields = (
        "url",
        "url_object_id",
        "salary",
        "job_city",
        "work_years",
        "degree_need",
        "job_type",
        "tags",
        "publish_time",
        "job_advantage",
        "job_desc",
        "job_addr",
        "company_name",
        "company_url",
    )
    for field_name in same_named_fields:
        setattr(document, field_name, self[field_name])

    # (text, weight) pairs handed to gen_suggests together with the index.
    index = LagouType._doc_type.index
    weighted_texts = ((document.title, 10), (document.tags, 7))
    document.suggest = gen_suggests(index, weighted_texts)

    document.save()
    redis_cli.incr("lagou_count")
def save_to_es(self):
    """Turn this item into a ``LagouType`` document and save it.

    Every field listed below is read from ``self`` with subscript access
    and stored under the same name on the document. After the save, the
    ``"job_count"`` counter is incremented through ``redis_cli``.
    Returns ``None``.
    """
    copied_fields = (
        "title",
        "url",
        "url_object_id",
        "salary",
        "job_city",
        "degree_need",
        "job_type",
        "job_advantage",
        "job_desc",
        "job_addr",
        "company_name",
        "company_url",
        "tags",
    )

    document = LagouType()
    for field_name in copied_fields:
        setattr(document, field_name, self[field_name])

    # (text, weight) pairs handed to get_suggests together with the index.
    index = LagouType._doc_type.index
    weighted_texts = (
        (document.title, 10),
        (document.tags, 7),
        (document.job_desc, 4),
        (document.job_type, 3),
    )
    document.suggest = get_suggests(index, weighted_texts)

    document.save()
    redis_cli.incr("job_count")
def save_to_es(self):
    """Save this item as a ``LagouType`` document and count the saved job.

    The document id is derived from the item's URL via ``extract_num``.
    ``crawl_time`` and ``crawl_update_time`` are both set to the current
    local time formatted with ``SQL_DATE_FORMAT``. All other fields are
    read from ``self`` with subscript access, so a missing key propagates
    whatever error ``self[...]`` raises.

    The ``"lagou_count"`` counter is incremented only after ``save()``
    returns, so a failed save does not inflate the count. Returns ``None``.
    """
    crawl_time = datetime.datetime.now().strftime(SQL_DATE_FORMAT)
    job_id = extract_num(self["url"])

    article = LagouType()
    article.meta.id = job_id
    article.title = self["title"]
    article.url = self["url"]
    article.salary = self["salary"]
    article.job_city = self["job_city"]
    article.work_years = self["work_years"]
    article.degree_need = self["degree_need"]
    article.job_type = self["job_type"]
    article.publish_time = self["publish_time"]
    article.job_advantage = self["job_advantage"]
    article.job_desc = self["job_desc"]
    article.job_addr = self["job_addr"]
    article.company_name = self["company_name"]
    article.crawl_time = crawl_time
    article.crawl_update_time = crawl_time

    # (text, weight) pairs handed to gen_suggest together with the index.
    article.suggest = gen_suggest(
        LagouType._doc_type.index,
        (
            (article.title, 10),
            (article.company_name, 9),
            (article.job_desc, 8),
            (article.job_addr, 7),
        ),
    )

    article.save()
    # Previously the counter was bumped before save(), so an exception during
    # the save still left the count incremented.
    redis_cli.incr("lagou_count")
def save_to_es(self):
    """Copy this item's fields onto a ``LagouType`` document and save it.

    Fields are read from ``self`` with subscript access. ``job_desc`` is
    passed through ``remove_tags`` before being stored, and ``tags`` is
    copied only when the item contains that key. Returns ``None``.
    """
    fields_before_desc = (
        "title",
        "url",
        "url_object_id",
        "salary",
        "job_city",
        "work_years",
        "degree_need",
        "job_type",
        "publish_time",
        "job_advantage",
    )
    fields_after_desc = ("job_addr", "company_name", "company_url")

    document = LagouType()
    for field_name in fields_before_desc:
        setattr(document, field_name, self[field_name])

    document.job_desc = remove_tags(self["job_desc"])

    for field_name in fields_after_desc:
        setattr(document, field_name, self[field_name])

    # tags is the one optional key; every other field is required.
    if "tags" in self:
        document.tags = self["tags"]
    document.crawl_time = self["crawl_time"]

    # (text, weight) pairs handed to gen_suggests together with the index.
    # NOTE(review): document.tags is read here even when the item had no
    # "tags" key; what that yields depends on LagouType, which is not shown.
    index = LagouType._doc_type.index
    weighted_texts = (
        (document.title, 10),
        (document.tags, 7),
        (document.job_desc, 5),
    )
    document.suggest = gen_suggests(index, weighted_texts)

    document.save()