def save_to_es(self):
    """Build an ``ArticleType`` document from this job item and save it.

    Item fields are copied onto the document one by one.  ``publish_time``
    is copied only when the item holds a truthy value for it.  After the
    document is saved, the ``lagou_count`` counter is incremented through
    ``redis_cli``.
    """
    doc = ArticleType()

    for field in (
        'url', 'url_object_id', 'title', 'salary', 'job_city',
        'work_years', 'degree_need', 'job_type',
    ):
        setattr(doc, field, self[field])

    publish_time = self['publish_time']
    if publish_time:
        doc.publish_time = publish_time

    for field in (
        'tags', 'job_advantage', 'job_desc', 'job_addr',
        'company_url', 'company_name', 'crawl_time',
    ):
        setattr(doc, field, self[field])
    # NOTE: crawl_update_time is not copied; that assignment was commented
    # out in the original code.

    # Title is paired with 10 and tags with 7 (presumably weights -- confirm
    # against gen_suggests).
    doc.suggest = gen_suggests(
        ArticleType._doc_type.index,
        ((doc.title, 10), (doc.tags, 7)),
    )
    doc.save()
    redis_cli.incr("lagou_count")
def save_to_es(self):
    """Build an ``ArticleType`` document from this item and save it.

    ``content`` is passed through ``remove_tags`` before being stored, and
    ``front_image_path`` is copied only when the item contains it.  The
    document id is the item's ``url_object_id``.  After saving, the
    ``jobble_count`` counter is incremented through ``redis_cli``.
    """
    doc = ArticleType()

    doc.title = self["title"]
    doc.create_date = self["create_date"]
    doc.content = remove_tags(self["content"])
    doc.front_image_url = self["front_image_url"]
    if "front_image_path" in self:
        doc.front_image_path = self["front_image_path"]

    for field in ("praise_nums", "fav_nums", "comment_nums", "url", "tags"):
        setattr(doc, field, self[field])

    doc.meta.id = self["url_object_id"]

    # Per the original note, a suggest entry has the shape
    # [{"input": [], "weight": 2}].
    doc.suggest = gen_suggests(
        ArticleType._doc_type.index,
        ((doc.title, 10), (doc.tags, 7)),
    )
    doc.save()
    redis_cli.incr("jobble_count")
def save_to_es(self):
    """Build an ``ArticleType`` document from this item and save it.

    ``front_image_path`` and ``download_urls`` are optional and copied only
    when the item contains them.  ``content`` is passed through
    ``remove_tags``.  The document id is the item's ``url_object_id``.
    """
    doc = ArticleType()

    for field in ("title", "url", "front_image_url"):
        setattr(doc, field, self[field])
    if "front_image_path" in self:
        doc.front_image_path = self["front_image_path"]

    for field in ("type", "size", "update_time"):
        setattr(doc, field, self[field])
    doc.content = remove_tags(self["content"])

    for field in ("tag", "fav_nums"):
        setattr(doc, field, self[field])
    if "download_urls" in self:
        doc.download_urls = self["download_urls"]

    doc.meta.id = self["url_object_id"]
    doc.suggest = gen_suggestions(
        ArticleType._doc_type.index,
        ((doc.title, 10), (doc.tag, 7)),
    )
    doc.save()
    # NOTE: the "lcsoft_count" redis counter increment was commented out in
    # the original code and remains disabled here.
def save_to_es(self):
    """Build an ``ArticleType`` document from this item and save it.

    ``content`` is passed through ``remove_tags``; ``front_image_path`` is
    copied only when the item contains it.  The document id is the item's
    ``url_object_id``.  After saving, the ``jobbole_count`` counter is
    incremented through ``redis_cli``.
    """
    doc = ArticleType()

    doc.title = self['title']
    doc.create_date = self['create_date']
    doc.content = remove_tags(self['content'])
    doc.front_image_url = self['front_image_url']
    if 'front_image_path' in self:
        doc.front_image_path = self['front_image_path']

    for field in ('praise_nums', 'fav_nums', 'comment_nums', 'url', 'tags'):
        setattr(doc, field, self[field])

    doc.meta.id = self['url_object_id']
    doc.suggest = gen_suggests(
        ArticleType._doc_type.index,
        ((doc.title, 10), (doc.tags, 7)),
    )

    # Save the document; per the original note this method is invoked from
    # the pipelines.
    doc.save()

    # redis_cli is a global client (per the original note); each call adds
    # one to the counter.
    redis_cli.incr("jobbole_count")
def save_to_es(self):
    """Convert this item into an ``ArticleType`` document and save it.

    The document id is the item's ``url_object_id``; ``content`` is passed
    through ``remove_tags`` before being stored.
    """
    doc = ArticleType()

    for field in ('title', 'create_date', 'url'):
        setattr(doc, field, self[field])

    doc.meta.id = self['url_object_id']

    for field in ('author', 'author_description', 'applause'):
        setattr(doc, field, self[field])

    cleaned_content = remove_tags(self['content'])
    doc.content = cleaned_content
    doc.save()
def save_to_es(self):
    """Build an ``ArticleType`` document from this item and save it.

    ``context`` is passed through ``remove_tags`` and the stored value is
    printed.  No explicit document id is assigned in this method.
    """
    doc = ArticleType()

    for field in ('title', 'times', 'redianzang', 'shoucang', 'pinglun'):
        setattr(doc, field, self[field])

    doc.context = remove_tags(self['context'])
    # NOTE(review): this writes the cleaned text to stdout on every save --
    # looks like leftover debug output; confirm before removing.
    print(doc.context)
    doc.url = self['url']

    # Per the original note, suggestion terms are stored in ES in the form
    # [{"input": [], "weight": 2}].
    doc.suggest = gen_suggests(
        ArticleType._doc_type.index,
        ((doc.title, 10), (doc.context, 7)),
    )
    doc.save()
def save_to_es(self):
    """Convert this item into ES data (an ``ArticleType`` document) and save it.

    ``content`` is passed through ``remove_tags``; ``front_image_path`` is
    copied only when the item contains it.  The document id is the item's
    ``url_object_id``.  After saving, the ``jobbole_count`` counter is
    incremented through ``redis_cli``.
    """
    doc = ArticleType()

    doc.title = self['title']
    doc.create_time = self['create_time']
    doc.content = remove_tags(self['content'])
    doc.front_image_url = self['front_image_url']
    if "front_image_path" in self:
        doc.front_image_path = self['front_image_path']

    for field in ('praise_num', 'fav_num', 'comment_num', 'url', 'tags'):
        setattr(doc, field, self[field])

    doc.meta.id = self['url_object_id']
    doc.suggest = gen_suggests(
        ArticleType._doc_type.index,
        ((doc.title, 10), (doc.tags, 7)),
    )
    doc.save()
    redis_cli.incr("jobbole_count")
def save_to_es(self):
    """Build an ``ArticleType`` document from this item and save it.

    ``content`` is passed through ``remove_tags``; ``front_image_path`` is
    copied only when the item contains it.  The document id is the item's
    ``url_object_id``.
    """
    doc = ArticleType()

    doc.title = self['title']
    doc.create_date = self['create_date']
    doc.content = remove_tags(self['content'])
    if 'front_image_path' in self:
        doc.front_image_path = self['front_image_path']

    for field in (
        'front_image_url', 'praise_nums', 'fav_nums',
        'comment_nums', 'url', 'tags',
    ):
        setattr(doc, field, self[field])

    doc.meta.id = self['url_object_id']
    doc.suggest = gen_suggests(
        ArticleType._doc_type.index,
        ((doc.title, 10), (doc.tags, 7)),
    )
    doc.save()
def save_to_es(self):
    """Build an ``ArticleType`` document from this item and save it.

    ``content`` is passed through ``remove_tags``; ``front_image_path`` is
    copied only when the item contains it.  The document id is the item's
    ``url_object_id``.
    """
    doc = ArticleType()

    doc.title = self["title"]
    doc.create_date = self["create_date"]
    doc.content = remove_tags(self["content"])
    doc.front_image_url = self["front_image_url"]
    if "front_image_path" in self:
        doc.front_image_path = self["front_image_path"]

    for field in ("praise_nums", "comment_nums", "fav_nums", "url", "tags"):
        setattr(doc, field, self[field])

    doc.meta.id = self["url_object_id"]
    doc.suggest = gen_suggests(
        ArticleType._doc_type.index,
        ((doc.title, 10), (doc.tags, 7)),
    )
    doc.save()
def process_item(self, item, spider):
    """Store ``item`` as an ``ArticleType`` document and return the item.

    :param item: mapping-like item whose fields are copied onto the document.
    :param spider: not used by this method.
    :return: the same ``item`` that was passed in.
    """
    doc = ArticleType()

    doc.title = item["title"]
    doc.url = item["url"]
    # Optional field: item.get yields None when the key is absent.
    doc.front_image_path = item.get("front_image_path")

    for field in (
        "front_image_url", "create_date", "praise_nums",
        "fav_nums", "comment_nums", "tag",
    ):
        setattr(doc, field, item[field])

    doc.content = remove_tags(item["content"])
    doc.meta.id = item["url_object_id"]
    doc.suggest = self.get_suggest(
        ArticleType._doc_type.index,
        ((doc.title, 10), (doc.tag, 7)),
    )
    doc.save()
    return item
def save2elastic(self):
    """Convert this item into ES data (an ``ArticleType`` document) and save it.

    ``front_image_path`` is copied only when the item contains it and
    ``content`` is passed through ``remove_tags``.  ``url_object_id`` is
    stored both as a regular field and as the document id.
    """
    doc = ArticleType()

    for field in ("title", "url_object_id", "url", "front_image_url"):
        setattr(doc, field, self[field])
    if "front_image_path" in self:
        doc.front_image_path = self["front_image_path"]

    for field in ("creat_date", "praise_num", "collect_num", "comment_num"):
        setattr(doc, field, self[field])

    doc.content = remove_tags(self["content"])
    doc.tags = self["tags"]
    doc.meta.id = self["url_object_id"]
    doc.suggest = gen_suggests(
        ArticleType._doc_type.index,
        ((doc.title, 10), (doc.tags, 7)),
    )
    doc.save()
def save_to_es(self):
    """Build an ``ArticleType`` document from this item and save it.

    Both image fields are optional and are copied only when the item
    contains them.  The document id is the item's ``url_object_id``.  After
    saving, the ``jobbole_count`` counter is incremented through
    ``redis_cli`` to record the number of inserted items.
    """
    article = ArticleType()
    article.title = self['title']
    article.create_date = self['create_date']

    # Fix: front_image_url used to be filled from the "front_image_path"
    # field, so the stored URL was actually the local path.  Each image
    # attribute now reads its own field.
    if "front_image_url" in self:
        article.front_image_url = self["front_image_url"]
    if "front_image_path" in self:
        article.front_image_path = self["front_image_path"]

    article.praise_nums = self["praise_nums"]
    article.fav_nums = self["fav_nums"]
    article.comment_nums = self["comment_nums"]
    article.url = self["url"]
    article.tags = self["tags"]
    article.meta.id = self["url_object_id"]
    article.suggest = gen_suggests(
        ArticleType._doc_type.index,
        ((article.title, 10), (article.tags, 7)),
    )
    # NOTE(review): content is stored as-is here (no remove_tags call) --
    # confirm that is intended.
    article.content = self["content"]
    article.save()

    # Record the number of inserted items in redis.
    redis_cli.incr("jobbole_count")
def save_to_es(self):
    """Convert this item into ES data (an ``ArticleType`` document) and save it.

    ``content`` is stored exactly as held by the item.  ``front_image_path``
    is copied only when the item contains it.  The document id is the item's
    ``url_object_id``.
    """
    doc = ArticleType()

    for field in ("title", "create_date", "url", "content"):
        setattr(doc, field, self[field])

    doc.meta.id = self["url_object_id"]
    doc.front_image_url = self["front_image_url"]
    if "front_image_path" in self:
        doc.front_image_path = self["front_image_path"]
    doc.tags = self["tags"]

    doc.suggest = gen_suggests(
        ArticleType._doc_type.index,
        ((doc.title, 10), (doc.tags, 7)),
    )
    doc.save()
def process_item(self, item, spider):
    """Convert ``item`` into the ES data format, save it, and return the item.

    :param item: mapping-like item whose fields are copied onto the document.
    :param spider: not used by this method.
    :return: the same ``item`` that was passed in.
    """
    # Create a fresh ES document and use url_object_id as its id.
    doc = ArticleType()
    doc.meta.id = item['url_object_id']

    for field in (
        'url', 'title', 'article_type', 'data_source',
        'publish_time', 'abstract',
    ):
        setattr(doc, field, item[field])

    # Fall back to the placeholder string when the item's tags are falsy.
    doc.tags = item['tags'] or '无'

    # Hotness score formula.
    comments = item['comment_num']
    praises = item['praise_num']
    collections = item['collection_num']
    reads = item['read_num']
    doc.hot_score = 8 * comments + 3 * praises + 5 * collections + reads

    # The tuples passed in must be ordered by weight, highest first.
    weighted_fields = (
        (doc.title, 10),
        (doc.article_type, 5),
        (doc.tags, 3),
    )
    doc.suggest = self.gen_suggests(ArticleType._doc_type.index, weighted_fields)

    # save() stores the document directly in ES.
    doc.save()
    return item
def save_to_es(self):
    """Build an ``ArticleType`` document from this item and save it.

    ``content`` is passed through ``remove_tags``.  ``front_image_path`` is
    set to the item's value when present and to ``None`` otherwise.  The
    document id is the item's ``url_object_id``.  After saving, the
    ``jobbole_count`` counter is incremented through ``redis_cli``.
    """
    article = ArticleType()
    article.title = self['title']
    article.create_date = self['create_date']
    article.content = remove_tags(self['content'])
    article.front_image_url = self['front_image_url']
    article.front_image_path = (
        self['front_image_path'] if "front_image_path" in self else None
    )

    # Fix: fav_nums and comment_nums were both copied from 'praise_nums',
    # so all three counters stored the same value.  Each counter now reads
    # its own field.
    article.praise_nums = self['praise_nums']
    article.fav_nums = self['fav_nums']
    article.comment_nums = self['comment_nums']

    article.tags = self['tags']
    article.url = self['url']
    article.meta.id = self["url_object_id"]
    # This block's gen_suggest receives the ArticleType class itself rather
    # than an index name; kept as in the original.
    article.suggest = gen_suggest(
        ArticleType,
        ((article.title, 10), (article.tags, 7)),
    )
    article.save()
    redis_cli.incr("jobbole_count")