コード例 #1
0
ファイル: items.py プロジェクト: ASKTIME/lagouspiders
    def save_to_es(self):
        """Copy this item's fields onto a new ``ArticleType`` and save it.

        ``publish_time`` is only set on the document when the item holds a
        truthy value for it. The ``suggest`` field is produced by
        ``gen_suggests`` from the document's title and tags, paired with the
        values 10 and 7 (presumably weights). After the document is saved the
        ``lagou_count`` counter is incremented through ``redis_cli``.
        """
        doc = ArticleType()

        for field in ('url', 'url_object_id', 'title', 'salary', 'job_city',
                      'work_years', 'degree_need', 'job_type'):
            setattr(doc, field, self[field])

        publish_time = self['publish_time']
        if publish_time:
            doc.publish_time = publish_time

        for field in ('tags', 'job_advantage', 'job_desc', 'job_addr',
                      'company_url', 'company_name', 'crawl_time'):
            setattr(doc, field, self[field])
        # crawl_update_time is not copied to the document.

        index = ArticleType._doc_type.index
        doc.suggest = gen_suggests(index, ((doc.title, 10), (doc.tags, 7)))

        doc.save()
        redis_cli.incr("lagou_count")
コード例 #2
0
ファイル: items.py プロジェクト: tomdev2008/ArticleSpider
    def save_to_es(self):
        """Build an ``ArticleType`` document from this item and save it.

        The content is passed through ``remove_tags`` before being stored,
        ``front_image_path`` is copied only when the item contains it, and the
        document id is the item's ``url_object_id``. ``suggest`` comes from
        ``gen_suggests`` applied to the title and tags. After saving, the
        ``jobble_count`` counter is incremented through ``redis_cli``.
        """
        doc = ArticleType()

        for field in ("title", "create_date"):
            setattr(doc, field, self[field])
        doc.content = remove_tags(self["content"])
        doc.front_image_url = self["front_image_url"]
        if "front_image_path" in self:
            doc.front_image_path = self["front_image_path"]
        for field in ("praise_nums", "fav_nums", "comment_nums", "url",
                      "tags"):
            setattr(doc, field, self[field])
        doc.meta.id = self["url_object_id"]

        # gen_suggests receives (text, number) pairs; the numbers are
        # presumably suggestion weights.
        index = ArticleType._doc_type.index
        doc.suggest = gen_suggests(index, ((doc.title, 10), (doc.tags, 7)))

        doc.save()
        redis_cli.incr("jobble_count")
コード例 #3
0
ファイル: items.py プロジェクト: NeilHUI/SoftwareWebSpider
    def save_to_es(self):
        """Build an ``ArticleType`` document from this item and save it.

        ``front_image_path`` and ``download_urls`` are optional and copied
        only when the item contains them. The content is passed through
        ``remove_tags``; the document id is the item's ``url_object_id``; and
        ``suggest`` comes from ``gen_suggestions`` applied to the title and
        tag.
        """
        doc = ArticleType()

        for field in ("title", "url", "front_image_url"):
            setattr(doc, field, self[field])
        if "front_image_path" in self:
            doc.front_image_path = self["front_image_path"]

        for field in ("type", "size", "update_time"):
            setattr(doc, field, self[field])
        doc.content = remove_tags(self["content"])
        for field in ("tag", "fav_nums"):
            setattr(doc, field, self[field])
        if "download_urls" in self:
            doc.download_urls = self["download_urls"]
        doc.meta.id = self["url_object_id"]

        index = ArticleType._doc_type.index
        doc.suggest = gen_suggestions(index, ((doc.title, 10), (doc.tag, 7)))

        doc.save()
        # No redis counter ("lcsoft_count") is incremented here.
コード例 #4
0
    def save_to_es(self):
        """Build an ``ArticleType`` document from this item and save it.

        The content is passed through ``remove_tags``, ``front_image_path`` is
        copied only when present, and the document id is the item's
        ``url_object_id``. ``suggest`` comes from ``gen_suggests`` applied to
        the title and tags. After saving, the ``jobbole_count`` counter is
        incremented through the module-level ``redis_cli``.
        """
        doc = ArticleType()

        for field in ('title', 'create_date'):
            setattr(doc, field, self[field])
        doc.content = remove_tags(self['content'])
        doc.front_image_url = self['front_image_url']
        if 'front_image_path' in self:
            doc.front_image_path = self['front_image_path']
        for field in ('praise_nums', 'fav_nums', 'comment_nums', 'url',
                      'tags'):
            setattr(doc, field, self[field])
        doc.meta.id = self['url_object_id']

        index = ArticleType._doc_type.index
        doc.suggest = gen_suggests(index, ((doc.title, 10), (doc.tags, 7)))

        # Persist the document; this method is meant to be called from the
        # pipelines.
        doc.save()
        # redis_cli is a global; bump the saved-item counter by one.
        redis_cli.incr("jobbole_count")
コード例 #5
0
ファイル: items.py プロジェクト: bmei1/medium.com_articles
    def save_to_es(self):
        """Transfer this item's data onto an ``ArticleType`` and save it.

        The document id is the item's ``url_object_id`` and the content is
        passed through ``remove_tags`` before being stored. No ``suggest``
        field is populated by this method.
        """
        doc = ArticleType()

        for field in ('title', 'create_date', 'url'):
            setattr(doc, field, self[field])
        doc.meta.id = self['url_object_id']
        for field in ('author', 'author_description', 'applause'):
            setattr(doc, field, self[field])
        doc.content = remove_tags(self['content'])

        doc.save()
コード例 #6
0
 def save_to_es(self):
     """Build an ``ArticleType`` document from this item and save it.

     The ``context`` field is passed through ``remove_tags`` and then printed
     to stdout. ``suggest`` comes from ``gen_suggests`` applied to the title
     and the stripped context.
     """
     doc = ArticleType()

     for field in ('title', 'times', 'redianzang', 'shoucang', 'pinglun'):
         setattr(doc, field, self[field])
     doc.context = remove_tags(self['context'])
     print(doc.context)
     doc.url = self['url']

     # Suggestion entries are stored in es as [{"input": [], "weight": 2}].
     index = ArticleType._doc_type.index
     doc.suggest = gen_suggests(index, ((doc.title, 10), (doc.context, 7)))

     doc.save()
コード例 #7
0
ファイル: items.py プロジェクト: depchen/spider
 def save_to_es(self):
     """Convert this item into an ``ArticleType`` es document and save it.

     The content is passed through ``remove_tags``, ``front_image_path`` is
     copied only when present, and the document id is the item's
     ``url_object_id``. ``suggest`` comes from ``gen_suggests`` applied to the
     title and tags. After saving, the ``jobbole_count`` counter is
     incremented through ``redis_cli``.
     """
     doc = ArticleType()

     for field in ('title', 'create_time'):
         setattr(doc, field, self[field])
     doc.content = remove_tags(self['content'])
     doc.front_image_url = self['front_image_url']
     if "front_image_path" in self:
         doc.front_image_path = self['front_image_path']
     for field in ('praise_num', 'fav_num', 'comment_num', 'url', 'tags'):
         setattr(doc, field, self[field])
     doc.meta.id = self['url_object_id']

     index = ArticleType._doc_type.index
     doc.suggest = gen_suggests(index, ((doc.title, 10), (doc.tags, 7)))

     doc.save()
     redis_cli.incr("jobbole_count")
コード例 #8
0
ファイル: items.py プロジェクト: Erick-LONG/ArticleSpider
    def save_to_es(self):
        """Build an ``ArticleType`` document from this item and save it.

        The content is passed through ``remove_tags``, ``front_image_path`` is
        copied only when present, and the document id is the item's
        ``url_object_id``. ``suggest`` comes from ``gen_suggests`` applied to
        the title and tags.
        """
        doc = ArticleType()

        for field in ('title', 'create_date'):
            setattr(doc, field, self[field])
        doc.content = remove_tags(self['content'])
        if 'front_image_path' in self:
            doc.front_image_path = self['front_image_path']
        for field in ('front_image_url', 'praise_nums', 'fav_nums',
                      'comment_nums', 'url', 'tags'):
            setattr(doc, field, self[field])
        doc.meta.id = self['url_object_id']

        index = ArticleType._doc_type.index
        doc.suggest = gen_suggests(index, ((doc.title, 10), (doc.tags, 7)))

        doc.save()
コード例 #9
0
ファイル: items.py プロジェクト: evanxu123/search_engine
 def save_to_es(self):
     """Build an ``ArticleType`` document from this item and save it.

     The content is passed through ``remove_tags``, ``front_image_path`` is
     copied only when present, and the document id is the item's
     ``url_object_id``. ``suggest`` comes from ``gen_suggests`` applied to the
     title and tags.
     """
     doc = ArticleType()

     for field in ("title", "create_date"):
         setattr(doc, field, self[field])
     doc.content = remove_tags(self["content"])
     doc.front_image_url = self["front_image_url"]
     if "front_image_path" in self:
         doc.front_image_path = self["front_image_path"]
     for field in ("praise_nums", "comment_nums", "fav_nums", "url", "tags"):
         setattr(doc, field, self[field])
     doc.meta.id = self["url_object_id"]

     index = ArticleType._doc_type.index
     doc.suggest = gen_suggests(index, ((doc.title, 10), (doc.tags, 7)))

     doc.save()
コード例 #10
0
ファイル: pipelines.py プロジェクト: clefairylin/scrapyTest
    def process_item(self, item, spider):
        """Save ``item`` as an ``ArticleType`` document and return the item.

        ``front_image_path`` is read with ``item.get`` and is therefore
        ``None`` when the item lacks it. The content is passed through
        ``remove_tags`` and the document id is the item's ``url_object_id``.
        ``suggest`` comes from ``self.get_suggest`` applied to the title and
        tag. ``spider`` is not used.
        """
        doc = ArticleType()

        for field in ("title", "url"):
            setattr(doc, field, item[field])
        doc.front_image_path = item.get("front_image_path")
        for field in ("front_image_url", "create_date", "praise_nums",
                      "fav_nums", "comment_nums", "tag"):
            setattr(doc, field, item[field])
        doc.content = remove_tags(item["content"])
        doc.meta.id = item["url_object_id"]

        index = ArticleType._doc_type.index
        doc.suggest = self.get_suggest(index, ((doc.title, 10), (doc.tag, 7)))

        doc.save()
        return item
コード例 #11
0
    def save2elastic(self):
        """Convert this item into an ``ArticleType`` es document and save it.

        ``front_image_path`` is copied only when present, the content is
        passed through ``remove_tags``, and the document id is the item's
        ``url_object_id`` (also stored as a regular field). ``suggest`` comes
        from ``gen_suggests`` applied to the title and tags.
        """
        doc = ArticleType()

        for field in ("title", "url_object_id", "url", "front_image_url"):
            setattr(doc, field, self[field])
        if "front_image_path" in self:
            doc.front_image_path = self["front_image_path"]
        for field in ("creat_date", "praise_num", "collect_num",
                      "comment_num"):
            setattr(doc, field, self[field])
        doc.content = remove_tags(self["content"])
        doc.tags = self["tags"]
        doc.meta.id = self["url_object_id"]

        index = ArticleType._doc_type.index
        doc.suggest = gen_suggests(index, ((doc.title, 10), (doc.tags, 7)))

        doc.save()
コード例 #12
0
    def save_to_es(self):
        """Build an ``ArticleType`` document from this item and save it.

        ``front_image_url`` and ``front_image_path`` are each copied from the
        item field of the same name, and only when that field is present.
        The document id is the item's ``url_object_id`` and ``suggest`` comes
        from ``gen_suggests`` applied to the title and tags. The content is
        stored as-is. After saving, the ``jobbole_count`` counter is
        incremented through ``redis_cli``.
        """
        article = ArticleType()
        article.title = self['title']
        article.create_date = self['create_date']
        # Fix: front_image_url used to be filled from "front_image_path", so
        # the URL field held the local path and was skipped entirely when no
        # path was present.
        if "front_image_url" in self:
            article.front_image_url = self["front_image_url"]
        if "front_image_path" in self:
            article.front_image_path = self["front_image_path"]
        article.praise_nums = self["praise_nums"]
        article.fav_nums = self["fav_nums"]
        article.comment_nums = self["comment_nums"]
        article.url = self["url"]
        article.tags = self["tags"]
        article.meta.id = self["url_object_id"]
        article.suggest = gen_suggests(ArticleType._doc_type.index,
                                       ((article.title, 10),
                                        (article.tags, 7)))
        article.content = self["content"]
        article.save()
        # Record the number of inserted items in redis.
        redis_cli.incr("jobbole_count")
コード例 #13
0
    def save_to_es(self):
        """Convert this item into an ``ArticleType`` es document and save it.

        The content is stored as-is, ``front_image_path`` is copied only when
        present, and the document id is the item's ``url_object_id``.
        ``suggest`` comes from ``gen_suggests`` applied to the title and tags.
        """
        doc = ArticleType()

        for field in ("title", "create_date", "url", "content"):
            setattr(doc, field, self[field])

        doc.meta.id = self["url_object_id"]
        doc.front_image_url = self["front_image_url"]
        if "front_image_path" in self:
            doc.front_image_path = self["front_image_path"]
        doc.tags = self["tags"]

        index = ArticleType._doc_type.index
        doc.suggest = gen_suggests(index, ((doc.title, 10), (doc.tags, 7)))

        doc.save()
コード例 #14
0
ファイル: pipelines.py プロジェクト: ogli324/search_engine
    def process_item(self, item, spider):
        """
        Convert ``item`` into the es data format, save it and return the item.

        :param item: the scraped item whose fields are copied
        :param spider: not used by this method
        :return: the same ``item`` that was passed in
        """
        # Start from a fresh es document whose id is the item's url_object_id.
        doc = ArticleType()
        doc.meta.id = item['url_object_id']

        for field in ('url', 'title', 'article_type', 'data_source',
                      'publish_time', 'abstract'):
            setattr(doc, field, item[field])
        # Fall back to a placeholder when the item's tags are empty/falsy.
        doc.tags = item['tags'] or '无'

        # Hotness score formula.
        doc.hot_score = (8 * item['comment_num']
                         + 3 * item['praise_num']
                         + 5 * item['collection_num']
                         + item['read_num'])

        # The tuples passed in must be ordered from highest weight to lowest.
        index = ArticleType._doc_type.index
        doc.suggest = self.gen_suggests(index, ((doc.title, 10),
                                                (doc.article_type, 5),
                                                (doc.tags, 3)))

        # save() stores the document directly into es.
        doc.save()

        return item
コード例 #15
0
ファイル: items.py プロジェクト: abcd567/spiders
    def save_to_es(self):
        """Build an ``ArticleType`` document from this item and save it.

        The content is passed through ``remove_tags``; ``front_image_path`` is
        ``None`` when the item lacks it; the document id is the item's
        ``url_object_id``. ``suggest`` comes from ``gen_suggest`` applied to
        the title and tags. After saving, the ``jobbole_count`` counter is
        incremented through ``redis_cli``.
        """
        article = ArticleType()
        article.title = self['title']
        article.create_date = self['create_date']
        article.content = remove_tags(self['content'])
        article.front_image_url = self['front_image_url']
        article.front_image_path = (
            self['front_image_path'] if "front_image_path" in self else None)
        article.praise_nums = self['praise_nums']
        # Fix: fav_nums and comment_nums were both copied from 'praise_nums',
        # so every document stored the praise count three times.
        article.fav_nums = self['fav_nums']
        article.comment_nums = self['comment_nums']
        article.tags = self['tags']
        article.url = self['url']
        article.meta.id = self["url_object_id"]

        # NOTE: this helper takes the document class itself as its first
        # argument.
        article.suggest = gen_suggest(ArticleType,
                                      ((article.title, 10), (article.tags, 7)))

        article.save()

        redis_cli.incr("jobbole_count")