Beispiel #1
0
    def get_insert_sql(self):
        """Build the upsert statement and its parameters for ``zhihu_question``.

        Every item field is read as a list here, so each one is reduced to a
        scalar (first element or joined string) before being bound.

        :return: ``(insert_sql, params)``; ``params`` follows the column order
            of the INSERT clause.
        """
        insert_sql = """
            insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num, comments_num,
              watch_user_num, click_num, crawl_time
              )
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE content=VALUES(content), answer_num=VALUES(answer_num), comments_num=VALUES(comments_num),
              watch_user_num=VALUES(watch_user_num), click_num=VALUES(click_num)
        """
        zhihu_id = self["zhihu_id"][0]
        topics = ",".join(self["topics"])
        url = self["url"][0]
        title = "".join(self["title"])
        content = "".join(self["content"])
        # extract_num is a project helper; presumably it pulls the integer out
        # of the joined text -- verify against its definition.
        answer_num = extract_num("".join(self["answer_num"]))
        comments_num = extract_num("".join(self["comments_num"]))

        # int() rejects grouped digits such as "3,110", so thousands
        # separators are removed before converting.
        counts = self["watch_user_num"]
        watch_user_num = int(str(counts[0]).replace(",", ""))
        # The second entry is used as the click count only when exactly two
        # values are present; otherwise clicks default to 0.
        if len(counts) == 2:
            click_num = int(str(counts[1]).replace(",", ""))
        else:
            click_num = 0

        crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)

        params = (zhihu_id, topics, url, title, content, answer_num, comments_num,
                  watch_user_num, click_num, crawl_time)

        return insert_sql, params
Beispiel #2
0
    def get_insert_sql(self):
        """Build the upsert statement and its parameters for ``zhihu_question``.

        Item fields are lists; each is reduced to a scalar before binding.

        :return: ``(insert_sql, params)``; ``params`` follows the column order
            of the INSERT clause.
        """
        insert_sql = """
                    INSERT INTO
                      zhihu_question (zhihu_id, topics, url, title, content, answer_num, comments_num,
                        watch_user_num, click_num, crawl_time)
                    VALUES
                      (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    ON DUPLICATE KEY UPDATE
                      content=VALUES(content), answer_num=VALUES(answer_num),
                      comments_num=VALUES(comments_num), watch_user_num=VALUES(watch_user_num),
                      click_num=VALUES(click_num)
                """

        # Alternative way of flattening the list-valued fields.
        zhihu_id = self["zhihu_id"][0]
        topics = ",".join(self["topics"])
        url = self["url"][0]
        title = "".join(self["title"])
        content = "".join(self["content"])
        answer_num = extract_num("".join(self["answer_num"]))
        comments_num = extract_num("".join(self["comments_num"]))

        counts = self["watch_user_num"]
        watch_user_num = extract_num(counts[0])
        # The click count is the second scraped value; when only one value was
        # scraped, default to 0 instead of raising IndexError.
        click_num = extract_num(counts[1]) if len(counts) > 1 else 0
        crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)

        params = (zhihu_id, topics, url, title, content, answer_num,
                  comments_num, watch_user_num, click_num, crawl_time)

        return insert_sql, params
Beispiel #3
0
    def get_insert_sql(self):
        """Build the upsert statement and its parameters for ``zhihu_question``.

        Item fields are lists; each is reduced to a scalar before binding. On a
        duplicate key the row's content, counters and crawl_time are refreshed.

        :return: ``(insert_sql, params)``; ``params`` follows the column order
            of the INSERT clause.
        """
        insert_sql = """
                    insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num, comments_num, 
                    watch_user_num, click_num, crawl_time) 
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    ON DUPLICATE KEY UPDATE content=VALUES(content), answer_num=VALUES(answer_num), comments_num=VALUES(comments_num),
                    watch_user_num=VALUES(watch_user_num), click_num=VALUES(click_num), crawl_time=VALUES(crawl_time)
        """

        # The first element is taken as-is; per the original author's note the
        # spider (zhihu.py) already converts the id, so no int() is applied.
        zhihu_id = self["zhihu_id"][0]
        topics = ",".join(self["topics"])
        url = self["url"][0]
        title = "".join(self["title"])
        content = "".join(self["content"])
        answer_num = extract_num("".join(self["answer_num"]))
        comments_num = extract_num("".join(self["comments_num"]))
        # strftime turns the timestamp into a formatted string.
        crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)

        # int() rejects grouped digits such as "3,110", so thousands
        # separators are removed before converting.
        counts = self["watch_user_num"]
        watch_user_num = int(str(counts[0]).replace(",", ""))
        # The second entry is the click count when exactly two values are
        # present; otherwise clicks default to 0.
        if len(counts) == 2:
            click_num = int(str(counts[1]).replace(",", ""))
        else:
            click_num = 0

        # Keep the same order as the column list in the INSERT clause.
        params = (zhihu_id, topics, url, title, content, answer_num,
                  comments_num, watch_user_num, click_num, crawl_time)

        return insert_sql, params
Beispiel #4
0
    def get_insert_sql(self):
        """Return the upsert SQL for ``zhihu_question`` and the values to bind.

        Item fields arrive as lists (per the original note, even those filled
        through ``add_value``), so they are collapsed to scalars. Commas are
        removed from the numeric fields before conversion.

        :return: ``(sql, values)`` with ``values`` in INSERT column order.
        """
        sql = """
            insert into zhihu_question(zhihu_id, topics, url, title, 
                                        content, answer_num, comments_num, watch_user_num, 
                                        click_num, crawl_time)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE content=VALUES(content), answer_num=VALUES(answer_num), comments_num=VALUES(comments_num),
              watch_user_num=VALUES(watch_user_num), click_num=VALUES(click_num)
        """

        def _joined_without_commas(fragments):
            # Zhihu renders numbers with commas; drop them before parsing.
            return "".join(piece.replace(",", "") for piece in fragments)

        row = [
            self["zhihu_id"][0],
            ",".join(self["topics"]),
            self["url"][0],
            "".join(self["title"]),
            "".join(self["content"]),
            extract_num(_joined_without_commas(self["answer_num"])),
            extract_num(_joined_without_commas(self["comments_num"])),
        ]

        followers = self["watch_user_num"]
        has_clicks = len(followers) == 2
        row.append(int(followers[0].replace(",", "")))
        row.append(int(followers[1].replace(",", "")) if has_clicks else 0)

        row.append(datetime.datetime.now().strftime(SQL_DATETIME_FORMAT))

        return sql, tuple(row)
    def save_to_es(self):
        """Convert this item into a ``ZhihuQuestionType`` document and save it.

        After a successful save the ``question_count`` counter in Redis is
        incremented.
        """
        question = ZhihuQuestionType()
        question.zhihu_id = self["zhihu_id"][0]
        question.topics = ",".join(self["topics"])
        question.url = self["url"][0]
        question.title = "".join(self["title"])
        # Fall back to the placeholder both when the field is absent and when
        # it is present but empty, so content is always assigned before it is
        # read for suggestions below.
        if "content" in self and self["content"]:
            question.content = "".join(self["content"])
        else:
            question.content = "No content"
        question.answer_num = extract_num("".join(self["answer_num"]))
        question.comments_num = extract_num("".join(self["comments_num"]))

        counts = self["watch_user_num"]
        question.watch_user_num = int(counts[0])
        # The second entry is the click count when exactly two values exist.
        question.click_num = int(counts[1]) if len(counts) == 2 else 0

        question.crawl_time = datetime.datetime.now().strftime(
            SQL_DATETIME_FORMAT)
        # Each (text, weight) pair feeds the suggester.
        question.suggest = get_suggests(ZhihuQuestionType._doc_type.index,
                                        ((question.title, 10),
                                         (question.topics, 7),
                                         (question.content, 5)))

        question.save()
        redis_cli.incr("question_count")
        return
Beispiel #6
0
    def get_insert_sql(self):
        """
        Build the upsert statement for the zhihu_question table.

        :return: ``(insert_sql, params)``; ``params`` follows the column order
            of the INSERT clause.
        """
        insert_sql = '''
        insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num, comments_num, 
            watch_user_num, click_num, crawl_time)
        VALUES (%s, %s,%s,%s,%s,%s,%s,%s,%s,%s) 
        ON DUPLICATE KEY UPDATE content=VALUES(content), answer_num=VALUES(answer_num),
        watch_user_num=VALUES(watch_user_num), click_num=VALUES(click_num)
        '''
        zhihu_id = self['zhihu_id'][0]
        topics = ','.join(self['topics'])
        url = self['url'][0]
        title = ''.join(self['title'])
        content = ''.join(self['content'])
        answer_num = extract_num(''.join(self['answer_num']))
        comments_num = extract_num(''.join(self['comments_num']))

        counts = self['watch_user_num']
        watch_user_num = extract_num(counts[0])
        # The click count is the second scraped value; when only one value was
        # scraped, default to 0 instead of raising IndexError.
        click_num = extract_num(counts[1]) if len(counts) > 1 else 0
        crawl_time = datetime.now().strftime(SQL_DATETIME_FORMAT)

        params = (zhihu_id, topics, url, title, content, answer_num,
                  comments_num, watch_user_num, click_num, crawl_time)
        return insert_sql, params
Beispiel #7
0
    def save_to_es(self):
        """Convert this item into a ``ZhihuQuestion`` document and save it.

        After a successful save the ``zhihu_question_count`` counter in Redis
        is incremented.
        """
        zhihu_question = ZhihuQuestion()
        zhihu_question.title = self['title']
        zhihu_question.content = self["content"]
        zhihu_question.url = self["url"]
        zhihu_question.topics = self["topics"]
        zhihu_question.question_id = self["zhihu_id"][0]
        zhihu_question.answer_num = extract_num("".join(self["answer_num"]))
        zhihu_question.comments_num = extract_num("".join(self["comments_num"]))

        counts = self["watch_user_num"]
        if len(counts) == 2:
            # As before, the comma-free strings are written back into the
            # item's list, in case later consumers read the cleaned values.
            counts[0] = counts[0].replace(",", "")
            counts[1] = counts[1].replace(",", "")
            zhihu_question.watch_user_num = int(counts[0])
            zhihu_question.click_num = int(counts[1])
        else:
            # Strip thousands separators here too; int() rejects "3,110".
            zhihu_question.watch_user_num = int(counts[0].replace(",", ""))
            zhihu_question.click_num = 0

        # Suggestions must be generated after title and topics are assigned;
        # previously they were built from the still-unset fields.
        zhihu_question.title_suggest = gen_suggests(ZhihuQuestion._doc_type.index,
                                                    ((zhihu_question.title, 10), (zhihu_question.topics, 7)))

        zhihu_question.meta.id = self["zhihu_id"][0]

        zhihu_question.save()
        redis_cli.incr("zhihu_question_count")
        return
Beispiel #8
0
    def save_to_es(self):
        """Index this question as a ``ZhihuQuestionType`` document.

        Follower/click counts, topics and the answer count are parsed on a
        best-effort basis: when one of them cannot be read or parsed it falls
        back to ``0`` (``""`` for topics) instead of aborting. The ``zhihu_count``
        counter in Redis is incremented before the document is saved.
        """
        # Only the count parsing belongs in this try block. A failure while
        # joining topics used to live here too and wrongly reset both counts.
        try:
            counts = self["watch_user_num"]
            watch_user_num = int(counts[0])
            # The second entry is the click count when exactly two exist.
            click_num = int(counts[1]) if len(counts) == 2 else 0
        except Exception:
            watch_user_num = 0
            click_num = 0

        try:
            topics = ",".join(self["topics"])
        except Exception:
            topics = ""

        # `except Exception` rather than a bare except, so KeyboardInterrupt
        # and SystemExit still propagate.
        try:
            answer_num = extract_num("".join(self["answer_num"]))
        except Exception:
            answer_num = 0

        zhihu_id = self["zhihu_id"][0]
        url = self["url"][0]
        title = "".join(self["title"])
        content = "".join(self["content"])
        comments_num = extract_num("".join(self["comments_num"]))

        crawl_time = datetime.datetime.now().strftime(SQL_DATE_FORMAT)
        article = ZhihuQuestionType()
        article.meta.id = zhihu_id
        article.topics = topics
        article.url = url
        article.title = title
        article.content = content
        article.answer_num = answer_num
        article.comments_num = comments_num
        article.watch_user_num = watch_user_num
        article.click_num = click_num
        article.crawl_time = crawl_time

        # Each (text, weight) pair feeds the suggester.
        article.suggest = gen_suggest(ZhihuQuestionType._doc_type.index, ((article.title, 10), (article.content, 9)))

        redis_cli.incr("zhihu_count")
        article.save()
        return
Beispiel #9
0
    def get_insert_sql(self):
        """Build the upsert statement and its parameters for ``zhihu_question``.

        Item fields are lists; each is reduced to a scalar before binding.
        ``create_time``, ``update_time`` and ``crawl_update_time`` are bound as
        ``None``.

        :return: ``(insert_sql, params)``; ``params`` follows the column order
            of the INSERT clause.
        """
        insert_sql = """
            insert into zhihu_question(zhihu_id, topics, url, title, content, create_time, update_time, answer_num, comments_num, watch_user_num, click_num, crawl_time, crawl_update_time)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE content=VALUES(content), answer_num=VALUES(answer_num), comments_num=VALUES(comments_num),
              watch_user_num=VALUES(watch_user_num), click_num=VALUES(click_num)            
        """
        # Flatten the list-valued fields.
        # Per the original note, zhihu_id is already converted in zhihu.py.
        zhihu_id = self["zhihu_id"][0]
        topics = ",".join(self["topics"])
        url = self["url"][0]
        title = "".join(self["title"])
        content = "".join(self["content"])
        # extract_num comes from the project's utils (common.py) and is used
        # to pull the number out of the text.
        answer_num = extract_num("".join(self["answer_num"]))
        comments_num = extract_num("".join(self["comments_num"]))

        # Remove thousands separators (e.g. ['3,110', '824,551']) regardless
        # of how many values were scraped; previously the single-value case
        # skipped this step and int() failed on "3,110". The cleaned list is
        # stored back on the item, as before.
        self["watch_user_num"] = [
            x.replace(',', '') for x in self["watch_user_num"]
        ]
        counts = self["watch_user_num"]
        watch_user_num = int(counts[0])
        # The second entry is the click count when exactly two values exist.
        click_num = int(counts[1]) if len(counts) == 2 else 0

        # strftime converts the timestamp to a string; the format is
        # configured via SQL_DATETIME_FORMAT (settings.py per original note).
        crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)

        # Placeholders so that the date columns can be inserted.
        create_time = None
        update_time = None
        crawl_update_time = None

        params = (zhihu_id, topics, url, title, content, create_time,
                  update_time, answer_num, comments_num, watch_user_num,
                  click_num, crawl_time, crawl_update_time)

        return insert_sql, params
Beispiel #10
0
 def save_to_es(self):
     """Store this Zhihu question as an ``ArticleQuestionType`` document.

     The document id is prefixed with ``zhihu-``, markup is removed from the
     content via ``remove_tags``, and only the title feeds the suggester.
     """
     doc = ArticleQuestionType()

     raw_id = self["zhihu_id"][0]
     doc.question_id = "zhihu-" + str(raw_id)

     doc.title = "".join(self["title"])

     joined_content = "".join(self["content"])
     doc.content = remove_tags(joined_content)

     # Commas are dropped from the joined text before the number is parsed.
     answers_text = "".join(self["answer_num"]).replace(',', '')
     doc.answer_num = extract_num(answers_text)

     weighted_fields = ((doc.title, 10), )
     doc.suggest = gen_suggests(ArticleQuestionType._doc_type.index,
                                weighted_fields)

     doc.source = "Zhihu"
     doc.url = self["url"][0]
     doc.save()
Beispiel #11
0
    def get_insert_sql(self):
        """Build the upsert statement and its parameters for ``zhihu_question``.

        :return: ``(insert_sql, params)``; ``params`` follows the column order
            of the INSERT clause.
        """
        insert_sql = '''
                    insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num, comments_num,
                    watch_user_num, follower_num, crawl_time)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    ON DUPLICATE KEY UPDATE content=VALUES(content), answer_num=VALUES(answer_num), comments_num=VALUES(comments_num),
                    watch_user_num=VALUES(watch_user_num), follower_num=VALUES(follower_num)
                '''
        zhihu_id = self['zhihu_id'][0]
        topics = ','.join(self['topics'])
        # The list holds a single entry, so joining equals self['url'][0].
        url = ''.join(self['url'])
        title = self['title'][0]

        # Prefer the second content fragment when it exists and is non-empty;
        # otherwise use the first. The length check avoids an IndexError when
        # only one fragment was scraped.
        content_parts = self['content']
        if len(content_parts) > 1 and content_parts[1]:
            content = content_parts[1]
        else:
            content = content_parts[0]

        answer_num = extract_num(self['answer_num'][0])
        comments_num = extract_num(self['comments_num'][0])
        # NOTE(review): both counts are bound as scraped, without numeric
        # conversion, and two entries are required -- confirm with the spider.
        follower_num = self['watch_user_num'][0]
        watch_user_num = self['watch_user_num'][1]
        crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)

        params = (zhihu_id, topics, url, title, content, answer_num, comments_num,
                    watch_user_num, follower_num, crawl_time)
        return insert_sql, params
Beispiel #12
0
    def get_insert_sql(self):
        """Build the upsert statement and its parameters for ``zhihu_question``.

        Topics, the answer count and the follower/click counts are parsed on a
        best-effort basis: when one cannot be read or parsed it falls back to
        ``""`` / ``0`` instead of aborting.

        :return: ``(insert_sql, params)``; ``params`` follows the column order
            of the INSERT clause.
        """
        insert_sql = """
            insert into zhihu_question ( `comments_num`, `answer_num`, `click_num`, `zhihu_id`, `watch_user_num`, `url`, `title`, `topics`, `content`) 
            values ( %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE content=VALUES(content), answer_num=VALUES(answer_num), comments_num=VALUES(comments_num),
              watch_user_num=VALUES(watch_user_num), click_num=VALUES(click_num)
        """
        zhihu_id = self["zhihu_id"][0]
        url = self["url"][0]
        title = "".join(self["title"])
        content = "".join(self["content"])
        comments_num = extract_num("".join(self["comments_num"]))

        # `except Exception` rather than a bare except, so KeyboardInterrupt
        # and SystemExit are not swallowed by the fallbacks below.
        try:
            topics = ",".join(self["topics"])
        except Exception:
            topics = ""

        try:
            answer_num = extract_num("".join(self["answer_num"]))
        except Exception:
            answer_num = 0

        try:
            counts = self["watch_user_num"]
            watch_user_num = int(counts[0])
            # The second entry is the click count when exactly two exist.
            click_num = int(counts[1]) if len(counts) == 2 else 0
        except Exception:
            click_num = 0
            watch_user_num = 0

        params = (comments_num, answer_num, click_num, zhihu_id, watch_user_num, url, title, topics, content)

        return insert_sql, params
Beispiel #13
0
    def get_insert_sql(self):
        """Build the upsert statement and its parameters for ``zhihu_question``.

        Content and the answer count fall back to a placeholder / ``0`` when
        they cannot be read or parsed.

        :return: ``(insert_sql, params)``; ``params`` follows the column order
            of the INSERT clause.
        """
        insert_sql = """
            insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num, comments_num,
              watch_user_num, click_num, crawl_time
              ) 
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE content=VALUES(content), answer_num=VALUES(answer_num), comments_num=VALUES(comments_num),
              watch_user_num=VALUES(watch_user_num), click_num=VALUES(click_num)
        """

        zhihu_id = self["zhihu_id"][0]
        topics = ",".join(self["topics"])
        url = self["url"][0]
        title = "".join(self["title"])
        # Catch Exception, not BaseException: the fallbacks must not swallow
        # KeyboardInterrupt or SystemExit.
        try:
            content = "".join(self["content"])
        except Exception:
            content = "无"
        try:
            answer_num = extract_num("".join(self["answer_num"]))
        except Exception:
            answer_num = 0
        comments_num = extract_num("".join(self["comments_num"]))

        # int() raises "invalid literal for int() with base 10" when the text
        # contains commas, so they are removed first.
        counts = self["watch_user_num"]
        watch_user_num = int(counts[0].replace(",", ""))
        # The second entry is the click count when exactly two values exist.
        click_num = int(counts[1].replace(",", "")) if len(counts) == 2 else 0

        crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)

        params = (zhihu_id, topics, url, title, content, answer_num, comments_num,
                  watch_user_num, click_num, crawl_time)

        return insert_sql, params
Beispiel #14
0
    def save_to_es(self):
        """Persist this item as a ``ZhihuQuestionType`` document.

        Optional fields (topics, title, content, answer/comment counts) default
        to an empty string when missing. After saving, the
        ``zhihuquestion_count`` counter in Redis is incremented.
        """
        def _count_from(field_name):
            # Zhihu puts commas inside numbers; drop them before parsing.
            fragments = self.get(field_name, [""])
            return extract_num("".join(
                fragment.replace(",", "") for fragment in fragments))

        doc = ZhihuQuestionType()

        first_id = self['zhihu_id'][0]
        doc.meta.id = first_id
        doc.zhihu_id = self['zhihu_id'][0]
        doc.topics = ",".join(self.get('topics', [""]))
        doc.url = self['url'][0]
        doc.title = "".join(self.get("title", [""]))
        raw_content = "".join(self.get("content", [""]))
        doc.content = remove_tags(raw_content)

        answers = _count_from('answer_num')
        comments = _count_from('comments_num')

        followers = self["watch_user_num"]
        two_values = len(followers) == 2
        watchers = int(followers[0].replace(",", ""))
        clicks = int(followers[1].replace(",", "")) if two_values else 0

        doc.answer_num = answers
        doc.comments_num = comments
        doc.watch_user_num = watchers
        doc.click_num = clicks
        doc.crawl_time = datetime.datetime.now().date()

        weighted_fields = (
            (doc.title, 10),
            (doc.topics, 7),
            (doc.content, 5),
        )
        doc.suggest = gen_suggests(ZhihuQuestionType._index._name,
                                   weighted_fields)

        doc.save()
        # Track the number of crawled documents in Redis.
        redis_cli.incr("zhihuquestion_count")
        return
Beispiel #15
0
    def get_insert_sql(self):
        """Build the upsert statement and its parameters for ``zhihu_question``.

        :return: ``(insert_sql, items)``; ``items`` follows the column order of
            the INSERT clause.
        """
        insert_sql = """
            insert into zhihu_question(zhihu_id, tag, url, title, main_content, click_num, focus_num,
                        comment_num, answer_num, crawl_time)
            VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE answer_num=VALUES(answer_num),comment_num=VALUES(comment_num),
                                    click_num=VALUES(click_num),focus_num=VALUES(focus_num)           
        """
        zhihu_id = int("".join(self["zhihu_id"]))
        tag = ",".join(self["tag"])
        title = "".join(self["title"])
        main_content = "".join(self["main_content"])
        focus_num = extract_num("".join(self["focus_num"]))
        click_num = extract_num("".join(self["click_num"]))
        comment_num = extract_num("".join(self["comment_num"]))
        answer_num = extract_num("".join(self["answer_num"]))
        url = self["url"][0]
        crawl_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Placeholders are positional, so this tuple must mirror the column
        # list exactly: ..., click_num, focus_num, comment_num, answer_num, ...
        # (the previous order bound comment/answer/focus to the wrong columns).
        items = (zhihu_id, tag, url, title, main_content, click_num,
                 focus_num, comment_num, answer_num, crawl_time)

        return insert_sql, items
Beispiel #16
0
    def get_insert_sql(self):
        """Return the upsert SQL for ``lagou_job`` and the values to bind.

        On a duplicate key only ``job_desc`` is refreshed. ``job_id`` is
        obtained by passing the job URL to ``extract_num``.

        :return: ``(sql, values)`` with ``values`` in INSERT column order.
        """
        sql = """
            insert into lagou_job(title, url, salary, job_city, work_years, degree_need,
            job_type, publish_time, job_advantage, job_desc, job_addr, company_url, company_name, job_id,crawl_time)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,%s) ON DUPLICATE KEY UPDATE job_desc=VALUES(job_desc)
        """
        now_text = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)
        numeric_id = extract_num(self["url"])

        # Item fields copied verbatim, in the order of the column list.
        copied_fields = (
            "title", "url", "salary", "job_city", "work_years", "degree_need",
            "job_type", "publish_time", "job_advantage", "job_desc",
            "job_addr", "company_url", "company_name",
        )
        values = tuple(self[name] for name in copied_fields)
        values += (numeric_id, now_text)

        return sql, values
Beispiel #17
0
    def get_insert_sql(self):
        """Return the upsert SQL for ``zhihu_question`` and the values to bind.

        This variant stores no click count; the follower count is parsed from
        the joined ``watch_user_num`` fragments.

        :return: ``(sql, values)`` with ``values`` in INSERT column order.
        """
        sql = """
                           insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num,comments_num,
                              watch_user_num,crawl_time)
                           VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)
                           ON DUPLICATE KEY UPDATE content=VALUES (content),answer_num=VALUES (answer_num),comments_num=VALUES (comments_num),
                              watch_user_num=VALUES (watch_user_num)
                          
                       """

        def _number_in(field_name):
            # Join the scraped fragments, then let extract_num parse them.
            return extract_num(''.join(self[field_name]))

        question_id = self["zhihu_id"][0]
        topic_csv = ','.join(self["topics"])
        link = self["url"][0]
        heading = ''.join(self["title"])
        body = ''.join(self["content"])
        answers = _number_in("answer_num")
        comments = _number_in('comments_num')
        followers = _number_in('watch_user_num')
        stamp = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)

        values = (question_id, topic_csv, link, heading, body, answers,
                  comments, followers, stamp)
        return sql, values
Beispiel #18
0
    def get_insert_sql(self):
        """Return the upsert SQL for ``lagou_job`` and the values to bind.

        On a duplicate key only ``job_desc`` is refreshed. ``job_id`` is
        obtained by passing the job URL to ``extract_num``.

        :return: ``(sql, values)`` with ``values`` in INSERT column order.
        """
        sql = """
            insert into lagou_job(title, url,url_object_id, salary, job_city, work_years, degree_need,
            job_type, publish_time, job_advantage, job_desc, job_addr, company_url, company_name, job_id)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE job_desc=VALUES(job_desc)
        """

        numeric_id = extract_num(self["url"])

        # Item fields copied verbatim, in the order of the column list.
        copied_fields = (
            "title", "url", "url_object_id", "salary", "job_city",
            "work_years", "degree_need", "job_type", "publish_time",
            "job_advantage", "job_desc", "job_addr", "company_url",
            "company_name",
        )
        values = tuple(self[name] for name in copied_fields) + (numeric_id,)

        return sql, values
Beispiel #19
0
    def get_insert_sql(self):
        """Return the upsert SQL for ``lagou_jobs`` and the values to bind.

        On a duplicate key only ``job_desc`` is refreshed. The ``id`` column is
        obtained by passing the job URL to ``extract_num``.

        :return: ``(sql, values)`` with ``values`` in INSERT column order.
        """
        sql = """
            insert into lagou_jobs(title, url, url_object_id, salary, job_city, work_years, degree_need, job_type, publish_time, job_advantage, job_desc, job_address, company_url, company_name, id, crawl_time, crawl_update_time, tags)
            values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            on duplicate key update job_desc=values(job_desc)
        """

        # Named job_id locally so the builtin id() is not shadowed.
        job_id = extract_num(self["url"])

        # The id column sits between these two groups in the column list.
        before_id = (
            'title', 'url', 'url_object_id', 'salary', 'job_city',
            'work_years', 'degree_need', 'job_type', 'publish_time',
            'job_advantage', 'job_desc', 'job_address', 'company_url',
            'company_name',
        )
        after_id = ('crawl_time', 'crawl_update_time', 'tags')

        values = tuple(self[name] for name in before_id)
        values += (job_id,)
        values += tuple(self[name] for name in after_id)

        return sql, values
Beispiel #20
0
    def get_insert_sql(self):
        """Return the upsert SQL for ``lagou_job`` and the values to bind.

        On a duplicate key most descriptive columns are refreshed. ``job_id``
        is obtained by passing the job URL to ``extract_num``.

        :return: ``(sql, values)`` with ``values`` in INSERT column order.
        """
        # NOTE(review): the UPDATE clause references VALUES(tags) and
        # VALUES(crawl_time), but neither column is in the INSERT column list,
        # so no value is bound for them -- confirm against the table schema
        # whether this is intended.
        sql = """
            insert into lagou_job(title, url, salary, job_city, work_years, degree_need,
            job_type, publish_time, job_advantage, job_desc, job_addr, company_url, company_name, job_id)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE salary=VALUES(salary), job_city=VALUES(job_city), work_years=VALUES(work_years),
              degree_need=VALUES(degree_need), job_type=VALUES(job_type), publish_time=VALUES(publish_time), tags=VALUES(tags)
              , job_advantage=VALUES(job_advantage), job_desc=VALUES(job_desc), crawl_time=VALUES(crawl_time)
        """

        numeric_id = extract_num(self["url"])

        # Item fields copied verbatim, in the order of the column list.
        copied_fields = (
            "title", "url", "salary", "job_city", "work_years", "degree_need",
            "job_type", "publish_time", "job_advantage", "job_desc",
            "job_addr", "company_url", "company_name",
        )
        values = tuple(self[name] for name in copied_fields) + (numeric_id,)

        return sql, values
Beispiel #21
0
 def save_to_es(self):
     """Copy this item into a ``Lagou`` document and save it.

     The document id is obtained by passing the job URL to ``extract_num``;
     title and job description feed the suggester.
     """
     doc_id = extract_num(self["url"])
     lagou = Lagou(meta={'id': doc_id})

     # Fields copied one-to-one from the item onto the document.
     mirrored_fields = (
         'title', 'url', 'url_object_id', 'salary', 'job_city',
         'work_years', 'degree_need', 'job_type', 'publish_time',
         'job_advantage', 'job_desc', 'job_address', 'company_url',
         'company_name', 'crawl_time', 'crawl_update_time', 'tags',
     )
     for field_name in mirrored_fields:
         setattr(lagou, field_name, self[field_name])

     weighted_fields = ((lagou.title, 10), (lagou.job_desc, 7))
     lagou.suggest = gen_suggests(Lagou._index._name, weighted_fields)
     lagou.save()
Beispiel #22
0
    def save_to_es(self):
        """Copy this item into a ``LagouType`` document and save it.

        The document id is obtained by passing the job URL to ``extract_num``.
        Both crawl timestamps are set to the current time, and the
        ``lagou_count`` counter in Redis is incremented before saving.
        """
        stamp = datetime.datetime.now().strftime(SQL_DATE_FORMAT)
        doc_id = extract_num(self["url"])

        doc = LagouType()
        doc.meta.id = doc_id

        # Fields copied one-to-one from the item onto the document.
        mirrored_fields = (
            'title', 'url', 'salary', 'job_city', 'work_years', 'degree_need',
            'job_type', 'publish_time', 'job_advantage', 'job_desc',
            'job_addr', 'company_name',
        )
        for field_name in mirrored_fields:
            setattr(doc, field_name, self[field_name])

        doc.crawl_time = stamp
        doc.crawl_update_time = stamp

        weighted_fields = (
            (doc.title, 10),
            (doc.company_name, 9),
            (doc.job_desc, 8),
            (doc.job_addr, 7),
        )
        doc.suggest = gen_suggest(LagouType._doc_type.index, weighted_fields)

        redis_cli.incr("lagou_count")
        doc.save()