def save_to_es(self):
    """Persist this question item to Elasticsearch and bump the Redis counter.

    Fix: ``title_suggest`` was previously generated from
    ``zhihu_question.title`` and ``.topics`` *before* those attributes were
    assigned, so the completion suggester always received empty input. The
    suggest payload is now built after every field is populated.
    """
    zhihu_question = ZhihuQuestion()
    zhihu_question.title = self['title']
    zhihu_question.content = self["content"]
    zhihu_question.url = self["url"]
    zhihu_question.question_id = self["zhihu_id"][0]
    zhihu_question.answer_num = extract_num("".join(self["answer_num"]))
    zhihu_question.comments_num = extract_num("".join(self["comments_num"]))
    if len(self["watch_user_num"]) == 2:
        # Both watcher count and click count scraped; strip thousands separators.
        self["watch_user_num"][0] = self["watch_user_num"][0].replace(",", "")
        zhihu_question.watch_user_num = int(self["watch_user_num"][0])
        self["watch_user_num"][1] = self["watch_user_num"][1].replace(",", "")
        zhihu_question.click_num = int(self["watch_user_num"][1])
    else:
        # Only the watcher count is present; click count defaults to 0.
        zhihu_question.watch_user_num = int(self["watch_user_num"][0])
        zhihu_question.click_num = 0
    zhihu_question.topics = self["topics"]
    zhihu_question.meta.id = self["zhihu_id"][0]
    # Build the suggest input only now that title/topics hold real values;
    # title is weighted above topics.
    zhihu_question.title_suggest = gen_suggests(
        ZhihuQuestion._doc_type.index,
        ((zhihu_question.title, 10), (zhihu_question.topics, 7)))
    zhihu_question.save()
    redis_cli.incr("zhihu_question_count")
    return
def get_insert_sql(self):
    """Return the upsert SQL statement and its parameter tuple for this question."""
    insert_sql = '''
        insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num,
            comments_num, watch_user_num, click_num, crawl_time)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        on duplicate key UPDATE content=VALUES(content), answer_num=VALUES(answer_num),
            comments_num=VALUES(comments_num), watch_user_num=VALUES(watch_user_num),
            click_num=VALUES(click_num), crawl_update_time=VALUES(crawl_time)
    '''

    # Every field loaded through the ItemLoader is a list; unwrap each one here
    # (the alternative is processing them like JobBoleArticleItem does).
    def strip_commas(text):
        return text.replace(',', '')

    params = (
        self['zhihu_id'][0],
        ','.join(self['topics']),
        self['url'][0],
        self['title'][0],
        self['content'][0],
        extract_num(strip_commas(''.join(self['answer_num']))),
        extract_num(strip_commas(''.join(self['comments_num']))),
        extract_num(strip_commas(self['watch_user_num'][0])),
        extract_num(strip_commas(self['click_num'][1])),
        datetime.datetime.now().strftime(SQL_DATETIME_FORMAT),
    )
    return insert_sql, params
def get_insert_sql(self):
    """Build the upsert SQL and parameter tuple for the zhihu_question table.

    Fix: ``watch_user_num`` sometimes contains only the watcher count, and
    indexing ``[1]`` unconditionally raised IndexError; guard on length like
    the sibling item classes do.
    """
    insert_sql = """
        insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num,
            comments_num, watch_user_num, click_num, crawl_time)
        VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE content=VALUES(content), answer_num=VALUES(answer_num),
            comments_num=VALUES(comments_num), watch_user_num=VALUES(watch_user_num),
            click_num=VALUES(click_num)
    """
    # zhihu_id holds ints, so ''.join(self['zhihu_id']) would fail — index directly.
    zhihu_id = self['zhihu_id'][0]
    topics = ','.join(self['topics'])
    url = ''.join(self['url'])
    title = ''.join(self['title'])
    content = ''.join(self['content'])
    answer_num = extract_num(''.join(self['answer_num'])) if self.get('answer_num', '') else 0
    comments_num = extract_num(''.join(self['comments_num']))
    watchers = self['watch_user_num']
    if len(watchers) == 2:
        # NOTE(review): this variant reads the watcher count from index 1 and
        # the click count from index 0 — preserved as-is; verify against the
        # spider's extraction order.
        watch_user_num = int(watchers[1].replace(',', ''))
        click_num = int(watchers[0].replace(',', ''))
    else:
        watch_user_num = int(watchers[0].replace(',', ''))
        click_num = 0
    crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)
    params = (zhihu_id, topics, url, title, content, answer_num, comments_num,
              watch_user_num, click_num, crawl_time)
    return insert_sql, params
def get_insert_sql(self):
    """Build the upsert SQL and its parameter tuple for the zhihu_question table."""
    insert_sql = 'insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num, comments_num, watch_user_num, click_num, crawl_time) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE content=VALUES(content), answer_num=VALUES(answer_num), comments_num=VALUES(comments_num), watch_user_num=VALUES(watch_user_num), click_num=VALUES(click_num)'
    # Unwrap the single-element lists produced by the ItemLoader.
    zhihu_id = self['zhihu_id'][0]
    url = self['url'][0]
    title = self['title'][0]
    content = self['content'][0]
    topics = ','.join(self['topics'])
    answer_num = extract_num(''.join(self['answer_num']).replace(',', ''))
    comments_num = extract_num(self['comments_num'][1].replace(',', ''))
    # A second element in watch_user_num means the click count was scraped too.
    watch_field = self['watch_user_num']
    watch_user_num = int(watch_field[0].replace(',', ''))
    click_num = int(watch_field[1].replace(',', '')) if len(watch_field) == 2 else 0
    crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)
    params = (zhihu_id, topics, url, title, content, answer_num,
              comments_num, watch_user_num, click_num, crawl_time)
    return insert_sql, params
def get_insert_sql(self):
    """Build the upsert SQL and parameter tuple for the zhihu_question table.

    Fix: the click count is only present when two values were scraped into
    ``watch_user_num``; indexing ``[1]`` unconditionally raised IndexError
    on single-value pages (sibling items guard this case, click_num -> 0).
    """
    insert_sql = """
        insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num,
            comments_num, watch_user_num, click_num, crawl_time)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE content=VALUES(content), answer_num=VALUES(answer_num),
            comments_num=VALUES(comments_num), watch_user_num=VALUES(watch_user_num),
            click_num=VALUES(click_num)
    """
    # Alternative way of unwrapping ItemLoader lists (vs. input processors).
    zhihu_id = self['zhihu_id'][0]
    topics = ','.join(self['topics'])
    url = self['url'][0]
    title = self['title'][0]
    content = self['content'][0]
    answer_num = extract_num(''.join(self['answer_num']))
    comments_num = extract_num(''.join(self['comments_num']))
    watchers = self['watch_user_num']
    watch_user_num = extract_num(watchers[0].replace(',', ''))
    click_num = extract_num(watchers[1].replace(',', '')) if len(watchers) == 2 else 0
    crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)
    params = (zhihu_id, topics, url, title, content, answer_num, comments_num,
              watch_user_num, click_num, crawl_time)
    return insert_sql, params
def get_insert_sql(self):
    """Build the upsert SQL and parameter tuple for the zhihu_question table.

    Must stay consistent with ``do_insert()`` in the pipeline, which calls
    ``get_insert_sql()`` uniformly across item classes (like JobboleArticleItem).

    Fixes:
    - VALUES listed 12 placeholders for 10 columns and 10 params — a
      guaranteed MySQL syntax/arity error; reduced to 10.
    - The ON DUPLICATE KEY UPDATE clause assigned ``comments_num``, a column
      not in this table's insert list (``comment_num``); made consistent.
    - ``answer_num`` was inserted as raw scraped text; run it through
      ``extract_num`` like every other count so an integer reaches the DB.
    """
    insert_sql = """
        insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num,
            comment_num, watch_user_num, click_num, crawl_time)
        VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE content = VALUES(content),
            answer_num = VALUES(answer_num), comment_num = VALUES(comment_num),
            watch_user_num = VALUES(watch_user_num), click_num = VALUES(click_num),
            crawl_time = VALUES(crawl_time)
    """
    zhihu_id = self["zhihu_id"][0]
    topics = ",".join(self["topics"])
    url = self["url"][0]
    title = "".join(self["title"])
    content = "".join(self["content"])
    answer_num = extract_num("".join(self["answer_num"]))
    comments_num = extract_num("".join(self["comments_num"]))
    watch_user_num = extract_num("".join(self["watch_user_num"]))
    click_num = extract_num("".join(self["click_num"]))
    # strftime renders the timestamp as a string in the SQL datetime format.
    crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)
    params = (zhihu_id, topics, url, title, content, answer_num, comments_num,
              watch_user_num, click_num, crawl_time)
    return insert_sql, params
def save_to_es(self):
    """Write this question to Elasticsearch and increment the Redis counter."""
    question = QuestionType()
    question.title = "".join(self["title"])
    question.topics = ",".join(self["topics"])
    question.url = self["url"][0]
    question.zhihu_id = self["zhihu_id"][0]
    question.answer_num = extract_num("".join(self["answer_num"]))
    question.comment_nums = extract_num("".join(self["comments_num"]))
    watchers = self["watch_user_num"]
    if len(watchers) == 2:
        # Second element is only present when the click count was scraped.
        question.watch_user_num, question.click_num = watchers[0], watchers[1]
    else:
        question.watch_user_num, question.click_num = watchers[0], 0
    question.content = remove_tags("".join(self["content"]))
    # Completion-suggest input: title weighted above topics.
    question.suggest = gen_suggests(
        QuestionType._doc_type.index,
        ((question.title, 10), (question.topics, 7)))
    question.save()
    redis_cli.incr("zhihu_count")
    return
def get_insert_sql(self):
    """Build the upsert SQL and parameter tuple for the zhihu_question table.

    Fix: ``except BaseException`` also swallowed KeyboardInterrupt and
    SystemExit. The only expected failure here is a missing item field,
    which raises KeyError — catch exactly that.
    """
    insert_sql = """
        insert into zhihu_question
        (zhihu_id,topics,title,url,content,answer_num,comments_num,watch_user_num,click_num,crawl_time)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE content=VALUES(content), answer_num=VALUES(answer_num),
            comments_num=VALUES(comments_num), watch_user_num=VALUES(watch_user_num),
            click_num=VALUES(click_num)
    """
    zhihu_id = self["zhihu_id"][0]
    topics = ",".join(self["topics"])
    url = self["url"][0]
    title = "".join(self["title"])
    try:
        content = "".join(self["content"])
    except KeyError:
        content = "无"  # placeholder text when the question has no body
    try:
        answer_num = extract_num("".join(self["answer_num"]))
    except KeyError:
        answer_num = 0
    comments_num = extract_num("".join(self["comments_num"]))
    if len(self["watch_user_num"]) == 2:
        watch_user_num = extract_num(self["watch_user_num"][0])
        click_num = extract_num(self["watch_user_num"][1])
    else:
        # Only the watcher count was scraped; click count defaults to 0.
        watch_user_num = extract_num(self["watch_user_num"][0])
        click_num = 0
    crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)
    params = (zhihu_id, topics, url, title, content, answer_num, comments_num,
              watch_user_num, click_num, crawl_time)
    return insert_sql, params
def get_insert_sql(self):
    """Compose the upsert statement and parameters for the zhihu_question table."""
    insert_sql = """
        insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num,
            comments_num, watch_user_num, click_num, crawl_time)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE content=VALUES(content), answer_num=VALUES(answer_num),
            comments_num=VALUES(comments_num), watch_user_num=VALUES(watch_user_num),
            click_num=VALUES(click_num)
    """
    question_id = self["zhihu_id"][0]
    topic_csv = ",".join(self["topics"])
    page_url = self["url"][0]
    title_text = "".join(self["title"])
    body_text = "".join(self["content"])
    answers = extract_num("".join(self["answer_num"]))
    comments = extract_num("".join(self["comments_num"]))
    watchers = self["watch_user_num"]
    if len(watchers) == 2:
        # Strip thousands separators in place, mutating the item like before.
        watchers[0] = watchers[0].replace(",", "")
        watchers[1] = watchers[1].replace(",", "")
        watch_user_num, click_num = int(watchers[0]), int(watchers[1])
    else:
        watch_user_num, click_num = int(watchers[0]), 0
    crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)
    params = (question_id, topic_csv, page_url, title_text, body_text, answers,
              comments, watch_user_num, click_num, crawl_time)
    return insert_sql, params
def get_insert_sql(self):
    """Build the upsert SQL and parameter tuple for the zhihu_question table.

    Fix: ``strftime("")`` produced an empty string for ``crawl_time``, so an
    empty value was written to the datetime column on every insert. Format
    with ``SQL_DATETIME_FORMAT`` like the other item classes.
    """
    insert_sql = """
        insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num,
            comments_num, watch_user_num, click_num, crawl_time)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE content=VALUES(content), answer_num=VALUES(answer_num),
            comments_num=VALUES(comments_num), watch_user_num=VALUES(watch_user_num),
            click_num=VALUES(click_num)
    """
    zhihu_id = self["zhihu_id"][0]
    topics = ",".join(self["topics"])
    url = self["url"][0]
    title = "".join(self["title"])
    content = "".join(self["content"])
    answer_num = extract_num("".join(self["answer_num"]))
    comments_num = extract_num("".join(self["comments_num"]))
    watch_user_num = extract_num("".join(self["watch_user_num"]))
    click_num = extract_num("".join(self["click_num"]))
    crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)
    params = (zhihu_id, topics, url, title, content, answer_num, comments_num,
              watch_user_num, click_num, crawl_time)
    return insert_sql, params
def save_to_es(self):
    """Store this question in Elasticsearch and bump the Redis question counter.

    Fixes:
    - The bare ``except:`` swallowed every exception (including SystemExit
      and KeyboardInterrupt); only a missing/empty ``answer_num`` field is
      expected here, so catch KeyError and IndexError explicitly.
    - Removed the dead chained local ``content`` (``question.content =
      content = ...``) — it was never used again.
    """
    question = ZhiHuQuestionType()
    question.zhihu_id = self['zhihu_id'][0]
    question.topics = ",".join(self["topics"])
    question.url = self["url"][0]
    question.title = "".join(self["title"])
    question.content = "".join(self["content"])
    try:
        question.answer_num = self["answer_num"][0]
    except (KeyError, IndexError):
        # Field missing or empty on the page; default to zero answers.
        question.answer_num = 0
    question.comment_num = extract_num("".join(self["comment_num"]))
    if len(self["watch_user_num"]) == 2:
        question.watch_user_num = self["watch_user_num"][0]
        question.click_num = self["watch_user_num"][1]
    else:
        question.watch_user_num = self["watch_user_num"][0]
        question.click_num = 0
    question.crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)
    # Weighted completion-suggest input: title > topics > content.
    question.suggest = gen_suggests(
        es_question, ZhiHuQuestionType._doc_type.index,
        ((question.title, 10), (question.topics, 7), (question.content, 3)))
    question.save()
    redis_cli.incr("question_count")
    return
def get_insert_sql(self):
    """Build the upsert SQL and parameter tuple for the zhihu_question table.

    Fixes:
    - ``content VALUES (content)`` is invalid SQL (missing ``=`` and the
      VALUES() reference) → ``content = VALUES (content)``.
    - Guard against ``watch_user_num`` holding a single element — indexing
      ``[1]`` unconditionally raised IndexError.
    - Renamed the misspelled local ``parmas`` → ``params``.
    """
    insert_sql = '''
        insert into zhihu_question(zhihu_id,topics,url,content,title,answer_num,comments_num,watch_user_num,click_num,crawl_time)
        values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        ON DUPLICATE KEY UPDATE content = VALUES (content),
            comments_num = VALUES (comments_num), answer_num = VALUES (answer_num),
            watch_user_num = VALUES (watch_user_num), click_num = VALUES (click_num),
            crawl_update_time = VALUES (crawl_time)
    '''
    zhihu_id = int(''.join(self['zhihu_id']))
    topics = ','.join(self['topics'])
    url = ''.join(self['url'])
    content = ''.join(self['content']) if self['content'] else 'nothing'
    title = ''.join(self['title'])
    answer_num = extract_num(''.join(self['answer_num']))
    comments_num = extract_num(''.join(self['comments_num']))
    watchers = self['watch_user_num']
    watch_user_num = watchers[0].replace(',', '')
    # Click count only exists when two values were scraped; default to '0'.
    click_num = watchers[1].replace(',', '') if len(watchers) == 2 else '0'
    crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)
    params = (zhihu_id, topics, url, content, title, answer_num, comments_num,
              watch_user_num, click_num, crawl_time)
    return insert_sql, params
def get_insert_sql(self):
    """Return a plain INSERT statement and parameters for the zhihu_question table."""
    insert_sql = """
        insert into zhihu_question(zhihu_id,topics,url,title,content,answer_num,comments_num,watch_user_num,click_num,crawl_time)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    """
    # ItemLoader fields are lists: index single values, join the rest.
    params = (
        self["zhihu_id"][0],
        ",".join(self["topics"]),
        self["url"][0],
        self["title"][0],
        self["content"][0],
        extract_num("".join(self["answer_num"])),
        extract_num("".join(self["comments_num"])),
        extract_num("".join(self["watch_user_num"])),
        extract_num("".join(self["click_num"])),
        datetime.datetime.now().strftime(SQL_DATETIME_FORMAT),
    )
    return insert_sql, params
def get_insert_sql(self):
    """Return the INSERT statement and parameter tuple for this question item."""
    insert_sql = """
        insert into zhihu_question (question_id, topics, url, title, content,
            answers_num, comments_num, watch_users_num, clicked_num)
        values (%s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    watch_pair = self['watch_users_num']
    params = (
        int("".join(self['question_id'])),
        ",".join(self['topics']),
        self['url'][0],
        self['title'][0],
        self['content'][0],
        extract_num("".join(self['answers_num'])),
        extract_num("".join(self['comments_num'])),
        int(watch_pair[0]),
        # The second element of watch_users_num carries the click count.
        int(watch_pair[1]),
    )
    return insert_sql, params
def get_insert_sql(self):
    """Build the upsert SQL and parameter tuple for the zhihu_question table.

    Fix: ``ON DUPICATE KEY`` was misspelled (missing the 'L'), which makes
    the statement a MySQL syntax error on every execution.
    """
    insert_sql = '''
        insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num,
            comments_num, watch_user_num, click_num, crawl_time)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE content=VALUES(content), answer_num=VALUES(answer_num),
            comments_num=VALUES(comments_num), watch_user_num=VALUES(watch_user_num),
            click_num=VALUES(click_num)
    '''
    zhihu_id = self['zhihu_id'][0]
    topics = ",".join(self['topics'])
    url = self['url'][0]  # equivalent to "".join(self['url'])
    title = "".join(self['title'])
    content = "".join(self['content'])
    answer_num = extract_num("".join(self['answer_num']))
    comments_num = extract_num("".join(self['comments_num']))
    watch_user_num = extract_num("".join(self['watch_user_num']))
    click_num = extract_num("".join(self['click_num']))
    crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)
    params = (zhihu_id, topics, url, title, content, answer_num, comments_num,
              watch_user_num, click_num, crawl_time)
    return insert_sql, params
def get_insert_sql(self):
    """Return the upsert SQL and parameter tuple for the lagou_job table."""
    insert_sql = """
        insert into lagou_job(title, url, salary, job_city, work_years, degree_need,
            job_type, publish_time, job_advantage, job_desc, job_addr, company_url,
            company_name, job_id)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE job_desc=VALUES(job_desc)
    """
    # job_id is derived from the numeric portion of the job URL.
    job_id = extract_num(self["url"])
    field_order = ("title", "url", "salary", "job_city", "work_years",
                   "degree_need", "job_type", "publish_time", "job_advantage",
                   "job_desc", "job_addr", "company_url", "company_name")
    params = tuple(self[name] for name in field_order) + (job_id,)
    return insert_sql, params
def get_insert_sql(self):
    """Build the upsert SQL and values for this ItemLoader-populated question."""
    insert_sql = """
        insert into zhihu_question(zhihu_id,topics,url,title,content,answer_num,
            comments_num,watch_user_num,click_num,crawl_time)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE content=VALUES(content),
            comments_num=VALUES(comments_num), answer_num=VALUES(answer_num),
            click_num=VALUES(click_num)
    """
    # ItemLoader yields a list for every field — unwrap or join each value
    # here instead of via input processors.
    watchers = self['watch_user_num']
    has_click_count = len(watchers) == 2
    values = (
        self['zhihu_id'][0],
        ','.join(self['topics']),
        self['url'][0],
        ''.join(self['title']),
        '\n'.join(self['content']),
        extract_num(''.join(self['answer_num'])),
        extract_num(''.join(self['comments_num'])),
        extract_num(watchers[0]),
        extract_num(watchers[1] if has_click_count else '0'),
        datetime.datetime.now().strftime(SQL_DATETIME_FORMAT),
    )
    return insert_sql, values
def get_insert_sql(self):
    """Build the upsert SQL and parameter tuple for the zhihu_question table.

    Fixes:
    - The ON DUPLICATE KEY UPDATE clause ended with a trailing comma, making
      the statement a MySQL syntax error on every execution.
    - ``datetime.now().strptime(SQL_DATETIME_FORMAT)`` is the wrong API:
      ``strptime`` *parses* a string (and raises TypeError called this way);
      ``strftime`` is what formats the timestamp.
    """
    insert_sql = '''
        insert into zhihu_question(zhihu_id,topics,url,title,content,answer_num,comment_num,
            watch_user_num,click_num,crawl_time)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        ON duplicate key UPDATE content = VALUES (content),
            answer_num = VALUES(answer_num), comment_num = VALUES (comment_num),
            watch_user_num = VALUES (watch_user_num), click_num = VALUES (click_num)
    '''
    zhihu_id = self['zhihu_id'][0]
    topics = ','.join(self['topics'])
    url = self['url'][0]
    title = ''.join(self['title'])
    content = ''.join(self['content'])
    answer_num = extract_num(''.join(self['answer_num']))
    comment_num = extract_num(''.join(self['comment_num']))
    watch_user_num = extract_num(''.join(self['watch_user_num']))
    click_num = extract_num(''.join(self['click_num']))
    crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)
    params = (zhihu_id, topics, url, title, content, answer_num, comment_num,
              watch_user_num, click_num, crawl_time)
    return insert_sql, params
def get_insert_sql(self):
    """Build the upsert SQL and parameter tuple for the zhihu_question table.

    Fixes:
    - Guard ``watch_user_num[1]``: only one value is scraped when the page
      exposes just the watcher count, and the unconditional index raised
      IndexError (siblings default click_num to 0 in that case).
    - ``comments_num`` was inserted as raw scraped text; pass it through
      ``extract_num`` like the sibling items so an integer reaches the
      int column.
    """
    insert_sql = """
        insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num,
            comments_num, watch_user_num, click_num, crawl_time)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE content=VALUES(content), answer_num=VALUES(answer_num),
            comments_num=VALUES(comments_num), watch_user_num=VALUES(watch_user_num),
            click_num=VALUES(click_num)
    """
    zhihu_id = self['zhihu_id'][0]
    topics = ",".join(self['topics'])
    url = self['url'][0]
    title = self['title'][0]
    content = self['content'][0]
    answer_num = extract_num(self['answer_num'][0])
    comments_num = extract_num(self['comments_num'][0])
    watchers = self['watch_user_num']
    watch_user_num = int(watchers[0])
    click_num = int(watchers[1]) if len(watchers) == 2 else 0
    crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)
    params = (zhihu_id, topics, url, title, content, answer_num, comments_num,
              watch_user_num, click_num, crawl_time)
    return insert_sql, params