def crawl_dialogue_by_comment_id(cid, mid):
    """Fetch the dialogue AJAX page for a comment and persist the parsed data.

    :param cid: comment id used to build the AJAX URL
    :param mid: message (weibo) id the comment belongs to
    """
    # Millisecond timestamp acts as a cache-buster in the AJAX URL.
    timestamp_ms = int(time.time() * 1000)
    page_html = get_page(AJAX_URL.format(cid, timestamp_ms),
                         auth_level=2, is_ajax=True)
    parsed = dialogue.get_dialogue(page_html, mid, cid)
    CommonOper.add_one(parsed)
def get_people_and_follows(people_id, selector):
    """Parse a user's profile page and persist the extracted People record.

    A ``do_follow`` task is then queued so the user's follow list gets
    crawled as well (starting at page 0).

    :param people_id: id of the user whose profile page was fetched
    :param selector: lxml element tree of the profile page
    """
    try:
        people = People()
        people.people_id = people_id
        people.name = selector.xpath(
            '//div[@class="aw-user-center"]/div[1]/div/h1/text()')[0].strip()
        people.desc = "".join(
            selector.xpath(
                '//div[@class="aw-user-center"]/div[1]/div/span/text()'))
        # Location, job and visit-count sections are optional on the page.
        if selector.xpath('//i[contains(@class,"i-user-locate")]'):
            user_locate = selector.xpath(
                '//i[contains(@class,"i-user-locate")]')[0].getparent()
            people.province = "".join(user_locate.xpath('a[1]/text()'))
            people.city = "".join(user_locate.xpath('a[2]/text()'))
        if selector.xpath('//i[contains(@class,"i-user-post")]'):
            user_post = selector.xpath(
                '//i[contains(@class,"i-user-post")]')[0].getparent()
            people.post = "".join(user_post.xpath('text()')).strip()
        if selector.xpath('//i[contains(@class,"i-user-visits")]'):
            user_visits = selector.xpath(
                '//i[contains(@class,"i-user-visits")]')[0].getparent()
            user_visits_str = "".join(user_visits.xpath('text()'))
            # Raw string: '\d' in a plain literal is an invalid escape.
            people.home_visit_num = re.findall(r'(\d+)', user_visits_str)[0]
        people_type_spans = selector.xpath(
            '//div[@class="aw-user-center"]/div[1]/div/p[3]/span')
        people.user_type = people_type_spans[0].xpath(
            'a/em/text()')[0].replace("»", "").strip()
        people.weiwang_num = people_type_spans[1].xpath('em/text()')[0]
        people.agree_num = people_type_spans[2].xpath('em/text()')[0]
        people.thanks_num = people_type_spans[3].xpath('em/text()')[0]
        people.gold_num = people_type_spans[4].xpath('em/text()')[0]
        # "100+"-style display values are capped at 100.
        if '+' in people.gold_num:
            people.gold_num = 100
        if selector.xpath('//span[contains(text(),"最后活跃")]'):
            last_active_time_str = selector.xpath(
                '//span[contains(text(),"最后活跃")]')[0].getparent().getnext(
                ).xpath('text()')[0]
            people.last_active_time = str2datetime(last_active_time_str)
        CommonOper.add_one(people)
        CommonOper.add_filter_key("people_id", people_id)
    except Exception as e:
        jsl_log.warning(
            "get people info error,people_id:{},here are details {}".format(
                people_id, format_tb(e.__traceback__)[0]))
    # Queue the follow-list crawl for this user, first page.
    app.send_task(
        "tasks.people.do_follow",
        args=(
            people_id,
            0,
        ),
        queue="people_queue",
        routing_key="people")
def do_question(question_id):
    """Crawl a question page unless its id is already in the dedup filter."""
    if CommonOper.is_exist("question_id", question_id):
        jsl_log.info("question id:{}已存在,跳过".format(question_id))
        return
    question_url = question_url_format.format(question_id)
    jsl_log.info("开始爬取url:{}".format(question_url))
    crawl_question_and_answer(question_url)
def do_people(people_id):
    """Crawl a user's profile page unless the id was already processed."""
    if CommonOper.is_exist("people_id", people_id):
        jsl_log.info("people id:{}已存在,跳过".format(people_id))
        return
    people_url = people_url_format.format(people_id)
    jsl_log.info("开始爬取url:{}".format(people_url))
    crawl_people(people_url)
def task_filter(task_type, param):
    """Queue a crawl task for *param* unless it already passed the filter.

    Unknown ``task_type`` values are silently ignored, as before.
    """
    routes = {
        'question': ("question_id", 'tasks.question.do_question',
                     'question_queue', 'question',
                     "相关question已存在,question_id:{}"),
        'people': ("people_id", 'tasks.people.do_people',
                   'people_queue', 'people',
                   "相关people已存在,people_id:{}"),
    }
    route = routes.get(task_type)
    if route is None:
        return
    filter_key, task_name, queue_name, route_key, exists_msg = route
    if CommonOper.is_exist(filter_key, param):
        jsl_log.info(exists_msg.format(param))
    else:
        app.send_task(task_name,
                      args=(param, ),
                      queue=queue_name,
                      routing_key=route_key)
def crawl_dialogue_by_comment_id(cid, mid):
    """Fetch a comment's dialogue, store it, and queue crawls for its users.

    :param cid: comment id used to build the AJAX URL
    :param mid: message (weibo) id the comment belongs to
    """
    now_ms = int(time.time() * 1000)
    page = get_page(AJAX_URL.format(cid, now_ms), auth_level=2, is_ajax=True)
    dialogue_data, uids = dialogue.get_dialogue(page, mid, cid)
    if dialogue_data:
        CommonOper.add_one(dialogue_data)
    # Queue an info crawl for every user seen in the dialogue.
    for uid in uids or ():
        app.send_task('tasks.user.crawl_person_infos_not_in_seed_ids',
                      args=(uid, ),
                      queue='user_crawler',
                      routing_key='for_user_info')
def crawl_dialogue_by_comment_id(cid, mid):
    """Crawl the dialogue attached to comment *cid* of message *mid*.

    Persists the parsed dialogue (when present) and schedules a user-info
    crawl task for each user id found in it.
    """
    dialogue_url = AJAX_URL.format(cid, int(time.time() * 1000))
    html = get_page(dialogue_url, auth_level=2, is_ajax=True)
    data, user_ids = dialogue.get_dialogue(html, mid, cid)
    if data:
        CommonOper.add_one(data)
    if user_ids:
        for user_id in user_ids:
            app.send_task('tasks.user.crawl_person_infos_not_in_seed_ids',
                          args=(user_id,),
                          queue='user_crawler',
                          routing_key='for_user_info')
def get_answer_comment(answer_id, selector):
    """Parse an answer's comment list and persist all comments in one batch.

    :param answer_id: id of the answer the comments belong to
    :param selector: lxml element tree of the comment-list fragment
    """
    try:
        comment_list = selector.xpath('//ul/li')
        answer_comments = []
        # Comments carry no site-side id; number them 1..n in page order
        # (enumerate replaces the original hand-rolled counter).
        for comment_id, c in enumerate(comment_list, start=1):
            answer_comment = AnswerComment()
            answer_comment.answer_id = answer_id
            answer_comment.comment_id = comment_id
            answer_comment.people_id = c.xpath('div/a/@data-id')[0]
            # Queue a profile crawl for the commenter if not seen yet.
            task_filter('people', answer_comment.people_id)
            post_time_str = c.xpath('div/span/text()')[0]
            answer_comment.post_time = str2datetime(post_time_str)
            answer_comment.content = "".join(
                c.xpath('div/p[@class="clearfix"]/text()'))
            answer_comments.append(answer_comment)
        CommonOper.add_all(answer_comments)
    except Exception as e:
        jsl_log.warning(
            "get answer_comment_list error,answer_id:{},here are details {}".
            format(answer_id, e))
def get_answers_and_agree(question_id, selector):
    """Parse all answers on a question page and persist them with agrees.

    For each answer: queue the author's profile crawl, queue a comment
    crawl when the answer shows a comment count, and store the agree
    (upvote) relations.

    :param question_id: id of the question the answers belong to
    :param selector: lxml element tree of the question page
    """
    try:
        answer_list = selector.xpath('//div[@class="aw-item"]')
        answers = []
        for a in answer_list:
            answer = Answer()
            answer.question_id = question_id
            # Element id looks like "xxx_yyy_<id>"; take the numeric tail.
            answer.answer_id = a.xpath('@id')[0].split('_')[2]
            answer.answer_type = 1
            answer.people_id = a.xpath('a/@data-id')[0]
            task_filter('people', answer.people_id)
            answer.content = "".join(a.xpath('div/div/div[1]/div/text()'))
            post_time_str = a.xpath('div/div/div[2]/span/text()')[0]
            answer.post_time = str2datetime(post_time_str)
            answers.append(answer)
            answer_count_str = a.xpath('div/div/div[2]/a/text()')[0]
            # Raw string: '\d' in a plain literal is an invalid escape.
            answer_count = re.search(r'(\d+)', answer_count_str)
            if answer_count:
                # Only answers with a visible comment count get a task.
                app.send_task('tasks.question.do_answer_comment',
                              args=(answer.answer_id, ),
                              queue='answer_comment_queue',
                              routing_key='answer_comment')
            agrees = []
            agree_list = a.xpath('div/div/div[1]/p[2]/a/@data-id')
            for p in agree_list:
                task_filter('people', p)
                agree = Agree()
                agree.question_id = question_id
                agree.answer_id = answer.answer_id
                agree.people_id = p
                agrees.append(agree)
            CommonOper.add_all(agrees)
        CommonOper.add_all(answers)
    except Exception as e:
        jsl_log.warning(
            "get answer_list error,question_id:{},here are details {}".format(
                question_id, e))
def get_follows(follower_id, page_num, selector):
    """Parse one page of a user's follow list and store the relations.

    :param follower_id: id of the user whose follow list this is
    :param page_num: current page number of the follow list
    :param selector: lxml element tree of the follow-list page
    """
    try:
        entries = selector.xpath('//li')
        # A full page (30 entries) implies more pages; queue the next one.
        if len(entries) == 30:
            app.send_task("tasks.people.do_follow",
                          args=(
                              follower_id,
                              int(page_num) + 1,
                          ),
                          queue="people_queue",
                          routing_key="people")
        follows = []
        for entry in entries:
            follow = Follow()
            follow.refer_id = entry.xpath('div/a/@data-id')[0]
            follow.follow_type = 1
            follow.follower_id = follower_id
            follows.append(follow)
            # Queue a profile crawl for the followed user.
            task_filter("people", follow.refer_id)
        CommonOper.add_all(follows)
    except Exception as e:
        jsl_log.warning(
            "get follow_list error,follower_id:{},here are details {}".format(
                follower_id, e))
def get_question_and_agree(question_id, selector):
    """Parse a question detail page; persist the question and its agrees.

    :param question_id: id of the question (used for the dedup filter)
    :param selector: lxml element tree of the question detail page
    """
    try:
        question = Question()
        question.question_id = selector.xpath(
            '//div[@id="question_topic_editor"]/@data-id')[0]
        question.title = selector.xpath(
            '//div[@class="aw-mod-head"]/h1/text()')[0]
        question.people_id = selector.xpath(
            '//dd[@class="pull-left"]/a/@data-id')[0]
        # Queue a profile crawl for the question's author.
        task_filter('people', question.people_id)
        posted_raw = selector.xpath(
            '//div[@class="aw-question-detail-meta"]/div[1]/span[1]/text()'
        )[0].replace("发表时间 ", "")
        question.post_time = str2datetime(posted_raw)
        accessed_raw = selector.xpath(
            '//div[@class="aw-side-bar-mod-body"]/ul/li[1]/span/text()')[0]
        question.access_time = str2datetime(accessed_raw)
        question.read_num = selector.xpath(
            '//div[@class="aw-side-bar-mod-body"]/ul/li[2]/span/text()')[0]
        question.follow_num = selector.xpath(
            '//div[@class="aw-side-bar-mod-body"]/ul/li[3]/span/text()')[0]
        question.content = "".join(selector.xpath(
            '//div[contains(@class,"aw-question-detail-txt")]/text()'))
        CommonOper.add_one(question)
        CommonOper.add_filter_key("question_id", question_id)
        voter_ids = selector.xpath(
            '//div[@class="aw-question-detail-meta"]/p[contains(@class,"aw-agree-by")]/a/@data-id'
        )
        agrees = []
        for voter_id in voter_ids:
            task_filter('people', voter_id)
            agree = Agree()
            agree.question_id = question.question_id
            agree.people_id = voter_id
            agrees.append(agree)
        CommonOper.add_all(agrees)
    except Exception as e:
        jsl_log.warning(
            "get question error,question_id:{},here are details {}".format(
                question_id, e))