Example #1
    def get_Index(self):
        url_to_get_index1 = self.urls[0]

        for i in range(1, 900):
            response1 = get_response_and_text(url=url_to_get_index1 + str(i) +
                                              '&size=20',
                                              headers=self.headers)
            response_in_function = response1['response_in_function']
            response_in_function_text = response1['response_in_function_text']
            datajson = json.loads(response_in_function_text)
            this_url_index_list = []  # built only to collect the ids needed for the view-count request below
            for one_article in datajson:
                url_index = 'https://m.sohu.com/a/' + str(one_article['id']) + '_' + str(
                    one_article['authorId'])
                publish_time = int(one_article['publicTime']) / 1000
                time_format = '%Y-%m-%d %H:%M:%S'
                publish_time_stamp_9 = time.localtime(float(publish_time))
                publish_time = time.strftime(time_format, publish_time_stamp_9)
                data_index = {
                    'publish_user': one_article['authorName'],
                    'title': one_article['title'],
                    'publish_time': publish_time,
                    'id': one_article['id'],
                    'url': url_index,
                    'cmsid': one_article['cmsId'],
                    'spider_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                }
                this_url_index_list.append(data_index)
                # self.content_data_list.append(data_index)  # the view count comes from a separate request, so append later
                # break
            viewernum_url = 'https://v2.sohu.com/public-api/articles/pv?articleIds='
            viewernum_url += ','.join(str(item['id']) for item in this_url_index_list)
            viewernum_info = requests.get(url=viewernum_url,
                                          headers=self.headers)
            viewernum_info_json = json.loads(viewernum_info.text)
            for data_index_no_viewer in this_url_index_list:
                noviewer_id = data_index_no_viewer['id']
                print noviewer_id
                data_index_no_viewer['read_count'] = viewernum_info_json[str(noviewer_id)]

            self.content_data_list = self.content_data_list + this_url_index_list

            # break
            time.sleep(1)

        self.global_status_num_index = 0
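
Every snippet on this page unpacks the same two keys from a shared `get_response_and_text` helper that is not shown here. A minimal sketch of what such a helper could look like follows; the project's real version presumably also handles retries, proxies and the extra `needupdate`/`update_info` parameters seen in Example #12, so treat this as an assumption rather than the original code.

import requests


def get_response_and_text(url, headers=None, timeout=30):
    # Hypothetical stand-in for the helper every example calls: issue the GET and
    # return the response object plus its decoded body under the two keys the
    # callers unpack.
    response = requests.get(url, headers=headers, timeout=timeout)
    return {
        'response_in_function': response,
        'response_in_function_text': response.text,
    }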
Example #2
        def get_comment_comment(data1):  # a comment can itself have replies; named data1 to avoid clobbering the outer data variable
            id = data1['id']
            # session1 = requests.session()
            headers = {
                'User-Agent':
                'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
            }
            try:
                comment_url = 'http://www.toutiao.com/api/comment/get_reply/?comment_id=' + str(
                    id) + '&item_id=' + str(id) + '&offset=0&count=20'

                response1 = get_response_and_text(url=comment_url,
                                                  headers=headers)
                response_in_function = response1['response_in_function']
                response_in_function_text = response1[
                    'response_in_function_text']
                datajson = json.loads(response_in_function_text)

            except Exception as e:
                print e
                return []  # the reply request failed, so there is nothing to parse

            reply_nodes = []
            for one_comment in datajson['data']['data']:
                content = one_comment['text']
                like_count = one_comment['digg_count']
                publish_time = one_comment['create_time']
                publish_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime(int(publish_time)))
                publish_user_id = one_comment['user']['user_id']
                publish_user = one_comment['user']['screen_name']
                publish_user_photo = one_comment['user']['avatar_url']
                id = one_comment['id']
                try:
                    ancestor_id = data1['ancestor_id']
                except Exception as e:
                    print e
                    ancestor_id = 'wrong'
                parent_id = data1['id']
                thisnode = {
                    'publish_user': publish_user,
                    'content': content,
                    'like_count': like_count,
                    'publish_time': publish_time,
                    'publish_user_id': publish_user_id,
                    'publish_user_photo': publish_user_photo,
                    'id': id,
                    'ancestor_id': ancestor_id,
                    'parent_id': parent_id,
                    # 'spider_time':datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                }
                reply_nodes.append(thisnode)

            return reply_nodes
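
For context, Example #6 further down calls this function for every comment whose `reply_count` is positive, passing the comment id together with the root article id. A minimal usage sketch (the ids are made up for illustration):

replies = get_comment_comment({'id': 6543210987654321, 'ancestor_id': 6543210987654000})
for node in replies:
    print node['publish_user'], node['content']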
Example #3
        def get_content_inside_next_page(data):
            url = data['nexturl']
            content = data['content']
            img_urls = data['img_urls']
            headers = {
                'User-Agent':
                'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
            }
            response1 = get_response_and_text(url=url, headers=headers)
            response_in_function = response1['response_in_function']
            response_in_function_text = response1['response_in_function_text']

            datasoup = BeautifulSoup(response_in_function_text, 'lxml')
            content1 = ''
            for i in datasoup.select(
                    'body > div.scrollBox.mt10 > div.article > div.art_co.sau > p'
            ):
                content1 += i.text
            content += content1
            # 8-3
            Re_find_img_url = re.compile(r'src=".*?"/\>')
            content_part_data = datasoup.select('div.article')
            if content_part_data:
                data_find_by_re = Re_find_img_url.findall(
                    str(content_part_data[0]))
                img_urls2 = []
                for url_img_re in data_find_by_re:
                    imgurl = url_img_re.split('"')[1]
                    img_urls2.append(imgurl)
                for url_without_http in img_urls2:
                    if 'http' not in url_without_http:
                        url_without_http = 'http:' + url_without_http
                    img_urls.append(url_without_http)
            # 8-3

            next_page_selector = datasoup.select(
                'body > div.scrollBox.mt10 > div.article > div.mb10.mt5.fs14 > a.page-next.ml5'
            )
            if next_page_selector:
                next_page_html = next_page_selector[0].get('href')
                if next_page_html and len(next_page_html) > 3:
                    next_url = 'http://m.xilu.com' + next_page_html
                    return get_content_inside_next_page({
                        'content': content,
                        'nexturl': next_url,
                        'img_urls': img_urls
                    })
            # no further page link: return what has been collected
            return {'content': content, 'img_urls': img_urls}
Example #4
        def get_content_inside(data):
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
            }
            url_for_debug = data['url']
            response1 = get_response_and_text(url=url_for_debug,
                                              headers=headers)
            response_in_function = response1['response_in_function']
            response_in_function_text = response1['response_in_function_text']

            img_urls = []
            content = ''
            datasoup = BeautifulSoup(response_in_function_text, 'lxml')
            if datasoup.select('#articleContent > div.display-content > p'):
                for i in datasoup.select(
                        '#articleContent > div.display-content > p'):
                    content += i.text
            else:
                for i in datasoup.select(
                        '#articleContent > div.display-content'):
                    content += i.text

            try:
                content_data = str(datasoup.select('#articleContent')[0])
            except Exception as e:
                print e
                return  # no article container found, nothing to extract
            Re_find_img = re.compile(r'src=".*?"')
            imgs_find_by_re = Re_find_img.findall(content_data)
            for img_url in imgs_find_by_re:
                img_url = img_url.split('"')[1]
                if 'http' not in img_url:
                    img_url = 'https:' + img_url
                img_urls.append(img_url)
            data['content'] = content
            data['img_urls'] = img_urls
            data['reply_nodes'] = []
            data['spider_time'] = datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S')

            self.comments_url_list.append(data)
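
The `src="..."` regex over the raw HTML works, but since the page is already parsed with BeautifulSoup, the same image URLs can also be pulled straight from the parse tree. A rough, self-contained equivalent of that step (a sketch, not the project's code; the function name is ours):

from bs4 import BeautifulSoup


def extract_img_urls(html, selector='#articleContent img'):
    # Same idea as the regex pass above, but walking the parsed tree instead of
    # matching src="..." in the raw markup; protocol-relative URLs get an https: prefix.
    soup = BeautifulSoup(html, 'lxml')
    urls = []
    for img in soup.select(selector):
        src = img.get('src')
        if src:
            urls.append(src if src.startswith('http') else 'https:' + src)
    return urls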
Example #5
    def get_Index(self):
        while True:
            for url_to_get_index in self.urls:
                for i in range(1):
                    try:
                        response1 = get_response_and_text(url=url_to_get_index)
                        response_in_function = response1[
                            'response_in_function']
                        response_in_function_text = response1[
                            'response_in_function_text']
                        response_text = response_in_function_text.decode(
                            'utf-8')
                        datajson = json.loads(response_text)
                        datajson_index_data = datajson['data']
                        for one_index in datajson_index_data:
                            try:
                                title = one_index['title']
                            except:
                                title = ''
                            try:
                                reply_count = int(one_index['comments_count'])
                            except:
                                reply_count = 0
                            url = 'http://www.toutiao.com' + one_index[
                                'source_url']
                            try:
                                publish_user = one_index['source']  # publisher
                            except:
                                publish_user = ''
                            try:
                                publish_user_photo = one_index[
                                    'media_avatar_url']
                                if 'http' not in publish_user_photo:
                                    publish_user_photo = 'http:' + publish_user_photo
                            except:
                                publish_user_photo = ''
                            try:
                                video_id = one_index['video_id']
                            except:
                                video_id = None
                            try:
                                is_ad = one_index['label']
                            except:
                                is_ad = False

                            if video_id:
                                continue  # it's a video, discard it
                            if is_ad == u'广告':
                                continue

                            id = one_index['group_id']

                            dict1 = {
                                'id': id,
                                'url': url,
                                'reply_count': reply_count,
                                'title': title,
                                'publish_user': publish_user,
                                'publish_user_photo': publish_user_photo,
                                'spider_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                            }

                            self.content_data_list.append(dict1)
                            # break

                            # redis1.lpush('toutiao_index',dict(dict1))
                    except Exception as e:
                        pass
Example #6
        def get_comment_inside(data):
            # session1 = requests.session()
            headers = {
                'User-Agent':
                'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
            }

            while True:  # keep retrying until the request succeeds
                try:
                    print data
                    comment_url = 'http://www.toutiao.com/api/comment/list/?group_id=' + str(
                        data['id']) + '&item_id=' + str(
                            data['item_id']) + '&offset=0&count=20'
                    response1 = get_response_and_text(url=comment_url)
                    response_in_function = response1['response_in_function']
                    response_in_function_text = response1[
                        'response_in_function_text']

                    break
                except Exception as e:
                    print e, 'mark1'
                    if 'item_id' in str(e):
                        message = {'msg': e.message}
                        logger_toutiao.log(msg=message, level=logging.WARNING)
            comments_data = []
            try:
                data_json = json.loads(response_in_function_text)
            except Exception as e:
                print e
                data_json = {'data': {'comments': []}}  # unparsable response: treat as having no comments
            for one_comment in data_json['data']['comments']:
                content = one_comment['text']
                like_count = one_comment['digg_count']
                publish_time = one_comment['create_time']
                publish_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime(int(publish_time)))
                publish_user_photo = one_comment['user']['avatar_url']
                publish_user_id = one_comment['user']['user_id']
                publish_user = one_comment['user']['name']  # changed on 8-17
                id = one_comment['id']
                reply_count = one_comment['reply_count']
                parent_id = data['id']
                ancestor_id = data['id']

                if reply_count > 0:
                    reply_nodes = get_comment_comment({
                        'id': id,
                        'ancestor_id': data['id']
                    })
                else:
                    reply_nodes = []

                thisnode = {
                    'content': content,
                    'like_count': like_count,
                    'publish_time': publish_time,
                    'publish_user_photo': publish_user_photo,
                    'publish_user_id': publish_user_id,
                    'publish_user': publish_user,
                    'id': id,
                    'reply_count': reply_count,
                    'reply_nodes': reply_nodes,
                    'parent_id': parent_id,
                    'ancestor_id': ancestor_id,
                    # 'spider_time':datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                }
                # data['reply_nodes'].append(thisnode)
                comments_data.append(thisnode)

            # only 20 comments can be fetched here, so no next-page handling; same for replies to comments

            data['reply_nodes'] = comments_data
            while len(self.result_list) > 600:
                time.sleep(1)
                print 'waiting for result_list to drain below 600'

            self.result_list.append(data)
Example #7
        def get_content_in_wenda_comments_more(id_replynodes, data=None):
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
            }
            try:
                if not id_replynodes['next_comment_url']:
                    url_comments_more = 'https://www.wukong.com/wenda/web/question/loadmorev1/?count=10&qid=' + \
                                        id_replynodes['id'] + '&offset=10&req_type=1'
                    response1 = get_response_and_text(url=url_comments_more,
                                                      headers=headers)
                    response_in_function = response1['response_in_function']
                    response_in_function_text = response1[
                        'response_in_function_text']

                else:
                    response1 = get_response_and_text(
                        url=id_replynodes['next_comment_url'], headers=headers)
                    response_in_function = response1['response_in_function']
                    response_in_function_text = response1[
                        'response_in_function_text']

            except Exception as e:
                print e
                return id_replynodes['reply_nodes']  # request failed; return whatever has been collected so far
            datajson = json.loads(response_in_function_text)
            for one_comment in datajson['data']['ans_list']:
                datasoup_content = BeautifulSoup(one_comment['content'],
                                                 'lxml')
                content = datasoup_content.text
                img_urls = []
                Re_find_img = re.compile(r'src=".*?"')
                img_urls_find_by_re = Re_find_img.findall(
                    one_comment['content'])
                for img_url in img_urls_find_by_re:
                    img_url_split = img_url.split('"')[1]
                    img_urls.append(img_url_split)
                like_count = one_comment['digg_count']
                id = one_comment['ansid']
                publish_time = one_comment['create_time']  # unix timestamp
                publish_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime(int(publish_time)))
                reply_count = one_comment['comment_count']
                publish_user_photo = one_comment['user']['avatar_url']
                publish_user = one_comment['user']['uname']
                publish_user_id = one_comment['user']['user_id']
                reply_nodes = get_content_in_wenda_comments_comments({
                    'id': id,
                    'reply_nodes': [],
                    'next_comment_url': None
                })
                parent_id = id_replynodes['id']
                ancestor_id = data['id']

                this_node = {
                    'publish_time': publish_time,
                    'content': content,
                    'like_count': like_count,
                    'id': id,
                    'reply_count': reply_count,
                    'publish_user_photo': publish_user_photo,
                    'publish_user': publish_user,
                    'publish_user_id': publish_user_id,
                    'reply_nodes': reply_nodes,
                    'ancestor_id': ancestor_id,
                    'parent_id': parent_id,
                    # 'spider_time':datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                }
                id_replynodes['reply_nodes'].append(this_node)

            if datajson['data']['has_more']:
                url_offset = response_in_function.url.split('&offset=')
                offset = int(url_offset[1].split('&')[0]) + 10
                url = url_offset[0] + '&offset=' + str(offset)
                id_replynodes['next_comment_url'] = url
                reply_nodes2 = get_content_in_wenda_comments_more(
                    id_replynodes)
                return reply_nodes2
            else:
                return id_replynodes['reply_nodes']
Example #8
        def get_content_in_wenda_comments_comments(id_replynodes, data=None):

            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
            }
            try:
                if not id_replynodes['next_comment_url']:
                    url_comments_more = 'https://www.wukong.com/wenda/web/comment/brow/?ansid=' + \
                                        id_replynodes['id'] + '&count=10&offset=0'
                    response1 = get_response_and_text(url=url_comments_more)
                    response_in_function = response1['response_in_function']
                    response_in_function_text = response1[
                        'response_in_function_text']

                else:
                    response1 = get_response_and_text(
                        url=id_replynodes['next_comment_url'], headers=headers)
                    response_in_function = response1['response_in_function']
                    response_in_function_text = response1[
                        'response_in_function_text']

                # break
            except Exception as e:
                print e
                return id_replynodes['reply_nodes']  # request failed; return whatever has been collected so far

            datajson_comment2 = json.loads(response_in_function_text)
            try:
                datajson_comment2['comments']
            except Exception as e:
                print e
                return id_replynodes['reply_nodes']  # no comments field in the response
            for comment2 in datajson_comment2['comments']:
                id = comment2['comment_id']
                like_count = comment2['digg_count']
                content = comment2['content']
                publish_user_id = comment2['user_info']['user_id']
                publish_user = comment2['user_info']['uname']
                publish_user_photo = comment2['user_info']['avatar_url']
                publish_time = comment2['create_time']
                publish_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime(int(publish_time)))
                ancestor_id = id_replynodes['ancestor_id']
                parent_id = id_replynodes['id']

                thisnode = {
                    'id': id,
                    'like_count': like_count,
                    'content': content,
                    'publish_user_id': publish_user_id,
                    'publish_user': publish_user,
                    'publish_user_photo': publish_user_photo,
                    'publish_time': publish_time,  # publish time
                    'parent_id': parent_id,
                    'ancestor_id': ancestor_id,
                    # 'spider_time':datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                }
                id_replynodes['reply_nodes'].append(thisnode)
            if datajson_comment2['has_more']:
                url_offset = response_in_function.url.split('&offset=')
                offset = int(url_offset[1].split('&')[0]) + 10
                url = url_offset[0] + '&offset=' + str(offset)
                id_replynodes['next_comment_url'] = url
                reply_nodes2 = get_content_in_wenda_comments_comments(
                    id_replynodes)
                return reply_nodes2
            else:
                return id_replynodes['reply_nodes']
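
Examples #7 and #8 both move to the next page of replies by splitting the current request URL on `&offset=` and adding 10 whenever `has_more` is set. The same step, pulled out as a standalone sketch (the helper name is ours, not the project's):

def bump_offset(url, step=10):
    # Split on the offset parameter, add `step`, and rebuild the URL.
    # As in the originals, anything after the offset parameter is dropped.
    head, tail = url.split('&offset=')
    return head + '&offset=' + str(int(tail.split('&')[0]) + step)

# bump_offset('https://www.wukong.com/wenda/web/comment/brow/?ansid=1&count=10&offset=0')
# -> 'https://www.wukong.com/wenda/web/comment/brow/?ansid=1&count=10&offset=10'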
Example #9
        def get_content_inside(data):
            url = data['url']
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
            }
            response1 = get_response_and_text(url=url, headers=headers)
            response_in_function = response1['response_in_function']
            response_in_function_text = response1['response_in_function_text']
            real_url = response_in_function.url
            if 'toutiao' not in real_url:
                logger_toutiao.log(level=logging.WARNING,
                                   msg='toutiao was not in thisurl---------' +
                                   real_url)
                return
            elif 'http://www.toutiao.com/api/pc/subject/' in real_url:
                logger_toutiao.log(
                    level=logging.WARNING,
                    msg=
                    'http://www.toutiao.com/api/pc/subject/ was in thisurl----------'
                    + real_url)
                return
            else:
                url = real_url

            Re_find_chineseTag = re.compile(r"chineseTag: '.*?'")

            #######################################################

            chineseTag = Re_find_chineseTag.findall(response_in_function_text)
            if chineseTag:
                try:
                    # print 'the length of response-------', len(response_in_function_text)
                    chineseTag = chineseTag[0].split("'")[1]
                    if chineseTag == '图片':
                        content_time_img = get_content_picture({
                            'response_in_function': response_in_function,
                            'response_in_function_text': response_in_function_text
                        })
                    elif chineseTag == '问答':
                        content_time_img = get_content_wenda(
                            htmldata={
                                'response_in_function': response_in_function,
                                'response_in_function_text': response_in_function_text,
                                'data': data
                            },
                            data=data)
                        return
                    else:
                        content_time_img = get_content_news({
                            'response_in_function': response_in_function,
                            'response_in_function_text': response_in_function_text
                        })
                except Exception as e:
                    print e, 'problem while locating the picture/Q&A category block'
                    logger_toutiao.log(level=logging.WARNING,
                                       msg={
                                           'where': 'problem while locating the category block',
                                           'content': e.message
                                       })
            else:
                logger_toutiao.log(level=logging.WARNING, msg=chineseTag)
                print chineseTag
                return
            # if the post is not a Q&A, execution continues here
            Re_find_itemId = re.compile(r'itemId: \'.*?\'')  # regular article
            Re_find_item_Id = re.compile(r'item_id:\'.*?\'')  # picture gallery
            if Re_find_itemId.findall(response_in_function_text):
                try:
                    item_id = Re_find_itemId.findall(
                        response_in_function_text)[0].split("'")[1]
                except Exception as e:
                    logger_toutiao.log(level=logging.WARNING,
                                       msg={
                                           'where': 'itemId matched but split failed',
                                           'content': Re_find_itemId.findall(
                                               response_in_function_text)[0]
                                       })
                    print e, 'itemId matched by the regex, but splitting it failed'
            else:
                try:
                    item_id = Re_find_item_Id.findall(
                        response_in_function_text)[0].split("'")[1]
                except Exception as e:
                    print e, 'no item_id value found (picture-type item_id)'
                    msg = {
                        'errormsg': e.message + ' no item_id value found (picture-type item_id)',
                        'htmldata': response_in_function_text,
                        'url': response_in_function.url,
                        'code': response_in_function.code,
                        'msg': response_in_function.msg
                    }
                    logger_toutiao.log(level=logging.WARNING, msg=msg)
                    return

            try:
                data['img_urls'] = content_time_img['img_urls']
                data['content'] = content_time_img['content']
                if len(content_time_img['publish_time']) < 12:
                    data['publish_time'] = content_time_img[
                        'publish_time'] + ' 00:00:00'
                else:
                    data['publish_time'] = content_time_img['publish_time']
                data['item_id'] = item_id
                data['reply_nodes'] = []
            except Exception as e:
                print e, 'problem while assembling the data dict'

            self.comments_url_list.append(data)
Example #10
        def get_comment_inside(data):
            headers = {
                'User-Agent':
                'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
            }
            comment_url = 'http://changyan.sohu.com/api/3/topic/liteload?&client_id=cysYw3AKM&page_size=30&hot_size=10&topic_source_id=' + \
                          data['id']

            response1 = get_response_and_text(url=comment_url, headers=headers)
            response_in_function = response1['response_in_function']
            response_in_function_text = response1['response_in_function_text']

            comments_data = []
            data_json = json.loads(response_in_function_text)
            reply_count_outside = data_json['cmt_sum']

            if data_json['comments']:
                data_json_comments = data_json['comments']

                for someone_comment in data_json_comments:
                    content = someone_comment['content']  # content
                    id = someone_comment['comment_id']  # id
                    publish_user_photo = someone_comment['passport'][
                        'img_url']  # publish_user_photo
                    publish_user = someone_comment['passport'][
                        'nickname']  # publish_user
                    publish_user_id = someone_comment['passport'][
                        'user_id']  # publish_user_id
                    create_time = someone_comment[
                        'create_time']  # publish_time
                    spider_time = datetime.datetime.now().strftime(
                        '%Y-%m-%d %H:%M:%S')
                    parent_id = data['id']
                    ancestor_id = data['id']
                    comments = someone_comment['comments']
                    reply_count = someone_comment['reply_count']
                    like_count = someone_comment['support_count']
                    dislike_count = someone_comment['oppose_count']
                    if comments:
                        # 'comments' is a list of parent comments (same Changyan field as in the later example)
                        parent_id = comments[0]['comment_id']

                    thiscomments = {
                        'content': content,
                        'id': id,
                        'publish_user_photo': publish_user_photo,
                        'publish_user': publish_user,
                        'publish_user_id': publish_user_id,
                        'create_time': create_time,
                        'spider_time': spider_time,
                        'parent_id': parent_id,
                        'ancestor_id': ancestor_id,
                        'reply_count': reply_count,
                        'like_count': like_count,
                        'dislike_count': dislike_count
                    }
                    comments_data.append(thiscomments)

            data['reply_nodes'] = comments_data
            data['reply_count'] = reply_count_outside
            while len(self.result_list) > 600:
                time.sleep(1)
                print 'waiting for result_list to drain below 600'
            self.result_list.append(data)
Example #11
        def get_content_inside(data):
            # Uses its own session inside the thread function, which keeps it thread-safe.
            # This fetches one page of the article; the similarly named get_content_inside_next_page fetches the following pages.
            url = data['url']
            headers = {
                'User-Agent':
                'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
            }
            response1 = get_response_and_text(url=url, headers=headers)
            response_in_function = response1['response_in_function']
            response_in_function_text = response1['response_in_function_text']
            datasoup = BeautifulSoup(response_in_function_text, 'lxml')

            Re_find_isimgpage = re.compile(r'\<ul class\=\"piclist\"\>')
            to_charge_is_picture = Re_find_isimgpage.findall(
                response_in_function_text)
            if to_charge_is_picture:  # picture page: hand off to the picture-handling branch
                print to_charge_is_picture, 'and is in deal_picture and the url is ---', data[
                    'url']
                content_and_img_urls = get_content_inside_picture(datasoup)
                img_urls = content_and_img_urls['img_urls']
                contentall = content_and_img_urls['content']

            else:
                content = ''
                img_urls = []
                for i in datasoup.select(
                        'body > div.scrollBox.mt10 > div.article > div.art_co.sau > p'
                ):
                    content += i.text
                # 8-3: added image scraping
                Re_find_img_url = re.compile(r'src=".*?"/\>')
                content_part_data = datasoup.select('div.article')
                if content_part_data:
                    data_find_by_re = Re_find_img_url.findall(
                        str(content_part_data[0]))
                    for url_img_re in data_find_by_re:
                        img_urls.append(url_img_re.split('"')[1])
                next_page_selector = datasoup.select(
                    'body > div.scrollBox.mt10 > div.article > div.mb10.mt5.fs14 > a.page-next.ml5'
                )
                contentall = ''
                if next_page_selector:
                    next_page_html = next_page_selector[0].get('href')
                    if next_page_html and len(next_page_html) > 3:
                        next_page_url = next_page_html
                        next_url = 'http://m.xilu.com' + next_page_url
                        data['url'] = next_url
                        content_and_img_urls2 = get_content_inside_next_page({
                            'content': content,
                            'nexturl': next_url,
                            'img_urls': img_urls
                        })
                        contentall += content_and_img_urls2['content']
                        img_urls3 = []
                        for i in content_and_img_urls2['img_urls']:
                            img_urls3.append(i)
                        for i in img_urls3:
                            img_urls.append(i)

                else:
                    contentall = content

            publish_time = data['publish_time']
            if publish_time == u'刚刚':
                publish_time = datetime.datetime.now().strftime(
                    '%Y-%m-%d %H:%M:%S')
            elif u'小时前' in publish_time:
                time_pass = int(publish_time.replace(u'小时前', ''))
                publish_time = (
                    datetime.datetime.now() -
                    timedelta(hours=time_pass)).strftime('%Y-%m-%d %H:%M:%S')
            elif u'分钟前' in publish_time:
                time_pass = int(publish_time.replace(u'分钟前', ''))
                publish_time = (
                    datetime.datetime.now() -
                    timedelta(minutes=time_pass)).strftime('%Y-%m-%d %H:%M:%S')
            elif '-' in publish_time and len(publish_time) == 5:
                publish_time = '2017-' + publish_time + ' 00:00:00'
            data['content'] = contentall
            data['img_urls'] = img_urls
            data['publish_time'] = publish_time
            data['url'] = url

            while len(self.comments_url_list) > 600:  # throttle: wait while the pending queue is too long
                time.sleep(1)
                print 'waiting for comments_url_list to drain below 600'
            self.comments_url_list.append(data)
Example #12
        def get_content_inside(data):
            # without de-duplication here this would never stop
            # only the first-pass logic is implemented here
            url = data['url']
            page_num = url.split('/')[-1]
            response1 = get_response_and_text(
                url=url, needupdate=True, update_info={'page_num': page_num})
            response_in_function = response1['response_in_function']
            response_in_function_text = response1['response_in_function_text']
            Re_find_sid = re.compile(r'sid=".*"')
            try:
                datasoup = BeautifulSoup(response_in_function_text, 'lxml')
            except Exception as e:
                print e
                return

            if ('class="swiper-container"' not in response_in_function_text
                    and 'class="content"' in response_in_function_text):  # text-style news article
                sid = Re_find_sid.findall(response_in_function_text)[0].split(
                    '"')[1]
                data['sid'] = sid

                datasoup = BeautifulSoup(response_in_function_text, 'lxml')
                for i in datasoup.select(
                        'body > div.content > div.neirong > h2'):
                    title = i.text
                for j in datasoup.select(
                        'body > div.content > div.neirong > p > span:nth-of-type(4)'
                ):
                    publish_time = j.text
                for k in datasoup.select(
                        'body > div.content > div.neirong > p > span:nth-of-type(3)'
                ):
                    publish_user = k.text.replace(' ', '').replace(
                        '\t', '').replace('\n',
                                          '').replace('\r',
                                                      '').replace(u'来源:', '')
                content = ''
                for l in datasoup.select(
                        'body > div.content > div.neirong > article > p'):
                    content += l.text
                img_urls = []
                neirong_content = datasoup.select(
                    'body > div.content > div.neirong')
                neirong_content = str(neirong_content)
                Re_find_img_url = re.compile(r'src=".*?"')
                img_find_by_re = Re_find_img_url.findall(neirong_content)
                for i in img_find_by_re:
                    img_urls.append(i.split('"')[1])
                try:
                    publish_time += ':00'
                except Exception as e:
                    print e
                data['title'] = title
                data['content'] = content
                data['publish_time'] = publish_time
                data['publish_user'] = publish_user
                data['reply_nodes'] = []
                data['spider_time'] = datetime.datetime.now().strftime(
                    '%Y-%m-%d %H:%M:%S')
                data['img_urls'] = img_urls
            elif 'class="swiper-container"' in response_in_function_text:  # probably a picture-gallery article
                content = ''
                img_urls = []
                for title_for in datasoup.select('body > div.content > h2'):
                    title = title_for.text
                for publish_time_for in datasoup.select(
                        'body > div.content > p.jieshao > span:nth-of-type(4)'
                ):
                    publish_time = publish_time_for.text + ':00'
                for publish_user_for in datasoup.select(
                        'body > div.content > p.jieshao > span:nth-of-type(3) > a'
                ):
                    publish_user = publish_user_for.text.replace(
                        ' ', '').replace('\t', '').replace('\n', '').replace(
                            '\r', '').replace(u'来源:', '')
                for content_for in datasoup.select(
                        'body > div.content > p.zongjie'):
                    content += content_for.text
                for img_url in datasoup.select(
                        'div.swiper-container > div.swiper-wrapper > div.swiper-slide > div.imgdiv > img'
                ):
                    img_urls.append(img_url.get('src'))
                try:
                    data['title'] = title
                    data['content'] = content
                    data['publish_time'] = publish_time
                    data['publish_user'] = publish_user
                    data['reply_nodes'] = []
                    data['spider_time'] = datetime.datetime.now().strftime(
                        '%Y-%m-%d %H:%M:%S')
                    data['img_urls'] = img_urls
                except Exception as e:
                    print e
                    return

            else:
                print url, '-----not in neirong and picture deal module'
                return

            while len(self.comments_url_list) > LEN_COMMENT_LIST:
                time.sleep(1)
            print data
            self.comments_url_list.append(data)
            pass
Example #13
        def get_comment_inside(data):  # also a two-stage design; the first request obtains the content id
            topicid = None
            cmspage_taotalnum = 1
            comments_data = []
            cmspagenum = 1

            # added later
            request_num = 1

            # comments_data=[]
            while True:
                # reply_count=0
                if not topicid:
                    comment_url_without_id = 'http://changyan.sohu.com/api/3/topic/liteload?&client_id=cyrHnxhFx&page_size=30&hot_size=5&topic_source_id='
                    comment_url = comment_url_without_id + data['sid']
                else:
                    comment_url = 'http://changyan.sohu.com/api/2/topic/comments?client_id=cyrHnxhFx&page_size=30&topic_id=' + str(
                        topicid) + '&page_no=' + str(request_num)

                response1 = get_response_and_text(url=comment_url)
                response_in_function = response1['response_in_function']
                response_in_function_text = response1[
                    'response_in_function_text']
                try:
                    data_json = json.loads(response_in_function_text)
                except Exception as e:
                    print e
                    return
                if data_json['comments']:
                    data_json_comments = data_json['comments']
                    cmspage_taotalnum = data_json['cmt_sum']
                    topicid = data_json['topic_id']

                    for someone_comment in data_json_comments:
                        content = someone_comment['content']  # content
                        id = someone_comment['comment_id']  # id
                        publish_user_photo = someone_comment['passport'][
                            'img_url']  # publish_user_photo
                        publish_user = someone_comment['passport'][
                            'nickname']  # publish_user
                        publish_user_id = someone_comment['passport'][
                            'user_id']  # publish_user_id
                        create_time = someone_comment[
                            'create_time']  # publish_time
                        create_time = time.strftime(
                            '%Y-%m-%d %H:%M:%S',
                            time.localtime(int(int(create_time / 1000))))
                        spider_time = datetime.datetime.now().strftime(
                            '%Y-%m-%d %H:%M:%S')
                        like_count = someone_comment['support_count']
                        parent_id = data['id']  # mark: what exactly should these two fields hold?
                        ancestor_id = data['id']
                        this_comments = someone_comment['comments']
                        if this_comments:
                            parent_id = this_comments[0]['comment_id']
                        # use a stack to handle this type of nested comment (8-16)
                        # for this_comments
                        cmspagenum += 1

                        thiscomments = {
                            'content': content,
                            'id': id,
                            'publish_user_photo': publish_user_photo,
                            'publish_user': publish_user,
                            'publish_user_id': publish_user_id,
                            'publish_time': create_time,
                            'spider_time': spider_time,
                            'like_count': like_count,
                            'parent_id': parent_id,
                            'ancestor_id': ancestor_id,
                        }
                        comments_data.append(thiscomments)

                    if cmspagenum >= cmspage_taotalnum - 1:
                        break

                request_num += 1

            data['reply_nodes'] = comments_data
            data['reply_count'] = cmspage_taotalnum
            while len(self.result_list) > 600:
                time.sleep(1)
                print 'waiting for result_list to drain below 600'

            # final cleanup: drop the field that is no longer needed
            del data['sid']
            self.result_list.append(data)
Example #14
        def get_comment_inside(data):  # requesting the next page only needs cmspagenum incremented by 1
            # initialisation
            topicid = None
            cmspage_taotalnum = 0
            comments_data = []
            cmspagenum = 1
            while True:
                headers = {
                    'User-Agent':
                    'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
                }
                if not topicid:
                    comment_url = 'https://apiv2.sohu.com/api/topic/load?page_size=10&topic_source_id=' + \
                              str(data['cmsid'])+'&page_no=10'
                else:
                    comment_url = 'https://apiv2.sohu.com/api/comment/list?page_size=10&topic_id=' + str(
                        topicid) + '&page_no=' + str(cmspagenum)
                response1 = get_response_and_text(url=comment_url,
                                                  headers=headers)
                response_in_function = response1['response_in_function']
                response_in_function_text = response1[
                    'response_in_function_text']
                data_json = json.loads(response_in_function_text)
                if cmspagenum == 1:
                    try:
                        cmspage_taotalnum = data_json['jsonObject']['cmt_sum']
                        data['reply_count'] = cmspage_taotalnum
                    except:
                        try:
                            cmspage_taotalnum = data_json['jsonObject'][
                                'outer_cmt_sum']
                        except:
                            cmspage_taotalnum = 0  # this response does not include the count
                for one_comment in data_json['jsonObject']['comments']:
                    id = one_comment['comment_id']
                    content = one_comment['content']
                    url = response_in_function.url
                    publish_time = one_comment['create_time']
                    publish_time = int(publish_time) / 1000
                    time_format = '%Y-%m-%d %H:%M:%S'
                    publish_time_stamp_9 = time.localtime(float(publish_time))
                    publish_time = time.strftime(time_format,
                                                 publish_time_stamp_9)
                    publish_user_id = one_comment['user_id']
                    like_count = one_comment['support_count']
                    reply_count = one_comment['reply_count']
                    try:
                        publish_user = one_comment['passport']['nickname']
                    except Exception as e:
                        publish_user = '******'
                    publish_user_photo = one_comment['passport']['img_url']
                    ancestor_id = data['id']
                    print ancestor_id, '---------', data['id']
                    print '---------------------', id, '----------------------'
                    if ancestor_id != data['id']:
                        print ancestor_id, '---------', data['id']
                    parent_id = data['id']
                    if one_comment['comments']:
                        parent_id = one_comment['comments'][0]['comment_id']

                    thisnode = {
                        'id': id,
                        'content': content,
                        'url': url,
                        'publish_time': publish_time,
                        'publish_user_id': publish_user_id,
                        'like_count': like_count,
                        'reply_count': reply_count,
                        'publish_user': publish_user,
                        'publish_user_photo': publish_user_photo,
                        'ancestor_id': ancestor_id,
                        'parent_id': parent_id
                    }

                    comments_data.append(thisnode)
                cmspagenum += 1
                if cmspagenum <= int(
                        cmspage_taotalnum / 10) + 1:  # 10 results per page, so compute the page count
                    if not topicid:
                        topicid = data_json['jsonObject']['topic_id']
                    # get_comment_inside(data,cmspagenum,comments_data,topicid,cmspage_taotalnum)
                else:
                    data['reply_nodes'] = comments_data
                    del data['cmsid']  # drop the id that was generated only for fetching comments
                    self.result_list.append(data)
                    break