Example #1
def get_weibo_info(each, html):
    wb_data = WeiboData()

    try:
        wb_data.weibo_id = each['mid']
    except (AttributeError, IndexError, TypeError):
        parser.error(
            'Failed to get weibo id, the page source is {}'.format(html))
        return None

    imgs = list()
    imgs_url = list()
    try:
        imgs = str(
            each.find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('li'))
        imgs_url = list(map(url_filter, re.findall(r"src=\"(.+?)\"", imgs)))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    if IMG_ALLOW and imgs and imgs_url:
        app.send_task('tasks.downloader.download_img_task',
                      args=(wb_data.weibo_id, imgs_url),
                      queue='download_queue',
                      routing_key='for_download')
        wb_data.weibo_img_path = IMG_PATH
    else:
        wb_data.weibo_img_path = ''

    # TODO: no test data found for video yet
    try:
        a_tag = str(
            each.find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('a'))
        extracted_url = urllib.parse.unquote(
            re.findall(r"full_url=(.+?)&", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.device = each.find(attrs={
            'class': 'from'
        }).find(attrs={
            'rel': 'nofollow'
        }).text
    except AttributeError:
        wb_data.device = ''

    try:
        # TODO: normalize the date; raw values include noise like "今天 XX:XX" and "X分钟前" (N minutes ago)
        wb_data.create_time = each.find(attrs={
            'class': 'from'
        }).find(attrs={
            'target': '_blank'
        }).text.strip()
        wb_data.weibo_url = 'https:' + each.find(attrs={
            'class': 'from'
        }).find(attrs={'target': '_blank'})['href']
        wb_data.uid = each.find(attrs={
            'class': 'from'
        }).find(attrs={'target': '_blank'})['href'].split('/')[3]
    except (AttributeError, KeyError):
        wb_data.create_time = ''
        wb_data.weibo_url = ''
        wb_data.uid = ''

    try:
        wb_data.repost_num = int(
            each.find(attrs={
                'class': 'card-act'
            }).find_all('li')[1].find('a').text.split(' ')[-1])
    except (AttributeError, ValueError):
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(
            each.find(attrs={
                'class': 'card-act'
            }).find_all('li')[2].find('a').text.split(' ')[-1])
    except (AttributeError, ValueError):
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(
            each.find(attrs={
                'class': 'card-act'
            }).find_all('li')[3].find('a').find('em').text)
    except (AttributeError, ValueError):
        wb_data.praise_num = 0

    if '展开全文' in str(each):  # '展开全文' = "expand full text"
        is_all_cont = 1
        try:
            wb_data.weibo_cont = each.find(
                attrs={
                    'node-type': 'feed_list_content_full'
                }).text.strip()
        except Exception as why:
            parser.error(
                'Failed to get weibo cont, the error is {}, the page source is {}'
                .format(why, html))
            return None
    else:
        is_all_cont = 1
        try:
            wb_data.weibo_cont = each.find(attrs={
                'node-type': 'feed_list_content'
            }).text.strip()
        except Exception as why:
            parser.error(
                'Failed to get weibo cont, the error is {}, the page source is {}'
                .format(why, html))
            return None
    return wb_data, is_all_cont
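This example (like Examples #5, #9 and #14) calls a module-level url_filter helper that is not shown here. Examples #3, #7, #10 and #15 define the same helper inline, so a plausible module-level version, with PROTOCOL and ORIGIN as assumed stand-ins for the project's config values, would be:

PROTOCOL = 'https'  # hypothetical values; the real project reads these from its config
ORIGIN = 'http'


def url_filter(url):
    # Prefix scheme-relative URLs such as "//wx1.sinaimg.cn/..." with the
    # configured protocol; URLs that already carry a scheme pass through.
    return ':'.join([PROTOCOL, url]) if PROTOCOL not in url and ORIGIN not in url else url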
Example #2
def get_weibo_info_detail(each, html):
    wb_data = WeiboData()

    user_cont = each.find(attrs={'class': 'face'})
    user_info = str(user_cont.find('a'))
    user_pattern = 'id=(\\d+)&amp'
    m = re.search(user_pattern, user_info)
    if m:
        wb_data.uid = m.group(1)
    else:
        parser.warning(
            'Failed to extract the user id, the page source is {}'.format(html))
        return None

    weibo_pattern = 'mid=(\\d+)'
    m = re.search(weibo_pattern, str(each))
    if m:
        wb_data.weibo_id = m.group(1)
    else:
        parser.warning(
            'Failed to extract the weibo id, the page source is {}'.format(html))
        return None

    time_url = each.find(attrs={'node-type': 'feed_list_item_date'})
    wb_data.create_time = time_url.get('title', '')
    wb_data.weibo_url = time_url.get('href', '')
    if 'weibo.com' not in wb_data.weibo_url:
        wb_data.weibo_url = 'http://weibo.com{}'.format(wb_data.weibo_url)

    wb_data.weibo_cont = each.find(attrs={
        'node-type': 'feed_content'
    }).find(attrs={
        'node-type': 'feed_list_content'
    }).text.strip()
    try:
        wb_data.device = each.find(attrs={
            'class': 'WB_from'
        }).find(attrs={
            'action-type': 'app_source'
        }).text
    except Exception as e:
        parser.error('Failed to parse the device info, the error is {}'.format(e))
        wb_data.device = ''

    try:
        wb_data.repost_num = int(
            each.find(attrs={
                'action-type': 'fl_forward'
            }).find_all('em')[1].text)
    except Exception:
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(
            each.find(attrs={
                'action-type': 'fl_comment'
            }).find_all('em')[1].text)
    except Exception:
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(
            each.find(attrs={
                'action-type': 'fl_like'
            }).find_all('em')[1].text)
    except Exception:
        wb_data.praise_num = 0
    return wb_data
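Every example populates a WeiboData model whose definition is not included. For experimenting with these parsers in isolation, a minimal stand-in can be pieced together from the attributes the examples assign (the defaults below are guesses, not the project's actual schema):

class WeiboData:
    # Minimal stand-in; field names are taken from the assignments in
    # these examples, default values are assumptions.
    def __init__(self):
        self.weibo_id = ''
        self.uid = ''
        self.weibo_cont = ''
        self.weibo_img = ''
        self.weibo_img_path = ''
        self.weibo_preview_img = ''
        self.weibo_video = ''
        self.device = ''
        self.create_time = ''
        self.weibo_url = ''
        self.is_origin = 1
        self.weibo_forward_id = ''
        self.is_delete = 0
        self.repost_num = 0
        self.comment_num = 0
        self.praise_num = 0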
Example #3
def get_weibo_info_detail(each, html):
    wb_data = WeiboData()

    user_cont = each.find(attrs={'class': 'face'})
    user_info = str(user_cont.find('a'))
    user_pattern = 'id=(\\d+)&amp'
    m = re.search(user_pattern, user_info)
    if m:
        wb_data.uid = m.group(1)
    else:
        parser.warning(
            "Failed to get the user id, the page source is {}".format(html))
        return None

    weibo_pattern = 'mid=(\\d+)'
    m = re.search(weibo_pattern, str(each))
    if m:
        wb_data.weibo_id = m.group(1)
    else:
        parser.warning(
            "Failed to get the weibo id, the page source is {}".format(html))
        return None

    time_url = each.find(attrs={'node-type': 'feed_list_item_date'})
    wb_data.create_time = time_url.get('title', '')
    wb_data.weibo_url = time_url.get('href', '')
    if ROOT_URL not in wb_data.weibo_url:
        wb_data.weibo_url = '{}://{}{}'.format(PROTOCOL, ROOT_URL,
                                               wb_data.weibo_url)

    def url_filter(url):
        return ':'.join([
            PROTOCOL, url
        ]) if PROTOCOL not in url and ORIGIN not in url else url

    try:
        imgs = str(
            each.find(attrs={
                'node-type': 'feed_content'
            }).find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('img'))
        imgs_url = map(url_filter, re.findall(r"src=\"(.+?)\"", imgs))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    try:
        li = str(
            each.find(attrs={
                'node-type': 'feed_content'
            }).find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('li'))
        extracted_url = urllib.parse.unquote(
            re.findall(r"video_src=(.+?)&", li)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.weibo_cont = each.find(attrs={
            'node-type': 'feed_content'
        }).find(attrs={
            'node-type': 'feed_list_content'
        }).text.strip()
    except Exception:
        wb_data.weibo_cont = ''

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1

    try:
        wb_data.device = each.find(attrs={
            'class': 'WB_from S_txt2'
        }).find(attrs={
            'action-type': 'app_source'
        }).text
    except Exception:
        wb_data.device = ''

    try:
        wb_data.repost_num = int(
            each.find(attrs={
                'action-type': 'fl_forward'
            }).find_all('em')[1].text)
    except Exception:
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(
            each.find(attrs={
                'action-type': 'fl_comment'
            }).find_all('em')[1].text)
    except Exception:
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(
            each.find(attrs={
                'action-type': 'fl_like'
            }).find_all('em')[1].text)
    except Exception:
        wb_data.praise_num = 0
    return wb_data, is_all_cont
Example #4
def get_weibo_info_detail(each, html):
    wb_data = WeiboData()

    user_cont = each.find(attrs={'class': 'face'})
    user_info = str(user_cont.find('a'))
    user_pattern = 'id=(\\d+)&amp'
    m = re.search(user_pattern, user_info)
    if m:
        wb_data.uid = m.group(1)
    else:
        parser.warning(
            'Failed to extract the user id, the page source is {}'.format(html))
        return None

    weibo_pattern = 'mid=(\\d+)'
    m = re.search(weibo_pattern, str(each))
    if m:
        wb_data.weibo_id = m.group(1)
    else:
        parser.warning(
            'Failed to extract the weibo id, the page source is {}'.format(html))
        return None

    time_url = each.find(attrs={'node-type': 'feed_list_item_date'})
    wb_data.create_time = time_url.get('title', '')
    wb_data.weibo_url = time_url.get('href', '')
    if 'weibo.com' not in wb_data.weibo_url:
        wb_data.weibo_url = 'http://weibo.com{}'.format(wb_data.weibo_url)

    wb_data.weibo_cont = each.find(attrs={
        'node-type': 'feed_content'
    }).find(attrs={
        'node-type': 'feed_list_content'
    }).text.strip()

    # test for weibo_pic capture
    # Check whether this weibo carries any images before processing them;
    # find_all returns an empty list (it never raises) when nothing matches.
    weibo_pic = []
    pic_list = each.find_all(attrs={'action-type': 'fl_pics'})
    for pic in pic_list:
        wb_pic = WeiboPic()
        wb_pic.uid = wb_data.uid
        wb_pic.weibo_id = wb_data.weibo_id
        wb_pic.pic_url = pic.find('img').get('src')
        # wb_pic.url_hash = md5Encode(wb_pic.pic_url)
        wb_pic.url_hash = re.match(r'.*/thumb150/(.*)\.jpg',
                                   wb_pic.pic_url).group(1)
        wb_pic.dl_flag = 0
        wb_pic.judge_flag = 0
        weibo_pic.append(wb_pic)
    # end

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1

    try:
        wb_data.device = each.find(attrs={
            'class': 'WB_from'
        }).find(attrs={
            'action-type': 'app_source'
        }).text
    except Exception as e:
        parser.error('Failed to parse the device info, the error is {}'.format(e))
        wb_data.device = ''

    try:
        wb_data.repost_num = int(
            each.find(attrs={
                'action-type': 'fl_forward'
            }).find_all('em')[1].text)
    except Exception:
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(
            each.find(attrs={
                'action-type': 'fl_comment'
            }).find_all('em')[1].text)
    except Exception:
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(
            each.find(attrs={
                'action-type': 'fl_like'
            }).find_all('em')[1].text)
    except Exception:
        wb_data.praise_num = 0

    return wb_data, is_all_cont, weibo_pic
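Note that the url_hash extraction above raises AttributeError when a pic_url does not match the /thumb150/<name>.jpg pattern, which aborts the whole parse. A slightly defensive variant (illustrative only, not the project's code) falls back to the raw URL:

import re


def thumb150_hash(pic_url):
    # Extract the image name from ".../thumb150/<name>.jpg"; fall back to
    # the full URL when the pattern does not match instead of raising.
    m = re.match(r'.*/thumb150/(.*)\.jpg', pic_url)
    return m.group(1) if m else pic_url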
Example #5
def get_weibo_info(each, html):
    wb_data = WeiboData()

    user_cont = each.find(attrs={'class': 'face'})
    usercard = user_cont.find('img').get('usercard', '')
    # this only works for logged-in users
    if not usercard:
        return None
    wb_data.uid = usercard.split('&')[0][3:]

    try:
        wb_data.weibo_id = each.find(attrs={
            'class': 'WB_screen'
        }).find('a').get('action-data')[4:]
    except (AttributeError, IndexError, TypeError):
        return None

    try:
        wb_data.weibo_url = each.find(
            attrs={'node-type': 'feed_list_item_date'})['href']
    except Exception as e:
        parser.error(
            'Failed to get weibo url, the error is {}, the source page is {}'.
            format(e, html))
        return None

    imgs = list()
    imgs_url = list()
    try:
        imgs = str(
            each.find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('li'))
        imgs_url = list(map(url_filter, re.findall(r"src=\"(.+?)\"", imgs)))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    if IMG_ALLOW and imgs and imgs_url:
        app.send_task('tasks.downloader.download_img_task',
                      args=(wb_data.weibo_id, imgs_url),
                      queue='download_queue',
                      routing_key='for_download')
        wb_data.weibo_img_path = IMG_PATH
    else:
        wb_data.weibo_img_path = ''

    try:
        a_tag = str(
            each.find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('a'))
        extracted_url = urllib.parse.unquote(
            re.findall(r"full_url=(.+?)&", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''
    try:
        wb_data.device = each.find(attrs={
            'class': 'feed_from'
        }).find(attrs={
            'rel': 'nofollow'
        }).text
    except AttributeError:
        wb_data.device = ''

    try:
        create_time = each.find(
            attrs={'node-type': 'feed_list_item_date'})['date']
    except (AttributeError, KeyError):
        wb_data.create_time = ''
    else:
        create_time = int(create_time) / 1000  # the 'date' attribute is in milliseconds
        create_time = datetime.fromtimestamp(create_time)
        wb_data.create_time = create_time.strftime("%Y-%m-%d %H:%M")

    try:
        feed_action = each.find(attrs={'class': 'feed_action'})
    except Exception as why:
        parser.error(
            'Failed to get feed_action, the error is {}, the page source is {}'.
            format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(
                feed_action.find(attrs={
                    'action-type': 'feed_list_like'
                }).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        wb_data.weibo_cont = each.find(attrs={
            'class': 'comment_txt'
        }).text.strip()
    except Exception as why:
        parser.error(
            'Failed to get weibo cont, the error is {}, the page source is {}'.
            format(why, html))
        return None

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont
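Several examples (#5, #6, #8, #9, #10 and #15) rely on a shared get_feed_info(feed_infos, keyword) helper that is not shown. Judging from its call sites (the <li> elements of the action bar plus a label such as '转发' (repost) or '评论' (comment)), a sketch consistent with that usage might be:

def get_feed_info(feed_infos, name):
    # Find the action-bar item whose text carries the given label and
    # return the trailing count; a bare label (no number yet) counts as 0.
    for info in feed_infos:
        if name in info.text:
            num = info.text.strip().replace(name, '').strip()
            return int(num) if num.isdigit() else 0
    return 0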
Example #6
def get_weibo_info(each, html):
    wb_data = WeiboData()
    try:
        user_cont = each.find(attrs={'class': 'face'})
        user_info = user_cont.find('a')
        m = re.match(user_pattern, user_info.img.get('usercard'))

        if m:
            wb_data.uid = m.group(1)
        else:
            parser.warning(
                'Failed to extract the user id, the page source is {}'.format(html))
            return None

    except Exception as why:
        parser.error(
            'Failed to parse the user info, the error is {}, the page source is {}'
            .format(why, html))
        return None

    wb_data.weibo_id = each.find(attrs={
        'class': 'WB_screen'
    }).find('a').get('action-data')[4:]
    try:
        wb_data.weibo_url = each.find(
            attrs={'node-type': 'feed_list_item_date'})['href']
    except Exception as e:
        parser.error(
            'Failed to parse the weibo url, the error is {}, the page source is {}'
            .format(e, html))
        return None

    try:
        wb_data.device = each.find(attrs={
            'class': 'feed_from'
        }).find(attrs={
            'rel': 'nofollow'
        }).text
    except AttributeError:
        wb_data.device = ''

    try:
        create_time = each.find(
            attrs={'node-type': 'feed_list_item_date'})['date']
    except (AttributeError, KeyError):
        wb_data.create_time = ''
    else:
        create_time = int(create_time) / 1000  # the 'date' attribute is in milliseconds
        create_time = datetime.fromtimestamp(create_time)
        wb_data.create_time = create_time.strftime("%Y-%m-%d %H:%M")

    try:
        feed_action = each.find(attrs={'class': 'feed_action'})
    except Exception as why:
        parser.error(
            'Failed to parse feed_action, the error is {}, the page source is {}'
            .format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(
                feed_action.find(attrs={
                    'action-type': 'feed_list_like'
                }).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        wb_data.weibo_cont = each.find(attrs={
            'class': 'comment_txt'
        }).text.strip()
    except Exception as why:
        parser.error(
            'Failed to parse the weibo content, the error is {}, the page source is {}'
            .format(why, html))
        return None

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont
Example #7
def get_weibo_info_detail(each, html):
    wb_data = WeiboData()

    user_cont = each.find(attrs={'class': 'face'})
    user_info = str(user_cont.find('a'))
    user_pattern = 'id=(\\d+)&amp'
    m = re.search(user_pattern, user_info)
    if m:
        wb_data.uid = m.group(1)
    else:
        parser.warning("fail to get user'sid, the page source is{}".format(html))
        return None

    weibo_pattern = 'mid=(\\d+)'
    m = re.search(weibo_pattern, str(each))
    if m:
        wb_data.weibo_id = m.group(1)
    else:
        parser.warning("fail to get weibo's id,the page source {}".format(html))
        return None

    time_url = each.find(attrs={'node-type': 'feed_list_item_date'})
    wb_data.create_time = time_url.get('title', '')
    wb_data.weibo_url = time_url.get('href', '')
    if ROOT_URL not in wb_data.weibo_url:
        wb_data.weibo_url = '{}://{}{}'.format(PROTOCOL, ROOT_URL, wb_data.weibo_url)

    def url_filter(url):
        return ':'.join([PROTOCOL, url]) if PROTOCOL not in url and ORIGIN not in url else url

    try:
        imgs = str(each.find(attrs={'node-type': 'feed_content'})
                   .find(attrs={'node-type': 'feed_list_media_prev'})
                   .find_all('img'))
        imgs_url = map(url_filter, re.findall(r"src=\"(.+?)\"", imgs))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    try:
        li = str(each.find(attrs={'node-type': 'feed_content'})
                 .find(attrs={'node-type': 'feed_list_media_prev'})
                 .find_all('li'))
        extracted_url = urllib.parse.unquote(re.findall(r"video_src=(.+?)&", li)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_content'}).find(
            attrs={'node-type': 'feed_list_content'}).text.strip()
    except Exception:
        wb_data.weibo_cont = ''

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1

    try:
        wb_data.device = each.find(attrs={'class': 'WB_from S_txt2'}).find(attrs={'action-type': 'app_source'}).text
    except Exception:
        wb_data.device = ''

    try:
        wb_data.repost_num = int(each.find(attrs={'action-type': 'fl_forward'}).find_all('em')[1].text)
    except Exception:
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(each.find(attrs={'action-type': 'fl_comment'}).find_all('em')[1].text)
    except Exception:
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(each.find(attrs={'action-type': 'fl_like'}).find_all('em')[1].text)
    except Exception:
        wb_data.praise_num = 0
    return wb_data, is_all_cont
Example #8
def get_weibo_info_1(each, html):
    wb_data = WeiboData()

    try:
        wb_data.weibo_id = each['mid']
    except Exception as why:
        parser.error(
            'Failed to get weibo id, the error is {}, the page source is {}'.
            format(why, html))
        return None

    try:
        feed_action = each.find(attrs={'class': 'card-act'})
    except Exception as why:
        parser.error(
            'Failed to get feed_action, the error is {},the page source is {}'.
            format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(
                feed_action.find(attrs={
                    'action-type': 'feed_list_like'
                }).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0
        try:
            m = re.search(
                'uid=(\\d+)',
                str(
                    feed_action.find(
                        attrs={'action-type': 'feed_list_forward'})
                    ['action-data']))
            wb_data.uid = m.group(1)
        except Exception as why:
            parser.error(
                'Failed to get the uid, the error is {}, the page source is {}'
                .format(why, html))
            return None
    try:
        a_tag = each.find(attrs={'class': 'from'})
        wb_data.weibo_url = "https:" + a_tag.a['href']
        create_time = a_tag.a.text.replace("\n", "").strip()
        if "秒前" in create_time:
            create_time = (
                datetime.datetime.now() -
                datetime.timedelta(seconds=int(create_time.replace("秒前", "")))
            ).strftime("%Y-%m-%d %H:%M")
        elif "分钟前" in create_time:
            create_time = (
                datetime.datetime.now() -
                datetime.timedelta(minutes=int(create_time.replace("分钟前", "")))
            ).strftime("%Y-%m-%d %H:%M")
        elif "今天" in create_time:
            create_time = datetime.datetime.now().strftime(
                "%Y-%m-%d") + " " + create_time.replace("今天", "")
        else:
            create_time = str(
                datetime.datetime.now().year) + '-' + create_time.replace(
                    '月', '-').replace('日', '')
        wb_data.create_time = create_time
        if len(a_tag.contents) >= 4:
            wb_data.device = a_tag.contents[3].text
        else:
            wb_data.device = ''
    except Exception as why:
        parser.error(why)
        wb_data.weibo_url = ''

    try:
        wb_data.weibo_cont = each.find(attrs={
            'node-type': 'feed_list_content'
        }).text.strip()
    except Exception as why:
        parser.error(
            'Failed to get weibo cont, the error is {}, the page source is {}'.
            format(why, html))
        return None

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont
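None of the examples show how the each argument is produced. A minimal driver, assuming the project walks a Weibo search page with BeautifulSoup and that each result card carries an action-type='feed_list_item' attribute (an assumption suggested by the each['mid'] lookup in Examples #1 and #14; the real selector depends on the page version), could look like:

from bs4 import BeautifulSoup


def parse_search_page(html):
    # Hypothetical driver for the parsers above.
    soup = BeautifulSoup(html, 'html.parser')
    weibo_datas = []
    for each in soup.find_all(attrs={'action-type': 'feed_list_item'}):
        rs = get_weibo_info(each, html)
        if rs is not None:
            weibo_datas.append(rs[0])  # drop the is_all_cont flag
    return weibo_datas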
Example #9
def get_weibo_info(each, html):
    # print ("----------------------")
    wb_data = WeiboData()
    # print ("-------" * 10)
    # print(each)
    # print ("#$#" * 10)
    # print(html)
    # print ("-----" * 10)

    user_cont = each.find(attrs={'class': 'card-feed'})
    user_avator = user_cont.find(attrs={'class': 'avator'})
    #usercard = user_cont.find('img').get('usercard', '')
    usercard = user_avator.find('a').get('href', '')
    # this only works for logged-in users
    if not usercard:
        return None
    wb_data.uid = usercard.split('?')[0][12:]
    # print ("uid", wb_data.uid)

    try:
        wb_data.weibo_id = each.find(attrs={'title': '赞'}).get('action-data')[4:]
        # print ("weibo_id", wb_data.weibo_id)
    except (AttributeError, IndexError, TypeError):
        return None

    try:
        wb_data.weibo_url = each.find(attrs={"class": "content"}).find(attrs={"class": "from"}).find('a').get("href", "")[2:]
        # wb_data.weibo_url = each.find(attrs={'node-type': 'feed_list_item_date'})['href']
        # print ("weibo_url", wb_data.weibo_url)
    except Exception as e:
        parser.error('Failed to get weibo url, the error is {}, the source page is {}'.format(e, html))
        return None

    imgs = list()
    imgs_url = list()
    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = list(map(url_filter, re.findall(r"src=\"(.+?)\"", imgs)))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    if IMG_ALLOW and imgs and imgs_url:
        app.send_task('tasks.downloader.download_img_task', args=(wb_data.weibo_id, imgs_url),
                      queue='download_queue', routing_key='for_download')
        wb_data.weibo_img_path = IMG_PATH
    else:
        wb_data.weibo_img_path = ''

    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''
    try:
        wb_data.device = each.find(attrs={'class': 'feed_from'}).find(attrs={'rel': 'nofollow'}).text.strip()
    except AttributeError:
        wb_data.device = ''

    try:
        create_time = each.find(attrs={"class": "content"}).find(attrs={"class": "from"}).find('a').text.strip()
        if "年" not in create_time and "月" in create_time:
            # the year is omitted for the current year; hardcoded to 2019 here
            create_time = "2019年" + create_time
        elif "今天" in create_time:
            # "今天 HH:MM" = today at HH:MM; substitute today's date in the
            # "%Y年%m月%d日" form that strptime expects below
            create_time = create_time.replace(
                "今天", datetime.datetime.now().strftime("%Y年%m月%d日 ")).strip()

        create_time = datetime.datetime.strptime(create_time, "%Y年%m月%d日 %H:%M")
        wb_data.create_time = create_time.strftime("%Y-%m-%d %H:%M")
    except Exception:
        traceback.print_exc()
        wb_data.create_time = ''

    try:
        feed_action = each.find(attrs={'class': 'card-act'})
    except Exception as why:
        parser.error('Failed to get feed_action, the error is {}, the page source is {}'.format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(feed_action.find(attrs={'action-type': 'feed_list_like'}).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        try:
            # prefer the expanded full text when present
            wb_data.weibo_cont = each.find(attrs={"node-type": "feed_list_content_full"}).text.strip()
        except AttributeError:
            wb_data.weibo_cont = each.find(attrs={'class': 'txt'}).text.strip()
    except Exception as why:
        parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
        return None

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont
Example #10
def get_weibo_info(each, html):
    wb_data = WeiboData()
    user_cont = each.find(attrs={'class': 'face'})
    user_info = user_cont.find('a')
    m = re.match(USER_PATTERN, user_info.img.get('usercard'))

    if m:
        wb_data.uid = m.group(1)
    else:
        parser.warning("fail to get user'sid, the page source is{}".format(html))
        return None
    try:
        wb_data.weibo_id = each.find(attrs={'class': 'WB_screen'}).find('a').get('action-data')[4:]
    except (AttributeError, IndexError, TypeError):
        return None

    try:
        wb_data.weibo_url = each.find(attrs={'node-type': 'feed_list_item_date'})['href']
    except Exception as e:
        parser.error('Failed to get weibo url, the error is {}, the source page is {}'.format(e, html))
        return None

    def url_filter(url):
        return ':'.join([PROTOCOL, url]) if PROTOCOL not in url and ORIGIN not in url else url

    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = map(url_filter, re.findall(r"src=\"(.+?)\"", imgs))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''
    try:
        wb_data.device = each.find(attrs={'class': 'feed_from'}).find(attrs={'rel': 'nofollow'}).text
    except AttributeError:
        wb_data.device = ''

    try:
        create_time = each.find(attrs={'node-type': 'feed_list_item_date'})['date']
    except (AttributeError, KeyError):
        wb_data.create_time = ''
    else:
        create_time = int(create_time) / 1000  # the 'date' attribute is in milliseconds
        create_time = datetime.fromtimestamp(create_time)
        wb_data.create_time = create_time.strftime("%Y-%m-%d %H:%M")

    try:
        feed_action = each.find(attrs={'class': 'feed_action'})
    except Exception as why:
        parser.error('Failed to get feed_action, the error is {}, the page source is {}'.format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(feed_action.find(attrs={'action-type': 'feed_list_like'}).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        wb_data.weibo_cont = each.find(attrs={'class': 'comment_txt'}).text.strip()
    except Exception as why:
        parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
        return None

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont
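USER_PATTERN here (and in Example #15) is matched against the img tag's usercard attribute, which on these pages starts with the numeric uid, e.g. 'id=1234567890&usercard=...'. A plausible definition, mirroring the inline user_pattern = 'id=(\\d+)&amp' the other examples use (the project defines the real one elsewhere):

USER_PATTERN = r'id=(\d+)'  # assumed definition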
Example #11
def get_weibo_forward_info_detail(mid, each, html):
    wb_data = WeiboData()

    if str(each).find('抱歉,此微博已被作者删除') != -1:  # "Sorry, this weibo has been deleted by its author"
        wb_data.weibo_id = mid
        wb_data.is_delete = 1
        return wb_data, 0

    try:
        each = each.find(attrs={'node-type': 'feed_list_forwardContent'})
        user_cont = each.find(attrs={'class': 'WB_info'})
        user_info = str(user_cont.find('a'))
    except AttributeError:
        # a missing node above means this is not a parsable forward card
        return None

    user_pattern = 'id=(\\d+)&amp'
    m = re.search(user_pattern, user_info)
    if m:
        wb_data.uid = m.group(1)
    else:
        parser.warning(
            "Failed to get the user id, the page source is {}".format(html))
        return None

    weibo_pattern = 'mid=(\\d+)'
    m = re.search(weibo_pattern, str(each))
    if m:
        wb_data.weibo_id = m.group(1)
    else:
        parser.warning(
            "Failed to get the weibo id, the page source is {}".format(html))
        return None

    time_url = each.find(attrs={'node-type': 'feed_list_item_date'})
    if time_url is None:
        return None

    wb_data.create_time = time_url.get('title', '')
    wb_data.weibo_url = time_url.get('href', '')
    if ROOT_URL not in wb_data.weibo_url:
        wb_data.weibo_url = '{}://{}{}'.format(PROTOCOL, ROOT_URL,
                                               wb_data.weibo_url)

    def url_filter(url):
        return ':'.join([
            PROTOCOL, url
        ]) if PROTOCOL not in url and ORIGIN not in url else url

    try:
        full_imgs = each.find(attrs={
            'node-type': 'feed_list_media_prev'
        }).find(attrs={'node-type': 'fl_pic_list'})
        if full_imgs.has_attr('action-data'):
            url_param = full_imgs['action-data']
            full_imgs_url = urllib.parse.parse_qs(url_param)['clear_picSrc'][0]
            full_imgs_url_arr = full_imgs_url.split(',')
            for i, url in enumerate(full_imgs_url_arr):
                full_imgs_url_arr[i] = "https:" + url
            wb_data.weibo_img = ';'.join(full_imgs_url_arr)
    except Exception:
        wb_data.weibo_img = ''

    try:
        imgs = str(
            each.find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('img'))
        imgs_url = map(url_filter, re.findall(r"src=\"(.+?)\"", imgs))
        wb_data.weibo_preview_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_preview_img = ''

    try:
        video = str(
            each.find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('video'))
        video_url = map(url_filter, re.findall(r"src=\"(.+?)\"", video))
        wb_data.weibo_video = ';'.join(video_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        li = str(
            each.find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('li'))
        extracted_url = urllib.parse.unquote(
            re.findall(r"video_src=(.+?)&", li)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.weibo_cont = each.find(attrs={
            'node-type': 'feed_list_reason'
        }).text.strip()
    except Exception:
        wb_data.weibo_cont = ''

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1

    try:
        wb_data.device = each.find(attrs={
            'class': 'WB_from S_txt2'
        }).find(attrs={
            'action-type': 'app_source'
        }).text
    except Exception:
        wb_data.device = ''

    try:
        wb_data.repost_num = int(
            each.find(attrs={
                'action-type': 'fl_forward'
            }).find_all('em')[1].text)
    except Exception:
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(
            each.find(attrs={
                'action-type': 'fl_comment'
            }).find_all('em')[1].text)
    except Exception:
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(
            each.find(attrs={
                'action-type': 'fl_like'
            }).find_all('em')[1].text)
    except Exception:
        wb_data.praise_num = 0

    return wb_data, is_all_cont
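The full-size image branch above (also in Example #12) decodes the card's action-data attribute with urllib.parse.parse_qs. A small worked example with a made-up payload shows why the [0] index and the comma split are both needed:

import urllib.parse

# Made-up action-data value in the shape the code expects.
action_data = 'uid=123&clear_picSrc=%2F%2Fwx1.sinaimg.cn%2Fa.jpg%2C%2F%2Fwx2.sinaimg.cn%2Fb.jpg'
qs = urllib.parse.parse_qs(action_data)  # each key maps to a list of values
full_imgs_url_arr = qs['clear_picSrc'][0].split(',')
print(['https:' + url for url in full_imgs_url_arr])
# ['https://wx1.sinaimg.cn/a.jpg', 'https://wx2.sinaimg.cn/b.jpg']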
Example #12
def get_weibo_info_detail(each, html):

    wb_data = WeiboData()

    user_cont = each.find(attrs={'class': 'face'})
    user_info = str(user_cont.find('a'))
    user_pattern = 'id=(\\d+)&amp'
    m = re.search(user_pattern, user_info)
    if m:
        wb_data.uid = m.group(1)
    else:
        parser.warning(
            "Failed to get the user id, the page source is {}".format(html))
        return None

    mid = each['mid']
    weibo_pattern = 'mid=(\\d+)'
    m = re.search(weibo_pattern, str(each))
    if m:
        wb_data.weibo_id = m.group(1)
    else:
        parser.warning(
            "Failed to get the weibo id, the page source is {}".format(html))
        return None

    if each.has_attr('omid'):
        omid = each['omid']
        wb_data.is_origin = 0
        wb_data.weibo_forward_id = omid

    time_url = each.find(attrs={'node-type': 'feed_list_item_date'})
    wb_data.create_time = time_url.get('title', '')
    wb_data.weibo_url = time_url.get('href', '')
    if ROOT_URL not in wb_data.weibo_url:
        wb_data.weibo_url = '{}://{}{}'.format(PROTOCOL, ROOT_URL,
                                               wb_data.weibo_url)

    def url_filter(url):
        return ':'.join([
            PROTOCOL, url
        ]) if PROTOCOL not in url and ORIGIN not in url else url

    try:
        full_imgs = each.find(attrs={
            'node-type': 'feed_list_media_prev'
        }).find(attrs={'node-type': 'fl_pic_list'})
        if full_imgs.has_attr('action-data'):
            url_param = full_imgs['action-data']
            full_imgs_url = urllib.parse.parse_qs(url_param)['clear_picSrc'][0]
            full_imgs_url_arr = full_imgs_url.split(',')
            for i, url in enumerate(full_imgs_url_arr):
                full_imgs_url_arr[i] = "https:" + url
            wb_data.weibo_img = ';'.join(full_imgs_url_arr)
    except Exception:
        wb_data.weibo_img = ''

    try:
        imgs = str(
            each.find(attrs={
                'node-type': 'feed_content'
            }).find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('img'))
        imgs_url = map(url_filter, re.findall(r"src=\"(.+?)\"", imgs))
        wb_data.weibo_preview_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_preview_img = ''

    try:
        li = str(
            each.find(attrs={
                'node-type': 'feed_content'
            }).find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('li'))
        extracted_url = urllib.parse.unquote(
            re.findall(r"video_src=(.+?)&", li)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.weibo_cont = each.find(attrs={
            'node-type': 'feed_content'
        }).find(attrs={
            'node-type': 'feed_list_content'
        }).text.strip()
    except Exception:
        wb_data.weibo_cont = ''

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1

    try:
        wb_data.device = each.find(attrs={
            'class': 'WB_from S_txt2'
        }).find(attrs={
            'action-type': 'app_source'
        }).text
    except Exception:
        wb_data.device = ''

    try:
        wb_data.repost_num = int(
            each.find(attrs={
                'action-type': 'fl_forward'
            }).find_all('em')[1].text)
    except Exception:
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(
            each.find(attrs={
                'action-type': 'fl_comment'
            }).find_all('em')[1].text)
    except Exception:
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(
            each.find(attrs={
                'action-type': 'fl_like'
            }).find_all('em')[1].text)
    except Exception:
        wb_data.praise_num = 0

    praise = each.find(
        attrs={'suda-uatrack': "key=tblog_profile_v6&value=like_title"})
    if praise:
        praise_m = re.search(r"weibo.com/(\d+)/like", praise['href'])
        if praise_m:
            uid = praise_m.group(1)
            wb_praise = WeiboPraise()
            wb_praise.user_id = uid
            wb_praise.weibo_id = wb_data.weibo_id
            PraiseOper.add_one(wb_praise)
    return wb_data, is_all_cont
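Example #12 additionally records each liker through WeiboPraise and PraiseOper.add_one, neither of which is shown. Minimal stand-ins for running the example outside the project (the real PraiseOper presumably persists to a database):

class WeiboPraise:
    def __init__(self):
        self.user_id = ''
        self.weibo_id = ''


class PraiseOper:
    store = []

    @classmethod
    def add_one(cls, praise):
        # Stand-in: just collect the praise records in memory.
        cls.store.append(praise)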
Example #13
def get_weibo_info(each, html):
    wb_data = WeiboData()
    try:
        try:
            user_cont = each.find(attrs={'class': 'face'})
            user_info = user_cont.find('a')
            m = re.match(user_pattern, user_info.img.get('usercard'))

            if m:
                wb_data.uid = m.group(1)
            else:
                parser.warning(
                    'Failed to extract the user id, the page source is {}'.format(html))
                return None

        except Exception as why:
            parser.error(
                'Failed to parse the user info, the error is {}, the page source is {}'
                .format(why, html))
            return None

        wb_data.weibo_id = each.find(attrs={
            'class': 'WB_screen'
        }).find('a').get('action-data')[4:]
        try:
            wb_data.weibo_url = each.find(
                attrs={'node-type': 'feed_list_item_date'})['href']
        except Exception as e:
            parser.error(
                'Failed to parse the weibo url, the error is {}, the page source is {}'
                .format(e, html))
            return None

        try:
            feed_action = each.find(attrs={'class': 'feed_action'})
            wb_data.create_time = each.find(
                attrs={'node-type': 'feed_list_item_date'})['title']

        except Exception as why:
            parser.error(
                'Failed to parse feed_action, the error is {}, the page source is {}'
                .format(why, html))
            wb_data.device = ''

        else:
            try:
                wb_data.repost_num = int(
                    feed_action.find(attrs={
                        'action-type': 'feed_list_forward'
                    }).find('em').text)
            except (AttributeError, ValueError):
                wb_data.repost_num = 0
            try:
                wb_data.comment_num = int(
                    feed_action.find(attrs={
                        'action-type': 'feed_list_comment'
                    }).find('em').text)
            except (AttributeError, ValueError):
                wb_data.comment_num = 0
            try:
                wb_data.praise_num = int(
                    feed_action.find(attrs={
                        'action-type': 'feed_list_like'
                    }).find('em').text)
            except (AttributeError, ValueError):
                wb_data.praise_num = 0

        try:
            wb_data.weibo_cont = each.find(attrs={
                'class': 'comment_txt'
            }).text.strip()
        except Exception as why:
            parser.error(
                'Failed to parse the weibo content, the error is {}, the page source is {}'
                .format(why, html))
            return None

    except Exception as why:
        parser.error(
            'Failed to parse the whole weibo entry, the error is {}, the page source is {}'
            .format(why, html))
        return None
    else:
        return wb_data
Example #14
def get_weibo_info(each, html):
    wb_data = WeiboData()

    try:
        wb_data.weibo_id = each['mid']
    except (AttributeError, IndexError, TypeError):
        parser.error('Failed to get weibo id, the page source is {}'.format(html))
        return None

    imgs = list()
    imgs_url = list()
    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = list(map(url_filter, re.findall(r"src=\"(.+?)\"", imgs)))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    if IMG_ALLOW and imgs and imgs_url:
        app.send_task('tasks.downloader.download_img_task', args=(wb_data.weibo_id, imgs_url),
                      queue='download_queue', routing_key='for_download')
        wb_data.weibo_img_path = IMG_PATH
    else:
        wb_data.weibo_img_path = ''

    # TODO: no test data found for video yet
    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.device = each.find(attrs={'class': 'from'}).find(attrs={'rel': 'nofollow'}).text
    except AttributeError:
        wb_data.device = ''

    try:
        # TODO: normalize the date; raw values include noise like "今天 XX:XX" and "X分钟前" (N minutes ago)
        wb_data.create_time = each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'}).text.strip()
        wb_data.weibo_url = 'https:'+each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'})['href']
        wb_data.uid = each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'})['href'].split('/')[3]
    except (AttributeError, KeyError):
        wb_data.create_time = ''
        wb_data.weibo_url = ''
        wb_data.uid = ''

    try:
        wb_data.repost_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[0].find('a').text.split('/')[-1])
    except (AttributeError, ValueError):
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[1].find('a').text.split('/')[-1])
    except (AttributeError, ValueError):
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[2].find('a').find('em').text)
    except (AttributeError, ValueError):
        wb_data.praise_num = 0

    if '展开全文' in str(each):
        is_all_cont = 1
        try:
            wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_list_content_full'}).text.strip()
        except Exception as why:
            parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
            return None
    else:
        is_all_cont = 1
        try:
            wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_list_content'}).text.strip()
        except Exception as why:
            parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
            return None
    return wb_data, is_all_cont
Example #15
def get_weibo_info(each, html):
    wb_data = WeiboData()
    user_cont = each.find(attrs={'class': 'face'})
    user_info = user_cont.find('a')
    m = re.match(USER_PATTERN, user_info.img.get('usercard'))

    if m:
        wb_data.uid = m.group(1)
    else:
        parser.warning("fail to get user'sid, the page source is{}".format(html))
        return None
    try:
        wb_data.weibo_id = each.find(attrs={'class': 'WB_screen'}).find('a').get('action-data')[4:]
    except (AttributeError, IndexError, TypeError):
        return None

    try:
        wb_data.weibo_url = each.find(attrs={'node-type': 'feed_list_item_date'})['href']
    except Exception as e:
        parser.error('Failed to get weibo url, the error is {}, the source page is {}'.format(e, html))
        return None

    def url_filter(url):
        return ':'.join([PROTOCOL, url]) if PROTOCOL not in url and ORIGIN not in url else url

    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = map(url_filter, re.findall(r"src=\"(.+?)\"", imgs))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''
    try:
        wb_data.device = each.find(attrs={'class': 'feed_from'}).find(attrs={'rel': 'nofollow'}).text
    except AttributeError:
        wb_data.device = ''

    try:
        create_time = each.find(attrs={'node-type': 'feed_list_item_date'})['date']
    except (AttributeError, KeyError):
        wb_data.create_time = ''
    else:
        create_time = int(create_time) / 1000  # the 'date' attribute is in milliseconds
        create_time = datetime.fromtimestamp(create_time)
        wb_data.create_time = create_time.strftime("%Y-%m-%d %H:%M")

    try:
        feed_action = each.find(attrs={'class': 'feed_action'})
    except Exception as why:
        parser.error('Failed to get feed_action, the error is {}, the page source is {}'.format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(feed_action.find(attrs={'action-type': 'feed_list_like'}).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        wb_data.weibo_cont = each.find(attrs={'class': 'comment_txt'}).text.strip()
    except Exception as why:
        parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
        return None

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont
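Examples #5, #6, #10 and #15 all convert the card's date attribute from a millisecond timestamp. A quick standalone check of that conversion:

from datetime import datetime

ms = 1556074800000                       # hypothetical 'date' attribute value
dt = datetime.fromtimestamp(ms / 1000)   # fromtimestamp expects seconds
print(dt.strftime("%Y-%m-%d %H:%M"))     # local-time result, e.g. 2019-04-24 11:00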