Python url_filterの例、utils.url_filter Pythonの例

コード例 #1

0

ファイルを表示

def get_headimg(html):
    """
    Get the head img url of current user
    :param html: page source
    :return: head img url, or '' when the photo node or its src is missing
    """
    soup = BeautifulSoup(_get_header(html), 'html.parser')
    try:
        photo = soup.find(attrs={'class': 'photo_wrap'}).find(attrs={'class': 'photo'})
        return url_filter(photo['src'])
    except (AttributeError, TypeError, KeyError):
        # AttributeError: no 'photo_wrap' container was found (find gave None).
        # TypeError: the container has no 'photo' node, so None is subscripted.
        # KeyError: the photo node carries no 'src' attribute.
        return ''

コード例 #2

0

ファイルを表示

ファイル: weibo_detail.py プロジェクト: Iamnvincible/WeiboSpider

def get_weibo_clip(feed_content, is_repost):
    """
    Get the video attached to a weibo.

    :param feed_content: html node under feed_content of a single weibo
    :param is_repost: whether the weibo is a repost
    :return: link of the attached video; '' for a repost or when no link
        can be extracted
    """
    if is_repost:
        return ""
    try:
        media = feed_content.find(attrs={'node-type': 'feed_list_media_prev'})
        li = str(media.find_all('li'))
        extracted_url = urllib.parse.unquote(
            re.findall(r"video_src=(.+?)&amp;", li)[0])
        return url_filter(extracted_url)
    except Exception:
        # Best effort: a missing media block or a missing video_src parameter
        # just means there is no clip. Unlike a bare ``except``, this lets
        # KeyboardInterrupt and SystemExit propagate.
        return ""

コード例 #3

0

ファイルを表示

def get_weibo_info(each, html):
    """
    Build a WeiboData record from the node of a single weibo.

    :param each: node of one weibo; it is indexed by attribute name and
        searched with ``find``/``find_all``
    :param html: source of the whole page, only used in error messages
    :return: tuple ``(wb_data, is_all_cont)``, or ``None`` when the weibo id
        or the weibo content cannot be read
    """
    wb_data = WeiboData()

    try:
        wb_data.weibo_id = each['mid']
    except (AttributeError, IndexError, KeyError, TypeError):
        # KeyError covers a node that simply has no 'mid' attribute.
        parser.error(
            'Failed to get weibo id, the page source is {}'.format(html))
        return None

    imgs = list()
    imgs_url = list()
    try:
        imgs = str(
            each.find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('li'))
        imgs_url = list(map(url_filter, re.findall(r"src=\"(.+?)\"", imgs)))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        # No usable media block: treat the weibo as having no pictures.
        wb_data.weibo_img = ''

    if IMG_ALLOW and imgs and imgs_url:
        # Hand the picture urls to the download queue.
        app.send_task('tasks.downloader.download_img_task',
                      args=(wb_data.weibo_id, imgs_url),
                      queue='download_queue',
                      routing_key='for_download')
        wb_data.weibo_img_path = IMG_PATH
    else:
        wb_data.weibo_img_path = ''

    # TODO: no test data for video has been found yet
    try:
        a_tag = str(
            each.find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('a'))
        extracted_url = urllib.parse.unquote(
            re.findall(r"full_url=(.+?)&amp;", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.device = each.find(attrs={
            'class': 'from'
        }).find(attrs={
            'rel': 'nofollow'
        }).text
    except AttributeError:
        wb_data.device = ''

    try:
        # TODO: normalize the date; the text may contain noise such as
        # "today XXX" or "X minutes ago"
        date_link = each.find(attrs={
            'class': 'from'
        }).find(attrs={'target': '_blank'})
        wb_data.create_time = date_link.text.strip()
        wb_data.weibo_url = 'https:' + date_link['href']
        wb_data.uid = date_link['href'].split('/')[3]
    except (AttributeError, IndexError, KeyError, TypeError):
        # IndexError: the href has fewer than four '/'-separated parts.
        # The fallback resets the same attributes the success path sets
        # (the original reset a stray 'weibo_uid' instead of 'uid').
        wb_data.create_time = ''
        wb_data.weibo_url = ''
        wb_data.uid = ''

    # The counters are read from the <li> items of the 'card-act' block;
    # IndexError covers a block with fewer items than expected.
    try:
        wb_data.repost_num = int(
            each.find(attrs={
                'class': 'card-act'
            }).find_all('li')[1].find('a').text.split(' ')[-1])
    except (AttributeError, IndexError, ValueError):
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(
            each.find(attrs={
                'class': 'card-act'
            }).find_all('li')[2].find('a').text.split(' ')[-1])
    except (AttributeError, IndexError, ValueError):
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(
            each.find(attrs={
                'class': 'card-act'
            }).find_all('li')[3].find('a').find('em').text)
    except (AttributeError, IndexError, ValueError):
        wb_data.praise_num = 0

    # When the node contains the "expand full text" marker, the text is read
    # from the '..._full' node instead of the regular content node.
    if '展开全文' in str(each):
        cont_node_type = 'feed_list_content_full'
    else:
        cont_node_type = 'feed_list_content'
    # NOTE(review): the flag is 1 on both paths, exactly as before.
    is_all_cont = 1
    try:
        wb_data.weibo_cont = each.find(attrs={
            'node-type': cont_node_type
        }).text.strip()
    except Exception as why:
        parser.error(
            'Failed to get weibo cont, the error is {}, the page source is {}'
            .format(why, html))
        return None
    return wb_data, is_all_cont

コード例 #4

0

ファイルを表示

ファイル: search.py プロジェクト: cptBTptpwbct/weibo

def get_weibo_info(each, html):
    """
    Build a WeiboData record from the node of a single weibo.

    :param each: node of one weibo, searched with ``find``/``find_all``
    :param html: source of the whole page, only used in error messages
    :return: tuple ``(wb_data, is_all_cont)`` where ``is_all_cont`` is 0 when
        the node contains the "expand full text" marker and 1 otherwise;
        ``None`` when the user card, weibo id, weibo url or weibo content
        cannot be read
    """
    wb_data = WeiboData()

    try:
        usercard = each.find(attrs={'class': 'face'}).find('img').get(
            'usercard', '')
    except AttributeError:
        # The 'face' block or its <img> is missing; previously this crashed.
        return None
    # this only for login user
    if not usercard:
        return None
    # Takes the part before the first '&' and drops its first three
    # characters (presumably an 'id=' prefix — TODO confirm).
    wb_data.uid = usercard.split('&')[0][3:]

    try:
        wb_data.weibo_id = each.find(attrs={
            'class': 'WB_screen'
        }).find('a').get('action-data')[4:]
    except (AttributeError, IndexError, TypeError):
        return None

    try:
        wb_data.weibo_url = each.find(
            attrs={'node-type': 'feed_list_item_date'})['href']
    except Exception as e:
        parser.error(
            'Failed to get weibo url, the error is {}, the source page is {}'.
            format(e, html))
        return None

    imgs = list()
    imgs_url = list()
    try:
        imgs = str(
            each.find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('li'))
        imgs_url = list(map(url_filter, re.findall(r"src=\"(.+?)\"", imgs)))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        # No usable media block: treat the weibo as having no pictures.
        wb_data.weibo_img = ''

    if IMG_ALLOW and imgs and imgs_url:
        # Hand the picture urls to the download queue.
        app.send_task('tasks.downloader.download_img_task',
                      args=(wb_data.weibo_id, imgs_url),
                      queue='download_queue',
                      routing_key='for_download')
        wb_data.weibo_img_path = IMG_PATH
    else:
        wb_data.weibo_img_path = ''

    try:
        a_tag = str(
            each.find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('a'))
        extracted_url = urllib.parse.unquote(
            re.findall(r"full_url=(.+?)&amp;", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''
    try:
        wb_data.device = each.find(attrs={
            'class': 'feed_from'
        }).find(attrs={
            'rel': 'nofollow'
        }).text
    except AttributeError:
        wb_data.device = ''

    try:
        raw_timestamp = each.find(
            attrs={'node-type': 'feed_list_item_date'})['date']
        # The timestamp unit differs from what fromtimestamp expects, hence
        # the division by 1000.
        created = datetime.fromtimestamp(int(raw_timestamp) / 1000)
    except (AttributeError, KeyError, TypeError, ValueError, OverflowError,
            OSError):
        # Missing node/attribute, a non-numeric value or an out-of-range
        # timestamp all leave the creation time empty.
        wb_data.create_time = ''
    else:
        wb_data.create_time = created.strftime("%Y-%m-%d %H:%M")

    try:
        # ``find`` returns None instead of raising when nothing matches, so
        # ``find_all`` is kept inside the try: a missing block is then logged
        # rather than crashing with an uncaught AttributeError.
        feed_action = each.find(attrs={'class': 'feed_action'})
        feed_infos = feed_action.find_all('li')
    except Exception as why:
        parser.error(
            'Failed to get feed_action, the error is {},the page source is {}'.
            format(why, each))
    else:
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(
                feed_action.find(attrs={
                    'action-type': 'feed_list_like'
                }).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        wb_data.weibo_cont = each.find(attrs={
            'class': 'comment_txt'
        }).text.strip()
    except Exception as why:
        parser.error(
            'Failed to get weibo cont, the error is {}, the page source is {}'.
            format(why, html))
        return None

    # The "expand full text" marker means only part of the content is here.
    is_all_cont = 0 if '展开全文' in str(each) else 1
    return wb_data, is_all_cont

コード例 #5

0

ファイルを表示

ファイル: test_utils.py プロジェクト: ResolveWang/WeiboSpider

def test_url_filter(url, expect):
    """Check that ``url_filter`` turns ``url`` into the value ``expect``."""
    filtered = url_filter(url)
    assert filtered == expect

コード例 #6

0

ファイルを表示

ファイル: test_utils.py プロジェクト: Doraying1230/Python-Study

def test_url_filter(url, expect):
    """Verify that filtering ``url`` yields exactly ``expect``."""
    actual = url_filter(url)
    assert actual == expect

コード例 #7

0

ファイルを表示

ファイル: search.py プロジェクト: Endlex-net/weibospider

def get_weibo_info(each, html):
    """
    Build a WeiboData record from the node of a single weibo.

    :param each: node of one weibo, searched with ``find``/``find_all``
    :param html: source of the whole page, only used in error messages
    :return: tuple ``(wb_data, is_all_cont)`` where ``is_all_cont`` is 0 when
        the node contains the "expand full text" marker and 1 otherwise;
        ``None`` when the user link, weibo id, weibo url or weibo content
        cannot be read
    """
    wb_data = WeiboData()

    try:
        user_cont = each.find(attrs={'class': 'card-feed'})
        user_avator = user_cont.find(attrs={'class': 'avator'})
        usercard = user_avator.find('a').get('href', '')
    except AttributeError:
        # One of the nested nodes is missing; previously this crashed.
        return None
    # this only for login user
    if not usercard:
        return None
    # Keeps the part before '?' and drops its first 12 characters
    # (presumably a '//weibo.com/' prefix — TODO confirm).
    wb_data.uid = usercard.split('?')[0][12:]

    try:
        wb_data.weibo_id = each.find(attrs={'title': '赞'}).get('action-data')[4:]
    except (AttributeError, IndexError, TypeError):
        return None

    try:
        # [2:] drops the first two characters of the href
        # (presumably a leading '//' — TODO confirm).
        wb_data.weibo_url = each.find(attrs={"class": "content"}).find(attrs={"class": "from"}).find('a').get("href", "")[2:]
    except Exception as e:
        parser.error('Failed to get weibo url, the error is {}, the source page is {}'.format(e, html))
        return None

    imgs = list()
    imgs_url = list()
    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = list(map(url_filter, re.findall(r"src=\"(.+?)\"", imgs)))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        # No usable media block: treat the weibo as having no pictures.
        wb_data.weibo_img = ''

    if IMG_ALLOW and imgs and imgs_url:
        # Hand the picture urls to the download queue.
        app.send_task('tasks.downloader.download_img_task', args=(wb_data.weibo_id, imgs_url),
                      queue='download_queue', routing_key='for_download')
        wb_data.weibo_img_path = IMG_PATH
    else:
        wb_data.weibo_img_path = ''

    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&amp;", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''
    try:
        wb_data.device = each.find(attrs={'class': 'feed_from'}).find(attrs={'rel': 'nofollow'}).text.strip()
    except AttributeError:
        wb_data.device = ''

    try:
        create_time = each.find(attrs={"class": "content"}).find(attrs={"class": "from"}).find('a').text.strip()
        now = datetime.datetime.now()
        if "年" not in create_time and "月" in create_time:
            # A date without a year gets the current year
            # (the original hard-coded "2019年").
            create_time = "{}年".format(now.year) + create_time
        elif "今天" in create_time:
            # str.replace returns a new string, so the result must be kept;
            # the date is written in the format parsed by strptime below.
            today = "{}年{}月{}日 ".format(now.year, now.month, now.day)
            create_time = create_time.replace("今天", today)

        create_time = datetime.datetime.strptime(create_time, "%Y年%m月%d日 %H:%M")
        wb_data.create_time = create_time.strftime("%Y-%m-%d %H:%M")
    except Exception:
        # Texts in any other shape cannot be parsed and leave the field empty.
        traceback.print_exc()
        wb_data.create_time = ''

    try:
        # ``find`` returns None instead of raising when nothing matches, so
        # ``find_all`` is kept inside the try: a missing block is then logged
        # rather than crashing with an uncaught AttributeError.
        feed_action = each.find(attrs={'class': 'card-act'})
        feed_infos = feed_action.find_all('li')
    except Exception as why:
        parser.error('Failed to get feed_action, the error is {},the page source is {}'.format(why, each))
    else:
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(feed_action.find(attrs={'action-type': 'feed_list_like'}).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        try:
            # Prefer the full-content node when it exists.
            wb_data.weibo_cont = each.find(attrs={"node-type": "feed_list_content_full"}).text.strip()
        except Exception:
            # Otherwise fall back to the regular text node.
            wb_data.weibo_cont = each.find(attrs={'class': 'txt'}).text.strip()
    except Exception as why:
        parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
        return None

    # The "expand full text" marker means only part of the content is here.
    is_all_cont = 0 if '展开全文' in str(each) else 1
    return wb_data, is_all_cont

コード例 #8

0

ファイルを表示

ファイル: search.py プロジェクト: ResolveWang/WeiboSpider

def get_weibo_info(each, html):
    """
    Build a WeiboData record from the node of a single weibo.

    :param each: node of one weibo; it is indexed by attribute name and
        searched with ``find``/``find_all``
    :param html: source of the whole page, only used in error messages
    :return: tuple ``(wb_data, is_all_cont)``, or ``None`` when the weibo id
        or the weibo content cannot be read
    """
    wb_data = WeiboData()

    try:
        wb_data.weibo_id = each['mid']
    except (AttributeError, IndexError, KeyError, TypeError):
        # KeyError covers a node that simply has no 'mid' attribute.
        parser.error('Failed to get weibo id, the page source is {}'.format(html))
        return None

    imgs = list()
    imgs_url = list()
    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = list(map(url_filter, re.findall(r"src=\"(.+?)\"", imgs)))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        # No usable media block: treat the weibo as having no pictures.
        wb_data.weibo_img = ''

    if IMG_ALLOW and imgs and imgs_url:
        # Hand the picture urls to the download queue.
        app.send_task('tasks.downloader.download_img_task', args=(wb_data.weibo_id, imgs_url),
                      queue='download_queue', routing_key='for_download')
        wb_data.weibo_img_path = IMG_PATH
    else:
        wb_data.weibo_img_path = ''

    # TODO: no test data for video has been found yet
    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&amp;", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.device = each.find(attrs={'class': 'from'}).find(attrs={'rel': 'nofollow'}).text
    except AttributeError:
        wb_data.device = ''

    try:
        # TODO: normalize the date; the text may contain noise such as
        # "today XXX" or "X minutes ago"
        date_link = each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'})
        wb_data.create_time = date_link.text.strip()
        wb_data.weibo_url = 'https:' + date_link['href']
        wb_data.uid = date_link['href'].split('/')[3]
    except (AttributeError, IndexError, KeyError, TypeError):
        # IndexError: the href has fewer than four '/'-separated parts.
        # The fallback resets the same attributes the success path sets
        # (the original reset a stray 'weibo_uid' instead of 'uid').
        wb_data.create_time = ''
        wb_data.weibo_url = ''
        wb_data.uid = ''

    # The counters are read from the <li> items of the 'card-act' block;
    # IndexError covers a block with fewer items than expected.
    try:
        wb_data.repost_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[0].find('a').text.split('/')[-1])
    except (AttributeError, IndexError, ValueError):
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[1].find('a').text.split('/')[-1])
    except (AttributeError, IndexError, ValueError):
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[2].find('a').find('em').text)
    except (AttributeError, IndexError, ValueError):
        wb_data.praise_num = 0

    # When the node contains the "expand full text" marker, the text is read
    # from the '..._full' node instead of the regular content node.
    if '展开全文' in str(each):
        cont_node_type = 'feed_list_content_full'
    else:
        cont_node_type = 'feed_list_content'
    # NOTE(review): the flag is 1 on both paths, exactly as before.
    is_all_cont = 1
    try:
        wb_data.weibo_cont = each.find(attrs={'node-type': cont_node_type}).text.strip()
    except Exception as why:
        parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
        return None
    return wb_data, is_all_cont

コード例 #9

0

ファイルを表示

def get_weibo_info(each, html):
    """
    Build a WeiboData record from the node of a single weibo.

    :param each: node of one weibo, searched with ``find``/``find_all``
    :param html: source of the whole page, only used in error messages
    :return: tuple ``(wb_data, is_all_cont)`` where ``is_all_cont`` is 0 when
        the node contains the "expand full text" marker and 1 otherwise;
        ``None`` when the user link, weibo id, weibo url or weibo content
        cannot be read
    """
    wb_data = WeiboData()

    try:
        user_href = each.find(attrs={'class': 'face'}).find('a').get('href', '')
    except AttributeError:
        # The 'face' block or its link is missing; previously this crashed.
        # An empty href takes the same "no user id" path below.
        user_href = ''
    if not user_href:
        parser.warning('Failed to get user id')
        return None
    # Drops the first three characters of the url path
    # (presumably a '/u/' prefix — TODO confirm).
    wb_data.uid = parse_url(user_href).path[3:]

    try:
        wb_data.weibo_id = each.find(attrs={
            'class': 'WB_screen'
        }).find('a').get('action-data')[4:]
    except (AttributeError, IndexError, TypeError):
        return None

    try:
        wb_data.weibo_url = each.find(
            attrs={'node-type': 'feed_list_item_date'})['href']
    except Exception as e:
        parser.error(
            'Failed to get weibo url, the error is {}, the source page is {}'.
            format(e, html))
        return None
    try:
        imgs = str(
            each.find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('li'))
        imgs_url = map(url_filter, re.findall(r"src=\"(.+?)\"", imgs))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        # No usable media block: treat the weibo as having no pictures.
        wb_data.weibo_img = ''

    try:
        a_tag = str(
            each.find(attrs={
                'node-type': 'feed_list_media_prev'
            }).find_all('a'))
        extracted_url = urllib.parse.unquote(
            re.findall(r"full_url=(.+?)&amp;", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''
    try:
        wb_data.device = each.find(attrs={
            'class': 'feed_from'
        }).find(attrs={
            'rel': 'nofollow'
        }).text
    except AttributeError:
        wb_data.device = ''

    try:
        raw_timestamp = each.find(
            attrs={'node-type': 'feed_list_item_date'})['date']
        # The timestamp unit differs from what fromtimestamp expects, hence
        # the division by 1000.
        created = datetime.fromtimestamp(int(raw_timestamp) / 1000)
    except (AttributeError, KeyError, TypeError, ValueError, OverflowError,
            OSError):
        # Missing node/attribute, a non-numeric value or an out-of-range
        # timestamp all leave the creation time empty.
        wb_data.create_time = ''
    else:
        wb_data.create_time = created.strftime("%Y-%m-%d %H:%M")

    try:
        # ``find`` returns None instead of raising when nothing matches, so
        # ``find_all`` is kept inside the try: a missing block is then logged
        # rather than crashing with an uncaught AttributeError.
        feed_action = each.find(attrs={'class': 'feed_action'})
        feed_infos = feed_action.find_all('li')
    except Exception as why:
        parser.error(
            'Failed to get feed_action, the error is {},the page source is {}'.
            format(why, each))
    else:
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(
                feed_action.find(attrs={
                    'action-type': 'feed_list_like'
                }).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        wb_data.weibo_cont = each.find(attrs={
            'class': 'comment_txt'
        }).text.strip()
    except Exception as why:
        parser.error(
            'Failed to get weibo cont, the error is {}, the page source is {}'.
            format(why, html))
        return None

    # The "expand full text" marker means only part of the content is here.
    is_all_cont = 0 if '展开全文' in str(each) else 1
    return wb_data, is_all_cont