def get_gzh_by_search(text):
    """Parse official-account (gzh) entries out of account-search result HTML.

    Parameters
    ----------
    text : str or unicode
        HTML text returned by an official-account search.

    Returns
    -------
    list[dict]
        One dict per account::

            {
                'open_id': '',        # unique account id (derived from avatar URL)
                'profile_url': '',    # link to the latest-10-posts page
                'headimage': '',      # avatar image URL
                'wechat_name': '',    # display name
                'wechat_id': '',      # wechat id
                'post_perm': '',      # posts in the last month (-1 if unknown)
                'view_perm': '',      # views in the last month (-1 if unknown)
                'qrcode': '',         # QR-code image URL
                'introduction': '',   # description
                'authentication': ''  # verification info
            }
    """
    post_view_perms = WechatSogouStructuring.__get_post_view_perm(text)

    gzh_list = []
    for item in etree.HTML(text).xpath('//ul[@class="news-list2"]/li'):
        headimage = format_image_url(get_first_of_element(item, 'div/div[1]/a/img/@src'))
        name_text = get_elem_text(get_first_of_element(item, 'div/div[2]/p[1]'))
        id_text = get_elem_text(get_first_of_element(item, 'div/div[2]/p[2]'))
        intro_text = get_elem_text(get_first_of_element(item, 'dl[1]/dd'))
        gzh_list.append({
            'open_id': headimage.split('/')[-1],
            'profile_url': get_first_of_element(item, 'div/div[1]/a/@href'),
            'headimage': headimage,
            # strip search-highlight markers injected by sogou
            'wechat_name': name_text.replace('red_beg', '').replace('red_end', ''),
            'wechat_id': id_text.replace('微信号:', ''),
            'qrcode': get_first_of_element(item, 'div/div[3]/span/img[1]/@src'),
            'introduction': intro_text.replace('red_beg', '').replace('red_end', ''),
            'authentication': get_first_of_element(item, 'dl[2]/dd/text()'),
            'post_perm': -1,
            'view_perm': -1,
        })

    # merge monthly post/view counts, keyed by open_id, when available
    if post_view_perms:
        for gzh in gzh_list:
            raw = post_view_perms.get(gzh['open_id'])
            if raw is None:
                continue
            parts = raw.split(',')
            if len(parts) == 2:
                gzh['post_perm'] = int(parts[0])
                gzh['view_perm'] = int(parts[1])
    return gzh_list
def get_gzh_by_search(text):
    """Parse official-account (gzh) entries out of account-search result HTML.

    Parameters
    ----------
    text : str or unicode
        HTML text returned by an official-account search.

    Returns
    -------
    list[dict]
        One dict per account::

            {
                'open_id': '',        # unique account id (derived from avatar URL)
                'profile_url': '',    # link to the latest-10-posts page
                'headimage': '',      # avatar image URL
                'wechat_name': '',    # display name
                'wechat_id': '',      # wechat id
                'post_perm': '',      # posts in the last month
                'qrcode': '',         # QR-code image URL
                'introduction': '',   # description
                'authentication': ''  # verification info
            }
    """
    page = etree.HTML(text)
    results = []
    for node in page.xpath('//ul[@class="news-list2"]/li'):
        profile_url = get_first_of_element(node, 'div/div[1]/a/@href')
        avatar = get_first_of_element(node, 'div/div[1]/a/img/@src')
        raw_name = get_elem_text(get_first_of_element(node, 'div/div[2]/p[1]'))
        raw_info = get_elem_text(get_first_of_element(node, 'div/div[2]/p[2]'))
        # TODO monthly post count: <script>var account_anti_url = "/websearch/weixin/pc/anti_account.jsp?.......";</script>
        post_perm = 0
        raw_intro = get_elem_text(get_first_of_element(node, 'dl[1]/dd'))
        results.append({
            'open_id': avatar.split('/')[-1],
            'profile_url': profile_url,
            'headimage': avatar,
            # strip search-highlight markers injected by sogou
            'wechat_name': raw_name.replace('red_beg', '').replace('red_end', ''),
            'wechat_id': raw_info.replace('微信号:', ''),
            'post_perm': post_perm,
            'qrcode': get_first_of_element(node, 'div/div[3]/span/img[1]/@src'),
            'introduction': raw_intro.replace('red_beg', '').replace('red_end', ''),
            'authentication': get_first_of_element(node, 'dl[2]/dd/text()')
        })
    return results
def get_gzh_by_search(text):
    """Parse official-account (gzh) entries out of account-search result HTML.

    Parameters
    ----------
    text : str or unicode
        HTML text returned by an official-account search.

    Returns
    -------
    list[dict]
        One dict per account::

            {
                'open_id': '',        # unique account id (derived from avatar URL)
                'profile_url': '',    # link to the latest-10-posts page
                'headimage': '',      # avatar image URL
                'wechat_name': '',    # display name
                'wechat_id': '',      # wechat id
                'post_perm': '',      # posts in the last month
                'qrcode': '',         # QR-code image URL
                'introduction': '',   # description
                'authentication': ''  # verification info
            }
    """
    page = etree.HTML(text)
    lis = page.xpath('//ul[@class="news-list2"]/li')
    relist = []
    for li in lis:
        url = li.xpath('div/div[1]/a/@href')
        headimage = li.xpath('div/div[1]/a/img/@src')
        wechat_name = get_elem_text(li.xpath('div/div[2]/p[1]')[0])
        info = get_elem_text(li.xpath('div/div[2]/p[2]')[0])
        # TODO monthly post count: <script>var account_anti_url = "/websearch/weixin/pc/anti_account.jsp?.......";</script>
        post_perm = 0
        qrcode = li.xpath('div/div[3]/span/img[1]/@src')
        introduction = get_elem_text(li.xpath('dl[1]/dd')[0])
        authentication = li.xpath('dl[2]/dd/text()')
        relist.append({
            # guard every xpath list access the same way qrcode/authentication
            # already were: a sparse <li> must not raise IndexError
            'open_id': headimage[0].split('/')[-1] if headimage else '',
            'profile_url': url[0] if url else '',
            'headimage': headimage[0] if headimage else '',
            # strip search-highlight markers injected by sogou
            'wechat_name': wechat_name.replace('red_beg', '').replace('red_end', ''),
            'wechat_id': info.replace('微信号:', ''),
            'post_perm': post_perm,
            'qrcode': qrcode[0] if qrcode else '',
            'introduction': introduction.replace('red_beg', '').replace('red_end', ''),
            'authentication': authentication[0] if authentication else ''
        })
    return relist
def test_get_elem_text(self):
    """get_elem_text should concatenate the text of all descendant nodes."""
    html = '''
    <div>
        <div>111</div>
        <div>222</div>
    </div>
    '''
    root = etree.HTML(html)
    assert_equal(get_elem_text(root), '111222')
def test_get_elem_text(self):
    """get_elem_text should join the text of every descendant node."""
    html = '''
    <div>
        <div>111</div>
        <div>222</div>
    </div>
    '''
    tree = etree.HTML(html)
    result = get_elem_text(tree)
    assert_equal(result, '111222')
def get_gzh_by_search(text):
    """Parse official-account (gzh) entries out of account-search result HTML.

    Parameters
    ----------
    text : str or unicode
        HTML text returned by an official-account search.

    Returns
    -------
    list[dict]
        One dict per account::

            {
                'open_id': '',        # unique account id (derived from avatar URL)
                'profile_url': '',    # link to the latest-10-posts page
                'headimage': '',      # avatar image URL
                'wechat_name': '',    # display name
                'wechat_id': '',      # wechat id
                'post_perm': '',      # posts in the last month (-1 if unknown)
                'view_perm': '',      # views in the last month (-1 if unknown)
                'qrcode': '',         # QR-code image URL
                'introduction': '',   # description
                'authentication': ''  # verification info
            }
    """
    post_view_perms = WechatSogouStructuring.__get_post_view_perm(text)

    doc = etree.HTML(text)
    accounts = []
    for node in doc.xpath('//ul[@class="news-list2"]/li'):
        avatar = get_first_of_element(node, 'div/div[1]/a/img/@src')
        raw_name = get_elem_text(get_first_of_element(node, 'div/div[2]/p[1]'))
        raw_info = get_elem_text(get_first_of_element(node, 'div/div[2]/p[2]'))
        raw_intro = get_elem_text(get_first_of_element(node, 'dl[1]/dd'))
        accounts.append({
            'open_id': avatar.split('/')[-1],
            'profile_url': get_first_of_element(node, 'div/div[1]/a/@href'),
            'headimage': avatar,
            # strip search-highlight markers injected by sogou
            'wechat_name': raw_name.replace('red_beg', '').replace('red_end', ''),
            'wechat_id': raw_info.replace('微信号:', ''),
            'qrcode': get_first_of_element(node, 'div/div[3]/span/img[1]/@src'),
            'introduction': raw_intro.replace('red_beg', '').replace('red_end', ''),
            'authentication': get_first_of_element(node, 'dl[2]/dd/text()'),
            'post_perm': -1,
            'view_perm': -1,
        })

    # merge monthly post/view counts, keyed by open_id, when available
    if post_view_perms:
        for account in accounts:
            raw = post_view_perms.get(account['open_id'])
            if raw is None:
                continue
            pair = raw.split(',')
            if len(pair) == 2:
                account['post_perm'] = int(pair[0])
                account['view_perm'] = int(pair[1])
    return accounts
def get_article_by_search(text):
    """Parse article entries out of article-search result HTML.

    Parameters
    ----------
    text : str or unicode
        HTML text returned by an article search.

    Returns
    -------
    list[dict]
        One dict per article::

            {
                'article': {
                    'title': '',     # article title
                    'url': '',       # article link
                    'imgs': '',      # list of article image URLs
                    'abstract': '',  # article abstract
                    'time': ''       # article publish time (unix timestamp)
                },
                'gzh': {
                    'profile_url': '',  # account's latest-10-posts page link
                    'headimage': '',    # avatar
                    'wechat_name': '',  # account name
                    'isv': '',          # verified flag
                }
            }
    """
    page = etree.HTML(text)
    lis = page.xpath('//ul[@class="news-list"]/li')
    articles = []
    for li in lis:
        url = get_first_of_element(li, 'div[1]/a/@href')
        if url:
            # layout with a leading thumbnail column
            title = get_first_of_element(li, 'div[2]/h3/a')
            imgs = li.xpath('div[1]/a/img/@src')
            abstract = get_first_of_element(li, 'div[2]/p')
            time = get_first_of_element(li, 'div[2]/div/span/script/text()')
            gzh_info = li.xpath('div[2]/div/a')[0]
        else:
            # layout without a thumbnail column
            url = get_first_of_element(li, 'div/h3/a/@href')
            title = get_first_of_element(li, 'div/h3/a')
            imgs = []
            spans = li.xpath('div/div[1]/a')
            for span in spans:
                img = span.xpath('span/img/@src')
                if img:
                    # NOTE(review): appends the xpath result list, not img[0],
                    # so entries are lists of URLs — confirm this is intended
                    imgs.append(img)
            abstract = get_first_of_element(li, 'div/p')
            time = get_first_of_element(li, 'div/div[2]/span/script/text()')
            gzh_info = li.xpath('div/div[2]/a')[0]
        if title is not None:
            # strip search-highlight markers injected by sogou
            title = get_elem_text(title).replace("red_beg", "").replace(
                "red_end", "")
        if abstract is not None:
            abstract = get_elem_text(abstract).replace("red_beg", "").replace(
                "red_end", "")
        # raw string: '\(' is an invalid str escape (SyntaxWarning on modern
        # CPython); the raw literal yields the identical regex pattern
        time = re.findall(r'timeConvert\(\'(.*?)\'\)', time)
        time = list_or_empty(time, int)
        profile_url = get_first_of_element(gzh_info, '@href')
        headimage = get_first_of_element(gzh_info, '@data-headimage')
        wechat_name = get_first_of_element(gzh_info, 'text()')
        gzh_isv = get_first_of_element(gzh_info, '@data-isv', int)
        articles.append({
            'article': {
                'title': title,
                'url': url,
                'imgs': imgs,
                'abstract': abstract,
                'time': time
            },
            'gzh': {
                'profile_url': profile_url,
                'headimage': headimage,
                'wechat_name': wechat_name,
                'isv': gzh_isv,
            }
        })
    return articles
def get_article_by_search(text):
    """Parse article entries out of article-search result HTML.

    Parameters
    ----------
    text : str or unicode
        HTML text returned by an article search.

    Returns
    -------
    list[dict]
        One dict per article::

            {
                'article': {
                    'title': '',     # article title
                    'url': '',       # article link
                    'imgs': '',      # list of article image URLs
                    'abstract': '',  # article abstract
                    'time': ''       # article publish time (unix timestamp)
                },
                'gzh': {
                    'profile_url': '',  # account's latest-10-posts page link
                    'headimage': '',    # avatar
                    'wechat_name': '',  # account name
                    'isv': '',          # verified flag
                }
            }
    """
    page = etree.HTML(text)
    lis = page.xpath('//ul[@class="news-list"]/li')
    articles = []
    for li in lis:
        url = get_first_of_element(li, 'div[1]/a/@href')
        if url:
            # layout with a leading thumbnail column
            title = get_first_of_element(li, 'div[2]/h3/a')
            imgs = li.xpath('div[1]/a/img/@src')
            abstract = get_first_of_element(li, 'div[2]/p')
            time = get_first_of_element(li, 'div[2]/div/span/script/text()')
            gzh_info = li.xpath('div[2]/div/a')[0]
        else:
            # layout without a thumbnail column
            url = get_first_of_element(li, 'div/h3/a/@href')
            title = get_first_of_element(li, 'div/h3/a')
            imgs = []
            spans = li.xpath('div/div[1]/a')
            for span in spans:
                img = span.xpath('span/img/@src')
                if img:
                    # NOTE(review): appends the xpath result list, not img[0],
                    # so entries are lists of URLs — confirm this is intended
                    imgs.append(img)
            abstract = get_first_of_element(li, 'div/p')
            time = get_first_of_element(li, 'div/div[2]/span/script/text()')
            gzh_info = li.xpath('div/div[2]/a')[0]
        if title is not None:
            # strip search-highlight markers injected by sogou
            title = get_elem_text(title).replace("red_beg", "").replace("red_end", "")
        if abstract is not None:
            abstract = get_elem_text(abstract).replace("red_beg", "").replace("red_end", "")
        # raw string: '\(' is an invalid str escape (SyntaxWarning on modern
        # CPython); the raw literal yields the identical regex pattern
        time = re.findall(r'timeConvert\(\'(.*?)\'\)', time)
        time = list_or_empty(time, int)
        profile_url = get_first_of_element(gzh_info, '@href')
        headimage = get_first_of_element(gzh_info, '@data-headimage')
        wechat_name = get_first_of_element(gzh_info, 'text()')
        gzh_isv = get_first_of_element(gzh_info, '@data-isv', int)
        articles.append({
            'article': {
                'title': title,
                'url': url,
                'imgs': format_image_url(imgs),
                'abstract': abstract,
                'time': time
            },
            'gzh': {
                'profile_url': profile_url,
                'headimage': headimage,
                'wechat_name': wechat_name,
                'isv': gzh_isv,
            }
        })
    return articles