def test_replace_html(self):
    """Check that replace_html decodes HTML entities in str, list and dict input.

    Besides the named/numeric entities, a dangling ``amp;`` fragment and a
    backslash are expected to be removed entirely.
    """
    # Plain string: all entities in one value.
    html = '''&#39;&quot;&amp;&yen;amp;&lt;&gt;&nbsp;\\'''
    assert_equal(replace_html(html), '\'"&¥<> ')

    # List: every element is converted independently.
    html = ['&#39;', '&quot;', '&amp;', '&yen;', 'amp;', '&lt;', '&gt;',
            '&nbsp;', '\\']
    assert_equal(replace_html(html),
                 ['\'', '"', '&', '¥', '', '<', '>', ' ', ''])

    # Dict: both keys and values are converted.
    html = {'&#39;': '&quot;'}
    assert_equal(replace_html(html), {'\'': '"'})
def __handle_content_url(content_url):
    """Return an absolute article URL built from ``content_url``.

    The value is first cleaned with ``replace_html``. An empty result yields
    ``''``. A value that already contains the ``http://mp.weixin.qq.com``
    host, or that already starts with an ``http://`` / ``https://`` scheme,
    is returned as is; anything else is treated as a path and prefixed with
    the host.
    """
    host = 'http://mp.weixin.qq.com'
    content_url = replace_html(content_url)
    if not content_url:
        return ''
    # An already absolute URL (notably https://...) must not get the host
    # prepended a second time.
    if host in content_url or content_url.startswith(('http://', 'https://')):
        return content_url
    return host + content_url
def get_article_by_history_json(text, article_json=None, **kwargs):
    """Extract the list of sent messages from a history-message page.

    Parameters
    ----------
    text : str or unicode
        Raw text of the history-message page. Only used when
        ``article_json`` is None.
    article_json : dict
        Article JSON already extracted from the page; must contain a
        ``list`` key. When None it is parsed out of ``text``.
    kwargs
        Optional ``biz``, ``uin`` and ``key`` strings (each defaults to
        ``''``) that are embedded in the generated media URLs.

    Returns
    -------
    list of dict
        One dict per message. Every dict carries ``send_id`` (shared by all
        messages of one mass send, so not unique), ``datetime`` and ``type``
        (as a string). Additional keys depend on ``type``:

        * ``'1'`` (text): ``content``
        * ``'3'`` (image): ``img_url``
        * ``'34'`` (audio): ``play_length``, ``fileid``, ``audio_src``
        * ``'49'`` (article): ``main`` (1 for the lead article, 0 for the
          sub-articles of a multi-article send), ``title``, ``digest``,
          ``fileid``, ``content_url``, ``source_url``, ``cover``,
          ``author``, ``copyright_stat``
        * ``'62'`` (video): ``cdn_videoid``, ``thumb``, ``video_src``

        Article entries whose ``content_url`` is empty are dropped.
    """
    host = 'http://mp.weixin.qq.com'
    media_prefix = 'https://mp.weixin.qq.com/mp/getmediadata?__biz='

    def _absolute_content_url(raw_url):
        # Clean an article URL and make it absolute; '' when there is none.
        if not raw_url:
            return ''
        url = replace_html(raw_url)
        if not url:
            return ''
        # Already absolute URLs (including https://) must not be prefixed.
        if host in url or url.startswith(('http://', 'https://')):
            return url
        return host + url

    def _article_fields(info, is_main):
        # Fields shared by the lead article and the sub-articles.
        return {
            'main': is_main,
            'title': info.get('title', ''),
            'digest': info.get('digest', ''),
            'fileid': info.get('fileid', ''),
            'content_url': _absolute_content_url(info.get('content_url')),
            'source_url': info.get('source_url', ''),
            'cover': info.get('cover', ''),
            'author': info.get('author', ''),
            'copyright_stat': info.get('copyright_stat', ''),
        }

    if article_json is None:
        # NOTE(review): presumably the pattern stops short of the closing
        # brackets, hence the re-appended suffix -- confirm against the regex.
        article_json = find_article_json_re.findall(text)
        article_json = article_json[0] + '}}]}'
        article_json = json.loads(article_json)

    biz = kwargs.get('biz', '')
    uin = kwargs.get('uin', '')
    key = kwargs.get('key', '')

    items = []
    for entry in article_json['list']:
        comm_msg_info = entry['comm_msg_info']
        item = {
            # Not usable for de-duplication: every message of one mass send
            # shares the same id.
            'send_id': comm_msg_info.get('id', ''),
            'datetime': comm_msg_info.get('datetime', ''),
            'type': str(comm_msg_info.get('type', '')),
        }
        msg_type = item['type']

        if msg_type == '1':  # text
            item['content'] = comm_msg_info.get('content', '')
        elif msg_type == '3':  # image
            # The message id is stored under 'send_id'; there is no
            # 'qunfa_id' key on the item.
            item['img_url'] = (media_prefix + biz
                               + '&type=img&mode=small&msgid='
                               + str(item['send_id'])
                               + '&uin=' + uin + '&key=' + key)
        elif msg_type == '34':  # audio
            voice_info = entry['voice_msg_ext_info']
            item['play_length'] = voice_info.get('play_length', '')
            item['fileid'] = voice_info.get('fileid', '')
            item['audio_src'] = (media_prefix + biz
                                 + '&type=voice&msgid='
                                 + str(item['send_id'])
                                 + '&uin=' + uin + '&key=' + key)
        elif msg_type == '49':  # article(s)
            app_msg_ext_info = entry['app_msg_ext_info']
            item.update(_article_fields(app_msg_ext_info, 1))
            items.append(item)
            if app_msg_ext_info.get('is_multi', 0) == 1:
                for sub_info in app_msg_ext_info['multi_app_msg_item_list']:
                    sub_item = {
                        'send_id': item['send_id'],
                        'datetime': item['datetime'],
                        'type': item['type'],
                    }
                    sub_item.update(_article_fields(sub_info, 0))
                    items.append(sub_item)
            # The lead article was appended above; skip the generic append.
            continue
        elif msg_type == '62':  # video
            video_info = entry['video_msg_ext_info']
            item['cdn_videoid'] = video_info.get('cdn_videoid', '')
            item['thumb'] = video_info.get('thumb', '')
            item['video_src'] = (
                'https://mp.weixin.qq.com/mp/getcdnvideourl?__biz=' + biz
                + '&cdn_videoid=' + item['cdn_videoid']
                + '&thumb=' + item['thumb']
                + '&uin=' + uin + '&key=' + key)
        items.append(item)

    # Drop the empty article entries that the source data carries along.
    # String comparison keeps a message without a type from raising.
    return [item for item in items
            if not (item['type'] == '49' and not item['content_url'])]
def __handle_content_url(content_url):
    """Return an absolute article URL built from ``content_url``.

    The value is first cleaned with ``replace_html``. An empty result yields
    ``''``. A value that already contains the ``http://mp.weixin.qq.com``
    host, or that already starts with an ``http://`` / ``https://`` scheme,
    is returned as is; anything else is treated as a path and prefixed with
    the host.
    """
    host = 'http://mp.weixin.qq.com'
    content_url = replace_html(content_url)
    if not content_url:
        return ''
    # An already absolute URL (notably https://...) must not get the host
    # prepended a second time.
    if host in content_url or content_url.startswith(('http://', 'https://')):
        return content_url
    return host + content_url