def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html = tools.get_html_by_urllib(root_url)

    # Regexes for the fields of each result row
    title = '<tr height="25"><td><a href=".*?" title="(.*?)"'
    video_url = ['<tr height="25"><td><a href="(.*?)"']
    author = ['<a href="user-.*?.html" target="_blank">(.*?)</a>']
    watched_count = ['浏览次数: </span>(.*?) ']
    file_size = ['资料大小: </span>(.*?) ']
    download_count = ['下载次数: </span>(.*?) ']

    titles = tools.get_info(html, title, allow_repeat=True)
    video_urls = tools.get_info(html, video_url, allow_repeat=True)
    authors = tools.get_info(html, author, allow_repeat=True)
    watched_counts = tools.get_info(html, watched_count, allow_repeat=True)
    file_sizes = tools.get_info(html, file_size, allow_repeat=True)
    download_counts = tools.get_info(html, download_count, allow_repeat=True)

    for i in range(len(titles)):
        title = tools.del_html_tag(titles[i])
        video_url = tools.get_full_url('http://www.sobaidupan.com', video_urls[i])
        author = authors[i]
        watched_count = watched_counts[i]
        file_size = file_sizes[i]
        download_count = download_counts[i]

        log.debug('''
            标题:     %s
            视频地址: %s
            作者:     %s
            观看数    %s
            资料大小  %s
            下载次数  %s
            ''' % (title, video_url, author, watched_count, file_size, download_count))

        # Keep only results whose title contains one of the search keywords
        contained_key, contained_key_count = base_parser.get_contained_key(
            title, '', remark['search_keyword1'], remark['search_keyword2'],
            remark['search_keyword3'])
        if not contained_key:
            continue

        base_parser.add_content_info('VA_content_info', SITE_ID, video_url, title,
                                     file_size=file_size, file_name=title,
                                     author=author, watched_count=watched_count,
                                     download_count=download_count,
                                     search_type=search_type,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count,
                                     task_id=remark['task_id'])

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
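# Illustrative only (not part of the original source): the shape of the
# url_info task document that these parser() functions assume, reconstructed
# from the fields the code reads. All concrete values are hypothetical
# placeholders; the exact type of the keyword fields is defined by
# base_parser.get_contained_key.
EXAMPLE_URL_INFO = {
    '_id': '5a0b1c2d3e4f5a6b7c8d9e0f',    # Mongo-style id, stringified before logging
    'url': 'http://www.sobaidupan.com',   # page to crawl (hypothetical value)
    'depth': 0,                           # crawl depth (read but unused here)
    'site_id': 1,                         # source-site id (read but unused here)
    'remark': {
        'search_keyword1': 'keyword-a',   # keyword groups passed to
        'search_keyword2': 'keyword-b',   # base_parser.get_contained_key()
        'search_keyword3': 'keyword-c',
        'task_id': 1,
    },
}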
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html, requests = tools.get_html_by_requests(root_url, headers=HEADER)

    titles = tools.get_tag(
        html, 'div', {'id': tools.re.compile('id_cse_content_item_mid_.')})
    for i in range(0, len(titles)):
        try:
            url = tools.get_tag(
                titles[i].previous_sibling.previous_sibling, 'a', find_all=False)
            url = url['href']

            # Skip dead shares: the detail page title contains 不存在 (gone)
            # or 取消 (share cancelled)
            html2 = tools.get_html_by_urllib(url)
            regexs = ['<title>(.+?)</title>']
            mark = ''.join(tools.get_info(html2, regexs))
            regexs = ['不存在', '取消']
            if tools.get_info(mark, regexs):
                continue

            title = tools.get_text(titles[i].previous_sibling.previous_sibling)
            title = tools.del_html_tag(title)

            info = tools.get_text(titles[i])
            file_name = tools.del_html_tag(
                ''.join(tools.get_info(info, '文件名:(.+?)文')))
            file_size = tools.del_html_tag(
                ''.join(tools.get_info(info, '文件大小:(.+?)分')))
            author = tools.del_html_tag(
                ''.join(tools.get_info(info, '分享者:(.+?)时')))
            release_time = ''.join(
                tools.get_info(info, '时间:(.+?)下')).replace('\n', '')
            download_count = tools.del_html_tag(
                ''.join(tools.get_info(info, r'下载次数:(.+?)\.')))
        except:
            continue

        log.debug('''
            标题:     %s
            文件大小: %s
            文件名字: %s
            作者:     %s
            原文url:  %s
            下载数量: %s
            日期:     %s
            ''' % (title, file_size, file_name, author, url, download_count, release_time))

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, '', remark['search_keyword1'], remark['search_keyword2'],
            remark['search_keyword3'])
        if not contained_key:
            continue

        base_parser.add_content_info('VA_content_info', SITE_ID, url, title,
                                     file_size=file_size, file_name=file_name,
                                     author=author, release_time=release_time,
                                     download_count=download_count,
                                     search_type=search_type,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count,
                                     task_id=remark['task_id'])

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
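# A self-contained sketch of the dead-share check above: the original code
# fetches each result page and skips it when the <title> contains 不存在 or
# 取消. Plain re is used here in place of the project's tools.get_info helper,
# whose semantics are assumed to be simple regex extraction.
import re

def share_is_dead(page_html):
    page_title = ''.join(re.findall('<title>(.+?)</title>', page_html, re.S))
    return bool(re.search('不存在|取消', page_title))

# share_is_dead('<title>文件不存在或已被取消分享</title>')  ->  True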
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html, requests = tools.get_html_by_requests(root_url)

    titles = tools.get_tag(html, 'h3')
    video_infos = tools.get_tag(html, 'dt')
    for i in range(0, len(titles)):
        title = tools.get_text(titles[i])
        title = tools.del_html_tag(title)

        try:
            url = titles[i].a['href']
        except:
            continue
        url = 'http://www.bturls.net' + url

        release_time = video_infos[i].span
        release_time = tools.get_text(release_time)

        file_size = video_infos[i].span.next_sibling.next_sibling
        file_size = tools.get_text(file_size)

        watched_count = video_infos[i].span.next_sibling.next_sibling.next_sibling.next_sibling
        watched_count = tools.get_text(watched_count)

        regexs = [r't/(.+?)\.']
        magnet_link = 'magnet:?xt=urn:btih:' + ''.join(tools.get_info(url, regexs))

        log.debug('''
            标题:     %s
            文件大小: %s
            原文url:  %s
            观看数量: %s
            磁力链接: %s
            日期:     %s
            ''' % (title, file_size, url, watched_count, magnet_link, release_time))

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, '', remark['search_keyword1'], remark['search_keyword2'],
            remark['search_keyword3'])
        if not contained_key:
            continue

        base_parser.add_content_info('VA_content_info', SITE_ID, url, title,
                                     file_size=file_size,
                                     release_time=release_time,
                                     watched_count=watched_count,
                                     magnet_link=magnet_link,
                                     search_type=search_type,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count,
                                     task_id=remark['task_id'])

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
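# A runnable sketch of the magnet-link construction above. It assumes
# bturls.net detail URLs look like 'http://www.bturls.net/<info-hash>.html',
# which is the shape the regex r't/(.+?)\.' relies on (it anchors on the 't/'
# at the end of '.net/'). re stands in for the project's tools.get_info
# helper; the example URL and hash below are hypothetical.
import re

def magnet_from_detail_url(url):
    info_hash = ''.join(re.findall(r't/(.+?)\.', url))
    return 'magnet:?xt=urn:btih:' + info_hash

# magnet_from_detail_url('http://www.bturls.net/0123456789abcdef0123456789abcdef01234567.html')
# -> 'magnet:?xt=urn:btih:0123456789abcdef0123456789abcdef01234567'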
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html = tools.get_html_by_webdirver(root_url)
    headers = tools.get_tag(html, 'h3', {'class': 't'})
    for i in range(0, len(headers)):
        title = tools.get_text(headers[i])
        title = tools.del_html_tag(title)
        # Skip Baidu Video's own "related videos" aggregation entries
        if tools.re.compile('的相关视频在线观看_百度视频').findall(title):
            continue

        try:
            ssurl = headers[i].a['href']
        except:
            continue
        # The result link is a jump URL; resolve the real target from the
        # Location header (requests.head does not follow redirects by default)
        r = tools.requests.head(ssurl)
        url = r.headers['Location']

        try:
            img = headers[i].next_sibling()[0].img['src']
        except:
            img = ''

        # The release date ("yyyy年mm月dd日") may sit in any of the first few
        # sibling blocks
        try:
            release_time = ''
            for sibling in headers[i].next_sibling()[:4]:
                release_time = ''.join(
                    tools.re.compile(r'\d\d\d\d年\d+?月\d+?日').findall(str(sibling)))
                if release_time:
                    break
            release_time = release_time.replace('年', '-').replace('月', '-').replace('日', '')
        except:
            release_time = ''

        # Summary text, if present
        content = ''
        for content in headers[i].next_sibling():
            content = tools.get_tag(content, 'div', {'class': 'c-abstract'}, find_all=False)
            if content:
                content = tools.get_text(content)
                break
            else:
                content = ''

        log.debug('''
            标题:   %s
            内容:   %s
            原文url:%s
            图片url:%s
            日期:   %s
            ''' % (title, content, url, img, release_time))

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, content, remark['search_keyword1'], remark['search_keyword2'],
            remark['search_keyword3'])
        if not contained_key:
            continue

        # Keep the result only if the target page actually carries a video
        is_video1 = base_parser.is_have_video_by_site(url)
        if not is_video1:
            is_video2 = base_parser.is_have_video_by_judge(title, content)
            if is_video2:
                # get_html_by_requests returns (html, response) as at the other call sites
                html2, r2 = tools.get_html_by_requests(url)
                is_video3 = base_parser.is_have_video_by_common(html2)
                if not is_video3:
                    continue
            else:
                continue

        base_parser.add_content_info('VA_content_info', SITE_ID, url=url,
                                     title=title, content=content,
                                     image_url=img, release_time=release_time,
                                     search_type=search_type,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count)

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
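# A standalone sketch of the redirect resolution used above: the search result
# links are jump URLs, and requests.head() does not follow redirects by
# default, so the real target can be read from the Location response header.
# The helper name and the fallback behaviour are illustrative, not part of the
# original project.
import requests

def resolve_redirect(jump_url):
    r = requests.head(jump_url)                  # allow_redirects defaults to False
    return r.headers.get('Location', jump_url)   # fall back to the original URL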
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36',
        'Cookie': '_T_WM=6f2675cab22ec5e018673192b46dd73b; SUB=_2A251adqIDeRxGeNO7FER8ivPyziIHXVWlebArDV6PUJbkdANLXHjkW2Eu3heWA8h0QMZFxI0_fe7-s2Isw..; SUHB=0HThVFDsKbumHU; SCF=AvoSYQqv89TMIxx4YQUcoIdBp2-sjJbx28qHTTnKAHOymGxToTyDJijAZJl_Nqe3ve0x2U-Yk5poeuVn7bSqyt0.; M_WEIBOCN_PARAMS=featurecode%3D20000180%26oid%3D4060829337409043%26luicode%3D10000011%26lfid%3D106003type%253D1%26fid%3D100103type%253D1%2526q%253D%25E5%25A5%25B3%25E4%25B8%25BB%25E6%2592%25AD%26uicode%3D10000011',
        'Host': 'm.weibo.cn',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'https://m.weibo.cn/p/100103type%3D1%26q%3D%E5%A5%B3%E4%B8%BB%E6%92%AD?type=all&queryVal=%E5%A5%B3%E4%B8%BB%E6%92%AD&luicode=10000011&lfid=106003type%3D1&title=%E5%A5%B3%E4%B8%BB%E6%92%AD'
    }

    resp = tools.requests.get(root_url, headers=headers)
    infos = resp.json()
    try:
        cards = infos['cards']
    except:
        base_parser.update_url('VA_urls', root_url, Constance.DONE)
        return

    for i in cards:
        try:
            card_group = i['card_group']
            for group in card_group:
                mblog = group.get('mblog')
                if not mblog:
                    continue
                # Only keep posts that actually carry a video
                if not get_video_url(mblog):
                    continue

                url = get_url(mblog)
                page_url = get_page_url(mblog)
                origin = get_origin(mblog)
                content = get_content(mblog)
                reposts_count = get_reposts_count(mblog)
                comment_count = get_comments_count(mblog)
                attitudes_count = get_attitudes_count(mblog)
                author = get_author(mblog)
                image_url = get_image_url(mblog)
                release_time = get_release_time(mblog)
                video_url = get_video_url(mblog)

                log.debug('''
                    内容:    %s
                    原文url: %s
                    作者:    %s
                    来源:    %s
                    视频封面:%s
                    视频地址:%s
                    日期:    %s
                    转发数:  %s
                    评论数:  %s
                    点赞数:  %s
                    ''' % (content, url, author, origin, image_url, video_url,
                           release_time, str(reposts_count), str(comment_count),
                           str(attitudes_count)))

                contained_key, contained_key_count = base_parser.get_contained_key(
                    '', content, remark['search_keyword1'],
                    remark['search_keyword2'], remark['search_keyword3'])
                if not contained_key:
                    continue

                base_parser.add_content_info('VA_content_info', SITE_ID,
                                             url=video_url,
                                             release_time=release_time,
                                             origin=origin, title=content,
                                             reposts_count=reposts_count,
                                             comment_count=comment_count,
                                             attitudes_count=attitudes_count,
                                             author=author,
                                             image_url=image_url,
                                             video_url=video_url,
                                             search_type=search_type,
                                             keyword=contained_key,
                                             keyword_count=contained_key_count,
                                             task_id=remark['task_id'])
        except:
            pass

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
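# A minimal sketch of the m.weibo.cn JSON traversal above, assuming only the
# nesting the original code relies on: cards -> card_group -> mblog. The
# accessor helpers (get_content, get_author, get_video_url, ...) are defined
# elsewhere in the project and are not reproduced here.
def iter_mblogs(infos):
    for card in infos.get('cards', []):
        for group in card.get('card_group', []):
            mblog = group.get('mblog')
            if mblog:
                yield mblog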
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html = tools.get_html_by_urllib(root_url)
    regex = '<tr class="item">(.*?)</tr>'
    infos = tools.get_info(html, regex)
    for info in infos:
        title = ['<a href=".*?" onclick=.*?; class="">(.*?)</a>']
        title = tools.get_info(info, title, allow_repeat=True)
        title = ''.join(title)
        title = tools.del_html_tag(title)

        video_url = ['<a href="(.*?)" onclick=.*?; class="">']
        video_url = tools.get_info(info, video_url, allow_repeat=True)
        video_url = ''.join(video_url)

        comment_count = [r'<span class="pl">\((\d*?)人评价\)</span>']
        comment_count = tools.get_info(info, comment_count, allow_repeat=True)
        comment_count = ''.join(comment_count)
        comment_count = int(comment_count) if comment_count else 0

        release_time = r'<p class="pl">(\d{4}-\d{2}-\d{2}).*?</p>'
        release_time = tools.get_info(info, release_time, allow_repeat=True)
        release_time = ''.join(release_time)

        image_url = '<img src="(.*?)" alt=".*?" class=""/>'
        image_url = tools.get_info(info, image_url, allow_repeat=True)
        image_url = ''.join(image_url)

        content = '<p class="pl">(.*?)</p>'
        content = tools.get_info(info, content, allow_repeat=True)
        content = ''.join(content)

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, content, remark['search_keyword1'], remark['search_keyword2'],
            remark['search_keyword3'])

        log.debug('''
            标题:     %s
            内容:     %s
            评论数    %d
            视频地址  %s
            图片地址  %s
            发布时间  %s
            关键字:   %s
            关键字数: %d
            ''' % (title, content, comment_count, video_url, image_url,
                   release_time, contained_key, contained_key_count))

        if not contained_key:
            continue

        base_parser.add_content_info('VA_content_info', SITE_ID, url=video_url,
                                     title=title, content=content,
                                     image_url=image_url,
                                     comment_count=comment_count,
                                     release_time=release_time,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count)

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
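# A self-contained illustration of the comment-count extraction above, using
# re directly instead of the project's tools.get_info helper (assumed here to
# be plain regex extraction). Missing counts fall back to 0, as in the parser.
import re

def extract_comment_count(item_html):
    counts = re.findall(r'<span class="pl">\((\d*?)人评价\)</span>', item_html)
    count = ''.join(counts)
    return int(count) if count else 0

# extract_comment_count('<span class="pl">(1024人评价)</span>')  ->  1024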
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    # Fetch and parse the result page
    html, request = tools.get_html_by_requests(root_url, headers=HEADER)
    if not html:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    news_box = tools.get_tag(html, name='div', attrs={'class': "news-box"})[0]
    news_list = tools.get_tag(news_box, name='li')
    for news in news_list:
        try:
            # Thumbnail image
            image = tools.get_tag(news, name='img')[0]
            image = tools.get_json_value(image, 'src')

            # Link url
            url = tools.get_tag(news, name='h3')[0]
            try:
                url = tools.get_json_value(url.a, 'href')
            except:
                url = ''

            # Title
            title = tools.get_tag(news, name='h3')[0]
            title = tools.get_text(title)
            title = tools.del_html_tag(title)

            # Summary
            content = tools.get_tag(news, name='p', attrs={'class': "txt-info"})[0]
            content = tools.get_text(content)
            content = tools.del_html_tag(content)

            # Watched count (not available on the list page)
            watched_count = ''

            # Origin / source
            origin = tools.get_tag(news, name='div', attrs={'class': "s-p"})[0]
            origin = ''.join(tools.get_info(origin, '<a.*?>(.*?)<'))

            # Release time (the "t" attribute is a unix timestamp)
            release_time = tools.get_tag(news, name='div', attrs={'class': "s-p"})[0]
            release_time = tools.get_json_value(release_time, 't')
            release_time = tools.timestamp_to_date(int(release_time))

            # Detect a video by the play icon overlay on the thumbnail
            regex = '<div class="img-box">.*?<i></i>.*?</div>'
            play_icon = tools.get_info(news, regex)
        except:
            continue

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, content, remark['search_keyword1'], remark['search_keyword2'],
            remark['search_keyword3'])

        log.debug('''
            标题:   %s
            内容:   %s
            来源:   %s
            原文url:%s
            图片url:%s
            观看数: %s
            日期:   %s
            有视频: %d
            关键词: %s
            关键词数:%s
            ''' % (title, content, origin, url, image, watched_count,
                   release_time, play_icon and True or False, contained_key,
                   contained_key_count))

        if not contained_key or not play_icon:
            continue

        base_parser.add_content_info('VA_content_info', SITE_ID, url, title,
                                     content, image_url=image,
                                     release_time=release_time, origin=origin,
                                     watched_count=watched_count,
                                     search_type=SEARCH_TYPE,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count)

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
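# The 't' attribute read above is a unix timestamp. The project's
# tools.timestamp_to_date helper is assumed to behave roughly like this
# sketch; this is not its actual implementation.
import time

def timestamp_to_date(timestamp, time_format='%Y-%m-%d %H:%M:%S'):
    return time.strftime(time_format, time.localtime(timestamp))

# timestamp_to_date(1490000000)  ->  a '2017-03-20 ...' string
# (the exact time of day depends on the local timezone)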