def get_refresh_video_barrage(self, cid, row_barrages):
    barrage_file_path = FileUtil.get_barrage_file_path(cid)
    # Check whether a barrage file already exists for this cid. If not, all of
    # row_barrages will be written to the file; if it does exist, only the
    # newly added barrage records need to be found.
    barrage_count = 0
    if FileUtil.is_file_exists(barrage_file_path):
        last_barrage_index = -1  # Index in row_barrages of the last barrage stored in the file.
        barrage_count = FileUtil.get_file_line_count(barrage_file_path)
        last_n_barrages = FileUtil.get_file_last_n_line_content(barrage_file_path, 5)
        Logger.print_console_info(u"Last n barrages in the current file:\n" + u"\n".join(last_n_barrages))
        for index in xrange(len(row_barrages) - 1, -1, -1):
            if self.__is_same_barrage(last_n_barrages, row_barrages[index]):
                # Position, within the refreshed barrage list, of the last barrage stored in the file.
                last_barrage_index = index
                break
        # The barrage data has no updates.
        if last_barrage_index == (len(row_barrages) - 1):
            row_barrages = []
            Logger.print_console_info(unicode(DateTimeUtil.get_cur_timestamp("%Y-%m-%d %H:%M:%S")) +
                                      u"\t" + u"No new barrage data.")
        # Part of the barrage data is new.
        elif last_barrage_index >= 0:
            Logger.print_console_info(unicode(DateTimeUtil.get_cur_timestamp("%Y-%m-%d %H:%M:%S")) + u"\t" +
                                      u"New barrage records:" + u"\t" +
                                      unicode(len(row_barrages) - last_barrage_index - 1))
            row_barrages = row_barrages[last_barrage_index + 1:]
        # All of the barrage data is new.
        elif last_barrage_index == -1:
            Logger.print_console_info(unicode(DateTimeUtil.get_cur_timestamp("%Y-%m-%d %H:%M:%S")) + u"\t" +
                                      u"New barrage records:" + u"\t" + unicode(len(row_barrages)))
    barrage_count += len(row_barrages)
    Logger.print_console_info(unicode(DateTimeUtil.get_cur_timestamp("%Y-%m-%d %H:%M:%S")) +
                              u" Total barrage count: " + unicode(barrage_count) + u"\n\n")
    return row_barrages
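# A minimal runnable sketch (hypothetical toy data) of the tail-matching idea
# above: the stored file tail is a list of tab-joined records, the row_id
# (field 0) is ignored when comparing, and only the records after the matched
# tail count as new.
last_n_barrages = [u"101\t12.5\thello", u"102\t13.0\tworld"]
row_barrages = [(u"7", u"12.5", u"hello"),
                (u"8", u"13.0", u"world"),    # same as the file's last record
                (u"9", u"14.2", u"new one")]  # the only record left to append

stored_tails = [tuple(line.split(u"\t")[1:]) for line in last_n_barrages]
last_index = -1
for index in xrange(len(row_barrages) - 1, -1, -1):
    if row_barrages[index][1:] in stored_tails:
        last_index = index
        break
print row_barrages[last_index + 1:]  # [(u"9", u"14.2", u"new one")]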
def __get_response_content_internal(self, resp):
    # Read the returned page and inspect its response metadata.
    try:
        response_content = resp.read()
    except Exception as exception:
        logger.debug(unicode(exception))
        return None
    resp_info = resp.info()
    if "Content-Encoding" in resp_info:
        Logger.print_console_info(u"Page: " + unicode(resp.url) + u"\tcompression: " +
                                  unicode(resp_info["Content-Encoding"]))
        try:
            if resp_info["Content-Encoding"] == "deflate":
                # Raw deflate stream: negative wbits suppresses the zlib header.
                response_content = zlib.decompress(response_content, -zlib.MAX_WBITS)
            elif resp_info["Content-Encoding"] == "gzip":
                # wbits | 16 makes zlib accept the gzip wrapper.
                response_content = zlib.decompress(response_content, zlib.MAX_WBITS | 16)
            elif resp_info["Content-Encoding"] == "zlib":
                response_content = zlib.decompress(response_content, zlib.MAX_WBITS)
        except zlib.error as exception:
            logger.debug(exception)
            return None
    # One more step is still needed here: some pages fill in their html via
    # javascript, so the js inside the page would have to be executed to get
    # the complete html source.
    response_content = response_content.decode("utf-8", "ignore")
    return response_content
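# Hedged, self-contained sketch of the three wbits modes used above, verified
# as a round-trip on synthetic data (no network involved): negative wbits =
# raw deflate, default MAX_WBITS = zlib-wrapped, MAX_WBITS | 16 = gzip.
import zlib
import gzip
import StringIO

data = b"barrage payload " * 64

# "zlib" encoding: standard zlib wrapper (RFC 1950).
assert zlib.decompress(zlib.compress(data), zlib.MAX_WBITS) == data

# "deflate" encoding: raw stream without a header (RFC 1951).
compressor = zlib.compressobj(6, zlib.DEFLATED, -zlib.MAX_WBITS)
raw = compressor.compress(data) + compressor.flush()
assert zlib.decompress(raw, -zlib.MAX_WBITS) == data

# "gzip" encoding: gzip wrapper (RFC 1952); wbits | 16 accepts it.
buf = StringIO.StringIO()
gz = gzip.GzipFile(fileobj=buf, mode="wb")
gz.write(data)
gz.close()
assert zlib.decompress(buf.getvalue(), zlib.MAX_WBITS | 16) == data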
def start_spider_barrage(self, video_url, is_save_to_db=True, is_corpus=False):
    print u"Entering start_spider_barrage."
    # Html source of the video page.
    video_html_content = self.get_html_content(video_url)
    if video_html_content is None:
        # The network connection probably failed, so the page source could not be fetched.
        Logger.print_console_info(u"Could not fetch the page html. Check that the url is correct and the network connection works!")
        return None
    # Collect the video's metadata.
    aid = self.get_video_aid(video_url)
    cid = self.get_video_cid(video_html_content)
    tags = self.get_video_tags(video_html_content)
    title = self.get_video_title(video_html_content)
    # Fetch the barrage data.
    barrages = self.get_row_video_barrage(self.barrage_xml_url(cid))
    if barrages is None:
        # Returns None when the barrage xml file fails to parse.
        return
    # Write the refreshed barrage data to the database.
    if is_save_to_db:
        # Store the video metadata in the database.
        VideoDao.add_video(cid, title, tags, aid, unicode(video_url))
        # Keep only the newly added barrages.
        barrages = self.get_refresh_video_barrage(cid, barrages)
        BarrageDao.add_barrages(barrages, cid)
    # Write the refreshed barrage data to a local file.
    self.save_barrages_to_local(cid, barrages, is_corpus)
def main():
    arg_parser = argparse.ArgumentParser(u"BilibiliSpider",
                                         description=u"grabs the barrages from bilibili videos"
                                                     u" and stores them to db.")
    arg_parser.add_argument("-u", "--urls", required=False, action="append", metavar="BILIBILI_VIDEO_URLS",
                            default=[], dest="video_urls", help="the bilibili video urls.")
    arg_parser.add_argument("-i", "--internal", required=False, metavar="INTERNAL_TIME", default=5,
                            dest="internal_time",
                            help="the interval in minutes for grabbing the bilibili barrages")
    opts = arg_parser.parse_args()
    video_urls = opts.video_urls  # List of urls parsed from the command line.
    # Note: the hard-coded list below overrides whatever was passed with -u.
    video_urls = ["http://www.bilibili.com/video/av2218236/index_1.html",
                  "http://www.bilibili.com/video/av2218236/index_2.html",
                  "http://www.bilibili.com/video/av2218236/index_3.html",
                  "http://www.bilibili.com/video/av2218236/index_4.html",
                  "http://www.bilibili.com/video/av2218236/index_5.html",
                  "http://www.bilibili.com/video/av2218236/index_6.html",
                  "http://www.bilibili.com/video/av2218236/index_7.html"]
    print video_urls
    Logger.print_console_info(u"Start grabbing barrages.\nParent process id: %s" % os.getpid())
    pool = Pool(7)
    for video_url in video_urls:
        print video_url
        pool.apply_async(grab_barrage_task, args=(video_url,))
    pool.close()
    pool.join()
    Logger.print_console_info(u"Finished grabbing barrages!")
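# Hedged usage sketch: how main() would be driven from the shell once the
# hard-coded url list above is removed (the script name is hypothetical):
#
#   python bilibilispider.py -u http://www.bilibili.com/video/av2218236/index_1.html \
#                            -u http://www.bilibili.com/video/av2218236/index_2.html \
#                            -i 5
#
# action="append" collects each -u occurrence into opts.video_urls, and -i
# sets the grab interval in minutes (default 5).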
def access_url(self, req, timeout=60):
    resp = self.__access_url_internal(req, timeout)
    if resp is False:
        Logger.print_console_info(u"Could not connect to: " + unicode(req.get_full_url()))
        return None
    else:
        return resp
def get_xin_fan_info(self, update_flag=False):
    xin_fan_list = []  # Basic info list of the new anime seasons (xinfan).
    if update_flag:
        # 1. Request the xinfan list with the default link first.
        xin_fan_count = self.__get_xin_fan_count()
        # 2. Get the number of pages of xinfan episodes; page_size defaults to 30.
        xin_fan_page_count = self.__get_xin_fan_page_count(xin_fan_count)
        # xin_fan_page_count = 2
        for index in xrange(1, xin_fan_page_count + 1):
            json_data = self.get_response_content(
                self.__construct_xin_fan_list_url(page=str(index)))
            if json_data is None:
                continue
            res_dict = json.loads(json_data, encoding='utf-8')
            if res_dict is None:
                continue
            xin_fan_list += self.__convert_dict_to_xin_fan(
                res_dict['result']['list'])
        # 3. Write the basic info of all xinfan into the database.
        XinFanDao.add_xin_fans(xin_fan_list)
    else:
        xin_fan_list = XinFanDao.get_all_xinfan()
    xin_fan_info_list = []
    for xin_fan in xin_fan_list:
        xin_fan_info_list.append((xin_fan.season_id, xin_fan.url))
    for index in xrange(0, len(xin_fan_info_list)):
        xin_fan = xin_fan_info_list[index]
        Logger.print_console_info(u"Crawl #" + unicode(index) + u" - season id: " + xin_fan[0])
        # Get the av links of all episodes of this season.
        json_data = self.get_response_content(
            self.__construct_xin_fan_detail_url(xin_fan[0]))
        try_times = 5
        anime_episodes = None
        while try_times > 0:
            anime_episodes = self.__get_anime_episodes(json_data)
            if anime_episodes is not None:
                break
            else:
                try_times -= 1
        if anime_episodes is None:
            continue
        for av_episode in anime_episodes:
            av_url = "http://www.bilibili.com/video/av" + av_episode["av_id"] + "/"
            episode_index = av_episode["index"]
            episode_id = av_episode["episode_id"]
            episode_title = av_episode["index_title"]
            self.bilibili_spider.start_spider_barrage(
                video_url=av_url, is_save_to_db=True, season_id=xin_fan[0],
                season_index=episode_index, episode_id=episode_id,
                episode_title=episode_title)
            time.sleep(1)
    return xin_fan_list
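# __get_xin_fan_page_count is not shown in this section; a minimal sketch of
# what it presumably computes (ceiling division with the default page_size of
# 30 mentioned above) -- an assumption, not the repo's actual code:
import math

def xin_fan_page_count(total_count, page_size=30):
    # Number of pages needed to list total_count items, page_size per page.
    return int(math.ceil(total_count / float(page_size)))

print xin_fan_page_count(61)  # 3 pages: 30 + 30 + 1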
def __access_url_internal(self, req, timeout=60, try_times=1):
    try:
        if try_times <= self.try_times:
            opener = urllib2.build_opener(SmartRedirectHandler)
            resp = opener.open(req, timeout=timeout)
            return resp
        else:
            return False
    except Exception as exception:
        logger.debug(unicode(exception))
        Logger.print_console_info(u"Connection failed! Attempt " + unicode(try_times) + u", reconnecting...")
        # Retrying turns out not to help when an HTTPError 502 occurs.
        # The recursive result must be propagated, otherwise the caller
        # receives None after any failed first attempt.
        return self.__access_url_internal(req, timeout, try_times + 1)
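# Recursive retries work, but an iterative loop with a small backoff keeps the
# call stack flat and makes the give-up path explicit. A hedged alternative
# sketch with the same contract (response on success, False when retries are
# exhausted), assuming the surrounding module's urllib2, SmartRedirectHandler
# and logger are in scope:
import time

def access_url_with_retries(req, timeout=60, max_tries=3):
    for attempt in xrange(1, max_tries + 1):
        try:
            opener = urllib2.build_opener(SmartRedirectHandler)
            return opener.open(req, timeout=timeout)
        except Exception as exception:
            logger.debug(unicode(exception))
            time.sleep(attempt)  # linear backoff between attempts
    return False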
def __is_same_barrage(self, last_n_barrages, barrage):
    # barrage format: (row_id, play_timestamp, ..., content)
    # last_n_barrages format: [last_barrage, last_barrage, ...]
    for last_barrage in last_n_barrages:
        # last_barrage format: (row_id\tplay_timestamp\t...\tcontent)
        last_barrage = last_barrage.split(u"\t")
        if len(last_barrage) != len(barrage):
            Logger.print_console_info(u"Error: malformed barrage, cannot tell whether the two barrages are the same.")
            continue
        # Compare every field except index 0 (the row_id), which is assigned locally.
        is_same = True
        for index in xrange(1, len(last_barrage)):
            if last_barrage[index] != barrage[index]:
                is_same = False
                break
        if is_same:
            return True
    return False
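# Toy illustration (all field values hypothetical) of the comparison above:
# field 0 is the locally assigned row_id, so two records that differ only
# there are still considered the same barrage.
stored_line = u"101\t12.5\t1\t25\t16777215\t1450000000\tNORMAL\tuid\thello"
fresh_record = (u"7", u"12.5", u"1", u"25", u"16777215", u"1450000000",
                u"NORMAL", u"uid", u"hello")
print stored_line.split(u"\t")[1:] == list(fresh_record[1:])  # True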
def start_spider_barrage(self, video_url, is_save_to_db=True, is_corpus=False,
                         season_id=None, season_index=None, episode_id=None, episode_title=None):
    print u"Entering start_spider_barrage."
    # Html source of the video page.
    video_html_content = self.get_response_content(video_url)
    if video_html_content is None:
        # The network connection probably failed, so the page source could not be fetched.
        Logger.print_console_info(u"Could not fetch the page html. Check that the url is correct and the network connection works!")
        return None
    # Collect the video's metadata, retrying the id lookup a few times.
    try_times = 5
    while try_times > 0:
        aid, cid = self.get_video_ids(episode_id)
        if (aid is not None) and (cid is not None):
            break
        else:
            try_times -= 1
    if (aid is None) or (cid is None):
        Logger.print_console_info(u"Failed to fetch the video's av_id and cid.")
        return
    tags = self.get_video_tags(video_html_content)
    title = episode_title
    meta_keywords = self.get_video_meta_keywords(video_html_content)
    # Fetch the barrage data.
    barrages = self.get_row_video_barrage(self.barrage_xml_url(cid))
    if barrages is None:
        # Returns None when the barrage xml file fails to parse.
        return
    # Write the refreshed barrage data to the database.
    if is_save_to_db:
        # Store the video metadata in the database.
        VideoDao.add_video(cid, title, tags, meta_keywords, aid, unicode(video_url),
                           season_id, season_index)
        # Keep only the newly added barrages.
        # barrages = self.get_refresh_video_barrage(cid, barrages)
        BarrageDao.add_barrages(barrages, aid)
    # Write the refreshed barrage data to a local file.
    self.save_barrages_to_local(cid, barrages, is_corpus)
import argparse
import os
from multiprocessing import Pool

from db.dao.barragedao import BarrageDao
from db.dao.videodao import VideoDao
from spider import BarrageSpider
from util.datetimeutil import DateTimeUtil
from util.fileutil import FileUtil
from util.loggerutil import Logger

"""
Crawls video info (title, category) and the corresponding barrage data from the
bilibili site.
"""

__author__ = "*****@*****.**"

logger = Logger("video-url-spider.log").get_logger()


"""
BilibiliSpider crawls the barrage data of bilibili. Call the start function with
a bilibili video_url to crawl the barrages of that video; passing any other kind
of link will fail.
"""


class BilibiliSpider(BarrageSpider):
    def __init__(self):
        # Make sure the parent class is initialized correctly.
        # http://stackoverflow.com/questions/21063228/typeerror-in-python-single-inheritance-with-super-attribute
        super(BilibiliSpider, self).__init__()

    # -----------------------------------------------------------------------------------------------------------------
    # Video info retrieval.
Analyzes the sentiment of emotionally intense clips using a sentiment lexicon.
"""
import codecs
import os

import gensim

import wordsegment.wordseg as wordseg
from analysis.model.dictconfig import DictConfig
from analysis.model.timewindow import TimeWindow
from util.fileutil import FileUtil
from util.loggerutil import Logger
from zscore import Zscore

logger = Logger(console_only=True).get_logger()


class Emotion(object):
    __EMOTION_CATEGORY_DICT = {u"乐": 0, u"好": 1, u"怒": 2, u"哀": 3, u"惧": 4, u"恶": 5, u"惊": 6}

    # Param: cid - the cid of the movie on which the sentiment analysis is performed.
    def __init__(self, cid):
        self.cid = cid
        # load_high_emotion_clips_from_file returns the tuple
        # (high_emotion_clips, global_zscore_threshold, left_zscore_threshold, right_zscore_threshould),
        # where high_emotion_clips is
        # [(left_border, right_border, left_border_seconds, right_border_seconds)].
        self.high_emotion_clips, self.global_zscore_threshold, self.left_zscore_threshold, \
            self.right_zscore_threshould = Zscore.load_high_emotion_clips_from_file(cid)
        # Barrage segmentation list with word segmentation, word replacement,
        # stop-word filtering and kaomoji replacement already applied.
        self.barrage_seg_list = wordseg.load_segment_barrages(cid)
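# Hedged sketch (a hypothetical helper, not part of the class): turning one of
# the seven coarse categories into a one-hot vector with the mapping above;
# the mangled name is how the private class attribute is reached from outside.
def emotion_one_hot(category):
    vector = [0] * len(Emotion._Emotion__EMOTION_CATEGORY_DICT)
    index = Emotion._Emotion__EMOTION_CATEGORY_DICT.get(category)
    if index is not None:
        vector[index] = 1
    return vector

print emotion_one_hot(u"怒")  # [0, 0, 1, 0, 0, 0, 0]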
#! /usr/bin/env python2.7
# -*- coding: utf-8 -*-
import codecs
import re

from spider import Spider
from util.loggerutil import Logger

logger = Logger("file_download.log", console_only=True).get_logger()


class FileDownload(Spider):
    def __init__(self):
        super(FileDownload, self).__init__()

    # Get the download links of a video.
    # The download links of a bilibili video are taken from pages such as
    # http://www.ibilibili.com/video/av5394711/
    def get_download_links(self, video_link):
        html_content = self.get_response_content(video_link)
        # The original regex was: <ul\sclass="list-group"\sid="download">.*?<a\shref ="(.*?)".*?>视频下载.*?</a>.*?
        # <a\shref ="(.*?)".*?>MP3下载.*?</a>.*?</ul>. Because it contains Chinese it never
        # matched; the exact reason is unknown (most likely a bytes-vs-unicode mismatch
        # between the pattern and the decoded html_content).
        pattern = re.compile(
            r'<ul\sclass="list-group"\sid="download">.*?'
            r'<a href ="(.*?)".*?>.*?</a>.*?<a href ="(.*?)".*?>MP3.*?</a>.*?</ul>', re.S)
        match = re.search(pattern, html_content)
        if match is None:
            return None
        video_download_link = match.group(1)
        mp3_download_link = match.group(2)
        return video_download_link, mp3_download_link
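# Quick sanity check of the regex above against a hypothetical html fragment
# (the real page markup may differ); the pattern expects the odd space before
# '=' in 'href ="', so the fragment reproduces it:
import re

test_pattern = re.compile(
    r'<ul\sclass="list-group"\sid="download">.*?'
    r'<a href ="(.*?)".*?>.*?</a>.*?<a href ="(.*?)".*?>MP3.*?</a>.*?</ul>', re.S)
sample_html = ('<ul class="list-group" id="download">'
               '<a href ="http://example.com/v.mp4" class="btn">download</a>'
               '<a href ="http://example.com/a.mp3" class="btn">MP3</a></ul>')
match = test_pattern.search(sample_html)
print match.group(1), match.group(2)  # http://example.com/v.mp4 http://example.com/a.mp3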
def grab_barrage_task(video_url):
    Logger.print_console_info(u"Child process id: %s, grabbing page: %s. Started..." % (os.getpid(), video_url))
    bili_spider = BilibiliSpider()
    bili_spider.start_spider_barrage(video_url)
    Logger.print_console_info(u"Child process id: %s, grabbing page: %s. Finished..." % (os.getpid(), video_url))
import codecs
import math
import os

import wordsegment.wordseg as wordseg
from analysis.model.timewindow import TimeWindow
from util.datetimeutil import DateTimeUtil
from util.fileutil import FileUtil
from util.loggerutil import Logger

"""
Processes the zscore data computed by matlab: finds the time window each zscore
belongs to, locates the video clip intervals with high zscores, and so on.
"""

logger = Logger("zscore.log").get_logger()


class Zscore(object):
    # Constructor, builds the zscore data.
    # Params: time_window_size - size of a time window, in seconds
    #         slide_time_interval - how many seconds to slide by when creating time windows
    #         analysis_unit_capacity - how many time windows form one unit of zscore analysis
    #         zscore_file_path - named as cid-zscore.txt
    def __init__(self, cid, zscore_file_path, time_window_size, slide_time_interval, analysis_unit_capacity):
        self.cid = cid  # The cid of the bilibili video.
        self.zscore_file_path = zscore_file_path
        self.zscore_list = []  # Elements are (time_window_index, zscore) tuples.
        self.time_window_size = time_window_size  # Size of a time window.
        self.slide_time_interval = slide_time_interval  # Slide interval between time windows.
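# Hedged sketch of the sliding-window arithmetic implied by the parameters
# above (an assumption about the intended semantics, not the repo's code):
# windows of time_window_size seconds start every slide_time_interval seconds.
def window_borders(video_seconds, time_window_size, slide_time_interval):
    borders = []
    start = 0
    while start + time_window_size <= video_seconds:
        borders.append((start, start + time_window_size))
        start += slide_time_interval
    return borders

print window_borders(30, 10, 5)  # [(0, 10), (5, 15), (10, 20), (15, 25), (20, 30)]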
# -*- coding: utf-8 -*-
import sys
import urllib
import urllib2
import zlib
from decimal import Decimal, getcontext

from util.loggerutil import Logger

"""
Provides some basic methods used by the spider classes.
"""

__author__ = "*****@*****.**"

logger = Logger(console_only=True).get_logger()


class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
    def __init__(self):
        self.result = ""

    def http_error_301(self, req, fp, code, msg, headers):
        # Delegate to the default handler, but keep the original status code
        # on the response so callers can tell a redirect happened.
        self.result = urllib2.HTTPRedirectHandler.http_error_301(
            self, req, fp, code, msg, headers)
        self.result.status = code
        return self.result

    def http_error_302(self, req, fp, code, msg, headers):
        self.result = urllib2.HTTPRedirectHandler.http_error_302(
            self, req, fp, code, msg, headers)
        self.result.status = code
        return self.result
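# Hedged usage sketch: urllib2.build_opener accepts handler classes and
# instantiates them itself; after a 301/302 redirect the handler above leaves
# the original redirect code on the response (the url is only an example).
opener = urllib2.build_opener(SmartRedirectHandler)
resp = opener.open("http://www.bilibili.com/video/av2218236/", timeout=60)
if hasattr(resp, "status"):
    print u"redirected, original status: %d" % resp.status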