def __init__(self):
    self.database = SpiderDao()
    suffix = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                       const.SPIDER_OUTPUT_FILENAME_SUFFIX)
    ts = TimeUtility.getcurrentdate(TimeUtility.TIMESTAMP_FORMAT)
    self.difffile = '{path}/{dt}/{file}'.format(
        path=SpiderConfigure.getinstance().getconfig(
            const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_PATH),
        dt=TimeUtility.getcurrentdate(),
        file=DiffController.DIFF_FILE_NAME_FORMAT.format(suffix=suffix, ts=ts))
def waibuetl(self):
    waibubackup = SpiderConfigure.getwaibubaup()
    if not FileUtility.exists(waibubackup):
        FileUtility.mkdirs(waibubackup)
    waibufile = self.etl.getqueryfromdb()
    if not FileUtility.exists(waibufile):
        Logger.getlogging().warning(
            '{waibufile} not generated!'.format(waibufile=waibufile))
        return
    outtime = 0
    self.wdownloader.upload(waibufile)
    continueflag = True
    while continueflag:
        downloadfiles = []
        while True:
            Logger.getlogging().info(
                'sleeping {sec}s......'.format(sec=self.waitingperiod))
            time.sleep(self.waitingperiod)
            outtime += self.waitingperiod
            if self.wdownloader.iscompleted():
                continueflag = False
                break
            try:
                downloadfiles = self.wdownloader.download()
                if downloadfiles:
                    break
            except:
                Logger.printexception()
            if outtime >= self.waibutimeout:
                Logger.getlogging().warning(
                    'Waibu Data Download Timeout! Spending {sec}s'.format(
                        sec=outtime))
                continueflag = False
                break
        for dfile in downloadfiles:
            starttime = TimeUtility.getcurrentdate(
                TimeUtility.TIME_FORMAT_DEFAULT)
            self.etl.wb_analysis(dfile)
            #if FileUtility.exists(waibubackup + FileUtility.getfilename(dfile)):
            #    FileUtility.remove(waibubackup + FileUtility.getfilename(dfile))
            FileUtility.move(dfile, waibubackup)
            logstring = 'PROCESSWAIBUFILE:\t{file}\t{start}\t{end}'.format(
                file=FileUtility.getfilename(dfile),
                start=starttime,
                end=TimeUtility.getcurrentdate())
            Logger.getlogging().info(logstring)
            if outtime >= self.waibutimeout:
                Logger.getlogging().warning(
                    'Waibu Data Download Timeout! Spending {sec}s'.format(
                        sec=outtime))
                continueflag = False
                break
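# Hedged illustration (not part of the spider): the poll-until-done-or-timeout
# pattern that waibuetl() above builds on, reduced to a standalone skeleton.
# is_completed/try_download are hypothetical stand-ins for
# self.wdownloader.iscompleted()/self.wdownloader.download().
import time

def poll_downloads(is_completed, try_download, waiting_period, timeout):
    waited = 0
    while True:
        time.sleep(waiting_period)
        waited += waiting_period
        if is_completed():
            return []          # remote side finished, nothing left to fetch
        files = try_download()
        if files:
            return files       # a batch arrived; the caller processes it
        if waited >= timeout:
            return []          # give up after the configured timeout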
def flush():
    # dump the S1 urls whose download failed
    SpiderConfigure.getinstance().setchannel(constant.SPIDER_CHANNEL_S1)
    SpiderConfigure.getinstance().setquery('')
    for url in SpiderReport.getinstance().s1urls:
        Logger.log(url, constant.ERRORCODE_FAIL_LOAD_DOWN)
    # dump the queries for which no url was obtained from a website
    querynositemap = {}
    for query in SpiderReport.getinstance().querysitesmap.keys():
        querynositemap[query] = 0
        for site in SpiderReport.getinstance().querysitesmap[query]:
            SpiderReport.s2queryurl(query, site, None, True)
            querynositemap[query] += 1
    for query in SpiderReport.getinstance().querysitesmap.keys():
        if query in querynositemap:
            SpiderReport.s2queryurl(query,
                                    SpiderReport.getinstance().s2sitenum,
                                    SpiderReport.getinstance().s2sitenum - querynositemap[query],
                                    True)
        else:
            SpiderReport.s2queryurl(query,
                                    SpiderReport.getinstance().s2sitenum,
                                    SpiderReport.getinstance().s2sitenum,
                                    True)
    # write the report
    filename = SpiderConfigure.getconfig(
        const.SPIDER_STORAGE_DOMAIN,
        const.SPIDER_INFO_REPORT_FILE).format(date=TimeUtility.getcurrentdate())
    FileUtility.remove(filename)
    FileUtility.writeline(filename, SpiderReport.REPORT_FORMAT.format(
        ch='CHANNEL',
        query='QUERY',
        type='TYPE',
        v1='UPLOAD',
        v2='DOWNLOAD',
        v3='NO_TEMPLATE',
        v4='NO_SITE',
        v5='WITH_CMT',
        v6='FAILED'
    ))
    for key in SpiderReport.getinstance().reportlist.keys():
        for type in SpiderReport.getinstance().reportlist[key].keys():
            r = SpiderReport.getinstance().reportlist[key][type]
            FileUtility.writeline(filename, r.tostring())
    for key in SpiderReport.getinstance().s2sitereportlist.keys():
        for type in SpiderReport.getinstance().s2sitereportlist[key].keys():
            r = SpiderReport.getinstance().s2sitereportlist[key][type]
            FileUtility.writeline(filename, r.tostring())
    FileUtility.writeline(filename, SpiderReport.getinstance().totalreport.tostring())
    FileUtility.writeline(filename, SpiderReport.getinstance().totalreport.tostring2())
    FileUtility.flush()
    # notify if the overall success rate falls below the configured threshold
    threshold = float(SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                                const.SPIDER_FAILED_THRESHOLD))
    rate = SpiderReport.getinstance().totalreport.getsuccess()
    if rate < threshold:
        Logger.getlogging().warning('success rate is lower than threshold')
        param = NotifyParam()
        param.code = NotifyParam.SPIDER_NOTIFY_OVER_FAILED
        param.message = 'success rate {rate} is lower than threshold {th}'.format(
            rate=Common.float2percent(rate),
            th=Common.float2percent(threshold))
        SpiderNotify.notify(param)
def __init__(self, taskinfo=None, download_path=None):
    self.taskinfo = taskinfo
    self.maxfilenum = 100
    self.cache_path = Storage.getstoragelocation(const.SPIDER_DONE_TEMP_PATH)
    path = SpiderConfigure.getconfig(const.SPIDER_TENCENT_PLATFORM_DOMAIN,
                                     const.SPIDER_TENCENT_PLATFORM_OUTPUT_PATH)
    if download_path:
        self.download_path = download_path
    else:
        self.download_path = PUCDownloader.DOWNLOAD_PATH.format(
            path=path, taskid=self.taskinfo.taskid)
    self.parse_tool = SpiderConfigure.getconfig(
        const.SPIDER_TENCENT_PLATFORM_DOMAIN,
        const.SPIDER_TENCENT_PLATFORM_PARSE_TOOL_IMG)
    #self.json_path = Storage.getstoragelocation(const.SPIDER_JSON_TEMP_PATH)
    self.pucbackpath = SpiderConfigure.getconfig(
        const.SPIDER_STORAGE_DOMAIN,
        const.SPIDER_PUC_BACKUP_PATH) + self.taskinfo.taskid
    self.pucbacktoday = os.path.join(self.pucbackpath, TimeUtility.getcurrentdate())
    if not FileUtility.exists(self.pucbackpath):
        FileUtility.mkdirs(self.pucbackpath)
    if not FileUtility.exists(self.pucbacktoday):
        FileUtility.mkdirs(self.pucbacktoday)
    self.done_file = self.pucbacktoday + '/done/'
    self.json_path = self.pucbacktoday + '/json/'
    if not FileUtility.exists(self.done_file):
        FileUtility.mkdirs(self.done_file)
    if not FileUtility.exists(self.json_path):
        FileUtility.mkdirs(self.json_path)
    self.pucsavedays = 0
    self.clear()
def __init__(self):
    self.factory = SiteFactory()
    self.conf = SpiderConfigure.getinstance()
    self.urlbackuppath = SpiderConfigure.getconfig(
        const.SPIDER_STORAGE_DOMAIN,
        const.SPIDER_URL_BACKUP_PATH) + TimeUtility.getcurrentdate()
    self.period = int(SpiderConfigure.getinstance().getlastdays())
def getsearchresult(self, params):
    info = params.customized['query']
    xpath = XPathUtility(html=params.content)
    hrefs = xpath.xpath('//li/h3/a/@href')
    titles = xpath.getlist('//li/h3/a')
    pubtimes = xpath.xpath('//li/p')
    today = datetime.datetime.strptime(
        TimeUtility.getcurrentdate(),
        TimeUtility.DATE_FORMAT_DEFAULT).date()
    urllist = []
    for index in range(0, len(titles), 1):
        # the title must contain the query keyword
        # if titles[index].find(info) > -1:
        if Common.checktitle(info, titles[index]):
            pubtimestr = TimeUtility.getuniformdate(pubtimes[index].text)
            pubtime = datetime.datetime.strptime(
                pubtimestr, TimeUtility.DATE_FORMAT_DEFAULT).date()
            interval = today - pubtime
            # keep the url only if the publish time is within the query period
            if interval.days <= self.querylastdays:
                urllist.append(hrefs[index])
            else:
                # Results are sorted by time: once one item falls outside the
                # query period, all the following ones will too, so stop here.
                break
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
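# Hedged illustration (not part of the spider): the filter-with-early-break pattern
# used by getsearchresult() above. Because the search results arrive sorted by
# publish time (newest first), scanning can stop at the first item that falls
# outside the query window. The data layout below is hypothetical.
import datetime

def filter_recent(items, querylastdays, today=None):
    """items: list of (url, publish_date) tuples, newest first."""
    today = today or datetime.date.today()
    urllist = []
    for url, pubdate in items:
        if (today - pubdate).days <= querylastdays:
            urllist.append(url)
        else:
            break  # time-sorted input: every later item is older, too
    return urllist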
def loop(self):
    # loop over the URLs of both the S1 and S2 channels
    continueflag = True
    while continueflag:
        downloadfiles = []
        while True:
            # check for timeout
            if self.istimeout():
                param = NotifyParam()
                param.code = NotifyParam.SPIDER_NOTIFY_TIMEOUT
                param.message = 'Spider timeout for %s o\'clock, stop' % constant.SPIDER_RUN_TIMEOUT_HOUR
                SpiderNotify.notify(param)
                continueflag = False
                break
            if self.downloader.iscompleted():
                continueflag = False
                break
            try:
                downloadfiles = self.downloader.download()
                self.upload()
                if len(downloadfiles) > 0:
                    break
                else:
                    Logger.getlogging().info('sleeping {0}s......'.format(
                        self.waitingperiod))
                    time.sleep(self.waitingperiod)
            except:
                Logger.printexception()
        for dfile in downloadfiles:
            starttime = TimeUtility.getcurrentdate(
                TimeUtility.TIME_FORMAT_DEFAULT)
            self.etl.processfile(dfile)
            logstring = 'PROCESSFILE:\t{file}\t{start}\t{end}'.format(
                file=FileUtility.getfilename(dfile),
                start=starttime,
                end=TimeUtility.getcurrentdate())
            Logger.getlogging().info(logstring)
            if self.istimeout():
                param = NotifyParam()
                param.code = NotifyParam.SPIDER_NOTIFY_TIMEOUT
                param.message = 'Spider timeout for %s o\'clock, stop' % constant.SPIDER_RUN_TIMEOUT_HOUR
                SpiderNotify.notify(param)
                continueflag = False
                break
        self.upload()
def __init__(self):
    self.reportlist = {}
    self.s2sitereportlist = {}
    self.s2urlfilepath = SpiderConfigure.getconfig(
        const.SPIDER_STORAGE_DOMAIN,
        const.SPIDER_S2_QUERY_URLS_FILE).format(date=TimeUtility.getcurrentdate())
    FileUtility.remove(self.s2urlfilepath)
    self.totalreport = Report()
    self.totalreport.channel = 'SUM'
    self.s1urls = []
    self.querysitesmap = {}
    self.s2sitenum = 0
    self.s2urlsitemap = {}
def getfilename(self, url):
    # channel
    self.channel = SpiderConfigure.getinstance().getchannel()
    # S2 query
    self.query = SpiderConfigure.getinstance().getquery()
    # S2 page type
    self.type = SpiderConfigure.getinstance().gettype()
    if self.channel == SPIDER_CHANNEL_S2:
        q = Common.md5(self.query)
    else:
        q = self.query
    return Storage.SPIDER_STORE_FILENAME_FORMAT.format(
        path=self.cache_path,
        date=TimeUtility.getcurrentdate(),
        channel=self.channel,
        query=q,
        filename=Common.md5(url))
def __init__(self):
    self.url_beforenewsinfo_map = {SQLDAO.SPIDER_TABLE_NEWS_CMTNUM: {},
                                   SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM: {},
                                   SQLDAO.SPIDER_TABLE_NEWS_VOTENUM: {},
                                   SQLDAO.SPIDER_TABLE_NEWS_FANSNUM: {}}
    self.url_beforenewsnum_map = {}
    self.url_curcmtcontent_map = {}
    self.url_curcmtnum_map = {}
    self.url_beforecmtnum_map = {}
    date = TimeUtility.getcurrentdate()
    path = os.path.join(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                                  const.SPIDER_OUTPUT_PATH), date)
    suffix = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                       const.SPIDER_OUTPUT_FILENAME_SUFFIX)
    self.outputpath = FileFormat.OUTPUTPATH.format(path=path,
                                                   suffix=suffix,
                                                   date=date.replace('-', '_'),
                                                   ts=int(time.time()))
    self.errorinfopath = FileFormat.ERRORINFOPATH.format(path=path,
                                                         suffix=suffix,
                                                         date=date.replace('-', '_'),
                                                         ts=int(time.time()))
    self.pushpath = os.path.join(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                                           const.SPIDER_PUSH_PATH_MASTER), date)
    if not FileUtility.exists(path):
        FileUtility.mkdirs(path)
def step2(self, params):
    try:
        page = params.customized['page']
        soup = BeautifulSoup(params.content, 'html5lib')
        subject = soup.find(attrs={'id': re.compile(self.commentCsskey['subject_idkey'])})
        if subject:
            subject = subject.get_text()
        tables = soup.find_all('table',
                               attrs={'id': re.compile(self.commentCsskey['table_idkey']),
                                      'summary': re.compile(self.commentCsskey['table_summarykey'])})
        if page == 1:
            tables = tables[1:]
        if tables:
            # seed the list with the current time so min() below never sees an empty list
            publishlist = [TimeUtility.getcurrentdate(TimeUtility.DEFAULTFORMAT)]
            for table in tables:
                try:
                    nick = table.select_one('.xw1').get_text()
                except:
                    nick = 'anonymous'
                try:
                    curtimeobj = table.find(attrs={'id': re.compile(self.commentCsskey['time_idkey'])})
                    if curtimeobj.select_one('span'):
                        curtime = curtimeobj.select_one('span').get('title')
                    else:
                        curtime = curtimeobj.get_text()
                except:
                    curtime = TimeUtility.getuniformtime(0)
                try:
                    content = table.find(attrs={'id': re.compile(self.commentCsskey['content_idkey'])}).get_text()
                except:
                    content = ''
                publishlist.append(curtime)
                CMTStorage.storecmt(params.originalurl, content, curtime, nick)
            if not self.isnewesttime(params.originalurl, min(publishlist)):
                return False
        else:
            Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_COMMNETS)
        return True
    except:
        Logger.printexception()
        Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_SITE)
def __init__(self):
    self.upload_file_list = {}
    self.impls = []
    self.implsindex = 0
    self.initcommon()
    self.wimpls = []
    self.wimplsindoex = 0
    self.initwebkit()
    self.limpls = []
    self.limplsindex = 0
    self.initlocal()
    self.tempurlpath = Storage.getstoragelocation(const.SPIDER_URLS_TEMP_PATH)
    self.urlbackuppath = SpiderConfigure.getconfig(
        const.SPIDER_STORAGE_DOMAIN,
        const.SPIDER_URL_BACKUP_PATH) + TimeUtility.getcurrentdate()
    # retry bookkeeping for files whose download failed
    self.retransmissionfiles = {}
    self.all_retransmissionfiles = {}
    self.retransmissionlimitnum = 3
    self.filetime = 0
def getwaibubaup():
    return SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                     const.SPIDER_WAIBU_BACKUP_PATH) + TimeUtility.getcurrentdate()
def processVideo(self, params):
    if params.step == MofangS2Query.S2QUERY_FIRST_PAGE:
        # Step 2: read comments['totalnums'] from the returned JSON to get the total
        # number of videos. Each JSON response carries 20 items, so the page count is
        # the total divided by 20; that count is then written into the page parameter.
        info = params.customized['query']
        keyvalue = Common.urlenc(info)
        try:
            jsondata = json.loads(params.content)
            comments_count = jsondata['totalnums']
        except:
            Logger.getlogging().warning('{}:40000'.format(params.url))
            return
        # nothing found, so return
        if int(comments_count) == 0:
            return
        page_count = int(math.ceil(float(comments_count) / self.DEFAULT_PAGE_SIZE))
        # use page_count to build all search-result urls (most recent week)
        querylist = []
        if page_count > 0:
            for page in range(1, page_count + 1, 1):
                url = MofangS2Query.QUERY_TEMPLATE.format(
                    key=keyvalue,
                    pageno=page,
                    pagesize=self.DEFAULT_PAGE_SIZE)
                Logger.getlogging().debug(url)
                querylist.append(url)
            self.__storeqeuryurllist__(querylist,
                                       MofangS2Query.S2QUERY_EACH_PAGE,
                                       {'query': info})
    elif params.step == MofangS2Query.S2QUERY_EACH_PAGE:
        # Step 3: from the JSON returned by Step 2, read
        #   title:        comments['data'][0..19]['title']
        #   link:         comments['data'][0..19]['url']
        #   publish time: comments['data'][0..19]['inputtime'] (truncated to the first
        #                 10 characters; only the date part is compared)
        info = params.customized['query']
        try:
            jsondata = json.loads(params.content)
            searchresult = jsondata['data']
        except:
            Logger.getlogging().warning('{}:40000'.format(params.url))
            return
        # current date (as a date value)
        today = datetime.datetime.strptime(TimeUtility.getcurrentdate(),
                                           TimeUtility.DATE_FORMAT_DEFAULT)
        urllist = []
        for index in range(0, len(searchresult), 1):
            if searchresult[index]['title'] is not None:
                # keep the url only if the title contains the query keyword
                # if searchresult[index]['title'].find(info) > -1:
                if Common.checktitle(info, searchresult[index]['title']):
                    if searchresult[index]['inputtime'] is not None:
                        pubtime = getuniformtime(str(searchresult[index]['inputtime']))
                        if compareNow(pubtime, int(self.querylastdays)):
                            urllist.append(searchresult[index]['url'])
                    else:
                        # no publish time available: assume it is within the period
                        urllist.append(searchresult[index]['url'])
        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
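# Hedged illustration (not part of the spider): the pagination math used by
# processVideo() above. With DEFAULT_PAGE_SIZE items per JSON response, the number
# of result pages is ceil(total / page_size); zero pages means nothing to fetch.
import math

def page_count(total_results, page_size=20):
    return int(math.ceil(float(total_results) / page_size))

# page_count(0)  -> 0  (no query urls generated)
# page_count(41) -> 3  (urls for pages 1..3 built from QUERY_TEMPLATE)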
def process(self, proparam):
    Logger.getlogging().info(proparam.url)
    try:
        if proparam.step is ishangmanComments.STEP_1:
            # extract the parameter values from the url
            articleIds = re.findall(r'^http://(\w+)\.ishangman\.com/\w+/(\d+)',
                                    proparam.url)[0]
            articleId1 = articleIds[0]
            articleId2 = articleIds[1]
            # comment type
            commenttype = int(self.r.parse(ur'commenttype = (.*);', proparam.content)[0])
            # first page of comments
            url = ishangmanComments.COMMENTS_URL % (articleId1, articleId2, commenttype, 1)
            self.storeurl(url, proparam.originalurl, ishangmanComments.STEP_2,
                          {'articleId1': articleId1,
                           'articleId2': articleId2,
                           'commenttype': commenttype})
        elif proparam.step == ishangmanComments.STEP_2:
            articleId1 = proparam.customized['articleId1']
            articleId2 = proparam.customized['articleId2']
            commenttype = proparam.customized['commenttype']
            # get the comment count
            xhtml = XPathUtility(html=proparam.content)
            if articleId1 == 'comic':
                comments_count = int(
                    xhtml.getlist('//*[contains(@class,"ismcartondiv1")]/p/strong')[0])
                if comments_count:
                    NewsStorage.setcmtnum(proparam.originalurl, comments_count)
            else:
                comments_count = int(
                    self.r.parse(ur'(\d+).*',
                                 xhtml.getlist('//*[@class="comment_lctwidl"]/p')[0])[0])
                if comments_count:
                    NewsStorage.setcmtnum(proparam.originalurl, comments_count)
            # get the number of comment pages
            cmtnum = CMTStorage.getcount(proparam.originalurl, True)
            if int(comments_count) == 0:
                return
            page_num = int(math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE))
            if page_num >= self.maxpages:
                page_num = self.maxpages
            # build the comment page urls
            for page in range(1, page_num + 1, 1):
                url = ishangmanComments.COMMENTS_URL % (articleId1, articleId2, commenttype, page)
                self.storeurl(url, proparam.originalurl, ishangmanComments.STEP_3,
                              {'articleId1': articleId1})
        elif proparam.step == ishangmanComments.STEP_3:
            try:
                Logger.getlogging().debug(proparam.originalurl)
                commentsInfo = []
                articleId1 = proparam.customized['articleId1']
                xparser = XPathUtility(proparam.content)
                # get the comments
                if articleId1 == 'comic':
                    # forum comments
                    soup = BeautifulSoup(proparam.content, 'html5lib')
                    comments = soup.select('.ismcartondiv2')
                else:
                    # forum comments
                    comments = xparser.getcomments('/html/body/div/span[2]/p[1]')
                    # forum comment times
                    updateTime = xparser.getcomments('/html/body/div/span[2]/div[1]')
                # extract each comment
                for index in range(0, int(len(comments)), 1):
                    cmti = []
                    if articleId1 == 'comic':
                        publictime = self.r.parse(ur'(\d{2}-\d+ \d+:\d+)',
                                                  comments[index].get_text())[0]
                        if publictime:
                            # the timestamp carries no year; infer it from the month
                            cmt_month = publictime.split("-")[0]
                            curmonth = time.localtime().tm_mon
                            if int(cmt_month) < curmonth:
                                publictime = TimeUtility.getcurrentdate()[0:4] + '-' + publictime
                            else:
                                publictime = '2016' + '-' + publictime
                        curtime = TimeUtility.getuniformtime(publictime)
                        content = comments[index].get_text().split('\n')[0]
                        # nick = comments[1].get('nickname', 'anonymous')
                        # if not CMTStorage.exist(proparam.originalurl, content, curtime, nick):
                        #     CMTStorage.storecmt(proparam.originalurl, content, curtime, nick)
                    else:
                        publictime = updateTime[index][:-8]
                        tm = getuniformtime(publictime)
                        if NewsStorage.storeupdatetime(proparam.originalurl, tm):
                            cmti.content = comments[index]
                            commentsInfo.append(cmti)
                # save the fetched comment info
                if len(commentsInfo) > 0:
                    self.commentstorage.store(proparam.originalurl, commentsInfo)
            except:
                Logger.printexception()
                Logger.getlogging().error(
                    'extract comment error from {site}'.format(site=proparam.url))
        else:
            Logger.getlogging().error("proparam.step == %d", proparam.step)
    except Exception, e:
        traceback.print_exc()
def removecachefile():
    cache = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                      const.SPIDER_TEMPLATE_WORK_DIRECTORY)
    databackupfolder = SpiderConfigure.getconfig(
        const.SPIDER_STORAGE_DOMAIN,
        const.SPIDER_DATA_BACKUP_PATH) + TimeUtility.getcurrentdate(TimeUtility.TIMESTAMP_FORMAT)
    if FileUtility.exists(cache):
        FileUtility.move(cache, databackupfolder)
        FileUtility.rmdir(cache)
    limit = int(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                          const.SPIDER_OUTPUT_PATH_LIMIT))
    databackuppath = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                               const.SPIDER_DATA_BACKUP_PATH)
    if FileUtility.exists(databackuppath):
        validdate = TimeUtility.getdatebefore(limit, '%Y%m%d000000')
        for s in os.listdir(databackuppath):
            if s < validdate:
                fullpath = os.path.join(databackuppath, s)
                Logger.getlogging().info('remove cache folder ' + fullpath)
                FileUtility.rmdir(fullpath)
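# Hedged illustration (not part of the spider): why the plain string comparison
# `s < validdate` in removecachefile() above is a valid retention check. Backup
# folder names use the fixed-width '%Y%m%d000000' timestamp format, so lexicographic
# order coincides with chronological order. The folder names below are made up.
import datetime

def is_expired(folder_name, keep_days, today=None):
    today = today or datetime.date.today()
    validdate = (today - datetime.timedelta(days=keep_days)).strftime('%Y%m%d000000')
    return folder_name < validdate  # fixed-width digit strings compare like dates

# is_expired('20160101000000', 30) -> True for any run more than 30 days later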
def scanning():
    whoami = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.SPIDER_LOCAL_WHOAMI)
    scanningPath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                             whoami + constant.DOWNLOADER_URL_PATH)
    donepath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                         whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.removefiles(donepath)
    backupPath = os.path.join(SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                                        const.DOWNLOADER_URL_BACKUP),
                              TimeUtility.getcurrentdate())
    interval = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.DOWNLOADER_INTERVAL)
    FileUtility.mkdirs(scanningPath)
    FileUtility.mkdirs(backupPath)
    while True:
        Logger.getlogging().debug('scanning')
        flag = False
        for filename in os.listdir(scanningPath):
            try:
                urlfilepath = os.path.join(scanningPath, filename)
                backupfile = os.path.join(backupPath, filename)
                if os.path.isfile(urlfilepath) and 'tmp' not in filename:
                    Logger.getlogging().info('Get url file:{file}'.format(file=filename))
                    FileUtility.copy(urlfilepath, backupfile)
                    download(urlfilepath)
                    if not flag:
                        flag = True
            except:
                Logger.printexception()
        if not flag:
            Logger.getlogging().debug('scanning interval sleeping {interval}s'.format(interval=interval))
            time.sleep(int(interval))