def process(self, params):
    """Parse an S2 query page and persist its basic page info.

    Runs the URL's XPath templates over ``params.content`` to extract
    title/body/publish time, falls back to the values already carried on
    ``params`` for any field the templates did not fill, then stores the
    merged record via ``NewsStorage.seturlinfos``.

    Args:
        params: download/parse context carrying ``url``, ``content``,
            ``page_title``, ``page_body``, ``html_time`` and the
            ``customized`` dict.

    Returns:
        True (early exit) when running on the S2 channel and ``params``
        carries no S2 website type; otherwise ``None``.
    """
    # S2 Query Process: on the S2 channel, skip entries that carry no
    # website-type hint.
    # NOTE(review): nesting reconstructed from a collapsed source line --
    # confirm the early return is guarded by both conditions.
    if SPIDER_CHANNEL_S2 == SpiderConfigure.getinstance().getchannel():
        if SPIDER_S2_WEBSITE_TYPE not in params.customized:
            return True
    xparser = XPathUtility(params.content)
    # Fix: dropped the original's unused local `maxitmes = 0` (typo of
    # "maxitems"; never read anywhere in the method).
    pageinfo = PageBasicInfo()
    template = None
    # Try every XPath template registered for this URL; each iteration
    # overwrites pageinfo, so the last matching template wins.
    for template in TemplateManager.getxpaths(params.url):
        Logger.getlogging().debug('URL_TEMPLATE {url}\t{template}'.format(
            url=params.url,
            template=template[TemplateManager.XPATH_KEY_URL_TEMPLATE]))
        pageinfo, items = self.parsefromcontent(params, template, xparser)
    if constant.SPIDER_S2_WEBSITE_TYPE in params.customized:
        pageinfo.type = params.customized[constant.SPIDER_S2_WEBSITE_TYPE]
    #if not params.page_title and not pageinfo.title and not params.lastretry:
        #return False
    # template is still None only when no XPath template matched the URL.
    if template is None:
        Logger.log(params.url, constant.ERRORCODE_SITE_NOGET_TEMPLATE)
    # Value override: fill any field the templates failed to extract with
    # the values already present on params.
    pageinfo.url = params.url
    if not pageinfo.title:
        pageinfo.title = params.page_title
    if not pageinfo.body:
        pageinfo.body = params.page_body
    if not pageinfo.pubtime:
        pageinfo.pubtime = params.html_time
    NewsStorage.seturlinfos(pageinfo)
def __storeurllist__(self, urllist, type=constant.SPIDER_S2_WEBSITE_VIDEO, referlist=None):
    """Store a batch of URLs into the hot database.

    For each URL that is not already present in the cold database, a
    ``PageBasicInfo`` record tagged with ``type`` is written through
    ``NewsStorage.seturlinfos``.

    Args:
        urllist: iterable of URL strings to store.
        type: website type tag written onto each record.
        referlist: optional list of referer URLs parallel to ``urllist``.
            Kept for backward compatibility; only referenced by the
            commented-out legacy path below.
    """
    # Fix: the original declared `referlist=[]` -- a mutable default
    # argument shared across all calls. Normalized to the None sentinel.
    # Also dropped the unused local `count` (only touched in the
    # commented-out legacy code).
    if referlist is None:
        referlist = []
    index = 0
    for url in urllist:
        params = PageBasicInfo()
        params.url = url
        params.type = type
        # Check whether the URL exists in the cold database; insert into
        # the hot database only when it does not.
        if not NewsStorage.exist_cold(url):
            NewsStorage.seturlinfos(params)
        #params = {constant.SPIDER_S2_WEBSITE_TYPE: type,
        #constant.SPIDER_CHANNEL: constant.SPIDER_CHANNEL_S1}
        #url = url.strip()
        #if not URLManager.getinstance().exist(url):
        #count += 1
        #if referlist:
        #params[SiteS2Query.REFER_URL] = referlist[index]
        #urlcontext = URLContext()
        #urlcontext.url = url
        #urlcontext.type = URLContext.S1_MAIN_BODY
        #urlcontext.originalurl = url
        #urlcontext.customized = params
        #URLManager.getinstance().storeurl(url, urlcontext, constant.REQUEST_TYPE_WEBKIT)
        index += 1
def __storeurl__(self, url, publishdate, type=constant.SPIDER_S2_WEBSITE_VIDEO):
    """Store a single URL into the hot database.

    Builds a ``PageBasicInfo`` record carrying the URL, website type and
    publish date; the record is written only when the URL is absent from
    the cold database.
    """
    record = PageBasicInfo()
    record.url = url
    record.type = type
    record.pubtime = publishdate
    # Already tracked in the cold database -> nothing to insert into hot.
    if NewsStorage.exist_cold(url):
        return
    NewsStorage.seturlinfos(record)
def process(self, params):
    """Backfill title, body and publish date for a downloaded URL.

    Prefers the page title already carried on ``params``; when absent,
    extracts it from the raw content's ``<title>`` tag, then writes
    title / body / publish date to storage for ``params.url``.

    Args:
        params: download context carrying ``url``, ``content``,
            ``page_title``, ``page_body`` and ``html_time``.
    """
    title = params.page_title
    if not title:
        title = XPathUtility(params.content).gettitle('/html/head/title')
    # Set the URL's title, body and publish-time info.
    # Fix: renamed the local from `dict`, which shadowed the builtin.
    urlinfo = {
        MongoDAO.SPIDER_COLLECTION_NEWS_TITLE: title,
        MongoDAO.SPIDER_COLLECTION_NEWS_BODY: params.page_body,
        MongoDAO.SPIDER_COLLECTION_NEWS_PUBLISH_DATE: params.html_time
    }
    NewsStorage.seturlinfos(params.url, urlinfo)
def wb_analysis(self, filepath):
    """Analyse one external ("waibu") data file and upload its records.

    Each line of the file is a JSON object with ``query``, ``url``,
    ``title``, ``body``, ``pubtime`` and optional ``clicknum``. Parsed
    records are grouped by query into ``URLManager.waibustorage`` (and a
    run-local copy), then every record is stored via
    ``NewsStorage.seturlinfos`` under its query.

    Args:
        filepath: path to the waibu dump file; the filename decides the
            website type (tencent-video files are tagged as video).
    """
    Logger.getlogging().info(
        'Now, Start to analysis Waibu file {fl}'.format(fl=filepath))
    # File naming convention decides the website type.
    # Fix: renamed the local from `type`, which shadowed the builtin.
    if '302_tencent_video' in filepath:
        website_type = constant.SPIDER_S2_WEBSITE_VIDEO
    else:
        website_type = constant.SPIDER_S2_WEBSITE_NEWS
    self.conf.setchannel(constant.SPIDER_CHANNEL_S2)
    lines = FileUtility.readlines(filepath)
    tempwaibustorage = {}
    for line in lines:
        try:
            line = json.loads(line)
            params = PageBasicInfo()
            params.query = line['query']
            params.url = line['url']
            params.title = Common.strfilter(line['title'])
            params.body = Common.strfilter(line['body'])
            params.pubtime = line['pubtime']
            clicknum = line.get('clicknum', 0)
            if clicknum:
                params.clicknum = int(clicknum)
            params.type = website_type
            if params.query not in URLManager.waibustorage:
                URLManager.waibustorage[params.query] = []
            if params.query not in tempwaibustorage:
                tempwaibustorage[params.query] = []
            URLManager.waibustorage[params.query].append(params)
            tempwaibustorage[params.query].append(params)
        except Exception:
            # Fix: narrowed from a bare `except:` so KeyboardInterrupt /
            # SystemExit are no longer swallowed; malformed lines are
            # still logged and skipped (best-effort parsing preserved).
            Logger.printexception()
    Logger.getlogging().debug(
        'Now, Starting Select url to Insert and Update for uploading WAIBU data!'
    )
    # Upload every record parsed in this run under its owning query.
    for query in tempwaibustorage:
        paramslist = tempwaibustorage[query]
        for params in paramslist:
            self.conf.setquery(query)
            NewsStorage.seturlinfos(params)
def dumpurls(self):
    """Dump this machine's per-query url lists into the matching url file.

    For every query owned by the local machine flag, two passes run:
      1. re-store (``NewsStorage.seturlinfos``) urls whose publish date --
         or create date, when publish date is the zero time -- lies within
         the last ``self.period`` days;
      2. re-enqueue through ``URLManager`` the rows whose create date
         equals this run's start time.
    """
    # Dump the urllist for this machine's queries and store it into the
    # corresponding file.
    s2file = SpiderConfigure.getinstance().gets2file()
    s2temppath = Storage.getstoragelocation(
        const.SPIDER_QUERY_TEMP_PATH) + FileUtility.getfilename(s2file)
    #querys = [''] + QueryStorage.getinstance().getlocalquerys(s2temppath, ETLController.LOCALMACHINEFLAG)
    querys = QueryStorage.getinstance().getlocalquerys(
        s2temppath, ETLController.LOCALMACHINEFLAG)
    for query in querys:
        Logger.getlogging().debug(
            'Now, Starting Select url to Insert and Update for uploading location urlfile!'
        )
        self.conf.setchannel(constant.SPIDER_CHANNEL_S2)
        self.conf.setquery(query)
        # Please keep the commented-out SQL below; do not delete.
        # 1. Convert the data inside the period:
        #    1.1 publishdate is set: must be within the last week
        #    2.1 publishdate is 0: fall back to create time, within the last week
        #wheref = '{key1}={val1} and {key2}={val2} and {createdate}!={starttime} and \
        #(({time1}!={time0} and TIMESTAMPDIFF(SECOND, now(), {time1}) <= {secs}) or \
        #({time1}={time0} and TIMESTAMPDIFF(SECOND, now(), FROM_UNIXTIME({time2}, {timeformat})) <= {secs}))'
        #where = wheref.format(key1=SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG, val1=ETLController.LOCALMACHINEFLAG,
        #key2=SQLDAO.SPIDER_TABLE_NEWS_QUERY, val2='\"'+query+'\"',
        #createdate = SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
        #starttime = SpiderConfigure.getinstance().starttime(),
        #time0='\"'+TimeUtility.getuniformtime(0)+'\"',
        #time1=SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
        #time2=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
        #timeformat = '\"'+TimeUtility.SQLTIMEFORMAT+'\"',
        #secs =self.period * 24*60*60
        #)
        where = {
            SQLDAO.SPIDER_TABLE_NEWS_QUERY: query,
            SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: ETLController.LOCALMACHINEFLAG
        }
        Logger.getlogging().debug(
            'Query condition: {where}'.format(where=str(where)))
        results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where)
        urltemplist = []
        for result in results:
            data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
            publishdate = data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE]
            createdate = data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE]
            url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
            # In-period check: zero publish date -> judge by create date,
            # otherwise judge by the publish date itself.
            if (publishdate == TimeUtility.getintformtime(0) and SQLDAO.gettime() - createdate <= self.period * 24*60*60) or \
               (publishdate != TimeUtility.getintformtime(0) and SQLDAO.gettime() - TimeUtility.getinttime(publishdate) <= self.period * 24*60*60):
                # NOTE(review): block below reconstructed as nested under the
                # dedupe check (store each url once) -- confirm against the
                # original formatting.
                if url not in urltemplist:
                    urltemplist.append(url)
                    params = PageBasicInfo()
                    params.url = url
                    NewsStorage.seturlinfos(params)
        # 2. Extract the rows whose create date equals this run's start time.
        URLFileManager.getinstance().generateurlfilepath()
        where = {
            SQLDAO.SPIDER_TABLE_NEWS_QUERY: query,
            SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: ETLController.LOCALMACHINEFLAG,
            SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE: SpiderConfigure.getinstance().starttime()
        }
        results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where)
        urllist = []
        linecount = 0
        for result in results:
            data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
            url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
            urllist.append(url)
            context = URLContext()
            context.originalurl = url
            context.type = URLContext.S1_MAIN_BODY
            context.customized[constant.SPIDER_S2_WEBSITE_TYPE] = data[
                SQLDAO.SPIDER_TABLE_NEWS_TYPE]
            Logger.getlogging().debug(url)
            URLManager.getinstance().storeurl(url, context, REQUEST_TYPE_WEBKIT)
            linecount += 1