def download(urlfilepath):
    whoami = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.SPIDER_LOCAL_WHOAMI)
    donepath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.mkdirs(donepath)
    filename = os.path.basename(urlfilepath)
    writeTmpfile = os.path.join(donepath, filename + '.temp')
    writefile = os.path.join(donepath, filename + '.txt.' + str(int(time.time())) + '.done')
    if os.path.exists(writeTmpfile):
        os.remove(writeTmpfile)
    if os.path.exists(writefile):
        os.remove(writefile)
    httpsflag = False
    if constant.DEBUG_FLAG == constant.DEBUG_FLAG_WINDOWS:
        readlines = FileUtility.readlines(urlfilepath)
        for line in readlines:
            if line.strip().startswith('https'):
                httpsflag = True
                break
    # Create an empty temp file so the rename below always has a source.
    with open(writeTmpfile, 'a+') as filetemp:
        filetemp.write('')
    # Dispatch on the url-file suffix (or https under Windows debug): webkit, POST, or GET.
    if urlfilepath.endswith(constant.WEBKIT_FILE_SUFFIX) or httpsflag:
        downWebkit(urlfilepath, writeTmpfile)
    elif urlfilepath.endswith(constant.POST_FILE_SUFFIX):
        downPost(urlfilepath, writeTmpfile)
    else:
        downGet(urlfilepath, writeTmpfile)
    if os.path.exists(writeTmpfile):
        os.rename(writeTmpfile, writefile)
        Logger.getlogging().debug('DoneFile Download Success: {f}'.format(f=writefile))
    FileUtility.remove(urlfilepath)
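
# Usage sketch (an assumption, not part of the original module): download() consumes
# one url file per call and removes it once the .done file is in place, so a driver
# only needs to walk a directory of pending url files. The pendingdir argument and
# the glob pattern here are hypothetical.
def downloadall(pendingdir):
    import glob
    for urlfile in glob.glob(os.path.join(pendingdir, '*')):
        download(urlfile)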

def storagequery(self):
    QueryStorage.updatedb()
    SpiderConfigure.getinstance().setchannel(SPIDER_CHANNEL_S2)
    s2file = SpiderConfigure.getinstance().gets2file()
    if FileUtility.exists(s2file):
        lines = FileUtility.readlines(s2file)
        for strquery in lines:
            QueryStorage.getinstance().storequery(strquery)
            QueryStorage.getinstance().storewaibuquery(strquery)
    tiebafile = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_S3_INPUT_FILE)
    if FileUtility.exists(tiebafile):
        lines = FileUtility.readlines(tiebafile)
        for strquery in lines:
            if not self.checks3query(strquery):
                continue
            query = strquery.split('\t')[0].strip()
            url = strquery.split('\t')[1].strip()
            QueryStorage.getinstance().storetiebaquery(query, url)
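
# checks3query() is referenced above but not shown in this section. Given the
# split('\t') parsing that follows it, a plausible sketch (an assumption, not the
# original implementation) is a guard that the line really is a tab-separated
# "query\turl" pair:
def checks3query(self, strquery):
    parts = strquery.split('\t')
    # Require both fields, with a url-looking second field.
    return bool(len(parts) >= 2 and parts[0].strip() and parts[1].strip().startswith('http'))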

def s2upload(self, sfile):
    if FileUtility.exists(sfile):
        lines = FileUtility.readlines(sfile)
        for line in lines:
            try:
                query = line.strip()
                self.conf.setchannel(SPIDER_CHANNEL_S2)
                self.conf.setquery(query)
                URLFileManager.getinstance().generateurlfilepath()
                allsite = self.factory.getall()
                for site in allsite:
                    site.s2query(query)
            except:
                Logger.printexception()

def downGet(urlfilepath, writeTmpfile, second=2):
    get = Get()
    try:
        lines = FileUtility.readlines(urlfilepath)
        for line in lines:
            try:
                jsonstr = get.get(line.strip())
                if not jsonstr:
                    Logger.getlogging().debug('Download URL failed: {url}'.format(url=line.strip()))
                    continue
                with open(writeTmpfile, 'a+') as filetemp:
                    filetemp.write(jsonstr + '\n')
                # Throttle between requests to avoid hammering the target site.
                time.sleep(second)
            except:
                Logger.printexception()
    except:
        Logger.printexception()
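
# The Get helper used by downGet() is project-internal and not shown here. A minimal
# sketch of the interface relied on above -- get.get(url) returning the response body
# as text, or a falsy value on failure -- might look like this (an assumption; the
# real class likely adds headers, retries, and encoding handling):
import requests

class Get(object):
    def get(self, url, timeout=30):
        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code == 200:
                return response.text
        except requests.RequestException:
            pass
        # Falsy return matches the `if not jsonstr` failure check in downGet().
        return ''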

def s3upload(self, tiebafile):
    lines = FileUtility.readlines(tiebafile)
    querylist = []
    sitelist = []
    self.conf.setchannel(SPIDER_CHANNEL_S2)
    for strquery in lines:
        query = strquery.split('\t')[0].strip()
        url = strquery.split('\t')[1].strip()
        Logger.getlogging().debug(query)
        Logger.getlogging().debug(url)
        self.conf.setquery(query)
        URLFileManager.getinstance().generateurlfilepath()
        querylist.append(query)
        site = self.factory.getsite(url)
        site.s2query(url)
        if site not in sitelist:
            sitelist.append(site)

def s1upload(self, sfile):
    if FileUtility.exists(sfile):
        lines = FileUtility.readlines(sfile)
        self.conf.setchannel(SPIDER_CHANNEL_S1)
        self.conf.setquery('')
        URLFileManager.getinstance().generateurlfilepath()
        for line in lines:
            try:
                url = line.strip()
                params = PageBasicInfo()
                params.url = url
                #NewsStorage.seturlinfos(params)
                context = URLContext()
                context.originalurl = url
                context.type = URLContext.S1_MAIN_BODY
                Logger.getlogging().debug(url)
                URLManager.getinstance().storeurl(url, context, REQUEST_TYPE_WEBKIT)
            except:
                Logger.printexception()

def wb_analysis(self, filepath):
    Logger.getlogging().info(
        'Now, start to analyze Waibu file {fl}'.format(fl=filepath))
    if '302_tencent_video' in filepath:
        pagetype = constant.SPIDER_S2_WEBSITE_VIDEO
    else:
        pagetype = constant.SPIDER_S2_WEBSITE_NEWS
    self.conf.setchannel(constant.SPIDER_CHANNEL_S2)
    lines = FileUtility.readlines(filepath)
    tempwaibustorage = {}
    for line in lines:
        try:
            line = json.loads(line)
            params = PageBasicInfo()
            params.query = line['query']
            params.url = line['url']
            params.title = Common.strfilter(line['title'])
            params.body = Common.strfilter(line['body'])
            params.pubtime = line['pubtime']
            clicknum = line.get('clicknum', 0)
            if clicknum:
                params.clicknum = int(clicknum)
            params.type = pagetype
            # Group by query, both in the shared store and in this run's batch.
            if params.query not in URLManager.waibustorage:
                URLManager.waibustorage[params.query] = []
            if params.query not in tempwaibustorage:
                tempwaibustorage[params.query] = []
            URLManager.waibustorage[params.query].append(params)
            tempwaibustorage[params.query].append(params)
        except:
            Logger.printexception()
    Logger.getlogging().debug(
        'Now, selecting urls to insert and update for uploading WAIBU data!')
    for query in tempwaibustorage:
        paramslist = tempwaibustorage[query]
        for params in paramslist:
            self.conf.setquery(query)
            NewsStorage.seturlinfos(params)
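
# Input sketch: each line of the Waibu file parsed above is a standalone JSON object.
# A line carrying the required keys (clicknum is optional) would look like this, with
# illustrative values:
#   {"query": "q", "url": "http://example.com/a", "title": "t", "body": "b",
#    "pubtime": "2016-01-01 00:00:00", "clicknum": "12"}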

def downPost(urlfilepath, writeTmpfile, second=10):
    post = Post()
    try:
        lines = FileUtility.readlines(urlfilepath)
        length = len(lines)
        for line in lines:
            try:
                # Parse inside the try so one malformed line is skipped, not fatal.
                jsonline = json.loads(line.strip())
                url = jsonline['url']
                data = jsonline['data']
                #cookie = post.createCookie(url)
                jsonstr = post.post(url, data, cookie=None)
                if not jsonstr:
                    Logger.getlogging().debug('Download URL failed: url:{url}\tdata:{data}'.format(url=url, data=str(data)))
                    continue
                with open(writeTmpfile, 'a+') as filetemp:
                    filetemp.write(jsonstr + '\n')
                # Throttle between requests, skipping the sleep after the last line.
                if length > 1:
                    time.sleep(second)
                length -= 1
            except:
                Logger.printexception()
    except:
        Logger.printexception()
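
# downPost() expects one JSON object per input line, e.g.:
#   {"url": "http://example.com/api", "data": {"page": 1}}
# The Post helper is project-internal and not shown here; a minimal sketch of the
# interface used above (an assumption -- the real class at least supports the cookie
# parameter and may build one via createCookie) might be:
import requests

class Post(object):
    def post(self, url, data, cookie=None, timeout=30):
        try:
            response = requests.post(url, data=data, cookies=cookie, timeout=timeout)
            if response.status_code == 200:
                return response.text
        except requests.RequestException:
            pass
        return ''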