Example #1
 def __init__(self):
     self.database = SpiderDao()
     suffix = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                        const.SPIDER_OUTPUT_FILENAME_SUFFIX)
     ts = TimeUtility.getcurrentdate(TimeUtility.TIMESTAMP_FORMAT)
     self.difffile = '{path}/{dt}/{file}'.format(
         path=SpiderConfigure.getinstance().getconfig(
             const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_PATH),
         dt=TimeUtility.getcurrentdate(),
         file=DiffController.DIFF_FILE_NAME_FORMAT.format(suffix=suffix,
                                                          ts=ts))
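Example #1 builds a dated diff-file path of the form <output path>/<date>/<name with suffix and timestamp> from configured values. Below is a minimal stand-alone sketch of the same pattern using only the standard library; the paths and names are illustrative stand-ins for the SpiderConfigure values, not the project's real configuration.

import os
import time

# Illustrative stand-ins for the configured output path, filename suffix and format
OUTPUT_PATH = '/data/spider/output'
FILENAME_SUFFIX = 'demo'
DIFF_FILE_NAME_FORMAT = 'diff_{suffix}_{ts}.txt'

def build_difffile_path():
    # <output path>/<YYYY-MM-DD>/diff_<suffix>_<timestamp>.txt
    dt = time.strftime('%Y-%m-%d')
    ts = time.strftime('%Y%m%d%H%M%S')
    return os.path.join(OUTPUT_PATH, dt,
                        DIFF_FILE_NAME_FORMAT.format(suffix=FILENAME_SUFFIX, ts=ts))

print(build_difffile_path())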
Example #2
    def waibuetl(self):
        waibubackup = SpiderConfigure.getwaibubaup()
        if not FileUtility.exists(waibubackup):
            FileUtility.mkdirs(waibubackup)

        waibufile = self.etl.getqueryfromdb()
        if not FileUtility.exists(waibufile):
            Logger.getlogging().warning(
                '{waibufile} not generate!'.format(waibufile=waibufile))
            return

        outtime = 0
        self.wdownloader.upload(waibufile)
        continueflag = True
        while continueflag:
            downloadfiles = []
            while True:
                Logger.getlogging().info(
                    'sleeping {sec}s......'.format(sec=self.waitingperiod))
                time.sleep(self.waitingperiod)
                outtime += self.waitingperiod
                if self.wdownloader.iscompleted():
                    continueflag = False
                    break
                try:
                    downloadfiles = self.wdownloader.download()
                    if downloadfiles:
                        break
                except:
                    Logger.printexception()
                if outtime >= self.waibutimeout:
                    Logger.getlogging().warning(
                        'Waibu Data Download Timeout! Spending {sec}s'.format(
                            sec=outtime))
                    continueflag = False
                    break
            for dfile in downloadfiles:
                starttime = TimeUtility.getcurrentdate(
                    TimeUtility.TIME_FORMAT_DEFAULT)
                self.etl.wb_analysis(dfile)
                #if FileUtility.exists(waibubackup+FileUtility.getfilename(dfile)):
                #FileUtility.remove(waibubackup+FileUtility.getfilename(dfile))
                FileUtility.move(dfile, waibubackup)
                logstring = 'PROCESSWAIBUFILE:\t{file}\t{start}\t{end}'.format(
                    file=FileUtility.getfilename(dfile),
                    start=starttime,
                    end=TimeUtility.getcurrentdate())
                Logger.getlogging().info(logstring)
                if outtime >= self.waibutimeout:
                    Logger.getlogging().warning(
                        'Waibu Data Download Timeout! Spending {sec}s'.format(
                            sec=outtime))
                    continueflag = False
                    break
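The waibuetl method above is essentially a poll-until-timeout loop: sleep, check completion, try to download, give up after waibutimeout seconds. A condensed sketch of that loop, assuming hypothetical fetch_files/is_completed callables in place of the project's downloader:

import time

def poll_for_files(fetch_files, is_completed, waiting_period=5, timeout=600):
    # Poll until files arrive, the downloader reports completion, or the timeout hits
    waited = 0
    while True:
        time.sleep(waiting_period)
        waited += waiting_period
        if is_completed():
            return []
        files = fetch_files()
        if files:
            return files
        if waited >= timeout:
            print('download timeout after {sec}s'.format(sec=waited))
            return []

# e.g. poll_for_files(lambda: [], lambda: True) returns [] on the first pass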
    @staticmethod
    def flush():
        # dump s1 download failed url
        SpiderConfigure.getinstance().setchannel(constant.SPIDER_CHANNEL_S1)
        SpiderConfigure.getinstance().setquery('')
        for url in SpiderReport.getinstance().s1urls:
            Logger.log(url, constant.ERRORCODE_FAIL_LOAD_DOWN)
        # dump none url got from website for query
        querynositemap = {}
        for query in SpiderReport.getinstance().querysitesmap.keys():
            querynositemap[query] = 0
            for site in SpiderReport.getinstance().querysitesmap[query]:
                SpiderReport.s2queryurl(query, site, None, True)
                querynositemap[query] += 1
        for query in SpiderReport.getinstance().querysitesmap.keys():
            if query in querynositemap:
                SpiderReport.s2queryurl(query, SpiderReport.getinstance().s2sitenum,
                                        SpiderReport.getinstance().s2sitenum - querynositemap[query], True)
            else:
                SpiderReport.s2queryurl(query, SpiderReport.getinstance().s2sitenum,
                                        SpiderReport.getinstance().s2sitenum, True)
        # report
        filename = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                             const.SPIDER_INFO_REPORT_FILE).format(
            date=TimeUtility.getcurrentdate())
        FileUtility.remove(filename)
        FileUtility.writeline(filename, SpiderReport.REPORT_FORMAT.format(
            ch='CHANNEL',
            query='QUERY',
            type='TYPE',
            v1='UPLOAD',
            v2='DOWNLOAD',
            v3='NO_TEMPLATE',
            v4='NO_SITE',
            v5='WITH_CMT',
            v6='FAILED'
        ))
        for key in SpiderReport.getinstance().reportlist.keys():
            for type in SpiderReport.getinstance().reportlist[key].keys():
                r = SpiderReport.getinstance().reportlist[key][type]
                FileUtility.writeline(filename, r.tostring())
        for key in SpiderReport.getinstance().s2sitereportlist.keys():
            for type in SpiderReport.getinstance().s2sitereportlist[key].keys():
                r = SpiderReport.getinstance().s2sitereportlist[key][type]
                FileUtility.writeline(filename, r.tostring())
        FileUtility.writeline(filename, SpiderReport.getinstance().totalreport.tostring())
        FileUtility.writeline(filename, SpiderReport.getinstance().totalreport.tostring2())
        FileUtility.flush()
        threshold = float(SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                                    const.SPIDER_FAILED_THRESHOLD))
        rate = SpiderReport.getinstance().totalreport.getsuccess()
        if rate < threshold:
            Logger.getlogging().warning('success rate is lower than threshold')
            param = NotifyParam()
            param.code = NotifyParam.SPIDER_NOTIFY_OVER_FAILED
            param.message = 'success rate {rate} is lower than threshold {th}'.format(
                rate=Common.float2percent(rate),
                th=Common.float2percent(threshold))
            SpiderNotify.notify(param)
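flush finishes by checking the overall success rate against a configured threshold and sending a notification when it falls short. A minimal sketch of that check; the helper name and default threshold are illustrative:

def check_success_rate(success, total, threshold=0.9):
    # Return the success rate; print a warning when it drops below the threshold
    rate = float(success) / total if total else 0.0
    if rate < threshold:
        print('success rate {rate:.1%} is lower than threshold {th:.1%}'.format(
            rate=rate, th=threshold))
    return rate

check_success_rate(85, 100)   # 0.85 -> warning printed
check_success_rate(95, 100)   # 0.95 -> no warning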
Example #4
 def __init__(self, taskinfo=None, download_path=None):
     self.taskinfo = taskinfo
     self.maxfilenum = 100
     self.cache_path = Storage.getstoragelocation(
         const.SPIDER_DONE_TEMP_PATH)
     path = SpiderConfigure.getconfig(
         const.SPIDER_TENCENT_PLATFORM_DOMAIN,
         const.SPIDER_TENCENT_PLATFORM_OUTPUT_PATH)
     if download_path:
         self.download_path = download_path
     else:
         self.download_path = PUCDownloader.DOWNLOAD_PATH.format(
             path=path, taskid=self.taskinfo.taskid)
     self.parse_tool = SpiderConfigure.getconfig(
         const.SPIDER_TENCENT_PLATFORM_DOMAIN,
         const.SPIDER_TENCENT_PLATFORM_PARSE_TOOL_IMG)
     #self.json_path = Storage.getstoragelocation(const.SPIDER_JSON_TEMP_PATH)
     self.pucbackpath = SpiderConfigure.getconfig(
         const.SPIDER_STORAGE_DOMAIN,
         const.SPIDER_PUC_BACKUP_PATH) + self.taskinfo.taskid
     self.pucbacktoday = os.path.join(self.pucbackpath,
                                      TimeUtility.getcurrentdate())
     if not FileUtility.exists(self.pucbackpath):
         FileUtility.mkdirs(self.pucbackpath)
     if not FileUtility.exists(self.pucbacktoday):
         FileUtility.mkdirs(self.pucbacktoday)
     self.done_file = self.pucbacktoday + '/done/'
     self.json_path = self.pucbacktoday + '/json/'
     if not FileUtility.exists(self.done_file):
         FileUtility.mkdirs(self.done_file)
     if not FileUtility.exists(self.json_path):
         FileUtility.mkdirs(self.json_path)
     self.pucsavedays = 0
     self.clear()
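The constructor above prepares a per-task backup tree of the form <backup path>/<taskid>/<date>/done and .../json. The same layout can be produced with only the standard library; the base path and task id below are hypothetical:

import os
import time

def prepare_backup_dirs(base_path, taskid):
    # Create <base>/<taskid>/<YYYY-MM-DD>/done and .../json, returning both paths
    today = os.path.join(base_path, taskid, time.strftime('%Y-%m-%d'))
    done_dir = os.path.join(today, 'done')
    json_dir = os.path.join(today, 'json')
    for path in (done_dir, json_dir):
        if not os.path.exists(path):
            os.makedirs(path)
    return done_dir, json_dir

# Hypothetical values standing in for SPIDER_PUC_BACKUP_PATH and taskinfo.taskid
done_dir, json_dir = prepare_backup_dirs('/tmp/puc_backup', 'task001')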
Example #5
 def __init__(self):
     self.factory = SiteFactory()
     self.conf = SpiderConfigure.getinstance()
     self.urlbackuppath = SpiderConfigure.getconfig(
         const.SPIDER_STORAGE_DOMAIN,
         const.SPIDER_URL_BACKUP_PATH) + TimeUtility.getcurrentdate()
     self.period = int(SpiderConfigure.getinstance().getlastdays())
    def getsearchresult(self, params):
        info = params.customized['query']

        xpath = XPathUtility(html=params.content)
        hrefs = xpath.xpath('//li/h3/a/@href')
        titles = xpath.getlist('//li/h3/a')
        pubtimes = xpath.xpath('//li/p')

        today = datetime.datetime.strptime(
            TimeUtility.getcurrentdate(),
            TimeUtility.DATE_FORMAT_DEFAULT).date()

        urllist = []
        for index in range(0, len(titles), 1):
            # The title contains the keyword being queried
            # if titles[index].find(info) > -1:
            if Common.checktitle(info, titles[index]):
                pubtimestr = TimeUtility.getuniformdate(pubtimes[index].text)
                pubtime = datetime.datetime.strptime(
                    pubtimestr, TimeUtility.DATE_FORMAT_DEFAULT).date()
                interval = today - pubtime
                # The publish time falls within the configured period
                if interval.days <= self.querylastdays:
                    urllist.append(hrefs[index])
                else:
                    # Results are sorted by time, so once one entry falls outside the search period, all later ones do too.
                    break

        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
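getsearchresult keeps only hits whose publish date lies within querylastdays and stops at the first stale entry, since results are sorted by time. A simplified stand-alone sketch of that date-window filter; the helper and its sample data are illustrative, not part of the project:

import datetime

def filter_recent(results, keyword, last_days):
    # results: list of (title, url, 'YYYY-MM-DD') tuples sorted newest first
    today = datetime.date.today()
    urls = []
    for title, url, pubdate in results:
        if keyword not in title:
            continue
        published = datetime.datetime.strptime(pubdate, '%Y-%m-%d').date()
        if (today - published).days <= last_days:
            urls.append(url)
        else:
            # Sorted by time: once one entry is too old, all later ones are too
            break
    return urls

now = datetime.date.today()
sample = [('foo news', 'http://example.com/1', str(now - datetime.timedelta(days=2))),
          ('foo archive', 'http://example.com/2', str(now - datetime.timedelta(days=40)))]
print(filter_recent(sample, 'foo', 30))   # -> ['http://example.com/1']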
Example #7
    def loop(self):
        # Loop over the URLs, covering both S1 and S2
        continueflag = True
        while continueflag:
            downloadfiles = []
            while True:
                # check time out
                if self.istimeout():
                    param = NotifyParam()
                    param.code = NotifyParam.SPIDER_NOTIFY_TIMEOUT
                    param.message = 'Spider timeout for %s o\'clock, stop' % constant.SPIDER_RUN_TIMEOUT_HOUR
                    SpiderNotify.notify(param)
                    continueflag = False
                    break
                if self.downloader.iscompleted():
                    continueflag = False
                    break
                try:
                    downloadfiles = self.downloader.download()
                    self.upload()
                    if len(downloadfiles) > 0:
                        break
                    else:
                        Logger.getlogging().info('sleeping {0}s......'.format(
                            self.waitingperiod))
                        time.sleep(self.waitingperiod)
                except:
                    Logger.printexception()

            for dfile in downloadfiles:
                starttime = TimeUtility.getcurrentdate(
                    TimeUtility.TIME_FORMAT_DEFAULT)
                self.etl.processfile(dfile)
                logstring = 'PROCESSFILE:\t{file}\t{start}\t{end}'.format(
                    file=FileUtility.getfilename(dfile),
                    start=starttime,
                    end=TimeUtility.getcurrentdate())
                Logger.getlogging().info(logstring)
                if self.istimeout():
                    param = NotifyParam()
                    param.code = NotifyParam.SPIDER_NOTIFY_TIMEOUT
                    param.message = 'Spider timeout for %s o\'clock, stop' % constant.SPIDER_RUN_TIMEOUT_HOUR
                    SpiderNotify.notify(param)
                    continueflag = False
                    break
                self.upload()
 def __init__(self):
     self.reportlist = {}
     self.s2sitereportlist = {}
     self.s2urlfilepath = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                                    const.SPIDER_S2_QUERY_URLS_FILE).format(
         date=TimeUtility.getcurrentdate())
     FileUtility.remove(self.s2urlfilepath)
     self.totalreport = Report()
     self.totalreport.channel = 'SUM'
     self.s1urls = []
     self.querysitesmap = {}
     self.s2sitenum = 0
     self.s2urlsitemap = {}
Example #9
 def getfilename(self, url):
     # Channel
     self.channel = SpiderConfigure.getinstance().getchannel()
     # S2 query information
     self.query = SpiderConfigure.getinstance().getquery()
     # S2 page type
     self.type = SpiderConfigure.getinstance().gettype()
     if self.channel == SPIDER_CHANNEL_S2:
         q = Common.md5(self.query)
     else:
         q = self.query
     return Storage.SPIDER_STORE_FILENAME_FORMAT.format(
         path=self.cache_path,
         date=TimeUtility.getcurrentdate(),
         channel=self.channel,
         query=q,
         filename=Common.md5(url))
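getfilename hashes the URL (and, for the S2 channel, the query) so long or unsafe strings never become part of the path. A sketch of the same naming scheme with hashlib; the format string and channel value are placeholders rather than the project's real constants:

import hashlib
import time

# Placeholder format, not the project's Storage.SPIDER_STORE_FILENAME_FORMAT
STORE_FILENAME_FORMAT = '{path}/{date}/{channel}/{query}/{filename}'

def build_store_filename(cache_path, channel, query, url):
    # For the S2 channel the query is hashed too; otherwise it is used verbatim
    q = hashlib.md5(query.encode('utf-8')).hexdigest() if channel == 's2' else query
    return STORE_FILENAME_FORMAT.format(
        path=cache_path,
        date=time.strftime('%Y-%m-%d'),
        channel=channel,
        query=q,
        filename=hashlib.md5(url.encode('utf-8')).hexdigest())

print(build_store_filename('/tmp/cache', 's2', 'some query', 'http://example.com/a'))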
Example #10
 def __init__(self):
     self.url_beforenewsinfo_map = {SQLDAO.SPIDER_TABLE_NEWS_CMTNUM: {},
                                    SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM: {},
                                    SQLDAO.SPIDER_TABLE_NEWS_VOTENUM: {},
                                    SQLDAO.SPIDER_TABLE_NEWS_FANSNUM: {}}
     self.url_beforenewsnum_map = {}
     self.url_curcmtcontent_map = {}
     self.url_curcmtnum_map = {}
     self.url_beforecmtnum_map = {}
     date = TimeUtility.getcurrentdate()
     path = os.path.join(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                                   const.SPIDER_OUTPUT_PATH), date)
     suffix = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                        const.SPIDER_OUTPUT_FILENAME_SUFFIX)
     self.outputpath = FileFormat.OUTPUTPATH.format(path=path, suffix=suffix,
                                                    date=date.replace('-', '_'),
                                                    ts=int(time.time()))
     self.errorinfopath = FileFormat.ERRORINFOPATH.format(path=path, suffix=suffix,
                                                          date=date.replace('-', '_'),
                                                          ts=int(time.time()))
     self.pushpath = os.path.join(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                                            const.SPIDER_PUSH_PATH_MASTER), date)
     if not FileUtility.exists(path):
         FileUtility.mkdirs(path)
    def step2(self, params):
        try:
            page = params.customized['page']
            soup = BeautifulSoup(params.content, 'html5lib')
            subject = soup.find(attrs={'id': re.compile(self.commentCsskey['subject_idkey'])})
            if subject:
                subject = subject.get_text()
            tables = soup.find_all('table',
                                   attrs={'id': re.compile(self.commentCsskey['table_idkey']),
                                          'summary': re.compile(self.commentCsskey['table_summarykey'])})

            if page == 1:
                tables = tables[1:]

            if tables:
                # Initialize the list with a current-time value to avoid errors later
                publishlist = [TimeUtility.getcurrentdate(TimeUtility.DEFAULTFORMAT)]
                for table in tables:
                    try:
                        nick = table.select_one('.xw1').get_text()
                    except:
                        nick = 'anonymous'
                    try:
                        curtimeobj = table.find(attrs={'id':re.compile(self.commentCsskey['time_idkey'])})
                        if curtimeobj.select_one('span'):
                            curtime = curtimeobj.select_one('span').get('title')
                        else:
                            curtime = curtimeobj.get_text()
                    except:
                        curtime = TimeUtility.getuniformtime(0)
                    try:
                        content = table.find(attrs={'id':re.compile(self.commentCsskey['content_idkey'])}).get_text()
                    except:
                        content = ''
                    publishlist.append(curtime)
                    CMTStorage.storecmt(params.originalurl, content, curtime, nick)
                if not self.isnewesttime(params.originalurl, min(publishlist)):
                    return False         
            else:
                Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_COMMNETS)
            return True
        except:
            Logger.printexception()
            Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_SITE)
Example #12
 def __init__(self):
     self.upload_file_list = {}
     self.impls = []
     self.implsindex = 0
     self.initcommon()
     self.wimpls = []
     self.wimplsindoex = 0
     self.initwebkit()
     self.limpls = []
     self.limplsindex = 0
     self.initlocal()
     self.tempurlpath = Storage.getstoragelocation(
         const.SPIDER_URLS_TEMP_PATH)
     self.urlbackuppath = SpiderConfigure.getconfig(
         const.SPIDER_STORAGE_DOMAIN,
         const.SPIDER_URL_BACKUP_PATH) + TimeUtility.getcurrentdate()
     # Retry mechanism for failed file downloads
     self.retransmissionfiles = {}
     self.all_retransmissionfiles = {}
     self.retransmissionlimitnum = 3
     self.filetime = 0
Example #13
 @staticmethod
 def getwaibubaup():
     return SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_WAIBU_BACKUP_PATH) + TimeUtility.getcurrentdate()
     
Example #14
    def processVideo(self, params):
        if params.step == MofangS2Query.S2QUERY_FIRST_PAGE:
            # Step 2: from the returned JSON, comments['totalnums'] gives the total number of videos
            # Each JSON response carries 20 items, so divide the total by 20 to get the page count and write it into the page parameter
            info = params.customized['query']
            keyvalue = Common.urlenc(info)
            try:
                jsondate = json.loads(params.content)
                comments_count = jsondate['totalnums']
            except:
                Logger.getlogging().warning('{}:40000'.format(params.url))
                return
            # Nothing retrieved, so return
            if int(comments_count) == 0:
                return

            page_count = int(
                math.ceil(float(comments_count) / self.DEFAULT_PAGE_SIZE))
            # Using page_count above, build all search-result URLs (most recent week)
            querylist = []
            if page_count > 0:
                for page in range(1, page_count + 1, 1):
                    url = MofangS2Query.QUERY_TEMPLATE.format(
                        key=keyvalue,
                        pageno=page,
                        pagesize=self.DEFAULT_PAGE_SIZE)
                    Logger.getlogging().debug(url)
                    querylist.append(url)
                self.__storeqeuryurllist__(querylist,
                                           MofangS2Query.S2QUERY_EACH_PAGE,
                                           {'query': info})

        elif params.step == MofangS2Query.S2QUERY_EACH_PAGE:
            # Step 3: from the JSON returned by Step 2, extract
            # title: comments['data'][0..19]['title']
            # link: comments['data'][0..19]['url']
            # video publish time: comments['data'][0..19]['inputtime']; only the first 10 characters are kept, so only dates can be compared

            info = params.customized['query']
            try:
                jsondate = json.loads(params.content)
                searchresult = jsondate['data']
            except:
                Logger.getlogging().warning('{}:40000'.format(params.url))
                return

            # Get the current day (date type)
            today = datetime.datetime.strptime(TimeUtility.getcurrentdate(),
                                               TimeUtility.DATE_FORMAT_DEFAULT)

            urllist = []
            for index in range(0, len(searchresult), 1):
                #print searchresult[index]['title']
                #print searchresult[index]['inputtime']
                if searchresult[index]['title'] is not None:
                    # The title contains the query keyword, so keep the corresponding URL
                    # if searchresult[index]['title'].find(info) > -1:
                    if Common.checktitle(info, searchresult[index]['title']):
                        if searchresult[index]['inputtime'] is not None:
                            #inputtime = datetime.datetime.strptime(TimeUtility.getuniformtime2(int(searchresult[index]['inputtime'])), TimeUtility.TIME_FORMAT_DEFAULT)
                            #intervaldays = today - inputtime
                            #if intervaldays.days <= int(self.querylastdays):
                            pubtime = getuniformtime(
                                str(searchresult[index]['inputtime']))

                            if compareNow(pubtime, int(self.querylastdays)):
                                urllist.append(searchresult[index]['url'])
                        else:
                            # No publish time available, so treat it as within the period
                            urllist.append(searchresult[index]['url'])

            if len(urllist) > 0:
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
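The first-page branch above converts a total result count into paged query URLs: ceil(total / page size) pages, one URL per page. A small sketch of that pagination step with a hypothetical URL template:

import math

# Hypothetical search URL template
QUERY_TEMPLATE = 'http://example.com/search?key={key}&page={pageno}&size={pagesize}'

def build_query_urls(keyvalue, total, page_size=20):
    # One URL per page: ceil(total / page_size) pages in all
    page_count = int(math.ceil(float(total) / page_size))
    return [QUERY_TEMPLATE.format(key=keyvalue, pageno=page, pagesize=page_size)
            for page in range(1, page_count + 1)]

print(build_query_urls('foo', 45))   # 45 results / 20 per page -> 3 URLs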
    def process(self, proparam):
        Logger.getlogging().info(proparam.url)
        try:
            if proparam.step == ishangmanComments.STEP_1:
                # Extract the parameter values from the URL
                articleIds = re.findall(
                    r'^http://(\w+)\.ishangman\.com/\w+/(\d+)',
                    proparam.url)[0]
                articleId1 = articleIds[0]
                articleId2 = articleIds[1]
                # Comment type
                commenttype = int(
                    self.r.parse(ur'commenttype = (.*);', proparam.content)[0])
                # First page of comments
                url = ishangmanComments.COMMENTS_URL % (articleId1, articleId2,
                                                        commenttype, 1)
                self.storeurl(
                    url, proparam.originalurl, ishangmanComments.STEP_2, {
                        'articleId1': articleId1,
                        'articleId2': articleId2,
                        'commenttype': commenttype
                    })

            elif proparam.step == ishangmanComments.STEP_2:
                articleId1 = proparam.customized['articleId1']
                articleId2 = proparam.customized['articleId2']
                commenttype = proparam.customized['commenttype']
                # Get the comment count
                xhtml = XPathUtility(html=proparam.content)
                if articleId1 == 'comic':
                    comments_count = int(
                        xhtml.getlist(
                            '//*[contains(@class,"ismcartondiv1")]/p/strong')
                        [0])
                    if comments_count:
                        NewsStorage.setcmtnum(proparam.originalurl,
                                              comments_count)
                else:
                    comments_count = int(
                        self.r.parse(
                            ur'(\d+).*',
                            xhtml.getlist('//*[@class="comment_lctwidl"]/p')
                            [0])[0])
                    if comments_count:
                        NewsStorage.setcmtnum(proparam.originalurl,
                                              comments_count)
                # Get the number of comment pages
                cmtnum = CMTStorage.getcount(proparam.originalurl, True)
                if int(comments_count) == 0:
                    return
                page_num = int(
                    math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE))
                if page_num >= self.maxpages:
                    page_num = self.maxpages

                # Build the URLs of the comment pages
                for page in range(1, page_num + 1, 1):
                    url = ishangmanComments.COMMENTS_URL % (
                        articleId1, articleId2, commenttype, page)
                    self.storeurl(url, proparam.originalurl,
                                  ishangmanComments.STEP_3,
                                  {'articleId1': articleId1})

            elif proparam.step == ishangmanComments.STEP_3:
                try:
                    Logger.getlogging().debug(proparam.originalurl)
                    commentsInfo = []
                    articleId1 = proparam.customized['articleId1']
                    xparser = XPathUtility(proparam.content)
                    # Get the comment count
                    if articleId1 == 'comic':
                        # Forum comments
                        soup = BeautifulSoup(proparam.content, 'html5lib')
                        comments = soup.select('.ismcartondiv2')
                    else:
                        # Forum comments
                        comments = xparser.getcomments(
                            '/html/body/div/span[2]/p[1]')
                        # Forum comment times
                        updateTime = xparser.getcomments(
                            '/html/body/div/span[2]/div[1]')

                    # Extract the comments
                    for index in range(0, int(len(comments)), 1):
                        cmti = []
                        if articleId1 == 'comic':
                            publictime = self.r.parse(
                                ur'(\d{2}-\d+ \d+:\d+)',
                                comments[index].get_text())[0]
                            # publictime  = TimeUtility.getuniformtime(publictime)
                            if publictime:
                                cmt_month = publictime.split("-")[0]
                                curmonth = time.localtime().tm_mon
                                if (int(cmt_month) < curmonth):
                                    publictime = TimeUtility.getcurrentdate(
                                    )[0:4] + '-' + publictime
                                else:
                                    publictime = '2016' + '-' + publictime
                            curtime = TimeUtility.getuniformtime(publictime)
                            content = comments[index].get_text().split('\n')[0]

                            # # print comments;
                            # return
                            # content = self.r.parse(ur'class=\".*\"',comments[index].get_text())[0]
                            # nick = comments[1].get('nickname', 'anonymous')
                            #
                            # if not CMTStorage.exist(proparam.originalurl, content, curtime, nick):
                            #     CMTStorage.storecmt(proparam.originalurl, content, curtime, nick)
                            # if NewsStorage.storeupdatetime(proparam.originalurl, tm):
                            #     cmti.content = comments[index].get_text()
                            #     commentsInfo.append(cmti)
                        else:
                            publictime = updateTime[index][:-8]
                            #publictime = TimeUtility.getcurrentdate()[0:4] + '-'+ publictime
                            #tm = TimeUtility.getuniformtime(publictime, u'%Y-%m-%d %H:%M')
                            tm = getuniformtime(publictime)
                            if NewsStorage.storeupdatetime(
                                    proparam.originalurl, tm):
                                cmti.content = comments[index]
                                commentsInfo.append(cmti)

                    # Save the extracted comments
                    if len(commentsInfo) > 0:
                        self.commentstorage.store(proparam.originalurl,
                                                  commentsInfo)

                except:
                    Logger.printexception()
                    Logger.getlogging().error(
                        'extract comment error from {site}'.format(
                            site=proparam.url))
            else:
                Logger.getlogging().error("proparam.step == %d", proparam.step)

        except Exception, e:
            traceback.print_exc()
Example #16
 @staticmethod
 def removecachefile():
     cache = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_TEMPLATE_WORK_DIRECTORY)
     databackupfolder = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                                    const.SPIDER_DATA_BACKUP_PATH) + TimeUtility.getcurrentdate(TimeUtility.TIMESTAMP_FORMAT)
     if FileUtility.exists(cache):
         FileUtility.move(cache, databackupfolder)
         FileUtility.rmdir(cache)
     limit = int(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_PATH_LIMIT))
     databackuppath = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_DATA_BACKUP_PATH)
     if FileUtility.exists(databackuppath):
         validdate = TimeUtility.getdatebefore(limit, '%Y%m%d000000')
         for s in os.listdir(databackuppath):
             fullpath = os.path.join(databackuppath, s)
             #Logger.getlogging().info('remove cach folder ' + fullpath)
             #FileUtility.rmdir(fullpath)
             if s < validdate:
                 fullpath = os.path.join(databackuppath, s)
                 Logger.getlogging().info('remove cache folder ' + fullpath)
                 FileUtility.rmdir(fullpath)
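removecachefile prunes backup folders whose timestamp-style names sort below a cutoff derived from the retention limit. A stand-alone sketch of that retention sweep with a hypothetical backup root and retention period:

import datetime
import os
import shutil

def prune_old_backups(backup_root, keep_days):
    # Remove <backup_root>/<YYYYMMDDHHMMSS>-named folders older than keep_days
    cutoff = (datetime.datetime.now() - datetime.timedelta(days=keep_days)).strftime('%Y%m%d000000')
    if not os.path.exists(backup_root):
        return
    for name in os.listdir(backup_root):
        full = os.path.join(backup_root, name)
        # Timestamp-style names compare correctly as plain strings
        if os.path.isdir(full) and name < cutoff:
            print('removing backup folder ' + full)
            shutil.rmtree(full)

prune_old_backups('/tmp/spider_data_backup', keep_days=7)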
def scanning():
    whoami = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.SPIDER_LOCAL_WHOAMI)
    scanningPath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, whoami + constant.DOWNLOADER_URL_PATH)
    donepath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.removefiles(donepath)
    backupPath = os.path.join(SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.DOWNLOADER_URL_BACKUP), TimeUtility.getcurrentdate())
    interval = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.DOWNLOADER_INTERVAL)
    FileUtility.mkdirs(scanningPath)
    FileUtility.mkdirs(backupPath) 
    while True:
        Logger.getlogging().debug('scanning')
        flag = False
        for filename in os.listdir(scanningPath):
            try:
                urlfilepath = os.path.join(scanningPath, filename)
                backupfile  = os.path.join(backupPath, filename)
                if os.path.isfile(urlfilepath) and 'tmp' not in filename:
                    Logger.getlogging().info('Get url file:{file}'.format(file=filename))
                    FileUtility.copy(urlfilepath, backupfile)
                    download(urlfilepath)
                if not flag:
                    flag = True
            except:
                Logger.printexception()
        if not flag:
            Logger.getlogging().debug('scanning interval sleeping {interval}s'.format(interval=interval))
            time.sleep(int(interval))
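scanning is a simple directory watcher: back up and process each non-tmp file that appears, and sleep when a pass finds nothing. A trimmed sketch of that watch loop with hypothetical paths and handler:

import os
import shutil
import time

def watch_directory(scan_path, backup_path, handle_file, interval=10):
    # Poll scan_path forever; back up and process each non-tmp file that appears
    if not os.path.exists(scan_path):
        os.makedirs(scan_path)
    if not os.path.exists(backup_path):
        os.makedirs(backup_path)
    while True:
        found = False
        for filename in os.listdir(scan_path):
            filepath = os.path.join(scan_path, filename)
            if os.path.isfile(filepath) and 'tmp' not in filename:
                shutil.copy(filepath, os.path.join(backup_path, filename))
                handle_file(filepath)
                found = True
        if not found:
            time.sleep(interval)

# watch_directory('/tmp/urls', '/tmp/urls_backup', handle_file=lambda p: None)  # runs forever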