Ejemplo n.º 1
0
 def removecachefile():
     """Back up the working cache directory and prune old backup folders.

     If the configured template work directory exists, it is moved to a
     folder named by the configured backup path followed directly by the
     current timestamp, and the work directory is then removed.

     Afterwards, every entry of the backup directory whose name compares
     (as a plain string) lower than the cutoff date is removed. The cutoff
     is ``TimeUtility.getdatebefore(limit, '%Y%m%d000000')``, where ``limit``
     is the configured output path limit.
     """
     cache = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_TEMPLATE_WORK_DIRECTORY)
     # Read the backup path once; it is used both to build the new backup
     # folder name and to locate the old backups to prune.
     databackuppath = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_DATA_BACKUP_PATH)
     # NOTE(review): the timestamp is concatenated without a path separator,
     # so the configured path presumably ends with one -- confirm.
     databackupfolder = databackuppath + TimeUtility.getcurrentdate(TimeUtility.TIMESTAMP_FORMAT)
     if FileUtility.exists(cache):
         FileUtility.move(cache, databackupfolder)
         FileUtility.rmdir(cache)
     limit = int(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_PATH_LIMIT))
     if not FileUtility.exists(databackuppath):
         return
     validdate = TimeUtility.getdatebefore(limit, '%Y%m%d000000')
     for s in os.listdir(databackuppath):
         # Entry names are compared lexicographically against the cutoff.
         if s < validdate:
             fullpath = os.path.join(databackuppath, s)
             Logger.getlogging().info('remove cach folder ' + fullpath)
             FileUtility.rmdir(fullpath)
 def getpagecomments_step2(self, params):
     """Parse one page of posts and store the comments found on it.

     On page 1 the first post is handled as the main post: when a publish
     time can be extracted it is recorded with ``NewsStorage.setpublishdate``
     and, if ``compareNow(pubtimes, self.COMMENT_LIMIT_DAYS)`` is falsy, a
     warning is logged and processing stops. The remaining posts are
     handled as comments:

     * a comment without a detectable publish time is stored with
       ``TimeUtility.getdatebefore(0)`` as its time;
     * a comment with a publish time is stored only when
       ``self.isyestoday(pubtimes)`` is truthy.

     A comment is only stored when ``CMTStorage.exist`` reports it missing.

     Args:
         params: object providing ``customized['page']``, ``content`` and
             ``originalurl``.

     Returns:
         False when the main post fails the ``compareNow`` check or when an
         unexpected error occurs; True otherwise.
     """
     def extract_pubtime(node):
         # Prefer the text of the last '.tail-info' element; otherwise fall
         # back to the first 'd-d-d d:d' pattern found in the node's markup.
         # Returns '' when neither source yields anything.
         tail_infos = node.select('.tail-info')
         if tail_infos:
             return getuniformtime(tail_infos[-1].get_text().strip())
         matches = re.findall(r'\d+-\d+-\d+ \d+:\d+', str(node))
         if matches:
             return getuniformtime(matches[0])
         return ''

     try:
         page = params.customized['page']
         soup = BeautifulSoup(params.content, "html5lib")
         posts = soup.select('#j_p_postlist > div.j_l_post')
         if page == 1:
             # An empty post list raises IndexError here; the outer handler
             # logs it and the method returns False.
             pubtimes = extract_pubtime(posts[0])
             if pubtimes:
                 NewsStorage.setpublishdate(params.originalurl, pubtimes)
                 if not compareNow(pubtimes, self.COMMENT_LIMIT_DAYS):
                     Logger.log(params.originalurl,
                                constant.ERRORCODE_WARNNING_NOMATCHTIME)
                     # Post is older than the limit (original note: 7 days),
                     # so its replies/comments are no longer fetched.
                     return False
             posts = posts[1:]
         # Compiled once instead of once per post.
         content_id = re.compile("post_content")
         for item in posts:
             # Each post is handled in its own try block so that one broken
             # post does not abort the rest of the page.
             try:
                 comment = item.find(attrs={'id': content_id})
                 if not comment:
                     continue
                 content = comment.get_text().strip()
                 pubtimes = extract_pubtime(item)
                 if not pubtimes:
                     # Use one fallback time for both the existence check
                     # and the store so they cannot disagree.
                     fallback_time = TimeUtility.getdatebefore(0)
                     # NOTE(review): 'nick' looks like a placeholder author
                     # name -- confirm.
                     if not CMTStorage.exist(params.originalurl, content,
                                             fallback_time, 'nick'):
                         CMTStorage.storecmt(params.originalurl, content,
                                             fallback_time, 'nick')
                     continue
                 # Check whether the comment is from the previous day.
                 Logger.getlogging().debug(pubtimes)
                 if self.isyestoday(pubtimes):
                     if not CMTStorage.exist(params.originalurl, content,
                                             pubtimes, 'nick'):
                         CMTStorage.storecmt(params.originalurl, content,
                                             pubtimes, 'nick')
             except Exception:
                 Logger.printexception()
         return True
     except Exception:
         Logger.printexception()
         return False
Ejemplo n.º 3
0
 def clear(self):
     """Remove expired entries from ``self.pucbackpath``.

     An entry is removed when its name compares (as a plain string) lower
     than ``TimeUtility.getdatebefore(self.pucsavedays,
     TimeUtility.DATE_FORMAT_DEFAULT)``.
     """
     dirlist = os.listdir(self.pucbackpath)
     # The cutoff does not depend on the entry, so compute it once; every
     # entry is then compared against the same value.
     cutoff = TimeUtility.getdatebefore(self.pucsavedays,
                                        TimeUtility.DATE_FORMAT_DEFAULT)
     for tm in dirlist:
         if tm < cutoff:
             FileUtility.rmdir(os.path.join(self.pucbackpath, tm))