def updatedb(self):
    # Drop database entries whose timestamp falls outside the configured valid period.
    items = SpiderDao().getall()
    if not items:
        return
    validdate = TimeUtility.getuniformdatebefore(SpiderConfigure.getinstance().getvalidperiod())
    removelist = []
    for key in items.keys():
        info = URLCommentInfo.fromstring(items[key])
        if info.timestamp < validdate:
            Logger.getlogging().debug(items[key])
            removelist.append(key)
    SpiderDao().remove(removelist)
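# A standalone sketch of the purge pattern in updatedb above, using a plain dict
# in place of SpiderDao (whose storage backend is not shown here). The point is
# the design choice: collect expired keys first, then remove them in one batch,
# rather than mutating the store while iterating over it. All values here are
# hypothetical.
store = {'url-a': '2016-01-01', 'url-b': '2018-06-01'}
validdate = '2017-01-01'
expired = [key for key, timestamp in store.items() if timestamp < validdate]
for key in expired:
    del store[key]  # store now holds only entries newer than validdate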
def __init__(self):
    SiteComments.__init__(self)
    self.page_size = 50
    self.page_size_yunqi = 10
    self.COMMENTS_URL = 'http://coral.qq.com/article/{0}/comment?commentid={1}&reqnum={2}'
    # self.AC_COMMENTS_URL = 'http://ac.qq.com/Community/topicList?targetId={0}&page={1}&type=0&_={2}'
    self.AC_COMMENTS_URL = 'http://ac.qq.com/Community/topicList?targetId={0}&page={1}'
    self.EBOOK_COMMENTS_URL = 'http://ebook.qq.com/{site}/getComment.html?bid={bid}&pageIndex={page}'
    self.YUNQI_COMMENT_URL = 'http://yunqi.qq.com/bk/gdyq/%s-b.html?hot=0&p=%d'
    self.STEP_DEFAULT_VALUE = None
    self.STEP_COMMENT_FIRST_PAGE = 1
    self.STEP_COMMENT_NEXT_PAGE = 2
    self.hasnext = True
    # Only fetch comments newer than the configured look-back window.
    self.cmtlastdays = TimeUtility.getuniformdatebefore(
        int(SpiderConfigure.getinstance().getlastdays()))
    self.comment_maxnum = 5000
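# How the URL templates above expand, with hypothetical ids standing in for the
# real article/book ids taken from the crawled pages. Note the mix of
# str.format() placeholders ({0}, {site}) and printf-style '%s'/'%d' formatting.
comments_url = 'http://coral.qq.com/article/{0}/comment?commentid={1}&reqnum={2}'.format(
    '123456', '0', 50)
ebook_url = 'http://ebook.qq.com/{site}/getComment.html?bid={bid}&pageIndex={page}'.format(
    site='book', bid='987', page=1)
yunqi_url = 'http://yunqi.qq.com/bk/gdyq/%s-b.html?hot=0&p=%d' % ('10001', 2)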
def step2(self, params):
    info = params.customized['query']
    query = Common.urldec(info)
    soup = BeautifulSoup(params.content, 'html5lib')
    items = soup.select('.cf > li > a.ui-list-ct')
    for item in items:
        try:
            url = item.get('href')
            title = item.get('title')
            # Store each result whose title matches the query; log non-matches.
            if self.checktitle(query, title):
                self.__storeurl__(url, TimeUtility.getuniformdatebefore(0), SPIDER_S2_WEBSITE_VIDEO)
            else:
                Logger.log(params.originalurl, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
        except:
            Logger.printexception()
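# A minimal, self-contained sketch of the CSS selector used in step2
# ('.cf > li > a.ui-list-ct'), run against hypothetical markup to show which
# attributes the loop reads from each matched anchor.
from bs4 import BeautifulSoup

html = u'''<ul class="cf">
  <li><a class="ui-list-ct" href="http://v.qq.com/x/cover/abc.html" title="example title">example title</a></li>
</ul>'''
for a in BeautifulSoup(html, 'html5lib').select('.cf > li > a.ui-list-ct'):
    print(a.get('href'), a.get('title'))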
def mkcachedir():
    # Recreate the template work directory and all per-stage temp directories.
    cache = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_TEMPLATE_WORK_DIRECTORY)
    FileUtility.rmdir(cache)
    FileUtility.mkdirs(cache)
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH))
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_WAIBU_TEMP_PATH))
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_TIEBA_TEMP_PATH))
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_URLS_TEMP_PATH))
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_DONE_TEMP_PATH))
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_JSON_TEMP_PATH))
    FileUtility.mkdirs(Storage.getstoragelocation(const.SPIDER_OUTPUT_TEMP_PATH))
    # Remove output subdirectories older than the configured retention limit (in days).
    limit = int(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_PATH_LIMIT))
    outputpath = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_PATH)
    if FileUtility.exists(outputpath):
        validdate = TimeUtility.getuniformdatebefore(limit)
        for s in os.listdir(outputpath):
            if s < validdate:
                fullpath = os.path.join(outputpath, s)
                FileUtility.rmdir(fullpath)
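# Why `s < validdate` works in mkcachedir: getuniformdatebefore appears to
# return dates as fixed-width strings (an assumption suggested by the [:10]
# slice in the __init__ below, i.e. a 'YYYY-MM-DD...' layout), and such strings
# sort lexicographically in chronological order, so plain string comparison is
# enough to find output directories older than the retention limit.
assert '2017-01-02' < '2017-01-10' < '2017-02-01'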
def __init__(self):
    self.r = RegexUtility()
    self.website = None
    # Keep only the date part of the uniform timestamp and pin the time to
    # midnight, so the comment cutoff starts at the beginning of that day.
    self.cmtlastdays = TimeUtility.getuniformdatebefore(
        delta=int(SpiderConfigure.getinstance().getlastdays()))[:10] + u' 00:00:00'
    self.maxpages = int(SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                                  const.SPIDER_S1_MAX_COMMENT_PAGES))
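# Sketch of the cutoff construction above, with a hypothetical return value for
# TimeUtility.getuniformdatebefore (assumed 'YYYY-MM-DD HH:MM:SS' layout).
uniform = u'2017-03-15 13:42:07'
cutoff = uniform[:10] + u' 00:00:00'
assert cutoff == u'2017-03-15 00:00:00'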