def __init__(self, appids):
    """Initialise the thread and the state it needs to collect comments.

    :param appids: list of application ids this thread will work through.
    """
    threading.Thread.__init__(self)

    # Ids handed to this worker (a list).
    self.__appids = appids

    # Header dict passed along with every page request.
    self.__headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/57.0',
    }

    # One match per <entry> element.  Capture groups, in order:
    # updated, id, title, content, voteSum, voteCount, rating, version, name.
    entry_regex = (
        '<entry>.*?'
        '<updated>(.*?)</updated>.*?'
        '<id>(.*?)</id>.*?'
        '<title>(.*?)</title>.*?'
        '<content.*?>(.*?)</content>.*?'
        '<im:voteSum>(.*?)</im:voteSum>.*?'
        '<im:voteCount>(.*?)</im:voteCount>.*?'
        '<im:rating>(.*?)</im:rating>.*?'
        '<im:version>(.*?)</im:version>.*?'
        '<name>(.*?)</name>'
    )
    self.__pattern = re.compile(entry_regex, re.M | re.S)

    # Database handler for app comments.
    self.__appCommentsHandler = AppCommentsDbHandler()

    # Running counter, starts at zero.
    self.__count = 0
class WebToDb(object):
    """Download customer-review feed pages and store their entries in the database.

    For every app the feed is read page by page (pages 1..MAX_PAGES, sorted by
    most recent).  Each matched entry is extended with the app id and an empty
    ``isSpam`` field and handed to ``AppCommentsDbHandler.insertAppComment``.
    """

    # Number of feed pages requested per app (pages 1..MAX_PAGES).
    MAX_PAGES = 10

    # Feed URL; the query string is kept exactly as the original code sent it.
    URL_TEMPLATE = ('https://itunes.apple.com/rss/customerreviews/page={page}'
                    '/id={app_id}/sortby=mostrecent/xml?l=en&&cc=cn')

    def __init__(self):
        """Prepare the request header, the entry regex and both database handlers."""
        self.__headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/57.0'
        }
        # One match per <entry> element.  Capture groups, in order:
        # updated, id, title, content, voteSum, voteCount, rating, version, name.
        self.__pattern = re.compile(
            '<entry>.*?<updated>(.*?)</updated>.*?<id>(.*?)</id>.*?'
            '<title>(.*?)</title>.*?<content.*?>(.*?)</content>.*?<im:voteSum>(.*?)</im:voteSum>.*?'
            '<im:voteCount>(.*?)</im:voteCount>.*?<im:rating>(.*?)</im:rating>.*?'
            '<im:version>(.*?)</im:version>.*?<name>(.*?)</name>', re.M | re.S)
        self.__appCommentsHandler = AppCommentsDbHandler()
        self.__appleAppHandler = AppleAppDbHandler()

    def executeAll(self):
        """Update the comments of every app returned by the app handler.

        :return: total number of newly stored comments across all apps.
        """
        total = 0
        for appleApp in self.__appleAppHandler.queryAll():
            # appleApp[0] is used as the app id, appleApp[1] only for display.
            print('\n正在获取苹果应用: %s-%s 的最新评论······' % (appleApp[0], appleApp[1]))
            added = self.executeByAppId(appleApp[0])
            print('新增%d条评论' % added)
            total += added
        return total

    def executeByAppId(self, appId):
        """Fetch up to MAX_PAGES feed pages for one app and store their entries.

        Paging stops early when the handler signals an already-known comment
        (``UserWarning``) or when a page cannot be processed at all.

        :param appId: id of the app whose reviews are fetched.
        :return: number of comments added for this app.
        """
        # Count per app (as ThreadOfWebToDb does) rather than over the whole
        # table, so rows inserted for other apps in the meantime cannot skew
        # the result.
        countBefore = self.__appCommentsHandler.countByAppId(appId)

        # get comment entries from page 1 to MAX_PAGES
        for currPage in range(1, self.MAX_PAGES + 1):
            url = self.URL_TEMPLATE.format(page=currPage, app_id=appId)
            comments = self._fetchComments(url)
            try:
                if not self._storePage(comments, appId):
                    break
            except Exception as err:
                # e.g. the page yielded nothing iterable; give up on this app.
                print(err, 'Update next app\'s comment!')
                break

        return self.__appCommentsHandler.countByAppId(appId) - countBefore

    def _fetchComments(self, url):
        """Return whatever ``MySpider.getMsgs()`` yields for the given feed page."""
        return MySpider(url, self.__headers, self.__pattern).getMsgs()

    def _storePage(self, comments, appId):
        """Insert the entries of one page; return False once an outdated one is hit."""
        for comment in comments:
            # Stored row = matched groups + app_id + isSpam (empty for new rows).
            row = list(comment) + [appId, '']
            try:
                self.__appCommentsHandler.insertAppComment(row)
            except UserWarning:
                # The handler's signal that this comment is already known.
                print('Outdated comments!', 'Update next app\'s comment!')
                return False
            except Exception as err:
                # A single bad row must not abort the rest of the page.
                print(err)
        return True
class ThreadOfWebToDb(threading.Thread):
    """Worker thread that fetches review feed pages for a list of apps.

    For each app id the feed is read page by page (pages 1..MAX_PAGES) and
    every matched entry is stored through ``AppCommentsDbHandler``.  The number
    of comments added is accumulated and exposed via ``countOfNewComments``.
    """

    # Number of feed pages requested per app (pages 1..MAX_PAGES).
    MAX_PAGES = 10

    # Feed URL; the query string is kept exactly as the original code sent it.
    URL_TEMPLATE = ('https://itunes.apple.com/rss/customerreviews/page={page}'
                    '/id={app_id}/sortby=mostrecent/xml?l=en&&cc=cn')

    def __init__(self, appids):
        """Store the work list and prepare header, entry regex and DB handler.

        :param appids: list of app ids this thread processes in ``run``.
        """
        super().__init__()
        self.__appids = appids  # a list
        self.__headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/57.0'
        }
        # One match per <entry> element.  Capture groups, in order:
        # updated, id, title, content, voteSum, voteCount, rating, version, name.
        self.__pattern = re.compile(
            '<entry>.*?<updated>(.*?)</updated>.*?<id>(.*?)</id>.*?'
            '<title>(.*?)</title>.*?<content.*?>(.*?)</content>.*?<im:voteSum>(.*?)</im:voteSum>.*?'
            '<im:voteCount>(.*?)</im:voteCount>.*?<im:rating>(.*?)</im:rating>.*?'
            '<im:version>(.*?)</im:version>.*?<name>(.*?)</name>', re.M | re.S)
        self.__appCommentsHandler = AppCommentsDbHandler()
        self.__count = 0

    def countOfNewComments(self):
        """Return the number of comments added so far by ``run``."""
        return self.__count

    def run(self):
        """Fetch comments from the feed and save them to the database.

        Paging for an app stops early when the handler signals an
        already-known comment (``UserWarning``) or when a page cannot be
        processed; the thread then moves on to the next app.
        """
        for appid in self.__appids:
            countBefore = self.__appCommentsHandler.countByAppId(appid)

            for currPage in range(1, self.MAX_PAGES + 1):
                url = self.URL_TEMPLATE.format(page=currPage, app_id=appid)
                comments = self._fetchComments(url)
                try:
                    if not self._storePage(comments, appid):
                        break
                except Exception as err:
                    # This used to be swallowed silently, which hid real
                    # failures (e.g. a page that yielded nothing iterable).
                    # Still best-effort: report it and go on to the next app.
                    print('{} stopped at page {}: {!r}'.format(appid, currPage, err))
                    break

            added = self.__appCommentsHandler.countByAppId(appid) - countBefore
            print('{} get {} comments.Update next app\'s comment!'
                  .format(appid, added))
            self.__count += added

    def _fetchComments(self, url):
        """Return whatever ``MySpider.getMsgs()`` yields for the given feed page."""
        return MySpider(url, self.__headers, self.__pattern).getMsgs()

    def _storePage(self, comments, appid):
        """Insert the entries of one page; return False once an outdated one is hit."""
        for comment in comments:
            # Stored row = matched groups + app_id + isSpam (empty for new rows).
            row = list(comment) + [appid, '']
            try:
                self.__appCommentsHandler.insertAppComment(row)
            except UserWarning:
                # Expected stop signal (comment already known): stay quiet.
                return False
            except Exception as err:
                # A single bad row must not abort the rest of the page.
                print(err)
        return True
def __init__(self):
    """Create the database handler objects this instance keeps."""
    # Both handlers are constructed without arguments; the app handler is
    # created first, then the comments handler.
    self.__appleAppHandler = AppleAppDbHandler()
    self.__appCommentsHandler = AppCommentsDbHandler()
def __init__(self):
    """Create the three database handler objects this instance keeps."""
    # Connect to the database: one handler each for app comments, the app
    # list and signed comments, constructed in that order.
    self.__appCommentsHandler = AppCommentsDbHandler()
    self.__appleAppHandler = AppleAppDbHandler()
    self.__signedCommentsDbHandler = SignedCommentsDbHandler()
class DbToXlsx(object):
    """Export comments from the database to .xlsx files.

    All output is written below ``config.RESOURCES_PATH/output``; file and
    directory names start with a local-time stamp.
    """

    def __init__(self):
        """Create the handlers for comments, apps and signed comments."""
        # connect to database
        self.__appCommentsHandler = AppCommentsDbHandler()
        self.__appleAppHandler = AppleAppDbHandler()
        self.__signedCommentsDbHandler = SignedCommentsDbHandler()

    @staticmethod
    def _timestamp():
        """Return the local time as 'year_month_day_hour_minute_second'.

        Fields are not zero-padded, which matches the names produced before.
        """
        return "_".join(str(field) for field in time.localtime()[:6])

    def exportAllComments(self):
        """Write every row from the comments handler into one workbook.

        The file is named ``<timestamp>.xlsx``; row 1 holds the column titles.
        """
        outputDir = os.path.join(config.RESOURCES_PATH, 'output')
        # Saving used to fail when the output directory did not exist yet.
        os.makedirs(outputDir, exist_ok=True)
        filePath = os.path.join(outputDir, self._timestamp() + '.xlsx')

        comments = self.__appCommentsHandler.queryAll()
        wb = Workbook()
        ws = wb.active

        # app_id,time,comment_id,title,content,voteSum,voteCount,rating,version,user_name,isSpam,app_name
        head = ('app_id', 'time', 'comment_id', 'title', 'content', 'voteSum',
                'voteCount', 'rating', 'version', 'user_name', 'isSpam', 'app_name')
        for column, title in enumerate(head, start=1):
            ws.cell(row=1, column=column).value = title
        for comment in comments:
            ws.append(comment)
        wb.save(filename=filePath)

    def exportCommentsEachApp(self):
        """Export the comments of each app into ``<timestamp>xlsx/``.

        One ``ThreadExportCommentOfOneApp`` is run per app (multi-threaded);
        the original comment says the files are named by app_id-app_name.
        """
        self._exportEachApp('xlsx', self.__appCommentsHandler)

    def exportSignedCommentsEachApp(self):
        """Export the signed comments of each app into ``<timestamp>Signedxlsx/``."""
        self._exportEachApp('Signedxlsx', self.__signedCommentsDbHandler)

    def _exportEachApp(self, dirSuffix, commentsHandler):
        """Create the output directory and run one export thread per app.

        :param dirSuffix: text appended to the timestamp to form the directory name.
        :param commentsHandler: handler passed to every export thread.
        """
        # create output dir (parents included; an existing target still raises)
        dirPath = os.path.join(config.RESOURCES_PATH, 'output',
                               self._timestamp() + dirSuffix)
        os.makedirs(dirPath)

        # ((app_id,app_name),(app_id,app_name),...)
        appTuple = self.__appleAppHandler.queryAll()

        threads = []
        for index, app in enumerate(appTuple):
            print(app)
            threads.append(
                ThreadExportCommentOfOneApp(index, app, dirPath, commentsHandler))

        # Start every worker before waiting on any of them.
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()