Exemple #1
0
 def __init__(self, appids):
     """Set up the thread state for scraping comments of the given app ids.

     :param appids: list of app ids this thread will process.
     """
     threading.Thread.__init__(self)
     self.__appids = appids    # a list
     # Request headers passed along with every feed download.
     user_agent = 'Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/57.0'
     self.__headers = {'User-Agent': user_agent}
     # Nine capture groups per <entry>, in feed order: updated, id, title,
     # content, voteSum, voteCount, rating, version, name.
     entry_regex = ''.join((
         '<entry>.*?<updated>(.*?)</updated>.*?<id>(.*?)</id>.*?',
         '<title>(.*?)</title>.*?<content.*?>(.*?)</content>.*?<im:voteSum>(.*?)</im:voteSum>.*?',
         '<im:voteCount>(.*?)</im:voteCount>.*?<im:rating>(.*?)</im:rating>.*?',
         '<im:version>(.*?)</im:version>.*?<name>(.*?)</name>',
     ))
     self.__pattern = re.compile(entry_regex, re.M | re.S)
     self.__appCommentsHandler = AppCommentsDbHandler()
     # Running total of newly stored comments.
     self.__count = 0
Exemple #2
0
class WebToDb(object):
    """Download iTunes customer-review feeds and store the entries via the DB handlers."""

    def __init__(self):
        """Prepare the request headers, the review-entry regex and both DB handlers."""
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/57.0'
        self.__headers = {'User-Agent': user_agent}
        # Nine capture groups per <entry>, in feed order: updated, id, title,
        # content, voteSum, voteCount, rating, version, name.
        entry_regex = ''.join((
            '<entry>.*?<updated>(.*?)</updated>.*?<id>(.*?)</id>.*?',
            '<title>(.*?)</title>.*?<content.*?>(.*?)</content>.*?<im:voteSum>(.*?)</im:voteSum>.*?',
            '<im:voteCount>(.*?)</im:voteCount>.*?<im:rating>(.*?)</im:rating>.*?',
            '<im:version>(.*?)</im:version>.*?<name>(.*?)</name>',
        ))
        self.__pattern = re.compile(entry_regex, re.M | re.S)
        self.__appCommentsHandler = AppCommentsDbHandler()
        self.__appleAppHandler = AppleAppDbHandler()

    def executeAll(self):
        """Update the comments of every app returned by the app handler.

        :return: sum of the values returned by ``executeByAppId`` for each app.
        """
        total_added = 0
        for app_row in self.__appleAppHandler.queryAll():
            app_id, app_name = app_row[0], app_row[1]
            print('\n正在获取苹果应用: %s-%s 的最新评论······' % (app_id, app_name))
            newly_added = self.executeByAppId(app_id)
            print('新增%d条评论' % newly_added)
            total_added += newly_added
        return total_added

    def executeByAppId(self, appId):
        """Fetch feed pages 1 to 10 for one app and insert their entries.

        Paging for the app stops at the first page whose processing raises:
        either the handler signalled ``UserWarning`` on insert, or the page
        result could not be iterated.

        :param appId: id of the app whose review feed is downloaded.
        :return: difference of the handler's ``count()`` after and before.
        """
        total_at_start = self.__appCommentsHandler.count()
        for page_no in range(1, 11):
            feed_url = "".join((
                "https://itunes.apple.com/rss/customerreviews/page=", str(page_no),
                "/id=", str(appId), "/sortby=mostrecent/xml?l=en&&cc=cn",
            ))
            page_entries = MySpider(feed_url, self.__headers, self.__pattern).getMsgs()
            try:
                self._storePage(page_entries, appId)
            except Exception as stop_reason:
                print(stop_reason, 'Update next app\'s comment!')
                break
        return self.__appCommentsHandler.count() - total_at_start

    def _storePage(self, page_entries, appId):
        """Insert every entry of one feed page, one by one.

        A ``UserWarning`` from the handler is re-raised as
        ``UserWarning('Outdated comments!')``; any other insert error is
        printed and the remaining entries are still processed.
        """
        for entry in page_entries:
            # Stored row = scraped fields + app_id + empty isSpam marker.
            row = list(entry) + [appId, str('')]
            try:
                self.__appCommentsHandler.insertAppComment(row)
            except UserWarning:
                raise UserWarning('Outdated comments!')
            except Exception as insert_error:
                print(insert_error)
Exemple #3
0
class ThreadOfWebToDb(threading.Thread):
    """Thread that downloads the review feeds of a list of app ids and stores the entries."""

    def __init__(self, appids):
        """Set up the thread state.

        :param appids: list of app ids this thread will process.
        """
        threading.Thread.__init__(self)
        self.__appids = appids    # a list
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/57.0'
        self.__headers = {'User-Agent': user_agent}
        # Nine capture groups per <entry>, in feed order: updated, id, title,
        # content, voteSum, voteCount, rating, version, name.
        entry_regex = ''.join((
            '<entry>.*?<updated>(.*?)</updated>.*?<id>(.*?)</id>.*?',
            '<title>(.*?)</title>.*?<content.*?>(.*?)</content>.*?<im:voteSum>(.*?)</im:voteSum>.*?',
            '<im:voteCount>(.*?)</im:voteCount>.*?<im:rating>(.*?)</im:rating>.*?',
            '<im:version>(.*?)</im:version>.*?<name>(.*?)</name>',
        ))
        self.__pattern = re.compile(entry_regex, re.M | re.S)
        self.__appCommentsHandler = AppCommentsDbHandler()
        # Running total of comments added by run().
        self.__count = 0

    def countOfNewComments(self):
        """Return the number of comments added so far by ``run``."""
        return self.__count

    def run(self):
        """Fetch comments from the feed for every app id and save them to the database."""
        for appid in self.__appids:
            gained = self._collectNewComments(appid)
            print('{} get {} comments.Update next app\'s comment!'
                  .format(appid, gained))
            self.__count += gained

    def _collectNewComments(self, appid):
        """Store feed pages 1 to 10 of one app; return the per-app count increase.

        Paging stops silently at the first page whose processing raises
        (``UserWarning`` from the handler on insert, or a page result that
        cannot be iterated).
        """
        stored_before = self.__appCommentsHandler.countByAppId(appid)
        for page_no in range(1, 11):
            feed_url = "".join((
                "https://itunes.apple.com/rss/customerreviews/page=", str(page_no),
                "/id=", str(appid), "/sortby=mostrecent/xml?l=en&&cc=cn",
            ))
            page_entries = MySpider(feed_url, self.__headers, self.__pattern).getMsgs()
            try:
                # insert the entries of the current page one by one
                for entry in page_entries:
                    # Stored row = scraped fields + app_id + empty isSpam marker.
                    row = list(entry) + [appid, str('')]
                    try:
                        self.__appCommentsHandler.insertAppComment(row)
                    except UserWarning:
                        raise UserWarning('{} Outdated comments!'.format(appid))
                    except Exception as insert_error:
                        print(insert_error)
            except Exception:
                break
        return self.__appCommentsHandler.countByAppId(appid) - stored_before
Exemple #4
0
 def __init__(self):
     """Create the handler objects this instance works with."""
     # Handler for the Apple-app records (presumably DB-backed -- confirm
     # against AppleAppDbHandler).
     self.__appleAppHandler = AppleAppDbHandler()
     # Handler for the stored app comments.
     self.__appCommentsHandler = AppCommentsDbHandler()
Exemple #5
0
 def __init__(self):
     """Create the three handler objects this instance works with."""
     # connect to database
     self.__appCommentsHandler = AppCommentsDbHandler()
     self.__appleAppHandler = AppleAppDbHandler()
     self.__signedCommentsDbHandler = SignedCommentsDbHandler()
     self.__signedCommentsDbHandler = SignedCommentsDbHandler()
Exemple #6
0
class DbToXlsx(object):
    """Export stored comments to .xlsx files under ``<config.RESOURCES_PATH>/output``."""

    def __init__(self):
        """Create the handlers for comments, apps and signed comments."""
        # connect to database
        self.__appCommentsHandler = AppCommentsDbHandler()
        self.__appleAppHandler = AppleAppDbHandler()
        self.__signedCommentsDbHandler = SignedCommentsDbHandler()

    @staticmethod
    def _timestamp():
        """Return the local time as ``year_mon_mday_hour_min_sec`` (no zero padding)."""
        now = time.localtime(time.time())
        # The first six struct_time fields are exactly year..second.
        return "_".join(str(field) for field in now[:6])

    def exportAllComments(self):
        """Write every comment returned by the comments handler into one workbook.

        The file is named ``<timestamp>.xlsx`` and saved in the output
        directory, which is created first if it does not exist yet.
        """
        filePath = os.path.join(config.RESOURCES_PATH, 'output',
                                self._timestamp() + '.xlsx')
        # Saving would fail on a fresh checkout where 'output' is missing.
        os.makedirs(os.path.dirname(filePath), exist_ok=True)
        comments = self.__appCommentsHandler.queryAll()

        wb = Workbook()
        ws = wb.active
        # Header row; each appended comment is expected to follow this column order.
        head = ('app_id', 'time', 'comment_id', 'title', 'content', 'voteSum', 'voteCount',
                'rating', 'version', 'user_name', 'isSpam', 'app_name')
        for column, title in enumerate(head, start=1):
            ws.cell(row=1, column=column).value = title
        for comment in comments:
            ws.append(comment)
        wb.save(filename=filePath)

    # export comments to several xlsx files named by app_id-app_name
    # multi thread
    def exportCommentsEachApp(self):
        """Export the comments of every app into a new ``<timestamp>xlsx`` directory."""
        self._exportEachApp('xlsx', self.__appCommentsHandler)

    def exportSignedCommentsEachApp(self):
        """Export the signed comments of every app into a new ``<timestamp>Signedxlsx`` directory."""
        self._exportEachApp('Signedxlsx', self.__signedCommentsDbHandler)

    def _exportEachApp(self, dirSuffix, commentsHandler):
        """Run one ThreadExportCommentOfOneApp per app and wait for all of them.

        :param dirSuffix: text appended to the timestamp to form the directory name.
        :param commentsHandler: handler object handed to every export thread.
        :raises FileExistsError: if the target directory already exists
            (e.g. two exports of the same kind within one second).
        """
        dirPath = os.path.join(config.RESOURCES_PATH, 'output',
                               self._timestamp() + dirSuffix)
        # makedirs also creates a missing 'output' parent; mkdir would fail there.
        os.makedirs(dirPath)
        appTuple = self.__appleAppHandler.queryAll()  # ((app_id,app_name),(app_id,app_name),...)

        # NOTE(review): all threads share one handler instance -- confirm the
        # handler is safe to use from several threads.
        threads = []
        for index, app in enumerate(appTuple):
            print(app)
            threads.append(
                ThreadExportCommentOfOneApp(index, app, dirPath, commentsHandler))
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()