Exemple #1
0
 def getPdfFile(self, pdfRealUrl, filePath):
     '''
     Download a pdf file from its real url and save it to local disk.

     Any exception raised while downloading or writing is logged through
     appLogger and reported as a False result instead of being re-raised.

     @param pdfRealUrl: pdf file's real url; when it is empty nothing is
                        downloaded and the call only sleeps a random period
     @param filePath: local disk path where the pdf file will be saved
     @return: True if the file was downloaded and written, otherwise False
     '''
     bReturn = False
     try:
         if pdfRealUrl:
             response = requests.get(pdfRealUrl, stream=True)
             # the with-statement closes the file even when reading or
             # writing a chunk fails, and guarantees the data is flushed
             # before the file size is measured below
             with open(filePath, "wb") as f:
                 for chunk in response.iter_content(chunk_size=1024):
                     if chunk:  # ignore empty chunks
                         f.write(chunk)
             appLogger.info(
                 'success get file from the internet. the file size is %s bytes'
                 % os.path.getsize(filePath))
             bReturn = True
         else:
             # no real pdf url: sleep a random 5-19 seconds in order to
             # simulate the time needed to download the file
             sleepTime = random.randrange(5, 20, 1)
             time.sleep(sleepTime)
     except Exception as err:
         appLogger.error(err)
     return bReturn
Exemple #2
0
    def execute(self):
        '''
        Main function of the app.

        Logs a summary of self.streamLineTemplate (when it is set), starts
        every process returned by self.CreateSteamLine(), then consumes the
        output queue until the number of processed StreamLogger items equals
        the product count announced by a StopSignal. Finally all processes
        are terminated and joined and the statistics are logged.
        '''
        template = self.streamLineTemplate
        if template:
            totalProcesses = 1  # counting starts at 1 for the main process
            stages = []
            for stage in template:
                stageProcesses = stage.get('pCount')  # process number
                totalProcesses += stageProcesses
                stages.append((stageProcesses, stage.get('Thread')))

            summary = [
                'starts  %s process(including main process)\n' % totalProcesses
            ]
            for stageNo, (stageProcesses, threads) in enumerate(stages, 1):
                summary.append('process stage %d starts %d process:\n' %
                               (stageNo, stageProcesses))
                summary.append(
                    'every bizprocessor thread in the process is the following:\n'
                )
                if stageNo == 1:
                    # the first stage holds one thread entry, not a list of them
                    summary.append('%40s.%40s * %s\n' %
                                   (threads[0], threads[1], threads[2]))
                else:
                    for entry in threads:
                        summary.append('%40s.%40s * %d\n' %
                                       (entry[0], entry[1], entry[2]))
            appLogger.info(''.join(summary))

        processList, outputQueue = self.CreateSteamLine()
        stat = Statistics()
        for worker in processList:
            worker.start()

        processedCount = 0  # StreamLogger items received so far
        expectedCount = -1  # unknown until a StopSignal arrives

        while True:
            item = outputQueue.get()
            if not item:
                time.sleep(0.01)
                continue
            if isinstance(item, StreamLogger):
                stat.addProcessorLog(item)
                processedCount += 1
                # print the statistics on every third processed item
                if processedCount % 3 == 0:
                    appLogger.info('\n%s' % stat.getStatisticInfo())
            if isinstance(item, StopSignal):
                expectedCount = item.productCount
            # stop once everything that was produced has been processed
            if expectedCount == processedCount:
                break
            del item

        for worker in processList:
            worker.terminate()
            worker.join()
        appLogger.info('\n%s' % stat.getStatisticInfo())
        appLogger.info('%s thread stop' % self.__class__.__name__)
        appLogger.info('the app stop')
Exemple #3
0
 def getResultData(self):
     '''
     Generator over query results.

     For every key word returned by getKeywords(self.appConfig) the api
     spider is queried, the number of results is logged and each result
     is yielded in turn.
     '''
     for keyWord in getKeywords(self.appConfig):
         results = self.apiSpider.queryData(keyWord)
         if results and len(results) != 0:
             appLogger.info('key word: %s results number is %d' % (keyWord, len(results)))
             for item in results:
                 yield item
         else:
             appLogger.info('key word: %s results number is 0' % keyWord)
Exemple #4
0
    def printEndMessage(self,message):
        '''
        Log a 'Finishing' line for a step, including its timing statistics.

        The whole operation runs while holding ser1Lock. The lock is released
        in a finally clause so that an error raised while building the line
        (for example when self.periodMap has no entry for message) cannot
        leave the lock held and block every later caller.

        @param message: label of the step that finished; when it is empty
                        only the bare 'Finishing: ' prefix is logged
        '''
        ser1Lock.acquire()
        try:
            sReturn='Finishing: '
            if message:
                timeArray=self.timeMap.get(message)
                if not timeArray:
                    # NOTE(review): unlike printStartMessage, this new list is
                    # not stored back into self.timeMap -- confirm intent
                    timeArray=[]
                timeArray.append(self.getCurTime())
                # getPeriod is called before periodMap is read, which is the
                # same evaluation order as the original format arguments
                period=self.getPeriod(message)
                periods=self.periodMap.get(message)
                sReturn='Finishing: %-50s spending: %6d ms, max: %6d ms, min: %6d ms, avg: %6d ms' % (message,period,max(periods),min(periods),self.getAvg(periods))
            appLogger.info(sReturn)
        finally:
            ser1Lock.release()
Exemple #5
0
 def printStartMessage(self,message):
     '''
     Log a 'Beginning' line for a step and record its start time.

     The current time (self.getCurTime()) is appended to the list kept in
     self.timeMap under message; the list is created on first use. The whole
     operation runs while holding ser1Lock, which is released in a finally
     clause so that an exception cannot leave the lock held.

     @param message: label of the step that starts; when it is empty only
                     the bare 'Beginning: ' prefix is logged
     '''
     ser1Lock.acquire()
     try:
         sReturn='Beginning: '
         if message:
             timeArray=self.timeMap.get(message)
             if not timeArray:
                 timeArray=[]
                 self.timeMap[message]= timeArray
             sReturn='Beginning: %-s...' %message
             timeArray.append(self.getCurTime())
         appLogger.info(sReturn)
     finally:
         ser1Lock.release()
 def execute(self):
     '''
     Main function of the software: build the stream line, start its
     processors and then keep the calling thread alive.
     '''
     # only the entries that are BaseProcessor instances get started
     for candidate in self.CreateSteamLine():
         if not isinstance(candidate, BaseProcessor):
             continue
         candidate.start()
     # idle in short sleeps; nothing inside the loop ever leaves it
     while True:
         time.sleep(0.1)
     # not reachable, because the loop above has no exit
     appLogger.info('%s thread stop' % self.__class__.__name__)
Exemple #7
0
    def getRealPdfUrl(self, pdfUrl):
        '''
        Get the real pdf url behind pdfUrl.

        Loads the cookies stored at self.COOKIE_PATH, requests pdfUrl with
        the headers from self.getRandomRequestHeaders() and reads the 'src'
        attribute of the first iframe found in the returned page.

        @param pdfUrl: the pdfUrl which is gotten from ieee xplore api
        @return: real pdf url, or None when pdfUrl is empty, the page has no
                 iframe, or an error occurred (errors are logged, not raised)
        '''
        sReturn = None
        maxAttempts = 1  # the page is requested at most this many times

        try:
            if pdfUrl:
                # load the saved cookies and install an opener that sends
                # them, so that urllib2.urlopen below uses it
                cookie = cookielib.MozillaCookieJar()
                cookie.load(self.COOKIE_PATH,
                            ignore_discard=True,
                            ignore_expires=True)
                cookieHandler = urllib2.HTTPCookieProcessor(cookie)
                opener = urllib2.build_opener(cookieHandler)
                urllib2.install_opener(opener)

                for _ in range(maxAttempts):
                    requestHeaders = self.getRandomRequestHeaders()
                    request = urllib2.Request(pdfUrl, headers=requestHeaders)
                    response = urllib2.urlopen(request)

                    # pause a random 1-2 seconds after each request
                    time.sleep(random.randrange(1, 3, 1))
                    # NOTE: queueLock is deliberately not released here. This
                    # method never acquires it, so the release either raised
                    # (sending every call to the except branch) or released a
                    # lock held by another thread.
                    soup = BeautifulSoup(response, features='lxml')
                    appLogger.info(pdfUrl)
                    if soup.iframe:
                        sReturn = soup.iframe.attrs.get(
                            'src')  #get real pdf url
                        break
                    # no iframe in the page: show the headers that were used
                    # and wait before giving up / trying again
                    print(requestHeaders)
                    time.sleep(10)
        except Exception as err:
            appLogger.error(err)
        return sReturn
Exemple #8
0
 def getTotalStatistics(self):
     '''
     Build, log and return a one-line summary of the recorded periods.

     Keys of self.periodMap that contain ' - ' are grouped by the text after
     their last ' - '. Each group collects self.getAvg() of its entries, and
     the summary reports self.getAvg() of every group.

     @return: the summary string that was logged
     '''
     separator = ' - '
     averagesBySuffix = {}
     for label, periods in self.periodMap.items():
         if separator not in label:
             continue
         suffix = label.split(separator)[-1]
         averagesBySuffix.setdefault(suffix, []).append(self.getAvg(periods))

     pieces = ['The whole procedure:']
     for suffix, averages in averagesBySuffix.items():
         pieces.append(' %s : %d ;' % (suffix, self.getAvg(averages)))
     summary = ''.join(pieces)
     appLogger.info(summary)
     return summary
Exemple #9
0
 def queryData(self, keyWords=''):
     '''
     query ieee xplore database to get results

     The query is limited to open access journal articles, sorted by
     publication year (descending) and optionally bounded by
     self.QUERY_BEGIN_YEAR / self.QUERY_END_YEAR. Results are fetched page
     by page until a page is not full or self.MAX_QUERY_COUNT_LIMIT api
     calls have been made.

     @param keyWords: key words
     @return: a list with the articles gathered so far; it is empty when
              nothing was found, and it may be incomplete when an error
              occurred (errors are logged, not raised)
     '''
     lReturn = []
     try:
         begin = 1  #query start record number
         query = XPLORE(self.API_KEY)
         query.maximumResults(self.QUERY_RETURN_MAX_RESULTS)
         query.queryText(keyWords)
         query.resultsSorting('publication_year', 'desc')
         query.resultsFilter('content_type',
                             'Journals')  #only query journals
         query.resultsFilter(
             'open_access',
             'True')  #only query the articles which are open access
         if self.QUERY_BEGIN_YEAR:
             query.resultsFilter('start_year', self.QUERY_BEGIN_YEAR)
         if self.QUERY_END_YEAR:
             query.resultsFilter('end_year', self.QUERY_END_YEAR)
         appLogger.info(self.getQueryInfo(keyWords))
         while True:
             query.startingResult(begin)
             results = query.callAPI(debugModeOff=True)
             print(results)
             self.CUR_QUERY_COUNT += 1
             articles = self.getArticles(results)  #get articles list
             if not articles:
                 break
             lReturn.extend(articles)  # add articles to result list
             size = len(articles)  #number of articles in this page
             # a full page means more articles may follow; continue with the
             # next record unless the query count limit has been reached
             if size == self.QUERY_RETURN_MAX_RESULTS and self.CUR_QUERY_COUNT < self.MAX_QUERY_COUNT_LIMIT:
                 begin = len(lReturn) + 1
             else:
                 break
     except Exception as err:
         appLogger.error(err)
     # hand the accumulated articles back to the caller, which tests and
     # iterates the returned value
     return lReturn
Exemple #10
0
    def run(self):
        '''
        Producer loop of the processor.

        Puts one plain object() into the output queue first, then calls
        self.process() repeatedly while the class attribute isServer is
        true. Every truthy product is forwarded to the output queue; a
        StreamBox increases the class-wide productCount and a StopSignal
        receives that count and switches isServer off.
        '''
        cls = self.__class__
        self.outputQueue.put(object(), block=True)
        appLogger.info('init queue...')
        while cls.isServer:
            startedAt = time.time()
            product = self.process()
            finishedAt = time.time()
            if isinstance(product, StreamLogger):
                # record which processor handled the product and when
                product.setProcessorLog(cls.__name__, startedAt, finishedAt)
            if product and self.outputQueue:
                if isinstance(product, StreamBox):
                    cls.productCount = cls.productCount + 1
                if isinstance(product, StopSignal):
                    # hand the number of produced boxes over with the signal
                    # and end the loop after this iteration
                    product.productCount = cls.productCount
                    cls.isServer = False
                self.outputQueue.put(product, block=True)
            time.sleep(0.01)
Exemple #11
0
    cf = ConfigParser()
    cf.read(configFilePath)
    keyWords = getKeywords(cf)

    pt.printStartMessage('initiate')
    apiSpider = getApiSpider(cf)
    webPageSpider = getWebPageSipder(cf)
    mongoDBDAO = getDatabase(cf)
    threadCount = getTHreadCount(cf)

    #
    pt.printEndMessage('initiate')
    pt.printStartMessage('processes')
    taskQueue = Queue()
    for keyWord in keyWords:
        appLogger.info(
            '------------------------------------------------------------')
        pt.printStartMessage('query articles by keywords:' + keyWord)
        results = apiSpider.queryData(keyWord)
        pt.printEndMessage('query articles by keywords:' + keyWord)
        if not results or len(results) == 0:
            print 'Results number is 0'
            break
        else:
            print 'Results number is %d' % len(results)
            for result in results:
                taskQueue.put(result)

        pt.printStartMessage('processes result set')
        threadArray = []
        for i in range(threadCount):
            nt = NormalThread(taskQueue, apiSpider, webPageSpider, mongoDBDAO,
Exemple #12
0
    #initialize app
    configFilePath = '../config.conf'
    cf = ConfigParser()
    cf.read(configFilePath)
    keyWords = getKeywords(cf)

    pt.printStartMessage('initiate')
    apiSpider = getApiSpider(cf)
    webPageSpider = getWebPageSipder(cf)
    mongoDBDAO = getDatabase(cf)

    #
    pt.printEndMessage('initiate')
    pt.printStartMessage('processes')
    for keyWord in keyWords:
        appLogger.info(
            '------------------------------------------------------------')
        pt.printStartMessage('query articles by keywords:' + keyWord)
        results = apiSpider.queryData(keyWord)
        pt.printEndMessage('query articles by keywords:' + keyWord)
        if not results or len(results) == 0:
            print 'Results number is 0'
            break
        else:
            print 'Results number is %d' % len(results)
        pt.printStartMessage('processes result set')
        resultNum = 0
        for result in results:
            appLogger.info(
                '----------------------------%d--------------------------------'
                % resultNum)
            resultNum += 1
Exemple #13
0
    def run(self):
        appLogger.info('Thread %s start' % self.getName())
        if not self.taskQueue:
            return
        while True:
            try:
                queueLock.acquire()
                if self.taskQueue.empty():
                    queueLock.release()
                    break
                result = self.taskQueue.get(block=True)
                queueLock.release()

                self.printTool.printStartMessage(
                    '%s processes result' % threading.Thread.getName(self))
                self.printTool.printStartMessage(
                    '%s gets pdf url' % threading.Thread.getName(self))
                pdfUrl = self.apiSpider.getPdfUrl(result)
                #             print pdfUrl
                pdfRealUrl = self.webPageSpider.getRealPdfUrl(pdfUrl)
                #             print pdfRealUrl
                self.printTool.printEndMessage('%s gets pdf url' %
                                               threading.Thread.getName(self))
                self.printTool.printStartMessage(
                    '%s gets pdf file' % threading.Thread.getName(self))
                if pdfRealUrl:  #if real file not exist then use simulated file
                    fileName = result.get('article_number') + '.pdf'
                else:
                    fileName = 'simulated file.pdf'
                fileTempPath = self.webPageSpider.generateTempFilePath(
                    fileName)
                fileId = ''
                flag = self.webPageSpider.getPdfFile(pdfRealUrl, fileTempPath)
                self.printTool.printEndMessage('%s gets pdf file' %
                                               threading.Thread.getName(self))
                self.printTool.printStartMessage(
                    '%s inserts pdf file into the database' %
                    threading.Thread.getName(self))
                if flag:  #if get pdf file success then save the file into the database
                    fileId = self.mongoDBDAO.insertFile(fileTempPath,
                                                        fileName,
                                                        isDelFile=True)
                else:
                    fileId = self.mongoDBDAO.insertFile(fileTempPath,
                                                        fileName,
                                                        isDelFile=False)
                self.printTool.printEndMessage(
                    '%s inserts pdf file into the database' %
                    threading.Thread.getName(self))
                self.printTool.printStartMessage(
                    '%s inserts articles into the database' %
                    threading.Thread.getName(self))
                result['fileId'] = fileId  #set fileId in the result
                self.mongoDBDAO.insertOneData(
                    **result)  #save a result into the database
                self.printTool.printEndMessage(
                    '%s inserts articles into the database' %
                    threading.Thread.getName(self))
                self.printTool.printEndMessage('%s processes result' %
                                               threading.Thread.getName(self))
            except Exception, err:
                appLogger.error(err)
            finally:
Exemple #14
0
                                                        isDelFile=True)
                else:
                    fileId = self.mongoDBDAO.insertFile(fileTempPath,
                                                        fileName,
                                                        isDelFile=False)
                self.printTool.printEndMessage(
                    '%s inserts pdf file into the database' %
                    threading.Thread.getName(self))
                self.printTool.printStartMessage(
                    '%s inserts articles into the database' %
                    threading.Thread.getName(self))
                result['fileId'] = fileId  #set fileId in the result
                self.mongoDBDAO.insertOneData(
                    **result)  #save a result into the database
                self.printTool.printEndMessage(
                    '%s inserts articles into the database' %
                    threading.Thread.getName(self))
                self.printTool.printEndMessage('%s processes result' %
                                               threading.Thread.getName(self))
            except Exception, err:
                appLogger.error(err)
            finally:
                try:
                    queueLock.release()
                except Exception, err:
                    pass
        appLogger.info('Thread %s end' % self.getName())


if __name__ == '__main__':
    # nothing is executed when this module is run directly
    pass