Ejemplo n.º 1
0
    def run(self):
        while self.__class__.isServer:
            processObj = None

            if self.inputQueue is not None:
                processObj = self.inputQueue.get(block=True)
                if processObj is None:
                    time.sleep(0.01)
                    continue
            beginTime = time.time()
            try:
                if processObj and isinstance(processObj, StreamBox):
                    # print('%s begin execute' % self.__class__)
                    processObj = self.process(processObj=processObj)
                    # print('%s finish execute' % self.__class__)
            except Exception as e:
                appLogger.error(e)
                traceback.print_stack()
                if processObj:
                    processObj.isError = True
            endTime = time.time()
            if processObj and isinstance(processObj, StreamLogger):
                processObj.setProcessorLog(self.__class__.__name__, beginTime,
                                           endTime)

#             if processObj and isinstance(processObj,StopSignal):
#                 self.__class__.isServer=False
#                 appLogger.info('%s thread stop' % self.__class__.__name__)
            try:
                if processObj is not None and self.outputQueue is not None:
                    self.outputQueue.put(processObj, block=True)
            except Exception as e:
                traceback.print_exc()
Ejemplo n.º 2
0
    def run(self):
        while self.__class__.isServer:
            processObj=None
            
            if self.inputQueue:
                processObj=self.inputQueue.get(block=True)
                if not processObj:
                    time.sleep(0.001)
                    continue
            beginTime=time.time()
            try:
                if processObj and isinstance(processObj,StreamBox):
                    processObj=self.process(processObj=processObj)
            except Exception,e:
                appLogger.error(e)
                if processObj:
                    processObj.isError=True
            endTime=time.time()
            if processObj and isinstance(processObj, StreamLogger):
                processObj.setProcessorLog(self.__class__.__name__,beginTime,endTime)

#             if processObj and isinstance(processObj,StopSignal):
#                 self.__class__.isServer=False
#                 appLogger.info('%s thread stop' % self.__class__.__name__)          
            
            if processObj and self.outputQueue:
                self.outputQueue.put(processObj,block=True)
Ejemplo n.º 3
0
    def __init__(self,mainPageUrl=None,cookiePath=None,tempDocPath=None):
        if mainPageUrl:
            self.MAIN_PAGE_URL=mainPageUrl
        if cookiePath:
            self.COOKIE_PATH=cookiePath
        if tempDocPath:
            self.TEMP_DOC_PATH=tempDocPath
        self.requestHeaders=[
                            {'User-Agent':r'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko'}
                            ,{'User-Agent':r'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0'}
#                             ,{'User-Agent':r'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)'}
#                             ,{'User-Agent':r'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)'}
#                             ,{'User-Agent':r'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'}
#                             ,{'User-Agent':r'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0'}
#                             ,{'User-Agent':r'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'}
                            ]
        try:
             #login web site
            #claim a MozillaCookieJar instance to save cookie
            cookie = cookielib.MozillaCookieJar(self.COOKIE_PATH)
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
            urllib2.install_opener(opener)
            #access main page, and save attributes to the cookie
            response=urllib2.urlopen(self.MAIN_PAGE_URL)
            #save cookie
            cookie.save(ignore_discard=True, ignore_expires=True)

        except Exception, err:
            appLogger.error(err)
Ejemplo n.º 4
0
 def getPdfFile(self, pdfRealUrl, filePath):
     '''
     get pdf file by pdf real url
     @param pdfRealUrl:pdf file's real url
     @param filePath:local disk path which will save pdf file
     @return: True if the file was downloaded and written, otherwise False
     '''
     bReturn = False
     try:
         if pdfRealUrl:
             response = requests.get(pdfRealUrl, stream=True)
             try:
                 # "with" closes the file even if a chunk fails to arrive
                 with open(filePath, "wb") as f:
                     for chunk in response.iter_content(chunk_size=1024):
                         if chunk:
                             f.write(chunk)
             finally:
                 response.close()
             # the size is read after the file is closed, so it is complete
             appLogger.info(
                 'success get file from the internet. the file size is %s bytes'
                 % os.path.getsize(filePath))
             bReturn = True
         else:
             # no real pdf url: sleep 5-19 seconds to simulate the time a
             # download would take
             sleepTime = random.randrange(5, 20, 1)
             time.sleep(sleepTime)
     except Exception as err:
         appLogger.error(err)
     return bReturn
Ejemplo n.º 5
0
 def process(self,processObj=None):
     '''
     Produce the next item from self.generater wrapped in a StreamBox.
     @param processObj: not used by this processor
     @return: a StreamBox whose result attribute holds the next generated
              value, a StopSignal when the generator is exhausted, or None
              when any other error occurs (the error is logged)
     '''
     try:
         streamBox=StreamBox()
         # builtin next() works on Python 2.6+ and 3, unlike the .next() method
         streamBox.result=next(self.generater)
         return streamBox
     except StopIteration:
         return StopSignal()
     except Exception as e:
         appLogger.error(e)
         return None
Ejemplo n.º 6
0
 def getAvg(self, array):
     '''
     Compute the arithmetic mean of the values in array.
     @param array: a sequence of numbers; may be empty or None
     @return: the mean as a float, or 0 when array is empty/None or an
              error occurs (the error is logged)
     '''
     avg=0
     try:
         if array:
             # float() keeps the division from truncating on Python 2
             avg=sum(array)/float(len(array))
     except Exception as err:
         appLogger.error(err)
     return avg
Ejemplo n.º 7
0
 def getCollection(self, collectionName):
     '''
     get the collection of the mongoDB
     @param collectionName: the name of the collection
     @return: self.DB_CLIENT[collectionName], or None when the name is empty,
              DB_CLIENT is None, or the lookup fails (the error is logged)
     '''
     # compare with None: PyMongo handles may refuse truth-value testing
     if collectionName and self.DB_CLIENT is not None:
         try:
             return self.DB_CLIENT[collectionName]
         except Exception as err:
             appLogger.error(err)
     return None
Ejemplo n.º 8
0
 def getPdfUrl(self, jsonResult):
     '''
     get pdf file url from article data
     @param jsonResult: article data (a mapping with a 'pdf_url' key)
     @return: the value stored under 'pdf_url', or None when jsonResult is
              empty or the lookup fails (the error is logged)
     '''
     uReturn = None
     try:
         if jsonResult:
             uReturn = jsonResult['pdf_url']
     except Exception as err:
         appLogger.error(err)
     return uReturn
Ejemplo n.º 9
0
 def getPeriod(self,message):
     '''
     Compute the difference between the two most recent entries recorded in
     self.timeMap for message and append it to self.periodMap[message].
     @param message: key used in both timeMap and periodMap
     @return: the last entry minus the one before it, or 0 when fewer than
              two entries exist or an error occurs (the error is logged)
     '''
     dReturn=0
     try:
         timeArray=self.timeMap.get(message)
         periodArray=self.periodMap.get(message)
         if not periodArray:
             periodArray=[]
             self.periodMap[message]=periodArray
         # two entries are required; a single one has no predecessor
         if timeArray and len(timeArray)>=2:
             dReturn=timeArray[-1]-timeArray[-2]
             periodArray.append(dReturn)
     except Exception as err:
         appLogger.error(err)
     return dReturn
Ejemplo n.º 10
0
 def getArticles(self, jsonResponse):
     '''
     get query articles
     @param jsonResponse: JSON text of a query response
     @return: the value stored under the 'articles' key of the decoded
              response, or None when the response is empty, has no such key,
              or cannot be decoded (the error is logged)
     '''
     pReturn = None
     try:
         if jsonResponse:
             jdatas = json.loads(jsonResponse)
             if jdatas:
                 pReturn = jdatas.get('articles')
     except Exception as err:
         appLogger.error(err)
     return pReturn
Ejemplo n.º 11
0
    def getRealPdfUrl(self, pdfUrl):
        '''
        get real pdf url by pdfUrl
        @param pdfUrl: the pdfUrl which is gotten from ieee xplore api
        @return: the src attribute of the first iframe found in the page at
                 pdfUrl, or None when pdfUrl is empty, the page has no
                 iframe, or an error occurs (the error is logged)
        '''
        sReturn = None
        maxAttempts = 1  # the page is requested at most this many times

        try:
            if pdfUrl:
                # load the saved cookies and install them for urllib2
                cookie = cookielib.MozillaCookieJar()
                cookie.load(self.COOKIE_PATH,
                            ignore_discard=True,
                            ignore_expires=True)
                cookieHandler = urllib2.HTTPCookieProcessor(cookie)
                opener = urllib2.build_opener(cookieHandler)
                urllib2.install_opener(opener)
                for _ in range(maxAttempts):
                    requestHeaders = self.getRandomRequestHeaders()
                    request = urllib2.Request(pdfUrl, headers=requestHeaders)
                    response = urllib2.urlopen(request)

                    # pause 1-2 seconds after each request
                    time.sleep(random.randrange(1, 3, 1))
                    # queueLock is not acquired in this method, so it must
                    # not be released here: doing so raises, or releases a
                    # lock held by another thread
                    soup = BeautifulSoup(response, features='lxml')
                    appLogger.info(pdfUrl)
                    if soup.iframe:
                        sReturn = soup.iframe.attrs.get('src')  #get real pdf url
                        break
                    # no iframe: show the headers that were used, then wait
                    print(requestHeaders)
                    time.sleep(10)
        except Exception as err:
            appLogger.error(err)
        return sReturn
Ejemplo n.º 12
0
 def getDatabase(self, databaseName):
     '''
     get the database of mongoDB
     @param databaseName:the name of database
     @return: the database named databaseName, or None when the name is
              empty, the database does not exist, or an error occurs (the
              error is logged)
     '''
     if databaseName:
         try:
             client = MongoClient(self.DB_URL)
             if databaseName in client.database_names():
                 return client[databaseName]
             # no database handed out, so the connection is not needed
             client.close()
         except Exception as err:
             appLogger.error(err)
     return None
Ejemplo n.º 13
0
 def insertBinaryData(self,binaryData,saveFilename,collectionName):
     '''
     insert binary data into the mongodb
     @param binaryData: binary data which will be inserted
     @param saveFilename: file name which is in the mongodb
     @param collectionName: collection name of the database
     @return: the value returned by the collection's save() call, or None
              when there is no data, no collection, or an error occurs (the
              error is logged)
     '''
     sReturn=None
     try:
         if binaryData:
             coll=self.getCollection(collectionName)
             # compare with None: PyMongo collections may refuse truth testing
             if coll is not None:
                 sReturn = coll.save(dict(content= bson.binary.Binary(binaryData),filename = saveFilename))
     except Exception as err:
         appLogger.error(err)
     return sReturn
 def getRealPdfUrl(self, pdfUrl):
     '''
     Simulated lookup of the real pdf url: counts one connection and, when
     pdfUrl is given, sleeps for the delay returned by
     getSer1RandomDelayTime().
     @param pdfUrl: the pdfUrl which is gotten from ieee xplore api
     @return: sReturn, which the visible code never assigns, so None
     '''
     sReturn = None
     self.addWeb1ConnectCount()
     try:
         if pdfUrl:
             time.sleep(
                 self.getSer1RandomDelayTime(minDelay=1,
                                             maxDelay=3,
                                             baseConnect=1))
     except Exception as err:
         appLogger.error(err)
     return sReturn
 def getPdfFile(self, pdfRealUrl, filePath):
     '''
     Simulated pdf download: counts one connection and sleeps for the delay
     returned by getSer2RandomDelayTime(). No file is written.
     @param pdfRealUrl:pdf file's real url (not used by the visible code)
     @param filePath:local disk path (not used by the visible code)
     @return: bReturn, which the visible code never sets to True, so False
     '''
     bReturn = False
     self.addWeb2ConnectCount()
     try:
         time.sleep(
             self.getSer2RandomDelayTime(minDelay=5,
                                         maxDelay=20,
                                         baseConnect=1))
     except Exception as err:
         appLogger.error(err)
     return bReturn
Ejemplo n.º 16
0
 def getFileById(self,fileId,collectionName):
     '''
     get binary data from mongodb
     @param fileId: the id of file which will get from the MongoDB
     @param collectionName: collection name of the database
     @return: the 'content' field of the matching document, or None when
              fileId is empty, the collection is unavailable, no document
              matches, or an error occurs (the error is logged)
     '''
     dReturn=None
     try:
         if fileId:
             coll=self.getCollection(collectionName)
             # getCollection() may return None; skip the query in that case
             if coll is not None:
                 data = coll.find_one({'_id':ObjectId(fileId)})
                 if data:
                     dReturn=data['content']
     except Exception as err:
         appLogger.error(err)
     return dReturn
Ejemplo n.º 17
0
 def insertOneData(self, collectionName=DB_COLL, **dataSet):
     '''
     insert data into the mongoDB
     @param collectionName: collection name of the database
     @param dataSet: keyword arguments forming the document to insert
     @return: the inserted_id reported by insert_one(), or None when there
              is no data, no collection, or an error occurs (the error is
              logged)
     '''
     sReturn=None
     try:
         if dataSet and collectionName:
             coll=self.getCollection(collectionName)
             # compare with None: PyMongo collections may refuse truth testing
             if coll is not None:
                 resultObject = coll.insert_one(dataSet)
                 if resultObject:
                     sReturn=resultObject.inserted_id
     except Exception as err:
         appLogger.error(err)
     return sReturn
Ejemplo n.º 18
0
 def process(self,processObj=None):
     '''
     Receive one message from the remote connection and wrap it in a
     StreamBox.
     @param processObj: not used by this processor
     @return: a StreamBox carrying captured_time, captured_location,
              rgb_small_frame and the connection itself; a StopSignal on
              StopIteration; on any other error the error is logged and
              whatever was built so far (possibly None) is returned
     '''
     box = None
     try:
         remote = self.getConnFromRemote()
         message = remote.recv()
         box = StreamBox()
         # the message is expected to hold exactly three fields
         capturedTime, capturedLocation, smallFrame = message
         box.captured_time = capturedTime
         box.captured_location = capturedLocation
         box.rgb_small_frame = smallFrame
         box.conn = remote
     except StopIteration:
         return StopSignal()
     except Exception as e:
         appLogger.error(e)
         traceback.print_exc()
     return box
Ejemplo n.º 19
0
 def insertFile(self, filePath, saveFilename, collectionName=DB_COLL_BIN, isDelFile=False):
     '''
     insert file into the mongodb
     @param filePath: file path which will be inserted
     @param saveFilename: file name which is in the mongodb
     @param collectionName: collection name of the database
     @param isDelFile: if true then delete the file which is filePath
     @return: the value returned by insertBinaryData(), or None when no path
              is given or an error occurs (the error is logged)
     '''
     sReturn=None
     try:
         if filePath:
             # only byte-string paths need decoding; text paths are used as-is
             if isinstance(filePath, bytes):
                 filePath=filePath.decode('utf-8')
             with open (filePath,'rb') as fileObj:
                 sReturn=self.insertBinaryData(fileObj.read(), saveFilename, collectionName)
     except Exception as err:
         appLogger.error(err)
     finally:
         # honour isDelFile whether or not the insert succeeded
         if isDelFile and filePath:
             try:
                 os.remove(filePath)
             except Exception as err:
                 appLogger.error(err)
     return sReturn
    def queryData(self, keyWords=''):
        '''
        query ieee xplore database to get results
        @param keyWords: key words
        @return: a list with the articles of every page fetched; empty when
                 nothing matches. If an error occurs it is logged and the
                 articles collected so far are returned.
        Pages hold up to QUERY_RETURN_MAX_RESULTS records, and paging stops
        once CUR_QUERY_COUNT reaches MAX_QUERY_COUNT_LIMIT. (The original
        note states a maximum of 40000 results.)
        '''
        # Sample record, kept for reference:
        #         lReturn=[{"_id":"5b839c9b7bbd7112ec94e5bf","issn":"0148-9267","start_page":"106","publication_number":6720219,"rank":16,"article_number":"6792690","title":"Évelyne Gayou, Editor: Polychrome Portraits 14: Pierre Schaeffer","abstract_url":"https://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6792690","issue":"1","is_number":6790986,"index_terms":{},"publication_title":"Computer Music Journal","volume":"34","access_type":"LOCKED","content_type":"Journals","authors":{"authors":[{"author_order":1,"affiliation":"San Francisco, California, USA.","full_name":"Thom Blum"}]},"publication_date":"March 2010","fileId":"","publisher":"MIT Press","doi":"10.1162/comj.2010.34.1.106","pdf_url":"https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=6792690","partnum":"6792690","end_page":"111","citing_paper_count":0}]
        lReturn = []
        try:
            begin = 1  #query start record number
            query = XPLORE(self.API_KEY)
            query.maximumResults(self.QUERY_RETURN_MAX_RESULTS)
            query.queryText(keyWords)
            query.resultsSorting('publication_year', 'desc')
            #only query the articles which is open access
            query.resultsFilter('open_access', 'True')
            if self.QUERY_BEGIN_YEAR:
                query.resultsFilter('start_year', self.QUERY_BEGIN_YEAR)
            if self.QUERY_END_YEAR:
                query.resultsFilter('end_year', self.QUERY_END_YEAR)

            while True:
                query.startingResult(begin)
                results = query.callAPI(debugModeOff=True)
                self.CUR_QUERY_COUNT += 1
                articles = self.getArticles(results)  #get articles list
                if not articles:
                    break
                lReturn.extend(articles)  # add articles to result list
                # a full page means more records may follow; keep paging only
                # while the call budget allows
                if (len(articles) != self.QUERY_RETURN_MAX_RESULTS
                        or self.CUR_QUERY_COUNT >= self.MAX_QUERY_COUNT_LIMIT):
                    break
                begin = len(lReturn) + 1
        except Exception as err:
            appLogger.error(err)
        return lReturn
    def process(self, processObj=None):
        '''
        Receive one message from the MQTT tool and turn it into a StreamBox.
        @param processObj: not used by this processor
        @return: None when no topic was received or when
                 get_frame_time_difference() of the captured time exceeds 1;
                 a StopSignal on StopIteration; otherwise the StreamBox. On
                 any other error the error is logged and whatever was built
                 so far (possibly None) is returned.
        '''
        box = None
        try:
            topic, payload = self.mqttTool.recvDataFromServer()
            if topic is None:
                return None
            box = StreamBox()
            # the topic is expected to have exactly three '/'-separated parts
            university, classroom, capturedTime = topic.split('/')
            box.captured_university = university
            box.captured_classroom = classroom
            box.captured_time = capturedTime

            if get_frame_time_difference(box.captured_time) > 1:
                return None
            rawBytes = np.asarray(bytearray(payload), dtype="uint8")
            decoded = cv2.imdecode(rawBytes, cv2.IMREAD_COLOR)
            # last axis reversed - presumably BGR to RGB; confirm
            box.rgb_small_frame = decoded[:, :, ::-1]
            box.frame = decoded
        except StopIteration:
            return StopSignal()
        except Exception as e:
            appLogger.error(e)
            traceback.print_exc()
        return box
Ejemplo n.º 22
0
        sReturn=None
        try:
            if filePath:
                filePath=filePath.decode('utf-8')
                with open (filePath,'rb') as fileObj:
                    content = StringIO(fileObj.read())
                    sReturn=self.insertBinaryData(content.getvalue(), saveFilename, collectionName)
            
        except Exception, err:
            appLogger.error(err)
        finally:
            if isDelFile:
                try:
                    os.remove(filePath)
                except Exception, err:
                    appLogger.error(err)
        return sReturn

                    
    def insertBinaryData(self,binaryData,saveFilename,collectionName):
        '''
        insert binary data into the mongodb
        @param binaryData: binary data which will be inserted
        @param saveFilename: file name which is in the mongodb
        @param collectionName: collection name of the database
        @return: if insert success then it will return id which is in the mongoDB, else return None 
        '''
        sReturn=None
        try:
            if binaryData:
                coll=self.getCollection(collectionName)
Ejemplo n.º 23
0
    def run(self):
        appLogger.info('Thread %s start' % self.getName())
        if not self.taskQueue:
            return
        while True:
            try:
                queueLock.acquire()
                if self.taskQueue.empty():
                    queueLock.release()
                    break
                result = self.taskQueue.get(block=True)
                queueLock.release()

                self.printTool.printStartMessage(
                    '%s processes result' % threading.Thread.getName(self))
                self.printTool.printStartMessage(
                    '%s gets pdf url' % threading.Thread.getName(self))
                pdfUrl = self.apiSpider.getPdfUrl(result)
                #             print pdfUrl
                pdfRealUrl = self.webPageSpider.getRealPdfUrl(pdfUrl)
                #             print pdfRealUrl
                self.printTool.printEndMessage('%s gets pdf url' %
                                               threading.Thread.getName(self))
                self.printTool.printStartMessage(
                    '%s gets pdf file' % threading.Thread.getName(self))
                if pdfRealUrl:  #if real file not exist then use simulated file
                    fileName = result.get('article_number') + '.pdf'
                else:
                    fileName = 'simulated file.pdf'
                fileTempPath = self.webPageSpider.generateTempFilePath(
                    fileName)
                fileId = ''
                flag = self.webPageSpider.getPdfFile(pdfRealUrl, fileTempPath)
                self.printTool.printEndMessage('%s gets pdf file' %
                                               threading.Thread.getName(self))
                self.printTool.printStartMessage(
                    '%s inserts pdf file into the database' %
                    threading.Thread.getName(self))
                if flag:  #if get pdf file success then save the file into the database
                    fileId = self.mongoDBDAO.insertFile(fileTempPath,
                                                        fileName,
                                                        isDelFile=True)
                else:
                    fileId = self.mongoDBDAO.insertFile(fileTempPath,
                                                        fileName,
                                                        isDelFile=False)
                self.printTool.printEndMessage(
                    '%s inserts pdf file into the database' %
                    threading.Thread.getName(self))
                self.printTool.printStartMessage(
                    '%s inserts articles into the database' %
                    threading.Thread.getName(self))
                result['fileId'] = fileId  #set fileId in the result
                self.mongoDBDAO.insertOneData(
                    **result)  #save a result into the database
                self.printTool.printEndMessage(
                    '%s inserts articles into the database' %
                    threading.Thread.getName(self))
                self.printTool.printEndMessage('%s processes result' %
                                               threading.Thread.getName(self))
            except Exception, err:
                appLogger.error(err)
            finally: