Esempio n. 1
0
    def __init__(self, conf, fpool):
        """Set up the worker thread, the banned-terms pattern and the helpers.

        conf  -- configuration object; ``conf.bannedTerms`` is read here and
                 ``conf`` is handed to ``UserProcessor`` and ``CsvWrapper``.
        fpool -- file queue, stored as ``self._fpool``.

        ``self.pattern`` becomes a case-insensitive alternation of the
        stripped banned terms.
        """
        threading.Thread.__init__(self)

        self.conf = conf
        # sets the reference to the file queue
        self._fpool = fpool

        # Blank entries are discarded: an empty alternative (or an empty
        # pattern) matches every string when used with search().
        terms = [term.strip() for term in self.conf.bannedTerms]
        terms = [term for term in terms if term]

        if terms:
            regexp = '|'.join(terms)
        else:
            # An empty negative lookahead can never succeed, so with no
            # banned terms configured the pattern matches nothing.
            regexp = '(?!)'

        self.pattern = re.compile(regexp, re.IGNORECASE)

        self.userProcessor = UserProcessor(conf)
        self.csvProcessor = CsvWrapper(conf)
Esempio n. 2
0
class PostProcessor(threading.Thread):
    """Worker thread that filters CSV posts and ships them in batches.

    Every source file is read through ``CsvWrapper``. Rows whose first or
    second column matches a banned term are dropped; the remaining rows are
    turned into posts, written under ``conf.processedPath`` in groups of
    ``conf.batchSize`` and sent to the XML-RPC server at
    ``conf.importServerUrl``.
    """

    def __init__(self, conf, fpool):
        """Set up the thread, the banned-terms pattern and the helpers.

        conf  -- configuration object shared with ``UserProcessor`` and
                 ``CsvWrapper``.
        fpool -- pool whose ``get()`` yields the file names to process, or
                 None when ``process`` is going to be called directly.
        """
        threading.Thread.__init__(self)

        self.conf = conf
        # sets the reference to the file queue
        self._fpool = fpool

        self.pattern = self._compileBannedPattern(self.conf.bannedTerms)

        self.userProcessor = UserProcessor(conf)
        self.csvProcessor = CsvWrapper(conf)

    @staticmethod
    def _compileBannedPattern(terms):
        """Return a case-insensitive pattern matching any of ``terms``.

        Terms are stripped and blank ones are discarded, because an empty
        alternative matches every string and would ban every post.  When no
        usable term is left the returned pattern never matches.
        """
        cleaned = [term.strip() for term in terms]
        cleaned = [term for term in cleaned if term]

        if cleaned:
            regexp = '|'.join(cleaned)
        else:
            # An empty negative lookahead can never succeed.
            regexp = '(?!)'

        return re.compile(regexp, re.IGNORECASE)

    def run(self):
        """Take file names from the pool and process them, indefinitely.

        Raises NoFilePoolError when the processor was built without a pool.
        """
        if self._fpool is None:
            # Raised as a single instance carrying the message; the
            # "raise instance, value" form is rejected by the interpreter.
            raise NoFilePoolError('You must provide a file pool if you intend to run the processor in a multithreading fashion.')

        while True:
            # get the file from the pool
            fname = self._fpool.get()
            if fname is not None:
                print('%s: Star processing file: %s' % (self.getName(), fname))
                self.process(self.conf.queuePath + fname)

    def process(self, file):
        """Parse one source CSV file and emit its posts in batches.

        file -- path of the CSV file to read.

        The first row is skipped as a header.  A row is discarded when its
        first or second column contains a banned term.  A batch is saved and
        sent every time ``conf.batchSize`` posts have been collected, and
        once more at the end for any remainder.
        """
        posts = []

        # 'with' closes the source file even when a row raises
        with open(file, 'r') as fhandle:
            reader = self.csvProcessor.reader(fhandle)

            headerSkipped = False
            for row in reader:
                if not headerSkipped:
                    headerSkipped = True
                    continue

                # verify that the post doesn't contain banned terms
                if self.isBanned(row[0]) or self.isBanned(row[1]):
                    print('%s: Post baneado' % self.getName())
                    continue

                category = self.mapCategory(row[2])
                user = self.getUser()
                posts.append(self.createPost(row, category, user))

                # we reached the size of the batch file
                if len(posts) == self.conf.batchSize:
                    self._saveAndSendFile(posts)
                    # rebind instead of clearing: the saved list is not ours
                    # to mutate any more
                    posts = []

        # save the remaining posts
        if posts:
            self._saveAndSendFile(posts)

    def _saveAndSendFile(self, posts):
        """Write ``posts`` (plus the header row) to a CSV file and send it.

        posts -- list of post rows as produced by ``createPost``.
        """
        # NOTE(review): the name has one-second resolution, so two batches
        # saved within the same second would share a file name -- confirm
        # this cannot happen with the configured batch size / thread count.
        fname = 'posts_' + str(int(time.time())) + '.csv'

        print('%s: Saving file: %s ...%s posts' % (self.getName(), fname, len(posts)))

        self.csvProcessor.writerows(self.conf.processedPath + fname,
                                    self.getHeaders() + posts)

        # clear the users pool so the next batch generates a new one
        self.userProcessor.clearUsers()

        # send the posts file
        self.sendFile(fname)

    def isBanned(self, string):
        """Return True when ``string`` contains any banned term."""
        return self.pattern.search(string) is not None

    def mapCategory(self, category):
        """Return the first key of ``conf.categoryMapping`` whose mapping
        contains ``category``, or ``conf.defaultCategory`` when none does.
        """
        for key in self.conf.categoryMapping.keys():
            if category in self.conf.categoryMapping[key]:
                return key
        return self.conf.defaultCategory

    def getUser(self):
        """Return a user obtained from the user processor for
        ``conf.usersPerBatch``.
        """
        return self.userProcessor.getUser(self.conf.usersPerBatch)

    def createPost(self, rawPost, category, user):
        """Build an output row, in the column order of ``getHeaders``.

        Columns 0, 1, 3 and 4 of ``rawPost`` are copied; ``user`` and
        ``category`` fill the third and fourth positions.
        """
        return [rawPost[0], rawPost[1], user, category, rawPost[3], rawPost[4]]

    def getHeaders(self):
        """Return the header row, wrapped in a list so it can be prepended
        to a list of posts.
        """
        return [['Title', 'Body', 'Name', 'Category', 'Tags', 'Created']]

    def sendFile(self, file):
        """Upload a processed posts file to the import server.

        file -- name of the file, relative to ``conf.processedPath``.
        """
        # first we load the server
        server = xmlrpclib.ServerProxy(self.conf.importServerUrl)

        # Read the payload as raw bytes and close the handle right away.
        with open(self.conf.processedPath + file, 'rb') as handle:
            bdata = xmlrpclib.Binary(handle.read())

        print("%s: Sending file: %s ..." % (self.getName(), file))
        # now we import the posts into the server
        result = server.import_posts(self.conf.sourceName, file, bdata)

        print("%s: Imported %s posts ..." % (self.getName(), result['processed']))