Ejemplo n.º 1
0
def testProcessPage():
    """Feed the "daily_dice_list_2014-07-11" collection to a DicePageGetter.

    Pages are requested from ``DbClient.getPage`` one at a time, starting
    at page 1 with 100 entries per page and without any filter or sort
    order. Each page is handed to ``DicePageGetter.processPage`` together
    with its page number. The walk ends after page 10000 or as soon as a
    page reports fewer entries than the page size.
    """
    client = DbClient('localhost', 27017, "jobaly_daily")
    listings = client.getCollection("daily_dice_list_2014-07-11")
    details = client.getCollection("daily_dice_info_2014-07-11")
    page_getter = DicePageGetter(details)

    per_page = 100
    last_page = 10000
    current = 1
    while current <= last_page:
        cursor = client.getPage(listings, None, None, per_page, current)
        page_getter.processPage(cursor, current)
        current += 1
        # The count is taken after processing, exactly as before; a page
        # that is not full ends the walk.
        if cursor.count(with_limit_and_skip=True) < per_page:
            break
Ejemplo n.º 2
0
def main():
    """Run IndeedPageGetter over the "job_lang_top_corps" collection.

    Paging starts at page 149 with 10 entries per page, unfiltered and
    unsorted. Every page obtained from ``DbClient.getPage`` is passed to
    ``IndeedPageGetter.processPage`` along with its page number. Paging
    stops once page 10000 has been handled or a page reports fewer
    entries than the page size.
    """
    client = DbClient('localhost', 27017, "jobaly")
    source = client.getCollection("job_lang_top_corps")
    target = client.getCollection("jobinfo_lang_top_corps")
    page_getter = IndeedPageGetter(target)

    batch = 10
    final_page = 10000
    page_index = 149

    keep_going = page_index <= final_page
    while keep_going:
        cursor = client.getPage(source, None, None, batch, page_index)
        page_getter.processPage(cursor, page_index)
        page_index += 1
        # Continue only while pages come back full and the cap is not hit.
        full_page = cursor.count(with_limit_and_skip=True) >= batch
        keep_going = full_page and page_index <= final_page
Ejemplo n.º 3
0
def testProcessPage():
    """Process the "daily_dice_list_2014-07-11" collection page by page.

    Uses a ``DbClient`` on localhost:27017 / "jobaly_daily" and a
    ``DicePageGetter`` built on the "daily_dice_info_2014-07-11"
    collection. Pages of 100 entries are fetched with no filter and no
    sort, numbered from 1 up to at most 10000; iteration ends early when
    a page reports fewer than 100 entries.
    """
    db = DbClient('localhost', 27017, "jobaly_daily")
    list_coll = db.getCollection("daily_dice_list_2014-07-11")
    info_coll = db.getCollection("daily_dice_info_2014-07-11")
    dice_getter = DicePageGetter(info_coll)

    entries_per_page = 100
    max_page = 10000
    for page_number in range(1, max_page + 1):
        batch = db.getPage(list_coll, None, None, entries_per_page,
                           page_number)
        dice_getter.processPage(batch, page_number)
        seen = batch.count(with_limit_and_skip=True)
        if seen < entries_per_page:
            # Partial page: nothing further to fetch.
            break
Ejemplo n.º 4
0
class DataHandler:
    """Data-access helper around the job, resume and job-model collections.

    Collections are attached by one of two setup paths:

    * ``setup_tfidfMatcher`` -- collection names are read from ``gConfig``
      and a ``TfIdfMatch`` matcher is built on the job collection.
    * ``connectJobColl`` -- an explicit database / collection name pair.

    The query methods assume one of these has been called first.
    """

    # Maps a ``searchjobs`` query type to the document field it matches on.
    _SEARCH_FIELDS = {"jid": "_id", "jobtitle": "jobtitle"}

    def __init__(self, dbclient=None):
        """Store an optional, already-constructed client.

        dbclient: client reused by ``setup_tfidfMatcher``; when ``None`` a
            default local client is created there instead.
        """
        self.dbclient = dbclient

    def setup_tfidfMatcher(self):
        """Attach the collections named in ``gConfig`` and build the matcher."""
        # Fixed: the original else-branch read the bare name ``dbclient``,
        # which does not exist in this scope, so passing a client to the
        # constructor always ended in a NameError here.
        if self.dbclient is None:
            self.dbClient = DbClient('localhost', 27017, "jobaly")
        else:
            self.dbClient = self.dbclient

        self.resumeCollection = self.dbClient.getCollection(gConfig["webResumeColName"])
        self.jobCollection = self.dbClient.getCollection(gConfig["webJobInfoCollName"])
        # NOTE(review): get_model / get_jobmodel_ids read
        # ``self.modelCollection``, which only connectJobColl assigns; this
        # path stores the model collection as ``jobModelCollection``.
        # Confirm whether the two are meant to be the same attribute.
        self.jobModelCollection = self.dbClient.getCollection(gConfig["jobModelCollName"])
        self.matcher = TfIdfMatch(self.jobCollection)

    def save_resume(self, resume_text):
        """Insert ``resume_text`` as a new resume document and print its id."""
        resume = {"content": resume_text, "date": datetime.datetime.utcnow()}
        resume_id = self.resumeCollection.insert(resume)
        # Single pre-formatted string: prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("add resume id is: %s" % (resume_id,))

    def get_resumes(self):
        """Return the result of an unfiltered ``find()`` on the resumes."""
        return self.resumeCollection.find()

    def get_resume(self, _id):
        """Return the resume whose ``_id`` is ``ObjectId(_id)``."""
        return self.resumeCollection.find_one({'_id': ObjectId(_id)})

    def get_jobs(self, page_no=1, page_size=20):
        """Return one page of jobs (defaults: first page, 20 per page)."""
        return self.getJobsByPage(page_size, page_no)

    def _find_by_id(self, collection, _id):
        """Return the first document in ``collection`` with this ``_id``, or None."""
        matches = list(collection.find({'_id': _id}))
        return matches[0] if matches else None

    def get_job(self, _id):
        """Return the job document with the given ``_id``, or None."""
        return self._find_by_id(self.jobCollection, _id)

    def get_job_ids(self, ids):
        """Return a list of job documents whose ``_id`` is in ``ids``."""
        return list(self.jobCollection.find({"_id": {"$in": ids}}))

    def get_jobmodel_ids(self, ids):
        """Return a list of model documents whose ``_id`` is in ``ids``."""
        return list(self.modelCollection.find({"_id": {"$in": ids}}))

    def get_model(self, _id):
        """Return the model document with the given ``_id``, or None."""
        return self._find_by_id(self.modelCollection, _id)

    def matchResume(self, resume):
        """Delegate to the matcher built by ``setup_tfidfMatcher``."""
        return self.matcher.matchResume(resume)

    def connectJobColl(self, dbName, collName):
        """Connect to ``dbName`` on localhost and attach ``collName``.

        Also records the collection size and attaches the companion
        collection named ``collName + "_model"`` as ``modelCollection``.
        """
        self.dbname = dbName
        self.collname = collName
        self.dbClient = DbClient('localhost', 27017, dbName)
        self.jobCollection = self.dbClient.getCollection(collName)
        self.collSize = self.dbClient.getCollectionSize(collName)
        self.modelCollection = self.dbClient.getCollection(collName + "_model")

    def getJobsByPage(self, page_size, page_no):
        """Return page ``page_no`` of the jobs, ``page_size`` per page.

        No filter and no sort order are passed to ``dbClient.getPage``.
        """
        find_sort = None
        find_spec = None
        return self.dbClient.getPage(self.jobCollection, find_spec, find_sort,
                                     page_size, page_no)

    def searchjobs(self, query, qtype):
        """Search the job collection for an exact match on one field.

        query: value to match.
        qtype: ``"jid"`` to match on ``_id`` or ``"jobtitle"`` to match on
            ``jobtitle``.

        Returns a ``(result, pageno, resultnum)`` tuple where ``result`` is
        the list of matching documents, ``pageno`` is always 1 and
        ``resultnum`` is ``len(result)``.

        Raises ValueError for any other ``qtype``. The original fell through
        both branches and failed with an UnboundLocalError on ``result``.
        """
        if qtype not in self._SEARCH_FIELDS:
            raise ValueError("unsupported qtype: %r" % (qtype,))

        field = self._SEARCH_FIELDS[qtype]
        result = list(self.jobCollection.find({field: query}))
        pageno = 1
        resultnum = len(result)
        return (result, pageno, resultnum)