class BaseIr():
    """Base class for ranking job documents against a resume.

    On construction every document of the job collection is tokenized and
    cached in ``self.jobs`` together with per-document statistics.

    ``matchResume`` relies on a ``calculateScores(resume)`` method that this
    class does not define; it has to be supplied elsewhere (presumably by
    subclasses -- confirm) and must leave a ``"score"`` entry on every item
    of ``self.jobs``.
    """

    def __init__(self, jobCollection):
        """Remember *jobCollection* and index its documents right away.

        Args:
            jobCollection: object whose ``find()`` yields dict-like job
                documents carrying a ``"summary"`` entry (presumably a
                MongoDB collection -- confirm against callers).
        """
        self.tfgetter = TfGetter()
        self.jobCollection = jobCollection
        self.processColl(self.jobCollection)

    def processColl(self, jobcoll):
        """Tokenize every document of *jobcoll* and cache the results.

        Afterwards the instance holds:
            self.jobs:      list of the documents, each extended with
                            ``'tf'`` (result of ``tfgetter.getTf``) and
                            ``'length'`` (number of tokens).
            self.doc_num:   number of documents seen.
            self.avgLength: mean ``'length'`` over all documents as a float,
                            or ``0.0`` when the collection is empty.

        Args:
            jobcoll: object whose ``find()`` yields the job documents.
        """
        self.jobs = []
        self.doc_num = 0
        sum_length = 0
        for item in jobcoll.find():
            content = irutils.processText(item["summary"])
            tokens = self.tfgetter.getTokens(content)
            item['tf'] = self.tfgetter.getTf(tokens)
            item['length'] = len(tokens)
            self.jobs.append(item)
            self.doc_num += 1
            sum_length += item['length']

        if self.doc_num:
            # Explicit float division: under Python 2 ``int / int`` would
            # silently truncate the average.
            self.avgLength = float(sum_length) / self.doc_num
        else:
            # An empty collection used to raise ZeroDivisionError here.
            self.avgLength = 0.0
        # Single-argument form prints the same text on Python 2 and 3.
        print("self.avgLength = %s" % self.avgLength)

    def matchResume(self, resume):
        """Score all cached jobs against *resume* and rank them.

        Calls ``self.calculateScores(resume)`` and then sorts ``self.jobs``
        in place by each item's ``"score"``, highest first.

        Returns:
            ``self.jobs`` itself (the same list object, now sorted).
        """
        self.calculateScores(resume)
        self.jobs.sort(key=lambda x: x["score"], reverse=True)
        return self.jobs
class TfIdfGetter():
    """Derives tf / idf based weights for the documents of a job collection.

    The numeric work is delegated to the module-level helpers ``getwtf``,
    ``dfAddTf``, ``getIdf`` and ``getWtfIdf``; this class only drives them
    over a collection and stores the results.
    """

    def __init__(self):
        """Create the ``TfGetter`` used for tokenizing and counting."""
        self.tfgetter = TfGetter()

    def getTf(self, content):
        """Tokenize *content* and return ``tfgetter.getTf`` of the tokens."""
        getter = self.tfgetter
        token_list = getter.getTokens(content)
        return getter.getTf(token_list)

    def saveJobTfIdf(self, jobcoll, idfColl):
        """Compute the weights for *jobcoll* and persist them.

        First pass: every document gets ``'tf'`` and ``'wtf'`` entries and is
        saved back, while the document-frequency table is accumulated.
        Then a summary record is saved into *idfColl*.
        Second pass: every document is re-read, gets ``'wtfidf'`` and
        ``'length'`` (the pair returned by ``getWtfIdf``) and is saved again.

        Args:
            jobcoll: collection offering ``find()``, ``save(doc)`` and a
                ``name`` attribute (presumably a MongoDB collection --
                confirm).
            idfColl: collection offering ``save(doc)`` that receives the
                summary record.

        Returns:
            The summary record: a dict with the keys ``'doc_num'``, ``'df'``,
            ``'idf'``, ``'coll_name'`` and ``'date'``.
        """
        doc_freq = {}
        total_docs = 0
        for job in jobcoll.find():
            term_freq = self.getTf(irutils.processText(job["summary"]))
            job['tf'] = term_freq
            job['wtf'] = getwtf(term_freq)
            jobcoll.save(job)
            dfAddTf(doc_freq, term_freq)
            total_docs += 1

        inverse_df = getIdf(doc_freq, total_docs)
        idf_record = {
            'doc_num': total_docs,
            'df': doc_freq,
            'idf': inverse_df,
            'coll_name': jobcoll.name,
            'date': datetime.datetime.now(),
        }
        idfColl.save(idf_record)

        # Second pass reads the documents back, so it sees the 'wtf' values
        # stored by the first pass.
        for job in jobcoll.find():
            job['wtfidf'], job['length'] = getWtfIdf(job['wtf'], inverse_df)
            jobcoll.save(job)
        return idf_record

    def getJobTfIdf(self, jobcoll):
        """Compute the weights for *jobcoll* in memory, without saving.

        Unlike ``saveJobTfIdf`` nothing is written back and no ``'tf'`` entry
        is stored on the documents.

        Args:
            jobcoll: collection offering ``find()``.

        Returns:
            A ``(idf, jobs)`` tuple: the result of ``getIdf`` and the list of
            documents, each extended with ``'wtf'``, ``'wtfidf'`` and
            ``'length'``.
        """
        jobs = []
        doc_freq = {}
        for job in jobcoll.find():
            term_freq = self.getTf(irutils.processText(job["summary"]))
            job['wtf'] = getwtf(term_freq)
            jobs.append(job)
            dfAddTf(doc_freq, term_freq)

        # One document is appended per iteration, so len(jobs) is the
        # document count.
        inverse_df = getIdf(doc_freq, len(jobs))
        for job in jobs:
            job['wtfidf'], job['length'] = getWtfIdf(job['wtf'], inverse_df)
        return inverse_df, jobs
def __init__(self):
    """Attach a freshly created ``TfGetter`` to the instance as ``tfgetter``."""
    getter = TfGetter()
    self.tfgetter = getter
def __init__(self, jobCollection):
    """Keep *jobCollection* on the instance and process it immediately.

    Args:
        jobCollection: the job collection, stored as ``self.jobCollection``
            and handed to ``self.processColl``.
    """
    self.jobCollection = jobCollection
    # The getter is created before processColl is invoked.
    self.tfgetter = TfGetter()
    self.processColl(jobCollection)