def calculateScores(self, resume):
    """Score every job in ``self.jobs`` against *resume* using Okapi BM25.

    The resume text is normalised with ``irutils.processText`` and tokenised
    with ``self.tfgetter.getTokens``.  For each resume token an IDF value
    ``log10((N - n + 0.5) / (n + 0.5))`` is computed, where ``N`` is
    ``self.doc_num`` and ``n`` is the number of jobs whose ``"tf"`` mapping
    contains the token.  Each job then accumulates
    ``idf * tf * (K1 + 1) / (tf + K1 * (1 - B + B * length / avgLength))``
    over the resume tokens it contains.  A token that occurs several times
    in the resume contributes once per occurrence.

    Reads ``self.jobs`` (dicts with ``"tf"`` and ``"length"``),
    ``self.doc_num``, ``self.avgLength`` and the module-level constants
    ``K1`` and ``B``.

    :param resume: raw resume text.
    :returns: ``None``; the result is stored in ``job["score"]`` for each job.
    """
    resume_content = irutils.processText(resume)
    resume_tokens = self.tfgetter.getTokens(resume_content)

    # The IDF depends only on the token, so compute it once per distinct
    # token instead of once per occurrence in the resume.
    idfdict = {}
    for token in set(resume_tokens):
        tokenNum = sum(1 for job in self.jobs if token in job["tf"])
        idfdict[token] = math.log10(
            (self.doc_num - tokenNum + 0.5) / (tokenNum + 0.5))

    for job in self.jobs:
        tf = job["tf"]
        score = 0
        for token in resume_tokens:
            # `in` replaces dict.has_key, which no longer exists in Python 3.
            if token in tf:
                # float() keeps the length ratio fractional even when both
                # operands are ints under Python 2's truncating division.
                length_ratio = float(job["length"]) / self.avgLength
                n1 = tf[token] * (K1 + 1)
                n2 = tf[token] + K1 * (1 - B + B * length_ratio)
                score += idfdict[token] * (n1 / n2)
        job["score"] = score
def saveJobTfIdf(self, jobcoll, idfColl):
    """Compute term statistics for every job and persist them.

    Pass 1 walks ``jobcoll.find()``: each item's ``"summary"`` is run through
    ``irutils.processText`` and ``self.getTf``; the result is stored under
    ``'tf'``, ``getwtf(tf)`` under ``'wtf'``, and the item is saved back.
    ``dfAddTf`` folds each ``tf`` into a shared document-frequency dict.

    A summary record (``doc_num``, ``df``, ``idf``, ``coll_name``, ``date``)
    is then saved to ``idfColl``.

    Pass 2 walks the collection again, storing the two values returned by
    ``getWtfIdf(wtf, idf)`` under ``'wtfidf'`` and ``'length'`` and saving
    each item.

    ``getwtf``, ``dfAddTf``, ``getIdf`` and ``getWtfIdf`` are defined outside
    this block; their exact semantics are not visible here.

    :param jobcoll: collection exposing ``find()``, ``save(item)`` and ``name``.
    :param idfColl: collection exposing ``save(item)``.
    :returns: the summary dict that was saved to ``idfColl``.
    """
    doc_freq = {}
    total_docs = 0

    # Pass 1: per-document term frequencies plus the running document frequency.
    for job in jobcoll.find():
        text = irutils.processText(job["summary"])
        term_freq = self.getTf(text)
        job['tf'] = term_freq
        job['wtf'] = getwtf(term_freq)
        jobcoll.save(job)
        dfAddTf(doc_freq, term_freq)
        total_docs += 1

    idf = getIdf(doc_freq, total_docs)
    summary = {
        'doc_num': total_docs,
        'df': doc_freq,
        'idf': idf,
        'coll_name': jobcoll.name,
        'date': datetime.datetime.now(),
    }
    # print idf
    idfColl.save(summary)

    # Pass 2: needs the collection-wide idf, hence the second walk.
    for job in jobcoll.find():
        weights, length = getWtfIdf(job['wtf'], idf)
        job['wtfidf'] = weights
        job['length'] = length
        jobcoll.save(job)

    return summary
def processColl(self, jobcoll):
    """Load all jobs from *jobcoll* and cache their term statistics on ``self``.

    For every item returned by ``jobcoll.find()`` the ``"summary"`` text is
    normalised with ``irutils.processText`` and tokenised with
    ``self.tfgetter.getTokens``.  The item gets ``'tf'`` (from
    ``self.tfgetter.getTf``) and ``'length'`` (token count) and is appended
    to ``self.jobs``.

    Sets ``self.jobs``, ``self.doc_num`` (number of items) and
    ``self.avgLength`` (mean token count, as a float), then prints the
    average to stdout.

    :param jobcoll: collection exposing ``find()``.
    :returns: ``None``.
    """
    self.jobs = []
    self.doc_num = 0
    sum_length = 0
    for item in jobcoll.find():
        content = irutils.processText(item["summary"])
        tokens = self.tfgetter.getTokens(content)
        item['tf'] = self.tfgetter.getTf(tokens)
        item['length'] = len(tokens)
        self.jobs.append(item)
        self.doc_num += 1
        sum_length += item['length']

    if self.doc_num:
        # float() avoids Python 2's truncating int/int division, which would
        # round the average down and skew any length normalisation using it.
        self.avgLength = float(sum_length) / self.doc_num
    else:
        # Empty collection: no average exists; avoid ZeroDivisionError.
        self.avgLength = 0.0

    # Single-argument call form behaves the same under Python 2 and 3.
    print("self.avgLength = %s" % self.avgLength)
def getJobTfIdf(self, jobcoll):
    """Build in-memory term weights for every job in *jobcoll*.

    Each item from ``jobcoll.find()`` has its ``"summary"`` processed by
    ``irutils.processText`` and ``self.getTf``; ``getwtf(tf)`` is stored under
    ``'wtf'`` and ``dfAddTf`` folds the ``tf`` into a shared dict.  After all
    items are read, ``getIdf`` is called with that dict and the item count,
    and every item receives ``'wtfidf'`` and ``'length'`` from
    ``getWtfIdf(wtf, idf)``.  Nothing is written back to the collection.

    ``getwtf``, ``dfAddTf``, ``getIdf`` and ``getWtfIdf`` are defined outside
    this block; their exact semantics are not visible here.

    :param jobcoll: collection exposing ``find()``.
    :returns: tuple ``(idf, jobs)`` where ``jobs`` is the list of annotated items.
    """
    doc_freq = {}
    collected = []

    for job in jobcoll.find():
        term_freq = self.getTf(irutils.processText(job["summary"]))
        job['wtf'] = getwtf(term_freq)
        collected.append(job)
        dfAddTf(doc_freq, term_freq)

    # One entry was appended per document, so the list length is the doc count.
    idf = getIdf(doc_freq, len(collected))

    for job in collected:
        job['wtfidf'], job['length'] = getWtfIdf(job['wtf'], idf)

    return idf, collected
def calculateScores(self, resume):
    """Score every job in ``self.jobs`` by a relative-frequency log ratio.

    The resume is normalised with ``irutils.processText`` and tokenised with
    ``self.tfgetter.getTokens``.  Each resume term gets the relative
    frequency ``q = count / resume_len``.  For every job, each resume term
    that also occurs in the job's ``"tf"`` mapping contributes
    ``p * ln(p / q)``, where ``p = tf[term] / job["length"]``
    (a KL-divergence-style sum restricted to shared terms).

    :param resume: raw resume text.
    :returns: ``None``; the result is stored in ``job["score"]`` for each job.
    """
    resume_content = irutils.processText(resume)
    resume_tokens = self.tfgetter.getTokens(resume_content)
    resumetf = self.tfgetter.getTf(resume_tokens)
    resume_len = len(resume_tokens)

    # Relative frequency of each term within the resume.
    resume_pq = {}
    for key in resumetf:
        resume_pq[key] = float(resumetf[key]) / resume_len

    for job in self.jobs:
        tf = job["tf"]
        job_len = job["length"]
        score = 0
        for key in resumetf:
            # `in` replaces dict.has_key, which no longer exists in Python 3.
            if key in tf:
                job_p = float(tf[key]) / job_len
                score += job_p * math.log(job_p / resume_pq[key])
        job["score"] = score
def calculateScores(self, resume):
    """Score every job in ``self.jobs`` by a relative-frequency log ratio.

    The resume is normalised with ``irutils.processText`` and tokenised with
    ``self.tfgetter.getTokens``.  Each resume term gets the relative
    frequency ``q = count / resume_len``.  For every job, each resume term
    that also occurs in the job's ``"tf"`` mapping contributes
    ``p * ln(p / q)``, where ``p = tf[term] / job["length"]``
    (a KL-divergence-style sum restricted to shared terms).

    :param resume: raw resume text.
    :returns: ``None``; the result is stored in ``job["score"]`` for each job.
    """
    resume_content = irutils.processText(resume)
    resume_tokens = self.tfgetter.getTokens(resume_content)
    resumetf = self.tfgetter.getTf(resume_tokens)
    resume_len = len(resume_tokens)

    # Relative frequency of each term within the resume.
    resume_pq = {}
    for key in resumetf:
        resume_pq[key] = float(resumetf[key]) / resume_len

    for job in self.jobs:
        tf = job["tf"]
        job_len = job["length"]
        score = 0
        for key in resumetf:
            # `in` replaces dict.has_key, which no longer exists in Python 3.
            if key in tf:
                job_p = float(tf[key]) / job_len
                score += job_p * math.log(job_p / resume_pq[key])
        job["score"] = score
def calculateScores(self, resume):
    """Score every job in ``self.jobs`` against *resume* using Okapi BM25.

    The resume text is normalised with ``irutils.processText`` and tokenised
    with ``self.tfgetter.getTokens``.  For each resume token an IDF value
    ``log10((N - n + 0.5) / (n + 0.5))`` is computed, where ``N`` is
    ``self.doc_num`` and ``n`` is the number of jobs whose ``"tf"`` mapping
    contains the token.  Each job then accumulates
    ``idf * tf * (K1 + 1) / (tf + K1 * (1 - B + B * length / avgLength))``
    over the resume tokens it contains.  A token that occurs several times
    in the resume contributes once per occurrence.

    Reads ``self.jobs`` (dicts with ``"tf"`` and ``"length"``),
    ``self.doc_num``, ``self.avgLength`` and the module-level constants
    ``K1`` and ``B``.

    :param resume: raw resume text.
    :returns: ``None``; the result is stored in ``job["score"]`` for each job.
    """
    resume_content = irutils.processText(resume)
    resume_tokens = self.tfgetter.getTokens(resume_content)

    # The IDF depends only on the token, so compute it once per distinct
    # token instead of once per occurrence in the resume.
    idfdict = {}
    for token in set(resume_tokens):
        tokenNum = sum(1 for job in self.jobs if token in job["tf"])
        idfdict[token] = math.log10(
            (self.doc_num - tokenNum + 0.5) / (tokenNum + 0.5))

    for job in self.jobs:
        tf = job["tf"]
        score = 0
        for token in resume_tokens:
            # `in` replaces dict.has_key, which no longer exists in Python 3.
            if token in tf:
                # float() keeps the length ratio fractional even when both
                # operands are ints under Python 2's truncating division.
                length_ratio = float(job["length"]) / self.avgLength
                n1 = tf[token] * (K1 + 1)
                n2 = tf[token] + K1 * (1 - B + B * length_ratio)
                score += idfdict[token] * (n1 / n2)
        job["score"] = score
def getResumeWight(self, resume):
    """Return the weight mapping of *resume* against the stored job IDF.

    The text is normalised with ``irutils.processText``, converted with
    ``self.tfIdfGetter.getTf`` and ``getwtf``, and passed with
    ``self.jobs_idf`` to ``getQueryWtfIdf``.  That helper returns a pair;
    only the first element is handed back, the second is discarded.

    ``getwtf`` and ``getQueryWtfIdf`` are defined outside this block; their
    exact semantics are not visible here.

    :param resume: raw resume text.
    :returns: first element of the ``getQueryWtfIdf`` result.
    """
    text = irutils.processText(resume)
    term_freq = self.tfIdfGetter.getTf(text)
    weights, _length = getQueryWtfIdf(getwtf(term_freq), self.jobs_idf)
    return weights