def main():
    days = 4
    today = datetime.date.today()
    today = "2014-06-15_p4"  # override today's date for this run
    listCollectionName = "daily_job_list_" + str(today)
    print "list collection name:", listCollectionName
    infoCollectionName = "daily_job_info_" + str(today)
    print "info collection name:", infoCollectionName

    # lang_names = jobaly.utils.loadArrayFromFile("test_lang_list.txt")
    # cities = jobaly.utils.loadArrayFromFile("test_loc_list.txt")
    lang_names = jobaly.utils.loadArrayFromFile("lang_list.txt")
    cities = jobaly.utils.loadArrayFromFile("loc_list.txt")

    dbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    listCollection = dbClient.getCollection(listCollectionName)

    start_time = time.time()
    print "---- start get job list ----"
    # getJobList(listCollectionName)
    crawlIndeed(listCollection, lang_names, cities, days)
    t = time.time() - start_time
    print "---- finish get job list, use %s seconds ----" % t
    print
    print

    infoCollection = dbClient.getCollection(infoCollectionName)
    start_time = time.time()
    print "---- start get job info ----"
    getJobInfo(dbClient, listCollection, infoCollection)
    t = time.time() - start_time
    print "---- finish get job info, use %s seconds ----" % t
def mergeDailyJob(date):
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily")
    targetBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    listCol = srcBbClient.getCollection("daily_job_list_" + date)
    infoCol = srcBbClient.getCollection("daily_job_info_" + date)
    newCol = targetBbClient.getCollection("daily_job_" + date)
    mergeJob(listCol, infoCol, newCol)
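# A minimal sketch of what a merge step like mergeJob could look like, assuming
# the list and info documents share the same _id and the merged document should
# carry the list fields overlaid with the info fields. The real mergeJob used
# above may differ.
def mergeJobSketch(listCol, infoCol, newCol):
    for listDoc in listCol.find():
        infoDoc = infoCol.find_one({"_id": listDoc["_id"]})
        if infoDoc is None:
            continue  # skip listings whose detail page was never fetched
        merged = dict(listDoc)
        merged.update(infoDoc)  # info fields win on conflicts
        newCol.save(merged)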
def main():
    # print gConfig
    dbClient = DbClient('localhost', 27017, "jobaly")
    jobCollection = dbClient.getCollection(gConfig["webJobInfoCollName"])
    jobIdfCollection = dbClient.getCollection(gConfig["JobIdfCollName"])
    tfIdfGetter = TfIdfGetter()
    # tfIdfGetter.saveJobTfIdf(jobCollection, jobIdfCollection)
    idf, jobs = tfIdfGetter.getJobTfIdf(jobCollection)
    print idf
def filterWebDeveloper_indeed():
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    targetBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    srcCol = srcBbClient.getCollection("daily_job_2014-06-05")
    newCol = targetBbClient.getCollection("daily_job_webdev")
    for job in srcCol.find():
        jobtitle = job["jobtitle"].lower()
        if (jobtitle.find("web") != -1) and \
           (jobtitle.find("developer") != -1):
            print jobtitle.encode("GBK", "ignore")
            newCol.insert(job)
def testTermsMatching():
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    newCol = srcBbClient.getCollection("daily_job_webdev")
    newCol = srcBbClient.getCollection("daily_job_info_2014-06-16")
    newCol = srcBbClient.getCollection("daily_job_info_2014-06-10")  # last assignment wins
    print "newCol=", newCol
    allSents = getAllSentsInColl(newCol)
    terms = ["degree", "B.S.", "M.S.", "BS", "MS",
             "bachelor", "master", "phd", "master's"]
    matchingSents = termsMatching(allSents, terms)
    # dumpTwo(matchingSents, "sents\\degree_raw", (lambda x: x[0] + ":" + x[1]))
    dumpTwo(matchingSents, "sents\\degree_0610", (lambda x: x[0] + ":" + x[1]))
def processResumes():
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    resumeCollName = "web_resumes"
    resumemodelCollName = resumeCollName + "_model"
    resumeColl = srcBbClient.getCollection(resumeCollName)
    modelColl = srcBbClient.getCollection(resumemodelCollName)
    # newCol = srcBbClient.getCollection("daily_job_info_2014-06-16")
    # for resume in collection.find():
    resume = resumeColl.find_one()
    resumeModel = parseResume(resume)
    # modelColl.save(resumeModel.serialize())
    saveResumeModels(resumeColl, modelColl)
def test_match():
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    jobCollName = "daily_job_webdev"
    jobmodelCollName = jobCollName + "_model"
    collection = srcBbClient.getCollection(jobCollName)
    modelColl = srcBbClient.getCollection(jobmodelCollName)
    # newCol = srcBbClient.getCollection("daily_job_info_2014-06-16")
    similarity = ModelSimilarity()
    # resumeModel1 is expected to be defined at module level
    result = similarity.match_jobs(resumeModel1, modelColl)
    i = 0
    for key, value in result:
        i += 1
        print i, key, value
def getDisMatrixFromColletion():
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    collection = srcBbClient.getCollection("daily_job_webdev")
    f = open('sents.txt', 'w')  # python will convert \n to os.linesep
    docs = []
    for job in collection.find():
        # print "\n\n\n======", job["_id"], "============================\n"
        # f.write(job["summary"].encode("GBK", "ignore") + "\n")
        jobDesc = JobDescParser.parseJobDesc(job)
        sents = jobDesc.listAllSentences()
        doc = []
        for sent in sents:
            # print sent.encode("GBK", "ignore")
            f.write(sent.encode("GBK", "ignore") + "\n")
            tokens = [token.lower() for token in word_tokenize(sent)]
            for token in tokens:
                if token == 'c':
                    # print token
                    pass
            doc.extend(tokens)
        docs.append(doc)
    f.close()

    terms = ["javascript", "jquery", "html", "css", "java",
             "python", "ruby", "mysql", "jdbc", "cpp"]
    # terms = ["javascript", "jquery", "html", "css", "java", "jsp", "python", "ruby", "ror"]
    # terms = ["java", "jdbc", "spring", "hibernate", "mysql", "oracle"]
    matrix = getDistanceMatrix(docs, terms)
    printDisMatrix(terms, matrix)
    matrix_dump = json.dumps(matrix)
    print matrix_dump
def testProcessPage():
    listCollectionName = "daily_dice_list_2014-07-11"
    infoCollectionName = "daily_dice_info_2014-07-11"
    dbClient = DbClient('localhost', 27017, "jobaly_daily")
    listCollection = dbClient.getCollection(listCollectionName)
    infoCollection = dbClient.getCollection(infoCollectionName)
    getter = DicePageGetter(infoCollection)

    pageSize = 100
    pageNo = 1
    has_more = True
    pageNum = 10000
    find_sort = None
    find_spec = None
    while has_more and pageNo <= pageNum:
        page = dbClient.getPage(listCollection, find_spec, find_sort, pageSize, pageNo)
        getter.processPage(page, pageNo)
        pageNo += 1
        count = page.count(with_limit_and_skip=True)
        # print "count=", count
        if count < pageSize:
            has_more = False
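# A minimal sketch of how a paging helper like the getPage call above might be
# built on pymongo's skip/limit, assuming 1-based page numbers; the actual
# DbClient.getPage implementation may differ.
def getPageSketch(collection, find_spec, find_sort, pageSize, pageNo):
    cursor = collection.find(find_spec)
    if find_sort is not None:
        cursor = cursor.sort(find_sort)
    # return one page of results as a cursor; callers can count it with
    # count(with_limit_and_skip=True) to detect the last page
    return cursor.skip((pageNo - 1) * pageSize).limit(pageSize)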
def aggregateHtmlTag():
    listCollectionName = "daily_dice_info_2014-07-11"
    # listCollectionName = "daily_job_info_2014-07-08"
    dbClient = DbClient('localhost', 27017, "jobaly_daily")
    collection = dbClient.getCollection(listCollectionName)
    dataProcessor = JobDataProcessor(collection)
    dataProcessor.aggregateHtmlTags()
def testGetSentenceByTerm(term):
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    collection = srcBbClient.getCollection("daily_job_webdev")
    outputPath = '..\skill\output\\' + term
    getSentenceByTerm(collection, term, outputPath)
def getSentsByOntology():
    owlfile = "..\..\jobaly\ontology\web_dev.owl"
    ontology = OntologyLib(owlfile)
    terms = [" " + x.lower() + " " for x in ontology.getLabelList()]
    terms.extend([" " + x.lower() + " " for x in ontology.getAllClassNames()])

    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    newCol = srcBbClient.getCollection("daily_job_webdev")
    collection = newCol
    matchingSents = []
    for job in collection.find():
        # print "\n\n\n======", job["_id"], "============================\n"
        jobDesc = JobDescParser.parseJobDesc(job)
        sents = jobDesc.listAllSentences()
        jid = job["_id"]
        for sent in sents:
            c = 0
            sent = " " + sent.lower() + " "
            for term in terms:
                if sent.find(term) != -1:
                    c += 1
                    if c == 3:  # keep sentences containing at least three ontology terms
                        print sent.encode("GBK", "ignore")
                        matchingSents.append((jid, sent))
                        break
    sortedsents = sorted(matchingSents, key=lambda x: len(x[1]))
    dumpTwo(sortedsents, "term3", (lambda x: x[0] + ":" + x[1]))
def main():
    path = "..\\..\\..\\..\\data\\resumes\\web\\"
    # scandir(path)
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    resumeCollName = "web_resumes"
    collection = srcBbClient.getCollection(resumeCollName)
    saveResumes(path, collection)
def aggregateTitle():
    listCollectionName = "daily_job_list_2014-06-10"
    listCollectionName = "daily_dice_info_2014-07-11"
    dbClient = DbClient('localhost', 27017, "jobaly_daily")
    collection = dbClient.getCollection(listCollectionName)
    dataProcessor = JobDataProcessor(collection)
    dataProcessor.aggregateTitleToFile("titles//dice_titleList.json")
    dataProcessor.aggregateTitleToFile("titles//dice_titleList.txt", "text")
def filterWebDeveloper_dice():
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily")
    targetBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    srcCol = srcBbClient.getCollection("daily_dice_info_2014-07-11")
    newCol = targetBbClient.getCollection("daily_job_webdev")
    i = 0
    for job in srcCol.find():
        jobtitle = job["jobtitle"].lower()
        if (jobtitle.find("web") != -1) and \
           (jobtitle.find("developer") != -1):
            job["url"] = job["detailUrl"]
            job["detailUrl"] = None
            newCol.insert(job)
            i += 1
            print i, ":", jobtitle.encode("GBK", "ignore")
            if i == 150:
                break
def main(): dbClient = DbClient('localhost', 27017, "jobaly") jobCollection = dbClient.getCollection(gConfig["webJobInfoCollName"]) tfIdfMatch = TfIdfMatch(jobCollection) resume = "I a am good java programmer, PHP, XML, hope juse c++, skill" jobs = tfIdfMatch.matchResume(resume) for job in jobs: print job["_id"], job["score"]
def processJobColl():
    srcDb = "jobaly_daily"
    srcCollnames = "daily_job_info_2014-06-16"
    srcDb = "jobaly_daily_test"
    srcCollnames = "daily_job_webdev"
    srcClient = DbClient('localhost', 27017, srcDb)
    srcCollnames = "daily_job_info_2014-06-16"
    srcColl = srcClient.getCollection(srcCollnames)

    targetDb = "jobaly"
    targetCollName = "job100"
    targetClient = DbClient('localhost', 27017, targetDb)
    targetColl = targetClient.getCollection(targetCollName)
    size = 15
    # copyColl(srcColl, targetColl, size)
    processjobs(targetDb, targetCollName)
def testProcessQuery():
    today = datetime.date.today()
    listCollectionName = "daily_dice_list_" + str(today)
    listCollectionName = "daily_dice_list_" + "test"
    print listCollectionName
    dbClient = DbClient('localhost', 27017, "jobaly_daily")
    collection = dbClient.getCollection(listCollectionName)
    diceClient = DiceApiClient()
    diceClient.setState("TN")
    print diceClient.processQuery(collection)
def testTermMatching():
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    newCol = srcBbClient.getCollection("daily_job_webdev")
    allSents = getAllSentsInColl(newCol)
    term = "experience"
    term = "knowledge"
    term = "skills"
    term = "degree"  # last assignment wins
    matchingSents = termMatching(allSents, term)
    dumpTwo(matchingSents, "sents\\matching_" + term, (lambda x: x[0] + ":" + x[1]))
def processjobs(dbname, collname):
    srcBbClient = DbClient('localhost', 27017, dbname)
    jobCollName = collname
    jobmodelCollName = jobCollName + "_model"
    collection = srcBbClient.getCollection(jobCollName)
    modelColl = srcBbClient.getCollection(jobmodelCollName)
    # newCol = srcBbClient.getCollection("daily_job_info_2014-06-16")
    removeHtml(collection)
    copyTitle(collection)
    for job in collection.find():
        sents = preprocess(job)
        jobModel = JobModel(job["_id"])
        processSents(jobModel, sents)
        titleModel = processTitle(job)
        jobModel.titleModel = titleModel
        jobclassifier.classifyJob(jobModel)
        modelColl.save(jobModel.serialize())
def testParseAll():
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    newCol = srcBbClient.getCollection("daily_job_webdev")
    # job = DbClient.findById(newCol, jid)  # jid is undefined here; single-job lookup disabled
    # paragraph = JobParser.parseParagraph(job)
    for job in newCol.find():
        print "\n\n\n======", job["_id"], "============================\n"
        jobDesc = JobDescParser.parseJobDesc(job)
def getJobList_sync(listCollectionName):
    print " --- get daily job by language and top cities---"
    lang_names = jobaly.utils.loadArrayFromFile("lang_list.txt")
    cities = jobaly.utils.loadArrayFromFile("loc_list.txt")
    # lang_names = jobaly.utils.loadArrayFromFile("test_lang_list.txt")
    # cities = jobaly.utils.loadArrayFromFile("test_loc_list.txt")
    dbClient = DbClient('localhost', 27017, "jobaly_daily")
    collection = dbClient.getCollection(listCollectionName)
    crawlIndeed(collection, lang_names, cities)
def main():
    targetBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    targetColl = targetBbClient.getCollection("test_coll")
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily")
    srcColl = srcBbClient.getCollection("daily_job_info_2014-06-04")
    # collutils.copyCollection(srcColl, targetColl)

    srcCollNames = ["daily_job_info_2014-06-04",
                    "daily_job_info_2014-06-05",
                    "daily_job_info_2014-06-06",
                    "daily_job_info_2014-06-08",
                    "daily_job_info_2014-06-10"]
    # collutils.copyCollections(targetBbClient, "job_info_merge", srcBbClient, srcCollNames)

    srcCollNames = ["daily_job_list_2014-06-04",
                    "daily_job_list_2014-06-05",
                    "daily_job_list_2014-06-06",
                    "daily_job_list_2014-06-08",
                    "daily_job_list_2014-06-10"]
    collutils.copyCollections(targetBbClient, "job_list_merge", srcBbClient, srcCollNames)
def loadJobs(dbname, collName, ids):
    dbClient = DbClient('localhost', 27017, dbname)
    jobCollection = dbClient.getCollection(collName)
    jobs = []
    for jobid in ids:
        result = list(jobCollection.find({'_id': jobid}))
        if len(result) > 0:
            job = result[0]
            # print type(job)
            # print job
            print job["_id"], job["location"]
            jobs.append(job)
    return jobs
def testGetJobInfo():
    dbClient = DbClient('localhost', 27017, "jobaly_daily")
    today = datetime.date.today()
    listCollectionName = "daily_dice_list_" + str(today)
    infoCollectionName = "daily_dice_info_" + str(today)
    listCollectionName = "daily_dice_list_2014-07-11"
    infoCollectionName = "daily_dice_info_2014-07-11"
    print listCollectionName
    print infoCollectionName
    listCollection = dbClient.getCollection(listCollectionName)
    infoCollection = dbClient.getCollection(infoCollectionName)
    getJobInfo(dbClient, listCollection, infoCollection)
def testParseParagraph():
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    newCol = srcBbClient.getCollection("daily_job_webdev")
    jid = "9e216b2d65bd864b"
    jid = "matrixga/78237-51"
    jid = "cybercod/CN-.NETwebDev-CA3"
    jid = "f3c336fa35c28771"
    jid = "10116717/638726"
    jid = "ocs/54391"
    jid = "0e230c368a34322b"
    jid = "6718adb8b28b9b39"
    job = DbClient.findById(newCol, jid)
    jobDesc = JobDescParser.parseJobDesc(job)
    jobDesc.printParagraphs()
def main():
    # srcJobInfoCollName: jobinfo_lang_top_corps
    # webJobInfoCollName: test_jobinfo
    # webResumeColName: test_resume
    # JobIdfCollName: job_idf
    # print gConfig
    dbClient = DbClient('localhost', 27017, "jobaly")
    jobCollection = dbClient.getCollection("test_jobinfo")
    # jobIdfCollection = dbClient.getCollection(gConfig["JobIdfCollName"])
    tfIdfGetter = TfIdfGetter()
    # tfIdfGetter.saveJobTfIdf(jobCollection, jobIdfCollection)
    idf, jobs = tfIdfGetter.getJobTfIdf(jobCollection)
    print idf
def main():
    # webJobInfoCollName: test_jobinfo
    resume = loadResume("..\\..\\..\\data\\test_resumes\\Darin-Densley_web.txt")
    # resume = loadResume("..\\..\\..\\data\\test_resumes\\Java-Developer.txt")
    # resume = loadResume("..\\..\\..\\data\\test_resumes\\Fong-Kuo_data.txt")
    # print resume
    # resume = "I a am good java programmer, PHP, XML, hope juse c++, skill"
    dbClient = DbClient('localhost', 27017, "jobaly")
    jobCollection = dbClient.getCollection("job100")
    kl = KL(jobCollection)
    jobs = kl.matchResume(resume)
    for job in jobs:
        print job["_id"], job["score"]
def main():
    collectionName = "job_lang_top_corps"
    infoCollectionName = "jobinfo_lang_top_corps"
    dbClient = DbClient('localhost', 27017, "jobaly")
    collection = dbClient.getCollection(collectionName)
    infoCollection = dbClient.getCollection(infoCollectionName)
    getter = IndeedPageGetter(infoCollection)

    pageSize = 10
    pageNo = 149
    has_more = True
    pageNum = 10000
    find_sort = None
    find_spec = None
    while has_more and pageNo <= pageNum:
        page = dbClient.getPage(collection, find_spec, find_sort, pageSize, pageNo)
        getter.processPage(page, pageNo)
        pageNo += 1
        count = page.count(with_limit_and_skip=True)
        # print "count=", count
        if count < pageSize:
            has_more = False
def getJobList(listCollectionName):
    print " --- get daily job by language and top cities---"
    # lang_names = jobaly.utils.loadArrayFromFile("lang_list.txt")
    states = jobaly.utils.loadArrayFromFile("state_list.txt")
    diceClient = DiceApiClient({"age": "1"})
    dbClient = DbClient('localhost', 27017, "jobaly_daily")
    collection = dbClient.getCollection(listCollectionName)
    for state in states:
        diceClient.setState(state)
        print "----- process location %s -------" % (state)
        diceClient.processQuery(collection)
def main():
    # webJobInfoCollName: test_jobinfo
    resumepath = ""
    resume = loadResume("..\\..\\..\\data\\test_resumes\\Darin-Densley_web.txt")
    resume = loadResume("..\\..\\..\\data\\test_resumes\\Java-Developer.txt")
    resume = loadResume("..\\..\\..\\data\\test_resumes\\Fong-Kuo_data.txt")
    # print resume
    dbClient = DbClient('localhost', 27017, "jobaly")
    jobCollection = dbClient.getCollection("job100")
    tfIdfMatch = TfIdfMatch(jobCollection)
    jobs = tfIdfMatch.matchResume(resume)
    for job in jobs:
        print job["_id"], job["score"]
def processTitles(dbname, collname):
    srcBbClient = DbClient('localhost', 27017, dbname)
    jobCollName = collname
    collection = srcBbClient.getCollection(jobCollName)
    for job in collection.find():
        sid = job["_id"]
        title = job["jobtitle"]
        matcher = processTitle(title)
        if matcher is not None:
            output = matcher.output()
            found = matcher.found
        else:
            output = None
            found = None
        print sid, title
        print found, output
def getOntology(resumefile, dbname, modelCollName):
    dbClient = DbClient('localhost', 27017, dbname)
    modelColl = dbClient.getCollection(modelCollName)
    with open(resumefile, 'r') as content_file:
        content = content_file.read()
    content = remove_non_ascii_2(content)
    resumeModel = resumeparser.parseResumeText(content)
    # print resumeModel
    similarity = ModelSimilarity()
    result = similarity.match_jobColl(resumeModel, modelColl)
    n = 1
    for key, value in result[:20]:
        print n, key, value
        n = n + 1
    print "- - - - - - -"
    for key, value in result[:20]:
        print key
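# A minimal sketch of a non-ASCII stripper in the spirit of the
# remove_non_ascii_2 call above, assuming the goal is simply to replace
# non-ASCII characters with spaces before parsing; the helper actually used
# here may behave differently.
import re

def removeNonAsciiSketch(text):
    # replace every character outside the 7-bit ASCII range with a space
    return re.sub(r'[^\x00-\x7f]', ' ', text)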
def getJavaScipt():
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    newCol = srcBbClient.getCollection("daily_job_webdev")
    collection = newCol
    term = "javascript"
    matchingSents = []
    for job in collection.find():
        # print "\n\n\n======", job["_id"], "============================\n"
        jobDesc = JobDescParser.parseJobDesc(job)
        sents = jobDesc.listAllSentences()
        jid = job["_id"]
        for sent in sents:
            tokens = [token.lower() for token in word_tokenize(sent)]
            if term in tokens:
                matchingSents.append((jid, sent))
                print sent.encode("GBK", "ignore")
    sortedsents = sorted(matchingSents, key=lambda x: len(x[1]))
    dumpTwo(sortedsents, "..\skill\output\javascript", (lambda x: x[0] + ":" + x[1]))