Beispiel #1
0
def main():
    """Crawl Indeed daily job listings, then fetch job detail pages.

    Builds date-stamped list/info collection names, crawls the list for
    every language/city pair, then downloads detail info, timing both
    phases to stdout.
    """
    days = 4
    today = datetime.date.today()
    # Deliberate override: re-run against a fixed, suffixed date tag.
    today = "2014-06-15_p4"
    listCollectionName = "daily_job_list_" + str(today)
    print "list collection name:", listCollectionName
    infoCollectionName = "daily_job_info_" + str(today)
    print "info collection name:", infoCollectionName

    #   lang_names = jobaly.utils.loadArrayFromFile("test_lang_list.txt")
    #   cities = jobaly.utils.loadArrayFromFile("test_loc_list.txt")
    lang_names = jobaly.utils.loadArrayFromFile("lang_list.txt")
    cities = jobaly.utils.loadArrayFromFile("loc_list.txt")

    dbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    listCollection = dbClient.getCollection(listCollectionName)

    # Phase 1: crawl job list entries.
    start_time = time.time()
    print "---- start get job list ----"
    #  getJobList(listCollectionName)
    crawlIndeed(listCollection, lang_names, cities, days)
    t = time.time() - start_time
    print "---- finish get job list, use %s seconds  ----" % t

    print
    print

    # Phase 2: fetch detail info for the crawled list.
    infoCollection = dbClient.getCollection(infoCollectionName)
    start_time = time.time()
    print "---- start get job info ----"
    getJobInfo(dbClient, listCollection, infoCollection)
    t = time.time() - start_time
    print "---- finish get job info, use %s seconds  ----" % t
Beispiel #2
0
def mergeDailyJob(date):
    """Merge a day's list and info collections into one daily collection."""
    source = DbClient('localhost', 27017, "jobaly_daily")
    target = DbClient('localhost', 27017, "jobaly_daily_test")
    mergeJob(source.getCollection("daily_job_list_" + date),
             source.getCollection("daily_job_info_" + date),
             target.getCollection("daily_job_" + date))
Beispiel #3
0
def main(): 

  # print gConfig
  dbClient = DbClient('localhost', 27017, "jobaly")  
  jobCollection = dbClient.getCollection(gConfig["webJobInfoCollName"])  
  jobIdfCollection = dbClient.getCollection(gConfig["JobIdfCollName"]) 
  
  tfIdfGetter = TfIdfGetter()
 # tfIdfGetter.saveJobTfIdf(jobCollection,  jobIdfCollection )
  idf, jobs = tfIdfGetter.getJobTfIdf(jobCollection)
  print idf
Beispiel #4
0
def main():
    """Compute TF-IDF over the configured job-info collection and print the IDF table."""
    # print gConfig
    dbClient = DbClient('localhost', 27017, "jobaly")
    jobCollection = dbClient.getCollection(gConfig["webJobInfoCollName"])
    # Fetched but unused while saveJobTfIdf below stays commented out.
    jobIdfCollection = dbClient.getCollection(gConfig["JobIdfCollName"])

    tfIdfGetter = TfIdfGetter()
    # tfIdfGetter.saveJobTfIdf(jobCollection,  jobIdfCollection )
    idf, jobs = tfIdfGetter.getJobTfIdf(jobCollection)
    print idf
Beispiel #5
0
def filterWebDeveloper_indeed():
     srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
     targetBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
     srcCol = srcBbClient.getCollection("daily_job_2014-06-05")  
     newCol = targetBbClient.getCollection("daily_job_webdev")       
     
     for job in srcCol.find():
         jobtitle = job["jobtitle"].lower()
         if (jobtitle.find("web") != -1 ) and \
             (jobtitle.find("developer") != -1 ):
                print jobtitle.encode("GBK", "ignore")                 
                newCol.insert(job) 
def testTermsMatching():
     """Dump sentences from the 2014-06-10 info collection that mention degree terms."""
     srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
     # Successive overrides: only the last collection (2014-06-10) is used.
     newCol = srcBbClient.getCollection("daily_job_webdev")
     newCol = srcBbClient.getCollection("daily_job_info_2014-06-16")
     newCol = srcBbClient.getCollection("daily_job_info_2014-06-10")

     print "newCol=" ,newCol
     allSents =  getAllSentsInColl(newCol)

     # Degree-related keywords searched in every sentence.
     terms = ["degree", "B.S.", "M.S." ,"BS", "MS", "bachelor", "master", "phd","master's"]
     matchingSents = termsMatching(allSents,terms)
   #  dumpTwo(matchingSents, "sents\\degree_raw" , ( lambda x: x[0] + ":" + x[1] ) )
     dumpTwo(matchingSents, "sents\\degree_0610" , ( lambda x: x[0] + ":" + x[1] ) )
Beispiel #7
0
def processResumes():
    """Parse one resume as a smoke test, then build models for the whole collection."""
    client = DbClient('localhost', 27017, "jobaly_daily_test")
    resume_coll_name = "web_resumes"
    model_coll_name = resume_coll_name + "_model"
    resume_coll = client.getCollection(resume_coll_name)
    model_coll = client.getCollection(model_coll_name)

    # Sanity check: parse a single resume before processing them all.
    sample = resume_coll.find_one()
    parseResume(sample)

    saveResumeModels(resume_coll, model_coll)
Beispiel #8
0
def processResumes():
    """Parse one resume as a smoke test, then model every resume in the collection."""
    srcBbClient = DbClient("localhost", 27017, "jobaly_daily_test")
    resumeCollName = "web_resumes"

    resumemodelCollName = resumeCollName + "_model"
    resumeColl = srcBbClient.getCollection(resumeCollName)
    modelColl = srcBbClient.getCollection(resumemodelCollName)
    #  newCol = srcBbClient.getCollection("daily_job_info_2014-06-16")

    #  for resume in collection.find():
    # Parse a single resume first; the result is only a sanity check.
    resume = resumeColl.find_one()
    resumeModel = parseResume(resume)
    #   modelColl.save(resumeModel.serialize())

    saveResumeModels(resumeColl, modelColl)
Beispiel #9
0
def test_match():
     """Match a prepared resume model against the webdev job models and print the ranking."""
     srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
     jobCollName = "daily_job_webdev"

     jobmodelCollName = jobCollName+"_model"
     collection = srcBbClient.getCollection(jobCollName)
     modelColl = srcBbClient.getCollection(jobmodelCollName)
   #  newCol = srcBbClient.getCollection("daily_job_info_2014-06-16")

     similarity = ModelSimilarity()
     # NOTE(review): resumeModel1 is not defined in this block — presumably a
     # module-level fixture; confirm it exists wherever this test is run.
     result = similarity.match_jobs(resumeModel1 , modelColl  )
     i = 0
     # Print 1-based rank, key, and score for each match.
     for key, value in result:
         i += 1
         print i,key, value
def testTermsMatching():
    """Dump sentences from the 2014-06-10 info collection that mention degree terms."""
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    # Successive overrides: only the last collection (2014-06-10) is used.
    newCol = srcBbClient.getCollection("daily_job_webdev")
    newCol = srcBbClient.getCollection("daily_job_info_2014-06-16")
    newCol = srcBbClient.getCollection("daily_job_info_2014-06-10")

    print "newCol=", newCol
    allSents = getAllSentsInColl(newCol)

    # Degree-related keywords searched in every sentence.
    terms = [
        "degree", "B.S.", "M.S.", "BS", "MS", "bachelor", "master", "phd",
        "master's"
    ]
    matchingSents = termsMatching(allSents, terms)
    #  dumpTwo(matchingSents, "sents\\degree_raw" , ( lambda x: x[0] + ":" + x[1] ) )
    dumpTwo(matchingSents, "sents\\degree_0610", (lambda x: x[0] + ":" + x[1]))
Beispiel #11
0
def getDisMatrixFromColletion(): 
     srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
     collection = srcBbClient.getCollection("daily_job_webdev")
     f = open('sents.txt','w')
      # python will convert \n to os.linesep
     
     docs = []
     for job in collection.find(): 
      #  print "\n\n\n======",job["_id"],"============================\n"
     #   f.write(job["summary"].encode("GBK", "ignore")+"\n")
        jobDesc = JobDescParser.parseJobDesc(job)
        
        sents = jobDesc.listAllSentences() 
        doc =[]
        for sent in sents:
           # print sent.encode("GBK", "ignore")
            f.write(sent.encode("GBK", "ignore")+"\n")
            tokens = [ token.lower() for token in word_tokenize(sent)]              
            for token in tokens:
                if token == 'c':
                #    print token
                    pass
            doc.extend(tokens)        
        docs.append(doc)
     f.close()
     terms=["javascript", "jquery", "html", "css", "java", "python", "ruby", "mysql", "jdbc" , "cpp"  ]
  #   terms=["javascript", "jquery", "html", "css", "java", "jsp", "python", "ruby", "ror"  ]
  

   # terms=["java","jdbc","spring","hibernate","mysql","oracle"]
     matrix = getDistanceMatrix(docs, terms)   
     printDisMatrix(terms, matrix)   
     matrix_dump = json.dumps(matrix)
     print matrix_dump
Beispiel #12
0
def testProcessPage():
    """Page through the 2014-07-11 dice list collection, fetching each job's detail page."""
    listCollectionName = "daily_dice_list_2014-07-11"
    infoCollectionName = "daily_dice_info_2014-07-11"

    dbClient = DbClient('localhost', 27017, "jobaly_daily")
    listCollection = dbClient.getCollection(listCollectionName)
    infoCollection = dbClient.getCollection(infoCollectionName)
    getter = DicePageGetter(infoCollection)

    pageSize = 100
    pageNo = 1
    has_more = True
    # Hard upper bound on the number of pages processed.
    pageNum = 10000
    find_sort = None
    find_spec = None
    while has_more and pageNo <= pageNum:
        page = dbClient.getPage(listCollection, find_spec, find_sort, pageSize,
                                pageNo)
        getter.processPage(page, pageNo)
        pageNo += 1
        # A short page means the cursor is exhausted.
        count = page.count(with_limit_and_skip=True)
        #   print "count=",count
        if (count < pageSize):
            has_more = False
Beispiel #13
0
def aggregateHtmlTag():
    """Aggregate HTML tag statistics over the 2014-07-11 dice info collection."""
    collName = "daily_dice_info_2014-07-11"
    #   collName = "daily_job_info_2014-07-08"
    client = DbClient('localhost', 27017, "jobaly_daily")
    processor = JobDataProcessor(client.getCollection(collName))
    processor.aggregateHtmlTags()
Beispiel #14
0
def testGetSentenceByTerm(term):
    """Dump every sentence containing *term* to the skill output folder."""
    client = DbClient('localhost', 27017, "jobaly_daily_test")
    coll = client.getCollection("daily_job_webdev")
    # Windows-style relative output path, one file per term.
    outputPath = '..\skill\output\\' + term
    getSentenceByTerm(coll, term, outputPath)
Beispiel #15
0
def getSentsByOntology():
     """Dump job sentences that contain at least three ontology terms.

     Loads labels and class names from the web-dev ontology, wraps each
     term in spaces for whole-word substring matching, and collects every
     sentence matching three terms, sorted by length.
     """
     owlfile = "..\..\jobaly\ontology\web_dev.owl"
     ontology = OntologyLib(owlfile)
     # Pad terms with spaces so substring search approximates word match.
     terms = [ " "+ x.lower()+" " for x in ontology.getLabelList()]
     terms.extend([" "+x.lower()+" " for x in ontology.getAllClassNames()])

     srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
     newCol = srcBbClient.getCollection("daily_job_webdev")
     collection = newCol

     matchingSents = []
     for job in collection.find():
      #   print "\n\n\n======",job["_id"],"============================\n"
        jobDesc = JobDescParser.parseJobDesc(job)
        sents = jobDesc.listAllSentences()
        jid = job["_id"]
        for sent in sents:
            c = 0
            # Pad the sentence the same way the terms are padded.
            sent = " "+sent.lower()+" "
            for term in terms:
                if sent.find(term) != -1:
                   c+=1
                # Keep the sentence once three distinct terms have hit.
                if c==3 :
                    print sent.encode("GBK", "ignore")
                    matchingSents.append((jid, sent))
                    break

     # Shortest matching sentences first.
     sortedsents = sorted(matchingSents, key=lambda x:   len(x[1]) )
     dumpTwo(sortedsents, "term3" , ( lambda x: x[0] + ":" + x[1] ) )
Beispiel #16
0
def main():
    """Load resume files from the web resumes folder into the web_resumes collection."""
    resume_dir = "..\\..\\..\\..\\data\\resumes\\web\\"
    # scandir(resume_dir)

    client = DbClient('localhost', 27017, "jobaly_daily_test")
    coll = client.getCollection("web_resumes")
    saveResumes(resume_dir, coll)
Beispiel #17
0
def main():
    """Load resume files from the web resumes folder into the web_resumes collection."""
    path = "..\\..\\..\\..\\data\\resumes\\web\\"
    # scandir(path)

    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    resumeCollName = "web_resumes"
    collection = srcBbClient.getCollection(resumeCollName)
    saveResumes(path, collection)
Beispiel #18
0
def aggregateTitle():
    """Dump aggregated job titles from the dice info collection to JSON and text files."""
    # First name is immediately overridden; only the dice collection is used.
    listCollectionName = "daily_job_list_2014-06-10"
    listCollectionName = "daily_dice_info_2014-07-11"
    dbClient = DbClient('localhost', 27017, "jobaly_daily")
    collection = dbClient.getCollection(listCollectionName)
    dataProcessor = JobDataProcessor(collection)
    dataProcessor.aggregateTitleToFile("titles//dice_titleList.json")
    dataProcessor.aggregateTitleToFile("titles//dice_titleList.txt", "text")
Beispiel #19
0
def filterWebDeveloper_dice():
     """Copy up to 150 dice jobs titled web+developer into daily_job_webdev.

     Each copied job has its 'detailUrl' moved into the 'url' field so the
     dice jobs match the Indeed job schema.
     """
     srcBbClient = DbClient('localhost', 27017, "jobaly_daily")
     targetBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
     srcCol = srcBbClient.getCollection("daily_dice_info_2014-07-11")
     newCol = targetBbClient.getCollection("daily_job_webdev")

     i = 0
     for job in srcCol.find():
         jobtitle = job["jobtitle"].lower()
         if (jobtitle.find("web") != -1 ) and \
             (jobtitle.find("developer") != -1 ):

                # Normalize the URL field to match the Indeed job schema.
                job["url"] = job["detailUrl"]
                job["detailUrl"] = None
                newCol.insert(job)
                i+=1
                print i, ":", jobtitle.encode("GBK", "ignore")
                # Cap the copy at 150 jobs.
                if i == 150:
                    break
Beispiel #20
0
def main(): 
    
    dbClient = DbClient('localhost', 27017, "jobaly")  
    jobCollection = dbClient.getCollection(gConfig["webJobInfoCollName"])  
    tfIdfMatch = TfIdfMatch(jobCollection)
    resume = "I a am good java programmer, PHP, XML, hope juse c++, skill" 
    jobs = tfIdfMatch.matchResume(resume)
    
    for job in jobs:
        print job["_id"], job["score"]
Beispiel #21
0
def processJobColl():
    """Build job models for the job100 collection (copy step disabled).

    The src* assignments are successive dev overrides; only the last value
    of each name is effective.
    """
    srcDb = "jobaly_daily"
    srcCollnames = "daily_job_info_2014-06-16"
    srcDb = "jobaly_daily_test"
    srcCollnames = "daily_job_webdev"
    srcClient = DbClient('localhost', 27017, srcDb)
    srcCollnames = "daily_job_info_2014-06-16"
    # srcColl is only needed by the commented-out copy step below.
    srcColl = srcClient.getCollection(srcCollnames)

    targetDb = "jobaly"
    targetCollName = "job100"
    targetClient = DbClient('localhost', 27017, targetDb)

    targetColl = targetClient.getCollection(targetCollName)

    size = 15
    #  copyColl(srcColl,  targetColl, size)
    processjobs(targetDb, targetCollName)
Beispiel #22
0
def processJobColl():
    """Build job models for the job100 collection (copy step disabled).

    The src* assignments are successive dev overrides; only the last value
    of each name is effective.
    """
    srcDb = "jobaly_daily"
    srcCollnames = "daily_job_info_2014-06-16"
    srcDb = "jobaly_daily_test"
    srcCollnames = "daily_job_webdev"
    srcClient = DbClient('localhost', 27017, srcDb)
    srcCollnames = "daily_job_info_2014-06-16"
    # srcColl is only needed by the commented-out copy step below.
    srcColl = srcClient.getCollection(srcCollnames)

    targetDb = "jobaly"
    targetCollName = "job100"
    targetClient = DbClient('localhost', 27017, targetDb)

    targetColl = targetClient.getCollection(targetCollName)

    size = 15
    #  copyColl(srcColl,  targetColl, size)
    processjobs(targetDb, targetCollName)
Beispiel #23
0
def testProcessQuery():
    today = datetime.date.today()
    listCollectionName = "daily_dice_list_" + str(today)
    listCollectionName = "daily_dice_list_" + "test"

    print listCollectionName
    dbClient = DbClient('localhost', 27017, "jobaly_daily")
    collection = dbClient.getCollection(listCollectionName)
    diceClient = DiceApiClient()
    diceClient.setState("TN")
    print diceClient.processQuery(collection)
Beispiel #24
0
def testProcessQuery():
    """Run one Dice API query (state TN) into a fixed test list collection."""
    today = datetime.date.today()
    # The date-based name is immediately overridden with a fixed test name.
    listCollectionName = "daily_dice_list_" + str(today)
    listCollectionName = "daily_dice_list_" + "test"

    print listCollectionName
    dbClient = DbClient("localhost", 27017, "jobaly_daily")
    collection = dbClient.getCollection(listCollectionName)
    diceClient = DiceApiClient()
    diceClient.setState("TN")
    print diceClient.processQuery(collection)
def testTermMatching():
     """Dump sentences from the webdev collection containing a single keyword."""
     srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
     newCol = srcBbClient.getCollection("daily_job_webdev")
     allSents =  getAllSentsInColl(newCol)

     # Successive overrides: only the last term ("degree") is searched.
     term = "experience"
     term = "knowledge"
     term = "skills"
     term = "degree"
     matchingSents = termMatching(allSents,term)
     dumpTwo(matchingSents, "sents\\matching_"+ term , ( lambda x: x[0] + ":" + x[1] ) )
Beispiel #26
0
def processjobs(dbname, collname):
     """Build and persist a JobModel for every job in the given collection.

     Strips HTML and copies titles in place first, then for each job parses
     sentences, title, and classification, saving the serialized model into
     '<collname>_model'.
     """
     srcBbClient = DbClient('localhost', 27017, dbname)
     jobCollName = collname

     jobmodelCollName = jobCollName+"_model"
     collection = srcBbClient.getCollection(jobCollName)
     modelColl = srcBbClient.getCollection(jobmodelCollName)
   #  newCol = srcBbClient.getCollection("daily_job_info_2014-06-16")
     # In-place cleanup passes over the source collection.
     removeHtml(collection)
     copyTitle(collection)
     for job in collection.find():
         sents = preprocess(job)
         jobModel = JobModel(job["_id"])
         processSents(jobModel,  sents )

         titleModel = processTitle(job)
         jobModel.titleModel = titleModel
         jobclassifier.classifyJob(jobModel)

         modelColl.save(jobModel.serialize())
def testTermMatching():
    """Dump sentences from the webdev collection containing a single keyword."""
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    newCol = srcBbClient.getCollection("daily_job_webdev")
    allSents = getAllSentsInColl(newCol)

    # Successive overrides: only the last term ("degree") is searched.
    term = "experience"
    term = "knowledge"
    term = "skills"
    term = "degree"
    matchingSents = termMatching(allSents, term)
    dumpTwo(matchingSents, "sents\\matching_" + term,
            (lambda x: x[0] + ":" + x[1]))
Beispiel #28
0
def testParseAll():

    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    newCol = srcBbClient.getCollection("daily_job_webdev")

    job = DbClient.findById(newCol, jid)
    #  paragraph = JobParser.parseParagraph(job)

    for job in newCol.find():
        print "\n\n\n======", job["_id"], "============================\n"

        jobDesc = JobDescParser.parseJobDesc(job)
Beispiel #29
0
def testParseAll():
    
     srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
     newCol = srcBbClient.getCollection("daily_job_webdev")       
    
     job = DbClient.findById(newCol,jid)
   #  paragraph = JobParser.parseParagraph(job)
     
     for job in newCol.find(): 
         print "\n\n\n======",job["_id"],"============================\n"
     
         jobDesc = JobDescParser.parseJobDesc(job)
Beispiel #30
0
def getJobList_sync(listCollectionName):

    print " --- get daily job by language and top cities---"

    lang_names = jobaly.utils.loadArrayFromFile("lang_list.txt")
    cities = jobaly.utils.loadArrayFromFile("loc_list.txt")

    #  lang_names = jobaly.utils.loadArrayFromFile("test_lang_list.txt")
    #  cities = jobaly.utils.loadArrayFromFile("test_loc_list.txt")

    dbClient = DbClient('localhost', 27017, "jobaly_daily")
    collection = dbClient.getCollection(listCollectionName)
    crawlIndeed(collection, lang_names, cities)
Beispiel #31
0
def main():
    """Merge the per-day June list collections into job_list_merge.

    Earlier info-collection merge and single-collection copy steps are
    kept commented out for reference.
    """
    targetBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    targetColl = targetBbClient.getCollection("test_coll")

    srcBbClient = DbClient('localhost', 27017, "jobaly_daily")
    srcColl = srcBbClient.getCollection("daily_job_info_2014-06-04")
    #   collutils.copyCollection(srcColl, targetColl)

    srcCollNames = [
        "daily_job_info_2014-06-04", "daily_job_info_2014-06-05",
        "daily_job_info_2014-06-06", "daily_job_info_2014-06-08",
        "daily_job_info_2014-06-10"
    ]
    #  collutils.copyCollections(targetBbClient, "job_info_merge", srcBbClient, srcCollNames)

    # Overrides the info names above; only the list collections are merged.
    srcCollNames = [
        "daily_job_list_2014-06-04", "daily_job_list_2014-06-05",
        "daily_job_list_2014-06-06", "daily_job_list_2014-06-08",
        "daily_job_list_2014-06-10"
    ]
    collutils.copyCollections(targetBbClient, "job_list_merge", srcBbClient,
                              srcCollNames)
Beispiel #32
0
def loadJobs(dbname, collName, ids):
    dbClient = DbClient('localhost', 27017, dbname)
    jobCollection = dbClient.getCollection(collName)
    jobs = []
    for jobid in ids:
        result = list(jobCollection.find({'_id': jobid}))
        if len(result) > 0:
            job = result[0]
            #   print type(job)
            #   print job
            print job["_id"], job["location"]
            jobs.append(job)
    return jobs
Beispiel #33
0
def getJobList_sync(listCollectionName):
    """Synchronously crawl Indeed for every language/city pair into the list collection."""
    print " --- get daily job by language and top cities---"

    lang_names = jobaly.utils.loadArrayFromFile("lang_list.txt")
    cities = jobaly.utils.loadArrayFromFile("loc_list.txt")

  #  lang_names = jobaly.utils.loadArrayFromFile("test_lang_list.txt")
  #  cities = jobaly.utils.loadArrayFromFile("test_loc_list.txt")

    dbClient = DbClient('localhost', 27017, "jobaly_daily")
    collection = dbClient.getCollection(listCollectionName)
    crawlIndeed(collection, lang_names, cities )
Beispiel #34
0
def loadJobs(dbname, collName, ids):
     """Fetch the jobs with the given ids, printing id and location of each hit.

     Returns the list of job documents that were found.
     """
     dbClient = DbClient('localhost', 27017, dbname)
     jobCollection = dbClient.getCollection(collName)
     jobs = []
     for jobid in ids:
        result=list(jobCollection.find({'_id': jobid }))
        if len(result) > 0:
            job = result[0]
         #   print type(job)
         #   print job
            print job["_id"], job["location"]
            jobs.append(job)
     return jobs
Beispiel #35
0
def testGetJobInfo():
     """Fetch detail info for the fixed 2014-07-11 dice crawl."""
     dbClient = DbClient('localhost', 27017, "jobaly_daily")
     today = datetime.date.today()
     # Today's names are computed, then overridden with a fixed crawl date.
     listCollectionName = "daily_dice_list_"+str(today)
     infoCollectionName = "daily_dice_info_"+str(today)

     listCollectionName = "daily_dice_list_2014-07-11"
     infoCollectionName = "daily_dice_info_2014-07-11"

     print listCollectionName
     print infoCollectionName
     listCollection = dbClient.getCollection(listCollectionName)
     infoCollection = dbClient.getCollection(infoCollectionName)
     getJobInfo(dbClient,listCollection, infoCollection)
Beispiel #36
0
def testGetJobInfo():
    dbClient = DbClient('localhost', 27017, "jobaly_daily")
    today = datetime.date.today()
    listCollectionName = "daily_dice_list_" + str(today)
    infoCollectionName = "daily_dice_info_" + str(today)

    listCollectionName = "daily_dice_list_2014-07-11"
    infoCollectionName = "daily_dice_info_2014-07-11"

    print listCollectionName
    print infoCollectionName
    listCollection = dbClient.getCollection(listCollectionName)
    infoCollection = dbClient.getCollection(infoCollectionName)
    getJobInfo(dbClient, listCollection, infoCollection)
Beispiel #37
0
def testParseParagraph():
     """Print the parsed paragraphs of one job; only the last jid override is used."""
     srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
     newCol = srcBbClient.getCollection("daily_job_webdev")
     # Successive overrides: earlier ids kept around for quick re-testing.
     jid = "9e216b2d65bd864b"
     jid = "matrixga/78237-51"
     jid = "cybercod/CN-.NETwebDev-CA3"
     jid = "f3c336fa35c28771"
     jid = "10116717/638726"
     jid = "ocs/54391"
     jid = "0e230c368a34322b"
     jid = "6718adb8b28b9b39"
     job = DbClient.findById(newCol,jid)
     jobDesc = JobDescParser.parseJobDesc(job)
     jobDesc.printParagraphs()
Beispiel #38
0
def testParseParagraph():
    """Print the parsed paragraphs of one job; only the last jid override is used."""
    client = DbClient('localhost', 27017, "jobaly_daily_test")
    coll = client.getCollection("daily_job_webdev")
    # Successive overrides: earlier ids kept around for quick re-testing.
    jid = "9e216b2d65bd864b"
    jid = "matrixga/78237-51"
    jid = "cybercod/CN-.NETwebDev-CA3"
    jid = "f3c336fa35c28771"
    jid = "10116717/638726"
    jid = "ocs/54391"
    jid = "0e230c368a34322b"
    jid = "6718adb8b28b9b39"
    job = DbClient.findById(coll, jid)
    JobDescParser.parseJobDesc(job).printParagraphs()
Beispiel #39
0
def main():
  """Compute TF-IDF over the hard-coded test_jobinfo collection and print the IDF table."""
    #srcJobInfoCollName: jobinfo_lang_top_corps
    #webJobInfoCollName: test_jobinfo
    #webResumeColName: test_resume
    #JobIdfCollName:job_idf
  # print gConfig
  # This variant hard-codes "test_jobinfo" instead of reading gConfig;
  # the IDF collection lookup and save step stay commented out.
  dbClient = DbClient('localhost', 27017, "jobaly")
  jobCollection = dbClient.getCollection("test_jobinfo")
 # jobIdfCollection = dbClient.getCollection(gConfig["JobIdfCollName"])

  tfIdfGetter = TfIdfGetter()
 # tfIdfGetter.saveJobTfIdf(jobCollection,  jobIdfCollection )
  idf, jobs = tfIdfGetter.getJobTfIdf(jobCollection)
  print idf
Beispiel #40
0
def main():
     """Crawl Indeed daily job lists, then fetch detail pages, timing each phase."""
     days = 4
     today = datetime.date.today()
     # Deliberate override: re-run against a fixed, suffixed date tag.
     today = "2014-06-15_p4"
     listCollectionName = "daily_job_list_"+str(today)
     print "list collection name:", listCollectionName
     infoCollectionName = "daily_job_info_"+str(today)
     print "info collection name:", infoCollectionName


  #   lang_names = jobaly.utils.loadArrayFromFile("test_lang_list.txt")
  #   cities = jobaly.utils.loadArrayFromFile("test_loc_list.txt")
     lang_names = jobaly.utils.loadArrayFromFile("lang_list.txt")
     cities = jobaly.utils.loadArrayFromFile("loc_list.txt")



     dbClient = DbClient('localhost', 27017, "jobaly_daily_test")
     listCollection = dbClient.getCollection(listCollectionName)


     # Phase 1: crawl job list entries.
     start_time = time.time()
     print "---- start get job list ----"
   #  getJobList(listCollectionName)
     crawlIndeed(listCollection, lang_names, cities,days )
     t =  time.time() - start_time
     print "---- finish get job list, use %s seconds  ----" %t

     print
     print

     # Phase 2: fetch detail info for the crawled list.
     infoCollection = dbClient.getCollection(infoCollectionName)
     start_time = time.time()
     print "---- start get job info ----"
     getJobInfo(dbClient, listCollection, infoCollection)
     t =  time.time() - start_time
     print "---- finish get job info, use %s seconds  ----" %t
Beispiel #41
0
def main():
    """Match the Darin-Densley resume against job100 using KL divergence."""
    #webJobInfoCollName: test_jobinfo
    resume =  loadResume("..\\..\\..\\data\\test_resumes\\Darin-Densley_web.txt")
  #  resume =  loadResume("..\\..\\..\\data\\test_resumes\\Java-Developer.txt")
  #  resume =  loadResume("..\\..\\..\\data\\test_resumes\\Fong-Kuo_data.txt")

  #  print resume
  #  resume = "I a am good java programmer, PHP, XML, hope juse c++, skill"
    dbClient = DbClient('localhost', 27017, "jobaly")
    jobCollection = dbClient.getCollection("job100")
    kl = KL(jobCollection)
    jobs = kl.matchResume(resume)

    # Print id and score for each ranked job.
    for job in jobs:
        print job["_id"], job["score"]
Beispiel #42
0
def main():
     """Page through the job list collection, fetching each job's Indeed page.

     Resumes at page 149; stops when a short page signals the cursor is
     exhausted or the page cap is hit.
     """
     collectionName = "job_lang_top_corps"
     infoCollectionName = "jobinfo_lang_top_corps"

     dbClient = DbClient('localhost', 27017, "jobaly")
     collection = dbClient.getCollection(collectionName)
     infoCollection = dbClient.getCollection(infoCollectionName)
     getter = IndeedPageGetter(infoCollection)

     pageSize = 10
     # Restart point from a previous interrupted run.
     pageNo = 149
     has_more = True
     pageNum = 10000
     find_sort = None
     find_spec=None
     while has_more and pageNo <= pageNum :
        page = dbClient.getPage(collection, find_spec,find_sort, pageSize, pageNo)
        getter.processPage(page,pageNo)
        pageNo+=1
        # A short page means the cursor is exhausted.
        count =  page.count(with_limit_and_skip = True)
     #   print "count=",count
        if ( count < pageSize ) :
            has_more = False
Beispiel #43
0
def getJobList(listCollectionName):

    print " --- get daily job by language and top cities---"

    # lang_names = jobaly.utils.loadArrayFromFile("lang_list.txt")
    states = jobaly.utils.loadArrayFromFile("state_list.txt")

    diceClient = DiceApiClient({"age": "1"})
    dbClient = DbClient('localhost', 27017, "jobaly_daily")
    collection = dbClient.getCollection(listCollectionName)

    for state in states:
        diceClient.setState(state)
        print "-----prcoss location %s  -------" % (state)
        diceClient.processQuery(collection)
Beispiel #44
0
def getJobList(listCollectionName):
    """Crawl the Dice API (jobs posted within one day) state by state."""
    print " --- get daily job by language and top cities---"

    # lang_names = jobaly.utils.loadArrayFromFile("lang_list.txt")
    states = jobaly.utils.loadArrayFromFile("state_list.txt")

    # age=1 restricts results to jobs posted within the last day.
    diceClient = DiceApiClient({"age": "1"})
    dbClient = DbClient("localhost", 27017, "jobaly_daily")
    collection = dbClient.getCollection(listCollectionName)

    for state in states:
        diceClient.setState(state)
        print "-----prcoss location %s  -------" % (state)
        diceClient.processQuery(collection)
Beispiel #45
0
def main():
    """Match the Darin-Densley resume against job100 using KL divergence."""
    #webJobInfoCollName: test_jobinfo
    resume = loadResume(
        "..\\..\\..\\data\\test_resumes\\Darin-Densley_web.txt")
    #  resume =  loadResume("..\\..\\..\\data\\test_resumes\\Java-Developer.txt")
    #  resume =  loadResume("..\\..\\..\\data\\test_resumes\\Fong-Kuo_data.txt")

    #  print resume
    #  resume = "I a am good java programmer, PHP, XML, hope juse c++, skill"
    dbClient = DbClient('localhost', 27017, "jobaly")
    jobCollection = dbClient.getCollection("job100")
    kl = KL(jobCollection)
    jobs = kl.matchResume(resume)

    # Print id and score for each ranked job.
    for job in jobs:
        print job["_id"], job["score"]
Beispiel #46
0
def main():
    """TF-IDF match a resume against job100.

    All three resume files are loaded, but only the last assignment
    (Fong-Kuo) is actually matched.
    """
    #webJobInfoCollName: test_jobinfo
    resumepath = ""
    resume = loadResume(
        "..\\..\\..\\data\\test_resumes\\Darin-Densley_web.txt")
    resume = loadResume("..\\..\\..\\data\\test_resumes\\Java-Developer.txt")
    resume = loadResume("..\\..\\..\\data\\test_resumes\\Fong-Kuo_data.txt")

    # print resume
    dbClient = DbClient('localhost', 27017, "jobaly")
    jobCollection = dbClient.getCollection("job100")
    tfIdfMatch = TfIdfMatch(jobCollection)

    jobs = tfIdfMatch.matchResume(resume)

    # Print id and score for each ranked job.
    for job in jobs:
        print job["_id"], job["score"]
Beispiel #47
0
def main():
    """TF-IDF match a resume against job100.

    All three resume files are loaded, but only the last assignment
    (Fong-Kuo) is actually matched.
    """
    #webJobInfoCollName: test_jobinfo
    resumepath = ""
    resume =  loadResume("..\\..\\..\\data\\test_resumes\\Darin-Densley_web.txt")
    resume =  loadResume("..\\..\\..\\data\\test_resumes\\Java-Developer.txt")
    resume =  loadResume("..\\..\\..\\data\\test_resumes\\Fong-Kuo_data.txt")



   # print resume
    dbClient = DbClient('localhost', 27017, "jobaly")
    jobCollection = dbClient.getCollection("job100")
    tfIdfMatch = TfIdfMatch(jobCollection)

    jobs = tfIdfMatch.matchResume(resume)

    # Print id and score for each ranked job.
    for job in jobs:
        print job["_id"], job["score"]
Beispiel #48
0
def processTitles(dbname, collname):
    srcBbClient = DbClient('localhost', 27017, dbname)
    jobCollName = collname
    collection = srcBbClient.getCollection(jobCollName)
    for job in collection.find():
        sid = job["_id"]
        title = job["jobtitle"]
        matcher = processTitle(title)

        if matcher is not None:
            output = matcher.output()
            found = matcher.found
        else:
            output = None
            found = None

        print sid, title
        print found, output
Beispiel #49
0
def getOntology(resumefile, dbname, modelCollName):
    """Parse a resume text file and print its top-20 job matches.

    Reads the file, strips non-ASCII characters, builds a resume model,
    ranks it against the stored job models, then prints rank/key/score
    followed by the bare keys.
    """
    dbClient = DbClient('localhost', 27017, dbname)
    modelColl = dbClient.getCollection(modelCollName)

    with open(resumefile, 'r') as content_file:
        content = content_file.read()
        content = remove_non_ascii_2(content)
    resumeModel = resumeparser.parseResumeText(content)
    # print     resumeModel
    similarity = ModelSimilarity()
    result = similarity.match_jobColl(resumeModel, modelColl)
    n = 1
    for key, value in result[:20]:
        print n, key, value
        n = n + 1
    print "- - - - - - -"
    for key, value in result[:20]:
        print key
Beispiel #50
0
def processTitles(dbname, collname):
     """Run the title matcher over every job in the collection and print results."""
     srcBbClient = DbClient('localhost', 27017, dbname)
     jobCollName = collname
     collection = srcBbClient.getCollection(jobCollName)
     for job in collection.find():
        sid = job["_id"]
        title = job["jobtitle"]
        matcher = processTitle(title)

        # A job whose title produced no matcher prints None for both fields.
        if matcher is not None:
            output = matcher.output()
            found = matcher.found
        else:
            output = None
            found = None

        print sid , title
        print   found, output
Beispiel #51
0
def getJavaScipt():
     """Dump sentences mentioning the token 'javascript', shortest first.

     (The 'Scipt' typo in the name is kept: callers may reference it.)
     """
     srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
     newCol = srcBbClient.getCollection("daily_job_webdev")

     collection = newCol
     term = "javascript"
     matchingSents = []
     for job in collection.find():
      #   print "\n\n\n======",job["_id"],"============================\n"
        jobDesc = JobDescParser.parseJobDesc(job)
        sents = jobDesc.listAllSentences()
        jid = job["_id"]
        for sent in sents:
            # Token-level match (not substring) so e.g. 'javascripts' won't hit.
            tokens = [ token.lower() for token in word_tokenize(sent)]
            if term in tokens :
                matchingSents.append((jid, sent))
                print sent.encode("GBK", "ignore")

     # Shortest matching sentences first.
     sortedsents = sorted(matchingSents, key=lambda x:   len(x[1]) )
     dumpTwo(sortedsents, "..\skill\output\javascript" , ( lambda x: x[0] + ":" + x[1] ) )