def getSentsByOntology(): owlfile = "..\..\jobaly\ontology\web_dev.owl" ontology = OntologyLib(owlfile) terms = [ " "+ x.lower()+" " for x in ontology.getLabelList()] terms.extend([" "+x.lower()+" " for x in ontology.getAllClassNames()]) srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test") newCol = srcBbClient.getCollection("daily_job_webdev") collection = newCol matchingSents = [] for job in collection.find(): # print "\n\n\n======",job["_id"],"============================\n" jobDesc = JobDescParser.parseJobDesc(job) sents = jobDesc.listAllSentences() jid = job["_id"] for sent in sents: c = 0 sent = " "+sent.lower()+" " for term in terms: if sent.find(term) != -1: c+=1 if c==3 : print sent.encode("GBK", "ignore") matchingSents.append((jid, sent)) break sortedsents = sorted(matchingSents, key=lambda x: len(x[1]) ) dumpTwo(sortedsents, "term3" , ( lambda x: x[0] + ":" + x[1] ) )
def getDisMatrixFromColletion(): srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test") collection = srcBbClient.getCollection("daily_job_webdev") f = open('sents.txt','w') # python will convert \n to os.linesep docs = [] for job in collection.find(): # print "\n\n\n======",job["_id"],"============================\n" # f.write(job["summary"].encode("GBK", "ignore")+"\n") jobDesc = JobDescParser.parseJobDesc(job) sents = jobDesc.listAllSentences() doc =[] for sent in sents: # print sent.encode("GBK", "ignore") f.write(sent.encode("GBK", "ignore")+"\n") tokens = [ token.lower() for token in word_tokenize(sent)] for token in tokens: if token == 'c': # print token pass doc.extend(tokens) docs.append(doc) f.close() terms=["javascript", "jquery", "html", "css", "java", "python", "ruby", "mysql", "jdbc" , "cpp" ] # terms=["javascript", "jquery", "html", "css", "java", "jsp", "python", "ruby", "ror" ] # terms=["java","jdbc","spring","hibernate","mysql","oracle"] matrix = getDistanceMatrix(docs, terms) printDisMatrix(terms, matrix) matrix_dump = json.dumps(matrix) print matrix_dump
def getAllSentsInColl(collection): allSents = [] for job in collection.find(): print "\n\n\n======",job["_id"],"============================\n" jobDesc = JobDescParser.parseJobDesc(job) sents = [ ( jobDesc._id, sent ) for sent in jobDesc.listAllSentences() ] allSents.extend(sents) return allSents
def preprocess( job ):
    """Parse *job* into sentences and return each one run through
    processLine."""
    parsed = JobDescParser.parseJobDesc(job)
    return [processLine(sentence) for sentence in parsed.listAllSentences()]
def getAllSentsInColl(collection): allSents = [] for job in collection.find(): print "\n\n\n======", job["_id"], "============================\n" jobDesc = JobDescParser.parseJobDesc(job) sents = [(jobDesc._id, sent) for sent in jobDesc.listAllSentences()] allSents.extend(sents) return allSents
def getSentenceByTerm(collection, term, outputPath): matchingSents = [] for job in collection.find(): # print "\n\n\n======",job["_id"],"============================\n" jobDesc = JobDescParser.parseJobDesc(job) sents = jobDesc.listAllSentences() jid = job["_id"] for sent in sents: tokens = [ token.lower() for token in word_tokenize(sent)] if term in tokens : matchingSents.append((jid, sent)) print sent.encode("GBK", "ignore") sortedsents = sorted(matchingSents, key=lambda x: len(x[1]) ) dumpTwo(sortedsents, outputPath , ( lambda x: x[0] + ":" + x[1] ) )
def getJavaScipt():
    """Dump all sentences mentioning "javascript" from daily_job_webdev.

    NOTE(review): the name typo ("Scipt") is kept for caller compatibility.
    """
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    collection = srcBbClient.getCollection("daily_job_webdev")
    # Consistency: the original duplicated getSentenceByTerm's body
    # verbatim; delegate to it instead.
    getSentenceByTerm(collection, "javascript", "..\skill\output\javascript")
def createDocs(max_jobs=99999):
    """Tokenize jobs from daily_job_webdev into one token list per job.

    max_jobs -- stop after processing this many jobs (default 99999, the
                original hard-coded cap, now a parameter).

    Returns a list of documents, each a list of lowercased tokens.
    """
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    collection = srcBbClient.getCollection("daily_job_webdev")
    docs = []
    count = 0
    for job in collection.find():
        count += 1
        # Fix: the original tested `i == maxnum` after incrementing, which
        # broke BEFORE processing the maxnum-th job (processed maxnum-1).
        # `>` processes exactly max_jobs jobs.
        if count > max_jobs:
            break
        jobDesc = JobDescParser.parseJobDesc(job)
        doc = []
        for sent in jobDesc.listAllSentences():
            doc.extend(token.lower() for token in word_tokenize(sent))
        docs.append(doc)
    return docs
def getDisMatrixFromColletion(): srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test") collection = srcBbClient.getCollection("daily_job_webdev") f = open('sents.txt', 'w') # python will convert \n to os.linesep docs = [] for job in collection.find(): # print "\n\n\n======",job["_id"],"============================\n" # f.write(job["summary"].encode("GBK", "ignore")+"\n") jobDesc = JobDescParser.parseJobDesc(job) sents = jobDesc.listAllSentences() doc = [] for sent in sents: # print sent.encode("GBK", "ignore") f.write(sent.encode("GBK", "ignore") + "\n") tokens = [token.lower() for token in word_tokenize(sent)] for token in tokens: if token == 'c': # print token pass doc.extend(tokens) docs.append(doc) f.close() terms = [ "javascript", "jquery", "html", "css", "java", "python", "ruby", "mysql", "jdbc", "cpp" ] # terms=["javascript", "jquery", "html", "css", "java", "jsp", "python", "ruby", "ror" ] # terms=["java","jdbc","spring","hibernate","mysql","oracle"] matrix = getDistanceMatrix(docs, terms) printDisMatrix(terms, matrix) matrix_dump = json.dumps(matrix) print matrix_dump