def getSentsByOntology(): owlfile = "..\..\jobaly\ontology\web_dev.owl" ontology = OntologyLib(owlfile) terms = [ " "+ x.lower()+" " for x in ontology.getLabelList()] terms.extend([" "+x.lower()+" " for x in ontology.getAllClassNames()]) srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test") newCol = srcBbClient.getCollection("daily_job_webdev") collection = newCol matchingSents = [] for job in collection.find(): # print "\n\n\n======",job["_id"],"============================\n" jobDesc = JobDescParser.parseJobDesc(job) sents = jobDesc.listAllSentences() jid = job["_id"] for sent in sents: c = 0 sent = " "+sent.lower()+" " for term in terms: if sent.find(term) != -1: c+=1 if c==3 : print sent.encode("GBK", "ignore") matchingSents.append((jid, sent)) break sortedsents = sorted(matchingSents, key=lambda x: len(x[1]) ) dumpTwo(sortedsents, "term3" , ( lambda x: x[0] + ":" + x[1] ) )
def getDisMatrixFromColletion(): srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test") collection = srcBbClient.getCollection("daily_job_webdev") f = open('sents.txt','w') # python will convert \n to os.linesep docs = [] for job in collection.find(): # print "\n\n\n======",job["_id"],"============================\n" # f.write(job["summary"].encode("GBK", "ignore")+"\n") jobDesc = JobDescParser.parseJobDesc(job) sents = jobDesc.listAllSentences() doc =[] for sent in sents: # print sent.encode("GBK", "ignore") f.write(sent.encode("GBK", "ignore")+"\n") tokens = [ token.lower() for token in word_tokenize(sent)] for token in tokens: if token == 'c': # print token pass doc.extend(tokens) docs.append(doc) f.close() terms=["javascript", "jquery", "html", "css", "java", "python", "ruby", "mysql", "jdbc" , "cpp" ] # terms=["javascript", "jquery", "html", "css", "java", "jsp", "python", "ruby", "ror" ] # terms=["java","jdbc","spring","hibernate","mysql","oracle"] matrix = getDistanceMatrix(docs, terms) printDisMatrix(terms, matrix) matrix_dump = json.dumps(matrix) print matrix_dump
def getAllSentsInColl(collection): allSents = [] for job in collection.find(): print "\n\n\n======",job["_id"],"============================\n" jobDesc = JobDescParser.parseJobDesc(job) sents = [ ( jobDesc._id, sent ) for sent in jobDesc.listAllSentences() ] allSents.extend(sents) return allSents
def preprocess( job ):
    """Parse *job* into sentences and return each one run through
    processLine."""
    parsed = JobDescParser.parseJobDesc(job)
    return [processLine(sentence) for sentence in parsed.listAllSentences()]
def getAllSentsInColl(collection): allSents = [] for job in collection.find(): print "\n\n\n======", job["_id"], "============================\n" jobDesc = JobDescParser.parseJobDesc(job) sents = [(jobDesc._id, sent) for sent in jobDesc.listAllSentences()] allSents.extend(sents) return allSents
def getSentenceByTerm(collection, term, outputPath): matchingSents = [] for job in collection.find(): # print "\n\n\n======",job["_id"],"============================\n" jobDesc = JobDescParser.parseJobDesc(job) sents = jobDesc.listAllSentences() jid = job["_id"] for sent in sents: tokens = [ token.lower() for token in word_tokenize(sent)] if term in tokens : matchingSents.append((jid, sent)) print sent.encode("GBK", "ignore") sortedsents = sorted(matchingSents, key=lambda x: len(x[1]) ) dumpTwo(sortedsents, outputPath , ( lambda x: x[0] + ":" + x[1] ) )
def getJavaScipt():
    """Dump all sentences mentioning "javascript" from daily_job_webdev.

    NOTE(review): the name typo ("Scipt") is kept for caller compatibility.
    """
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    collection = srcBbClient.getCollection("daily_job_webdev")
    # Consistency: the original duplicated getSentenceByTerm's body
    # verbatim; delegate to it instead.
    getSentenceByTerm(collection, "javascript", "..\skill\output\javascript")
def createDocs(max_jobs=99999):
    """Tokenize jobs from daily_job_webdev into one token list per job.

    max_jobs -- stop after processing this many jobs (default 99999, the
                original hard-coded cap, now a parameter).

    Returns a list of documents, each a list of lowercased tokens.
    """
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    collection = srcBbClient.getCollection("daily_job_webdev")
    docs = []
    count = 0
    for job in collection.find():
        count += 1
        # Fix: the original tested `i == maxnum` after incrementing, which
        # broke BEFORE processing the maxnum-th job (processed maxnum-1).
        # `>` processes exactly max_jobs jobs.
        if count > max_jobs:
            break
        jobDesc = JobDescParser.parseJobDesc(job)
        doc = []
        for sent in jobDesc.listAllSentences():
            doc.extend(token.lower() for token in word_tokenize(sent))
        docs.append(doc)
    return docs
def getDisMatrixFromColletion(): srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test") collection = srcBbClient.getCollection("daily_job_webdev") f = open('sents.txt', 'w') # python will convert \n to os.linesep docs = [] for job in collection.find(): # print "\n\n\n======",job["_id"],"============================\n" # f.write(job["summary"].encode("GBK", "ignore")+"\n") jobDesc = JobDescParser.parseJobDesc(job) sents = jobDesc.listAllSentences() doc = [] for sent in sents: # print sent.encode("GBK", "ignore") f.write(sent.encode("GBK", "ignore") + "\n") tokens = [token.lower() for token in word_tokenize(sent)] for token in tokens: if token == 'c': # print token pass doc.extend(tokens) docs.append(doc) f.close() terms = [ "javascript", "jquery", "html", "css", "java", "python", "ruby", "mysql", "jdbc", "cpp" ] # terms=["javascript", "jquery", "html", "css", "java", "jsp", "python", "ruby", "ror" ] # terms=["java","jdbc","spring","hibernate","mysql","oracle"] matrix = getDistanceMatrix(docs, terms) printDisMatrix(terms, matrix) matrix_dump = json.dumps(matrix) print matrix_dump