def get_skill(self, jobname="python", num=5):
    """Pick skill keywords for a job from the keywords of its 'demand' text.

    Every cleaned demand line (``self.clear_jd``) is segmented with
    ``jieba.cut``; each token goes through ``strQ2B`` (presumably full-width
    to half-width normalisation -- confirm) and ``lower()`` and is counted
    when it is in ``self.skill_words``.  The ``int(num * log(num))`` most
    frequent words form the candidate pool, and skill words found in the job
    name itself are pushed to the front of that pool.

    Args:
        jobname: key into ``self.jd_database``.
        num: requested number of skills.

    Returns:
        list: the first three candidates, followed by up to ``num - 3``
        randomly chosen remaining candidates.

    Raises:
        KeyError: if ``jobname`` is not a key of ``self.jd_database``.
    """
    counts = {}
    for line in self.clear_jd(self.jd_database[jobname]['demand']):
        for word in jieba.cut(line):
            word = strQ2B(word).lower()
            if word in self.skill_words:
                counts[word] = counts.get(word, 0) + 1
    ranked = sorted(counts.items(), key=lambda x: x[1], reverse=True)

    # log() is undefined for num <= 0 and is 0 for num == 1.
    pool_size = int(num * np.log(num)) if num > 1 else 0
    res = [word for word, _ in ranked[:pool_size]]
    print('key_words:')
    print('\n'.join(res))

    for word in jieba.cut(jobname):
        word = strQ2B(word).lower()
        if word in self.skill_words and word not in res:
            res.insert(0, word)

    head = res[:3]
    after_top3 = res[3:]
    np.random.shuffle(after_top3)
    # Clamp at 0: a negative stop would slice from the end of the list and
    # return almost the whole shuffled tail when num < 3.
    return head + after_top3[:max(num - 3, 0)]
def get_skill(self, jobname="python", num=5):
    """Pick skill keywords for a job from the keywords of its 'demand' text.

    Every cleaned demand line (``self.clear_jd``) is segmented with
    ``jieba.cut``; each token goes through ``strQ2B`` (presumably full-width
    to half-width normalisation -- confirm) and ``lower()`` and is counted
    when it is in ``self.skill_words``.  The ``int(num * log(num))`` most
    frequent words form the candidate pool, and skill words found in the job
    name itself are pushed to the front of that pool.

    Args:
        jobname: key into ``self.jd_database``.
        num: requested number of skills.

    Returns:
        list: the first three candidates, followed by up to ``num - 3``
        randomly chosen remaining candidates.

    Raises:
        KeyError: if ``jobname`` is not a key of ``self.jd_database``.
    """
    counts = {}
    for line in self.clear_jd(self.jd_database[jobname]['demand']):
        for word in jieba.cut(line):
            word = strQ2B(word).lower()
            if word in self.skill_words:
                counts[word] = counts.get(word, 0) + 1
    ranked = sorted(counts.items(), key=lambda x: x[1], reverse=True)

    # log() is undefined for num <= 0 and is 0 for num == 1.
    pool_size = int(num * np.log(num)) if num > 1 else 0
    res = [word for word, _ in ranked[:pool_size]]
    print('key_words:')
    print('\n'.join(res))

    for word in jieba.cut(jobname):
        word = strQ2B(word).lower()
        if word in self.skill_words and word not in res:
            res.insert(0, word)

    head = res[:3]
    after_top3 = res[3:]
    np.random.shuffle(after_top3)
    # Clamp at 0: a negative stop would slice from the end of the list and
    # return almost the whole shuffled tail when num < 3.
    return head + after_top3[:max(num - 3, 0)]
def _extract_from_dataframe(self, df, ignore=(), remap=None, special_handler=None):
    """Copy column values from ``df`` onto the matching objects in ``self.stocks``.

    Each row is matched to a stock through its ``'code'`` column.  For every
    other column the value is written to the stock attribute of the same
    name (or the name given by ``remap``).  String values are passed through
    ``util.strQ2B`` and stripped of spaces; float values are rounded to two
    decimals.

    Args:
        df: pandas ``DataFrame`` with a ``'code'`` column.  Anything else is
            logged as an error and ignored.
        ignore: column names to skip.
        remap: optional mapping ``column name -> attribute name``.
        special_handler: optional mapping ``column name -> callable``; the
            callable is invoked as ``handler(stock, df, value)`` instead of
            the default attribute assignment.

    Returns:
        None.
    """
    if df is None or not isinstance(df, DataFrame):
        logging.error('cannot get data or wrong data -> %s!', df)
        return
    remap = {} if remap is None else remap
    special_handler = {} if special_handler is None else special_handler

    for _, row in df.iterrows():
        code = row['code']
        if code not in self.stocks:
            logging.warning('stock %s missed?', code)
            continue
        stock = self.stocks[code]

        for col_name in df.columns:
            if col_name == 'code' or col_name in ignore:
                continue
            if col_name in special_handler:
                special_handler[col_name](stock, df, row[col_name])
                continue

            # A missing or empty mapping falls back to the column name.
            real_field = remap.get(col_name) or col_name
            if not hasattr(stock, real_field):
                logging.warning('stock obj has no attribute %s, skip', col_name)
                continue

            # Explicit branches instead of `cond and a or b`: that idiom drops
            # falsy results, so 0.004 (rounds to 0.0) was stored unrounded and
            # an all-space string was stored uncleaned.
            new = row[col_name]
            if isinstance(new, str):
                new = util.strQ2B(new).replace(' ', '')
            if isinstance(new, float):
                new = round(new, 2)
            setattr(stock, real_field, new)
def get_jd_with_textrank(self, jobname='python', duty_num=4, demand_num=5, skill_num=6):
    """Build a job description for the closest known job name using the TextRank helpers.

    Args:
        jobname: free-form job name; resolved with ``self.get_closet_jobname``.
        duty_num: number of duty sentences to extract.
        demand_num: number of demand sentences to extract.
        skill_num: number of skill keywords to extract.
            The three counts may be ints or numeric strings.

    Returns:
        OrderedDict with keys ``'duty'`` (sentences sorted ascending by
        ``self.duty_score``), ``'demand'`` (sorted ascending by
        ``self.demand_score``) and ``'skill'`` (keywords from
        ``self.tk4words``).
    """
    jobname = self.get_closet_jobname(jobname)
    # str() first so the integer defaults work; calling .decode() directly on
    # an int raises AttributeError.
    duty_num, demand_num, skill_num = [
        int(strQ2B(str(x).decode('utf-8')))
        for x in (duty_num, demand_num, skill_num)
    ]
    entry = self.jd_database[jobname]
    res = OrderedDict()

    self.tk4sents.train(text='\n'.join(self.clear_jd(entry['duty'])))
    duty = self.tk4sents.get_key_sentences(duty_num)
    # key= gives the same ascending order as cmp=(score(x) - score(y)) but
    # does not require the scores to be ints.
    res['duty'] = sorted(duty, key=self.duty_score)

    demand_text = '\n'.join(self.clear_jd(entry['demand']))
    self.tk4sents.train(text=demand_text)
    demand = self.tk4sents.get_key_sentences(demand_num)
    res['demand'] = sorted(demand, key=self.demand_score)

    self.tk4words.train(demand_text)
    res['skill'] = self.tk4words.get_keywords(skill_num, word_min_len=1)
    return res
def regular_inc_name(self):
    """Find company-name candidates in ``self.linelist`` and record them.

    First pass: for lines the classifier labels 'inc_name', 'other' or
    'job_name' (and that do not match START_DUTY / START_DEMAND), take the
    first ``self.INCNAME`` match shorter than 26 characters.
    Second pass (only when the first found nothing): segment lines labelled
    'other' with jieba and keep tokens that are in ``self.firmnames``,
    stopping at the first line that yields any.

    The candidates are then filtered by length and a blacklist regex, joined
    with ' / ', passed through ``strQ2B`` and stored in
    ``self.result["inc_name"]``.

    Returns:
        The filtered candidates (a list under Python 2, where ``filter``
        returns a list).
    """
    # NOTE(review): the nesting of the `break` statements below was
    # reconstructed from flattened source -- confirm against the original.
    res = set()
    for line in self.linelist:
        if self.clf.predict(line) in ['inc_name', 'other', 'job_name']:
            # Lines that open the duty/demand sections are not company names.
            if self.START_DUTY.search(line) or self.START_DEMAND.search(
                    line):
                continue
            findinc = self.INCNAME.search(line)
            if findinc:
                if len(findinc.group()) < 26:
                    res.add(findinc.group().replace("招聘", ""))
                    break
    if not res:
        # Fallback: dictionary lookup of known firm names.
        for line in self.linelist:
            if self.START_DUTY.search(line) or self.START_DEMAND.search(
                    line):
                continue
            if self.clf.predict(line) != 'other':
                continue
            for item in jieba.cut(line):
                if item in self.firmnames:
                    res.add(item)
            if len(res) > 0:
                break
    # Drop one-character candidates and anything matching the blacklist.
    res = filter(
        lambda x: len(x) > 1 and not re.search(
            u'^\d|参与|负责|协助|的|[,。::!?,薪]|招聘|诚聘', x), list(res))
    self.result["inc_name"] = strQ2B(' / '.join(res))
    return res
def regulate_project_id(project_id_string):
    """Return a normalised form of a project id string.

    The string is upper-cased, a fixed list of bracket substitutions is
    applied in order, and the result is passed through ``util.strQ2B``.
    """
    # NOTE(review): as written, the first two pairs map a character to itself.
    substitutions = (
        (u"(", u"("),
        (u")", u")"),
        (u"[", u"【"),
        (u"]", u"】"),
    )
    normalized = project_id_string.upper()
    for old, new in substitutions:
        normalized = normalized.replace(old, new)
    return util.strQ2B(normalized)
def get_closet_jobname(self, jobname='java'):
    """Return the key of ``self.jd_database`` closest to ``jobname``.

    ``jobname`` is passed through ``strQ2B`` and lower-cased, then compared
    with every database key via ``self.simhash_distance``.  The five nearest
    candidates are printed, and the nearest one is returned.
    """
    jobname = strQ2B(jobname).lower()
    scored = []
    for candidate in self.jd_database.keys():
        scored.append((self.simhash_distance(jobname, candidate), candidate))
    # Sort on the distance only, so ties keep their original order.
    scored.sort(key=lambda pair: pair[0])
    for distance, name in scored[:5]:
        print('%s %s %s' % ('jobname', distance, name))
    nearest = scored[0]
    return nearest[1]
def get_top_jobname(self, jobname):
    """Map ``jobname`` onto a key of ``self.jd_skills_db``.

    The name is passed through ``strQ2B`` and lower-cased.  An exact hit is
    returned as-is; otherwise the key with the smallest ``leven_distance``
    is returned, with ties broken by the ordering of the keys themselves.
    """
    normalized = strQ2B(jobname).lower()
    known = self.jd_skills_db
    if normalized in known:
        return normalized
    ranked = sorted((leven_distance(name, normalized), name) for name in known)
    return ranked[0][1]
def get_top_jobname(self, jobname):
    """Map ``jobname`` onto a key of ``self.jd_skills_db``.

    The name is passed through ``strQ2B`` and lower-cased.  An exact hit is
    returned as-is; otherwise the key with the smallest ``leven_distance``
    is returned, with ties broken by the ordering of the keys themselves.
    """
    normalized = strQ2B(jobname).lower()
    known = self.jd_skills_db
    if normalized in known:
        return normalized
    ranked = sorted((leven_distance(name, normalized), name) for name in known)
    return ranked[0][1]
def test_case():
    """Run ``extract_ne`` on the sample inputs with the module-level ``debug`` flag on."""
    global debug
    debug = True
    samples = (
        u"北京25中学校长",
    )
    for sample in samples:
        extract_ne(strQ2B(sample))
def get_closet_jobname(self, jobname='java'):
    """Return the key of ``self.jd_database`` closest to ``jobname``.

    ``jobname`` is passed through ``strQ2B`` and lower-cased, then compared
    with every database key via ``self.simhash_distance``.  The five nearest
    candidates are printed, and the nearest one is returned.
    """
    jobname = strQ2B(jobname).lower()
    scored = []
    for candidate in self.jd_database.keys():
        scored.append((self.simhash_distance(jobname, candidate), candidate))
    # Sort on the distance only, so ties keep their original order.
    scored.sort(key=lambda pair: pair[0])
    for distance, name in scored[:5]:
        print('%s %s %s' % ('jobname', distance, name))
    nearest = scored[0]
    return nearest[1]
def get_jd_with_kmeans(self, jobname='python', duty_num=4, demand_num=5, skill_num=6):
    """Assemble duty, demand and skill sections for the closest known job name.

    The three counts are coerced to ``int`` after a ``str`` / ``strQ2B``
    round-trip, so numeric strings are accepted as well as ints.

    Returns:
        OrderedDict with keys 'duty', 'demand', 'skill1' and 'skill2', filled
        from ``get_duty``, ``get_demand``, ``get_skill`` and ``get_skill2``.
    """
    matched = self.get_closet_jobname(jobname)
    counts = []
    for raw in (duty_num, demand_num, skill_num):
        counts.append(int(strQ2B(str(raw).decode('utf-8'))))
    duty_num, demand_num, skill_num = counts

    res = OrderedDict()
    res['duty'] = self.get_duty(matched, duty_num)
    res['demand'] = self.get_demand(matched, demand_num)
    res['skill1'] = self.get_skill(matched, skill_num)
    res['skill2'] = self.get_skill2(matched, skill_num)
    return res
def train2(self):
    """Build a per-job skill list from clustered job names and dump it as JSON.

    For every cluster in ``self.jd_cluster`` the skill words (members of
    ``self.skills_words``) in the 'demand' lines of all member jobs are
    counted together.  Each member job present in ``self.jd_database`` then
    receives the cluster's 100 most frequent skill words, with skill words
    from its own name inserted at the front.  The mapping is written to
    './data/jd_skills_db.json'.

    Progress is printed every 200 clusters, and a summary at the end.
    """
    res = {}
    i = 0
    for label in self.jd_cluster:
        i += 1
        counts = {}
        add_jobname = []
        for jobname in self.jd_cluster[label]:
            if jobname not in self.jd_database:
                continue
            add_jobname.append(jobname)
            for line in self.jd_database[jobname]['demand']:
                for word in jieba.cut(line):
                    # Lower-case like the job-name tokens below, so that
                    # mixed-case tokens are not silently dropped.
                    word = strQ2B(word).lower()
                    if word in self.skills_words:
                        counts[word] = counts.get(word, 0) + 1
        ranked = sorted(counts.items(), key=lambda x: x[1], reverse=True)
        top_words = [word for word, _ in ranked[:100]]

        for tmp_jobname in add_jobname:
            res.setdefault(tmp_jobname, []).extend(top_words)
            for word in jieba.cut(tmp_jobname):
                word = strQ2B(word).lower()
                if word in self.skills_words and word not in res[tmp_jobname]:
                    res[tmp_jobname].insert(0, word)
        if i % 200 == 0:
            print(i)

    print('origin %s' % len(self.jd_database.keys()))
    print('%s done' % i)
    print('len(res) %s' % len(res))
    # Context manager so the file is flushed and closed even if dump() fails.
    with open('./data/jd_skills_db.json', 'wb') as out:
        json.dump(res, out)
def regular_skill(self, num=6):
    """Extract skill words from the demand lines and store them in ``self.result['skill']``.

    Lines that match ``self.DEMAND`` or that the classifier labels 'demand'
    are segmented with jieba; tokens (after ``strQ2B`` and ``lower()``) that
    are in ``self.skills`` are counted.  The ``2 * num`` most common words
    form the candidate list, and skill words found in
    ``self.result['job_name']`` are inserted at the front.  The stored result
    is the first three candidates plus up to ``num - 3`` randomly chosen
    others.

    Args:
        num: requested number of skills.

    Raises:
        KeyError: if ``self.result`` has no 'job_name' entry yet.
    """
    hits = []
    for line in self.linelist:
        if self.DEMAND.search(line) or self.clf.predict(line) == 'demand':
            for word in jieba.cut(line):
                word = strQ2B(word).lower()
                if word in self.skills:
                    hits.append(word)
    sorted_words = [word for word, _ in Counter(hits).most_common(2 * num)]

    for word in jieba.cut(self.result['job_name']):
        word = strQ2B(word).lower()
        if word in self.skills and word not in sorted_words:
            sorted_words.insert(0, word)

    head = sorted_words[:3]
    after_top3 = sorted_words[3:]
    np.random.shuffle(after_top3)
    # Clamp at 0: a negative stop would slice from the end of the list and
    # return almost the whole shuffled tail when num < 3.
    self.result['skill'] = head + after_top3[:max(num - 3, 0)]
def train2(self):
    """Build a per-job skill list from clustered job names and dump it as JSON.

    For every cluster in ``self.jd_cluster`` the skill words (members of
    ``self.skills_words``) in the 'demand' lines of all member jobs are
    counted together.  Each member job present in ``self.jd_database`` then
    receives the cluster's 100 most frequent skill words, with skill words
    from its own name inserted at the front.  The mapping is written to
    './data/jd_skills_db.json'.

    Progress is printed every 200 clusters, and a summary at the end.
    """
    res = {}
    i = 0
    for label in self.jd_cluster:
        i += 1
        counts = {}
        add_jobname = []
        for jobname in self.jd_cluster[label]:
            if jobname not in self.jd_database:
                continue
            add_jobname.append(jobname)
            for line in self.jd_database[jobname]['demand']:
                for word in jieba.cut(line):
                    # Lower-case like the job-name tokens below, so that
                    # mixed-case tokens are not silently dropped.
                    word = strQ2B(word).lower()
                    if word in self.skills_words:
                        counts[word] = counts.get(word, 0) + 1
        ranked = sorted(counts.items(), key=lambda x: x[1], reverse=True)
        top_words = [word for word, _ in ranked[:100]]

        for tmp_jobname in add_jobname:
            res.setdefault(tmp_jobname, []).extend(top_words)
            for word in jieba.cut(tmp_jobname):
                word = strQ2B(word).lower()
                if word in self.skills_words and word not in res[tmp_jobname]:
                    res[tmp_jobname].insert(0, word)
        if i % 200 == 0:
            print(i)

    print('origin %s' % len(self.jd_database.keys()))
    print('%s done' % i)
    print('len(res) %s' % len(res))
    # Context manager so the file is flushed and closed even if dump() fails.
    with open('./data/jd_skills_db.json', 'wb') as out:
        json.dump(res, out)
def mapper():
    """Read one record per line from stdin and print its filtered tokens, tab-separated.

    Each line is evaluated to a dict.  Its 'content' value (default '') goes
    through ``strQ2B`` and ``chinese_word_filter``, is segmented with jieba,
    filtered by ``stop_word_filter`` and printed as one tab-joined line.
    A record that fails is reported with a traceback and skipped.
    """
    for line in sys.stdin:
        try:
            # NOTE(review): eval() executes arbitrary expressions from stdin.
            # If the input is not fully trusted, switch to a literal-only
            # parser (ast.literal_eval / json.loads) after confirming the
            # record format.
            dic = eval(line)
            content = dic.get('content', '')
            words = list(jieba.cut(chinese_word_filter(strQ2B(content))))
            print('\t'.join(stop_word_filter(words)))
        except Exception:
            # Best-effort per record, but let KeyboardInterrupt / SystemExit
            # stop the job, which a bare `except:` would swallow.
            traceback.print_exc()
def regular_jobname(self):
    """Extract job-title candidates from the posting and record them.

    Strategies are tried in order, each only when the previous ones found
    nothing:
      1. Look for an explicit label in ``self.jdStr`` and take the first
         short (2-19 chars) following line that is not boilerplate.
      2. Scan ``self.linelist`` for a short line classified as 'job_name',
         or for a ``self.JOBNAME`` regex match.
      3. Segment each line with jieba and take the first token found in
         ``self.jobdic``; this path stores the result and returns
         immediately, without the ``strQ2B`` / ``lower()`` step.
      4. Fall back to an intern / part-time keyword in ``self.jdStr``.

    The candidates are joined with ' / ' and stored in
    ``self.result["job_name"]``.

    Returns:
        set of candidate job titles (possibly empty).
    """
    # NOTE(review): block nesting (in particular the `break` / `return`
    # placement) was reconstructed from flattened source -- confirm.
    res = set()
    jdStr = self.jdStr
    findpos = re.search(u"(招聘岗位|招聘职位|职位名称|岗位名称|岗位[一二三四五六七八九])[:、:\s ]", jdStr)
    # if not findpos:
    #     findpos = re.search(u"(职位类别|职位职能)[::\s ]",jdStr)
    if findpos:
        # Only the text after the matched label is considered.
        pos = findpos.span()[1]
        linelist = jdStr[pos:].split("\n")
        for line in linelist:
            if len(line) < 2:
                continue
            if len(line) >= 2 and len(line) < 20:
                # Skip boilerplate lines.
                if re.search(u"职位描述|查看|地址|工作|分享|举报|下一条|时间|福利|待遇|周末|双休", line):
                    continue
                res.add(re.sub(u"聘请|高薪诚聘|诚聘|[,。、\d!]+", "", line.strip()))
                break
    # If no explicit job title was matched above, scan the lines instead.
    if not res:
        for line in self.linelist:
            if re.search(u"招聘|高薪|诚聘", line):
                continue
            if len(line) < 6 and not re.search(
                    u'岗位|岗位内容|工作内容|职责|任职|资格', line) and self.clf.predict(line) == 'job_name':
                res.add(line)
                break
            findPos = self.JOBNAME.search(line)
            if findPos and len(findPos.group()) < 20 and not re.match(
                    u'\d', findPos.group()):
                jobname = findPos.group()
                res.add(re.sub(u"聘请|高薪诚聘|诚聘|急.|[,。、!]+", "", jobname))
                break
            # res.add(re.sub(u"\(.+\)|(.+)|【.+】|[,。、\s\d]+|聘请|高薪诚聘|诚聘|急招|","",line.strip()))
    if not res:
        # Dictionary lookup on segmented, lower-cased lines.
        for line in self.linelist:
            for word in jieba.cut(line.lower()):
                if word in self.jobdic:
                    res.add(word)
                    self.result["job_name"] = " / ".join(res)
                    return res
    if not res:
        tag = re.search(u"实习生|兼职", self.jdStr)
        if tag:
            res.add(tag.group())
    self.result["job_name"] = strQ2B(" / ".join(res)).lower()
    return res
def get_jd_with_kmeans(self, jobname='python', duty_num=4, demand_num=5, skill_num=6):
    """Assemble duty, demand and skill sections for the closest known job name.

    The three counts are coerced to ``int`` after a ``str`` / ``strQ2B``
    round-trip, so numeric strings are accepted as well as ints.

    Returns:
        OrderedDict with keys 'duty', 'demand', 'skill1' and 'skill2', filled
        from ``get_duty``, ``get_demand``, ``get_skill`` and ``get_skill2``.
    """
    matched = self.get_closet_jobname(jobname)
    counts = []
    for raw in (duty_num, demand_num, skill_num):
        counts.append(int(strQ2B(str(raw).decode('utf-8'))))
    duty_num, demand_num, skill_num = counts

    res = OrderedDict()
    res['duty'] = self.get_duty(matched, duty_num)
    res['demand'] = self.get_demand(matched, demand_num)
    res['skill1'] = self.get_skill(matched, skill_num)
    res['skill2'] = self.get_skill2(matched, skill_num)
    return res
def _init_stock_objs(self):
    """Create or refresh the ``Stock`` objects in ``self.stocks`` from ``ts.get_stock_basics()``.

    For every row, an existing stock with that index is reused or a new
    ``Stock(code=index)`` is created.  Only a fixed set of columns is copied
    onto it; string values go through ``util.strQ2B`` and have spaces
    removed.  The stock is then stored under ``stock.code``.
    """
    trusted_columns = ('name', 'industry', 'area', 'timeToMarket')

    logging.info('getting stock list from tushare')
    basics = ts.get_stock_basics()
    logging.info('tushare listed %d stocks' % basics.index.size)

    for code, record in basics.iterrows():
        if code in self.stocks:
            stock = self.stocks[code]
        else:
            stock = Stock(code=code)

        for col_name in basics.columns:
            # we only trust these data
            if col_name not in trusted_columns:
                continue
            if hasattr(stock, col_name):
                value = record[col_name]
                if isinstance(value, str):
                    value = util.strQ2B(value).replace(' ', '')
                setattr(stock, col_name, value)
            else:
                logging.warning('Stock obj has no attribute %s, skip' % col_name)

        self.stocks[stock.code] = stock
def train(self, fname="./data/jd_skills_db.json"):
    """Build the per-job skill list and write it to ``fname`` as JSON.

    For every job in ``self.jd_database`` the tokens of its 'demand' lines
    (jieba-segmented, ``strQ2B``-normalised, lower-cased) that are in
    ``self.skills_words`` are counted.  Of the 100 most frequent, those
    longer than one character are kept, most frequent first.

    Job names longer than 8 or shorter than 3 characters are printed, as is
    the final number of jobs.

    Args:
        fname: output path of the JSON file.
    """
    res = {}
    for jobname in self.jd_database:
        counts = {}
        for line in self.jd_database[jobname]['demand']:
            for word in jieba.cut(line):
                word = strQ2B(word).lower()
                if word in self.skills_words:
                    counts[word] = counts.get(word, 0) + 1
        ranked = sorted(counts.items(), key=lambda x: x[1], reverse=True)
        res[jobname] = [word for word, _ in ranked[:100] if len(word) > 1]
        if len(jobname) > 8 or len(jobname) < 3:
            print(jobname)
    print(len(res))
    # Context manager so the file is flushed and closed even if dump() fails.
    with open(fname, 'wb') as out:
        json.dump(res, out, ensure_ascii=False)
def get_jd_with_textrank(self, jobname='python', duty_num=4, demand_num=5, skill_num=6):
    """Build a job description for the closest known job name using the TextRank helpers.

    Args:
        jobname: free-form job name; resolved with ``self.get_closet_jobname``.
        duty_num: number of duty sentences to extract.
        demand_num: number of demand sentences to extract.
        skill_num: number of skill keywords to extract.
            The three counts may be ints or numeric strings.

    Returns:
        OrderedDict with keys ``'duty'`` (sentences sorted ascending by
        ``self.duty_score``), ``'demand'`` (sorted ascending by
        ``self.demand_score``) and ``'skill'`` (keywords from
        ``self.tk4words``).
    """
    jobname = self.get_closet_jobname(jobname)
    # str() first so the integer defaults work; calling .decode() directly on
    # an int raises AttributeError.
    duty_num, demand_num, skill_num = [
        int(strQ2B(str(x).decode('utf-8')))
        for x in (duty_num, demand_num, skill_num)
    ]
    entry = self.jd_database[jobname]
    res = OrderedDict()

    self.tk4sents.train(text='\n'.join(self.clear_jd(entry['duty'])))
    duty = self.tk4sents.get_key_sentences(duty_num)
    # key= gives the same ascending order as cmp=(score(x) - score(y)) but
    # does not require the scores to be ints.
    res['duty'] = sorted(duty, key=self.duty_score)

    demand_text = '\n'.join(self.clear_jd(entry['demand']))
    self.tk4sents.train(text=demand_text)
    demand = self.tk4sents.get_key_sentences(demand_num)
    res['demand'] = sorted(demand, key=self.demand_score)

    self.tk4words.train(demand_text)
    res['skill'] = self.tk4words.get_keywords(skill_num, word_min_len=1)
    return res
def get_skill2(self, jobname="java", num=6):
    """Pick skills straight from the prebuilt skill lexicon.

    Takes the first ``int(num * log(num))`` entries of
    ``self.skill_db[jobname]``, pushes skill words found in the job name to
    the front, then keeps the first three plus a random ``num - 3`` of the
    rest.

    Args:
        jobname: key into ``self.skill_db``.
        num: requested number of skills.

    Returns:
        list: the selected skill words, sorted.

    Raises:
        KeyError: if ``jobname`` is not a key of ``self.skill_db``.
    """
    key_words = self.skill_db[jobname]
    # log() is undefined for num <= 0 and is 0 for num == 1.
    pool_size = int(num * np.log(num)) if num > 1 else 0
    res = list(key_words[:pool_size])

    for word in jieba.cut(jobname):
        word = strQ2B(word).lower()
        if word in self.skill_words and word not in res:
            res.insert(0, word)

    head = res[:3]
    after_top3 = res[3:]
    np.random.shuffle(after_top3)
    # Clamp at 0: a negative stop would slice from the end of the list and
    # return almost the whole shuffled tail when num < 3.
    selected = head + after_top3[:max(num - 3, 0)]
    selected.sort()
    return selected
def get_skill2(self, jobname="java", num=6):
    """Pick skills straight from the prebuilt skill lexicon.

    Takes the first ``int(num * log(num))`` entries of
    ``self.skill_db[jobname]``, pushes skill words found in the job name to
    the front, then keeps the first three plus a random ``num - 3`` of the
    rest.

    Args:
        jobname: key into ``self.skill_db``.
        num: requested number of skills.

    Returns:
        list: the selected skill words, sorted.

    Raises:
        KeyError: if ``jobname`` is not a key of ``self.skill_db``.
    """
    key_words = self.skill_db[jobname]
    # log() is undefined for num <= 0 and is 0 for num == 1.
    pool_size = int(num * np.log(num)) if num > 1 else 0
    res = list(key_words[:pool_size])

    for word in jieba.cut(jobname):
        word = strQ2B(word).lower()
        if word in self.skill_words and word not in res:
            res.insert(0, word)

    head = res[:3]
    after_top3 = res[3:]
    np.random.shuffle(after_top3)
    # Clamp at 0: a negative stop would slice from the end of the list and
    # return almost the whole shuffled tail when num < 3.
    selected = head + after_top3[:max(num - 3, 0)]
    selected.sort()
    return selected
def train(self, fname="./data/jd_skills_db.json"):
    """Build the per-job skill list and write it to ``fname`` as JSON.

    For every job in ``self.jd_database`` the tokens of its 'demand' lines
    (jieba-segmented, ``strQ2B``-normalised, lower-cased) that are in
    ``self.skills_words`` are counted.  Of the 100 most frequent, those
    longer than one character are kept, most frequent first.

    Job names longer than 8 or shorter than 3 characters are printed, as is
    the final number of jobs.

    Args:
        fname: output path of the JSON file.
    """
    res = {}
    for jobname in self.jd_database:
        counts = {}
        for line in self.jd_database[jobname]['demand']:
            for word in jieba.cut(line):
                word = strQ2B(word).lower()
                if word in self.skills_words:
                    counts[word] = counts.get(word, 0) + 1
        ranked = sorted(counts.items(), key=lambda x: x[1], reverse=True)
        res[jobname] = [word for word, _ in ranked[:100] if len(word) > 1]
        if len(jobname) > 8 or len(jobname) < 3:
            print(jobname)
    print(len(res))
    # Context manager so the file is flushed and closed even if dump() fails.
    with open(fname, 'wb') as out:
        json.dump(res, out, ensure_ascii=False)
def extract_ne_from_onto(fname, o):
    """Convert one ENAMEX-annotated file into 'token/tag' lines written to ``o``.

    The first line of the file is printed and skipped.  Every other line
    except '</DOC>' is decoded as UTF-8, passed through ``strQ2B``, rewritten
    by the two regex substitutions (``replace_exclude``, ``replace_include``)
    and split on whitespace.  Each token becomes:
      * ``text + tag`` for an '<ENAMEX' token, with the tag looked up in
        ``tagDict`` by the first quoted attribute value (default '/o');
      * ``token + '/o'`` for a token that does not start with '<'.
    Any other token is omitted, and the line plus its tokens are printed for
    inspection.

    Args:
        fname: path of the input file.
        o: writable file-like object that receives one UTF-8 line per input
            line.
    """
    # Context manager so the input file is closed even if a line fails.
    with open(fname, "r") as f:
        header = f.readline()
        print(header)
        print(fname)
        for line in f:
            sent = line.strip().decode("utf8")
            if sent == "</DOC>":
                continue
            sent = strQ2B(sent)
            sent = re.sub(r"<( /)?/? ([^<]+) >", replace_exclude, sent)
            sent = re.sub(r"<ENAMEX ([^<]+)</ENAMEX>", replace_include, sent)
            tokens = sent.split()
            tagged = []
            for s in tokens:
                if s[:7] == "<ENAMEX":
                    tag = tagDict.get(s.split("\"")[1], "/o")
                    text = s.split(">")[1].split("<")[0]
                    tagged.append(text + tag)
                elif s[0] != "<":
                    tagged.append(s + "/o")
                else:
                    # Unexpected markup: dump the line and its tokens.
                    print(line)
                    for p in tokens:
                        print(p)
            o.write(" ".join(tagged).encode("utf8") + "\n")
def dealData(filename):
    """Process the data file into parallel character and tag sequences.

    Each line of the UTF-8 file loses its first 23 characters and is split
    into sentences on "。/w ".  A sentence that is longer than 20 characters
    after trimming is run through ``deal_``, printed, normalised with
    ``util.strQ2B`` and converted by ``BMEWO``.

    :param filename: path of the input file.
    :return: ``(char4train, tag4train)``, two lists with one entry per kept
        sentence.
    """
    char4train, tag4train = [], []
    with open(filename, 'r', encoding="utf8") as source:
        for raw_line in source:
            # NOTE(review): presumably a fixed-width 23-character line
            # prefix -- confirm against the data format.
            payload = raw_line[23:]
            # split into sentences
            for sentence in payload.split("。/w "):
                # remove the trailing punctuation token
                sentence = sentence.strip()
                if sentence.endswith("/w"):
                    sentence = sentence[:-3]
                if len(sentence) <= 20:
                    continue
                sentence = deal_(sentence)
                print(sentence)
                sentence = util.strQ2B(sentence)
                chars, tags = BMEWO(sentence)
                char4train.append(chars)
                tag4train.append(tags)
    return char4train, tag4train
def simhash_distance(self, job1, job2):
    """Return the Simhash distance between two job names.

    Both names are decoded from UTF-8, passed through ``strQ2B`` and
    lower-cased before hashing.
    """
    def normalize(name):
        return strQ2B(name.decode('utf-8')).lower()

    first = normalize(job1)
    second = normalize(job2)
    return Simhash(first).distance(Simhash(second))
def simhash_distance(self, job1, job2):
    """Return the Simhash distance between two job names.

    Both names are decoded from UTF-8, passed through ``strQ2B`` and
    lower-cased before hashing.
    """
    def normalize(name):
        return strQ2B(name.decode('utf-8')).lower()

    first = normalize(job1)
    second = normalize(job2)
    return Simhash(first).distance(Simhash(second))