Exemple #1
0
    def get_skill(self,jobname="python",num=5):
        """
        从demand中关键词抽出相关技能短语
        """
        key_words = {}
        
        jd_skill = self.clear_jd(self.jd_database[jobname]['demand'])
        
        for line in jd_skill:
            for word in jieba.cut(line):
                word = strQ2B(word).lower()
                if word in self.skill_words:
                    key_words[word] = key_words.get(word,1)+1


        key_words = sorted(key_words.iteritems(),key=lambda x:x[1],reverse=True)
        
        res = [ w[0] for w in key_words[:int(num*np.log(num))]]
        
        print 'key_words:'
        print '\n'.join(res)

        for word in jieba.cut(jobname):
            word = strQ2B(word).lower()
            if word in self.skill_words and word not in res:
                res.insert(0,word)

        after_top3 = res[3:]
        np.random.shuffle(after_top3)

        return res[:3]+after_top3[:num-3]
Exemple #2
0
    def get_skill(self, jobname="python", num=5):
        """
        从demand中关键词抽出相关技能短语
        """
        key_words = {}

        jd_skill = self.clear_jd(self.jd_database[jobname]['demand'])

        for line in jd_skill:
            for word in jieba.cut(line):
                word = strQ2B(word).lower()
                if word in self.skill_words:
                    key_words[word] = key_words.get(word, 1) + 1

        key_words = sorted(key_words.iteritems(),
                           key=lambda x: x[1],
                           reverse=True)

        res = [w[0] for w in key_words[:int(num * np.log(num))]]

        print 'key_words:'
        print '\n'.join(res)

        for word in jieba.cut(jobname):
            word = strQ2B(word).lower()
            if word in self.skill_words and word not in res:
                res.insert(0, word)

        after_top3 = res[3:]
        np.random.shuffle(after_top3)

        return res[:3] + after_top3[:num - 3]
Exemple #3
0
 def _extract_from_dataframe(self, df, ignore=(), remap={}, special_handler={}):
     if df is None or not isinstance(df, DataFrame):
         logging.error('cannot get data or wrong data -> %s!' % df)
         return
     for index, row in df.iterrows():
         code = row['code']
         if not code in self.stocks:
             logging.warning('stock %s missed?' % code)
             continue
         stock = self.stocks[code]
         for col_name in df.columns:
             if col_name == 'code' or col_name in ignore:
                 continue
             if col_name in special_handler:
                 special_handler[col_name](stock, df, row[col_name])
                 continue
             real_field = col_name in remap and remap[col_name] or col_name
             if not hasattr(stock, real_field):
                 logging.warning('stock obj has no attribute %s, skip' % col_name)
             else:
                 old = stock.__getattribute__(real_field)
                 new = isinstance(row[col_name], str) and util.strQ2B(row[col_name]).replace(' ', '') or row[col_name]
                 new = isinstance(new, float) and round(new, 2) or new
                 if old is not None and (isinstance(old, float) and not math.isnan(old)) and \
                    new is not None and (isinstance(new, float) and not math.isnan(new)) and \
                    old != new:
                     #logging.debug('field %s changed: old(%s) -> new(%s), %s' % (col_name, str(old), str(new), stock))
                     pass
                 stock.__setattr__(real_field, new)
Exemple #4
0
    def get_jd_with_textrank(self,
                             jobname='python',
                             duty_num=4,
                             demand_num=5,
                             skill_num=6):
        """
        Build a job description for the closest known job name using the
        TextRank helpers self.tk4sents (sentences) and self.tk4words (words).

        :param jobname: free-form job name; resolved via get_closet_jobname
        :param duty_num: number of key 'duty' sentences requested
        :param demand_num: number of key 'demand' sentences requested
        :param skill_num: number of skill keywords requested
                          (the three counts may be ints or numeric strings)
        :return: OrderedDict with keys 'duty', 'demand', 'skill'; duty and
                 demand sentences are ordered ascending by duty_score /
                 demand_score
        """

        def _to_int(value):
            # Integers (including the defaults) have no .decode(), so calling
            # it directly raised AttributeError. Byte strings and numbers go
            # through str() first; unicode is passed as-is so non-ASCII digits
            # are not forced through the implicit ASCII codec.
            if not isinstance(value, unicode):
                value = str(value).decode('utf-8')
            return int(strQ2B(value))

        jobname = self.get_closet_jobname(jobname)
        res = OrderedDict()

        duty_num, demand_num, skill_num = map(
            _to_int, [duty_num, demand_num, skill_num])

        self.tk4sents.train(
            text='\n'.join(self.clear_jd(self.jd_database[jobname]['duty'])))
        duty = self.tk4sents.get_key_sentences(duty_num)
        # key= gives the same ascending order as cmp=score(x)-score(y) but
        # also works when the score is a float (a cmp function must return int).
        res['duty'] = sorted(duty, key=self.duty_score)

        self.tk4sents.train(
            text='\n'.join(self.clear_jd(self.jd_database[jobname]['demand'])))
        demand = self.tk4sents.get_key_sentences(demand_num)
        res['demand'] = sorted(demand, key=self.demand_score)

        self.tk4words.train('\n'.join(
            self.clear_jd(self.jd_database[jobname]['demand'])))
        res['skill'] = self.tk4words.get_keywords(skill_num, word_min_len=1)
        #       res['skill_phrases'] = self.tk4words.get_keyphrases(skill_num*2,min_occur_num=2)
        return res
Exemple #5
0
    def regular_inc_name(self):
        """
        Collect company-name candidates from self.linelist.

        Pass 1 takes the first INCNAME regex hit shorter than 26 characters
        on a line classified as 'inc_name', 'other' or 'job_name' (lines that
        match START_DUTY / START_DEMAND are skipped). If nothing was found,
        pass 2 looks at lines classified as 'other' and keeps jieba tokens
        that are in self.firmnames, stopping at the first line that yields
        any. Candidates are then filtered by length and a reject pattern.

        The filtered names are joined with ' / ', passed through strQ2B and
        stored in self.result["inc_name"]; the filtered list is returned.
        """
        candidates = set()

        # Pass 1: regex hit on a plausibly labelled line.
        for line in self.linelist:
            if self.clf.predict(line) not in ['inc_name', 'other', 'job_name']:
                continue
            if self.START_DUTY.search(line) or self.START_DEMAND.search(line):
                continue
            hit = self.INCNAME.search(line)
            if hit and len(hit.group()) < 26:
                candidates.add(hit.group().replace("招聘", ""))
                break

        # Pass 2: dictionary lookup of tokens, only when pass 1 found nothing.
        if not candidates:
            for line in self.linelist:
                if self.START_DUTY.search(line) or self.START_DEMAND.search(line):
                    continue
                if self.clf.predict(line) != 'other':
                    continue
                candidates.update(
                    token for token in jieba.cut(line)
                    if token in self.firmnames)
                if candidates:
                    break

        reject = u'^\d|参与|负责|协助|的|[,。::!?,薪]|招聘|诚聘'
        res = [
            name for name in list(candidates)
            if len(name) > 1 and not re.search(reject, name)
        ]
        self.result["inc_name"] = strQ2B(' / '.join(res))
        return res
def regulate_project_id(project_id_string):
    """Return a normalized form of a project id string.

    The string is upper-cased, a fixed list of bracket substitutions is
    applied in order, and the result is passed through util.strQ2B.
    """
    # NOTE(review): as written in this file the first two pairs map a
    # character to itself; presumably one side was a full-width parenthesis
    # originally -- confirm against the upstream source.
    substitutions = (
        (u"(", u"("),
        (u")", u")"),
        (u"[", u"【"),
        (u"]", u"】"),
    )
    normalized = project_id_string.upper()
    for old, new in substitutions:
        normalized = normalized.replace(old, new)
    return util.strQ2B(normalized)
Exemple #7
0
    def get_closet_jobname(self,jobname='java'):
        jobname = strQ2B(jobname).lower()
        dis = [ (self.simhash_distance(jobname,other),other) for other in self.jd_database.keys() ]
        sorted_jobname = sorted(dis,key = lambda x:x[0])

        for k,v in sorted_jobname[:5]:
            print 'jobname',k,v
        return sorted_jobname[0][1]
Exemple #8
0
    def get_top_jobname(self,jobname):
        """
        Resolve `jobname` to a key of self.jd_skills_db.

        The name is passed through strQ2B and lower-cased; an exact key match
        is returned as-is, otherwise the key with the smallest
        (leven_distance, key) tuple is returned.
        """
        normalized = strQ2B(jobname).lower()
        if normalized not in self.jd_skills_db:
            ranked = sorted((leven_distance(known,normalized),known) for known in self.jd_skills_db)
            return ranked[0][1]
        return normalized
Exemple #9
0
    def get_top_jobname(self, jobname):
        """
        Map `jobname` onto a key of self.jd_skills_db.

        After strQ2B + lower-casing, an exact hit is returned directly;
        otherwise every key is scored with leven_distance and the key of the
        smallest (distance, key) pair wins.
        """
        wanted = strQ2B(jobname).lower()
        if wanted in self.jd_skills_db:
            return wanted

        scored = []
        for key in self.jd_skills_db:
            scored.append((leven_distance(key, wanted), key))
        closest = sorted(scored)[0]
        return closest[1]
Exemple #10
0
def test_case():
    """Switch the module-level `debug` flag on and run extract_ne over the
    built-in sample sentences (each passed through strQ2B first)."""
    global debug
    debug = True
    samples = [
        u"北京25中学校长",
    ]
    for sample in samples:
        extract_ne(strQ2B(sample))
Exemple #11
0
    def get_closet_jobname(self, jobname='java'):
        jobname = strQ2B(jobname).lower()
        dis = [(self.simhash_distance(jobname, other), other)
               for other in self.jd_database.keys()]
        sorted_jobname = sorted(dis, key=lambda x: x[0])

        for k, v in sorted_jobname[:5]:
            print 'jobname', k, v
        return sorted_jobname[0][1]
Exemple #12
0
 def get_jd_with_kmeans(self,jobname='python',duty_num=4,demand_num=5,skill_num=6):
     """
     Assemble a job description for the closest known job name.

     The three counts are converted with int(strQ2B(str(x).decode('utf-8')))
     before use. Returns an OrderedDict with the keys 'duty', 'demand',
     'skill1' and 'skill2', filled from get_duty, get_demand, get_skill and
     get_skill2 in that order.
     """
     jobname = self.get_closet_jobname(jobname)

     counts = [int(strQ2B(str(raw).decode('utf-8'))) for raw in (duty_num,demand_num,skill_num)]
     duty_num,demand_num,skill_num = counts

     return OrderedDict([
         ('duty', self.get_duty(jobname,duty_num)),
         ('demand', self.get_demand(jobname,demand_num)),
         ('skill1', self.get_skill(jobname,skill_num)),
         ('skill2', self.get_skill2(jobname,skill_num)),
     ])
Exemple #13
0
    def train2(self):
        res = {}
        i = 0
        for label in self.jd_cluster:
            i += 1
            tmp = {}
            add_jobname = []

            for jobname in self.jd_cluster[label]:
                if jobname in self.jd_database:
                    add_jobname.append(jobname)

                    for line in self.jd_database[jobname]['demand']:
                        for word in jieba.cut(line):
                            word = strQ2B(word)
                            if word not in self.skills_words: continue
                            if word not in tmp:
                                tmp[word] = 1
                            else:
                                tmp[word] += 1

            tmp = sorted(tmp.iteritems(), key=lambda x: x[1], reverse=True)

            for tmp_jobname in add_jobname:
                if tmp_jobname not in res:
                    res[tmp_jobname] = [w[0] for w in tmp[:100]]
                else:
                    res[tmp_jobname] += [w[0] for w in tmp[:100]]

                for word in jieba.cut(tmp_jobname):
                    word = strQ2B(word).lower()
                    if word in self.skills_words and word not in res[
                            tmp_jobname]:
                        res[tmp_jobname].insert(0, word)

            if i % 200 == 0:
                print i

        print 'origin', len(self.jd_database.keys())
        print i, 'done'
        print 'len(res)', len(res)
        json.dump(res, open('./data/jd_skills_db.json', 'wb'))
Exemple #14
0
    def regular_skill(self, num=6):
        """
        Fill self.result['skill'] with at most `num` skill words.

        Words are collected from lines that match self.DEMAND or that the
        classifier labels 'demand'; tokens are passed through strQ2B,
        lower-cased and kept when they are in self.skills. The 2*num most
        common ones are the candidates, and skill words found in
        self.result['job_name'] are inserted at the front. The result is the
        first three candidates plus a random pick from the remainder.

        :param num: maximum number of skills to store
        """
        res = []
        for line in self.linelist:
            if self.DEMAND.search(line) or self.clf.predict(line) == 'demand':
                for word in jieba.cut(line):
                    word = strQ2B(word).lower()
                    if word in self.skills:
                        res.append(word)

        sorted_words = [w[0] for w in Counter(res).most_common(2 * num)]

        for word in jieba.cut(self.result['job_name']):
            word = strQ2B(word).lower()
            if word in self.skills and word not in sorted_words:
                sorted_words.insert(0, word)

        after_top3 = sorted_words[3:]
        np.random.shuffle(after_top3)

        # `after_top3[:num - 3]` has a negative bound for num < 3 and would
        # slice from the end; trimming the combined list always honours `num`.
        self.result['skill'] = (sorted_words[:3] + after_top3)[:max(num, 0)]
Exemple #15
0
    def train2(self):
        res = {}
        i = 0
        for label in self.jd_cluster:
            i += 1
            tmp = {}
            add_jobname = []

            for jobname in self.jd_cluster[label]:
                if jobname in self.jd_database:
                    add_jobname.append(jobname)
               
                    for line in self.jd_database[jobname]['demand']:
                        for word in jieba.cut(line):
                            word = strQ2B(word)
                            if word not in self.skills_words:continue
                            if word not in tmp:
                                tmp[word] = 1
                            else:
                                tmp[word] += 1

            tmp = sorted(tmp.iteritems(),key=lambda x:x[1],reverse=True)
            
            for tmp_jobname in add_jobname:
                if tmp_jobname not in res:
                    res[tmp_jobname] = [w[0] for w in tmp[:100]]
                else:
                    res[tmp_jobname] += [w[0] for w in tmp[:100]]
                
                for word in jieba.cut(tmp_jobname):
                    word = strQ2B(word).lower()
                    if word in self.skills_words and word not in res[tmp_jobname]:
                        res[tmp_jobname].insert(0,word)

            if i%200==0:
                print i

        print 'origin',len(self.jd_database.keys())
        print i,'done'
        print 'len(res)',len(res)
        json.dump(res,open('./data/jd_skills_db.json','wb'))
Exemple #16
0
def mapper():
    for line in sys.stdin:
        try:
            dic = eval(line)
            content = dic.get('content', '')
            print '\t'.join(
                stop_word_filter([
                    word
                    for word in jieba.cut(chinese_word_filter(strQ2B(content)))
                ]))
        except:
            traceback.print_exc()
            continue
Exemple #17
0
    def regular_jobname(self):
        """
        Determine the job title of the posting, store it in
        self.result["job_name"] and return the set of candidates.

        Strategies are tried in order; each later one runs only while `res`
        is still empty:

        1. Look in self.jdStr for an explicit label (e.g. a "job title" /
           "position name" marker followed by a separator) and take the first
           following line of length 2..19 that contains none of the listed
           noise phrases, after stripping hiring phrases, punctuation and
           digits.
        2. Scan self.linelist (skipping lines that contain hiring phrases):
           accept a line shorter than 6 characters that the classifier labels
           'job_name' and that contains no section keywords, or otherwise the
           first self.JOBNAME regex hit shorter than 20 characters that does
           not start with a digit.
        3. Take the first jieba token (of the lower-cased line) found in
           self.jobdic. This path stores the result and returns immediately,
           without the strQ2B()/lower() applied on the final path.
        4. Fall back to an "intern" / "part-time" tag found in self.jdStr.

        On the final path the candidates are joined with " / ", passed
        through strQ2B and lower-cased before being stored.
        """
        res = set()
        jdStr = self.jdStr
        findpos = re.search(u"(招聘岗位|招聘职位|职位名称|岗位名称|岗位[一二三四五六七八九])[:、:\s ]",
                            jdStr)
        #        if not findpos:
        #            findpos = re.search(u"(职位类别|职位职能)[::\s ]",jdStr)

        if findpos:
            # only look at the text after the matched label
            pos = findpos.span()[1]
            linelist = jdStr[pos:].split("\n")
            for line in linelist:
                if len(line) < 2: continue
                if len(line) >= 2 and len(line) < 20:
                    if re.search(u"职位描述|查看|地址|工作|分享|举报|下一条|时间|福利|待遇|周末|双休",
                                 line):
                        continue
                    res.add(re.sub(u"聘请|高薪诚聘|诚聘|[,。、\d!]+", "", line.strip()))
                    # first acceptable line wins
                    break

        # If no explicit job-title information was matched above, fall back to
        # classifier / regex heuristics on the individual lines.
        if not res:
            for line in self.linelist:
                if re.search(u"招聘|高薪|诚聘", line): continue
                if len(line) < 6 and not re.search(
                        u'岗位|岗位内容|工作内容|职责|任职|资格',
                        line) and self.clf.predict(line) == 'job_name':
                    res.add(line)
                    break
                findPos = self.JOBNAME.search(line)
                if findPos and len(findPos.group()) < 20 and not re.match(
                        u'\d', findPos.group()):
                    jobname = findPos.group()
                    res.add(re.sub(u"聘请|高薪诚聘|诚聘|急.|[,。、!]+", "", jobname))
                    break
                #   res.add(re.sub(u"\(.+\)|(.+)|【.+】|[,。、\s\d]+|聘请|高薪诚聘|诚聘|急招|","",line.strip()))

        # Still nothing: tokenize each line and match tokens against the job
        # dictionary; the first hit is stored and returned right away.
        if not res:
            for line in self.linelist:
                for word in jieba.cut(line.lower()):
                    if word in self.jobdic:
                        res.add(word)
                        self.result["job_name"] = " / ".join(res)
                        return res
        # Last resort: a generic employment-type tag anywhere in the text.
        if not res:
            tag = re.search(u"实习生|兼职", self.jdStr)
            if tag:
                res.add(tag.group())
        self.result["job_name"] = strQ2B(" / ".join(res)).lower()
        return res
Exemple #18
0
 def get_jd_with_kmeans(self,
                        jobname='python',
                        duty_num=4,
                        demand_num=5,
                        skill_num=6):
     """
     Produce a job description for the known job name closest to `jobname`.

     Each count is normalized via int(strQ2B(str(x).decode('utf-8'))). The
     returned OrderedDict holds, in order, 'duty' (get_duty), 'demand'
     (get_demand), 'skill1' (get_skill) and 'skill2' (get_skill2).
     """
     jobname = self.get_closet_jobname(jobname)

     normalized = []
     for raw in [duty_num, demand_num, skill_num]:
         normalized.append(int(strQ2B(str(raw).decode('utf-8'))))
     duty_num, demand_num, skill_num = normalized

     res = OrderedDict()
     sections = (
         ('duty', self.get_duty, duty_num),
         ('demand', self.get_demand, demand_num),
         ('skill1', self.get_skill, skill_num),
         ('skill2', self.get_skill2, skill_num),
     )
     for key, builder, count in sections:
         res[key] = builder(jobname, count)
     return res
Exemple #19
0
 def _init_stock_objs(self):
     """Create or refresh the objects in self.stocks from ts.get_stock_basics().

     For each row an existing stock (keyed by the row index) is reused or a
     new Stock(code=index) is created. Only the columns 'name', 'industry',
     'area' and 'timeToMarket' are copied; str values go through
     util.strQ2B and have spaces removed. The stock is stored back under
     its own `code`.
     """
     logging.info('getting stock list from tushare')
     basics = ts.get_stock_basics()
     logging.info('tushare listed %d stocks' % basics.index.size)

     # we only trust these data
     trusted = ('name', 'industry', 'area', 'timeToMarket')

     for code, row in basics.iterrows():
         if code in self.stocks:
             stock = self.stocks[code]
         else:
             stock = Stock(code=code)

         for column in basics.columns:
             if column not in trusted:
                 continue
             if hasattr(stock, column):
                 raw = row[column]
                 if isinstance(raw, str):
                     raw = util.strQ2B(raw).replace(' ', '')
                 stock.__setattr__(column, raw)
             else:
                 logging.warning('Stock obj has no attribute %s, skip' % column)

         self.stocks[stock.code] = stock
Exemple #20
0
    def train(self,fname="./data/jd_skills_db.json"):
        res = {}
        for jobname in self.jd_database:
            res[jobname] = {}
            tmp = {}
            for line in self.jd_database[jobname]['demand']:
                for word in jieba.cut(line):
                    word = strQ2B(word).lower()
                    if word in self.skills_words:
                        tmp[word] = tmp.get(word,0)+1
            sorted_keywords = sorted(tmp.iteritems(),key=lambda x:x[1],reverse=True)
            res[jobname] =[ w[0] for w in sorted_keywords[:100] if len(w[0])>1]
            if len(jobname)>8 or len(jobname)<3:
                print jobname

        print len(res)
        json.dump(res,open(fname,'wb'),ensure_ascii=False)
Exemple #21
0
    def get_jd_with_textrank(self,jobname='python',duty_num=4,demand_num=5,skill_num=6):
        jobname = self.get_closet_jobname(jobname)
        res = OrderedDict()
        
        duty_num,demand_num,skill_num = map(lambda x:int(strQ2B(x.decode('utf-8'))),[duty_num,demand_num,skill_num])

        self.tk4sents.train(text='\n'.join(self.clear_jd(self.jd_database[jobname]['duty'])))
        duty = self.tk4sents.get_key_sentences(duty_num)
        res['duty'] = sorted(duty,cmp = lambda x,y:self.duty_score(x)-self.duty_score(y))

        self.tk4sents.train(text='\n'.join(self.clear_jd(self.jd_database[jobname]['demand'])))
        demand = self.tk4sents.get_key_sentences(demand_num)
        res['demand'] = sorted(demand,cmp = lambda x,y: self.demand_score(x)-self.demand_score(y))

        self.tk4words.train('\n'.join(self.clear_jd(self.jd_database[jobname]['demand'])))
        res['skill'] = self.tk4words.get_keywords(skill_num,word_min_len=1)
 #       res['skill_phrases'] = self.tk4words.get_keyphrases(skill_num*2,min_occur_num=2)
        return res
Exemple #22
0
    def get_skill2(self,jobname="java",num=6):
        """
        Take skills straight from the skill database: the first three
        candidates plus a random pick from the remaining ones.

        The candidate pool is the first int(num * ln(num)) entries of
        self.skill_db[jobname]; skill words found in the job name itself are
        inserted at the front of the pool.

        :param jobname: key into self.skill_db
        :param num: maximum number of skills to return
        :return: sorted list of at most `num` words
        """
        key_words = self.skill_db[jobname]

        # ln(num) is undefined for num <= 0 (int() of nan raises)
        pool_size = int(num*np.log(num)) if num > 0 else 0
        res = list(key_words[:pool_size])

        for word in jieba.cut(jobname):
            word = strQ2B(word).lower()
            if word in self.skill_words and word not in res:
                res.insert(0,word)

        after_top3 = res[3:]
        np.random.shuffle(after_top3)

        # Trim the combined list: `after_top3[:num-3]` has a negative bound for
        # num < 3 and would return more than `num` items.
        key_words = (res[:3]+after_top3)[:max(num,0)]
        key_words.sort()

        return key_words
Exemple #23
0
    def get_skill2(self, jobname="java", num=6):
        """
        Draw skills directly from the skill database: keep the three leading
        candidates and fill up with a random selection from the others.

        Candidates are the first int(num * ln(num)) entries of
        self.skill_db[jobname], with skill words from the job name itself
        pushed to the front.

        :param jobname: key into self.skill_db
        :param num: upper bound on the number of returned skills
        :return: alphabetically sorted list with at most `num` words
        """
        candidates = self.skill_db[jobname]

        # guard the logarithm: num <= 0 would make int() fail on nan
        pool_size = int(num * np.log(num)) if num > 0 else 0
        res = list(candidates[:pool_size])

        for word in jieba.cut(jobname):
            word = strQ2B(word).lower()
            if word in self.skill_words and word not in res:
                res.insert(0, word)

        after_top3 = res[3:]
        np.random.shuffle(after_top3)

        # `after_top3[:num - 3]` slices from the end when num < 3; cutting the
        # concatenation instead never exceeds `num`.
        key_words = (res[:3] + after_top3)[:max(num, 0)]
        key_words.sort()

        return key_words
Exemple #24
0
    def train(self, fname="./data/jd_skills_db.json"):
        res = {}
        for jobname in self.jd_database:
            res[jobname] = {}
            tmp = {}
            for line in self.jd_database[jobname]['demand']:
                for word in jieba.cut(line):
                    word = strQ2B(word).lower()
                    if word in self.skills_words:
                        tmp[word] = tmp.get(word, 0) + 1
            sorted_keywords = sorted(tmp.iteritems(),
                                     key=lambda x: x[1],
                                     reverse=True)
            res[jobname] = [
                w[0] for w in sorted_keywords[:100] if len(w[0]) > 1
            ]
            if len(jobname) > 8 or len(jobname) < 3:
                print jobname

        print len(res)
        json.dump(res, open(fname, 'wb'), ensure_ascii=False)
Exemple #25
0
def extract_ne_from_onto(fname, o):
    f = open(fname, "r")
    a = f.readline()
    print a
    print fname
    for line in f.readlines():
        sent = line.strip().decode("utf8")
        if sent == "</DOC>":
            continue
        sent = strQ2B(sent)
        sent = re.sub(r"<( /)?/? ([^<]+) >", replace_exclude, sent)
        sent = re.sub(r"<ENAMEX ([^<]+)</ENAMEX>", replace_include, sent)
        # print sent
        sent = sent.split()
        sList = []
        i = 0
        while True:
            if i == len(sent):
                break
            s = sent[i]
            if s[:7] == "<ENAMEX":
                tag = tagDict.get(s.split("\"")[1], "/o")
                temp = s.split(">")[1].split("<")[0]
                sList.append(temp + tag)
            elif s[0] != "<":
                sList.append(s + "/o")
            else:
                a = 1
                print line
                for p in sent:
                    print p
                # print "".join(sent)
                # print fname
            i += 1
        sent_write = " ".join(sList)
        o.write(sent_write.encode("utf8") + "\n")
    f.close()
Exemple #26
0
def dealData(filename):
    """
    Read a tagged corpus file and build training sequences.

    The first 23 characters of every line are dropped, and the rest is split
    into sentences on the "。/w  " marker. Each sentence is stripped; when it
    ends in "/w" its last three characters are removed (strips the trailing
    punctuation token). Sentences longer than 20 characters are passed
    through deal_, printed, passed through util.strQ2B and converted by
    BMEWO into a character list and a tag list.

    Errors raised while processing are propagated to the caller. Previously a
    `return` inside `finally` silently discarded every exception and returned
    whatever had been collected so far.

    :param filename: path of the UTF-8 encoded corpus file
    :return: tuple (char4train, tag4train) of parallel lists
    """
    char4train = []
    tag4train = []

    # the context manager closes the file on success and on error
    with open(filename, 'r', encoding="utf8") as corpus:
        for raw_line in corpus:
            body = raw_line[23:]
            # split into sentences
            for sentence in body.split("。/w  "):
                # remove the trailing punctuation token
                sentence = sentence.strip()
                if sentence[-2:] == "/w":
                    sentence = sentence[:-3]
                if len(sentence) > 20:
                    sentence = deal_(sentence)
                    print(sentence)
                    sentence = util.strQ2B(sentence)

                    charLst, tagLst = BMEWO(sentence)
                    char4train.append(charLst)
                    tag4train.append(tagLst)

    return char4train, tag4train
Exemple #27
0
 def simhash_distance(self,job1,job2):
     """
     Return the Simhash distance between two job names.

     Both names are decoded from UTF-8, passed through strQ2B and lower-cased
     before hashing.
     """
     left,right = [strQ2B(name.decode('utf-8')).lower() for name in (job1,job2)]
     return Simhash(left).distance(Simhash(right))
Exemple #28
0
 def simhash_distance(self, job1, job2):
     """
     Compute the Simhash distance of two UTF-8 encoded job names.

     Each name is decoded, passed through strQ2B and lower-cased; the
     distance is measured from the first name's hash to the second's.
     """
     normalized = []
     for raw in (job1, job2):
         normalized.append(strQ2B(raw.decode('utf-8')).lower())
     first, second = normalized
     return Simhash(first).distance(Simhash(second))