def get_clean_keywords_list(self, unique=True):
    """
    @summary: Build the normalized keyword lists for ``self.keywords``.

    User-entered keywords may contain Chinese punctuation and similar
    noise, so the raw text is first passed through ``uniform`` (per the
    original notes: unified to half-width) and every kept token is
    lower-cased. The text is then split in two ways:
      1. natural word segmentation via ``jieba.cut``;
      2. the user's own whitespace separation via ``str.split``.

    Tokens for which ``is_other`` is true are dropped
    (presumably punctuation/other symbols -- confirm against is_other).

    :param unique: when true, both lists are passed through
        ``get_ordered_unique`` before being compared and returned.
    :return: ``[segmented]`` when both segmentations agree, otherwise
        ``[segmented, user_split]``.
    """
    def _filter_and_lower(tokens):
        # Keep only tokens is_other does not reject, lower-cased.
        return [token.lower() for token in tokens if not is_other(token)]

    normalized = uniform(self.keywords)
    # uniform_keywords = replace_punctuation(uniform_keywords)

    by_segmenter = _filter_and_lower(jieba.cut(normalized))
    by_whitespace = _filter_and_lower(normalized.split())

    if unique:
        by_segmenter = get_ordered_unique(by_segmenter)
        by_whitespace = get_ordered_unique(by_whitespace)

    # Both segmentations agree: report a single list, not two identical ones.
    if by_segmenter == by_whitespace:
        return [by_segmenter]
    return [by_segmenter, by_whitespace]
def get_clean_keywords_list(self, unique=True):
    """
    @summary: Return the cleaned-up keyword lists derived from
    ``self.keywords``.

    Because user input may include Chinese punctuation and the like, the
    text goes through ``uniform`` first (per the original notes: unified
    to half-width) and each retained token is lower-cased. Two
    segmentations are produced:
      1. natural word segmentation (``jieba.cut``);
      2. the spacing the user typed (``str.split``).

    Tokens that ``is_other`` flags are skipped
    (presumably non-word symbols -- TODO confirm).

    :param unique: if true, each list is run through
        ``get_ordered_unique`` before the comparison.
    :return: a one-element list when the two segmentations are equal,
        otherwise a two-element list ``[segmented, user_split]``.
    """
    cleaned_text = uniform(self.keywords)
    # uniform_keywords = replace_punctuation(uniform_keywords)

    # 1. Natural word segmentation.
    jieba_tokens = []
    for piece in jieba.cut(cleaned_text):
        if is_other(piece):
            continue
        jieba_tokens.append(piece.lower())

    # 2. Segmentation following the user's own whitespace.
    spaced_tokens = []
    for piece in cleaned_text.split():
        if is_other(piece):
            continue
        spaced_tokens.append(piece.lower())

    if unique:
        jieba_tokens = get_ordered_unique(jieba_tokens)
        spaced_tokens = get_ordered_unique(spaced_tokens)

    # Collapse to a single entry when both approaches give the same result.
    result = [jieba_tokens]
    if jieba_tokens != spaced_tokens:
        result.append(spaced_tokens)
    return result
def get_all_eng_words(self, unique=True):
    """
    @summary: Collect every word, or word+digit token (e.g. ``cocos2d``),
    found in the keywords and the job description.

    ``self.keywords`` and ``self.job_desc`` are joined with a single
    space, passed through ``uniform``, segmented with ``jieba.cut``, and
    only the tokens accepted by ``is_num_word`` are kept.

    :param unique: when true, the result is passed through
        ``get_ordered_unique`` before being returned.
    :return: the list of matching tokens.
    """
    combined_text = uniform(self.keywords + ' ' + self.job_desc)

    matches = []
    for token in jieba.cut(combined_text):
        if is_num_word(token):
            matches.append(token)

    if not unique:
        return matches
    return get_ordered_unique(matches)