Example #1
    def split_test(self, sentence):
        #line = sentence.strip().decode('utf-8', 'ignore')  # strip leading/trailing spaces and decode to Unicode (Python 2 leftover)
        #line1 = re.sub("[0-9\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+".decode("utf8"),
        #               " ".decode("utf8"), line)
        #wordList = list(jieba.cut(line1))  # segment each line with jieba (superseded by HanLP below)

        print(HanLP.segment('你好,欢迎在Python中调用HanLP的API'))
        for term in HanLP.segment('下雨天地面积水'):
            print('{}\t{}'.format(term.word, term.nature))  # word and its part-of-speech tag
        testCases = [
            "商品和服务", "结婚的和尚未结婚的确实在干扰分词啊", "买水果然后来世博园最后去世博会", "中国的首都是北京",
            "欢迎新老师生前来就餐", "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
            "随着页游兴起到现在的页游繁盛,依赖于存档进行逻辑判断的设计减少了,但这块也不能完全忽略掉。"
        ]
        for sentence in testCases:
            print(HanLP.segment(sentence))
        # keyword extraction
        document = "水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露," \
                   "根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标," \
                   "有部分省超过红线的指标。对一些超过红线的地方,陈明忠表示,对一些取用水项目进行区域的限批," \
                   "严格地进行水资源论证和取水许可的批准。"
        print(HanLP.extractKeyword(document, 2))
        # automatic summarization
        print(HanLP.extractSummary(document, 3))
        # dependency parsing
        print(HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。"))
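
Every snippet in this collection assumes the pyhanlp wrapper is installed and imported; a minimal sketch of that setup:

# Minimal setup assumed throughout: pyhanlp (pip install pyhanlp) exposes the HanLP 1.x Java API.
from pyhanlp import HanLP

for term in HanLP.segment('商品和服务'):
    print(term.word, term.nature)  # surface form and part-of-speech tag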
Example #2
    def pos_filter(self, s):
        if not s:
            return []
        terms = HanLP.segment(s)                      # segment once and reuse for words and tags
        wds = [t.word for t in terms]
        pos = [str(t.nature) for t in terms if t.nature]

        if len(''.join(wds)) < 2:                     # too short: a single character only
            return []
        if 'n' not in pos and 'nhd' not in pos:       # keep only strings that contain a noun
            return []
        return ''.join(wds)
Example #3
def make_index():
    with open(ITEM_INDEX_JSON, 'w', encoding='utf8') as item_index_file, \
            open(ITEM_SOURCE_JSON, 'r', encoding='utf8') as item_file:

        item_js = json.load(item_file)
        all_info = item_js['RECORDS']
        for item in all_info:
            title = item['TITLE']
            ITEM_DICT[item['ENTERPRISE_ID']]['org_id'] = item['ORG_ID']
            if 'items' not in ITEM_DICT[item['ENTERPRISE_ID']]:
                ITEM_DICT[item['ENTERPRISE_ID']]['items'] = set()
            # TODO: segment and filter here.
            segs = HanLP.segment(title)
            for word in segs:
                _word = word.word
                nature = str(word.nature)
                if nature in ['vn', 'vi']:
                    ITEM_DICT[item['ENTERPRISE_ID']]['items'].add(_word)
                elif nature == 'v' and _word in V_SET:
                    ITEM_DICT[item['ENTERPRISE_ID']]['items'].add(_word)
                elif nature in [
                        'n', 'ng', 'nh', 'nhd', 'nl', 'nm', 'nz', 'nba'
                ] and _word not in FIL_SET:
                    ITEM_DICT[item['ENTERPRISE_ID']]['items'].add(_word)

        for key in ITEM_DICT.keys():
            ITEM_DICT[key]['items'] = list(ITEM_DICT[key]['items'])
        js_info = json.dumps(ITEM_DICT)
        item_index_file.write(js_info)

    with open(TYPE_INDEX_JSON, 'w', encoding='utf8') as type_index_file, \
            open(TYPE_SOURCE_JSON, 'r', encoding='utf8') as type_file:

        type_js = json.load(type_file)
        all_info = type_js['RECORDS']
        for item in filter(lambda x: len(x['CODE']) == 9, all_info):
            TYPE_DICT[item['CODE']] = set()
            if item['SERVICETYPEVALUE']:
                value_words = HanLP.segment(item['SERVICETYPEVALUE'])
                for word in value_words:
                    TYPE_DICT[item['CODE']].add(word.word)
            if item['KEYWORD']:
                key_words = HanLP.segment(item['KEYWORD'])
                for word in key_words:
                    TYPE_DICT[item['CODE']].add(word.word)
        # convert set to list
        for k in TYPE_DICT.keys():
            TYPE_DICT[k] = list(TYPE_DICT[k])

        js_info = json.dumps(TYPE_DICT)
        type_index_file.write(js_info)
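
make_index leaves its module-level containers and paths implicit; one possible setup is sketched below (the defaultdict choice and the file names are assumptions, not the original constants):

# Hypothetical module-level state for make_index; names follow the code above.
import json
from collections import defaultdict
from pyhanlp import HanLP

ITEM_SOURCE_JSON = 'items.json'        # hypothetical paths
ITEM_INDEX_JSON = 'item_index.json'
TYPE_SOURCE_JSON = 'types.json'
TYPE_INDEX_JSON = 'type_index.json'

ITEM_DICT = defaultdict(dict)          # ENTERPRISE_ID -> {'org_id': ..., 'items': set()}
TYPE_DICT = {}                         # CODE -> set of segmented words
V_SET = set()                          # verbs to keep even with the plain 'v' tag
FIL_SET = set()                        # nouns to filter out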
Example #4
def input_pipeline(sentence, lang, bpe=None):
    """
    1. word segmentation (zh)
    2. lowercasing (en)
    3. tokenization
    4. BPE
    """
    if lang == 'zh':
        seg = [term.word for term in HanLP.segment(sentence)]
        seg_str = ' '.join(seg)
        #print('after segmentation:', seg)
        mt = MosesTokenizer(lang='zh')
        tokenized_str = mt.tokenize(seg_str, return_str=True)
        #print('after tokenization:', tokenized_str)
        if bpe is not None:
            bpe_str = bpe.apply([tokenized_str])[0]
            #print('after BPE:', bpe_str)
            return bpe_str.split()
        return tokenized_str.split()
    elif lang == 'en':
        lower = sentence.lower()
        #print('after lowercasing:', lower)
        mt = MosesTokenizer(lang='en')
        tokenized_str = mt.tokenize(lower, return_str=True)
        #print('after tokenization:', tokenized_str)
        if bpe is not None:
            bpe_str = bpe.apply([tokenized_str])[0]
            #print('after BPE:', bpe_str)
            return bpe_str.split()
        return tokenized_str.split()
    else:
        raise ValueError('unsupported language: %s' % lang)
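
A possible call, assuming MosesTokenizer comes from sacremoses and skipping the optional BPE step (bpe=None); the shown outputs are only indicative:

# Hypothetical usage; a BPE object (e.g. fastBPE) is only needed when bpe is not None.
from sacremoses import MosesTokenizer
from pyhanlp import HanLP

print(input_pipeline('我爱自然语言处理', 'zh'))  # e.g. ['我', '爱', '自然语言处理']
print(input_pipeline('I love NLP.', 'en'))       # lower-cased, Moses-tokenized tokens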
Example #5
    def split1list(self, sentence):
        line = sentence.strip()  # strip leading/trailing whitespace (the input is already a Unicode str in Python 3)
        # remove digits, punctuation and other non-word symbols
        line1 = re.sub(
            "[0-9\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+",
            " ", line)
        #wordList = list(jieba.cut(line1))  # segment each line with jieba (superseded by HanLP below)
        wordList = HanLP.segment(line1.strip())
        poslist = set()
        for w in wordList:
            length = len(w.word)
            nature = str(w.nature)
            if length < 2 and 'w' in nature:   # skip single-character punctuation tokens
                continue

            if w.word in self.stopwords:       # skip stop words
                continue

            #if self.isFormWord(nature):
            #    continue

            #wordpos = w.word + '   ' + nature
            #self.wordposlist.append(wordpos)

            poslist.add(w.word)

        return poslist
Example #6
def add_to_dictionary(word, part, mod=0):
    result = CustomDictionary.add(word, part)
    if not result and mod:
        CustomDictionary.insert(word, part)
    text = "我用天猫交社保"
    print(HanLP.segment(text))
    return result
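
CustomDictionary is presumably HanLP's user dictionary exposed through pyhanlp; a sketch of that wiring (the word and tag values below are made up):

# Sketch: expose HanLP's CustomDictionary via pyhanlp and add a user entry.
from pyhanlp import HanLP, JClass

CustomDictionary = JClass('com.hankcs.hanlp.dictionary.CustomDictionary')
add_to_dictionary('天猫', 'nz 1024', mod=1)  # falls back to insert() when add() reports a conflict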
Example #7
    def get_sentence_mapping(self, overload=False):
        """
        Sentence-to-vector mapping table.
        :return: vec_space, e.g. {'我们是中国人,我们爱自己的祖国': [......], '蜀道难,难于上青天': [......]}
        """
        sentence_to_vec_file = current_path + '/sentence_mapping.pkl'
        if not os.path.isfile(sentence_to_vec_file) or overload:
            print('首次加载句子时间较长,请稍等......')  # first load takes a while, please wait
            sentence_to_vec = {}
            for sentence in self.sentence_list:
                tmp = np.zeros(shape=self.dim)
                index = 0
                for obj in HanLP.segment(sentence):
                    word = obj.word
                    if word in self.char_mapping:
                        tmp += self.char_mapping[word]
                    else:
                        tmp += np.zeros(shape=self.dim)  # unknown word contributes a zero vector
                    index += 1
                if index > 0:                            # guard against empty segmentations
                    tmp /= index
                sentence_to_vec[sentence] = tmp

            # dump once, after all sentences have been processed
            with open(sentence_to_vec_file, 'wb') as f:
                pickle.dump(sentence_to_vec, f)

        else:
            with open(sentence_to_vec_file, 'rb') as f:
                sentence_to_vec = pickle.load(f)

        return sentence_to_vec
Example #8
def get_keywords(query, par_dict, sim_dic):
    _words = HanLP.segment(query)
    temp = []
    added = []
    keywords = []
    visited = set()

    for word in _words:
        _word = word.word
        nature = str(word.nature)
        if _word in SAVED:
            temp.append(_word)
        elif nature in ['vn', 'vi']:
            temp.append(_word)
        elif nature == 'v' and _word in V_SET:
            temp.append(_word)
        elif nature in ['n', 'ng', 'nh', 'nhd', 'nl', 'nm', 'nz', 'nba'
                        ] and _word not in FIL_SET and len(_word) > 1:
            temp.append(_word)
    for item in temp:
        added.append((item, 1.5))
        if item in par_dict:
            added.append((par_dict[item], 1))
        if item in sim_dic[0]:
            for sim in sim_dic[1][sim_dic[0][item]]:
                added.append((sim, 1))
    for item in added:
        if item[0] not in visited:
            keywords.append(item)
            visited.add(item[0])  # track the word itself, otherwise duplicates are never filtered
    return keywords
Example #9
def get_abstract_sentence(sentence, vocabulary):
    '''
    Sentence abstraction: replace recognized entities with their tags.
    movie title:                            nm
    actor name:                             nnt
    movie genre:                            ng
    actor name right after another actor:   nnr
    rating:                                 x
    '''
    abstract_sentence = []
    query_dict = {}
    second = False
    for segment in HanLP.segment(sentence):
        word = str(segment.word)
        nature = str(segment.nature)
        if nature == "nm":
            query_dict["nm"] = word
            word = "nm"
        elif nature == "nnt" and not second:
            query_dict["nnt"] = word
            word = "nnt"
            second = True
        elif nature == "ng":
            query_dict["ng"] = word
            word = "ng"
        elif nature == "m":
            query_dict["x"] = word
            word = "x"
        elif nature == "nnt" and second:
            query_dict["nnr"] = word
            word = "nnr"
            second = False
        if word in vocabulary:
            abstract_sentence.append(word)
    return abstract_sentence, query_dict
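
A hedged usage sketch: the nm/nnt/ng tags only exist if a custom HanLP dictionary defines them, so the output below is purely illustrative:

# Hypothetical usage; assumes a user dictionary that tags movie titles as nm, actors as nnt, etc.
vocabulary = {'nm', 'nnt', 'ng', 'x', '评分'}
abstract, slots = get_abstract_sentence('英雄的评分是多少', vocabulary)
# e.g. abstract -> ['nm', '评分'], slots -> {'nm': '英雄', ...}, depending on the dictionary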
Example #10
 def __iter__(self):
     """make each sentence a new line"""
     normed_sent = preprocess(self.strings)
     for sent in split_iter(normed_sent, self.eos_placement):
         sent = ''.join(sent)
         if sent:
             yield list(term.word for term in HanLP.segment(sent))
Example #11
 def segment(self, text):
     word_tag_list = HanLP.segment(text)
     word_list = []
     for word_tag in word_tag_list:
         word, tag = str(word_tag).split('/')
         if tag=='n':
             word_list.append(word)
     return word_list
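
Splitting str(word_tag) on '/' breaks for tokens that themselves contain a slash; a sketch of the same noun filter reading the Term attributes directly:

# Alternative sketch: use .word / .nature instead of parsing the 'word/tag' string.
def segment_nouns(text):
    return [t.word for t in HanLP.segment(text) if str(t.nature) == 'n']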
Example #12
 def load_data(self, file):
     result = []
     with open(file, mode='r', encoding="utf-8") as fp:
         lines = fp.readlines()
         for line in lines:
             words = HanLP.segment(str(line).strip())
             result.append(" ".join([str(i.word) for i in words]))
     return result
Example #13
def wordSeg(text):
    wordPostag = HanLP.segment(text)
    words, postags = [], []
    for line in wordPostag:
        line = str(line)
        word, postag = line.split('/')
        words.append(word)
        postags.append(postag)
    return words, postags
Example #14
def segment(text):
    '''
    Segment a Chinese sentence with HanLP.
    '''
    try:
        seg_result = hanlp.segment(text)
        return [term.word for term in seg_result]
    except Exception:
        return text.split()
Example #15
def remove(test_text):
    a = HanLP.segment(test_text)
    rem = dict()
    curs = 0
    for i in a:
        if str(i.nature) in ['ns','nz']:
            rem[str(i.word)] = [(curs,curs+len(str(i.word))-1)]
        curs += len(str(i.word))

    return rem
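
A possible call (the tags, and therefore the spans, depend on the HanLP model, so the result is only indicative):

# Hypothetical usage: character spans of place-name / proper-noun (ns/nz) tokens.
print(remove('我家在陕西省西安市'))  # e.g. {'陕西省': [(3, 5)], '西安市': [(6, 8)]}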
Example #16
 def word_segment(self, sentence):
     word_tag_list = HanLP.segment(sentence)
     words = []
     for word_tag in word_tag_list:
         word_tag = str(word_tag).split('/')
         if len(word_tag) == 2:
             word, tag = word_tag
             if 'n' == tag and word not in self.stop_words:
                 words.append(word)
     return set(words)
Example #17
def generate_feature(data, word2vec):
    """
    Generate feature vectors.
    Args:
        data: dataset (DataFrame with 'prefix', 'title' and tag columns)
        word2vec: word-to-vector lookup
    Returns:
        DataFrame of concatenated prefix / title / tag features
    """
    features = []
    end = len(data.columns)
    for idx, row in data.iterrows():
        prefix_vec = np.zeros(DIM)
        title_vec = np.zeros(DIM)
        tag_vec = row.iloc[3: end]
        count = 0
        try:
            for word in HanLP.segment(row['prefix']):
                word = str(word).split('/')[0]
                try:
                    prefix_vec += word2vec[word]
                    count += 1
                except KeyError:
                    print('word %s not in vocab' % word)
            if count > 0:
                prefix_vec = np.true_divide(prefix_vec, count)
            count = 0
            for word in HanLP.segment(row['title']):
                word = str(word).split('/')[0]
                try:
                    title_vec += word2vec[word]
                    count += 1
                except KeyError:
                    print('word %s not in vocab' % word)
            if count > 0:
                title_vec = np.true_divide(title_vec, count)
        except Exception as e:
            print(e)
        feature = np.concatenate((prefix_vec, title_vec, tag_vec))
        features.append(feature)

    return pd.DataFrame(features)
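
The snippet relies on a module-level DIM constant and a word2vec lookup passed in by the caller; a sketch of that context (the gensim KeyedVectors source and the paths are assumptions):

# Hypothetical setup for generate_feature; names follow the code above.
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from pyhanlp import HanLP

DIM = 300  # must match the dimensionality of the loaded word vectors
word2vec = KeyedVectors.load_word2vec_format('vectors.bin', binary=True)  # hypothetical vector file
# features = generate_feature(data, word2vec)  # data: DataFrame with 'prefix', 'title' and tag columns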
Example #18
 def get_job_address(self, source):
     soup = BeautifulSoup(
         source.text.encode('iso-8859-1').decode('gbk'), 'lxml')
     address = soup.find('p', {
         'class': 'msg ltype'
     }).text.split('\xa0\xa0|\xa0\xa0')[0]
     if 'ns' in HanLP.segment(address).toString():
         return address
     else:
         return ''
Example #19
def delete_stop_words(item, stop_words):
    result_word = ""
    words = HanLP.segment(item)
    for word in words:
        word = str(word).split('/')[0]
        if (word in stop_words):
            continue
        else:
            result_word += word
    return result_word
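
A small usage sketch (the segmentation, and therefore the exact result, may vary):

# Hypothetical usage: drop stop words and re-join the remaining tokens.
stop_words = {'的', '了'}
print(delete_stop_words('我买了新的手机', stop_words))  # e.g. '我买新手机'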
Example #20
    def __iter__(self):
        for sentence in get_sentence(self.fname):
            seg_list = HanLP.segment(sentence)
            seg_list = [str(word).split('/')[0] for word in seg_list]  # keep only the surface form
            # print(str(seg_list))
            # seg_list = char_list_cheaner(seg_list)
            if seg_list:
                yield seg_list
Example #21
 def get_sentence_vector(self, sentence):
     words = [item.word for item in HanLP.segment(sentence)]
     cnt = 0
     vec_fin = np.zeros(self.wv.vector_size)
     for w in words:
         if w in self.wv:
             vec_fin += self.get_word_vector(w)
             cnt += 1
     if cnt > 0:
         vec_fin = vec_fin / cnt
     return vec_fin
Example #22
def replace_samePinyin(content, same_pinyin, word_freq_vocab, replace_num=1):
    """
    Replace a character inside the keywords of `content` with a homophone
    (the rule: among all characters with the same pinyin, pick the most frequent one).
    :param content: the text to modify
    :param same_pinyin: vocabulary of characters sharing the same pinyin
    :param word_freq_vocab: character frequency table for Chinese
    :param replace_num: number of replacements; this version only replaces one character per content
    :return: the text after the homophone replacement
    """
    segmentationList = HanLP.segment(content)
    word_list_of_content = list(content)
    # print(len(segmentationList))
    if len(set(segmentationList)) <= 2:
        keynum = 1
    elif len(segmentationList) > 2 and len(set(segmentationList)) <= 6:
        keynum = 2
    else:
        # keynum = int(len(set(segmentationList))/3)
        keynum = 4
    keywordList = get_keyword(content, keynum)  # extract keywords
    key_character = []
    for word in keywordList:  # collect the individual characters of the keywords
        key_character += list(word)
    key_character = list(set(key_character))  # drop duplicate characters
    key_character = [word for word in key_character if word in same_pinyin
                     ]  # keep only characters that appear in the same_pinyin vocabulary
    word_freq = []
    for i in key_character:  # for each key character, find its most frequent homophone
        samePinyin_list = same_pinyin[i]  # all characters with the same pinyin
        samePinyin_freq = []
        for j in samePinyin_list:
            if j in word_freq_vocab:
                samePinyin_freq.append(word_freq_vocab[j])
            else:
                samePinyin_freq.append(1)
        word_freq.append(samePinyin_list[samePinyin_freq.index(
            max(samePinyin_freq))])
    freq = []
    if len(word_freq) != 0:
        for i in word_freq:
            if i in word_freq_vocab:
                freq.append(word_freq_vocab[i])
            else:
                freq.append(1)
        same_pinyin_HighFreq_word = word_freq[freq.index(max(freq))]
        replace_word = key_character[freq.index(max(freq))]
        replace_index = word_list_of_content.index(replace_word)
        word_list_of_content[replace_index] = same_pinyin_HighFreq_word
        new_content = "".join(word_list_of_content)
        # print("same_pinyin", same_pinyin["火"])
        return new_content
    else:
        return content
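
get_keyword is referenced but not shown; one plausible helper built on HanLP's keyword extractor (an assumption, not necessarily the original implementation):

# Hypothetical helper: TextRank keywords via HanLP, returned as a plain Python list.
def get_keyword(content, num):
    return list(HanLP.extractKeyword(content, num))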
Example #23
    def split(self, sentence):
        line = sentence.strip()  # strip leading/trailing whitespace (the input is already a Unicode str in Python 3)
        # remove digits, punctuation and other non-word symbols
        line1 = re.sub(
            "[0-9\s+\.\!\/_,$%^*()?;;::“”-【】+\"\']+|[+——!,;::“”。?、~@#¥%……&*()]+",
            " ", line)
        #wordList = list(jieba.cut(line1))  # segment each line with jieba (superseded by HanLP below)
        wordList = HanLP.segment(line1)

        self.process(wordList)
        return self.wordnetlist
Example #24
    def hanlp_cut(self):
        lines = []
        df = pd.read_excel(self.excel_path)
        sentences = df['sentences']
        for sentence in sentences:
            if sentence is not np.nan:
                # print(sentence)
                cuts = HanLP.segment(sentence)
                lines.append(' '.join(cut.word for cut in cuts))

        # print(lines)
        self.save_to_excel(lines)
Example #25
def extract_locations(text):
    """
    extract locations from texts
    eg: extract_locations('我家住在陕西省安康市汉滨区。')
    :param: raw_text<string>
    :return: location_list<list> eg: ['陕西省安康市汉滨区', '安康市汉滨区', '汉滨区']
    """
    if text == '':
        return []
    seg_list = [(str(t.word), str(t.nature)) for t in HanLP.segment(text)]
    location_list = get_location(seg_list)
    return location_list
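
get_location is referenced but not shown; a sketch that is consistent with the docstring example, joining runs of consecutive place-name (ns*) tokens and emitting every suffix of each run:

# Hypothetical helper; the real implementation may differ.
def get_location(seg_list):
    locations, run = [], []
    for word, nature in seg_list + [('', '')]:  # sentinel flushes the final run
        if nature.startswith('ns'):
            run.append(word)
        else:
            for i in range(len(run)):
                locations.append(''.join(run[i:]))
            run = []
    return locations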
Example #26
def prefix_cut_in_title(item):
    """
    cut the prefix and title. And matching them.
    """
    prefix = item["prefix"]
    title = item["title"]

    til_list = []
    words_til = HanLP.segment(title)
    for word_til in words_til:
        word_til = str(word_til).split('/')[0]
        til_list.append(word_til)

    words_pre = HanLP.segment(prefix)
    for word_pre in words_pre:
        word_pre = str(word_pre).split('/')[0]
        if (word_pre in til_list):
            continue
        else:
            return 0
    return 1
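
A quick usage sketch (returns 1 only when every prefix token also occurs among the title tokens):

# Hypothetical usage.
print(prefix_cut_in_title({'prefix': '北京', 'title': '北京天气预报'}))  # e.g. 1
print(prefix_cut_in_title({'prefix': '上海', 'title': '北京天气预报'}))  # e.g. 0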
Example #27
 def tokenize(self, text):
     # type: (Text) -> List[Token]
     from pyhanlp import HanLP
     terms = HanLP.segment(text)
     running_offset = 0
     tokens = []
     for term in terms:
         word_offset = text.index(term.word, running_offset)
         word_len = len(term.word)
         running_offset = word_offset + word_len
         tokens.append(Token(term.word, word_offset))
     logging.debug(terms)
     return tokens
Example #28
def title_cut_word(item, stop_words):
    click = []
    words = HanLP.segment(item)
    for word in words:
        word = str(word).split('/')[0]
        if (word in stop_words) or (word == '') or (word not in title_count_dict):
            continue
        else:
            click.append(word)  # str is already Unicode in Python 3
    if len(click) == 0:
        return 0.0
    else:
        return str(click)
Example #29
def title_cut_maxclick(item, stop_words):
    click = []
    words = HanLP.segment(item)
    for word in words:
        word = str(word).split('/')[0]
        if (word in stop_words) or (word == '') or (word not in title_count_dict):
            continue
        else:
            click.append(title_click_dict[word])
    if len(click) == 0:
        return 0.0
    else:
        return np.max(click)
Example #30
def seg_with_han176(in_file, out_file_path, manual_seg_file):

    # save seg_result
    corpus = construct_corpus(in_file)
    with open(out_file_path, "w", encoding='utf-8') as f:
        for line in corpus:
            result_h176 = "=".join("%s" % t.word
                                   for t in HanLP.segment(line))  # each line is a single sentence
            f.write(result_h176 + "\n")
            f.flush()

    # test qps
    corpus = construct_corpus(in_file, 500)
    start = time.time()
    for line in corpus:
        _ = HanLP.segment(line)
    end = time.time()
    qps = round(len(corpus) / (end - start), 2)

    # test accuracy
    p, r, f1, line_aver_length = evaluate(out_file_path, manual_seg_file)
    return qps, p, r, f1, line_aver_length