Example 1
    def test_pos(self):
        text = '厦门明天会不会下雨'
        pos = jiagu.pos(text)  # POS tagging; a raw string is tagged character by character
        print('POS tagging result:',
              [(c, p) for c, p in zip(text, pos)])  # character-level labeling
        self.assertEqual(len(pos), len(text))
        self.assertEqual(pos, ['n', 'n', 'a', 'nt', 'vu', 'd', 'vu', 'v', 'n'])
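For word-level tags rather than the per-character tags asserted above, segment first and pass the token list to jiagu.pos, as the later examples do. A minimal sketch:

import jiagu

text = '厦门明天会不会下雨'
words = jiagu.seg(text)   # segment into words first
tags = jiagu.pos(words)   # one tag per word
print(list(zip(words, tags)))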
Example 2
    def analyze_lyrics_real(self, lyrics):
        freq_db = self.freq_db
        translation_db = self.translation_db
        whitelisted_pos = ["n"]

        lyrics_info = []
        for line in lyrics:
            text = line.text
            time = line.time

            line_info = {"t": time, "words": []}

            logger.debug("Analyzing line {}".format(text))

            words = jiagu.seg(text)
            pos = jiagu.pos(words)

            for item in zip(words, pos):
                word, p = item

                freq = freq_db.get(word, -1)
                translation = translation_db.get(word, None)

                word_info = {"w": word, "p": p, "f": freq, "e": translation}
                line_info["words"].append(word_info)
            lyrics_info.append(line_info)
        return lyrics_info
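The returned structure pairs each line's timestamp with per-word records. A hypothetical call, using a namedtuple as a stand-in for whatever lyric-line object the caller passes in (the analyzer instance is likewise assumed):

from collections import namedtuple

Line = namedtuple('Line', ['text', 'time'])   # stand-in for the real lyric-line object
lines = [Line(text='苏州的天气不错', time=12.5)]
info = analyzer.analyze_lyrics_real(lines)    # 'analyzer' is the hosting object (hypothetical name)
# -> [{'t': 12.5, 'words': [{'w': word, 'p': pos, 'f': frequency, 'e': translation}, ...]}]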
Example 3

def generate_vector(query):
    '''
    Accept a query, segment it into nouns, fetch an embedding for each noun,
    and push the averaged vector to Redis.
    Note: jieba_seg and pku_seg have start-up overhead (they build a dictionary),
    so it is best to keep them alive and call this as a function.
    '''
    global pku_seg, redis_api, args
    client = get_tencent_client()
    word_request = models.WordEmbeddingRequest()

    time_stamp, _, query = query.partition('_')
    identifier, _, query = query.partition('_')

    temp_result = []
    # some queries fail to segment; the specific cases still need investigation
    try:
        segment_list = [item for item in jieba.lcut_for_search(query) if item != ' ']
    except Exception:
        return '-1'

    # get part of speech, vote for is_noun and is_chinese
    for word in segment_list:
        jieba_pos = jieba_seg.lcut(word)
        jiagu_pos = jiagu.pos([word])
        pku_pos = pku_seg.cut(word)
        try:
            temp_result.append(word_obj(word, jieba_pos[0].flag, jiagu_pos[0], pku_pos[0][1]))
        except Exception:
            continue
    result_cleaned = list(filter(lambda item: item.is_n and item.is_cn, temp_result))
    
    # fetch vector for each word
    vector_list = []
    for word_ins in result_cleaned: 
        current_word = word_ins.word
        try:
            params = {'Text': current_word}
            word_request.from_json_string(json.dumps(params))
            current_response = client.WordEmbedding(word_request)
            word_vector = json.loads(current_response.to_json_string())['Vector']
        except TencentCloudSDKException as error:
            print(f'\rreached exception with inner {error}')
            continue
        except Exception:
            # re-create the client and skip this word; otherwise a stale (or
            # undefined) word_vector from a previous iteration would be appended
            client = get_tencent_client()
            continue
        vector_list.append(word_vector)
    
    query_vector = list(map(lambda item: round(sum(item) / len(item), 5), zip(*vector_list)))

    redis_api.lpush(f'{time_stamp}_{identifier}', str(query_vector))
    print(f'processed {time_stamp}_{identifier}_{query}, value pushed into Redis')
    return None 
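The snippet above depends on a word_obj helper that is not shown; per the comment, it lets the three taggers vote on whether a token is a Chinese noun. A rough sketch of what such a helper might look like (the tag checks and vote threshold are assumptions, not the original implementation):

class word_obj:
    """Hypothetical reconstruction: majority-vote whether a token is a Chinese noun."""
    def __init__(self, word, jieba_flag, jiagu_flag, pku_flag):
        self.word = word
        # count a tagger's vote as "noun" if its tag starts with 'n' (n, nr, ns, ...)
        votes = sum(str(flag).startswith('n') for flag in (jieba_flag, jiagu_flag, pku_flag))
        self.is_n = votes >= 2
        # treat the token as Chinese if every character falls in the CJK range
        self.is_cn = all('\u4e00' <= ch <= '\u9fff' for ch in word)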
Example 4

def contentpostagseg(chapter_id):
    wordsegss = getContentSeg(chapter_id)

    if not PostWordSeg.query.filter_by(word_id=wordsegss[0][0].id).first():
        for wordsegs in wordsegss:
            poss = jiagu.pos([wordseg.wordseg for wordseg in wordsegs])  # POS tagging
            for i, pos in enumerate(poss):
                wordseg = PostWordSeg(postag=post_dict[pos],
                                      word_id=wordsegs[i].id)
                db.session.add(wordseg)

    tags = [[
        PostWordSeg.query.filter_by(word_id=wordseg.id).first().postag
        for wordseg in wordsegs
    ] for wordsegs in wordsegss]

    return jsonify({'tags': tags}), 200
Example 5
def execute(corpus):
    print('Start Pos processing...')
    for idx in range(len(corpus)):
        post = corpus[idx][0].strip()
        response = corpus[idx][1].strip()

        response_lst = response.split(' ')
        while '' in response_lst:
            response_lst.remove('')

        response_pos = jiagu.pos(response_lst)
        response_pos = ' '.join(response_pos)

        corpus[idx].append(response_pos)

        if ((idx + 1) % 50000) == 0:
            print("already processed {} instances".format(idx + 1))

    return corpus
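For context, each corpus entry is a [post, response] pair whose response is already space-tokenized; execute appends a third element holding the space-joined POS tags. A small illustration with placeholder data:

corpus = [['今天天气怎么样', '明天 会 下雨']]   # placeholder [post, response] pair
corpus = execute(corpus)
print(corpus[0][2])   # POS tags aligned one-to-one with the response tokens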
Example 6

    def __init__(self, sen):
        self.sentence = sen
        self.words_list = jiagu.seg(sen)
        self.words_dict = {}
        self.word_class_dict = {}
        self.word_pos_dict = {}

        # get the POS tag of each word
        word_class_list = jiagu.pos(self.words_list)
        # group the words under their POS tags in a dict
        word_class_set = set(word_class_list)  # the set of POS tags becomes the dict keys
        for word_class in word_class_set:
            self.words_dict[word_class] = []
        for index in range(0, len(self.words_list)):
            self.words_dict[word_class_list[index]].append(
                self.words_list[index])
            self.word_class_dict[
                self.words_list[index]] = word_class_list[index]
            self.word_pos_dict[self.words_list[index]] = index
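A brief usage sketch of the three dictionaries this constructor builds (the class name is assumed, since only __init__ is shown):

sa = SentenceAnalyzer('苏州的天气不错')   # hypothetical class name
print(sa.words_dict)        # POS tag -> list of words carrying that tag
print(sa.word_class_dict)   # word -> its POS tag
print(sa.word_pos_dict)     # word -> its index in the segmented sentence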
Example 7
    def split_by_pos(self, sentence):
        pos_dict = {}
        pos_dict_out = {}
        input_words = jiagu.seg(sent_flush(sentence))  # word segmentation
        input_pos = jiagu.pos(input_words)  # POS tagging
        for word, po in zip(input_words, input_pos):
            if pos_dict.get(po) is None:
                pos_dict[po] = [word]
            else:
                pos_dict[po].append(word)
        for key, val in pos_dict.items():
            if len(val) == 1:
                if pos_dict_out.get('other') is None:
                    pos_dict_out['other'] = [val[0]]
                else:
                    pos_dict_out['other'].append(val[0])
            else:
                pos_dict_out[key] = val
        return pos_dict_out
Example 8
import jiagu

# jiagu.init()  # can be initialized manually, or lazily on first use

text = '苏州的天气不错'

words = jiagu.seg(text)  # word segmentation
print(words)

words = jiagu.cut(text)  # word segmentation
print(words)

pos = jiagu.pos(words)  # POS tagging
print(pos)

ner = jiagu.ner(words)  # named entity recognition
print(ner)

# dictionary-mode segmentation
text = '思知机器人挺好用的'
words = jiagu.seg(text)
print(words)

# jiagu.load_userdict('dict/user.dict')  # load a custom dictionary; accepts a dictionary file path or a list of words.
jiagu.load_userdict(['思知机器人'])

words = jiagu.seg(text)
print(words)

text = '''
该研究主持者之一、波士顿大学地球与环境科学系博士陈池(音)表示,“尽管中国和印度国土面积仅占全球陆地的9%,但两国为这一绿化过程贡献超过三分之一。考虑到人口过多的国家一般存在对土地过度利用的问题,这个发现令人吃惊。”