def test_pos(self):
    text = '厦门明天会不会下雨'
    pos = jiagu.pos(text)  # POS tagging; a raw string is tagged character by character
    print('POS tagging result:', [(c, p) for c, p in zip(text, pos)])  # character-level labelling
    self.assertEqual(len(pos), len(text))
    self.assertEqual(pos, ['n', 'n', 'a', 'nt', 'vu', 'd', 'vu', 'v', 'n'])
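# For contrast with the character-level call above, a minimal sketch of the more common
# word-level usage: segment first, then tag the segmented words (the printed tags are
# whatever the model returns; nothing specific is asserted here).
import jiagu

text = '厦门明天会不会下雨'
words = jiagu.seg(text)   # word segmentation
tags = jiagu.pos(words)   # one POS tag per segmented word
print(list(zip(words, tags)))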
def analyze_lyrics_real(self, lyrics):
    freq_db = self.freq_db
    translation_db = self.translation_db
    whitelisted_pos = ["n"]  # currently unused
    lyrics_info = []
    for line in lyrics:
        text = line.text
        time = line.time
        line_info = {"t": time, "words": []}
        logger.debug("Analyzing line {}".format(text))
        words = jiagu.seg(text)  # word segmentation
        pos = jiagu.pos(words)   # POS tagging
        for word, p in zip(words, pos):
            freq = freq_db.get(word, -1)
            translation = translation_db.get(word, None)
            word_info = {"w": word, "p": p, "f": freq, "e": translation}
            line_info["words"].append(word_info)
        lyrics_info.append(line_info)
    return lyrics_info
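# Rough usage sketch for the method above. The 'analyzer' instance, and the idea that each
# lyric line exposes .text and .time attributes, are assumptions taken from the code itself;
# they are not part of the original example.
from types import SimpleNamespace

lyrics = [SimpleNamespace(text='苏州的天气不错', time=12.5)]
for entry in analyzer.analyze_lyrics_real(lyrics):  # 'analyzer' is a hypothetical instance of the enclosing class
    for w in entry["words"]:
        print(w["w"], w["p"], w["f"], w["e"])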
def generate_vector(query):
    global pku_seg, redis_api, args
    client = get_tencent_client()
    word_request = models.WordEmbeddingRequest()
    '''
    The main purpose of this function is to accept a query, segment it into nouns,
    gather the embedding of each noun and take the average.
    Calling jieba_seg and pku_seg both carry some overhead because they need to build
    a dictionary, so it would be best to keep this process alive and call it as a
    function somehow.
    '''
    time_stamp, _, query = query.partition('_')
    identifier, _, query = query.partition('_')
    temp_result = []
    # some words cannot be segmented; the specific cases need more investigation
    try:
        segment_list = list(filter(lambda item: item != ' ', jieba.lcut_for_search(query)))
    except Exception:
        return '-1'
    # get the part of speech from each tagger and vote for is_n (noun) and is_cn (Chinese)
    for word in segment_list:
        jieba_pos = jieba_seg.lcut(word)
        jiagu_pos = jiagu.pos([word])
        pku_pos = pku_seg.cut(word)
        try:
            temp_result.append(word_obj(word, jieba_pos[0].flag, jiagu_pos[0], pku_pos[0][1]))
        except Exception:
            continue
    result_cleaned = list(filter(lambda item: item.is_n and item.is_cn, temp_result))
    # fetch a vector for each word
    vector_list = []
    for word_ins in result_cleaned:
        current_word = word_ins.word
        try:
            params = {'Text': current_word}
            word_request.from_json_string(json.dumps(params))
            current_response = client.WordEmbedding(word_request)
            word_vector = json.loads(current_response.to_json_string())['Vector']
        except TencentCloudSDKException as error:
            print(f'\rreached exception with inner {error}')
            continue
        except Exception:
            client = get_tencent_client()  # refresh the client and skip this word
            continue
        vector_list.append(word_vector)
    # average the word vectors element-wise to get the query vector
    query_vector = list(map(lambda item: round(sum(item) / len(item), 5), zip(*vector_list)))
    redis_api.lpush(f'{time_stamp}_{identifier}', str(query_vector))
    print(f'processed {time_stamp}_{identifier}_{query}, value pushed into Redis')
    return None
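# The word_obj helper is not shown in the example above; this is a hypothetical sketch of what
# it might look like, based only on the comment "vote for is_noun and is_chinese" and on the
# attributes (word, is_n, is_cn) used later. It is not the original implementation.
import re

class word_obj:
    def __init__(self, word, jieba_flag, jiagu_flag, pku_flag):
        self.word = word
        # majority vote across the three taggers: flags starting with 'n' count as "noun"
        votes = sum(str(flag).startswith('n') for flag in (jieba_flag, jiagu_flag, pku_flag))
        self.is_n = votes >= 2
        # the word counts as Chinese if it consists only of CJK unified ideographs
        self.is_cn = bool(re.fullmatch(r'[\u4e00-\u9fff]+', word))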
def contentpostagseg(chapter_id):
    wordsegss = getContentSeg(chapter_id)
    if PostWordSeg.query.filter_by(word_id=wordsegss[0][0].id).all() == []:
        for wordsegs in wordsegss:
            poss = jiagu.pos([wordseg.wordseg for wordseg in wordsegs])  # POS tagging
            for i, pos in enumerate(poss):
                wordseg = PostWordSeg(postag=post_dict[pos], word_id=wordsegs[i].id)
                db.session.add(wordseg)
    tags = [[
        PostWordSeg.query.filter_by(word_id=wordseg.id).first().postag
        for wordseg in wordsegs
    ] for wordsegs in wordsegss]
    return jsonify({'tags': tags}), 200
def execute(corpus):
    print('Start POS processing...')
    for idx in range(len(corpus)):
        post = corpus[idx][0].strip()
        response = corpus[idx][1].strip()
        response_lst = response.split(' ')
        while '' in response_lst:
            response_lst.remove('')
        response_pos = jiagu.pos(response_lst)  # tag the whitespace-tokenised response
        response_pos = ' '.join(response_pos)
        corpus[idx].append(response_pos)
        if (idx + 1) % 50000 == 0:
            print("already processed {} instances".format(idx + 1))
    return corpus
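# Quick usage sketch for execute(). The [post, response] layout of each corpus entry is
# inferred from the indexing above; the sample sentences are illustrative only.
corpus = [
    ['你好', '今天 天气 不错'],  # the response is already whitespace-tokenised
]
corpus = execute(corpus)
print(corpus[0][2])  # the space-joined POS tag string that execute() appended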
def __init__(self, sen):
    self.sentence = sen
    self.words_list = jiagu.seg(sen)
    self.words_dict = {}
    self.word_class_dict = {}
    self.word_pos_dict = {}
    # get the POS tag of each word
    word_class_list = jiagu.pos(self.words_list)
    # associate each POS tag with its words in a dictionary;
    # the set of POS tags serves as the dictionary keys
    word_class_set = set(word_class_list)
    for word_class in word_class_set:
        self.words_dict[word_class] = []
    for index in range(0, len(self.words_list)):
        self.words_dict[word_class_list[index]].append(self.words_list[index])
        self.word_class_dict[self.words_list[index]] = word_class_list[index]
        self.word_pos_dict[self.words_list[index]] = index
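# Brief usage sketch of the lookups built above. 'SentenceInfo' is a hypothetical name for the
# enclosing class, which is not shown in the original example.
s = SentenceInfo('苏州的天气不错')
print(s.words_list)        # segmented words
print(s.words_dict)        # POS tag -> list of words carrying that tag
print(s.word_class_dict)   # word -> its POS tag
print(s.word_pos_dict)     # word -> its index in the segmented sentence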
def split_by_pos(self, sentence):
    pos_dict = {}
    pos_dict_out = {}
    input_words = jiagu.seg(sent_flush(sentence))  # word segmentation
    input_pos = jiagu.pos(input_words)             # POS tagging
    for word, po in zip(input_words, input_pos):
        if pos_dict.get(po) is None:
            pos_dict[po] = [word]
        else:
            pos_dict[po].append(word)
    for key, val in pos_dict.items():
        if len(val) == 1:
            # POS tags that match only a single word are collected under 'other'
            if pos_dict_out.get('other') is None:
                pos_dict_out['other'] = [val[0]]
            else:
                pos_dict_out['other'].append(val[0])
        else:
            pos_dict_out[key] = val
    return pos_dict_out
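# Short usage sketch for split_by_pos(). 'splitter' stands in for an instance of the enclosing
# class, and sent_flush() is assumed to be the project's sentence-cleaning helper; neither is
# part of the original example.
buckets = splitter.split_by_pos('苏州的天气不错,厦门的天气也不错')
for tag, words in buckets.items():
    print(tag, words)  # tags with only one word end up merged under the 'other' key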
import jiagu

# jiagu.init()  # can be initialized manually; otherwise initialization happens lazily

text = '苏州的天气不错'

words = jiagu.seg(text)  # word segmentation
print(words)

words = jiagu.cut(text)  # word segmentation
print(words)

pos = jiagu.pos(words)  # POS tagging
print(pos)

ner = jiagu.ner(words)  # named entity recognition
print(ner)

# dictionary-mode segmentation
text = '思知机器人挺好用的'
words = jiagu.seg(text)
print(words)

# jiagu.load_userdict('dict/user.dict')  # load a user dictionary; a dictionary file path or a word list is supported
jiagu.load_userdict(['思知机器人'])

words = jiagu.seg(text)
print(words)

text = '''
该研究主持者之一、波士顿大学地球与环境科学系博士陈池(音)表示,“尽管中国和印度国土面积仅占全球陆地的9%,但两国为这一绿化过程贡献超过三分之一。考虑到人口过多的国家一般存在对土地过度利用的问题,这个发现令人吃惊。”