def cixing(old):
    org = old
    words = pseg.cut(org)
    qingli = "清理"
    try:
        while 1:
            w = words.next()
            k = words.next()
            # strip adjectives
            if (w.flag == 'a' or w.flag == 'ad' or w.flag == 'an' or w.flag == 'ag' or w.flag == 'al') and w.word != qingli.decode('utf-8'):
                old = string.replace(old, w.word, "")
            if k.flag == 'uj':
                old = string.replace(old, w.word, "")
    except StopIteration:
        print 'old'
    words = pseg.cut(org)
    try:
        w = words.next()
        while 1:
            w = words.next()
            k = words.next()
            # strip adjectives
            if (w.flag == 'a' or w.flag == 'ad' or w.flag == 'an' or w.flag == 'ag' or w.flag == 'al') and w.word != qingli.decode('utf-8'):
                old = string.replace(old, w.word, "")
            if k.flag == 'uj':
                old = string.replace(old, w.word, "")
    except StopIteration:
        print 'old'
    return old
def comsini(self, data1, data2):
    test_words = {}
    all_words = {}
    f1_text = data1
    f1_seg_list = pseg.cut(f1_text)
    for w in f1_seg_list:
        if 'n' in w.flag or 'eng' in w.flag:
            test_words.setdefault(w.word, 0)
            all_words.setdefault(w.word, 0)
            all_words[w.word] += 1
    ftest1_text = data2
    mytest1_words = copy.deepcopy(test_words)
    ftest1_seg_list = pseg.cut(ftest1_text)
    for w in ftest1_seg_list:
        if 'n' in w.flag or 'eng' in w.flag:
            if mytest1_words.has_key(w.word):
                mytest1_words[w.word] += 1
    sampdata = []
    test1data = []
    for key in all_words.keys():
        sampdata.append(all_words[key])
        test1data.append(mytest1_words[key])
    test1simi = self.get_cossimi(sampdata, test1data)
    return test1simi
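# The comsini method above calls a get_cossimi helper that is not shown in this
# snippet. A minimal sketch of what such a helper might look like, assuming it
# computes plain cosine similarity between two equal-length count vectors; the
# name and signature are taken from the call site, not from the source.
import math

def get_cossimi(sampdata, test1data):
    # dot product and norms of the two count vectors
    dot = sum(a * b for a, b in zip(sampdata, test1data))
    norm1 = math.sqrt(sum(a * a for a in sampdata))
    norm2 = math.sqrt(sum(b * b for b in test1data))
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot / (norm1 * norm2)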
def generate_keywords():
    client = MongoClient('localhost', 27017)
    db = client['baidu']
    characteristic_keywords = db.characteristic_keywords
    file = open(url + '1')
    features = []
    lines = file.readlines()
    keywords_count = {}
    for line in lines:
        news = line.split()[3]
        words = pesg.cut(news)
        for word, flag in words:
            if flag not in features:
                features.append(flag)
    for feature in features:
        generate_characteristic_keywords_monogodb(feature)
    for line in lines:
        news = line.split()[3]
        words = pesg.cut(news)
        for w, flag in words:
            keywords_count[w] = 0
    for line in lines:
        news = line.split()[3]
        words = pesg.cut(news)
        for w, flag in words:
            keywords_count[w] = keywords_count[w] + 1
            if keywords_count[w] > 0:
                keywords_count[w] = -10000000
                characteristic_keywords.update({'feature': flag}, {'$push': {'keywords': w}})
def jieba_cut():
    # Process the pos_all_dict file
    fp_pos = open("hownet/pos_all_dict.txt", "r")  # original dictionary of positive words
    fp_pos_cut = codecs.open('hownet/pos_all_cut.txt', "w+", encoding='UTF-8')  # save the result to a separate file
    contents = fp_pos.readlines()
    for content in contents:
        word = content.decode("utf-8")  # decode
        word_tag = pseg.cut(word)
        str_tag = ""
        for tag in word_tag:
            str_tag += str(tag.word) + '/' + str(tag.flag)
        p = re.compile(r'/x(.*)')
        str_tag = p.sub(r'\1', str_tag)  # extract the first group
        fp_pos_cut.write(str_tag)
    fp_pos.close()
    fp_pos_cut.close()

    # Process the neg_all_dict file
    fp_neg = open("hownet/neg_all_dict.txt", "r")  # original dictionary of negative words
    fp_neg_cut = codecs.open('hownet/neg_all_cut.txt', "w+", encoding='UTF-8')  # save the result to a separate file
    contents = fp_neg.readlines()
    for content in contents:
        word = content.decode("utf-8")  # decode
        word_tag = pseg.cut(word)
        str_tag = ""
        for tag in word_tag:
            str_tag += str(tag.word) + '/' + str(tag.flag)
        p = re.compile(r'/x(.*)')
        str_tag = p.sub(r'\1', str_tag)  # extract the first group
        fp_neg_cut.write(str_tag)
    fp_neg.close()
    fp_neg_cut.close()
def st_parse(self, text, freq=.15, pos=.25, env=.4, le=.2):
    wdmap = {}
    pos = 0  # NOTE: this shadows the `pos` weight argument; it is reused as a position counter
    for w in pseg.cut(text.lower()):
        pos += 1
        if w.word not in self.model:
            continue
        if w.word not in wdmap:
            wdmap[w.word] = [0] * 4
        wdmap[w.word][0] += freq / 2    # frequency
        if pos < 5:
            wdmap[w.word][1] = pos      # position
        if len(w.word) > 2:
            wdmap[w.word][3] = le       # length
    for keytext in re.findall(u"(#.*?#)|(【.*?】)|(《.*?》)|(\".*?\")|(“.*?”)", text.lower()):
        for t in keytext:
            for w in pseg.cut(t):
                if w.word not in self.model:
                    continue
                wdmap[w.word][2] = env  # environment
    return wdmap
def getWordList():
    weight = [0, 1, 2, 4, 6, 6, 10, 30, 40, 50, 60, 70]
    ratio = [1, 0.8, 0.7, 0.9]
    neededFlag = ['a', 'ad', 'an', 'i', 'l', 'n', 'nr', 'ns', 'nt']
    os.chdir("1")
    text = readText("hudong_type_info.txt")
    hudong_type_words = pseg.cut(text)
    word_list1 = []
    for w in hudong_type_words:
        if (w.flag in neededFlag) and (len(w.word) > 1):
            addWord([w.word, w.flag, 2 * weight[len(w.word)]], word_list1)
    source_list1 = ["baidu_info.txt", "hudong_zoom_info.txt", "iqili_tag_info.txt", "mtime_info.txt"]
    for sourName in source_list1:
        text = readText(sourName)
        retWord = pseg.cut(text)
        for w in retWord:
            if (w.flag in neededFlag) and (len(w.word) > 1):
                #if (not (w.word in ban)):
                addWord([w.word, w.flag, weight[len(w.word)] * ratio[1]], word_list1)
    word_list2 = []
    source_list2 = ["baiduwiki_info.txt", "douban_info.txt", "hudong_info.txt", "wiki_info.txt"]
    for sourName in source_list2:
        text = readText(sourName)
        retWord = pseg.cut(text)
        for w in retWord:
            if (w.flag in neededFlag and (len(w.word) > 1)):
                #if (not (w.word in ban)):
                addWord([w.word, w.flag, weight[len(w.word)] * ratio[2]], word_list2)
    word_list3 = []
    source_list3 = ["sogou_title_", "soso_title_"]
    for sourName in source_list3:
        for i in range(12):
            text = readText(sourName + str(i + 1) + ".txt")
            retWord = pseg.cut(text)
            for w in retWord:
                if (w.flag in neededFlag) and (len(w.word) > 1):
                    #if (not (w.word in ban)):
                    addWord([w.word, w.flag, weight[len(w.word)] * ratio[3]], word_list3)
    final_word = word_list1
    for w in word_list2:
        addWord(w, final_word)
    for w in word_list3:
        addWord(w, final_word)
    deleteWord(final_word, banList)
    topWords = getTopWords(final_word)
    os.chdir("..")
    #for w in topWords:
    #    print w[0], w[1], w[2]
    fout = codecs.open("print.txt", "w", encoding="utf-8")
    for w in final_word:
        fout.write(w[0] + " ")
        # fout.write("\n")
    #print "f**k"
    return topWords
def test_pos():
    s = u'是谁呢'
    assert(u'是谁' == normal_pos(s))
    s = u'你会讲英语吗'
    assert(u'你会讲英语' == normal_pos(s))
    s = u'_2005年我们出去玩2,_ 然后聘情况!知道道理5abc如何走*。这么说不 *'
    print list(pseg.cut(s))
    s = u'户外活动有哪些'
    print list(pseg.cut(s))
    s = u'知道这条路怎么走吗'
    print list(pseg.cut(s))
    s = u'小突想知道这条路怎么走'
    print list(pseg.cut(s))
def testcase():
    pattern = {}
    sh = xlrd.open_workbook(u'坚守模式.xls').sheet_by_index(0)
    for r in range(sh.nrows):
        for c in range(2, 5):
            value = sh.cell_value(r, c)
            words = '\t'.join([word for word, tag in ps.cut(value, HMM=True)])
            tags = '\t'.join([tag for word, tag in ps.cut(value, HMM=True)])
            pattern[tags] = pattern.get(tags, []) + [words]
            #print ' '.join(['%s/%s' % (word, tag) for word, tag in ps.cut(' '.join(sh.row_values(r)[2:]))])
    for p, info in pattern.iteritems():
        print p
        print '\n'.join(list(set(info)))
def fenci(Num, segbook, typenum, segtables, sourcename):
    data = xlrd.open_workbook(sourcename)
    table = data.sheets()[typenum]
    nrows = table.nrows
    ncols = table.ncols
    row = 1
    col = 1
    ls = []
    lsw = []
    if ((Num % 8) % 4) % 2 == 1:
        jieba.load_userdict('userdict.txt')
    while row < nrows:
        col = 1
        length = 0
        cell = table.cell(row, col).value
        s = cell
        ls = []
        lsw = []
        seg_list = pseg.cut(s.decode("utf-8"))
        for w in seg_list:
            length += 1
        if length < 50:
            if ((Num % 8) % 4) / 2 == 0:
                seg_list = pseg.cut(s.decode("utf-8"))
                for w in seg_list:
                    segtables[typenum].write(row, col, w.word)
                    col += 1
                    segtables[typenum].write(row, col, w.flag)
                    col += 1
            else:
                cell = table.cell(row, col).value
                s = cell
                seg_list = pseg.cut(s.decode("utf-8"))
                for ww in seg_list:
                    ls.append(ww.flag)
                    lsw.append(ww.word)
                for i in range(length):
                    if i - 1 > 0:
                        if ls[i - 1] == 'uj' and ls[i] != 'n':
                            ls[i] = 'n'
                    if i - 1 > 0 and i - 2 >= 0:
                        if (ls[i - 2] == 'n' or ls[i - 2] == 'nz' or ls[i - 2] == 'vn' or ls[i - 2] == 'ng' or ls[i - 2] == 'nl') and (ls[i - 1] == 'd' or ls[i - 1] == 'vd' or ls[i - 1] == 'ad' or ls[i - 1] == 'zg') and ls[i] != 'a':
                            ls[i] = 'a'
                for j in range(length):
                    segtables[typenum].write(row, col, lsw[j])
                    col += 1
                    segtables[typenum].write(row, col, ls[j])
                    col += 1
        row += 1
    segbook.save('Segmentation.xls')
def My_make_word_could(fileStr, outPNGStr):
    jieba.load_userdict("ap_dict.txt")
    STOP_WORD = set()
    stopword_file = open("stopwords.txt")
    for each_line in stopword_file:
        each_line_list = pseg.cut(each_line)
        for elem in each_line_list:
            STOP_WORD.add(elem.word)
        STOP_WORD.add(each_line.strip().decode('utf-8'))
    stopword_file.close()
    ##----------------------------- cut and count word frequency -----------------------------
    word_freq = {}
    ## fileStr = "kouzhao.txt"
    raw_file = open(fileStr)
    for line in raw_file:
        seg_list = pseg.cut(line)
        for ele in seg_list:
            words = ele.word.strip()
            ## print words in STOP_WORD
            if ((ele.flag == 'n' or ele.flag == 'a') and (words not in STOP_WORD)):
                if (word_freq.has_key(words)):
                    word_freq[words] += 1
                else:
                    word_freq[words] = 1
    raw_file.close()
    ##----------------------------- sort the result -----------------------------
    paixu = sorted(word_freq.iteritems(), key=lambda d: d[1], reverse=True)
    paixu_tiqu = paixu[0:25]
    print "over"
    ##for (k,v) in word_freq.items():
    ##    if v==1:
    ##        del word_freq[k]
    ##        print k,v
    ##for (k,v) in word_freq.items():
    ##    print k,v
    ##for item in word_freq.keys():
    ##    print item
    ##for (k,v) in (dict(paixu_tiqu)).items():
    ##    print k,v
    ##----------------------------- make word cloud -----------------------------
    tags = make_tags(dict(paixu_tiqu))
    ##print tags
    ## outPNGStr = 'kouzhao.png'
    create_tag_image(tags, outPNGStr, size=(2000, 1600), fontname='haokan.ttf', fontzoom=4)
    print "all over"
def cuttest(sent):
    result_arr = []
    words_use = pseg.cut(sent)  # was pseg.cut(test_sent): cut the function argument, not an outside global
    for word_use in words_use:
        result_arr.append({"pos": word_use.flag, "term": word_use.word})
    print("no ckip")
    return result_arr
def type_features1(word):
    f = open(word)
    sentence = f.read()
    wordlist = pseg.cut(sentence)
    for w in wordlist:
        if w.flag.startswith('n'):
            return {'firstn': w.word}
def add_text_jieba(self, content):
    term_map = {}
    #jieba.enable_parallel(4)
    #tokens = jieba.cut(content)
    #for fet in tokens:
    words = pseg.cut(content)
    for w in words:
        fet = w.word
        pos = w.flag
        #if not pos in pos_list:
        #    continue
        if pos != 'n' and pos != 'v' and pos != 'vn':
            continue
        #u0 = fet[0]
        #if not is_chinese_word(u0):
        #    continue
        if len(fet) < 2:
            continue
        #print fet, pos
        term_id = self.add_term(fet)
        if term_id in term_map:
            term_map[term_id] += 1
        else:
            term_map[term_id] = 1
    return term_map
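# add_text_jieba relies on a self.add_term method that is not included above.
# A plausible sketch, assuming it only maps each distinct term to a stable integer
# id (the attribute name term_dict is hypothetical, not from the source):
def add_term(self, term):
    if not hasattr(self, 'term_dict'):
        self.term_dict = {}
    if term not in self.term_dict:
        # allocate a new id the first time the term is seen
        self.term_dict[term] = len(self.term_dict)
    return self.term_dict[term]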
def getWordsCount(in_file, out_file):
    # read the input file
    # lib = load_workbook(file, use_iterators=True)
    # create a new excel file
    wt_wb = Workbook(write_only=True)
    wt_ws = wt_wb.create_sheet()
    word_all = ''
    fenci_list = []
    dis_list = []
    words_sum = []
    lib = csv.reader(file(in_file, 'rb'))
    # segment the document
    for row in lib:
        fenci_list_p = list(pseg.cut(row[0]))
        for w in fenci_list_p:
            word = [w.word, w.flag]
            # print word
            words_sum.append(word)
    # build the distinct word list
    for word in words_sum:
        if word not in dis_list:
            dis_list.append(word)
    # count word frequencies
    wt_ws.append(['words', 'notes', 'count'])
    for word in dis_list:
        count = words_sum.count(word)
        word_c = [word[0], word[1], count]
        wt_ws.append(word_c)
    # save the excel file
    wt_wb.save(out_file)
def tokenize(text):
    docid, body = text.split('\t', 1)
    items = []
    for word, flag in pseg.cut(body):
        items.append('%s/%s' % (word, flag))
    result = "%s\t%s" % (docid, ' '.join(items))
    return result
def getWordsCount(file, sheets_name, anlysis_row_nm, out_file):
    # read the excel file
    lib = load_workbook(file, use_iterators=True)
    # create a new excel file
    wt_wb = Workbook(write_only=True)
    wt_ws = wt_wb.create_sheet()
    word_all = ''
    fenci_list = []
    dis_list = []
    words_sum = []
    # segment the document
    for row in lib[sheets_name].iter_rows():
        # print row[1].value
        fenci_list = list(jieba.cut(row[anlysis_row_nm].value, cut_all=False))
        fenci_list_p = list(pseg.cut(row[anlysis_row_nm].value))
        for w in fenci_list_p:
            word = [w.word, w.flag]
            # print word
            words_sum.append(word)
    # build the distinct word list
    for word in words_sum:
        if word not in dis_list:
            dis_list.append(word)
    # count word frequencies
    wt_ws.append(['words', 'notes', 'count'])
    for word in dis_list:
        count = words_sum.count(word)
        word_c = [word[0], word[1], count]
        wt_ws.append(word_c)
    # save the excel file
    wt_wb.save(out_file)
def segment(self, text, lower=True, use_stop_words=True, use_speech_tags_filter=False):
    """Segment a piece of text and return the result as a list of words.

    Keyword arguments:
    lower                  -- whether to lowercase the words (relevant for English)
    use_stop_words         -- if True, filter out words found in the stop-word set
    use_speech_tags_filter -- whether to filter by part of speech; if True, keep only
                              words whose flag is in self.default_speech_tag_filter,
                              otherwise keep everything
    """
    text = util.as_text(text)
    jieba_result = pseg.cut(text)

    if use_speech_tags_filter == True:
        jieba_result = [w for w in jieba_result if w.flag in self.default_speech_tag_filter]
    else:
        jieba_result = [w for w in jieba_result]

    # drop special symbols
    word_list = [w.word.strip() for w in jieba_result if w.flag != 'x']
    word_list = [word for word in word_list if len(word) > 0]

    if lower:
        word_list = [word.lower() for word in word_list]

    if use_stop_words:
        word_list = [word.strip() for word in word_list if word.strip() not in self.stop_words]

    return word_list
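# A minimal usage sketch of the segment method above. The class name Segmentation
# is hypothetical, and the stop_words / default_speech_tag_filter attributes are
# assumed from the method body rather than confirmed by the source.
seg = Segmentation()
seg.stop_words = {u'的', u'了', u'和'}
seg.default_speech_tag_filter = ['n', 'nr', 'ns', 'v', 'vn']
print(seg.segment(u'今天天气不错,我们出去玩', use_speech_tags_filter=True))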
def run(self, textLine):
    array = pseg.cut(textLine)
    tag = 'O'
    inner = 0
    words = []
    for w in array:
        words.append(w)
    line = ""
    for i in range(0, len(words)):
        if words[i].word == u'【':
            inner = 1
            continue
        if words[i].word == u'】':
            tag = 'O'
            inner = 0
            continue
        if inner == 1:
            if i + 1 < len(words) and words[i - 1].word == u'【' and words[i + 1].word == u'】':
                tag = 'S'
            elif words[i - 1].word == u'【':
                tag = 'B'
            elif i + 1 < len(words) and words[i + 1].word == u'】':
                tag = 'E'
            else:
                tag = 'I'
        line = line + words[i].word + '\t' + words[i].flag + '\t' + tag + '\n'
    if inner == 0:
        print line
    return True
def passage_second_level_classify(content):
    """
    given a passage content, return its second level class
    :param content:
    :return: a topic list with probability
    """
    first_class = passage_first_level_classify(content)
    print first_class
    lda_model = gensim.models.LdaModel.load('%s/wechat_data/lda_in_classify/%s.model' % (apath, first_class))
    word_list = []
    words = pseg.cut(content)
    for item in words:
        if item.flag in [u'n', u'ns']:
            word_list.append(item.word)
    train_set = [word_list]
    dic = gensim.corpora.Dictionary(train_set)
    corpus = [dic.doc2bow(text) for text in train_set]
    doc_lda = lda_model.get_document_topics(corpus)
    count = 0
    # for j in lda_model.print_topics(20):
    #     print count, j
    #     count += 1
    # print doc_lda
    topic_list = []
    for i in lda_model[corpus]:
        for k in i:
            print lda_model.print_topic(k[0], 7), k[1]
            topic_list.append(
                {'topic_tag': u'%s-%s' % (first_class, k[0]),
                 'topic_content': lda_model.print_topic(k[0], 7),
                 'topic_prob': k[1]})
    return topic_list
def postagger(sentence):
    pos_data = pseg.cut(sentence)
    pos_list = []
    for w in pos_data:
        pos_list.append((w.word, w.flag))
    #print pos_list[:]
    return pos_list
def __ansj_seg(self, content, tool='ansj_seg'):
    """
    Use the ansj_seg tokenizer by default
    """
    if tool == 'ansj_seg':
        ws = CrfppUtil.ansj_seg.cut(content)
        return ws
    else:
        return pseg.cut(content)
def jieba_pseg(fname, fenci_fname, pos_fname, tag_fname):
    f1 = open(fenci_fname, 'w')
    f2 = open(pos_fname, 'w')
    f3 = open(tag_fname, 'w')
    with open(fname) as xs:
        for l in xs.readlines():
            l = l.strip()
            res = pseg.cut(l)
            token_list = []
            pos_list = []
            tag_list = []
            for token, pos in res:
                token_list.append(token)
                pos_list.append(pos)
                tag = token + '/' + pos
                tag_list.append(tag)
            token_str = ' '.join(token_list)
            pos_str = ' '.join(pos_list)
            tag_str = ' '.join(tag_list)
            f1.write(token_str.encode('utf8') + '\n')
            f2.write(pos_str.encode('utf8') + '\n')
            f3.write(tag_str.encode('utf8') + '\n')
    f1.close()
    f2.close()
    f3.close()
def extractEntity():
    db = client.holmesdb
    t_news = db.t_news_di
    res_list = t_news.find()
    last_name_dict = getLastNameDict()
    ntoken_dict = {}
    people_dict = {}
    row_cnt = 0
    for res in res_list:
        row_cnt += 1
        title = res["title"]
        content = res["content"]
        doc = myio.handleContent(title) + " " + myio.handleContent(content)
        words = pseg.cut(doc)
        for (word, flag) in words:
            if flag.find("n") != -1:
                print word, flag
                word1 = word[0].encode("utf-8")
                word2 = word[:2].encode("utf-8")
                if word1 in last_name_dict or word2 in last_name_dict:
                    #print word[0], word[:2]
                    people_dict[word] = people_dict.setdefault(word, 0) + 1
                else:
                    #print w.word, w.flag
                    ntoken_dict[word] = ntoken_dict.setdefault(word, 0) + 1
    ntoken_list = sorted(ntoken_dict.items(), lambda a, b: -cmp(a[1], b[1]))
    people_list = sorted(people_dict.items(), lambda a, b: -cmp(a[1], b[1]))
def read_sentences(sentence_path):
    with codecs.open(sentence_path, 'r', 'gb18030') as fo:
        sentences = [line.strip().split('\t') for line in fo.readlines()]
    for sentence in sentences:
        words = pseg.cut(sentence[1])
        for word, flag in words:
            print word.encode('utf8'), flag
def __is_clause_pattern3(self, the_clause, seg_result):
    for a_phrase in self.__phrase_dict:
        keys = a_phrase.keys()
        to_compile = a_phrase["key"].replace("……", "[\u4e00-\u9fa5]*")
        if "start" in keys:
            to_compile = to_compile.replace("*", "{" + a_phrase["start"] + "," + a_phrase["end"] + "}")
        if "head" in keys:
            to_compile = a_phrase["head"] + to_compile
        match = re.compile(to_compile).search(the_clause)
        if match is not None:
            can_continue = True
            pos = [flag for word, flag in posseg.cut(match.group())]
            if "between_tag" in keys:
                if a_phrase["between_tag"] not in pos and len(pos) > 2:
                    can_continue = False
            if can_continue:
                for i in range(len(seg_result)):
                    if seg_result[i].word in match.group():
                        try:
                            if seg_result[i + 1].word in match.group():
                                return self.__emotional_word_analysis(
                                    a_phrase["key"] + ":" + match.group(), a_phrase["value"],
                                    [x for x, y in seg_result], i)
                        except IndexError:
                            return self.__emotional_word_analysis(
                                a_phrase["key"] + ":" + match.group(), a_phrase["value"],
                                [x for x, y in seg_result], i)
    return ""
def human2machine(msg):
    if not isinstance(msg, unicode):
        msg = msg.decode('utf-8')
    #: process with some hard coded translations first
    for k, v in h_d.items():
        if k in msg.split('@3bugs')[-1]:
            return v[0]
    action = None
    action_type = None
    obj = None
    repeated_duration = 0
    import jieba.posseg as pseg
    seg = classify(pseg.cut(msg), l_d.keys())
    for v in seg['verb']:
        action = ch_d.get(v, None)[0] or action
        action_type = action_type_d.get(action, None)[0]
    for n in seg['noun']:
        obj = ch_d.get(n, None)[0] or obj
    repeated_duration = find_repeated(seg) or 0
    if action and (action_type is not None) and \
            (action == 'capture' or obj):
        return action, action_type, obj, repeated_duration
    else:
        logger.info('Found unknown command %s' % msg)
        logger.debug('%s %s %s %s' % (str(action), str(action_type), str(obj), str(repeated_duration)))
        logger.debug(seg)
        return None
def get_content(biz, sn):
    try:
        string = ""
        with open("../public/" + biz + "/" + sn + ".txt", 'r') as f:
            for l in f.readlines():
                l = l.strip('\n')
                if (l == ""):
                    continue
                if (l.find(biz) != -1):
                    info = l.split(',')
                    title = ','.join(info[4:-2])
                    author = info[-1]
                    string = title
                    print title, author
                    continue
                string += l
        seg_list = pseg.cut(string)
        string = ""
        for word, flag in seg_list:
            if (flag in save):
                string += unicode.encode(word, 'utf-8') + ' '
        return string + '\n'
    except:
        return None
        print("error")
def normal_pos(ins):
    if ins.strip() == '':
        return ins
    #s = q2b(ins)
    s = ins
    words = ['']
    for seg, zh in find_zh(s):
        #seg = zh_s.strip()
        if seg == '':
            continue
        if not zh:
            words.append(seg)
            continue
        for w in pseg.cut(seg):
            t = (w.word, w.flag)
            if any(t[1].find(fi) >= 0 for fi in reserve_pos):
                words.append(t[0])
            elif any(t[1].find(fi) >= 0 for fi in filter_pos) \
                    or (t[1].find('d') >= 0 and all(t[0].find(ig) < 0 for ig in ignore_pos)):
                #if words[-1] == ' ':
                #    continue
                #else:
                #    words.append(' ')
                #print w
                continue
            else:
                words.append(t[0])
    #print 'BEGPOS', ''.join(words), 'END'
    return merge_zh(''.join(words))
def testReference(self):
    import jieba  # May fail to load jieba
    jieba.initialize(usingSmall=False)
    import jieba.posseg as pseg
    pwords = []
    content = u'上海今日新确诊3例人感染H7N9禽流感病例'
    _ = """
    ns 上海
    t 今日
    a 新
    v 确诊
    m 3
    n 例人
    v 感染
    eng H7N9
    n 禽流感
    n 病例
    """
    content = u'李克强:在半岛挑事无异于搬石头砸自己脚'
    _ = """
    nr 李克强
    p 在
    n 半岛
    v 挑事
    l 无异于
    v 搬
    l 石头砸
    r 自己
    n 脚
    """
    for word in pseg.cut(content):
        print word.flag, word.word
def is_question(s):
    s = s.strip()
    if s == '':
        return False
    cuts = list(pseg.cut(s))
    pos = [w.word + w.flag for w in cuts]
    pos_set = set(pos)
    words = [w.word for w in cuts]
    flags = [w.flag for w in cuts]
    if u'是v' in pos_set and u'还是c' in pos_set:
        return True
    if pos[-1] == u'不d':
        return True
    if len(pos_set & questions_pos) > 0:
        return True
    sel = next((x for x in range(len(pos)) if pos[x] in [u'不d', u'还是c']), 0)
    if sel > 0:
        p1, p2 = set(pos[0:sel]), set(pos[sel + 1:])
        if len(p1 & p2) > 0:
            return True
    sel = next((x for x in range(len(words)) if words[x] == u'不'), 0)
    if sel > 0 and words[sel - 1] == words[sel + 1]:
        return True
    return False
def get_overseas_exp():
    select_sql = "select * from teacherdata_info"
    teacher_list = dbs.getDics(select_sql)
    print(len(teacher_list))
    extractor = Extractor()
    jieba.load_userdict("E:\\shixi\\justcoding\\extract_v1.0\\user_dict.txt")
    result_list = []
    w_list = []
    for teacher in teacher_list:
        if re.search(r'cksp\.eol\.cn', teacher["homepage"]) is not None:
            info_dict = eval(teacher["info"])
            extractor.set_text(info_dict["个人简介"])
        else:
            try:
                info = eval(teacher['info'])
                person_info = "".join(list(info.values()))
            except Exception as e:
                person_info = teacher['info']
            if person_info is None:
                continue
            extractor.set_text(person_info)
        reList = [r'教育经历|学习经历|教育背景', r'个人简介|个人简历', teacher["name"]]
        extractor.cut_blocks(reList)
        extractor.compute_gravity()
        sentences = extractor.filter_sentence()
        if sentences is None:
            continue
        label = ["ns", "nt"]
        description = ""
        wo_list = []
        for sentence in sentences:
            if re.search(r'留学', sentence):
                description = description + ";" + sentence
                continue
            words = pseg.cut(sentence)
            # for w in words:
            #     if w.flag in label and re.search(r'大学|学院|院', w.word) and school_dict.get(w.word):
            #         description = description + ";" + sentence
            #         break
            words = [w for w in words if w.flag in label]
            if len(words) > 0:
                description = description + ";" + sentence
                wo_list = [w.word for w in words]
        if not description == "":
            w_list.extend(wo_list)
            result_list.append(("-".join(wo_list), teacher['id'], description.lstrip(";")))
    print(len(result_list))
    fw = open("5.csv", "a+", encoding="utf-8")
    for i in result_list:
        fw.write("%s,%s,%s\n" % i)
    fw.close()
    print(len(w_list))
    fw1 = open("6.csv", "a+", encoding="utf-8")
    for i in w_list:
        fw1.write("%s\n" % i)
    fw1.close()
def gen_data(graph_path, lda_path, user_lim=200, user_wb_lim=200):
    import jieba
    import jieba.posseg as pseg
    jieba.load_userdict(u"/etc/jieba/jieba.dic")
    G = nx.Graph()
    ldaf = open(lda_path, 'w')
    ldaf.write("%d\n" % user_lim)
    ucnt = 0
    for item in Weibo.objects.values('owner').annotate(cnt=Count('owner')):
        if item['cnt'] > 450:
            user = Account.objects.get(id=item['owner'])
            logging.info(u'%5d Dealing with %s' % (ucnt, user))
            logging.info(u'Current graph:%d nodes and %d edges' % (G.number_of_nodes(), G.number_of_edges()))
            user_words = []
            for wb in user.ownweibo.order_by("-created_at").all()[:user_wb_lim]:
                #filter(retweeted_status__exact=None).all():
                text = wb.text.lower()
                #TODO
                #if wb.retweeted_status:
                #    text = wb.retweeted_status.text.lower() + text
                text = re.sub("@[^\s@:]+", "", text)
                text = re.sub(u"http://t.cn[^ ]*", u"", text)
                text = re.sub(u"\[[^ ]{1,3}\]", u"", text)
                for word in re.findall(u"【.+?】|#.+?#|《.+?》|“.+?”|\".+?\"", text):
                    for w in pseg.cut(word):
                        if len(w.word) < 2 or w.word in Config.STOP_WORDS or 'n' not in w.flag:
                            continue
                        wd = w.word.encode('utf-8')
                        if G.has_node(wd) and 'weight' in G.node[wd]:
                            G.node[wd]['weight'] += 1.0
                        else:
                            G.add_node(wd, weight=1.0)
                wb_words = []
                for w in pseg.cut(text):
                    if len(w.word) > 1 and 'n' in w.flag and w.word not in Config.STOP_WORDS:
                        wb_words.append(w.word.encode('utf-8'))
                if not wb_words:
                    continue
                for w1, w2, w3 in zip(wb_words[:-2], wb_words[1:-1], wb_words[2:]):
                    if not G.has_edge(w1, w2):
                        G.add_edge(w1, w2, weight=1.0)
                    else:
                        G[w1][w2]['weight'] += 1.0
                    if not G.has_edge(w1, w3):
                        G.add_edge(w1, w3, weight=1.0)
                    else:
                        G[w1][w3]['weight'] += 1.0
                user_words.extend(wb_words)
            if not user_words:
                continue
            ldaf.write(' '.join(user_words) + '\n')
            ucnt += 1
            if ucnt >= user_lim:
                break
    if ucnt < user_lim:
        logging.error("no enough docs, %d/%d" % (ucnt, user_lim))
    if graph_path:
        nx.write_yaml(G, graph_path, encoding='UTF-8')
    ldaf.close()
    return G
wyz  left quotation mark, full-width: “ ‘ 『
wyy  right quotation mark, full-width: ” ’ 』
wj   full stop, full-width: 。
ww   question mark, full-width: ? half-width: ?
wt   exclamation mark, full-width: ! half-width: !
wd   comma, full-width: , half-width: ,
wf   semicolon, full-width: ; half-width: ;
wn   enumeration comma, full-width: 、
wm   colon, full-width: : half-width: :
ws   ellipsis, full-width: …… …
wp   dash, full-width: —— -- ——- half-width: — —-
wb   percent / per-mille sign, full-width: % ‰ half-width: %
wh   unit symbol, full-width: ¥ $ £ ° ℃ half-width: $
"""
import jieba.posseg as pseg
words = pseg.cut("我很饿,你知道吗?你有上过马来西亚大学吗?")
for word, flag in words:
    print('%s %s' % (word, flag))

#----------------------------------------
# return the start and end position of each word in the original text
import jieba
result = jieba.tokenize(u'我还是围绕着想吃什么')
for tk in result:
    print("Word: %s\t\t Start: %s\t\t End: %d" % (tk[0], tk[1], tk[2]))
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals
import sys
sys.path.append("../")
import jieba
jieba.load_userdict("userdict.txt")
import jieba.posseg as pseg

test_sent = "l+navtion+cf1 l-navtion-cf1 l-navtion-co1 l+navtion+cn2 l.navtion.cn2 l#navtion#cn2 l:navtion:cn2 l:navtion|cn2 l-navtion.co1 Edu Trust认证 Edu Trust认"
words = pseg.cut(test_sent)
for k, v in words:
    print(k, v)

w = jieba.cut(test_sent)
print(",".join(w))
# get data, 500 rows per batch
cursor.execute("select user_id from users where is_evil=1 limit 0,500")
allUser = cursor.fetchall()
if len(allUser) == 0:
    exit(0)
for eachUser in allUser:
    total += 1
    print 'evil user is processing...' + str(eachUser[0])
    cursor.execute('select content from post where user_id = %s', [eachUser[0]])
    allMsg = cursor.fetchall()
    x = [0 for i in range(0, vLen)]
    for eachMsg in allMsg:
        #print eachMsg[1]
        soup = BeautifulSoup(str(eachMsg[0]))
        plaintext = soup.get_text().strip()
        seg_list = pseg.cut(plaintext)
        # x is a word-frequency vector the size of the vocabulary, initialized to 0
        for w in seg_list:
            # tmp_word is the segmented token
            tmp_word = w.word.strip()
            tmp_flag = w.flag[0]
            if len(tmp_word) == 0 or tmp_flag != 'n':
                continue
            else:
                if tmp_word not in v:
                    continue
                else:
                    x[v.index(tmp_word)] += 1
    #x = [rate*1.0/(total+1) for rate in x]
    # classify with the previously trained model
    new_class = clf.predict(x)
import jieba.analyse
import jieba.posseg as peg

jieba.load_userdict('./new_words.txt')
words = peg.cut('事业编')
tag = str(list(words)).split('/')
print(tag)
def distance(self, text1, text2):
    # main entry point for similarity computation
    word_list1 = [word.word for word in pesg.cut(text1) if word.flag[0] not in ['w', 'x', 'u']]
    word_list2 = [word.word for word in pesg.cut(text2) if word.flag[0] not in ['w', 'x', 'u']]
    return self.similarity_cosine(word_list1, word_list2)
def cut_and_flag(HAN_SENTENCE):
    WORDS_AND_FLAGS = pseg.cut(HAN_SENTENCE)
    return WORDS_AND_FLAGS
def txt2label(txt, sfsfile=None, style='default'):
    '''Return a generator of HTS format label of txt.

    Args:
        txt: like raw txt "向香港特别行政区同胞澳门台湾同胞"
            or txt with prosody mark like "向#1香港#2特别行政区#1同胞#3澳门台湾#1同胞";
            punctuation is also allowed in txt
        sfsfile: absolute path of sfs file (alignment file). A sfs file example
            (time measured in 10e-7 second, so 12345678 means 1.2345678 seconds)
            --------
            239100 s
            313000 a
            323000 d
            400000 b
            480000 s
            ---------
            a stands for consonant
            b stands for vowel
            d stands for silence that is shorter than 100ms
            s stands for silence that is longer than 100ms
        style: label style, currently only support the default HTS format

    Return:
        A generator of phone label for the txt, convenient to save as a label file
    '''
    assert style == 'default', 'Currently only default style is support in txt2label'

    # del all Chinese punctuation
    # punctuation = "·!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
    # txt = re.sub(r'[%s]'%punctuation, '', txt)

    # delete all character which is not number && alphabet && chinese word
    txt = re.sub(r'(?!#)\W', '', txt)

    # If txt with prosody mark, use prosody mark,
    # else use jieba position segmetation
    if '#' in txt:
        words, poses, rhythms = _adjust(txt)
    else:
        txt = re.sub('[,.,。]', '#4', txt)
        words = []
        poses = []
        for word, pos in posseg.cut(txt):
            words.append(word)
            poses.append(pos[0])
        rhythms = ['#0'] * (len(words) - 1)
        rhythms.append('#4')

    syllables = txt2pinyin(''.join(words))

    phone_num = 0
    for syllable in syllables:
        phone_num += len(syllable)  # syllable is like ('b', 'a3')

    if sfsfile:
        phs_type = []
        times = ['0']
        with open(sfsfile) as fid:
            for line in fid.readlines():
                line = line.strip().rstrip('\n')
                assert len(line.split(' ')) == 2, 'check format of sfs file'
                time, ph = line.split(' ')
                times.append(int(float(time)))
                phs_type.extend(ph)
    else:
        phs_type = []
        for i, rhythm in enumerate(rhythms):
            single_word_pinyin = txt2pinyin(words[i])
            single_word_phone_num = sum(
                [len(syllable) for syllable in single_word_pinyin])
            phs_type.extend(['a'] * single_word_phone_num)
            if i != (len(rhythms) - 1) and rhythm == '#4':
                phs_type.append('s')
        '''
        phs_type = ['a'] * phone_num
        '''
        phs_type.insert(0, 's')
        phs_type.append('s')
        times = [0] * (len(phs_type) + 1)

    '''
    for item in words:
        print(item)
    print (words)
    print (rhythms)
    print (syllables)
    print (poses)
    print (phs_type)
    print (times)
    '''
    phone = tree(words, rhythms, syllables, poses, phs_type)
    return LabGenerator(phone, rhythms, times)
# Main routine
if __name__ == '__main__':
    # 1. input file
    data_xls = pd.read_excel('词云/法国新闻.xlsx')
    # 2. done this way because reading the columns directly was a bit awkward
    test_data = []
    for i in data_xls.index.values:  # get the row indexes and iterate over them
        # use i to fetch the specified columns of the row and convert them to a dict with to_dict
        row_data = data_xls.loc[i, ['链接', '新闻', '日期', '来源', '内容']].to_dict()
        test_data.append(row_data)
    print("最终获取到的数据是:{0}".format(test_data))

    # 3. extract place and organization names from the content
    for i in test_data:
        words = pseg.cut(i["内容"])
        i["地点"] = ""
        for word, flag in words:
            if (flag == 'ns' or flag == "nt"):
                print('%s, %s' % (word, flag))
                i["地点"] += word + "\n"

    # 4. save the extracted locations
    # create the workbook
    file_name = "涉侨资讯_慈善公益.xlsx"
    workbook = xlsxwriter.Workbook(file_name)
    # create the worksheet
    worksheet = workbook.add_worksheet('慈善公益')
    # write the header cells
    worksheet.write(0, 0, '链接')
    worksheet.write(0, 1, '新闻')
def _firstWordSegmentationWithPOS(cleaned_raw_data_dict: dict, tools: str = 'jieba'):
    assert tools in ('pkuseg', 'jieba')
    print("Chinese word segmenting and Pre-part-of-speech tagging using {}...".format(tools))
    word_seg_list_dict = defaultdict(list)
    word_seg_dict = {}
    pre_pos_list_dict = defaultdict(list)
    pre_pos_dict = {}
    for seq_num, string in cleaned_raw_data_dict.items():
        if tools == 'pkuseg':
            words = pseg.cut(string)
        elif tools == 'jieba':
            words = jseg.cut(string)
        spaceDetector = 0
        for word, flag in words:
            # For POS tagging "b"
            if len(word) >= 2 and word[-1] == '状' and word[-2:] != '症状':  # end with 状 but not 症状
                flag = 'b'
            if len(word) >= 2 and word[-1] == '性' and (word[-2:] != '毒性' and len(word) == 2):  # end with 性, and not 毒性
                flag = 'b'
            word_with_tag = word + '/' + flag
            if word == '\n':  # jieba will retain last \n as word
                continue
            if flag == 'm' and re.match(r"\w+\.$", word):
                # Split "3." ("3./m") to "3 ." and "3/m ./w"
                word_seg_list_dict[seq_num].append(word[:-1])
                pre_pos_list_dict[seq_num].append(word[:-1] + '/m')
                word = '.'
                word_with_tag = './w'
            if flag == 'nr':  # people name
                # print(word, flag)  # useful to find mis-classified name
                if len(word) >= 2 and word[0:2] in lastName:  # e.g. 司馬
                    word_seg_list_dict[seq_num].append(word[0:2])
                    pre_pos_list_dict[seq_num].append(word[0:2] + '/nr')
                    word = word[2:]
                    word_with_tag = word_with_tag[2:]
                    if len(word) == 2:  # only lastname
                        continue
                elif word[0] in lastName:
                    word_seg_list_dict[seq_num].append(word[0])
                    pre_pos_list_dict[seq_num].append(word[0] + '/nr')
                    if len(word) == 1:  # only lastname
                        continue
                    word = word[1:]
                    word_with_tag = word_with_tag[1:]
            word_seg_list_dict[seq_num].append(word)
            pre_pos_list_dict[seq_num].append(word_with_tag)
            # only work with jieba (pkuseg will change $$_ to $$&...)
            if word == '$' and spaceDetector == 0:
                spaceDetector += 1
            elif word == '$' and spaceDetector == 1:
                spaceDetector += 1
            elif word == '_' and spaceDetector == 2:
                spaceDetector = 0
                for _ in range(3):
                    word_seg_list_dict[seq_num].pop()
                    pre_pos_list_dict[seq_num].pop()
                word_seg_list_dict[seq_num].append('$$_')
                pre_pos_list_dict[seq_num].append('$$_')
                spaceDetector = 0
            else:
                spaceDetector = 0
    for seq_num in word_seg_list_dict.keys():
        word_seg_dict[seq_num] = " ".join(word_seg_list_dict[seq_num])
        pre_pos_dict[seq_num] = " ".join(pre_pos_list_dict[seq_num])
    return word_seg_dict, word_seg_list_dict, pre_pos_dict, pre_pos_list_dict
def cut_sentence(self, sent):
    words = []
    _words = pseg.cut(sent)
    for _word in _words:
        words.append(_word.word)
    return words
import jieba.posseg as seg
import codecs

cnt0 = {}
with codecs.open("./ind_keyword.ind", "r", "utf-8") as f:
    str0 = f.read()
    lis0 = str0.split("#")
    # print(lis0)
    for i in lis0:
        ls1 = seg.cut(i)
        for w in ls1:
            if w.flag in cnt0.keys():
                cnt0[w.flag] += 1
            else:
                cnt0[w.flag] = 1

cnt1 = {}
with codecs.open("./tensorflow/data/embedding/sgns.wiki.bigram-char", "r", "utf-8") as f:
    str0 = f.readline()
    n = int(str0.split(' ')[0])
    cnt00 = 0
    for i in range(0, n):
        str0 = f.readline()
        lis = []
        lis0 = str0.split(' ')
        if len(lis0) < 301:
            continue
        if not ('\u4e00' <= lis0[0] <= '\u9fff'):
            # print("jumped")
            continue
def get_word_objects(sentence):
    # Turn natural-language text into Word objects
    return [Word(word.encode('utf-8'), tag) for word, tag in pseg.cut(sentence)]
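# The Word constructor used above is not defined in this snippet. A minimal
# assumption is a simple two-field container such as a namedtuple; the field
# names below are illustrative only, not taken from the source.
from collections import namedtuple

Word = namedtuple('Word', ['token', 'pos'])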
import jieba
import jieba.posseg as pseg
import time

# sent = '明明是數字鎖,卻需要畫圖形,滑動解鎖竟然是將手機翻轉90度,圖形鎖則變成射擊到螢幕右上角。'
# wordlist = jieba.cut(sent, cut_all=True)
# print(" | ".join(wordlist))
# wordlist = jieba.cut(sent)
# print(" | ".join(wordlist))
# wordlist = jieba.cut_for_search(sent)
# print(" | ".join(wordlist))

start = time.time()
# words = pseg.cut("這隻程式可以幫我們把網站資料爬下來")  # jieba default mode
jieba.enable_paddle()  # enable paddle mode; supported since version 0.40, not available in earlier releases
words = pseg.cut("這隻程式可以幫我們把網站資料爬下來", use_paddle=True)  # paddle mode
for word, flag in words:
    print('%s %s' % (word, flag))
# print(words)
# print(flags)
end = time.time()
print(end - start)
def search_and_destory(file_path, library_path):
    words = pseg.cut('江桥收费站至中环路严重堵塞,大量外地车辆涌入上海')
    for word in words:
        print word.word, word.flag
#-*-coding:utf-8 -*-
## python2.7 bin/jieba_cli.py 我爱北京 我爱Clojure
import jieba.posseg as pseg
import sys
import json

str_arrays = sys.argv
str_arrays.pop(0)
print str_arrays
print json.dumps([[(word, flag) for word, flag in pseg.cut(words)] for words in str_arrays], ensure_ascii=False)
# -*- coding: utf-8 -*-
import os, sys
import jieba, codecs, math
import jieba.posseg as pseg

names = {}          # name dictionary (person -> occurrence count)
relationships = {}  # relationship dictionary
lineNames = []      # person names per paragraph

# count names
jieba.load_userdict("dict.txt")  # load the custom dictionary
with codecs.open("busan.txt", "r", "utf8") as f:
    for line in f.readlines():
        poss = pseg.cut(line)  # segment the line and return the POS of each token
        lineNames.append([])   # add a fresh name list for the newly read paragraph
        for w in poss:
            if w.flag != "nr" or len(w.word) < 2:
                continue  # skip tokens shorter than 2 characters or whose POS is not nr (person name)
            lineNames[-1].append(w.word)  # record this person for the current paragraph
            if names.get(w.word) is None:
                names[w.word] = 0
                relationships[w.word] = {}
            names[w.word] += 1  # increment this person's occurrence count

for name, times in names.items():
    print(name, times)
from jieba import posseg
import sys
import multiprocessing

input_file = sys.argv[1]
output_file = sys.argv[2]

stopword = set([x.strip() for x in open('stopword.txt', encoding='utf8').readlines()])
corpus = [x.strip() for x in open(input_file, encoding='utf8').readlines()]
corpus = [[y for y, z in posseg.cut(x) if z not in ['x', 'm', 'eng'] and y not in stopword] for x in corpus]
open(output_file, 'w', encoding='utf8').writelines([' '.join(x) + '\n' for x in corpus])
def test_thulac(text):
    words = pseg.cut(text)
    print("jieba分词:")
    for word, flag in words:
        print('%s_%s' % (word, flag))
    return
def get_features(self, string):
    word_list = [
        word.word for word in pseg.cut(string)
        if word.flag[0] not in ['u', 'x', 'w', 'o', 'p', 'c', 'm', 'q']
    ]
    return word_list
# coding: utf-8
import jieba.posseg as pseg

words = pseg.cut("15亿光年神秘太空信号王源粉丝")
for word, flag in words:
    print("%s %s" % (word, flag))
def entity_analysis(entity):
    db = neo_con
    words = entity.split(' ')
    if len(words) == 1:
        if is_loc(words[0]):
            return db.match_location4event_patient(entity)
        else:
            wordp = posseg.cut(words[0])
            for w in wordp:
                if w.flag in ['v', 'vd', 'vn', 'vg']:
                    return db.match_topic4event(entity)
                elif w.flag in ['nr']:
                    return db.match_patient_name(entity)
    elif len(words) == 2:
        isloc_dict = {}
        flag = 0
        for word in words:
            isloc_dict[word] = is_loc(word)
            if isloc_dict[word]:
                flag = 1
                if isloc_dict[words[0]]:
                    wordp = posseg.cut(words[1])
                    for w in wordp:
                        if w.flag in ['v', 'vd', 'vn', 'vg']:
                            return db.match_location_topic4event(words[0], words[1])
                        elif w.flag in ['m']:
                            return db.match_location_time4event_patient(words[0], words[1])
                        else:
                            gender = words[1].replace('性', '').replace('生', '')
                            return db.match_location_gender4patient(words[0], gender)
                else:
                    wordp = posseg.cut(words[0])
                    for w in wordp:
                        if w.flag in ['v', 'vd', 'vn', 'vg']:
                            return db.match_location_topic4event(words[1], words[0])
                        elif w.flag in ['m']:
                            return db.match_location_time4event_patient(words[1], words[0])
                        else:
                            gender = words[0].replace('性', '').replace('生', '')
                            return db.match_location_gender4patient(words[1], gender)
        if not flag:
            wordp = posseg.cut(words[0])
            for w in wordp:
                if w.flag in ['m']:
                    return db.match_name_time4location_event(words[1], words[0])
                else:
                    return db.match_name_time4location_event(words[0], words[1])
    elif len(words) == 3:
        loc = ''
        time = ''
        topic = ''
        for word in words:
            if is_loc(word):
                loc = word
                words.remove(word)
                break
        wordp = posseg.cut(words[0])
        for w in wordp:
            if w.flag in ['m']:
                return db.match_location_time_topic4patient(loc, words[0], words[1])
            else:
                return db.match_location_time_topic4patient(loc, words[1], words[0])
    else:
        answer = db.match_location4event_patient(words[0])
        if len(answer) == 0:
            answer = db.match_topic4event(words[0])
        return answer
import re
import jieba.posseg as psg
import numpy as np
import pandas as pd  # needed for pd.read_csv below

# Deduplicate: drop rows that are exact duplicates
reviews = pd.read_csv("../tmp/reviews.csv")
reviews = reviews[['content', 'content_type']].drop_duplicates()
content = reviews['content']

# Remove English letters, digits, and brand/product words.
# Since the reviews are mainly about Midea electric water heaters on JD, strip those terms as well.
strinfo = re.compile('[0-9a-zA-Z]|京东|美的|电热水器|热水器|')
content = content.apply(lambda x: strinfo.sub('', x))

# Word segmentation
worker = lambda s: [(x.word, x.flag) for x in psg.cut(s)]  # simple custom segmentation function
seg_word = content.apply(worker)

# Convert the words into data-frame form: one column for the word, one for the id of the
# review the word belongs to, and the last for the word's position within that review
n_word = seg_word.apply(lambda x: len(x))  # number of words in each review

n_content = [[x + 1] * y for x, y in zip(list(seg_word.index), list(n_word))]
index_content = sum(n_content, [])  # flatten the nested list: the review id for each word

seg_word = sum(seg_word, [])
word = [x[0] for x in seg_word]    # the word itself
nature = [x[1] for x in seg_word]  # its POS tag

content_type = [[x] * y for x, y in zip(list(reviews['content_type']), list(n_word))]
from collections import Counter
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import jieba
import jieba.posseg as pseg  # needed for pseg.cut below
jieba.load_userdict('symptom.txt')

total = []
wordtype = []
with open('segwords.txt', 'w') as f:
    with open('training.txt', 'r') as k:
        for i in k:
            #print i
            i = i.replace('"', '')
            i = i.replace("'", '')
            seg = pseg.cut(i.strip())
            for word, flag in seg:
                total.append(word)
                wordtype.append(flag)
                #f.write('\n'.join(seg))
                f.write(word + flag + "\n")

c = Counter(total)
with open('dicmap.txt', 'w') as f:
    for i in c.most_common():
        f.write('"' + i[0].decode('utf-8').encode('unicode_escape') + '": ' + str(i[1]) + ', ')
def term_segment(self, phrase):
    words = pos_seg.cut(phrase)
    for w, t in words:
        if "n" in t:
            yield w
# -*- coding: utf-8 -*-
import jieba.posseg as pseg

wordList = []
file = open(r'./true_short.txt.bak', 'r').read()
wfile = open('./true_short.txt', 'w')
words = list(pseg.cut(file))
for w in words:
    wordList.append(w.word)

# remove stopwords
stopwords = []
for line in open("stopword.txt"):
    line = line.strip('\n')
    line = line.strip(' ')
    stopwords.append(line)
print stopwords

for w in wordList:
    if w in stopwords:
        wordList.remove(w)
for w in wordList:
    wfile.write(w.encode('utf-8'))
jieba.load_userdict("userdict.txt")
import jieba.posseg as pseg

jieba.add_word('石墨烯')
jieba.add_word('凱特琳')
jieba.del_word('自定义词')

test_sent = ("李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿\n"
             "例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类\n"
             "「台中」正確應該不會被切開。mac上可分出「石墨烯」;此時又可以分出來凱特琳了。")

words = jieba.cut(test_sent)
print('/'.join(words))

print("=" * 40)

result = pseg.cut(test_sent)
for w in result:
    print(w.word, "/", w.flag, ", ", end=' ')

print("\n" + "=" * 40)

terms = jieba.cut('easy_install is great')
print('/'.join(terms))
terms = jieba.cut('python 的正则表达式是好用的')
print('/'.join(terms))

print("=" * 40)
# test frequency tune
testlist = [
    ('今天天气不错', ('今天', '天气')),
def evaluate_line(text):
    segwords = " ".join(
        [w.word for w in pseg.cut(text) if w.word not in stopwords])
    prediction = classifier.predict([segwords])
    return prediction[0][0]
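# evaluate_line assumes a module-level stopwords collection and a classifier whose
# predict() takes a list of space-joined texts and returns a list of label lists
# (so prediction[0][0] is the top label of the first text). A hedged stand-in that
# only illustrates the expected interface, not any particular library, is:
stopwords = set(line.strip() for line in open('stopwords.txt', encoding='utf8'))

class DummyClassifier(object):
    def predict(self, texts):
        # placeholder: a real model would return its predicted labels here
        return [['__label__unknown'] for _ in texts]

classifier = DummyClassifier()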
# write the root element
write_root = ElementTree.Element("documents")
for person in persons:
    refers = set()
    xml_doc = ElementTree.SubElement(write_root, "doc")
    name = person.find("name").text
    name = re.sub(r'(.*)', '', str(name))
    dis = person.find("dis").text
    refers.add(name)
    # put the name, with the parenthesized part removed, into the doc's name attribute
    xml_doc.set("name", name)
    # strip work titles (text wrapped in 《》)
    text = re.sub(r'《.*》', '', str(dis))
    words = pseg.cut(text)
    try:
        for w in words:
            if len(w.word) == 1:
                continue
            if str(w.flag) == "nr":
                refers.add(w.word)
    except:
        pass
    # remove the person's own name
    refers.remove(name)
    for refer in refers:
        xml_refer = ElementTree.SubElement(xml_doc, "refer")
        xml_refer.text = refer