Example #1
def cixing(old):
    org = old
    words = pseg.cut(org)
    qingli = "清理"
    try:
        while 1:
            w = words.next()
            k = words.next()
            # strip adjectives
            if (w.flag == 'a' or w.flag == 'ad' or w.flag == 'an' or w.flag == 'ag' or w.flag == 'al') and w.word != qingli.decode('utf-8'):
                old = string.replace(old,w.word,"")
                if k.flag == 'uj':
                    old = string.replace(old,k.word,"")  # replacing w.word again was a no-op; drop the trailing particle instead
    except StopIteration:
        print 'old'
    words = pseg.cut(org)

    try:
        w = words.next()
        while 1:
            w = words.next()
            k = words.next()
            # strip adjectives
            if (w.flag == 'a' or w.flag == 'ad' or w.flag == 'an' or w.flag == 'ag' or w.flag == 'al') and w.word != qingli.decode('utf-8'):
                old = string.replace(old,w.word,"")
                if k.flag == 'uj':
                    old = string.replace(old,k.word,"")  # replacing w.word again was a no-op; drop the trailing particle instead
    except StopIteration:
        print 'old'
    return old
Example #2
    def comsini(self,data1,data2):
        test_words={}
        all_words={}
        f1_text=data1
        f1_seg_list =pseg.cut(f1_text)
        for w in f1_seg_list:
            if 'n' in w.flag or 'eng' in w.flag :
                test_words.setdefault(w.word,0)
                all_words.setdefault(w.word,0)
                all_words[w.word]+=1

        ftest1_text = data2
        mytest1_words = copy.deepcopy(test_words)
        ftest1_seg_list =pseg.cut(ftest1_text)
        for w in ftest1_seg_list:
            if 'n' in w.flag or 'eng' in w.flag :
                if mytest1_words.has_key(w.word):
                    mytest1_words[w.word]+=1
        sampdata=[]
        test1data=[]
        for key in all_words.keys():
            sampdata.append(all_words[key])
            test1data.append(mytest1_words[key])
        test1simi=self.get_cossimi(sampdata,test1data)
        return test1simi
def generate_keywords():
    client = MongoClient('localhost', 27017)
    db= client['baidu']
    characteristic_keywords = db.characteristic_keywords
    file = open(url + '1')
    features = []
    lines = file.readlines()
    keywords_count = {}
    for line in lines:
        news = line.split()[3]
        words = pesg.cut(news)
        for word, flag in words:
            if flag not in features:
                features.append(flag)
    for feature in features:
        generate_characteristic_keywords_monogodb(feature)
    for line in lines:
        news = line.split()[3]
        words = pesg.cut(news)
        for w, flag in words:
            keywords_count[w] = 0
    for line in lines:
        news = line.split()[3]
        words = pesg.cut(news)
        for w, flag in words:
            keywords_count[w] = keywords_count[w] + 1
            if keywords_count[w] > 0:
                keywords_count[w] = -10000000
                characteristic_keywords.update({ 'feature': flag }, {'$push': { 'keywords': w}})
def jieba_cut():
	# process the pos_all_dict file
	fp_pos = open("hownet/pos_all_dict.txt", "r")   # original dictionary of positive words
	fp_pos_cut = codecs.open('hownet/pos_all_cut.txt', "w+", encoding='UTF-8')  # save the result to another file
	contents = fp_pos.readlines()
	for content in contents:
		word = content.decode("utf-8")  # decode to unicode
		word_tag = pseg.cut(word)
		str_tag = ""
		for tag in word_tag:
			str_tag += tag.word + '/' + tag.flag  # avoid str() on unicode
		p = re.compile(r'/x(.*)')
		str_tag = p.sub(r'\1', str_tag)   # strip the '/x' tag, keeping what follows it (group 1)
		fp_pos_cut.write(str_tag)
	fp_pos.close()
	fp_pos_cut.close()

	# process the neg_all_dict file
	fp_neg = open("hownet/neg_all_dict.txt", "r")   # original dictionary of negative words
	fp_neg_cut = codecs.open('hownet/neg_all_cut.txt', "w+", encoding='UTF-8')  # save the result to another file
	contents = fp_neg.readlines()
	for content in contents:
		word = content.decode("utf-8")  # decode to unicode
		word_tag = pseg.cut(word)
		str_tag = ""
		for tag in word_tag:
			str_tag += tag.word + '/' + tag.flag
		p = re.compile(r'/x(.*)')
		str_tag = p.sub(r'\1', str_tag)  # strip the '/x' tag, keeping what follows it (group 1)
		fp_neg_cut.write(str_tag)
	fp_neg.close()
	fp_neg_cut.close()
Example #5
    def st_parse(self, text, freq=.15, pos=.25, env=.4, le=.2):

        wdmap = {}

        idx = 0
        for w in pseg.cut(text.lower()):
            idx += 1
            if w.word not in self.model:
                continue
            if w.word not in wdmap:
                wdmap[w.word] = [0]*4
            wdmap[w.word][0] += freq/2 # frequency
            if idx < 5:
                wdmap[w.word][1] = pos # position weight (the pos argument was shadowed by the old loop counter)
            if len(w.word)>2:
                wdmap[w.word][3] = le # length

        for keytext in re.findall(u"(#.*?#)|(【.*?】)|(《.*?》)|(\".*?\")|(“.*?”)", text.lower()):
            for t in keytext:
                for w in pseg.cut(t):
                    if w.word not in self.model:
                        continue
                    # the key text may segment differently from the full text, so guard the lookup
                    wdmap.setdefault(w.word, [0]*4)[2] = env # environment

        return wdmap
def getWordList():
    weight = [0, 1, 2, 4, 6, 6, 10, 30, 40, 50, 60, 70];
    ratio = [1, 0.8, 0.7, 0.9];
    neededFlag = ['a', 'ad', 'an', 'i', 'l', 'n',
                  'nr', 'ns', 'nt'];
    os.chdir("1");
    text = readText("hudong_type_info.txt");
    hudong_type_words = pseg.cut(text)
    word_list1 = [];
    for w in hudong_type_words:
        if (w.flag in neededFlag) and (len(w.word) > 1):
                addWord([w.word, w.flag, 2 * weight[len(w.word)]], word_list1);
    source_list1 = ["baidu_info.txt", "hudong_zoom_info.txt", 
                    "iqili_tag_info.txt", "mtime_info.txt"];
    for sourName in source_list1:
        text = readText(sourName);
        retWord = pseg.cut(text)
        for w in retWord:
            if (w.flag in neededFlag) and (len(w.word) > 1):
                #if (not (w.word in ban)):
                addWord([w.word, w.flag, weight[len(w.word)] * ratio[1]], word_list1);
    word_list2 = [];
    source_list2 = ["baiduwiki_info.txt", "douban_info.txt",
                    "hudong_info.txt",
                    "wiki_info.txt"];
    for sourName in source_list2:
        text = readText(sourName);
        retWord = pseg.cut(text)
        for w in retWord:
            if (w.flag in neededFlag and (len(w.word) > 1)):
                #if (not (w.word in ban)):
                addWord([w.word, w.flag, weight[len(w.word)] * ratio[2]], word_list2);
    word_list3 = [];
    source_list3 = ["sogou_title_", "soso_title_"];
    for sourName in source_list3:
        for i in range(12):
            text = readText(sourName + str(i + 1) + ".txt");
            retWord = pseg.cut(text)
            for w in retWord:
                if (w.flag in neededFlag) and (len(w.word) > 1):
                #if (not (w.word in ban)):
                    addWord([w.word, w.flag, weight[len(w.word)] * ratio[3]], word_list3);
    final_word = word_list1;
    for w in word_list2:
        addWord(w, final_word);
    for w in word_list3:
        addWord(w, final_word);
    deleteWord(final_word, banList);
    topWords = getTopWords(final_word);                
    os.chdir("..");
    #for w in topWords:
   #     print w[0], w[1], w[2];
    fout = codecs.open("print.txt", "w", encoding = "utf-8");
    for w in final_word:
        fout.write(w[0] + "    ");
     #   fout.write("\n");
    #print "f**k";
    return topWords;
Example #7
def test_pos():
    s = u'是谁呢'
    assert(u'是谁'== normal_pos(s))
    s = u'你会讲英语吗'
    assert(u'你会讲英语' == normal_pos(s))
    s = u'_2005年我们出去玩2,_ 然后聘情况!知道道理5abc如何走*。这么说不 *'
    print list(pseg.cut(s))
    s = u'户外活动有哪些'
    print list(pseg.cut(s))
    s = u'知道这条路怎么走吗'
    print list(pseg.cut(s))
    s = u'小突想知道这条路怎么走'
    print list(pseg.cut(s))
Example #8
def testcase():
    pattern = {}
    sh = xlrd.open_workbook(u'坚守模式.xls').sheet_by_index(0)
    for r in range(sh.nrows):
        for c in range(2,5):
            value = sh.cell_value(r, c)
            words = '\t'.join([word for word, tag in ps.cut(value, HMM=True)])
            tags = '\t'.join([tag for word, tag in ps.cut(value, HMM=True)])
            pattern[tags] = pattern.get(tags, []) + [words]
        #print ' '.join(['%s/%s' % (word, tag) for word, tag in ps.cut(' '.join(sh.row_values(r)[2:]))])
    for p, info in pattern.iteritems():
        print p
        print  '\n'.join(list(set(info)))
Example #9
def fenci(Num, segbook, typenum, segtables, sourcename):
	data = xlrd.open_workbook(sourcename)
	table = data.sheets()[typenum]
	nrows = table.nrows
	ncols = table.ncols
	row=1
	col=1
	ls=[]
	lsw=[]
	if ((Num%8)%4)%2==1:
		jieba.load_userdict('userdict.txt')
	while row<nrows :
		col=1
		length=0
		cell = table.cell(row,col).value
		s=cell
		ls=[]
		lsw=[]
		seg_list = pseg.cut(s.decode("utf-8"))
		for w in seg_list:
			length+=1
		if length<50:
			if ((Num%8)%4)/2==0:
				seg_list = pseg.cut(s.decode("utf-8"))
				for w in seg_list:
					segtables[typenum].write(row,col,w.word)
					col+=1
					segtables[typenum].write(row,col,w.flag)
					col+=1
					
			else:
				cell = table.cell(row,col).value
				s=cell
				seg_list = pseg.cut(s.decode("utf-8"))
				for ww in seg_list:
					ls.append(ww.flag)
					lsw.append(ww.word)
				for i in range(length):
					if i-1>0:
						if ls[i-1]=='uj' and ls[i]!='n':
							ls[i]='n'
					if i-1>0 and i-2>=0:
						if (ls[i-2]=='n' or ls[i-2]=='nz' or ls[i-2]=='vn' or ls[i-2]=='ng' or ls[i-2]=='nl') and (ls[i-1]=='d' or ls[i-1]=='vd' or ls[i-1]=='ad' or ls[i-1]=='zg') and ls[i]!='a':
							ls[i]='a'
				for j in range(length):
					segtables[typenum].write(row,col,lsw[j])
					col+=1
					segtables[typenum].write(row,col,ls[j])
					col+=1
		row+=1	
	segbook.save('Segmentation.xls')
Example #10
def My_make_word_could(fileStr,outPNGStr):

    jieba.load_userdict("ap_dict.txt")
    STOP_WORD = set()
    stopword_file = open("stopwords.txt")
    for each_line in stopword_file:
        each_line_list = pseg.cut(each_line)
        for elem in each_line_list:
            STOP_WORD.add(elem.word)
        STOP_WORD.add(each_line.strip().decode('utf-8'))
    stopword_file.close()

    ##-----------------------------------------------------cut and count word freq------------------------------------
    word_freq = {}
##    fileStr = "kouzhao.txt"
    raw_file = open(fileStr)
    for line in raw_file:
        seg_list = pseg.cut(line)
        for ele in seg_list:
            words = ele.word.strip()
    ##        print words in STOP_WORD
            if ((ele.flag == 'n' or ele.flag == 'a' ) and (words not in STOP_WORD)):
                if(word_freq.has_key(words)):
                    word_freq[words] += 1
                else:
                    word_freq[words] = 1
    raw_file.close()

    ##---------------------------------------------sort the result--------------------------
    paixu= sorted(word_freq.iteritems(), key=lambda d:d[1], reverse = True)
    paixu_tiqu=paixu[0:25]
    print "over"


    ##for (k,v) in word_freq.items():
    ##    if v==1:
    ##        del word_freq[k]
    ##    print k,v
    ##for (k,v) in word_freq.items():    
    ##    print k,v
    ##for item in word_freq.keys():
    ##    print item
    ##for (k,v) in (dict (paixu_tiqu)).items():
    ##    print k,v
    ##--------------------------------------------make word cloud --------------------------------   
    tags = make_tags(dict(paixu_tiqu))
    ##print tags
##    outPNGStr = 'kouzhao.png'
    create_tag_image(tags, outPNGStr, size=(2000, 1600), fontname='haokan.ttf',fontzoom=4)
    print "all over"
def cuttest(sent):
    result_arr = []
    words_use = pseg.cut(sent)  # use the argument, not the global test_sent
    for word_use in words_use:
        result_arr.append({"pos":word_use.flag, "term":word_use.word})
    print("no ckip")
    return result_arr
Example #12
def type_features1(word):
    # 'word' is actually a path to a text file
    with open(word) as f:
        sentence = f.read()
    wordlist = pseg.cut(sentence)
    for w in wordlist:
        if w.flag.startswith('n'):
            return {'firstn': w.word}
Example #13
    def add_text_jieba(self, content):
        term_map = {}
        #jieba.enable_parallel(4)
        #tokens = jieba.cut(content)
        #for fet in tokens:
        words = pseg.cut(content)
        for w in words:
            fet = w.word
            pos = w.flag
            #if not pos in pos_list:
                #continue
            if pos != 'n' and pos != 'v' and pos != 'vn':
                continue

            #u0 = fet[0]
            #if not is_chinese_word(u0) :
                #continue
            if len(fet) < 2:
                continue

            #print fet, pos
            term_id = self.add_term(fet)

            if term_id in term_map:
                term_map[term_id] += 1
            else:
                term_map[term_id] = 1

        return term_map
def getWordsCount(in_file,out_file):
    # read the input file
    # lib = load_workbook(file, use_iterators = True)
    # create a new excel workbook
    wt_wb = Workbook(write_only=True)
    wt_ws = wt_wb.create_sheet()
    word_all=''
    fenci_list=[]
    dis_list = []
    words_sum=[]

    lib = csv.reader(file(in_file, 'rb'))
    # segment the documents
    for row in lib:
        fenci_list_p = list(pseg.cut(row[0]))

        for w in fenci_list_p:
            word = [w.word,w.flag]
            # print word
            words_sum.append(word)
    # build the distinct word list
    for word in words_sum:
        if word not in dis_list:
            dis_list.append(word)
    # count word frequencies
    wt_ws.append(['words','notes','count'])
    for word in dis_list:
        count = words_sum.count(word)
        word_c = [word[0],word[1],count]
        wt_ws.append(word_c)
    # save the excel file
    wt_wb.save(out_file)
Example #15
def tokenize(text):
    docid, body = text.split('\t', 1)
    items = []
    for word,flag in pseg.cut(body):
        items.append('%s/%s'%(word,flag))
    result = "%s\t%s"%(docid, ' '.join(items))
    return result
Example #16
def getWordsCount(file,sheets_name,anlysis_row_nm,out_file):
    # read the excel file
    lib = load_workbook(file, use_iterators = True)
    # create a new excel workbook
    wt_wb = Workbook(write_only=True)
    wt_ws = wt_wb.create_sheet()
    word_all=''
    fenci_list=[]
    dis_list = []
    words_sum=[]
    # segment the documents
    for row in lib[sheets_name].iter_rows():
        # print row[1].value
        fenci_list = list(jieba.cut(row[anlysis_row_nm].value, cut_all=False))
        fenci_list_p = list(pseg.cut(row[anlysis_row_nm].value))

        for w in fenci_list_p:
            word = [w.word,w.flag]
            # print word
            words_sum.append(word)
    # build the distinct word list
    for word in words_sum:
        if word not in dis_list:
            dis_list.append(word)
    # count word frequencies
    wt_ws.append(['words','notes','count'])
    for word in dis_list:
        count = words_sum.count(word)
        word_c = [word[0],word[1],count]
        wt_ws.append(word_c)
    # save the excel file
    wt_wb.save(out_file)
Example #17
    def segment(self, text, lower=True, use_stop_words=True, use_speech_tags_filter=False):
        """对一段文本进行分词,返回list类型的分词结果

        Keyword arguments:
        lower                  -- 是否将单词小写(针对英文)
        use_stop_words         -- 若为True,则利用停止词集合来过滤(去掉停止词)
        use_speech_tags_filter -- 是否基于词性进行过滤。若为True,则使用self.default_speech_tag_filter过滤。否则,不过滤。    
        """
        text = util.as_text(text)
        jieba_result = pseg.cut(text)

        if use_speech_tags_filter:
            jieba_result = [w for w in jieba_result if w.flag in self.default_speech_tag_filter]
        else:
            jieba_result = [w for w in jieba_result]

        # remove special symbols (words tagged 'x')
        word_list = [w.word.strip() for w in jieba_result if w.flag != 'x']
        word_list = [word for word in word_list if len(word) > 0]

        if lower:
            word_list = [word.lower() for word in word_list]

        if use_stop_words:
            word_list = [word.strip() for word in word_list if word.strip() not in self.stop_words]

        return word_list
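A self-contained sketch of the same segmentation flow as the segment() method above, for trying it outside the class; the STOP_WORDS and SPEECH_TAGS values here are illustrative assumptions, not the original class attributes.

# Standalone sketch mirroring segment(); STOP_WORDS and SPEECH_TAGS are
# illustrative assumptions, not the attributes of the original class.
import jieba.posseg as pseg

STOP_WORDS = {u'的', u'了', u'是'}
SPEECH_TAGS = {'n', 'ns', 'nt', 'v', 'vn'}

def simple_segment(text, lower=True, use_stop_words=True, use_speech_tags_filter=False):
    words = list(pseg.cut(text))
    if use_speech_tags_filter:
        words = [w for w in words if w.flag in SPEECH_TAGS]
    word_list = [w.word.strip() for w in words if w.flag != 'x']
    word_list = [word for word in word_list if len(word) > 0]
    if lower:
        word_list = [word.lower() for word in word_list]
    if use_stop_words:
        word_list = [word for word in word_list if word not in STOP_WORDS]
    return word_list

print(simple_segment(u'我今天在北京大学读书', use_speech_tags_filter=True))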
    def run(self, textLine):    

        array = pseg.cut(textLine)
        tag ='O'
        inner = 0
        words =[]
        line = ""

        for w in array:
            words.append(w)

        for i in range(0, len(words)):

            if words[i].word == u'【':
                inner =1
                continue
            if words[i].word == u'】':
                tag = 'O'
                inner = 0
                continue
            if inner == 1:
                if i+1 < len(words) and words[i-1].word ==  u'【' and words[i+1].word == u'】':
                    tag = 'S'
                elif words[i-1].word == u'【':
                    tag = 'B'
                elif i+1 < len(words) and words[i+1].word == u'】':
                    tag = 'E'
                else:
                    tag = 'I'
            line = line + words[i].word + '\t' + words[i].flag + '\t' + tag  + '\n'
        if inner == 0:
            print line

        return True
def passage_second_level_classify(content):
    """
    given a passage content, return its second level class
    :param content:
    :return: a topic list with probability
    """
    first_class = passage_first_level_classify(content)
    print first_class
    lda_model = gensim.models.LdaModel.load('%s/wechat_data/lda_in_classify/%s.model' % (apath, first_class))
    word_list = []
    words = pseg.cut(content)
    for item in words:
        if item.flag in [u'n', u'ns']:
            word_list.append(item.word)
    train_set = [word_list]
    dic = gensim.corpora.Dictionary(train_set)
    corpus = [dic.doc2bow(text) for text in train_set]
    doc_lda = lda_model.get_document_topics(corpus)
    count = 0
    # for j in lda_model.print_topics(20):
    #     print count, j
    #     count += 1
    # print doc_lda
    topic_list = []
    for i in lda_model[corpus]:
        for k in i:
            print lda_model.print_topic(k[0], 7), k[1]
            topic_list.append(
                {'topic_tag': u'%s-%s' % (first_class, k[0]), 'topic_content': lda_model.print_topic(k[0], 7),
                 'topic_prob': k[1]})
    return topic_list
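A hedged driver for passage_second_level_classify above; it assumes the gensim LDA models and passage_first_level_classify referenced inside the function are available in the running project, and the sample passage is purely illustrative.

# Illustrative only: relies on the LDA models and helpers referenced above.
if __name__ == '__main__':
    sample_content = u'某地今日举办大型公益活动,吸引了大量市民参与'  # made-up sample passage
    topics = passage_second_level_classify(sample_content)
    for t in sorted(topics, key=lambda x: x['topic_prob'], reverse=True):
        print t['topic_tag'], t['topic_prob']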
def postagger(sentence):
	pos_data = pseg.cut(sentence)
	pos_list = []
	for w in pos_data:
		pos_list.append((w.word, w.flag))
	#print pos_list[:]
	return pos_list
Example #21
 def __ansj_seg(self, content, tool = 'ansj_seg'):
     """ 默认使用ansj_seg分词工具 """
     if tool == 'ansj_seg':
         ws = CrfppUtil.ansj_seg.cut(content)
         return ws
     else:
         return pseg.cut( content )
Example #22
def jieba_pseg(fname,fenci_fname, pos_fname, tag_fname):
    f1 = open(fenci_fname,'w')
    f2 = open(pos_fname,'w')
    f3 = open(tag_fname, 'w')
    with open(fname) as xs:
        for l in xs.readlines():
            l = l.strip()
            res = pseg.cut(l)
            token_list = []
            pos_list = []
            tag_list = []
            for token, pos in res:
                token_list.append(token)
                pos_list.append(pos)
                tag = token + '/' + pos
                tag_list.append(tag)
            token_str = ' '.join(token_list)
            pos_str = ' '.join(pos_list)
            tag_str = ' '.join(tag_list)
            f1.write(token_str.encode('utf8') + '\n')
            f2.write(pos_str.encode('utf8') + '\n')
            f3.write(tag_str.encode('utf8') + '\n')
    f1.close()
    f2.close()
    f3.close()
Example #23
File: temp.py Project: mindawei/p2p
def extractEntity():
    db = client.holmesdb
    t_news = db.t_news_di
    res_list = t_news.find()
    last_name_dict = getLastNameDict()

    ntoken_dict = {}
    people_dict = {}
    row_cnt = 0
    for res in res_list:
        row_cnt += 1
        title = res["title"]
        content = res["content"]
        doc = myio.handleContent(title) + " " + myio.handleContent(content)
        words = pseg.cut(doc)
        for (word, flag) in words:
            if flag.find("n") != -1:
                print word, flag
                word1 = word[0].encode("utf-8")
                word2 = word[:2].encode("utf-8")
                if word1 in last_name_dict or word2 in last_name_dict:
                    #print word[0], word[:2]
                    people_dict[word] = people_dict.setdefault(word, 0) + 1
                else:
                    #print w.word, w.flag
                    ntoken_dict[word] = ntoken_dict.setdefault(word, 0) + 1
    ntoken_list = sorted(ntoken_dict.items(), lambda a, b: -cmp(a[1], b[1]))
    people_list = sorted(people_dict.items(), lambda a, b: -cmp(a[1], b[1]))
def read_sentences(sentence_path) :
	with codecs.open(sentence_path, 'r', 'gb18030') as fo :
		sentences = [line.strip().split('\t') for line in fo.readlines()]
	for sentence in sentences :
		words = pseg.cut(sentence[1])
		for word, flag in words :
			print word.encode('utf8'), flag
    def __is_clause_pattern3(self, the_clause, seg_result):
        for a_phrase in self.__phrase_dict:
            keys = a_phrase.keys()
            to_compile = a_phrase["key"].replace("……", "[\u4e00-\u9fa5]*")

            if "start" in keys:
                to_compile = to_compile.replace("*", "{" + a_phrase["start"] + "," + a_phrase["end"] + "}")
            if "head" in keys:
                to_compile = a_phrase["head"] + to_compile

            match = re.compile(to_compile).search(the_clause)
            if match is not None:
                can_continue = True
                pos = [flag for word, flag in posseg.cut(match.group())]
                if "between_tag" in keys:
                    if a_phrase["between_tag"] not in pos and len(pos) > 2:
                        can_continue = False

                if can_continue:
                    for i in range(len(seg_result)):
                        if seg_result[i].word in match.group():
                            try:
                                if seg_result[i + 1].word in match.group():
                                    return self.__emotional_word_analysis(
                                        a_phrase["key"] + ":" + match.group(), a_phrase["value"],
                                        [x for x, y in seg_result], i)
                            except IndexError:
                                return self.__emotional_word_analysis(
                                    a_phrase["key"] + ":" + match.group(), a_phrase["value"],
                                    [x for x, y in seg_result], i)
        return ""
def human2machine(msg):
    if not isinstance(msg, unicode):
        msg = msg.decode('utf-8')

    #: process with some hard coded translations first
    for k, v in h_d.items():
        if k in msg.split('@3bugs')[-1]:
            return v[0]

    action = None
    action_type = None
    obj = None
    repeated_duration = 0

    import jieba.posseg as pseg
    seg = classify(pseg.cut(msg), l_d.keys())

    for v in seg['verb']:
        action = ch_d.get(v, None)[0] or action
    action_type = action_type_d.get(action, None)[0]

    for n in seg['noun']:
        obj = ch_d.get(n, None)[0] or obj

    repeated_duration = find_repeated(seg) or 0

    if action and (action_type is not None) and \
            (action == 'capture' or obj):
        return action, action_type, obj, repeated_duration
    else:
        logger.info('Found unknown command %s' % msg)
        logger.debug('%s %s %s %s' % (str(action), str(action_type), str(obj),
                str(repeated_duration)))
        logger.debug(seg)
        return None
Example #27
def get_content(biz, sn):
    try:
        string = ""
        with open("../public/" + biz + "/" + sn + ".txt", 'r') as f:
            for l in f.readlines():
                l = l.strip('\n')
                if (l == ""):
                    continue
                if (l.find(biz) != -1):
                    info = l.split(',')
                    title = ','.join(info[4:-2])
                    author = info[-1]
                    string = title
                    print title, author
                    continue
                string += l
            seg_list = pseg.cut(string)
        string = ""
        for word, flag in seg_list:
            if (flag in save):
                string += unicode.encode(word, 'utf-8') + ' '
        return string + '\n'
    except:
        print("error")
        return None
Example #28
def normal_pos(ins):
    if ins.strip() == '':
        return ins
    #s = q2b(ins)
    s = ins

    words = ['']
    for seg, zh in find_zh(s):
        #seg = zh_s.strip()
        if seg == '':
            continue
        if not zh:
            words.append(seg)
            continue
        for w in pseg.cut(seg):
            t = (w.word, w.flag)
            if any(t[1].find(fi) >= 0 for fi in reserve_pos):
                words.append(t[0])
            elif any(t[1].find(fi) >= 0 for fi in filter_pos) \
                    or (t[1].find('d') >= 0 and all(t[0].find(ig) < 0 for ig in ignore_pos)):
                #if words[-1] == ' ':
                #    continue
                #else:
                #    words.append(' ')
                #print w
                continue
            else:
                words.append(t[0])
    #print 'BEGPOS', ''.join(words), 'END'
    return merge_zh(''.join(words))
Example #29
    def testReference(self):
        import jieba # May fail to load jieba
        jieba.initialize(usingSmall=False)
        import jieba.posseg as pseg
        pwords = []
        content = u'上海今日新确诊3例人感染H7N9禽流感病例'
        _ = """
ns 上海
t 今日
a 新
v 确诊
m 3
n 例人
v 感染
eng H7N9
n 禽流感
n 病例
"""
        content = u'李克强:在半岛挑事无异于搬石头砸自己脚'
        _ = """
nr 李克强
p 在
n 半岛
v 挑事
l 无异于
v 搬
l 石头砸
r 自己
n 脚
"""
        for word in pseg.cut(content):
            print word.flag, word.word
Example #30
def is_question(s):
    s = s.strip()
    if s == '':
        return False

    cuts = list(pseg.cut(s))
    pos = [w.word+w.flag for w in cuts]
    pos_set = set(pos)
    words = [w.word for w in cuts]
    flags = [w.flag for w in cuts]
    
    if u'是v' in pos_set and u'还是c' in pos_set:
        return True
    if pos[-1] == u'不d':
        return True
    if len(pos_set & questions_pos) > 0:
        return True
    sel = next((x for x in range(len(pos)) if pos[x] in [u'不d',u'还是c']), 0)
    if sel > 0:
        p1, p2 = set(pos[0:sel]), set(pos[sel+1:])
        if len(p1 & p2) > 0:
            return True
    sel = next((x for x in range(len(words)) if words[x] == u'不'), 0)
    if 0 < sel < len(words) - 1 and words[sel-1] == words[sel+1]:  # guard both ends to avoid IndexError
        return True
    
    return False
Example #31
def get_overseas_exp():
    select_sql = "select * from teacherdata_info"

    teacher_list = dbs.getDics(select_sql)
    print(len(teacher_list))
    extractor = Extractor()

    jieba.load_userdict("E:\\shixi\\justcoding\\extract_v1.0\\user_dict.txt")

    result_list = []
    w_list = []
    for teacher in teacher_list:
        if re.search(r'cksp\.eol\.cn', teacher["homepage"]) is not None:
            info_dict = eval(teacher["info"])
            extractor.set_text(info_dict["个人简介"])
        else:
            try:
                info = eval(teacher['info'])
                person_info = "".join(list(info.values()))
            except Exception as e:
                person_info = teacher['info']
            if person_info is None:
                continue
            extractor.set_text(person_info)
            reList = [r'教育经历|学习经历|教育背景', r'个人简介|个人简历', teacher["name"]]
            extractor.cut_blocks(reList)

        extractor.compute_gravity()
        sentences = extractor.filter_sentence()
        if sentences is None:
            continue
        label = ["ns", "nt"]
        description = ""
        wo_list = []
        for sentence in sentences:
            if re.search(r'留学', sentence):
                description = description + ";" + sentence
                continue
            words = pseg.cut(sentence)

            # for w in words:
            #     if w.flag in label and re.search(r'大学|学院|院', w.word) and school_dict.get(w.word):
            #         description = description + ";" + sentence
            #         break
            words = [w for w in words if w.flag in label]
            if len(words) > 0:
                description = description + ";" + sentence
                wo_list = [w.word for w in words]

        if not description == "":
            w_list.extend(wo_list)
            result_list.append(("-".join(wo_list), teacher['id'], description.lstrip(";")))

    print(len(result_list))
    fw = open("5.csv", "a+", encoding="utf-8")
    for i in result_list:
        fw.write("%s,%s,%s\n" % i)
    fw.close()

    print(len(w_list))
    fw1 = open("6.csv", "a+", encoding="utf-8")
    for i in w_list:
        fw1.write("%s\n" % i)
    fw1.close()
Example #32
def gen_data(graph_path, lda_path, user_lim=200, user_wb_lim=200):

    import jieba
    import jieba.posseg as pseg
    jieba.load_userdict(u"/etc/jieba/jieba.dic")

    G = nx.Graph()
    ldaf = open(lda_path, 'w')
    ldaf.write("%d\n" % user_lim)
    ucnt = 0
    for item in Weibo.objects.values('owner').annotate(cnt=Count('owner')):
        if item['cnt']>450:
            user = Account.objects.get(id=item['owner'])
            logging.info(u'%5d Dealing with %s' % (ucnt, user))
            logging.info(u'Current graph:%d nodes and %d edges' % (G.number_of_nodes(), G.number_of_edges()))
            user_words = []
            for wb in user.ownweibo.order_by("-created_at").all()[:user_wb_lim]:
                #filter(retweeted_status__exact=None).all():
                text = wb.text.lower()
                #TODO
                #if wb.retweeted_status:
                #   text = wb.retweeted_status.text.lower() + text

                text = re.sub("@[^\s@:]+", "", text)
                text = re.sub(u"http://t.cn[^ ]*", u"", text)
                text = re.sub(u"\[[^ ]{1,3}\]", u"", text)
                for word in re.findall(u"【.+?】|#.+?#|《.+?》|“.+?”|\".+?\"", text):
                    for w in pseg.cut(word):
                        if len(w.word)<2 or w.word in Config.STOP_WORDS or 'n' not in w.flag:
                            continue
                        wd = w.word.encode('utf-8')
                        if G.has_node(wd) and 'weight' in G.node[wd]:
                            G.node[wd]['weight'] += 1.0
                        else:
                            G.add_node(wd, weight=1.0)

                wb_words = []
                for w in pseg.cut(text):
                    if len(w.word)>1 and 'n' in w.flag and w.word not in Config.STOP_WORDS:
                        wb_words.append(w.word.encode('utf-8'))
                if not wb_words:
                    continue
                for w1, w2, w3 in zip(wb_words[:-2], wb_words[1:-1], wb_words[2:]):
                    if not G.has_edge(w1, w2):
                        G.add_edge(w1, w2, weight=1.0)
                    else:
                        G[w1][w2]['weight'] += 1.0
                    if not G.has_edge(w1, w3):
                        G.add_edge(w1, w3, weight=1.0)
                    else:
                        G[w1][w3]['weight'] += 1.0
                user_words.extend(wb_words)

            if not user_words:
                continue
            ldaf.write(' '.join(user_words)+'\n')
            ucnt += 1
            if ucnt>=user_lim:
                break

    if ucnt<user_lim:
        logging.error("no enough docs, %d/%d" % (ucnt, user_lim))

    if graph_path:
        nx.write_yaml(G, graph_path, encoding='UTF-8')
    ldaf.close()

    return G
Example #33
    wyz left quotation mark, full-width: “ ‘ 『
    wyy right quotation mark, full-width: ” ’ 』
    wj period, full-width: 。
    ww question mark, full-width: ? half-width: ?
    wt exclamation mark, full-width: ! half-width: !
    wd comma, full-width: , half-width: ,
    wf semicolon, full-width: ; half-width: ;
    wn enumeration comma, full-width: 、
    wm colon, full-width: : half-width: :
    ws ellipsis, full-width: …… …
    wp dash, full-width: —— -- ——- half-width: — —-
    wb percent/permille sign, full-width: % ‰ half-width: %
    wh unit symbol, full-width: ¥ $ £ ° ℃ half-width: $
"""

import jieba.posseg as pseg
words = pseg.cut("我很饿,你知道吗?你有上过马来西亚大学吗?")

for word, flag in words:
    print('%s %s' % (word, flag))

#----------------------------------------

#return each word's start and end position in the original text

import jieba
result = jieba.tokenize(u'我还是围绕着想吃什么')

for tk in result:
    print("Word: %s\t\t Start: %s\t\t End: %d" % (tk[0], tk[1], tk[2]))
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals
import sys
sys.path.append("../")
import jieba
jieba.load_userdict("userdict.txt")
import jieba.posseg as pseg
test_sent = "l+navtion+cf1 l-navtion-cf1 l-navtion-co1 l+navtion+cn2 l.navtion.cn2 l#navtion#cn2 l:navtion:cn2 l:navtion|cn2 l-navtion.co1 Edu Trust认证 Edu Trust认"
words = pseg.cut(test_sent)
for k,v in words:
    print(k,v)
w = jieba.cut(test_sent)
print(",".join(w))
Example #35
#get data, 500 per time
cursor.execute("select user_id from users where is_evil=1 limit 0,500")
allUser = cursor.fetchall()
if len(allUser) == 0:
    exit(0)
for eachUser in allUser:
    total += 1
    print 'evil user is processing...' + str(eachUser[0])
    cursor.execute('select content from post where user_id = %s',[eachUser[0]])
    allMsg = cursor.fetchall()
    x = [0 for i in range(0, vLen)]
    for eachMsg in allMsg:
        #print eachMsg[1]
        soup = BeautifulSoup(str(eachMsg[0]))
        plaintext = soup.get_text().strip()
        seg_list = pseg.cut(plaintext)
        # x is a term-frequency list the size of the vocabulary, initialized to 0
        for w in seg_list:
            # tmp_word is the segmented token
            tmp_word = w.word.strip()
            tmp_flag = w.flag[0]
            if len(tmp_word) == 0 or tmp_flag != 'n':
                continue
            else:
                if tmp_word not in v:
                    continue
                else:
                    x[v.index(tmp_word)] += 1
    #x = [rate*1.0/(total+1) for rate in x]
    # classify with the previously trained model
    new_class =  clf.predict(x)
Example #36
import jieba.analyse
import jieba.posseg as peg
jieba.load_userdict('./new_words.txt')

words = peg.cut('事业编')
tag = str(list(words)).split('/')
print(tag)
 def distance(self, text1, text2):#相似性计算主函数
     word_list1=[word.word for word in pesg.cut(text1) if word.flag[0] not in ['w','x','u']]
     word_list2=[word.word for word in pesg.cut(text2) if word.flag[0] not in ['w','x','u']]
     return self.similarity_cosine(word_list1,word_list2)
Example #38
def cut_and_flag(HAN_SENTENCE):
    WORDS_AND_FLAGS = pseg.cut(HAN_SENTENCE)
    return WORDS_AND_FLAGS
Example #39
def txt2label(txt, sfsfile=None, style='default'):
    '''Return a generator of HTS format label of txt.
    
    Args:
        txt: like raw txt "向香港特别行政区同胞澳门台湾同胞"
             or txt with prosody marks like "向#1香港#2特别行政区#1同胞#3澳门台湾#1同胞",
             punctuation is also allowed in txt
        sfsfile: absolute path of sfs file (alignment file). A sfs file
            example(measure time by 10e-7 second, 12345678 means 1.2345678
            second)
            --------
            239100 s 
            313000 a 
            323000 d
            400000 b 
            480000 s 
            ---------
            a stands for consonant
            b stands for vowel
            d stands for silence that is shorter than 100ms
            s stands for silence that is longer than 100ms
        style: label style; currently only the default HTS format is supported

    Return:
        A generator of phone labels for the txt, convenient for saving as a label file
    '''
    assert style == 'default', 'Currently only the default style is supported in txt2label'

    # del all Chinese punctuation
    # punctuation = "·!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
    # txt = re.sub(r'[%s]'%punctuation, '', txt)

    # delete every character that is not a digit, a letter, or a Chinese character (the '#' prosody marks are kept)
    txt = re.sub(r'(?!#)\W', '', txt)

    # If the txt carries prosody marks, use them,
    # else use jieba POS segmentation
    if '#' in txt:
        words, poses, rhythms = _adjust(txt)
    else:
        txt = re.sub('[,.,。]', '#4', txt)
        words = []
        poses = []
        for word, pos in posseg.cut(txt):
            words.append(word)
            poses.append(pos[0])
        rhythms = ['#0'] * (len(words) - 1)
        rhythms.append('#4')

    syllables = txt2pinyin(''.join(words))

    phone_num = 0
    for syllable in syllables:
        phone_num += len(syllable)  # syllable is like ('b', 'a3')

    if sfsfile:
        phs_type = []
        times = ['0']
        with open(sfsfile) as fid:
            for line in fid.readlines():
                line = line.strip().rstrip('\n')
                assert len(line.split(' ')) == 2, 'check format of sfs file'
                time, ph = line.split(' ')
                times.append(int(float(time)))
                phs_type.extend(ph)
    else:
        phs_type = []
        for i, rhythm in enumerate(rhythms):
            single_word_pinyin = txt2pinyin(words[i])
            single_word_phone_num = sum(
                [len(syllable) for syllable in single_word_pinyin])
            phs_type.extend(['a'] * single_word_phone_num)
            if i != (len(rhythms) - 1) and rhythm == '#4':
                phs_type.append('s')
        '''
        phs_type = ['a'] * phone_num
        '''
        phs_type.insert(0, 's')
        phs_type.append('s')
        times = [0] * (len(phs_type) + 1)
    '''
    for item in words:
        print(item)

    print (words)
    print (rhythms)
    print (syllables)
    print (poses)
    print (phs_type)
    print (times)
    '''

    phone = tree(words, rhythms, syllables, poses, phs_type)
    return LabGenerator(phone, rhythms, times)
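A minimal driver for txt2label above, assuming txt2pinyin, tree, LabGenerator and _adjust from the same module are importable; the prosody-marked input is taken from the docstring and no sfs file is supplied, so phone times default to 0.

# Sketch of calling txt2label(); assumes the helpers used above (txt2pinyin,
# tree, LabGenerator, _adjust) are defined in the same module.
if __name__ == '__main__':
    demo_txt = '向#1香港#2特别行政区#1同胞#3澳门台湾#1同胞'  # prosody-marked example from the docstring
    for lab_line in txt2label(demo_txt):
        print(lab_line)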
Example #40
# main
if __name__ == '__main__':
    # 1. input file
    data_xls = pd.read_excel('词云/法国新闻.xlsx')

    # 2. build the rows one by one, since reading the sheet directly proved awkward
    test_data = []
    for i in data_xls.index.values:  # iterate over the row index
        # use i to fetch the specified columns of the row and convert them to a dict with to_dict
        row_data = data_xls.loc[i, ['链接', '新闻', '日期', '来源', '内容']].to_dict()
        test_data.append(row_data)
    print("最终获取到的数据是:{0}".format(test_data))

    # 3. extract place and organization names from the content
    for i in test_data:
        words = pseg.cut(i["内容"])
        i["地点"] = ""
        for word, flag in words:
            if (flag == 'ns' or flag == "nt"):
                print('%s, %s' % (word, flag))
                i["地点"] += word + "\n"

    # 4. save the extracted places
    # create the workbook
    file_name = "涉侨资讯_慈善公益.xlsx"
    workbook = xlsxwriter.Workbook(file_name)
    # create the worksheet
    worksheet = workbook.add_worksheet('慈善公益')
    # write the header cells
    worksheet.write(0, 0, '链接')
    worksheet.write(0, 1, '新闻')
def _firstWordSegmentationWithPOS(cleaned_raw_data_dict: dict,
                                  tools: str = 'jieba'):
    assert tools in ('pkuseg', 'jieba')
    print("Chinese word segmenting and Pre-part-of-speech tagging using {}...".
          format(tools))

    word_seg_list_dict = defaultdict(list)
    word_seg_dict = {}

    pre_pos_list_dict = defaultdict(list)
    pre_pos_dict = {}

    for seq_num, string in cleaned_raw_data_dict.items():

        if tools == 'pkuseg':
            words = pseg.cut(string)
        elif tools == 'jieba':
            words = jseg.cut(string)

        spaceDetector = 0
        for word, flag in words:

            # For POS tagging "b"
            if len(word) >= 2 and word[
                    -1] == '状' and word[-2:] != '症状':  # end with 状 but not 症状
                flag = 'b'
            if len(word) >= 2 and word[-1] == '性' and (
                    word[-2:] != '毒性'
                    and len(word) == 2):  # end with 性, and not 毒性
                flag = 'b'

            word_with_tag = word + '/' + flag

            if word == '\n':  # jieba will retain last \n as word
                continue

            if flag == 'm' and re.match(
                    r"\w+\.$",
                    word):  # Split "3." ("3./m") to "3 ." and "3/m ./w"
                word_seg_list_dict[seq_num].append(word[:-1])
                pre_pos_list_dict[seq_num].append(word[:-1] + '/m')
                word = '.'
                word_with_tag = './w'

            if flag == 'nr':  # people name
                # print(word, flag) # useful to find mis-classified name
                if len(word) >= 2 and word[0:2] in lastName:  # e.g. 司馬
                    word_seg_list_dict[seq_num].append(word[0:2])
                    pre_pos_list_dict[seq_num].append(word[0:2] + '/nr')
                    word = word[2:]
                    word_with_tag = word_with_tag[2:]
                    if len(word) == 2:  # only lastname
                        continue
                elif word[0] in lastName:
                    word_seg_list_dict[seq_num].append(word[0])
                    pre_pos_list_dict[seq_num].append(word[0] + '/nr')
                    if len(word) == 1:  # only lastname
                        continue
                    word = word[1:]
                    word_with_tag = word_with_tag[1:]

            word_seg_list_dict[seq_num].append(word)
            pre_pos_list_dict[seq_num].append(word_with_tag)

            # only work with jieba (pkuseg will change $$_ to $$&...)
            if word == '$' and spaceDetector == 0:
                spaceDetector += 1
            elif word == '$' and spaceDetector == 1:
                spaceDetector += 1
            elif word == '_' and spaceDetector == 2:
                spaceDetector = 0
                for _ in range(3):
                    word_seg_list_dict[seq_num].pop()
                    pre_pos_list_dict[seq_num].pop()
                word_seg_list_dict[seq_num].append('$$_')
                pre_pos_list_dict[seq_num].append('$$_')
                spaceDetector = 0
            else:
                spaceDetector = 0

    for seq_num in word_seg_list_dict.keys():
        word_seg_dict[seq_num] = " ".join(word_seg_list_dict[seq_num])
        pre_pos_dict[seq_num] = " ".join(pre_pos_list_dict[seq_num])

    return word_seg_dict, word_seg_list_dict, pre_pos_dict, pre_pos_list_dict
Example #42
 def cut_sentence(self, sent):
     words = []
     _words = pseg.cut(sent)
     for _word in _words:
         words.append(_word.word)
     return words
import jieba.posseg as seg
import codecs

cnt0 = {}
with codecs.open("./ind_keyword.ind", "r", "utf-8") as f:
    str0 = f.read()
    lis0 = str0.split("#")
    # print(lis0)
    for i in lis0:
        ls1 = seg.cut(i)
        for w in ls1:
            if w.flag in cnt0.keys():
                cnt0[w.flag] += 1
            else:
                cnt0[w.flag] = 1

cnt1 = {}
with codecs.open("./tensorflow/data/embedding/sgns.wiki.bigram-char", "r",
                 "utf-8") as f:
    str0 = f.readline()
    n = int(str0.split(' ')[0])
    cnt00 = 0
    for i in range(0, n):
        str0 = f.readline()
        lis = []
        lis0 = str0.split(' ')
        if len(lis0) < 301:
            continue
        if not ('\u4e00' <= lis0[0] <= '\u9fff'):
            # print("jumped")
            continue
Example #44
 def get_word_objects(sentence):
     # convert natural-language text into Word objects
     return [Word(word.encode('utf-8'), tag) for word, tag in pseg.cut(sentence)]
Example #45
import jieba
import jieba.posseg as pseg
import time

# sent = '明明是數字鎖,卻需要畫圖形,滑動解鎖竟然是將手機翻轉90度,圖形鎖則變成射擊到螢幕右上角。'
# wordlist = jieba.cut(sent, cut_all=True)
# print(" | ".join(wordlist))

# wordlist = jieba.cut(sent)
# print(" | ".join(wordlist))

# wordlist = jieba.cut_for_search(sent)
# print(" | ".join(wordlist))

start = time.time()

# words = pseg.cut("這隻程式可以幫我們把網站資料爬下來")  # jieba default mode

jieba.enable_paddle()  # enable paddle mode; supported since jieba 0.40, not in earlier releases

words = pseg.cut("這隻程式可以幫我們把網站資料爬下來", use_paddle=True)  # paddle mode

for word, flag in words:
    print('%s %s' % (word, flag))

# print(words)
# print(flags)

end = time.time()
print(end - start)
Example #46
def search_and_destory(file_path, library_path):
    words = pseg.cut('江桥收费站至中环路严重堵塞,大量外地车辆涌入上海')

    for word in words:
        print word.word, word.flag
#-*-coding:utf-8 -*-
## python2.7 bin/jieba_cli.py 我爱北京 我爱Clojure
import jieba.posseg as pseg
import sys
import json
str_arrays = sys.argv
str_arrays.pop(0)
print str_arrays
print json.dumps([[(word, flag) for word, flag in pseg.cut(words)]
                  for words in str_arrays],
                 ensure_ascii=False)
Example #48
# -*- coding: utf-8 -*-
import os, sys
import jieba, codecs, math
import jieba.posseg as pseg

names = {}  # dictionary of names
relationships = {}  # dictionary of relationships
lineNames = []  # person names in each paragraph

# count names
jieba.load_userdict("dict.txt")  # load the user dictionary
with codecs.open("busan.txt", "r", "utf8") as f:
    for line in f.readlines():
        poss = pseg.cut(line)  # segment and return the POS of each word
        lineNames.append([])  # add a name list for the newly read paragraph
        for w in poss:
            if w.flag != "nr" or len(w.word) < 2:
                continue  # not a person name if shorter than 2 chars or POS is not nr
            lineNames[-1].append(w.word)  # add the person to the current paragraph
            if names.get(w.word) is None:
                names[w.word] = 0
                relationships[w.word] = {}
            names[w.word] += 1  # increment this person's count

for name, times in names.items():
    print(name, times)
Example #49
from jieba import posseg
import sys
import multiprocessing

input_file = sys.argv[1]
output_file = sys.argv[2]

stopword = set([x.strip() for x in open('stopword.txt', encoding='utf8').readlines()])

corpus = [x.strip() for x in open(input_file, encoding='utf8').readlines()]
corpus = [[y for y, z in posseg.cut(x) if z not in ['x', 'm', 'eng'] and y not in stopword] for x in corpus]

open(output_file, 'w', encoding='utf8').writelines([' '.join(x) + '\n' for x in corpus])
Example #50
def test_thulac(text):
    words = pseg.cut(text)
    print("jieba分词:")
    for word, flag in words:
        print('%s_%s' % (word, flag))
    return
 def get_features(self, string):
     word_list = [
         word.word for word in pseg.cut(string)
         if word.flag[0] not in ['u', 'x', 'w', 'o', 'p', 'c', 'm', 'q']
     ]
     return word_list
Example #52
# coding: utf-8
import jieba.posseg as pseg
 
words = pseg.cut("15亿光年神秘太空信号王源粉丝")
for word, flag in words:
    print("%s %s" % (word, flag))
def entity_analysis(entity):
    db = neo_con
    words = entity.split(' ')
    if len(words) == 1:
        if is_loc(words[0]):
            return db.match_location4event_patient(entity)
        else:
            wordp = posseg.cut(words[0])
            for w in wordp:
                if w.flag in ['v', 'vd', 'vn', 'vg']:
                    return db.match_topic4event(entity)
                elif w.flag in ['nr']:
                    return db.match_patient_name(entity)
    elif len(words) == 2:
        isloc_dict = {}
        flag = 0
        for word in words:
            isloc_dict[word] = is_loc(word)
            if isloc_dict[word]:
                flag = 1
        if isloc_dict[words[0]]:
            wordp = posseg.cut(words[1])
            for w in wordp:
                if w.flag in ['v', 'vd', 'vn', 'vg']:
                    return db.match_location_topic4event(words[0], words[1])
                elif w.flag in ['m']:
                    return db.match_location_time4event_patient(
                        words[0], words[1])
                else:
                    gender = words[1].replace('性', '').replace('生', '')
                    return db.match_location_gender4patient(words[0], gender)
        else:
            wordp = posseg.cut(words[0])
            for w in wordp:
                if w.flag in ['v', 'vd', 'vn', 'vg']:
                    return db.match_location_topic4event(words[1], words[0])
                elif w.flag in ['m']:
                    return db.match_location_time4event_patient(
                        words[1], words[0])
                else:
                    gender = words[0].replace('性', '').replace('生', '')
                    return db.match_location_gender4patient(words[1], gender)

        if not flag:
            wordp = posseg.cut(words[0])
            for w in wordp:
                if w.flag in ['m']:
                    return db.match_name_time4location_event(
                        words[1], words[0])
                else:
                    return db.match_name_time4location_event(
                        words[0], words[1])
    elif len(words) == 3:
        loc = ''
        time = ''
        topic = ''
        for word in words:
            if is_loc(word):
                loc = word
                words.remove(word)
                break
        wordp = posseg.cut(words[0])
        for w in wordp:
            if w.flag in ['m']:
                return db.match_location_time_topic4patient(
                    loc, words[0], words[1])
            else:
                return db.match_location_time_topic4patient(
                    loc, words[1], words[0])

    else:
        answer = db.match_location4event_patient(words[0])
        if len(answer) == 0:
            answer = db.match_topic4event(words[0])
        return answer
Example #54
import re
import jieba.posseg as psg
import numpy as np
import pandas as pd  # needed for pd.read_csv below

# Deduplicate: drop fully duplicated reviews
reviews = pd.read_csv("../tmp/reviews.csv")
reviews = reviews[['content', 'content_type']].drop_duplicates()
content = reviews['content']

# Remove English letters, digits, etc.
# The reviews are mainly about Midea electric water heaters on JD.com, so those brand/product words are removed as well
strinfo = re.compile('[0-9a-zA-Z]|京东|美的|电热水器|热水器')  # the original pattern ended with a stray '|', which matched the empty string
content = content.apply(lambda x: strinfo.sub('', x))

# Word segmentation
worker = lambda s: [(x.word, x.flag) for x in psg.cut(s)]  # simple custom segmentation function
seg_word = content.apply(worker)

# Convert the words to a data frame: one column is the word, one is the ID of the sentence it belongs to, and the last is its position in that sentence
n_word = seg_word.apply(lambda x: len(x))  # number of words in each review

n_content = [[x + 1] * y for x, y in zip(list(seg_word.index), list(n_word))]
index_content = sum(n_content, [])  # flatten the nested list to get the review id of each word

seg_word = sum(seg_word, [])
word = [x[0] for x in seg_word]  # the words

nature = [x[1] for x in seg_word]  # the POS tags

content_type = [[x] * y
                for x, y in zip(list(reviews['content_type']), list(n_word))]
from collections import Counter
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import jieba
import jieba.posseg as pseg  # pseg is used below but was missing from the imports

jieba.load_userdict('symptom.txt')
total = []
wordtype = []
with open('segwords.txt','w') as f:
    with open('training.txt','r') as k:
        for i in k:
            #print i
            i = i.replace('"','')
            i = i.replace("'", '')
            seg = pseg.cut(i.strip())

            for word, flag in seg:
                total.append(word)
                wordtype.append(flag)
                #f.write('\n'.join(seg))
                f.write(word + flag + "\n")

c = Counter(total)
with open('dicmap.txt','w') as f:
    for i in c.most_common():
        f.write('"'+ i[0].decode('utf-8').encode('unicode_escape')+'": ' + str(i[1]) + ', ')



Example #56
 def term_segment(self, phrase):
     words = pos_seg.cut(phrase)
     for w, t in words:
         if "n" in t:
             yield w
Example #57
# -*- coding: utf-8 -*-

import jieba.posseg as pseg

wordList = []
file = open(r'./true_short.txt.bak', 'r').read()
wfile = open('./true_short.txt', 'w')
words = list(pseg.cut(file))
for w in words:
    wordList.append(w.word)

# remove stop words
stopwords = []
for line in open("stopword.txt"):
    line = line.strip('\n')
    line = line.strip(' ')
    stopwords.append(line)
print stopwords
# filter with a comprehension: removing items while iterating the same list skips elements
wordList = [w for w in wordList if w not in stopwords]
for w in wordList:
    wfile.write(w.encode('utf-8'))
Example #58
import jieba
jieba.load_userdict("userdict.txt")
import jieba.posseg as pseg

jieba.add_word('石墨烯')
jieba.add_word('凱特琳')
jieba.del_word('自定义词')

test_sent = ("李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿\n"
             "例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类\n"
             "「台中」正確應該不會被切開。mac上可分出「石墨烯」;此時又可以分出來凱特琳了。")
words = jieba.cut(test_sent)
print('/'.join(words))

print("=" * 40)

result = pseg.cut(test_sent)

for w in result:
    print(w.word, "/", w.flag, ", ", end=' ')

print("\n" + "=" * 40)

terms = jieba.cut('easy_install is great')
print('/'.join(terms))
terms = jieba.cut('python 的正则表达式是好用的')
print('/'.join(terms))

print("=" * 40)
# test frequency tune
testlist = [
    ('今天天气不错', ('今天', '天气')),
def evaluate_line(text):
    segwords = " ".join(
        [w.word for w in pseg.cut(text) if w.word not in stopwords])
    prediction = classifier.predict([segwords])
    return prediction[0][0]
Example #60
# write the root element
write_root =ElementTree.Element("documents")

for person in persons:
    refers = set()
    xml_doc = ElementTree.SubElement(write_root, "doc")
    name = person.find("name").text
    name = re.sub(r'\(.*\)', '', str(name))  # drop the parenthesized part of the name (use （.*） if the source uses full-width parentheses)
    dis = person.find("dis").text
    refers.add(name)
    # put the name, with the parenthesized part removed, into the doc's name attribute
    xml_doc.set("name", name)
    # remove work titles (《...》)
    text = re.sub(r'《.*》', '', str(dis))
    words = pseg.cut(text)
    try:
        for w in words:
            if len(w.word) == 1:
                continue
            if str(w.flag) == "nr":
                refers.add(w.word)
    except:
        pass
    # remove the person's own name
    refers.remove(name)
    for refer in refers:
        xml_refer = ElementTree.SubElement(xml_doc, "refer")
        xml_refer.text = refer