Example #1
def segment_pos(dir='rawdata', datetype='all', outdir='nohref_seg'):
	jieba.set_dictionary('dict/dict.txt.big')
	for tag in loadTag():
		jieba.add_word(tag)
	
	chinese_postagger = StanfordPOSTagger('tagger/chinese-distsim.tagger', 'tagger/stanford-postagger.jar', encoding='utf-8')
	
	for file in parseDateType(dir,datetype):
		dirname, filename = os.path.split(file)
		head = filename.split('.')[0]
		outfile = outdir + '/' + head + '.txt'
		if os.path.isfile(outfile):
			print 'pass %s...' %head
			continue

		print 'segment %s ...' %head
		f = open(outfile, 'w')
		dataList = readJson(file)
		p = re.compile("http[s]?://.*\n")
		for data in dataList:
			content = data['content']
			content = re.sub(p, '', content)
			segList = jieba.cut(content)
			wordList, tagList = postagging(chinese_postagger, segList)
			for w, t in zip(wordList, tagList):
				f.write(w.encode('utf-8'))
				f.write(' ')
				f.write(t)
				f.write(' ')
			f.write('\n')
		f.close()
Example #2
def fetch_data(readfile, writefile):
    f = open('custom.dict', 'r')
    for line in f:
        jieba.add_word(line.strip())
    f.close()

    fp = open(readfile, 'r')
    ID = []
    words = []
    for line in fp:
        test = []
        content = line.split('\t')
        ID.append(content[0])
        final = jieba.cut(content[1])
        a = ' '.join(final)
        test.append(a.strip())
        words.append(test)


    fpp = open(writefile, 'wb')
    i = 0
    for item in words:
        fpp.write(ID[i] + '\t')
        for itemo in item:
            fpp.write(itemo)
        fpp.write('\n')
        i = i + 1
    fp.close()
    fpp.close()

    return
Example #3
def init_jieba():
    jieba.dt.check_initialized()
    with codecs.open('new_words', 'r', 'utf-8') as f:
        for line in f:
            w = line.strip()
            if w:
                jieba.add_word(w)
Example #4
def generate_top():
    from collections import defaultdict
    import simplejson as json
    import operator
    from_product = defaultdict(lambda: 0)
    results = defaultdict(lambda: 0)
    for product in cols['product'].find().sort('_id', -1):
        for k in product.keys():
            from_product[k] += 1
    product_keys = dict(from_product)
    for w in list(product_keys.keys()):
        jieba.add_word(w, tag='nz')
    progress = 0
    for comment in cols['mobile_comment'].find(projection={'content': 1}):
        c = comment['content']
        words = jieba.analyse.extract_tags(c, topK=20, withWeight=False, allowPOS=('ns', 'n', 'nz'))
        for w in words:
            results[w] += 1
        progress += 1
        if progress % 100 == 0:
            print('Current Progress: ', progress)
    sorted_x = reversed(sorted(dict(results).items(), key=operator.itemgetter(1)))
    json.dump(
        list(sorted_x),
        open('sorted_mobile.json', mode='w', encoding='utf-8'),
        ensure_ascii=False,
        indent=2
    )
Example #5
def cut_news(news, names):
    # add the names to the user dictionary
    for name in names:
        jieba.add_word(name, freq = 1000, tag = 'nr')
    # initial segmentation with POS tags
    words = [[word.encode('utf-8'), flag.encode('utf-8')] for word, flag in pesg.cut(news)]
    # record the indices of names and relationship keywords in words
    k = 0
    names_loc = []
    rels_loc = []
    for word, flag in words:
        cur = relationship_keywords.find_one({'keywords': word})
        if cur:
            rel = cur['name'].encode('utf-8')
            if rel in rel_one_way:
                words[k][1] = 'relationwords_one'
            elif rel in rel_two_way_single:
                words[k][1] = 'relationwords_two_single'
            elif rel in rel_two_way_double:
                words[k][1] = 'relationwords_two_double'
            rels_loc.append(k)
        elif word in names:
            words[k][1] = 'namewords'
            names_loc.append(k)
        k += 1

    if len(names_loc) != len(names) or len(rels_loc) < 1:
        return False

    return [words,  rels_loc, names_loc]
Example #6
def main():
    starttime = datetime.datetime.now()

    path = os.path.abspath('.')
    path = path.split('/')
    basepath = "/".join(path[:-2])

    dictpath = os.path.join(basepath,'data/myDict.txt')
    jieba.load_userdict(dictpath)

    target_rel = u'夫妻'

    train_user_path =  os.path.join(basepath,'data/train_user.txt')
    with open(train_user_path) as f:
        userdata = f.readlines()

    userset = []
    for line in userdata:
        userset.append(line[:-1])

    for user in userset[0:1]:
        tupu_path = os.path.join(basepath,'data/train/entity_tupu/entity_tupu.%s' % user)
        with open(tupu_path) as f:
            tupu_data = f.readlines()

        entity_pair = []
        for line in tupu_data:
            data = line[:-1].split('\t')
            rel = data[0].decode('utf-8')
            entity1 = data[1].decode('utf-8')
            entity2 = data[2].decode('utf-8')
            if rel == target_rel:
                entity_pair.append([entity1,entity2])


        datapath = os.path.join(basepath,'data/train/entity_sentence/entity_sentence.%s' % user)
        with open(datapath) as f:
            dataset = f.readlines()    

        three_split_set = []
        for line in dataset:
            try:
                data = line[:-1].split('\t')
                entity1 = data[1].decode('utf-8')
                entity2 = data[2].decode('utf-8')
                sentence = data[0].decode('utf-8')
                if [entity1,entity2] in entity_pair or [entity2,entity1] in entity_pair:
                    print sentence,entity1,entity2
                    jieba.add_word(entity1,1000)
                    jieba.add_word(entity2,1000)
                    three_split = cut_sentence(sentence,entity1,entity2)
                    if three_split == None:
                        continue
                    three_split_set.append(three_split)

                    # if rel in sentence:
                        # print sentence
            except Exception, e:
                print e
Example #7
def addToDictionary():
	"""Grabs list of checked words and adds to the operating dictionary
	Note: If word already exists in dictionary, increments frequency"""
	wordList = request.form.getlist('segCheckbox')
	for word in wordList:
		jieba.add_word(word, 1)
	flash("You successfully updated the dictionary!")
	return redirect(url_for('model_show_entries'))
Example #8
def __init():
    user_dict_path = os.path.join(root_filepath, "f_seg/user_dict.txt")
    jieba.load_userdict(user_dict_path)
    jieba.add_word("快递", 10000)
    jieba.suggest_freq(("面", "太厚"))
    jieba.suggest_freq(("价格", "便宜"))
    jieba.suggest_freq(("服务", "周到"))
    jieba.suggest_freq(("速度", "快"))
def tokenize(sentence,addwords=None):
    if(addwords!=None):
        for word in addwords:
            jieba.add_word(word)
    tokens = []
    for term in jieba.tokenize(sentence):
        tokens.append(term[0])
    return tokens
Example #10
 def add_word(self, key, code):
     """动态向结巴词典添加自定义的词"""
     if code not in MYSELF_ADD_DICT_TYPE:
         logger.info("动态添加字词的类型出错")
         return
     user_weight, user_type = MYSELF_ADD_DICT_TYPE.get(code)
     # add_word(word, freq=None, tag=None) and del_word(word) can modify the dictionary dynamically at runtime.
     jieba.add_word(key, user_weight, user_type)
     self.ADD_MAP_DATA.setdefault(key, {})
     self.ADD_MAP_DATA[key][user_type] = user_weight
def load_user_library(file):
    '''
        Load user dictionary to increase segmentation accuracy
    '''
    
    if isinstance(file, str):
        jieba.load_userdict(file)
    elif isinstance(file, list):
        for value in file:
            jieba.add_word(value.lower())
    else:
        pass
def cn_ci(dir_path):
    for rdf in ci_list:
        jieba.add_word(rdf[0])
    all_text = u""
    for file_name in os.listdir(dir_path):
        if file_name.find(".txt") != -1:
            file_path = "/".join([dir_path, file_name])
            with open(file_path, "r") as f:
                all_text += f.read().decode("utf-8")

    terms = jieba.cut(all_text)

    return [ci for ci in ','.join(terms).split(',') if ci not in [u'', u" "]]
def addDictToJieba():
    ##### roadList
    content=open('../data_crawl/finalRoads.txt','r').read().strip('\n')
    contentList=content.split('\n');print len(contentList)
    #############load district dict
    districtNameList=grab('/home/yr/intellicredit/data/'+'districtNameList0503')
    test_sent = [
    "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿,上海市浦东区\n"
    ]
    ######## print cut before add dictionary
    #print test_sent[0].decode('utf-8')
    words = jieba.cut(test_sent[0].decode('utf-8'))
    #print('/'.join(words))

    #### add the word dictionary to jieba
    for w in districtNameList[:]+contentList:
        # print w
        jieba.add_word(w)
    #### add district names that are not in the dictionary to jieba
    jieba.add_word('浦东区')
    jieba.add_word('浦东新区')
    jieba.del_word('上海市')
    jieba.add_word('兰城路')



    words = jieba.cut(test_sent[0].decode('utf-8'))
Example #14
def jieba_processing_txt(text):
    for word in userdict_list:
        jieba.add_word(word)

    mywordlist = []
    seg_list = jieba.cut(text, cut_all=False)
    liststr = "/ ".join(seg_list)

    with open(stopwords_path, encoding='utf-8') as f_stop:
        f_stop_text = f_stop.read()
        f_stop_seg_list = f_stop_text.splitlines()

    for myword in liststr.split('/'):
        if not (myword.strip() in f_stop_seg_list) and len(myword.strip()) > 1:
            mywordlist.append(myword)
    return ' '.join(mywordlist)
def get_key_words():
    client = MongoClient('localhost', 27017)
    db= client['baidu']
    relationship_keywords = db.relationship_keywords
    file = open('/Users/wutong/workspace/da/baidu/data/condensedata/condensetraindata.txt')
    for line in file.readlines():
        items = line.split()
        names = [items[1], items[2]]
        news = items[3]
        rel = items[0]
        for name in names:
            jieba.add_word(name, freq = None, tag = 'nr')
        words = pesg.cut(news)
        for word, flag in words:
            if flag == 'n':
                relationship_keywords.update({ 'name': rel }, {'$addToSet': { 'keywords': word}})
Example #16
    def Get_fenci(self):

        # jieba.add_word('石墨烯')#动态添加自定义单词
        jieba.add_word('凱特琳')
        jieba.del_word('自定义词')
        jieba.add_word("易风化")
        filtered_tokens = []
        test_sent = ""
        for i in range(1,2):
            Data_path = path + "he"+".txt"
            test_sent ="".join(open(Data_path, 'rb').read())
        print (test_sent)
        words = jieba.cut(test_sent)
        filtered_tokens.append([each for each in jieba.cut(test_sent)])
        print ('-'*40)
        print (json.dumps(filtered_tokens))
        print("="*40)
Example #17
def handle(data):
    oper = json.loads(data)
    if oper[0] == 'cut':
        return json.dumps(tuple(jieba.cut(*oper[1], **oper[2]))).encode('utf-8')
    elif oper[0] == 'cut_for_search':
        return json.dumps(tuple(jieba.cut_for_search(*oper[1], **oper[2]))).encode('utf-8')
    elif oper[0] == 'tokenize':
        return json.dumps(tuple(jieba.tokenize(*oper[1], **oper[2]))).encode('utf-8')
    elif oper[0] == 'add_word':
        jieba.add_word(*oper[1], **oper[2])
    elif oper[0] == 'load_userdict':
        jieba.load_userdict(*oper[1])
    elif oper[0] == 'set_dictionary':
        jieba.set_dictionary(*oper[1])
    elif oper[0] == 'stopserver':
        return b'stop'
    elif oper[0] == 'ping':
        return b'pong'
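A hedged sketch of how a client request for this dispatcher could be built: the payload is a JSON array of [command, positional args, keyword args], UTF-8 encoded. The sample sentence and the cut_all flag are purely illustrative.

import json

request = json.dumps(['cut', ['他来到了网易杭研大厦'], {'cut_all': False}]).encode('utf-8')
response = handle(request)            # for 'cut', the reply is the word tuple as UTF-8 encoded JSON
print(json.loads(response))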
Example #18
    def rich_train_data_by_editor(self, files=[]):
        """ 通过人工编辑规则增强Train Data """
        # 20140910_1427 没效果,反而有一两个百分点下降。
        import jieba
        dict_dir = None

        for file1 in files:
            parsed = ReadManualKps.process(dict_dir + file1)
            for node_name1, node_features in parsed.iteritems():
                features2 = [node_name1] + node_features
                # 1. Add features to jieba
                for name2 in features2:
                    jieba.add_word(name2, 1000000)  # 1000000 copied from lianhua
                # 2. Add features to tags_tree.name_to_nodes
                nodes_set2 = self.name_to_nodes.get(node_name1, set([]))
                for node3 in nodes_set2:
                    node3_feature_max_value = max(node3.features_weight.values() or [0.25])
                    for feature4 in features2:
                        node3.features_weight[feature4] = node3_feature_max_value
                        self.feature_to_nodes[feature4].add(node3)
def condense_key_words(rel):
    client = MongoClient('localhost', 27017)
    db= client['baidu']
    relationship_keywords = db.relationship_keywords
    file = open('/Users/wutong/workspace/da/baidu/data/condensedata/condensetraindata.txt')
    file1 = ''
    name_list = []
    for line in file.readlines():
        items = line.split()
        names = [items[1], items[2]]
        news = items[3]
        rela = items[0]
        if rela == rel:
            for name in names:
                name_list.append(name)
                jieba.add_word(name, freq = None, tag = 'nr')
            file1 = file1 + news + '\n'
    tags = jieba.analyse.extract_tags(file1, topK=20, withWeight=False, allowPOS=())
    for tag in tags:
        if tag.encode('utf-8') not in name_list:
            relationship_keywords.update({ 'name': rel }, {'$addToSet': { 'keywords': tag}})
Example #20
def handlemsg(data):
    oper = loadsjson(data)
    if oper[0] == 'c2m':
        return dumpsjson(mc.c2m.translate(*oper[1:]))
    elif oper[0] == 'm2c':
        return dumpsjson(mc.m2c.translate(*oper[1:]))
    elif oper[0] == 'c2m.raw':
        return dumpsjson(mc.c2m.rawtranslate(oper[1]))
    elif oper[0] == 'm2c.raw':
        return dumpsjson(mc.m2c.rawtranslate(oper[1]))
    elif oper[0] == 'modelname':
        return dumpsjson(mc.name())
    elif oper[0] == 'cut':
        return dumpsjson(tuple(jieba.cut(*oper[1], **oper[2])))
    elif oper[0] == 'cut_for_search':
        return dumpsjson(tuple(jieba.cut_for_search(*oper[1], **oper[2])))
    elif oper[0] == 'tokenize':
        return dumpsjson(tuple(jieba.tokenize(*oper[1], **oper[2])))
    elif oper[0] == 'jiebazhc.cut':
        return dumpsjson(tuple(jiebazhc.cut(*oper[1], **oper[2])))
    elif oper[0] == 'jiebazhc.cut_for_search':
        return dumpsjson(
            tuple(jiebazhc.cut_for_search(*oper[1], **oper[2])))
    elif oper[0] == 'jiebazhc.tokenize':
        return dumpsjson(tuple(jiebazhc.tokenize(*oper[1], **oper[2])))
    elif oper[0] == 'add_word':
        jieba.add_word(*oper[1], **oper[2])
    elif oper[0] == 'load_userdict':
        jieba.load_userdict(*oper[1])
    elif oper[0] == 'set_dictionary':
        jieba.set_dictionary(*oper[1])
    elif oper[0] == 'stopserver':
        return b'stop'
    elif oper[0] == 'ping':
        return b'pong'
    else:
        return dumpsjson('Command not found')
Example #21
    def getCorpusFromFile(self, filename):
        '''
        Read the input text and build the corpus of segmentation results.
        '''
        #处理掉特殊符号
        punctuation = re.compile("[a-zA-Z0-9\s+\.\!\/_,$%^*(+\"\']+|[-+——!,。?、~@#¥%……&*()]+".decode('utf-8'))

        #对关键词提高权重
        with open('keywords.txt', 'r') as parse_file: 
            for eachline in parse_file:
                jieba.add_word(eachline.strip())

        #处理停用词
        jieba.analyse.set_stop_words('stop_words.txt')


        #读入训练集数据
        with open(filename, "rb") as fp:
            trainData = [line.strip().split("\t") for line in fp]

 
        #分词处理
        self.corpus = [[wordSplit(content[0], punctuation), int(content[1])]
              for content in trainData]
Example #22
 def add_usr_dict(path, sep=','):
     with open(path, 'r') as f:
         for l in f.xreadlines():
             items = l.split(sep)
             if len(items) == 3:
                 jieba.add_word(items[0].rstrip(), int(items[1].rstrip()), items[2].rstrip())
             elif len(items) == 2:
                 jieba.add_word(items[0].rstrip(), int(items[1].rstrip()))
             elif len(items) == 1:
                 jieba.add_word(items[0].rstrip())
             else:
                 raise ValueError('unexpected number of fields in word info \'%s\'' % (l.strip()))
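For reference, a hypothetical dictionary file that add_usr_dict would accept with sep=',', matching the three branches above (word with frequency and POS tag, word with frequency, or word alone):

石墨烯,2000,n
凱特琳,1500
自定义词

A call such as add_usr_dict('user_words.csv') (hypothetical file name) would then register all three entries.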
Example #23
def add_word(list):
    for items in list:
        jieba.add_word(items)
Example #24
            end_index = end_index + 1
    return sub_contents


def build_new_sub_content(sub_content):
    cut_content_word_list = pseg.cut(sub_content, use_paddle=True)
    new_content_list = []
    for word, flag in cut_content_word_list:
        new_content_list.append(word)
        new_content_list.append(flag)
    return ''.join(new_content_list)


# ORG用于删除

jieba.add_word('热情', tag='a')
jieba.add_word('位置', tag='n')
jieba.add_word('预定', tag='n')
jieba.add_word('卫生', tag='n')
jieba.add_word('不值', tag='v')
jieba.add_word('齐全', tag='a')
jieba.add_word('也', tag='ORG')
jieba.add_word('服务态度', tag='n')
jieba.add_word('隔音', tag='n')
jieba.add_word('房间隔音', tag='n')
jieba.add_word('安静', tag='a')
jieba.add_word('无语')
jieba.add_word('有味道')
jieba.add_word('极')
jieba.add_word('最')
jieba.add_word('太')
def add_word(text, number):
    strs1 = getNewWordsByNLPIR(text, number)
    for i in strs1:
        jieba.add_word(i)
    for i in my_words_list:
        jieba.add_word(i)
Example #26
            for name, data in self.data[index].items():
                for alias in set(data['alias']):
                    if re.search(ur"[省市县]$", alias):
                        jieba.add_word(alias, 10000000)
                    elif re.search(ur"[区]$", alias):
                        jieba.add_word(alias, 1000000)
                    else:
                        jieba.add_word(alias, 100000)

                        for suffix in u"路镇乡圩河区村":
                            jieba.add_word(u"{}{}".format(
                                alias, suffix), 1000000)

        names = file2iter(file2abspath('region_dict.txt', __file__))
        for name in names:
            jieba.add_word(name.strip(), 1)

        # jieba.del_word(u"广州药业")

    def normalize_region_name(self, name, xtype):
        if not hasattr(self, "normalizeRegion_mapped"):
            setattr(self, "normalizeRegion_mapped", collections.Counter())
        mapped = getattr(self, "normalizeRegion_mapped")

        if len(name) > 2:
            name = re.sub(u"[省市]+$", "", name)

        if name in [u"市辖区"]:
            return name

        if name in ["", u"省市"]:
Example #27
    def __init__(self, url):
        # 爬取团队介绍的页面
        res = requests.get(url)
        # 采集介绍信息
        selector = etree.HTML(res.text)
        content = selector.xpath(
            "//div[@class='piece-body p-lg clearfix']/p/text()")
        content = re.sub('[\\\\x|\s|,|)|(|,|:|:|!|!|、|。]', '',
                         ''.join(content)).strip()

        # jieba 分词
        jieba.add_word('学号')
        jieba.add_word('Python')
        jieba.add_word('MySQL')
        jieba.add_word('JavaScript')
        jieba.add_word('HTML')
        jieba.add_word('Java')
        self.__words = list(jieba.cut(content))
        self.__record_set = set(self.__words)
        self.__record_dict = {}
        for word in self.__record_set:
            self.__record_dict[word] = self.__words.count(word)

        print(self.__record_dict)
Example #28
seg_list = jieba.cut("网易智造N520除螨吸尘器", cut_all=True)
print("Full Mode: " + "/ ".join(seg_list))  # 全模式

seg_list = jieba.cut("宇宙沙盘控温被 薄被", cut_all=False)
print("Default Mode: " + "/ ".join(seg_list))  # 精确模式

seg_list = jieba.cut("他来到了网易杭研大厦")  # 默认是精确模式
print(", ".join(seg_list))

seg_list = jieba.cut_for_search("心想智能胶囊咖啡机")  # 搜索引擎模式
for seg in seg_list:
    print(seg)

# 自定义关键词库
jieba.load_userdict('./user_dict.txt')

jieba.add_word('石墨烯')
jieba.add_word('凱特琳')
jieba.del_word('自定义词')

test_sent = ("李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿\n"
             "例如我输入一个带“韩玉赏鉴”的标题,宇宙沙盘控温被 薄被在自定宇宙沙盘控温被 薄被此词为N类\n"
             "「台中」正確應該网易智造N520除螨吸尘器不會被切開。mac上可分出「石墨烯」;此時又可以分出來凱特琳了。")
words = jieba.lcut('心想智能胶囊咖啡机', cut_all=False)
for word in words:
    if word in jieba.user_word_tag_tab:
        print(word)
print(jieba.user_word_tag_tab.__contains__('薄被'))

# Segment every product name, count and rank the words by frequency, and use the high-frequency words as category keywords
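The step described in the comment above is not shown; a minimal sketch, assuming a hypothetical list product_names of product-title strings:

from collections import Counter
import jieba

def top_keywords(product_names, topn=50):
    counter = Counter()
    for name in product_names:
        for w in jieba.lcut(name):
            w = w.strip()
            if len(w) > 1:                    # drop whitespace and single characters
                counter[w] += 1
    return counter.most_common(topn)          # high-frequency words as candidate category keywords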
Example #29
import numpy as np
import matplotlib.pyplot as plt
#词云生成工具
from wordcloud import WordCloud, ImageColorGenerator
#需要对中文进行处理
import matplotlib.font_manager as fm

#背景图
bg = np.array(Image.open("alice.png"))

#获取当前的项目文件加的路径
d = path.dirname(__file__)
#读取停用词表
stopwords_path = 'wangfeng.txt'
#添加需要自定以的分词
jieba.add_word("丁")

#读取要分析的文本
text_path = "wangfeng.txt"
#读取要分析的文本,读取格式
text = open(path.join(d, text_path), encoding="utf8").read()
#删除指定内容
text = re.sub(u'.*?::', '', text)


#定义个函数式用于分词
def jiebaclearText(text):
    #定义一个空的列表,将去除的停用词的分词保存
    mywordList = []
    #进行分词
    seg_list = jieba.cut(text, cut_all=False)
Example #30
def pro(filename, path, choose):
    global xxxxxx
    f = open(path, 'r', encoding='utf-8')
    text = f.read()
    f.close()
    keyWords = re.findall(r'\((.*?)\)', text)
    for word in keyWords:
        jieba.add_word(word)
        xxxxxx.append(word)

    xx = []
    text = jieba.cut(text)
    for i in text:
        xx.append(i)
    text = ' '.join(xx)

    sss = []
    numberOfDelete = 0
    for m in re.finditer(r'\( (.*?) \)|\[ (.*?) \]|\{ (.*?) \}', text):
        #print(m.start(), m.end(), text[m.start():m.end()])
        numberOfDelete += 2
        dic = {}
        beginPos = m.start() + 2 - numberOfDelete
        endPos = m.end() - numberOfDelete - 2
        dic['beginPos'] = str(beginPos)
        dic['endPos'] = str(endPos)
        dic['text'] = m.group(0)[2:-2]
        if (text[m.start()] == '('):
            dic['type'] = 'LOC'
        elif (text[m.start()] == '['):
            dic['type'] = 'PER'
        elif (text[m.start()] == '{'):
            dic['type'] = 'ORG'
        numberOfDelete += 2
        sss.append(dic)

    number = 1
    m = []
    for obj in sss:
        x = []
        x.append('T' + str(number))
        x.append(obj['type'])
        x.append(obj['beginPos'])
        x.append(obj['endPos'])
        x.append(obj['text'])
        m.append('\t'.join(x))
        number += 1

    filename = filename.split('.')[0]

    f = open('../data/conll2003/en/' + choose + '/' + filename + '.ann',
             'w',
             encoding='utf-8')
    f.write('\n'.join(m))
    f.close()
    f1 = open('../data/conll2003/en/' + choose + '/' + filename + '.txt',
              'w',
              encoding='utf-8')
    text = text.replace('( ', '').replace(' )', '')
    text = text.replace('[ ', '').replace(' ]', '')
    text = text.replace('{ ', '').replace(' }', '')
    f1.write(text)
    f1.close()
import re
data = pd.read_excel('Excel/Laptop_data_processed.xls')

title_list = data.raw_title.values.tolist()

new_title_list = []
#对每一个title进行提取清洗, 删除所有符号
for title in title_list:
    new_title = "".join(re.findall(r'[\u4e00-\u9fa5a-zA-Z0-9]', title))
    new_title_list.append(new_title)

import jieba
#添加自定义中文词
corpus = ['吃鸡', 'ThinkPad', 'MacBookAir', 'IdeaPad']
for word in corpus:
    jieba.add_word(word)

#分词
title_list = []
for title in new_title_list:
    title_cut = jieba.lcut(title)  #list
    title_list.append(title_cut)

#创建停用词
stop_words = []
with open('Text/Stop words.txt') as f:
    for line in f:
        word = line.strip()
        stop_words.append(word)
stop_words.append('笔记本电脑')
Example #32
def init():
    # dic_path = '/Users/li/PycharmProjects/huihongcaihui/src/corpus'
    dic_path = conf.dic_path

    # 读取词典
    d_path = dic_path + "/程度副词_datatang.txt"
    s_path = dic_path + "/senti.txt"
    f_path = dic_path + "/fou.txt"
    b_path = dic_path + "/but.txt"
    e_path = dic_path + "/eng.txt"
    l_path = dic_path + "/limit.dict"
    a_path = dic_path + "/dic.txt"
    ns_path = dic_path + "/新增_stock"
    n_path = dic_path + "/新增"
    n2_path = dic_path + "/新增2"
    st_path = dic_path + "/stock_words.txt"
    st_new_path = dic_path + "/stock.csv"
    zhi_ne_path = dic_path + "/知网/zhi_neg.txt"
    zhi_po_path = dic_path + "/知网/zhi_pos.txt"
    jg_path = dic_path + "/机构"

    # add fund company entity names, e.g. ("工银瑞信基金", "华泰柏瑞基金", "东方基金")
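    # A hedged sketch of that step; the names come from the example in the comment
    # above, and the "nt" (organization) tag is an assumption:
    for fund_org in ("工银瑞信基金", "华泰柏瑞基金", "东方基金"):
        jieba.add_word(fund_org, tag="nt")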

    # 结巴新词
    word_add = set()

    for d in open(d_path):
        # temp = d.decode("utf-8").split(" ")
        temp = d.split(" ")
        word_arr = temp[1].strip("\n").rstrip(" ").split("、")
        for w in word_arr:
            deg_dict[w] = float(temp[0])
            word_add.add(temp[0])

    for s in open(s_path):
        # temp = s.decode("utf-8").split(" ")
        temp = s.split(" ")
        senti_dict[temp[0]] = float(temp[1])
        word_add.add(temp[0])

    for e in open(e_path):
        temp = e.split(" ")
        eng_dict[temp[0]] = float(temp[1])
        word_add.add(temp[0])

    for f in open(f_path):
        # f = f.decode("utf-8-sig")
        fou_dict.append(f.strip("\n"))
        word_add.add(f.strip("\n"))

    for b in open(b_path):
        but_dict.append(b.strip("\n"))
        word_add.add(b.strip("\n"))

    for l in open(l_path):
        lim_dict.append(l.strip("\n"))
        word_add.add(l.strip("\n"))

    for a in open(a_path):
        new_dict.append(a.strip("\n"))
        word_add.add(a.strip("\n"))

    for st in open(st_path):
        # st = st.decode("utf8")
        code1, st_code = st.split("\t")
        code, stock = st_code.split(",")
        stock_code_dict.append(code.strip("\n"))
        stock_dict.append(stock.strip("\n"))
        word_add.add(code.strip("\n"))
        word_add.add(stock.strip("\n"))
    stocks_df = pd.read_csv(st_new_path, encoding='utf-8')
    stock_df.append(stocks_df.set_index('SESNAME'))
    for index, row in stocks_df.iterrows():
        stock_dict.append(row.SESNAME)

    for z1 in open(zhi_ne_path):
        # z1 = z1.decode("utf8")
        new_dict.append(z1.strip("\n"))
        word_add.add(z1.strip("\n"))

    for z2 in open(zhi_po_path):
        # z2 = z2.decode("utf8")
        z2_data = z2.strip("\n")
        new_dict.append(z2_data)
        word_add.add(z2_data)

    for jg in open(jg_path):
        # jg = jg.decode("utf8")
        jg_data = jg.split("\t")[0].strip("\n")
        new_dict.append(jg_data)
        word_add.add(jg_data)
    '''
    # 将stock_words.txt中的股票词转换成jieba用户自定义词典的格式,然后添加到jieba的userdict中
    for st in open(st_path):
        code1, st_code = st.split("\t")
        code, stock = st_code.split(",")
        stock_dict.append(code + ' ' + '5' + ' ' + 'n')
        stock_dict.append(stock.strip('\n').decode('utf-8') + ' ' + '5' + ' ' + 'n')
    apply_func = codecs.open(n_path, 'w', 'utf-8')
    for i in stock_dict:
        apply_func.write(i + '\n')  # \n为换行符
    apply_func.close()
    '''
    # 添加用户自定义字典
    jieba.load_userdict(ns_path)
    jieba.load_userdict(n_path)
    jieba.load_userdict(jg_path)
    jieba.load_userdict(n2_path)

    # 添加新词
    for w in word_add:
        jieba.add_word(w)

    # 结巴添加新词
    jieba.add_word("淡定")
    # jieba.add_word("加多宝")
    # jieba.add_word("红罐")
    jieba.add_word("非公开")
    jieba.add_word("不成人形")
    jieba.add_word("中美贸易战")
    logging.logger.info("[Info] jieba总共添加了{}个自定义词汇。".format(len(word_add)))
Example #33
    - Download the traditional Chinese dictionary from [`jieba-tw`](https://raw.githubusercontent.com/ldkrsi/jieba-zh_TW/master/jieba/dict.txt)
    
```
jieba.set_dictionary(file_path)
```

- Add own project-specific dictionary

```
jieba.load_userdict(file_path)
```

- Add ad-hoc words to the dictionary

```
jieba.add_word(word, freq=None, tag=None)
```

- Remove words

```
jieba.del_word(word)
```

- Chinese stopwords (see [林宏任老師 GitHub](https://github.com/tomlinNTUB/Python/tree/master/%E4%B8%AD%E6%96%87%E5%88%86%E8%A9%9E))

    - `jieba.cut()` does not interact with the stopword list
    - `jieba.analyse.set_stop_words(file_path)`

- Word segmentation
    - `jieba.cut()` returns a `generator` object
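
Putting these calls together, a minimal sketch; the file names are placeholders and the sample words are illustrative:

```
import jieba
import jieba.analyse

jieba.set_dictionary('dict.txt.big')             # traditional-Chinese main dictionary
jieba.load_userdict('project_dict.txt')          # project-specific dictionary
jieba.add_word('石墨烯', freq=None, tag='n')     # ad-hoc word
jieba.del_word('自定义词')                       # remove a word
jieba.analyse.set_stop_words('stop_words.txt')   # affects jieba.analyse only, not jieba.cut()

words = jieba.cut('小明碩士畢業於國立臺灣大學')    # cut() returns a generator
print('/'.join(words))
```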
Example #34
#           entity mention: the recognized entity name
#           pos_b: start position of the entity mention in the file (0-based)
#           pos_e: end position of the entity mention in the file
#           category: the category the entity belongs to
#     DICT_NOW.csv:
#           a dictionary built from the terms behind all annotations;
#           this part only needs to pair each entity mention in ***.txt with its entity category
#           and write the pairs out as DICT_NOW.csv. (In practice, medical entities can also be crawled from medical websites or encyclopedias.)
dics = csv.reader(open("./source_data/DICT_NOW.csv", 'r', encoding='utf8'))

# =======================================================================================
# feed the domain terms into jieba's custom dictionary
# add each recognized target to jieba's vocabulary, treating its label as the POS tag
for row in dics:
    if len(row) == 2:
        jieba.add_word(row[0].strip(), tag=row[1].strip())
        # force the added word to be kept as one joined unit
        jieba.suggest_freq(row[0].strip())

# =======================================================================================
# read the target files, produce IOB-format labels, and write them to the dev, train and test files
split_num = 0
start_time = time.time()

for file in os.listdir(c_root):
    if "txtoriginal.txt" in file:
        fp = open(c_root + file, 'r', encoding='utf8')
        for line in fp:
            split_num += 1
            words = pseg.cut(line)  # 带词性切词
            # key: word; value: part of speech
Example #35
plt.rcParams[
    'savefig.dpi'] = 200  # 定义图形清晰度,set dpi for figure, affect the figure's size
plt.rcParams['figure.dpi'] = 200  #set dpi for figure
w, l, p = plt.pie(values,
                  explode=[0.02 for i in range(16)],
                  labels=labels,
                  pctdistance=0.8,
                  radius=1,
                  rotatelabels=True,
                  autopct=make_autopct(values))
[t.set_rotation(315) for t in p]  # rotate the text for the labels
plt.title('豆瓣 TOP250 电影种类', y=-0.1)
plt.show()

# word cloud
jieba.add_word('久石让')
jieba.add_word('谢耳朵')
# 一些语气词和没有意义的词
del_words = [
    '的', ' ', '人', '就是', '一个', '被', '电影', '我们', '不是', '也', '最', '了', '才', '给',
    '要', '就', '让', '在', '都', '是', '与', '和', '不', '有', '我', '你', '能', '每个',
    '不会', '中', '没有', '这样', '那么', '不要', '如果', '来', '它', '对', '当', '比', '不能',
    '却', '一种', '而', '不过', '只有', '不得不', '再', '不得不', '比', '一部', '啦', '他', '像',
    '会', '得', '里'
]
all_quotes = ''.join(quote_list)  # 将所有代表性评论拼接为一个文本
# 去掉标点符号
all_quotes = re.sub(
    r"[0-9\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+", " ",
    all_quotes)
words = jieba.lcut(all_quotes)
Example #36
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time : 2020/9/1 18:22
# @Author : way
# @Site :
# @Describe: 结巴分词

# import jieba_fast as jieba
import jieba

for i in ('悟空', '沙僧', '沙和尚', '猪八戒', '唐僧', '唐三藏'):
    jieba.add_word(i)

path = r"C:\Users\Administrator\Desktop\西游记.txt"
with open(path, 'r', encoding='utf-8') as f:
    result = jieba.lcut(f.read())
    new_result = [(i, result.count(i)) for i in set(result)
                  if i in ('悟空', '沙僧', '沙和尚', '猪八戒', '唐僧', '唐三藏')]
    new_result.sort(key=lambda x: x[1], reverse=True)
    for i in new_result:
        print(i)
# -*- coding: utf-8 -*-
# @Time : 2020/7/5 12:57
# @Author : MYH
# @File : 6.5 jieba库的使用.py
# @Software: PyCharm

# English text can be split into words with the split method, but Chinese text has no delimiters, which is why "word segmentation" is a problem; the jieba library is an important third-party Chinese word-segmentation library for Python.
# jieba does not ship with Python, so it has to be installed manually with `pip install jieba`; if that is too slow, use a domestic (Chinese) PyPI mirror.

# 引入jieba库
import jieba

# jieba.cut("你你你好")  # precise mode: returns an iterable
# jieba.cut(s, cut_all=True)  # full mode: outputs every possible word in the text
# jieba.cut_for_search(s)  # search-engine mode: segmentation suited to building a search index
# jieba.lcut(s)  # recommended; precise mode, returns a list
# jieba.lcut(s, cut_all=True)  # recommended; full mode, returns a list
# jieba.lcut_for_search(s)  # search-engine mode, returns a list; recommended
# jieba.add_word(w)  # add a new word w to the segmentation dictionary
print(jieba.lcut("中国是我的祖国,中华人民共和国是一个伟大的国家"))  # precise mode: the output covers the original text completely with no redundancy
print(jieba.lcut("中国是我的祖国,中华人民共和国是一个伟大的国家", cut_all=True))  # full mode: every word the original text could produce, maximum redundancy
print(jieba.lcut_for_search("中国是我的祖国,中华人民共和国是一个伟大的国家"))  # search-engine mode: precise mode first, then long words are split further

print(jieba.lcut("习大大在西湖游玩"))
jieba.add_word('习大大')  # 添加单词
print(jieba.lcut("习大大在西湖游玩"))
Example #38
u'郑州',u'武汉',u'长沙',u'厦门',u'苏州',u'南京',u'合肥',u'哈尔滨',u'青岛',u'湖州',u'无锡'
,u'福州',u'阜阳',u'贵阳',u'沈阳',u'浦东新区',u'通化']
stop_words_file_list = [
    "hit_stopword.txt",
    "baidu_stopword.txt",
    "sichuan_stopword.txt",
    "chinese_stopword.txt"
]

for file_path in stop_words_file_list:
    with open(os.path.join(os.path.dirname(__file__), file_path)) as f:
        lines = f.readlines()
        for line in lines:
            stop_words.append(line.strip().decode('utf-8'))

jieba.add_word("location_dic.txt")

with open(os.path.join(os.path.dirname(__file__), "fensi.txt")) as f:
    lines = f.readlines()
    for line in lines:
        clean_line = line.strip()
        if len(clean_line) > 0:
                seg_list = jieba.cut(clean_line)
                for seg in seg_list:
                    if seg not in stop_words:
                        tokens.append(seg)
                    
counter = Counter(tokens)
result=""
totalnum=0
for a in counter.most_common(57):
 def load_word_dict(self):
     if self.external_word_dict:
         for word in self.external_word_dict:
             jieba.add_word(word, freq=1000000)
Example #40
def add_word_dict(word, freq=None, tag=None):
    '''
    Add a new word to the dictionary.
    '''
    jieba.add_word(word, freq=freq, tag=tag)
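A hedged usage example (the word, frequency and tag are purely illustrative):

add_word_dict('石墨烯', freq=20000, tag='n')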
Example #41
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  #gb2312
import random
import numpy as np
from tflearn.data_utils import pad_sequences
#from pypinyin import pinyin,lazy_pinyin

from collections import Counter
import os
import pickle
import csv
import jieba
from data_mining.data_util_tfidf import cos_distance_bag_tfidf, get_tfidf_score_and_save
jieba.add_word('花呗')
jieba.add_word('借呗')
PAD_ID = 0
UNK_ID = 1
_PAD = "_PAD"
_UNK = "UNK"
TRUE_LABEL = '1'
splitter = "&|&"
special_start_token = [u'怎么', u'如何', u'为什么', u'为何']


def load_data(traning_data_path,
              vocab_word2index,
              vocab_label2index,
              sentence_len,
              name_scope,
Example #42
        yield questions, answers


def convert_to_str(string):

    segvec = segmentor.jieba_segment(string)
    cleanedvec = string_process.remove_invalid_string(segvec)
    sortedvec = sorted(cleanedvec)
    return (u'|'.join(sortedvec))


if __name__ == '__main__':

    logging.basicConfig(format='[%(asctime)-15s] : %(levelname)s : %(message)s', level=logging.INFO)

    jieba.add_word('做什么')
    jieba.add_word('吃什么')
    jieba.add_word('叫什么')
    jieba.add_word('唱什么')
    jieba.add_word('想什么')
    jieba.add_word('是什么')
    jieba.add_word('有什么')
    jieba.add_word('用什么')
    jieba.add_word('看什么')
    jieba.add_word('玩什么')
    jieba.add_word('爱什么')
    jieba.add_word('干什么')
    jieba.add_word('整什么')
    jieba.add_word('为什么')

    jieba.add_word('什么事')
Example #43
    def __init__(self, strict_mode=True):
        data = file2json(file2abspath('region_data.json', __file__))
        counter = collections.Counter()
        self.strict_mode = strict_mode

        self.data = {
            'items': {},  # 原始数据,基于cityid(多种指代,可以市省,市,区县级别)

            # 基于别名的索引, NER使用
            'province': {},  # 无重名
            'city': {},  # 有重名
            'district': {},  # 有重名


            'alias': {},  # 别名索引

            'lookup': collections.defaultdict(set),
        }

        # copy data
        for item in data:
            self.data['items'][item['cityid']] = item

        # process province
        map_province = collections.defaultdict(set)
        for item in data:
            p = item.get('province')
            c = item.get('city')
            d = item.get('district')
            if p and not c:
                # cityid 为省的ID
                item["type"] = "province"
                item["name"] = p
                map_province[p].add(item['cityid'])
        assert 34 == len(map_province), len(map_province)
        # logging.info(json.dumps(list(map_province.keys()), ensure_ascii=False))

        for p in sorted(list(map_province)):
            alias_list = normalize_province(p)
            pnorm = alias_list[1]
            self.data['province'][p] = {
                'province': self._get_list_province_unique(map_province[p]),
                'cityid_list': list(map_province[p]),
                'alias': [p] + alias_list}
            map_province[p] = pnorm
            if pnorm.startswith(u'安徽'):
                logging.info(json.dumps(alias_list))
            # print json.dumps(list(set([p,pnorm,pnorm2])),ensure_ascii=False)

        # process city
        map_city = collections.defaultdict(set)
        for item in data:
            c = item.get('city')
            d = item.get('district')
            if c in [u"市辖区", u"县", u"省直辖县级行政区划", u"自治区直辖县级行政区划"]:
                continue
            """
                {
                    "city": "市辖区",
                    "cityid": "310105",
                    "district": "长宁区",
                    "province": "上海市"
                }

                {
                    "city": "南通市",
                    "cityid": "320601",
                    "district": "市辖区",
                    "province": "江苏省"
                },
            """

            if c and not d:
                item["type"] = "city"
                item["name"] = c
                map_city[c].add(item['cityid'])
                if len(map_city[c]) != 1:
                    logging.error(json.dumps(item, ensure_ascii=False))
                    logging.error(len(map_city[c]))
                    assert len(map_city[c]) == 1

        assert 333 == len(map_city), len(map_city)
        # logging.info(json.dumps(list(map_city.keys()), ensure_ascii=False))

        for p in sorted(list(map_city)):
            alias_list = normalize_city(p)
            assert pnorm
            # print p, '-->',pnorm, '-->', pcompact
            self.data['city'][p] = {
                'province': self._get_list_province_unique(map_city[p]),
                'cityid_list': list(map_city[p]),
                'alias': [p] + alias_list}
        assert len(map_city) == len(self.data['city']), len(self.data['city'])

        # process district
        map_district = collections.defaultdict(set)
        for item in data:
            d = item.get('district')
            if d in [u"市辖区"]:
                # check above 市辖区 is used both as value of city and district
                # simply drop them since they already defined in city level
                continue

            if d:
                item["type"] = "district"
                item["name"] = d
                map_district[d].add(item['cityid'])
        assert 2821 == len(map_district), len(map_district)

        for p in sorted(list(map_district)):
            alias_list = normalize_district(p)
            # print p, '-->',pnorm, '-->', pcompact
            cityid_list = list(map_district[p])
            if len(cityid_list) > 1:
                # logging.info( len(cityid_list) )
                # logging.info( p )
                pass

            self.data['district'][p] = {
                'province': self._get_list_province_unique(map_district[p]),
                'cityid_list': cityid_list,
                'alias': [p] + alias_list}

        # process duplicated name 别名索引
        for index in ['province', 'city', 'district']:
            for name, data in self.data[index].items():
                for alias in set(data['alias']):
                    # if alias.startswith(u"清"):
                    #    logging.info(alias)
                    self.data['lookup'][alias].update(data['cityid_list'])

        for alias, alias_cityid_list in self.data['lookup'].items():
            alias_cityid_list_unique = set(alias_cityid_list)
            if len(alias_cityid_list_unique) > 1:
                # logging.debug(u"{} {}".format(alias, len(alias_cityid_list_unique)))
                # print alias
                for code in alias_cityid_list_unique:
                    # print json.dumps(self.data['items'][code], ensure_ascii=False)
                    pass

        # 有唯一省的地点名, 歧义地点名不管
        for alias, alias_cityid_list in self.data['lookup'].items():
            alias_cityid_list_unique = set(alias_cityid_list)
            province = self._get_list_province_unique(alias_cityid_list_unique)
            if province:
                self.data['alias'][alias] = province

        # with codecs.open(getTheFile('libcity_cn.new.json'),'w',encoding='utf-8') as f:
        #    json.dump(self.data, f,ensure_ascii=False, indent=4)
        # 统计
        for index in self.data:
            counter[index] = len(self.data[index])

        # validation
        for alias, entities in self.data['lookup'].items():
            if len(alias) == 1:
                logging.error(json.dumps(
                    entities, ensure_ascii=False, indent=4, sort_keys=True))
                if self.strict_mode:
                    exit()

            if alias in [u'自治']:
                logging.error(json.dumps(
                    entities, ensure_ascii=False, indent=4, sort_keys=True))
                if self.strict_mode:
                    exit()

            if len(entities) > 1:
                counter["one-alias-many-entities"] += 1
                # logging.info(u"{}[{}] {}".format(alias, len(entities), u",".join([x["name"]+x["type"] for x in entities])))

        # prepare for NER
        for index in ['province', 'city', 'district']:
            for name, data in self.data[index].items():
                for alias in set(data['alias']):
                    if re.search(ur"[省市县]$", alias):
                        jieba.add_word(alias, 10000000)
                    elif re.search(ur"[区]$", alias):
                        jieba.add_word(alias, 1000000)
                    else:
                        jieba.add_word(alias, 100000)

                        for suffix in u"路镇乡圩河区村":
                            jieba.add_word(u"{}{}".format(
                                alias, suffix), 1000000)
Example #44
# -*- coding: utf-8 -*-
import jieba

jieba.add_word("台中")
Example #45
cutted_words = jieba.cut(input_str)
words = [word for word in cutted_words]
print(words)

# 辨識新字詞
# 啟用 HMM 已辨識新字詞 (預設 HMM 功能即為啟用,可以不用特地設為 True)
input_string = "他来到了网易杭研大厦"
cutted_words = jieba.cut(input_string, HMM=True)
words = [word for word in cutted_words]
print(words)

# 在既有使用的字典下新增自定義字詞
jieba.load_userdict("jieba/test/userdict.txt")

# 動態加入字典
jieba.add_word("國立臺灣大學")
# 動態調整詞頻
jieba.suggest_freq("國立臺灣大學", True)

# 進行詞性標注(PoS Tagging)

import jieba.posseg as pseg

input_str = "小明碩士畢業於國立臺灣大學,現在在日本東京大學進修深造"
cutted_words = pseg.cut(input_str)
words = [(word, flag) for (word, flag) in cutted_words]
print(words)

# 取出斷詞位置

input_str = u'小明碩士畢業於國立臺灣大學,現在在日本東京大學進修深造'  #在此將字串轉為unicode
Example #46
import jieba

a = "我想听祖国祖国我们爱你"
b = "我想听祖国我爱你"

jieba.add_word("我想听")
jieba.add_word("请播放")
jieba.add_word("我要听")

# res = list(jieba.cut(a))
# res = list(jieba.cut(a))
res_search = list(jieba.cut_for_search(a))
res = list(jieba.cut_for_search(b))
print(res)
print(res_search)

# 0:我想听  1:祖国  2:我爱你  3:我们  4:爱  5:你
# 01(2,45)
# 01(345,45)

def add_words():
    import jieba
    jieba.add_word('大数据')
    jieba.add_word('深度学习')
    jieba.add_word('机器学习')
    jieba.add_word('数据分析')
print("精准模式: " + "/ ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")  # search-engine mode
print("搜索引擎模式: " + "/ ".join(seg_list))
seg_list = jieba.lcut("我来到北京清华大学", cut_all=False)
print(seg_list)  # 精准模式

import os
root_path = ''
jieba.set_dictionary(os.path.join(root_path, 'corpus/dict.txt.big'))
seg_list = jieba.lcut("今年他不幸得了新冠肺炎住院")
print(seg_list)

jieba.load_userdict(os.path.join(root_path, 'corpus/medical_term.txt'))
seg_list = jieba.lcut("今年他不幸得了COVID-19住院")
print(seg_list)
jieba.add_word('新冠肺炎')
seg_list = jieba.lcut("今年他不幸得了新冠肺炎住院")
print(seg_list)

seg_list = jieba.lcut("王小二是一个农民")
print(seg_list)

seg_list = jieba.lcut("发展社会主义的新乡村")
print(seg_list)
jieba.add_word('新乡村')
seg_list = jieba.lcut("发展社会主义的新乡村")
print(seg_list)

seg_list = jieba.lcut("王军虎去广州了")
print(seg_list)
seg_list = jieba.cut("王军虎去广州了。", HMM=False)
Example #49
def add_to_dict(characters_names):
    for name in characters_names:
        jieba.add_word(name)
Example #50
                idf_score += self.tfidf.idf_freq.get(match_keyword, self.tfidf.median_idf)
            score = (len(match_keywords), idf_score)
            if score > max_score and len(match_keywords) == len(and_keywords):
                max_score = score
        return max_score


if __name__ == '__main__':
    clf = KeywordClassifier()

    attrs = {
        "pos": {
            "target": "了解情况:1",
            "keywords": ["嗯", "可以", "好的"]
        },
        "neg": {
            "target": "挽回:1",
            "keywords": ["不需要", "不用了", "没有", "物业费,价格"],
            "patterns": ["不.*要"]

        },
        "spe": {
            "target": "挽回:2",
            "keywords": ["你是谁", "干什么", "楼盘,价格"]
        }
    }

    jieba.add_word("你是谁")
    print(clf.predict(attrs, '干什么'))
    print(clf.predict(attrs, '物业费价格'))
Example #51
def __init__():
    global FULL_DICT
    temp_full_dict = {
        # u'test': [],
        u'question': [],
        u'assessment-factor': [],
        # u'application': [],
        u'school': [],
        u'faculty': [],
        u'program': [],
        u'country': [],
        u'location': [],
        u'rank-source': [],
        u'keyword': [],
        u'faculty-type': [],
        u'location-field': [],
        u'school-field': [],
        u'faculty-field': [],
        u'program-field': [],
    }
    global STOP_WORDS
    # dict init
    for theme in temp_full_dict:
        print(theme)
        dict_reader = csv.reader(open('resource/sym_dict/' + theme + '.csv', 'rb'))
        rule_list = []
        for origin_rule in dict_reader:
            if len(origin_rule) < 2:
                continue
            rule_dict = {}
            try:
                rule_dict['value'] = int(origin_rule[0])
            except ValueError:
                rule_dict['value'] = smart_unicode(origin_rule[0])

            for rule_type in ['keyword', 'regex', 'entity']:
                rule_dict[rule_type] = list(map(
                    lambda x: smart_unicode(x)[len(rule_type) + 1:],
                    filter(
                        lambda x: x.find(rule_type) == 0,
                        origin_rule[1:]
                    )
                ))

            for rule_type in ['keyword']:
                rule_dict[rule_type] = list(map(
                    lambda x: x.lower(),
                    rule_dict[rule_type]
                ))

            for rule_type in ['keyword', 'entity']:
                rule_dict[rule_type] = set(rule_dict[rule_type])

            rule_list.append(rule_dict)
        temp_full_dict[theme] = rule_list

    # cut dict
    for theme in FULL_DICT:
        for rule_dict in FULL_DICT[theme]:
            for keyword in rule_dict['keyword']:
                jieba.add_word(keyword)
    jieba.load_userdict('resource/jieba_dict/other_dict.txt')

    temp_stop_words = set(
        map(
            lambda x: x.strip().decode('utf-8'),
            open('resource/jieba_dict/stop_word_dict.txt').readlines()
        ) +
        [u' ']
    )
    print('Dicts are successfully loaded!')
    FULL_DICT = temp_full_dict
    STOP_WORDS = temp_stop_words
Example #52
    del tf_counter[term]

adjusted_size = len(tf_counter)

df_counter = tf_counter & df_counter

print('Origin Size', origin_size)
print('Adjusted Size', adjusted_size)

print('----------------------------------')
print('\n')



for term in tf_counter:
    jieba.add_word(term, tf_counter[term] * 100000)



the_term_sets = []


for i in range(ws_max_row):
    
    the_row_idx = i + 1
    
    if the_row_idx == 1:
        continue
    
    the_title = ws['D' + str(the_row_idx)].value
    the_content = ws['E' + str(the_row_idx)].value
Example #53
"历史回顾",
"方针思想",
"人才培育",
"法制建设",
"环境保护",
"产业结构",
"海外投资",
"企业改革",
"土地改革",
"服务业",
"私营经济",
"科技研究"
]

for word in label:
    jieba.add_word(word, freq=10)

train_data = []
train_target= []

with io.open('newkeywords.txt', 'r', encoding='utf8') as f:       
    # sourceInLines = f.readlines()  
    #按行读出文件内容

    line = "" 
    for lines in f:
        if lines.strip().split(" ")[0] in label:
            if line == "":
                continue
            lables_in_sentence = lines.encode('utf-8').strip().split(" ")
Example #54
from collections import Counter
import jieba
import read_data
import json

counter = Counter()
data = read_data.collection.find()
all_list = []
jieba.add_word("大数据")
jieba.add_word("大数据挖掘")

stop_words = [
    '(', ')', ':', '-', '(', ')', '/', '-', '+', ' ', '丨', 'k', '6300'
]
for item in data:
    list = jieba.lcut(item['pname'])
    # print(item['pname'])
    for i in list:
        if i not in stop_words:
            counter[i] += 1
        # all_list.append(i)

counter = counter.most_common(250)
# print(c + " : " + str(counter[c]))

counter = dict(counter)

print(counter)
# print(str(counter)c)
# print(str(json.dumps(counter, ensure_ascii=False)))
Example #55
#common functions in jieba
import jieba
print(jieba.lcut("中国是一个伟大的国家"))  #precision mode
print(jieba.lcut("中国是一个伟大的国家", cut_all=True))  #full mode
print(jieba.lcut_for_search("中国人民共和国是伟大的"))  #search mode
jieba.add_word("蟒蛇语言")  #add a word to the dict
Example #56
#coding:gbk
'''
Author: 王冠超
Goal: extract the characters' appearance frequencies and the relationships between characters in the novel 《黎明破晓的街道》.
'''
import re
import jieba
from collections import Counter

characters = ['渡部','秋叶','有美子','仲西达彦','新谷','妙子','园美','芦原','钉宫真纪子','本条'] #小说里的主要人物
for character in characters:
	jieba.add_word(character)  #加进jieba字典里
#初始化容器分别存储每行的人物名和人物关系
line_names = list()
relationships = {i:dict() for i in characters}
character_attention = list()
#预处理文本并统计人物频率作为节点权重
with open('黎明破晓的街道.txt','r',encoding = 'utf-8')as f:
	for line in f.readlines():
		#找出每行问题中出现的人物名
		tmp_line_names = [word for word in jieba.cut(line) if word in characters]
		character_attention += tmp_line_names
		if len(tmp_line_names) > 1:
			line_names.append(tmp_line_names)
#统计人物出场频率
character_counter = Counter(character_attention)
#统计人物关系(以每行一起出现的次数代表人物关系)
for line in line_names:
	for character1 in line:
		for character2 in line:
			if character1 == character2:
Example #57
# coding=utf-8
'''
Created on 2015年5月11日

@author: BFD474
'''

from __future__ import print_function, unicode_literals
import sys
import jieba
import jieba.posseg as pseg

sys.path.append( "../" )
jieba.load_userdict( "userdict.txt" )

jieba.add_word( '石墨烯' )
jieba.add_word( '凱特琳' )
jieba.del_word( '自定义词' )

test_sent = ( 
"李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿\n"
"例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类\n"
"「台中」正確應該不會被切開。mac上可分出「石墨烯」;此時又可以分出來凱特琳了。"
 )
words = jieba.cut( test_sent )
print( '/'.join( words ) )

print( "="*40 )

result = pseg.cut( test_sent )
Example #58
def add_words(words):
    if words:
        for word in words:
            jieba.add_word(word)
    else:
        jieba.add_word('')
Example #59
MONGO_CONCEPT_COLLECTION = MONGO_CONCEPT_DB['conceptlist']

XML_SOURCE_FILE = '/home/daoming/data/*.xml'
XML_DEST_DIR = '/home/daoming/xml/'

def jiebaToList(data):
    wordList = []
    for tk in data:
        wordList.append({'word':tk[0],'pos':tk[1]})
    return wordList

jieba.initialize()
jieba.set_dictionary('dict.txt.big')
reader = MONGO_WORD_COLLECTION.find({'user':1,'cut':1})
for word in reader:
    jieba.add_word(word['name'],word['weight'],word['tag'])
dictConceptList = {}
reader = MONGO_CONCEPT_COLLECTION.find()
for word in reader:
    jieba.add_word(word['conceptname'],999,'NCI')
    dictConceptList[word['conceptname']] = word['conceptid']

filenameList=glob.glob(XML_SOURCE_FILE)
fileList = []
i = 0

for filename in filenameList:
    basename = os.path.basename(filename)
    reader = open(filename)
    xmldict = xmltodict.parse(reader.read())
    reader.close()
# -*- coding: utf-8 -*-
import jieba
from jieba import del_word, add_word

# Word segmentation
# Precise mode: tries to cut the sentence as accurately as possible; suited to text analysis.
# Full mode: scans out every word that could be formed from the sentence; very fast, but it cannot resolve ambiguity.
# Search-engine mode: on top of precise mode, long words are split again to raise recall; suited to search-engine indexing.
# Paddle mode: uses the PaddlePaddle deep-learning framework and a sequence-labeling (bidirectional GRU) model for segmentation; it also supports POS tagging.
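# Paddle mode is mentioned above but not demonstrated below; a hedged sketch, assuming
# the paddlepaddle-tiny package is installed:
# jieba.enable_paddle()
# seg_list = jieba.cut('我来到北京清华大学', use_paddle=True)
# print('paddle模式分词结果: ' + '/ '.join(seg_list))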

seg_list = jieba.cut('我来到北京清华大学', cut_all=True)
print('全模式分词结果: ' + '/ '.join(seg_list))

seg_list = jieba.cut('我来到北京清华大学', cut_all=False)
print('精确模式分词结果: ' + '/ '.join(seg_list))

seg_list = jieba.cut('他来到了网易杭研大厦')
print(', '.join(seg_list))

seg_list = jieba.cut_for_search('小明硕士毕业于中国科学院计算所,后在日本京都大学深造')
print('搜索引擎模式分词结果: ' + ', '.join(seg_list))

# Load a user dictionary. The file has one word per line, and each line has three space-separated parts in fixed order: word, frequency (optional) and POS tag (optional); when the frequency is omitted, an automatically computed value that guarantees the word can be segmented out is used.
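# A hypothetical vocabularity.txt fragment (one entry per line: word [freq] [tag]):
#   杭研大厦 10 n
#   云计算 5
#   石墨烯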
jieba.load_userdict('vocabularity.txt')
# add_word(word, freq=None, tag=None) and del_word(word) can modify the dictionary dynamically at runtime
# del_word('杭研大厦')
add_word('杭研大厦')
# For example, some medical or travel terms are not segmented out by default; raising their weight makes them come out as whole words
seg_list = jieba.cut('他来到了网易杭研大厦', cut_all=False)
print('载入词典分词结果: ' + '/ '.join(seg_list))
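# A hedged sketch of the weight-raising idea mentioned above, using jieba.suggest_freq
# to tune a phrase's frequency so it is kept as one token:
jieba.suggest_freq('杭研大厦', tune=True)
seg_list = jieba.cut('他来到了网易杭研大厦', HMM=False)
print('调整词频后分词结果: ' + '/ '.join(seg_list))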