Example #1
    def __init__(self,
                 max_vocab=None,
                 pad_token="<pad>",
                 unk_token="<unk>",
                 pad_id=0,
                 unk_id=1,
                 tokenize_method="char",
                 user_dict=None,
                 min_count=None):
        self.max_vocab = max_vocab
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.pad_id = pad_id
        self.unk_id = unk_id
        self.word2index = {pad_token: pad_id, unk_token: unk_id}
        self.index2word = {pad_id: pad_token, unk_id: unk_token}
        self.min_count = min_count

        if tokenize_method.lower() == "char":
            self.tokenize_method = self.char_tokenize
        elif tokenize_method.lower() == "word":
            jieba.setLogLevel(20)
            self.tokenize_method = self.jieba_tokenize
            if user_dict is not None:
                jieba.load_userdict(user_dict)
        else:
            raise TypeError(f"bad tokenize method: {tokenize_method}")
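Most of the snippets on this page revolve around jieba.load_userdict, which accepts a path (or file-like object) whose lines each contain a word, an optional frequency, and an optional part-of-speech tag separated by spaces. A minimal, self-contained sketch (my_dict.txt is a hypothetical file created only for the illustration):

import jieba_fast as jieba

# one entry per line: word [frequency] [POS tag]; frequency and tag may be omitted
with open("my_dict.txt", "w", encoding="utf-8") as f:
    f.write("云计算 10 n\n")
    f.write("自然语言处理\n")

jieba.load_userdict("my_dict.txt")  # supplements the built-in dictionary
print("/".join(jieba.cut("云计算与自然语言处理")))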
Example #2
def bare_dict():
    negations = set(json.load(open(os.path.join(DATA_DIR, 'negations.json'))))

    with open(os.path.join(DATA_DIR, "degrees.json")) as f:
        degrees = json.load(f)

    with open(os.path.join(DATA_DIR, 'pos.txt')) as f:
        pos_emotion = set([x.strip() for x in f.readlines()])

    with open(os.path.join(DATA_DIR, 'neg.txt')) as f:
        neg_emotion = set([x.strip() for x in f.readlines()])

    # with open(os.path.join(DATA_DIR, 'pos_eva.txt')) as f:
    #     pos_envalute = set([x.strip() for x in f.readlines()])

    # with open(os.path.join(DATA_DIR, 'neg_eva.txt')) as f:
    #     neg_envalute = set([x.strip() for x in f.readlines()])
    # places = os.path.join(os.path.dirname(__file__), "../dictionaries/places.txt")
    # tokenizer.load_userdict(places)

    # with open(os.path.join(DATA_DIR, 'pos_sentence.txt')) as f1,\
    #         open(os.path.join(DATA_DIR, 'neg_sentence.txt')) as f2:
    #     s1 = set([x.strip() for x in f1.readlines()])
    #     s2 = set([x.strip() for x in f2.readlines()])
    #     pos_emotion.union(s1)
    #     neg_emotion.union(s2)
    pos_neg = pos_emotion.union(neg_emotion)
    # pos_neg_eva = pos_envalute.union(neg_envalute)
    tokenizer.load_userdict(pos_neg)
def evaluate(cut_mode):
    if cut_mode == "word":
        import jieba_fast as jieba
        jieba.load_userdict("dict_fasttext.txt")
    vocab_map, _ = dataset.read_map('corpus/mapping')
    sess = tf.Session()
    Model = create_model(sess, 'test')
    Model.batch_size = 1

    sys.stdout.write('>')
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    sentence = sentence_cutter(sentence, cut_mode)

    while sentence:
        print('sentence: ', sentence)
        token_ids = dataset.convert_to_token(sentence, vocab_map)
        print('token_ids: ', token_ids)
        encoder_input, encoder_length, _ = Model.get_batch([(0, token_ids)])
        print('encoder_input: ', encoder_input, encoder_input.shape)
        print('encoder_length: ', encoder_length)
        score = Model.step(sess, encoder_input, encoder_length)
        print('Score: ', score[0][0])
        print('>', end='')
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        sentence = sentence_cutter(sentence, cut_mode)
Example #4
def my_wordcloud(filename):
    punct = str.maketrans("!.,:;-?※></()=,、。/[]《》", "                      ")
    plt.rcParams['font.sans-serif'] = 'PingFang TC'  # set the font

    # read the stopword list
    stop = [line.strip() for line in open('stopwords.txt').readlines()]

    print('停用字長度', len(stop))
    all_segs = []
    with open(filename) as file:
        for line in file:
            # print(line)
            line = line.translate(punct)
            segs = line.split(' ')
            for anyy in segs:
                if len(anyy.strip()) > 2:
                    all_segs.append(anyy.strip())
    # print(all_segs)
    print(len(all_segs))

    jieba.load_userdict('userdict.txt')

    word_appear_times = {}
    for i in all_segs:
        # print('-'*30)
        # print(i,':',list(jieba.cut(i,cut_all=False)))
        for anyy in list(jieba.cut(i, cut_all=True)):
            anyy = anyy.lower()
            if anyy not in stop and len(anyy.strip()) > 2:
                # print(anyy)
                if anyy not in word_appear_times:
                    word_appear_times[anyy] = 1
                else:
                    word_appear_times[anyy] += 1
            else:
                continue
        # print('-'*30,'\n')
        # time.sleep(3)
    # print(word_appear_times)

    word_appear_times_ordered = sorted(word_appear_times.items(),
                                       key=lambda x: x[1],
                                       reverse=True)
    top150 = word_appear_times_ordered[0:150]
    # print(top150)
    top150_word = ' '.join([x[0] for x in top150])
    print(top150_word)

    cloud_mask = np.array(Image.open("cloud_mask.png"))
    wc = WordCloud(colormap='RdYlGn',
                   mask=cloud_mask,
                   max_words=150,
                   background_color="black",
                   scale=4,
                   font_path='/System/Library/Fonts/PingFang.ttc')  # generate the word cloud
    wc.generate(top150_word)
    wc.to_file(f'{filename[:-4]}.jpg')
def make_segment_file(file_path):
    print("start seg file")
    jieba.load_userdict("./seg_dict.txt")
    with open(file_path) as f:
        document = f.read()
    d_cut = jieba.cut(document)
    res = " ".join(d_cut)
    with open("./segment_wiki.txt", "w") as f:
        f.write(res)
    print("segment file ok")
Example #6
def map_get_words(txts, kind="char", return_type="str"):
    if isinstance(txts, str):
        with open(txts, "r") as f:
            txts = [row.strip() for row in f.readlines()]
    jieba = None
    if kind == "word":
        import jieba_fast as jieba
        jieba.initialize()
        jieba.load_userdict("dict_fasttext.txt")
    txts = list(map(lambda txt: get_words(txt, kind, return_type, jieba),
                    txts))
    return txts
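jieba builds its prefix dictionary lazily on the first cut; jieba.initialize(), called above, forces that work to happen up front so the first mapped call is not slowed down. A minimal sketch of the same warm-up pattern (dict_fasttext.txt is the user dictionary referenced in the example):

import jieba_fast as jieba

jieba.initialize()                        # build the prefix dictionary now instead of on first use
jieba.load_userdict("dict_fasttext.txt")  # then layer the user dictionary on top
words = jieba.lcut("这是一个测试句子")      # lcut returns a list instead of a generator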
def extract_tag():
    ## word segmentation with jieba_fast
    jieba.load_userdict(dictionary)

    data = []
    jiebaAnalyse.set_stop_words("../resources/stopWord.txt")
    with open(data_file, 'r', encoding='utf8') as f:
        for no, line in enumerate(f):
            data.append(
                jiebaAnalyse.extract_tags(line, topK=50, allowPOS=['n']))

    print(data)
Example #8
def word_seg(input_file, output_file, mode):
    if mode == 'word':
        jieba.load_userdict(dict_path)

    with open(output_file, 'w') as f, open(input_file, 'r') as fi:
        for l in fi:
            # remove all whitespace characters
            l = ''.join(l.split())
            if mode == 'char':
                f.write(' '.join(list(l)) + '\n')
            else:
                seg = jieba.cut(l, cut_all=False)
                f.write(' '.join(seg) + '\n')
def update_userdict(userdict_old_file, userdict_new_file):

    userdict_old_df = pd.read_csv(userdict_old_file,
                                  names=['word', 'freq', 'pos_tag'],
                                  sep=' ',
                                  encoding='utf-8')  #[['word',]]
    userdict_old_df['word'] = userdict_old_df['word'].astype('str')
    # pool = Pool(processes=num_of_cpu)
    #userdict_old_df =pool.map(read_userdict,userdict_old_file)
    #pool.close()
    #userdict_old_df.drop('word_len', axis=1, inplace=True)
    userdict_old_df['word_len'] = userdict_old_df['word'].str.len()
    userdict_old_df.sort_values('word_len', ascending=True, inplace=True)
    #for i in tqdm(range(1,37)):
    word_len_list = sorted(
        userdict_old_df['word_len'].value_counts().index.tolist())
    for i in tqdm(word_len_list):
        try:
            jieba.load_userdict(userdict_new_file)
            print('loaded new userdict')
        except Exception:
            print('cannot load new userdict')

        df_processing = userdict_old_df[(userdict_old_df['word_len'] == i)]
        print('words_len: {}, no. of words in old userdict:{}'.format(
            i, df_processing.shape[0]))
        #if df_processing.shape[0] == 0:
        #   print('No word with length of {}'.format(i))
        #  pass
        #else:
        # print('processing word with length of {}'.format(i))
        df_processing.drop('word_len', axis=1, inplace=True)
        df_processing.drop_duplicates('word', keep='last', inplace=True)
        df_processing['freq'] = df_processing['word'].swifter.apply(
            cal_jieba_freq)
        print('words_len: {},  new userdict:{}'.format(i,
                                                       df_processing.shape[0]))
        #print(df_processing.head(20))
        df_processing.to_csv(userdict_new_file,
                             mode='a',
                             sep=' ',
                             index=None,
                             header=None,
                             encoding='utf-8')
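The cal_jieba_freq helper applied above is not shown here. For reference, jieba itself exposes suggest_freq, which computes a frequency large enough to keep (or split) a segment; a helper like cal_jieba_freq may or may not wrap it. A hedged sketch, assuming the goal is to keep each word as a single token:

import jieba_fast as jieba

jieba.initialize()
# suggest_freq returns a frequency that lets the segment survive as one token;
# tune=True also updates the in-memory dictionary with that frequency
freq = jieba.suggest_freq("自然语言处理", tune=True)
print(freq)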
def test(filename):
    if FLAGS.src_word_seg == 'word':
        import jieba_fast as jieba
        jieba.load_userdict("dict_fasttext.txt")
    sess = tf.Session()
    src_vocab_dict, _ = data_utils.read_map(source_mapping)
    trg_vocab_dict, _ = data_utils.read_map(target_mapping)
    model = create_seq2seq(sess, 'TEST')
    model.batch_size = 1
    #model.decoder_max_len = None

    #sources = ["你是誰","你是誰"]
    #targets = ["你是不是想人家","我是說你是我老婆"]
    df = pd.read_csv(filename)
    df = df.fillna('')
    sources = list(df["context"])
    targets = list(df["utterance"])
    scores = []
    for source, target in zip(sources, targets):
        if FLAGS.src_word_seg == 'word':
            source = (' ').join(jieba.lcut(source))
        elif FLAGS.src_word_seg == 'char':
            source = (' ').join([s for s in source])
        if FLAGS.trg_word_seg == 'word':
            target = (' ').join(jieba.lcut(target))
        elif FLAGS.trg_word_seg == 'char':
            target = (' ').join([t for t in target])
        src_token_ids = data_utils.convert_to_token(tf.compat.as_bytes(source),
                                                    src_vocab_dict, False)
        trg_token_ids = data_utils.convert_to_token(tf.compat.as_bytes(target),
                                                    trg_vocab_dict, False)
        trg_len = len(trg_token_ids)
        for i, bucket in enumerate(buckets):
            if bucket[0] >= len(src_token_ids):
                bucket_id = i
                break
        encoder_input, decoder_input, weight = model.get_batch(
            {bucket_id: [(src_token_ids, [])]}, bucket_id)
        output = model.run(sess, encoder_input, decoder_input, weight,
                           bucket_id)[:trg_len]
        output = [o[0][t] for t, o in zip(trg_token_ids, output)]
        output = np.mean(output)
        scores.append(output)
    scores = np.mean(scores)
    return scores
Example #11
    def gen_tokenize_method(self, split_type, user_dict=None, bert_vocab=None):
        lower_split_type = split_type.lower()

        if lower_split_type == "char":
            return self._char_split

        if lower_split_type == "word":
            jieba.setLogLevel(20)
            if user_dict is not None:
                jieba.load_userdict(user_dict)
            return self._word_split

        if lower_split_type == "word_piece":
            bert_vocab = bert_vocab or self.local_bert
            tokenizer = BertTokenizer.from_pretrained(bert_vocab)
            return partial(self._piece_split, tokenizer)

        raise TypeError(f"error tokenize type: {split_type}")
Example #12
    def get_name(self, sentence):  # extract the company name from sentence
        # load the company-name dictionary
        jieba.load_userdict(os.path.join(self.abs_path, 'company_list.txt'))

        sentence_seged = jieba.cut(sentence.strip(),
                                   cut_all=False)  # no full-mode cut; company names are fairly coarse-grained
        stopwords = self.stopwordslist(
            os.path.join(self.abs_path, "HIT_STOP.txt"))
        com_name = ''  # the company name
        for word in sentence_seged:
            if word not in stopwords:
                if word != '\t':
                    com_name = com_name + word + ' '
        com_name = jieba_fast.analyse.extract_tags(com_name,
                                                   topK=1,
                                                   withWeight=False,
                                                   allowPOS=('x',))  # filter by POS
        return com_name[0] if len(com_name) > 0 else ' '  # return the company name
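For reference, extract_tags performs its own POS tagging when allowPOS is supplied, so it can also be called on raw, unsegmented text. A minimal sketch of a call shaped like the one above (the sample sentence and POS filter are illustrative only):

import jieba_fast.analyse as analyse

text = "阿里巴巴网络技术有限公司成立于1999年"
# keep only the single top keyword, restricted to the given POS tags
tags = analyse.extract_tags(text, topK=1, withWeight=False, allowPOS=('nt', 'nz'))
print(tags)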
Example #13
    def _data_pre(self, title_list, content_list, label_list, pred_list):
        if len(title_list) == len(content_list) == len(label_list):
            self.data_size = len(title_list)
        else:
            print("The lengh of input list should be equal.")
            return
        data = [
            a + b[:min(400, len(b))] for a, b in zip(title_list, content_list)
        ]
        count_vect = CountVectorizer()
        jieba_fast.load_userdict(
            "/data1/sina_dw/shichen/FastText/cppjieba/dict/user.dict.utf8")
        train_data_terms = [self._cut(r) for r in data]
        train_data = count_vect.fit_transform(train_data_terms)
        lb = sklearn.preprocessing.LabelBinarizer()
        Y = lb.fit_transform(label_list)
        if self.classes_ is None:
            self.classes_ = lb.classes_
        if self.vocabulary_ is None:
            self.vocabulary_ = count_vect.vocabulary_

        return train_data, Y, train_data_terms, count_vect
from torchtext.data import Field, LabelField, TabularDataset, BucketIterator
from torchtext.vocab import Vectors
from config import user_dict
from utils.text_util import pretreatment
from utils.ml_util import init_unk
from functools import partial
import jieba_fast as jieba
import torch

jieba.setLogLevel(20)
jieba.load_userdict(user_dict)


class BatchWrapper(object):
    def __init__(self, batch_iter, x_var, y_vars):
        self.batch_iter = batch_iter
        self.x_var = x_var
        self.y_vars = y_vars

    def __iter__(self):
        for batch in self.batch_iter:
            x, lengths = getattr(batch, self.x_var)

            y_tensors = [
                getattr(batch, y_var).unsqueeze(1) for y_var in self.y_vars
            ]
            y = torch.cat(tuple(y_tensors), dim=1)

            yield x, y, lengths

    def __len__(self):
        return len(self.batch_iter)
Example #15
"""Data processing"""
'''
jieba word-segmentation module
'''
import time
import jieba_fast
import jieba_fast.posseg as pseg
import sys
reload(sys)
sys.setdefaultencoding('utf8')

jieba_fast.load_userdict('/Users/zhuxinquan/Desktop/mykeyword.dict')
jieba_fast.add_word('烤鸭炉')
# store the stopwords
fid2 = '/Users/zhuxinquan/Desktop/停用词调整_二手.txt'
stopword = {}
fid2 = open(fid2, 'r')
for j in fid2.readlines():
    stopword[j.strip().decode("utf-8")] = 1


def stop_word(line):
    data_line = line.strip()
    wordList = jieba_fast.cut(data_line)  # wordList is a generator
    outStr = ''
    t1 = time.time()
    for word in wordList:
Example #16
    def __init__(self):
        # load the ant-related user dictionary
        jieba.load_userdict(JIEBA_DICT_SELF_DEFINE)
import glob, codecs, re, gzip
import jieba_fast as jieba

pdpath = '/home/fqx/Documents/pd-corpus/**/*.txt'
corpuspath = 'corpus/pd-aio.txt.gz'
paragraphbreak = re.compile('[‖ ]')
linebreak = re.compile('[【】。!?… ]')
jieba.load_userdict('names.txt')
jieba.enable_parallel(4)
aiofile = gzip.open(corpuspath, 'wt', encoding='utf-8')

pdfiles = glob.glob(pdpath,recursive=True)

for addr in pdfiles:
    print('Processing %s' % addr)
    try:
        file = codecs.open(addr, 'r', 'GB18030')
        lines = file.readlines()
        file.close()
    except UnicodeDecodeError:
        print('Decoding Error!')
        continue

    for line in lines:
        paras = re.split(paragraphbreak,line)
        for para in paras:
            reallines = re.split(linebreak,para)
            for realline in reallines:
                if len(realline) > 19:
                    words = jieba.cut(realline)
                    realwords = ' '.join(words)
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re,time,glob
import jieba_fast as jieba
from gensim import corpora, models, similarities
import logging
import gensim
from gensim.similarities import Similarity
import thulac
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
#thu1 = thulac.thulac(user_dict = 'E:\\codetest\\fintech\\topic2\\THUOCL_caijing.txt',seg_only=True, filt=True)
dicts = glob.glob('E:\\codetest\\fintech\\topic2\\userdict\\*')
for d in dicts:
    print(d)
    jieba.load_userdict(d)

def readFile():
    train_data = 'E:\\codetest\\fintech\\topic2\\train_data.csv'
    test_data = 'E:\\codetest\\fintech\\topic2\\test_data.csv'
    stop = 'E:\\codetest\\fintech\\topic2\\stopwords.dat'
    trainD = pd.read_csv(train_data, skiprows=0,index_col='id')
    testD = pd.read_csv(test_data,index_col='id')
    stopword = [line.strip() for line in open(stop,encoding='utf-8').readlines()]
    return trainD,testD,stopword

def tcutword(data,stopword):
    corpora_documents = []
    for i in data.index:
        text = data.loc[i].values[0].strip()
        text = re.sub('[\"*\【\】\[\]\s*]', '', text)  # strip special symbols
Example #19
import sys
from google_images_download import google_images_download
import speech_recognition as sr
from jieba_fast.analyse import extract_tags
import jieba_fast as jieba
jieba.set_dictionary("app/dictionary.txt")
jieba.load_userdict("app/dictionary.txt")

with open("app/dictionary.txt", "r") as f:
    places_list = [d.strip().split(' ')[0] for d in f.readlines()]


def asr_result(way="file", file_path=None):
    counter = 0
    r = sr.Recognizer()
    if way == "mic":
        with sr.Microphone() as source:
            print("請開始說話:")
            audio = r.listen(source)
    elif way == "file":
        with sr.AudioFile(file_path) as source:
            audio = r.record(source)  # read the entire audio file
    try:
        # for testing purposes, we're just using the default API key
        # to use another API key, use `r.recognize_google(audio, key="GOOGLE_SPEECH_RECOGNITION_API_KEY")`
        # instead of `r.recognize_google(audio)`
        result = r.recognize_google(audio, language='zh-tw')
        print("辨識結果: " + result)
        return result
    except sr.UnknownValueError:
        counter += 1
                              sep=' ',
                              names=['word', 'ner'])

#####----------------  Load jieba data ----------------------------------------------------
jieba_dict_folder = existing_ner_folder
jieba_dict_filename = 'userdict.txt'
jieba_dict_file = os.path.join(jieba_dict_folder, jieba_dict_filename)

print('loading jieba_dict')
jieba_dict_df = pd.read_csv(jieba_dict_file,
                            sep=' ',
                            names=['word', 'freq', 'pos'])
print('loading jieba.set_dictionary')
jieba.set_dictionary(jieba_dict_file)
print('loading jieba.load_userdict')
jieba.load_userdict(jieba_dict_file)

#####----------------  Load ner2pos data ----------------------------------------------------
ner2pos_folder = existing_ner_folder
ner2pos_filename = 'ner2pos.json'
ner2pos_file = os.path.join(ner2pos_folder, ner2pos_filename)

print('loading ner2pos')
with open(ner2pos_file, 'r') as fp:
    ner2pos = json.load(fp)

pos2ner = {v: k for k, v in ner2pos.items()}
#print(pos2ner)

print('\n\n')
#####----------------  Load ner2pos data ----------------------------------------------------
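Several snippets here call both jieba.set_dictionary and jieba.load_userdict on the same file, as above. The two calls do different things: set_dictionary swaps out the main dictionary that jieba initializes from, while load_userdict merges extra entries on top of whatever main dictionary is active. A minimal sketch (the path is a placeholder):

import jieba_fast as jieba

jieba.set_dictionary("userdict.txt")  # use this file as the main dictionary
jieba.load_userdict("userdict.txt")   # also merge it as a supplementary user dictionary
jieba.initialize()                    # build the prefix dictionary with those settings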
Example #21
    posdelim = args.pos

    def cutfunc(sentence, _, HMM=True):
        for w, f in jieba_fast.posseg.cut(sentence, HMM):
            yield w + posdelim + f
else:
    cutfunc = jieba.cut

delim = text_type(args.delimiter)
cutall = args.cutall
hmm = args.hmm
fp = open(args.filename, 'r') if args.filename else sys.stdin

if args.dict:
    jieba.initialize(args.dict)
else:
    jieba.initialize()
if args.user_dict:
    jieba.load_userdict(args.user_dict)

ln = fp.readline()
while ln:
    l = ln.rstrip('\r\n')
    result = delim.join(cutfunc(ln.rstrip('\r\n'), cutall, hmm))
    if PY2:
        result = result.encode(default_encoding)
    print(result)
    ln = fp.readline()

fp.close()
Example #22
def tag_one_file_test(file: str):
    '''
    Test version: prints the extracted tags to the console.
    Internally requires: the path to a txt dictionary of NBA-specific proper nouns and the path to a txt stopword list.
    :param file: path to a JSON file whose keys are title, url, reply, views, comefrom, time, text, tags
    :return:
    '''
    import json
    import jieba_fast as jb
    jb.load_userdict('dict3.txt')  # domain-specific dictionary
    import jieba_fast.analyse as jbana
    textranker = jbana.TextRank()
    tfidfer = jbana.TFIDF()  # keyword extractors
    textranker.set_stop_words('stop.txt')
    tfidfer.set_stop_words('stop.txt')  # stop words

    try:
        json_file = open(file, 'r', encoding='utf-8')
    except IOError as ioe:
        print('fail to open', file, ioe, sep=' ')
        return
    doc_list = json.load(json_file)
    for doc in doc_list:
        title = doc['title']
        text = doc['text']
        true_tag = doc['tags'] if 'tags' in doc.keys() else None
        whole = title + ' '
        for string in text:
            whole = whole + string  # concatenate the whole article

        split_gen = jb.cut(whole)  # a generator
        split_whole = ''
        for s in split_gen:
            split_whole = split_whole + ' ' + s  # the article after segmentation

        tag1w = tfidfer.extract_tags(split_whole,
                                     topK=10,
                                     withWeight=True,
                                     allowPOS=('nr', 'nz'))
        tag2w = tfidfer.extract_tags(split_whole,
                                     topK=10,
                                     withWeight=True,
                                     allowPOS=('an', 'b', 'j', 'l', 'Ng', 'n',
                                               'nr', 'ns', 'nz', 'nt'))
        tag3w = textranker.textrank(
            split_whole,
            topK=10,
            withWeight=True
            # , allowPOS=('nr', 'nz', 'n')
            #                   , allowPOS=(
            # 'Ag', 'a', 'ad', 'an', 'b', 'c', 'dg', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'Ng',
            # 'n', 'nr', 'ns', 'nt', 'nz', 'o', 'p', 'q', 'r', 's', 'tg', 't', 'u'
            # , 'vg', 'v', 'vd', 'vn', 'w', 'x', 'y', 'z', 'un')  # all pos
            #                   , allowPOS=(
            # 'Ag', 'a', 'ad', 'an', 'b', 'c', 'dg', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'Ng',
            # 'n', 'nr', 'ns', 'nt', 'nz', 'o', 'p', 'q', 'r', 's', 'u', 'w', 'y', 'z', 'un')
            ,
            allowPOS=('an', 'b', 'j', 'k', 'l', 'Ng', 'n', 'nr', 'ns', 'nz',
                      'vn', 's'))
        tag1 = []
        wei1 = []
        for tup in tag1w:
            tag1.append(tup[0])
            wei1.append(tup[1])
        tag2 = []
        wei2 = []
        for tup in tag2w:
            tag2.append(tup[0])
            wei2.append(tup[1])
        tag3 = []
        wei3 = []
        for tup in tag3w:
            tag3.append(tup[0])
            wei3.append(tup[1])


#: Below, the three extractors vote to select the final keywords.
# Option 1:
# final_tag = set()
# import random
# choose_size = 5
# if len(tag1) < choose_size:
#     final_tag = final_tag.union(set(tag1))
# else:
#     cho1 = random.choices(tag1, weights=wei1, k=choose_size)
#     final_tag = final_tag.union(set(cho1))
#
# if len(tag2) < choose_size:
#     final_tag = final_tag.union(set(tag2))
# else:
#     cho2 = random.choices(tag2, weights=wei2, k=choose_size)
#     final_tag = final_tag.union(set(cho2))
#
# if len(tag3) < choose_size:
#     final_tag = final_tag.union(set(tag3))
# else:
#     cho3 = random.choices(tag3, weights=wei3, k=choose_size)
#     final_tag = final_tag.union(set(cho3))
# Option 2:
        final_tag = list()
        import random
        choose_size = 5  # a parameter controlling how many keywords are randomly selected
        if len(tag1) < choose_size:
            final_tag.extend(tag1w)
        else:
            cho1 = random.choices(
                tag1w, weights=wei1,
                k=choose_size)  # weighted random choice of choose_size keywords using the extractor's weights
            final_tag.extend(cho1)  # merge each extractor's proposals

        if len(tag2) < choose_size:
            final_tag.extend(tag2w)
        else:
            cho2 = random.choices(tag2w, weights=wei2, k=choose_size)
            final_tag.extend(cho2)

        if len(tag3) < choose_size:
            final_tag.extend(tag3w)
        else:
            cho3 = random.choices(tag3w, weights=wei3, k=choose_size)
            final_tag.extend(cho3)

        final_tag.sort(key=lambda x: float(x[1]),
                       reverse=True)  # sort all proposed keywords by weight (duplicates allowed)
        tag = []
        wei = []
        for tup in final_tag:
            tag.append(tup[0])
            wei.append(tup[1])
        if len(tag) < 2 * choose_size:
            final_tag = tag
        else:
            choose = random.choices(tag, weights=wei, k=2 * choose_size)
            final_tag = choose  # one more weighted random draw, of size 2*choose_size
        final_tag = set(final_tag)  # deduplicate
        print('get tags:', final_tag)
        # A possible alternative: deduplicate first to preserve the descending order; the current result seems fine, though, perhaps even better than the sorted one.
        # final_tag = set(final_tag)
        # final_tag = sorted(final_tag, key=lambda x: float(x[1]), reverse=True)     # sort keywords by weight
        # tag = []
        # wei = []
        # for tup in final_tag:
        #     tag.append(tup[0])
        #     wei.append(tup[1])
        # if len(tag) < 2 * choose_size:
        #     final_tag = tag
        # else:
        #     choose = random.choices(tag, weights=wei, k=2*choose_size)
        #     final_tag = choose
    json_file.close()
Example #23
        if isNature:
            nature = w.nature
            chN = natures[nature.toString()]
            r.append(word + ":( " + nature.toString() + chN + ")")
        else:
            r.append(word.strip())
    return r


if __name__ == '__main__1':
    out_file = "./data/userdict1.txt"
    user_dict(out_file)

if __name__ == '__main__':

    jieba.load_userdict('./data/userdict.txt')
    stopwords = [
        line.rstrip()
        for line in open('./data/stopwords.txt', encoding='utf-8')
    ]

    file = "./data/hotword.txt"
    with open(file, mode="r", encoding="utf-8") as fp:
        i = 0
        for line in fp.readlines()[:]:
            text = json.loads(line)['content']
            print(i)
            cleanedText = clean_text(text)
            print("-" * 60 + "清洗之后:")
            print(cleanedText)
            c = [word for word in jieba.cut(cleanedText)]
# e_ner=pd.read_csv(e_ner_file,sep=' ',header=0,encoding='utf-8')['word'].values
#
#
sentences_folder = 'sentences'
NER_resources_folder = 'NER_resources'
bio_folder = 'bio_corpus'

ner_txtfilename = 'ner_results.txt'
ner_txtfile = os.path.join(NER_resources_folder, ner_txtfilename)
ners = pd.read_csv(ner_txtfile,
                   sep=' ',
                   names=['word', 'ner'],
                   encoding='utf-8')

jieba.set_dictionary(os.path.join(NER_resources_folder, 'userdict_ner.txt'))
jieba.load_userdict(os.path.join(NER_resources_folder, 'userdict_ner.txt'))

sentence_filename = 'whole.txt'
sentence_file = os.path.join(sentences_folder, sentence_filename)


def convert_sentence_bio(sentence):

    word_list, label_list = [], []

    for c in jieba.lcut(sentence.strip('\n'), cut_all=False, HMM=False):
        if c not in ners.word.values:
            c_tag = 'O'
            word_tag = list(zip(list(c), [c_tag] * len(c)))
        else:
            c_ner = ners[ners['word'] == c]['ner'].values[0]
Example #25
    def __init__(self):
        if comConfig.use_dict:
            jieba.load_userdict(fileConfig.dir_jieba +
                                fileConfig.file_jieba_dict)
Example #26
from train import run 
from flags import FLAGS 
import jieba_fast as jieba
jieba.load_userdict("data/dict_fasttext.txt")

if __name__ == "__main__":
    #infer_file = "data/train_pos.csv"
    infer_file = FLAGS.inference_data_path 
    mean_prob = run([infer_file], mode="infer", jieba=jieba)
    print("coh2 score: ",mean_prob)
    if FLAGS.log_path: 
        with open(FLAGS.log_path,"a") as f:
            f.write("coh2: %s\n"%mean_prob)
import json
import os
import jieba_fast as jieba
import gensim
from gensim import corpora

jieba.load_userdict("F:/代码code/DSTS/dict/dict_baidu_utf8.txt")
jieba.load_userdict("F:/代码code/DSTS/dict/dict_pangu.txt")
jieba.load_userdict("F:/代码code/DSTS/dict/dict_sougou_utf8.txt")
jieba.load_userdict("F:/代码code/DSTS/dict/dict_tencent_utf8.txt")
jieba.load_userdict("F:/代码code/DSTS/dict/my_dict.txt")
stopwords = []  # build the stopword list
for line in open('F:/代码code/DSTS/dict/Stopword.txt', encoding='UTF-8'):
    x = line.split('\n')[0]
    stopwords.append(x)


def LDA_topic(text):
    vector = []
    seg_list = jieba.lcut(text, cut_all=False)  # segment the text into a list of tokens
    result = []
    for j in seg_list:  # drop stopwords
        if j not in stopwords and j != ' ':
            result.append(j)
    result = [result]
    dictionary = corpora.Dictionary(result)  # build the dictionary, assigning an index to each token
    # use the dictionary above to turn the document list (corpus) into a document-term matrix
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in result]
    model = gensim.models.LdaModel.load('F:/代码code/DSTS/LDA/lda.model')  # load the trained model
    for e, values in enumerate(model.inference(doc_term_matrix)[0]):
        for ee, value in enumerate(values):
Example #28
def tag_one_file2(file: str):
    '''
    Server version: extracts tags for one JSON file and writes the whole tagged JSON back to the original file.
    Internally requires (paths can be hard-coded): the path to the NBA proper-noun dictionary ('../NBAdict.txt'), the path to the stopword list ('../stopword.txt'),
                        and the choose_size parameter, which controls how many of the extractors' proposals are randomly accepted.
    :param file: path to a JSON file containing an array; each element is one news article with keys title, url, reply, views, comefrom, time, text, tags
    :return: outputs the JSON file with the added tags
    '''
    import json
    import jieba as jb0
    jb0.setLogLevel(logging.INFO)
    import jieba_fast as jb
    jb.setLogLevel(logging.INFO)  # keep jieba from printing debug messages

    try:
        jb.load_userdict('NBAdict.txt')  # domain-specific dictionary
    except:
        logger.exception('fail to open dictionary')

    from jieba_fast.analyse.textrank import TextRank
    from jieba_fast.analyse.tfidf import TFIDF

    textranker = TextRank()
    tfidfer = TFIDF()  # keyword extractors
    try:
        textranker.set_stop_words('stopword.txt')
        tfidfer.set_stop_words('stopword.txt')  # stop words
    except:
        logger.exception('fail to set stop words')

    try:
        json_file = open(file, 'r', encoding='utf-8')
    except IOError as ioe:
        logger.exception('fail to open ' + file)  # failed to open the file
        raise ioe
    try:
        doc_list = json.load(json_file)
    except Exception as e:  # error type unknown
        logger.exception('fail to load json file:' + file)  # failed to load the file
        json_file.close()
        raise e

    for doc in doc_list:  # for each article
        keys = doc.keys()
        title = doc['title'] if 'title' in keys else ''
        text = doc['text'] if 'text' in keys else ''
        old_tag = doc['tags'] if 'tags' in keys else []  # the originally crawled tags
        whole = title + ' '
        for string in text:  # iterable: an empty value will not raise inside the for loop
            whole = whole + string  # concatenate the whole article

        split_gen = jb.cut(whole)  # segment using the dictionary; this is a generator
        split_whole = ''
        for s in split_gen:
            split_whole = split_whole + ' ' + s  # the article after segmentation
        # three extractors; each one's proposal is a list of (tag, weight) tuples
        tag1w = tfidfer.extract_tags(split_whole,
                                     topK=10,
                                     withWeight=True,
                                     allowPOS=('nr', 'nz'))
        tag2w = tfidfer.extract_tags(split_whole,
                                     topK=10,
                                     withWeight=True,
                                     allowPOS=('an', 'b', 'j', 'l', 'Ng', 'n',
                                               'nr', 'ns', 'nz', 'nt'))
        tag3w = textranker.textrank(split_whole,
                                    topK=10,
                                    withWeight=True,
                                    allowPOS=('an', 'b', 'j', 'k', 'l', 'Ng',
                                              'n', 'nr', 'ns', 'nz', 'vn',
                                              's'))
        #: collect all the weights into a list for the weighted random selection below
        # tag1 = []
        wei1 = []
        for tup in tag1w:
            # tag1.append(tup[0])
            wei1.append(tup[1])
        # tag2 = []
        wei2 = []
        for tup in tag2w:
            # tag2.append(tup[0])
            wei2.append(tup[1])
        # tag3 = []
        wei3 = []
        for tup in tag3w:
            # tag3.append(tup[0])
            wei3.append(tup[1])

        final_tagw = list()
        import random
        choose_size = 5  # a parameter controlling how many keywords are randomly selected
        if len(tag1w) < choose_size:
            final_tagw.extend(tag1w)
        else:
            cho1 = random.choices(
                tag1w, weights=wei1,
                k=choose_size)  # weighted random choice of choose_size keywords using the extractor's weights
            final_tagw.extend(cho1)  # merge each extractor's proposals

        if len(tag2w) < choose_size:
            final_tagw.extend(tag2w)
        else:
            cho2 = random.choices(tag2w, weights=wei2, k=choose_size)
            final_tagw.extend(cho2)

        if len(tag3w) < choose_size:
            final_tagw.extend(tag3w)
        else:
            cho3 = random.choices(tag3w, weights=wei3, k=choose_size)
            final_tagw.extend(cho3)

        final_tagw.sort(key=lambda x: float(x[1]),
                        reverse=True)  # sort all proposed keywords by weight (duplicates allowed)
        tag = []
        wei = []
        for tup in final_tagw:
            tag.append(tup[0])
            wei.append(tup[1])

        final_tag = list()
        if len(tag) < 2 * choose_size:
            final_tag = tag
        else:
            choose = random.choices(tag, weights=wei, k=2 * choose_size)
            final_tag = choose  # one more weighted random draw, of size 2*choose_size
        final_tag = set(final_tag + old_tag)  # merge with the original tags and deduplicate

        doc['tags'] = list(final_tag)  # overwrite the original tags
    json_file.close()

    with open(file, 'wt', encoding='utf-8') as fo:
        json.dump(doc_list, fo, ensure_ascii=False)
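Note that random.choices samples with replacement, which is why the code above deduplicates with set() after the weighted draws. A small standalone sketch of that voting step, using made-up (tag, weight) pairs:

import random

tag_weight = [("湖人", 0.9), ("季后赛", 0.6), ("篮板", 0.3)]
weights = [w for _, w in tag_weight]
# weighted sampling with replacement, then deduplication of the picked tags
picked = random.choices(tag_weight, weights=weights, k=5)
final_tags = {t for t, _ in picked}
print(final_tags)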
Example #29
def my_wordcloud(filename):

    t1 = time.time()

    # read the stopword list
    stopwords = [line.strip() for line in open('stopwords.txt').readlines()]
    print('停用字長度', len(stopwords))
    print('filename : ', filename)

    jieba.load_userdict('userdict.txt')

    words = []
    with open(filename, 'r') as fileread:
        for line in fileread.readlines():
            line = line.replace(' ', '')
            line = line.replace('7-11', '統一超商')
            line = line.replace('711', '統一超商')
            line = line.replace('SEVEN', '統一超商')
            line = line.replace('7-eleven', '統一超商')
            line = line.replace('小7', '統一超商')
            line = line.replace('7Eleven', '統一超商')
            line = line.replace('seven', '統一超商')
            line = line.replace('小七', '統一超商')
            cutted = jieba.cut(line, cut_all=False)
            for word in cutted:
                if word.lower() not in stopwords:
                    words.append(word.lower())
    t2 = time.time()
    print(f'分詞使用時間 {t2-t1:.4f} s')
    print(len(words))
    #words_in_string = ' '.join(words)

    word_appear_times = {}
    for i in words:
        if i not in word_appear_times:
            word_appear_times[i] = 1
        else:
            word_appear_times[i] += 1
    # print(word_appear_times)

    word_appear_times_ordered = sorted(word_appear_times.items(),
                                       key=lambda x: x[1],
                                       reverse=True)
    # top150 = word_appear_times_ordered[0:150]
    # print(top150)
    top150_word = ' '.join([x[0] for x in word_appear_times_ordered[0:150]])
    #print(top150_word)

    cloud_mask = np.array(Image.open("cloud_mask.png"))
    wc = WordCloud(repeat=False,
                   include_numbers=False,
                   max_words=150,
                   min_word_length=2,
                   colormap='RdYlGn',
                   mask=cloud_mask,
                   background_color="black",
                   scale=4,
                   font_path='/System/Library/Fonts/PingFang.ttc')
    wc.generate(top150_word)
    wc.to_file(f'{filename[:-4]}.jpg')

    t3 = time.time()
    print(f'畫圖使用時間 {t3-t2:.4f} s')
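The chain of line.replace calls above normalizes brand-name synonyms one at a time; the same effect can be expressed with a mapping table, sketched here with a hypothetical SYNONYMS dict and normalize helper:

# hypothetical synonym table; extend as needed
SYNONYMS = {
    "7-11": "統一超商",
    "711": "統一超商",
    "SEVEN": "統一超商",
    "7-eleven": "統一超商",
    "小7": "統一超商",
    "7Eleven": "統一超商",
    "seven": "統一超商",
    "小七": "統一超商",
}

def normalize(line):
    for alias, canonical in SYNONYMS.items():
        line = line.replace(alias, canonical)
    return line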
Example #30
from sklearn.metrics import accuracy_score




model_path = r'/home/sunxianwei/python/result/kind2level4/test_model_path'
bs_kind_cat4_system_path = r'/home/sunxianwei/python/data/kind/test_config_filepath/kind_system_allcat_20180601.csv'
stopwords_path = r'/home/sunxianwei/python/data/kind/test_config_filepath/filter_char.txt'

dataseg_path = r'/home/sunxianwei/python/data/kind/test_config_filepath/prod_clothes20180601_to_keywords.csv'
userdict_path = False
tfidf_max_features_config_dict = {1:None,3:None,5:None,7:None,10:None}

if userdict_path:
    print('加载自定义词典')
    jieba.load_userdict(userdict_path)

bs_kind_cat4_system = pd.read_csv(bs_kind_cat4_system_path,encoding='utf8')
bs_kind_cat4_system.fillna(0,inplace=True)
bs_kind_cat4_system.parent_catid=bs_kind_cat4_system.parent_catid.astype(int)
bs_kind_cat4_system=[{'catid':row['cat_id'],'catname':row['cat_name'],'parent_catid':row['parent_catid']} for index,row in bs_kind_cat4_system.iterrows()]




def readdata_from_pgsql(username,passwd,queryStr):
    '''
    Two key fields need to be normalized; all other fields are kept as-is.
    prod_name: product name
    catid: category ID
    '''