def __init__(self, max_vocab=None, pad_token="<pad>", unk_token="<unk>",
             pad_id=0, unk_id=1, tokenize_method="char", user_dict=None,
             min_count=None):
    self.max_vocab = max_vocab
    self.pad_token = pad_token
    self.unk_token = unk_token
    self.pad_id = pad_id
    self.unk_id = unk_id
    # reserve the special tokens at fixed ids in both directions
    self.word2index = {pad_token: pad_id, unk_token: unk_id}
    self.index2word = {pad_id: pad_token, unk_id: unk_token}
    self.min_count = min_count
    if tokenize_method.lower() == "char":
        self.tokenize_method = self.char_tokenize
    elif tokenize_method.lower() == "word":
        jieba.setLogLevel(20)  # 20 == logging.INFO: hide jieba's DEBUG build messages
        self.tokenize_method = self.jieba_tokenize
        if user_dict is not None:
            jieba.load_userdict(user_dict)
    else:
        raise TypeError(f"bad tokenize method: {tokenize_method}")
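# Hedged usage sketch: the enclosing class is not shown in this snippet, so
# "Vocabulary" below is an assumed placeholder name, and char_tokenize /
# jieba_tokenize are assumed to be methods defined elsewhere on the same class.
vocab = Vocabulary(max_vocab=30000, tokenize_method="char", min_count=2)
assert vocab.word2index[vocab.pad_token] == vocab.pad_id  # <pad> -> 0
assert vocab.word2index[vocab.unk_token] == vocab.unk_id  # <unk> -> 1
tokens = vocab.tokenize_method("今天天气不错")  # char mode: assumed one token per character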
def gen_tokenize_method(self, split_type, user_dict=None, bert_vocab=None):
    lower_split_type = split_type.lower()
    if lower_split_type == "char":
        return self._char_split
    if lower_split_type == "word":
        jieba.setLogLevel(20)  # silence jieba's loading messages
        if user_dict is not None:
            jieba.load_userdict(user_dict)
        return self._word_split
    if lower_split_type == "word_piece":
        # fall back to the locally cached BERT vocab when none is given
        bert_vocab = bert_vocab or self.local_bert
        tokenizer = BertTokenizer.from_pretrained(bert_vocab)
        return partial(self._piece_split, tokenizer)
    raise TypeError(f"bad tokenize type: {split_type}")
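# Hedged usage sketch: the owning class is not shown, so "processor" is an
# assumed instance and "user_dict.txt" an assumed path; self.local_bert is the
# default BERT vocab location taken from the snippet. Each branch returns a
# plain callable, e.g. suitable for torchtext's Field(tokenize=...).
char_split = processor.gen_tokenize_method("char")
word_split = processor.gen_tokenize_method("word", user_dict="user_dict.txt")
piece_split = processor.gen_tokenize_method("word_piece")  # uses self.local_bert
print(word_split("小明住在北京"))  # e.g. ['小明', '住在', '北京']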
from functools import partial

import jieba_fast as jieba
import torch
from torchtext.data import Field, LabelField, TabularDataset, BucketIterator
from torchtext.vocab import Vectors

from config import user_dict
from utils.text_util import pretreatment
from utils.ml_util import init_unk

jieba.setLogLevel(20)  # hide jieba's loading messages
jieba.load_userdict(user_dict)


class BatchWrapper(object):
    """Wraps a torchtext iterator so each batch yields (x, y, lengths)."""

    def __init__(self, batch_iter, x_var, y_vars):
        self.batch_iter = batch_iter
        self.x_var = x_var    # name of the text field (built with include_lengths=True)
        self.y_vars = y_vars  # names of the label fields

    def __iter__(self):
        for batch in self.batch_iter:
            x, lengths = getattr(batch, self.x_var)
            # stack the label columns into a single (batch, n_labels) tensor
            y_tensors = [
                getattr(batch, y_var).unsqueeze(1) for y_var in self.y_vars
            ]
            y = torch.cat(tuple(y_tensors), dim=1)
            yield x, y, lengths

    def __len__(self):
        return len(self.batch_iter)
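# Hedged usage sketch: the field names ("text", "label_a", "label_b") and the
# CSV path are assumptions for illustration. The text field must be built with
# include_lengths=True so getattr(batch, x_var) unpacks into (x, lengths).
TEXT = Field(tokenize=jieba.lcut, include_lengths=True)
LABEL_A = LabelField()
LABEL_B = LabelField()
dataset = TabularDataset("train.csv", format="csv",
                         fields=[("text", TEXT),
                                 ("label_a", LABEL_A),
                                 ("label_b", LABEL_B)])
TEXT.build_vocab(dataset)
LABEL_A.build_vocab(dataset)
LABEL_B.build_vocab(dataset)
batch_iter = BucketIterator(dataset, batch_size=32,
                            sort_key=lambda ex: len(ex.text),
                            sort_within_batch=True)  # keeps lengths sorted per batch
train_iter = BatchWrapper(batch_iter, x_var="text", y_vars=["label_a", "label_b"])
for x, y, lengths in train_iter:
    pass  # y has shape (batch_size, 2): one column per label field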
def tag_one_file2(file: str):
    """Server version: extract tags for one JSON file and write the whole
    tagged JSON file back to the original file.

    Internal requirements (may be hard-coded): the path of the NBA
    proper-noun dictionary ('../NBAdict.txt'), the path of the stop-word
    list ('../stopword.txt'), and the choose_size parameter, which controls
    how many segmenter proposals are accepted in each weighted random draw.

    :param file: path of a JSON file holding an array; each element is one
        news article with the keys title, url, reply, views, comefrom,
        time, text, tags
    :return: writes the tagged JSON file back to disk
    """
    import json
    import random
    import jieba as jb0
    jb0.setLogLevel(logging.INFO)
    import jieba_fast as jb
    jb.setLogLevel(logging.INFO)  # keep jieba from printing debug messages
    try:
        jb.load_userdict('NBAdict.txt')  # domain-specific dictionary
    except Exception:
        logger.exception('fail to open dictionary')
    from jieba_fast.analyse.textrank import TextRank
    from jieba_fast.analyse.tfidf import TFIDF
    textranker = TextRank()
    tfidfer = TFIDF()  # keyword extractors
    try:
        textranker.set_stop_words('stopword.txt')
        tfidfer.set_stop_words('stopword.txt')
    except Exception:
        logger.exception('fail to set stop words')
    try:
        json_file = open(file, 'r', encoding='utf-8')
    except IOError as ioe:
        logger.exception('fail to open ' + file)  # failed to open the file
        raise ioe
    try:
        doc_list = json.load(json_file)
    except Exception as e:  # json.load can raise several error types
        logger.exception('fail to load json file: ' + file)
        json_file.close()
        raise e

    for doc in doc_list:  # process each article
        keys = doc.keys()
        title = doc['title'] if 'title' in keys else ''
        text = doc['text'] if 'text' in keys else ''
        old_tag = doc['tags'] if 'tags' in keys else []  # tags from the crawler
        whole = title + ' '
        for string in text:  # iterable: if empty, the loop simply does nothing
            whole = whole + string  # concatenate the whole article
        split_gen = jb.cut(whole)  # dictionary-based segmentation (a generator)
        split_whole = ''
        for s in split_gen:
            split_whole = split_whole + ' ' + s  # the segmented article

        # Three extractors; each proposes a list of (tag, weight) pairs.
        tag1w = tfidfer.extract_tags(split_whole, topK=10, withWeight=True,
                                     allowPOS=('nr', 'nz'))
        tag2w = tfidfer.extract_tags(split_whole, topK=10, withWeight=True,
                                     allowPOS=('an', 'b', 'j', 'l', 'Ng', 'n',
                                               'nr', 'ns', 'nz', 'nt'))
        tag3w = textranker.textrank(split_whole, topK=10, withWeight=True,
                                    allowPOS=('an', 'b', 'j', 'k', 'l', 'Ng',
                                              'n', 'nr', 'ns', 'nz', 'vn', 's'))

        # Collect the weights into lists for the weighted random draws below.
        wei1 = [tup[1] for tup in tag1w]
        wei2 = [tup[1] for tup in tag2w]
        wei3 = [tup[1] for tup in tag3w]

        final_tagw = list()
        choose_size = 5  # controls how many proposals are kept per random draw
        if len(tag1w) < choose_size:
            final_tagw.extend(tag1w)
        else:
            # draw choose_size keywords, weighted by the extractor's scores
            cho1 = random.choices(tag1w, weights=wei1, k=choose_size)
            final_tagw.extend(cho1)  # merge each extractor's proposals
        if len(tag2w) < choose_size:
            final_tagw.extend(tag2w)
        else:
            cho2 = random.choices(tag2w, weights=wei2, k=choose_size)
            final_tagw.extend(cho2)
        if len(tag3w) < choose_size:
            final_tagw.extend(tag3w)
        else:
            cho3 = random.choices(tag3w, weights=wei3, k=choose_size)
            final_tagw.extend(cho3)

        # Sort all proposed keywords by weight (duplicates are possible).
        final_tagw.sort(key=lambda x: float(x[1]), reverse=True)
        tag = [tup[0] for tup in final_tagw]
        wei = [tup[1] for tup in final_tagw]
        if len(tag) < 2 * choose_size:
            final_tag = tag
        else:
            # one more weighted draw, keeping 2 * choose_size tags
            final_tag = random.choices(tag, weights=wei, k=2 * choose_size)
        final_tag = set(final_tag + list(old_tag))  # merge the old tags and dedupe
        doc['tags'] = list(final_tag)  # overwrite the original tags

    json_file.close()
    with open(file, 'wt', encoding='utf-8') as fo:
        json.dump(doc_list, fo, ensure_ascii=False)
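# Hedged usage sketch: 'news_2019.json' is a made-up path. The file must hold
# a JSON array of article objects; tag_one_file2 rewrites it in place with an
# updated "tags" list per article, so keep a backup if the original matters.
tag_one_file2('news_2019.json')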
help="don't use the Hidden Markov Model") parser.add_argument("-q", "--quiet", action="store_true", default=False, help="don't print loading messages to stderr") parser.add_argument("-V", '--version', action='version', version="Jieba " + jieba.__version__) parser.add_argument("filename", nargs='?', help="input file") args = parser.parse_args() if args.quiet: jieba.setLogLevel(60) if args.pos: import jieba_fast.posseg posdelim = args.pos def cutfunc(sentence, _, HMM=True): for w, f in jieba_fast.posseg.cut(sentence, HMM): yield w + posdelim + f else: cutfunc = jieba.cut delim = text_type(args.delimiter) cutall = args.cutall hmm = args.hmm fp = open(args.filename, 'r') if args.filename else sys.stdin