Example #1
    def __init__(self,
                 max_vocab=None,
                 pad_token="<pad>",
                 unk_token="<unk>",
                 pad_id=0,
                 unk_id=1,
                 tokenize_method="char",
                 user_dict=None,
                 min_count=None):
        self.max_vocab = max_vocab
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.pad_id = pad_id
        self.unk_id = unk_id
        self.word2index = {pad_token: pad_id, unk_token: unk_id}
        self.index2word = {pad_id: pad_token, unk_id: unk_token}
        self.min_count = min_count

        if tokenize_method.lower() == "char":
            self.tokenize_method = self.char_tokenize
        elif tokenize_method.lower() == "word":
            jieba.setLogLevel(20)
            self.tokenize_method = self.jieba_tokenize
            if user_dict is not None:
                jieba.load_userdict(user_dict)
        else:
            raise TypeError(f"bad tokenize method: {tokenize_method}")
Example #2
    def gen_tokenize_method(self, split_type, user_dict=None, bert_vocab=None):
        lower_split_type = split_type.lower()

        if lower_split_type == "char":
            return self._char_split

        if lower_split_type == "word":
            jieba.setLogLevel(20)
            if user_dict is not None:
                jieba.load_userdict(user_dict)
            return self._word_split

        if lower_split_type == "word_piece":
            bert_vocab = bert_vocab or self.local_bert
            tokenizer = BertTokenizer.from_pretrained(bert_vocab)
            return partial(self._piece_split, tokenizer)

        raise TypeError(f"error tokenize type: {split_type}")
Example #3
from torchtext.data import Field, LabelField, TabularDataset, BucketIterator
from torchtext.vocab import Vectors
from config import user_dict
from utils.text_util import pretreatment
from utils.ml_util import init_unk
from functools import partial
import jieba_fast as jieba
import torch

jieba.setLogLevel(20)
jieba.load_userdict(user_dict)


class BatchWrapper(object):
    def __init__(self, batch_iter, x_var, y_vars):
        self.batch_iter = batch_iter
        self.x_var = x_var
        self.y_vars = y_vars

    def __iter__(self):
        for batch in self.batch_iter:
            x, lengths = getattr(batch, self.x_var)

            y_tensors = [
                getattr(batch, y_var).unsqueeze(1) for y_var in self.y_vars
            ]
            y = torch.cat(tuple(y_tensors), dim=1)

            yield x, y, lengths

    def __len__(self):
        # delegate the length to the wrapped iterator
        return len(self.batch_iter)
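A minimal usage sketch, assuming a BucketIterator built with include_lengths=True on the text field (the field names and the iterator variable are hypothetical):

train_loader = BatchWrapper(train_iter, x_var="text", y_vars=["label_a", "label_b"])
for x, y, lengths in train_loader:
    # x: padded token ids, lengths: true sequence lengths, y: stacked label columns
    pass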
Example #4
def tag_one_file2(file: str):
    '''
    Server version: extract tags for one json file and write the whole tagged json back to the original file.
    Internal requirements (can be hard-coded): the path to the NBA-specific dictionary ('../NBAdict.txt'),
    the path to the stop-word list ('../stopword.txt'), and the choose_size parameter, which controls how
    many of the segmenters' proposed keywords are sampled at random.
    :param file: path to a json file holding an array, one news article per element, with the keys
                 title, url, reply, views, comefrom, time, text, tags
    :return: writes the json file with the added tags back to disk
    '''
    import json
    import logging
    import jieba as jb0
    jb0.setLogLevel(logging.INFO)  # also silence the plain jieba package
    import jieba_fast as jb
    jb.setLogLevel(logging.INFO)  # keep jieba from printing debug messages

    try:
        jb.load_userdict('NBAdict.txt')  # NBA-specific dictionary
    except Exception:
        logger.exception('fail to open dictionary')

    from jieba_fast.analyse.textrank import TextRank
    from jieba_fast.analyse.tfidf import TFIDF

    textranker = TextRank()
    tfidfer = TFIDF()  # keyword extractors
    try:
        textranker.set_stop_words('stopword.txt')
        tfidfer.set_stop_words('stopword.txt')  # stop words
    except Exception:
        logger.exception('fail to set stop words')

    try:
        json_file = open(file, 'r', encoding='utf-8')
    except IOError as ioe:
        logger.exception('fail to open ' + file)  # could not open the file
        raise ioe
    try:
        doc_list = json.load(json_file)
    except Exception as e:  # the exact error type is not known in advance
        logger.exception('fail to load json file:' + file)  # could not parse the json
        json_file.close()
        raise e

    for doc in doc_list:  # for each news article
        keys = doc.keys()
        title = doc['title'] if 'title' in keys else ''
        text = doc['text'] if 'text' in keys else ''
        old_tag = doc['tags'] if 'tags' in keys else []  # originally scraped tags (a list, so it can be merged below)
        whole = title + ' '
        for string in text:  # iterating is safe even when text is empty
            whole = whole + string  # stitch the whole article together

        split_gen = jb.cut(whole)  # dictionary-based segmentation, returns a generator
        split_whole = ''
        for s in split_gen:
            split_whole = split_whole + ' ' + s  # article text after segmentation
        # three extractors; each proposes a list of (tag, weight) tuples
        tag1w = tfidfer.extract_tags(split_whole,
                                     topK=10,
                                     withWeight=True,
                                     allowPOS=('nr', 'nz'))
        tag2w = tfidfer.extract_tags(split_whole,
                                     topK=10,
                                     withWeight=True,
                                     allowPOS=('an', 'b', 'j', 'l', 'Ng', 'n',
                                               'nr', 'ns', 'nz', 'nt'))
        tag3w = textranker.textrank(split_whole,
                                    topK=10,
                                    withWeight=True,
                                    allowPOS=('an', 'b', 'j', 'k', 'l', 'Ng',
                                              'n', 'nr', 'ns', 'nz', 'vn',
                                              's'))
        # collect the weights into lists for the weighted random choices below
        # tag1 = []
        wei1 = []
        for tup in tag1w:
            # tag1.append(tup[0])
            wei1.append(tup[1])
        # tag2 = []
        wei2 = []
        for tup in tag2w:
            # tag2.append(tup[0])
            wei2.append(tup[1])
        # tag3 = []
        wei3 = []
        for tup in tag3w:
            # tag3.append(tup[0])
            wei3.append(tup[1])

        final_tagw = list()
        import random
        choose_size = 5  # controls how many tags are sampled from each extractor
        if len(tag1w) < choose_size:
            final_tagw.extend(tag1w)
        else:
            # weighted random pick of choose_size keywords, using the extractor's own weights
            cho1 = random.choices(tag1w, weights=wei1, k=choose_size)
            final_tagw.extend(cho1)  # merge each extractor's proposals

        if len(tag2w) < choose_size:
            final_tagw.extend(tag2w)
        else:
            cho2 = random.choices(tag2w, weights=wei2, k=choose_size)
            final_tagw.extend(cho2)

        if len(tag3w) < choose_size:
            final_tagw.extend(tag3w)
        else:
            cho3 = random.choices(tag3w, weights=wei3, k=choose_size)
            final_tagw.extend(cho3)

        final_tagw.sort(key=lambda x: float(x[1]),
                        reverse=True)  # sort all proposed keywords by weight (duplicates possible)
        tag = []
        wei = []
        for tup in final_tagw:
            tag.append(tup[0])
            wei.append(tup[1])

        final_tag = list()
        if len(tag) < 2 * choose_size:
            final_tag = tag
        else:
            choose = random.choices(tag, weights=wei, k=2 * choose_size)
            final_tag = choose  # a second weighted draw, of size 2*choose_size
        final_tag = set(final_tag + old_tag)  # merge with the original tags and deduplicate

        doc['tags'] = list(final_tag)  # overwrite the original tags
    json_file.close()

    with open(file, 'wt', encoding='utf-8') as fo:
        json.dump(doc_list, fo, ensure_ascii=False)
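A minimal invocation, assuming NBAdict.txt and stopword.txt sit in the working directory; the json file name is hypothetical and the function rewrites it in place:

tag_one_file2('nba_news.json')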
Example #5
                    help="don't use the Hidden Markov Model")
parser.add_argument("-q",
                    "--quiet",
                    action="store_true",
                    default=False,
                    help="don't print loading messages to stderr")
parser.add_argument("-V",
                    '--version',
                    action='version',
                    version="Jieba " + jieba.__version__)
parser.add_argument("filename", nargs='?', help="input file")

args = parser.parse_args()

if args.quiet:
    jieba.setLogLevel(60)
if args.pos:
    import jieba_fast.posseg
    posdelim = args.pos

    def cutfunc(sentence, _, HMM=True):
        for w, f in jieba_fast.posseg.cut(sentence, HMM):
            yield w + posdelim + f
else:
    cutfunc = jieba.cut

delim = text_type(args.delimiter)
cutall = args.cutall
hmm = args.hmm
fp = open(args.filename, 'r') if args.filename else sys.stdin
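The excerpt stops before the tool's main loop; a minimal sketch of how the pieces above are typically wired together (an assumption, not the original source):

for line in fp:
    # segment each input line and join the tokens with the chosen delimiter
    print(delim.join(cutfunc(line.rstrip('\r\n'), cutall, hmm)))
if fp is not sys.stdin:
    fp.close()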