Example No. 1
def seg_with_ltp40(in_file, out_file_path, manual_seg_file):
    # initialize the model
    ltp = LTP()
    line_list = []

    # run segmentation and save the results
    corpus = construct_corpus(in_file)
    f = open(out_file_path, "w", encoding='utf-8')
    for line in corpus:
        line_list.append(line)  # wrap each sentence in a single-element list ["Xxxx"]
        seg_result, hidden = ltp.seg(line_list)
        f.write("=".join(seg_result[0]) + "\n")
        line_list.clear()
        f.flush()
    f.close()

    # test qps
    corpus = construct_corpus(in_file, 1)
    start = time.time()
    for line in corpus:
        segment, hidden = ltp.seg(list(line))
    end = time.time()
    qps = round(len(corpus) / (end - start), 2)

    # test accuracy
    p, r, f1, line_aver_length = evaluate(out_file_path, manual_seg_file)
    return qps, p, r, f1, line_aver_length
Example No. 2
    def get_pos_tag(self, sentence):
        r"""
        pos tag function.

        :param str sentence: the sentence need to be ner
        :return: the triple form (tags,start,end)
        """

        assert isinstance(sentence, (list, str))
        from ltp import LTP
        if isinstance(sentence, list):
            # Join the word list into a single sentence string
            sentence = ''.join(sentence)

        if not sentence:
            return []

        if self.__pos is None:
            # lazily load the LTP model on first use
            self.__pos = LTP()
        seg, hidden = self.__pos.seg([sentence])
        pos = self.__pos.pos(hidden)
        seg = seg[0]
        pos = pos[0]
        pos_tag = []
        cnt = 0
        for tag in range(len(pos)):
            pos_tag.append([pos[tag], cnt, cnt + len(seg[tag]) - 1])
            cnt += len(seg[tag])

        return pos_tag
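The (tag, start, end) triples returned above pair each POS tag with the character span of its word. A minimal, self-contained illustration of that span arithmetic, using a hand-written segmentation and tag list rather than live LTP output (the words and tags below are illustrative assumptions):

seg = ['周小明', '去', '玩']   # hand-written segmentation (not model output)
pos = ['nh', 'v', 'v']         # hand-written POS tags

pos_tag, cnt = [], 0
for word, tag in zip(seg, pos):
    # each triple covers the character span [cnt, cnt + len(word) - 1]
    pos_tag.append([tag, cnt, cnt + len(word) - 1])
    cnt += len(word)

print(pos_tag)  # [['nh', 0, 2], ['v', 3, 3], ['v', 4, 4]]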
Example No. 3
def mongo2ner(idx, ltp, offset, size):
    """
    根据offset从mongo中取指定size的文章
    :param idx:
    :param offset:
    :param size:
    :return:
    """
    entities = []
    pid = os.getpid()
    try:
        # debug_logger.debug("{} ---pid:{} MongoDB: Skip: {}, size: {}".format(idx, pid, offset, size))
        ltp = LTP(path=LTP4_MODEL_DIR)
        db_connect = MongoClient(host=MONGODB_HOST, port=MONGODB_PORT)
        db = db_connect[MONGODB_DATABASE_NAME]
        coll = db[MONGODB_ENTMT_COLLECTION]
        # debug_logger.debug("pid: {}, connected".format(pid))
        for art in coll.find(skip=offset, limit=size):
            debug_logger.debug(art['title'])
            text = art['title'] + art['content']
            entities_of_art = get_article_entities(idx, text, ltp)
            entities += entities_of_art

        # debug_logger.debug("pid: {}, write".format(pid))
        with open(os.path.join(USER_DICT_DIR, 'ners_' + str(idx) + '.txt'), 'w', encoding='utf-8') as fw:
            for item in entities:
                for word, label in item:
                    fw.write(word + '\t' + label + '\n')
    except Exception as e:
        print("ERROR mongo2ner: {}".format(e))
        # debug_logger.debug("ERROR mongo2ner: {}".format(e))
    return entities
Example No. 4
 def __init__(self,
              path: str = 'small',
              batch_size: int = 50,
              device: str = None,
              onnx: bool = False):
     self.ltp = LTP(path=path, device=device)
     # split a list into consecutive chunks of at most batch_size items
     self.split = lambda a: map(lambda b: a[b:b + batch_size],
                                range(0, len(a), batch_size))
Example No. 5
    def __init__(self, seq_len=512):
        """
        Constructs the Huggingface Chinese BERT tokenizer and the LTP tokenizer.
            seq_len: maximum sequence length
        """

        self.tokenizer_cn = AutoTokenizer.from_pretrained("bert-base-chinese")
        self.tokenizer_ltp = LTP("small")
        self.max_seq_length = seq_len
Example No. 6
def ltp_func(text_list):
    ltp = LTP()
    seg, hidden = ltp.seg(text_list)
    pos = ltp.pos(hidden)
    result = []
    # pair each word of the first sentence with its POS tag, e.g. "词/n"
    for word, tag in zip(seg[0], pos[0]):
        result.append('/'.join([word, tag]))
    return result
Example No. 7
 def dependency(self):
     sentence = self.sentence
     sentences = [sentence]
     ltp = LTP()
     seg, hidden = ltp.seg(sentences)
     dep = ltp.dep(hidden)
     print(seg)
     print(dep)
Example No. 8
 def __init__(self,
              path: str = 'small',
              batch_size: int = 10,
              device: str = None,
              onnx: str = None,
              vocab: str = None):
     self.ltp = LTP(path=path,
                    batch_size=batch_size,
                    device=device,
                    vocab=vocab)
Example No. 9
 def __init__(self,
              default_model_dir=LTP4_MODEL_DIR,
              user_dict_dir=USER_DICT_DIR):
     self.ltp = LTP(path=default_model_dir)
     # load every user dictionary file under user_dict_dir
     for file in os.listdir(user_dict_dir):
         self.ltp.init_dict(path=os.path.join(user_dict_dir, file))
     self.sentences = []
     self.postags = []
     self.nertags = []
     self.dep = []
Example No. 10
def work_summary_parser_ltp():
    f = csvReader("标准工作任务单")
    ltp = LTP()
    paList = []
    for i, row in enumerate(f):
        if i != 0:  # skip the header row
            val = row[1][5:].split(',')
            paList.append(val[2])
    wa, ha = ltp.seg(paList)
    pa = ltp.pos(ha)
    return wa, pa
Example No. 11
 def __init__(self,
              path: str = 'small',
              batch_size: int = 50,
              device: str = None,
              onnx: bool = False):
     if onnx:
         self.ltp = FastLTP(path=path, device=device, need_config=True)
     else:
         self.ltp = LTP(path=path, device=device, need_config=True)
     self._split = lambda a: map(lambda b: a[b:b + batch_size],
                                 range(0, len(a), batch_size))
Example No. 12
 def __init__(self, model_type='base', user_dict_dir=RESOURCE_DIR):
     self.default_user_dict_dir = user_dict_dir
     # load the LTP model
     self.ltp = LTP(model_type)
     # add the user dictionaries (a legal-document dictionary and the Tsinghua legal lexicon);
     # loading them into memory this way is faster
     files = os.listdir(user_dict_dir)
     for file in files:
         file_path = os.path.join(user_dict_dir, file)
         # skip directories
         if os.path.isdir(file_path):
             continue
         self.ltp.init_dict(file_path)
Example No. 13
 def test_nlp_model(self):
     ltp1 = LTP(LTP4_MODEL_DIR)
     ltp2 = LTP(LTP4_MODEL_DIR)
     ltp3 = LTP(LTP4_MODEL_DIR)
     ltp4 = LTP(LTP4_MODEL_DIR)
     ltp5 = LTP(LTP4_MODEL_DIR)
     ltp6 = LTP(LTP4_MODEL_DIR)
     ltp7 = LTP(LTP4_MODEL_DIR)
     print('-------')
     import time
     time.sleep(10)
Example No. 14
def main(args):
    # For Chinese (Ro)BERT, the best results come from RoBERTa-wwm-ext (https://github.com/ymcui/Chinese-BERT-wwm)
    # To fine-tune these models we have to use the same tokenizer: LTP (https://github.com/HIT-SCIR/ltp)
    with open(args.file_name, "r", encoding="utf-8") as f:
        data = f.readlines()

    ltp_tokenizer = LTP(args.ltp)  # faster on a GPU
    bert_tokenizer = BertTokenizer.from_pretrained(args.bert)

    ref_ids = prepare_ref(data, ltp_tokenizer, bert_tokenizer)

    with open(args.save_path, "w", encoding="utf-8") as f:
        data = [json.dumps(ref) + "\n" for ref in ref_ids]
        f.writelines(data)
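For context, here is a rough sketch of the word/sub-token alignment that prepare_ref is used for here (this is an assumption about its purpose, not the actual implementation): mark every BERT sub-token that is a continuation of an LTP word, so that a whole-word-masking collator can later mask complete Chinese words.

from ltp import LTP
from transformers import BertTokenizer

ltp = LTP('small')                        # assumed model size for the sketch
bert = BertTokenizer.from_pretrained('bert-base-chinese')

line = '我喜欢自然语言处理'
words = ltp.seg([line])[0][0]             # LTP word list for the line
ref, idx = [], 1                          # position 0 is [CLS]
for word in words:
    for k, _ in enumerate(bert.tokenize(word)):
        if k > 0:                         # continuation piece of an LTP word
            ref.append(idx)
        idx += 1
print(ref)                                # indices to treat as word-internal pieces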
Example No. 15
def create():
    """create profession keywords json file.
    """
    ltp = LTP()  # loads the Small model by default
    # load the professions file
    with open('./dataset/profession.json', 'r', encoding='utf-8') as jsonfile:
        profession_json = json.load(jsonfile)

    for i, profession in enumerate(profession_json['data']):
        profession_json['data'][i]['kwords'] = find_kwords_by_ltp(
            profession['name'], ltp)

    with open('./dataset/profession2.json', 'w', encoding='utf-8') as jsonfile:
        json.dump(profession_json, jsonfile, ensure_ascii=False)
Example No. 16
 def findFood(self, sentence):
     ltp = LTP()
     words, hidden = ltp.seg([sentence])
     posTags = ltp.pos(hidden)
     words = words[0]  # word segmentation result (list)
     posTags = posTags[0]  # POS tagging result (list)
     dep = ltp.dep(hidden)[0]  # dependency parsing result (list)
     relyId = [d[1] for d in dep]  # head (parent) word ids
     relation = [d[2] for d in dep]  # dependency relations
     heads = ['Root' if id == 0 else words[id - 1] for id in relyId]  # head words
     string = ''
     for i in range(len(words)):
         if 'n' in posTags[i] and heads[i] == '吃' and relation[i] == 'VOB':
             string += words[i]
             string += ' '
     return string
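The dep triples unpacked above have the form (word_index, head_index, relation), where a head index of 0 denotes the virtual Root node. A small hand-written illustration of how the triples resolve to head words (the values are illustrative, not live LTP output):

words = ['我', '想', '吃', '苹果']
dep = [(1, 2, 'SBV'), (2, 0, 'HED'), (3, 2, 'VOB'), (4, 3, 'VOB')]

for idx, head, rel in dep:
    head_word = 'Root' if head == 0 else words[head - 1]
    print(words[idx - 1], '--' + rel + '-->', head_word)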
Example No. 17
    def get_ner(self, sentence):
        r"""
        NER function.

        :param str sent: the sentence need to be ner
        :return two forms of tags
            The first is the triple form (tags,start,end)
            The second is the list form, which marks the ner label of each word
            such as 周小明去玩
            ['Nh', 'Nh', 'Nh', 'O', 'O']
        """
        assert isinstance(sentence, (list, str))
        from ltp import LTP
        if isinstance(sentence, list):
            # Join the word list into a single sentence string
            sentence = ''.join(sentence)

        if not sentence:
            return [], []

        if self.__ner is None:
            # lazily load the LTP model on first use
            self.__ner = LTP()
        seg, hidden = self.__ner.seg([sentence])
        seg = seg[0]
        ner = self.__ner.ner(hidden)
        ner = ner[0]

        ner_label = len(sentence) * ['O']
        # convert the word-level spans returned by LTP into character-level spans
        for i in range(len(ner)):
            tag, start, end = ner[i]
            tmp = 0
            for j in range(start):
                tmp += len(seg[j])
            start = tmp
            tmp = 0
            for j in range(end + 1):
                tmp += len(seg[j])
            end = tmp
            ner[i] = (tag, start, end - 1)
            for j in range(start, end):
                ner_label[j] = tag

        return ner, ner_label
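A standalone illustration of the word-span to character-span conversion performed above, with a hand-written segmentation and NER result matching the docstring example (illustrative data, not model output):

sentence = '周小明去玩'
seg = ['周小明', '去', '玩']   # hand-written segmentation
ner = [('Nh', 0, 0)]           # word-level span: word 0 is a person name

ner_label = ['O'] * len(sentence)
for i, (tag, start, end) in enumerate(ner):
    char_start = sum(len(seg[j]) for j in range(start))
    char_end = sum(len(seg[j]) for j in range(end + 1))
    ner[i] = (tag, char_start, char_end - 1)
    ner_label[char_start:char_end] = [tag] * (char_end - char_start)

print(ner)        # [('Nh', 0, 2)]
print(ner_label)  # ['Nh', 'Nh', 'Nh', 'O', 'O']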
Example No. 18
def WriteTest(readfile, savefile):
    with open(readfile, "r", encoding="utf-8") as rfp:
        ltp = LTP()
        logger.info("Processing file:%s ." % (readfile))
        with open(savefile, 'w', encoding='utf-8') as wfp:

            for row in tqdm(rfp, desc="file %s process" % (readfile)):
                sent1, sent2 = row.split('\t')
                seg, hid = ltp.seg([sent1, sent2])
                sdp = ltp.sdp(hid, mode='tree')
                pos = ltp.pos(hid)
                tmpitem = {
                    'sentence1': [seg[0], pos[0], sdp[0]],
                    'sentence2': [seg[1], pos[1], sdp[1]]
                }
                jsonline = json.dumps(tmpitem)
                wfp.write(jsonline + "\n")
Example No. 19
def findFood(sentence):
    ltp = LTP()
    words, hidden = ltp.seg([sentence])
    posTags = ltp.pos(hidden)
    words = words[0]  # word segmentation result (list)
    print(words)
    posTags = posTags[0]  # POS tagging result (list)
    print(posTags)
    dep = ltp.dep(hidden)[0]  # dependency parsing result (list)
    for t in dep:
        print(t)
    relyId = [d[1] for d in dep]  # head (parent) word ids
    relation = [d[2] for d in dep]  # dependency relations
    heads = ['Root' if id == 0 else words[id - 1] for id in relyId]  # head words
    for i in range(len(words)):
        if 'n' in posTags[i] and heads[i] == '吃' and relation[i] == 'VOB':
            print("找到了一种食物:" + words[i])
Example No. 20
    def is_word(sentence):
        r"""
        Judge whether the input is a word.

        :param str sentence: input sentence string
        :return bool: is a word or not
        """
        from ltp import LTP
        if sentence[0] == sentence[1]:
            return True
        ltp = LTP()
        seg, hidden = ltp.seg([sentence])
        pos = ltp.pos(hidden)
        pos = pos[0]
        if len(pos) == 1 and pos[0] == 'n':
            return False
        return True
Example No. 21
def main(args):
    # For Chinese (Ro)BERT, the best results come from RoBERTa-wwm-ext (https://github.com/ymcui/Chinese-BERT-wwm)
    # To fine-tune these models, we must use the same tokenizer: LTP (https://github.com/HIT-SCIR/ltp)
    with open(args.file_name, "r", encoding="utf-8", errors='ignore') as f:
        data = f.readlines()
    print(f'开始处理数据,共有{len(data)}条')
    data = [
        line.strip() for line in data if len(line) > 0 and not line.isspace()
    ]  # avoid delimiter like '\u2029'
    print(f"开始加载ltp和bert的tokenizer模型")
    ltp_tokenizer = LTP(path=args.ltp)  # faster on a GPU
    bert_tokenizer = BertTokenizer.from_pretrained(args.bert)
    # prepare the mappings
    ref_ids = prepare_ref(data, ltp_tokenizer, bert_tokenizer)
    # save the mappings
    with open(args.save_path, "w", encoding="utf-8") as f:
        data = [json.dumps(ref) + "\n" for ref in ref_ids]
        f.writelines(data)
    print(f"保存所有{len(data)}条数据的映射关系到文件{args.save_path}")
Example No. 22
def thread_main(args, gpu=True):
    """
    多线程处理
    Args:
        args:
        gpu: 是否使用gpu
    Returns:

    """
    from functools import partial
    from multiprocessing import Pool
    from tqdm import tqdm
    # For Chinese (Ro)BERT, the best results come from RoBERTa-wwm-ext (https://github.com/ymcui/Chinese-BERT-wwm)
    # To fine-tune these models, we must use the same tokenizer: LTP (https://github.com/HIT-SCIR/ltp)
    with open(args.file_name, "r", encoding="utf-8") as f:
        data = f.readlines()
    print(f'开始处理数据,共有{len(data)}条')
    data = [
        line.strip() for line in data if len(line) > 0 and not line.isspace()
    ]  # avoid delimiter like '\u2029'
    print(f"开始加载ltp和bert的tokenizer模型")
    ltp_tokenizer = LTP(path=args.ltp)  # faster on a GPU
    bert_tokenizer = BertTokenizer.from_pretrained(args.bert)
    newdata = [data[i:i + 1000] for i in range(0, len(data), 1000)]
    # prepare the mappings with a pool of parallel workers
    # when using the GPU, the start method must be set as follows
    if gpu:
        import torch
        torch.multiprocessing.set_start_method('spawn')
    with Pool(processes=args.processes) as p:
        # partial_clean binds the two tokenizers to prepare_ref
        partial_clean = partial(prepare_ref,
                                ltp_tokenizer=ltp_tokenizer,
                                bert_tokenizer=bert_tokenizer)
        # chunksize=8: dispatch the data to the workers in chunks of 8
        ref_ids_nest = list(
            tqdm(p.imap(partial_clean, newdata, chunksize=8), desc="开始处理数据"))
    ref_ids = [ref for nest in ref_ids_nest for ref in nest]
    # save the mappings
    with open(args.save_path, "w", encoding="utf-8") as f:
        data = [json.dumps(ref) + "\n" for ref in ref_ids]
        f.writelines(data)
    print(f"保存所有{len(data)}条数据的映射关系到文件{args.save_path}")
Example No. 23
def load_word_segmentation_tool():
    """
    加载分词工具
    :return: HanLP: hanlp, ltp: LTP
    """
    logger.info("loading word segmentation tool")
    # HanLP = HanLPClient(url='https://www.hanlp.com/api', auth='MTE4QGJicy5oYW5scC5jb206MXFFOHhWUkJNQXBNdlh0NA==')
    HanLP = hanlp.load(hanlp.pretrained.mtl.
                       CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH,
                       verbose=True)
    tasks = list(HanLP.tasks.keys())
    for task in tasks:
        if task not in TASK:
            del HanLP[task]
    tok = HanLP[TASK[0]]
    tok.dict_combine = {'新冠', '新冠病毒', '新冠肺炎'}
    ltp = LTP()
    logger.info("loaded word segmentation tool")
    return HanLP, ltp
Example No. 24
def work_detail_parser_ltp():
    f = csvReader("标准工作任务单")
    ltp = LTP()
    paList = []
    pbList = []
    for i, row in enumerate(f):
        if i != 0:  # skip the header row
            val = row[1][5:].split(',')
            paList.append(val[2])
            temp = val[3:]
            for v in temp:
                pbList.append(v)
    # print(paList)
    # print(pbList)
    sa, ha = ltp.seg(paList)
    sb, hb = ltp.seg(pbList)
    pa = ltp.pos(ha)
    pb = ltp.pos(hb)

    return sa, sb, pa, pb
Example No. 25
    def load_ltp_weights(weights_type):
        '''
        Load the LTP weight files and instantiate an LTP model.
        :param weights_type: model type to load; must be one of base, small or tiny
        :return: an LTP model with the weights loaded
        '''
        # validate the model type
        assert weights_type in ['base', 'small',
                                'tiny'], 'the LTP model type must be base, small or tiny'

        # resolve the weights path
        if LtpModelPath is None:
            file_path = os.path.abspath(
                os.path.join(os.path.dirname('.'), 'weights', weights_type))
        else:
            file_path = os.path.abspath(
                os.path.join(LtpModelPath, weights_type))

        # load the weights
        ltp = LTP(path=file_path)

        return ltp
Example No. 26
def new_generate_ltp_results():
    # load the model
    ltp_model = '../../ltp_models/base1'
    ltp = LTP(path=ltp_model)

    # read the original sentences
    data = read_file_in_ltp('../data/train_base.json')
    sentences = list(map(lambda x: x['content'], data))

    segmented, pos, ner, srl, dep, sdp_tree, sdp_graph = [], [], [], [], [], [], []
    for sent in tqdm(sentences):
        # word segmentation
        segmented0, hidden = ltp.seg([sent])
        # POS tagging
        cur_pos = ltp.pos(hidden)
        # named entity recognition
        cur_ner = ltp.ner(hidden)
        # semantic role labeling
        cur_srl = ltp.srl(hidden)
        # dependency parsing
        cur_dep = ltp.dep(hidden)
        # semantic dependency parsing (tree)
        cur_sdp_tree = ltp.sdp(hidden, mode='tree')
        # semantic dependency parsing (graph)
        cur_sdp_graph = ltp.sdp(hidden, mode='graph')

        segmented.append(segmented0[0])
        pos.append(cur_pos[0])
        ner.append(cur_ner[0])
        srl.append(cur_srl[0])
        dep.append(cur_dep[0])
        sdp_tree.append(cur_sdp_tree[0])
        sdp_graph.append(cur_sdp_graph[0])

    # build the mapping between sentences and their segmentations
    sent_seg_matches = sentence_segment_match(data, segmented)
    pickle.dump([segmented, pos, ner, srl, dep, sdp_tree, sdp_graph, sent_seg_matches], open('new_ltp_results.pk', 'wb'))

    return segmented, pos, ner, srl, dep, sdp_tree, sdp_graph, sent_seg_matches
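A possible way to reload the pickled results later (a usage sketch; the file name and tuple order match the pickle.dump call above):

import pickle

with open('new_ltp_results.pk', 'rb') as f:
    (segmented, pos, ner, srl, dep,
     sdp_tree, sdp_graph, sent_seg_matches) = pickle.load(f)
print(len(segmented), 'sentences processed')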
Example No. 27
# [[('every', 5)], [('自然数', 'x'), 'and', ('奇数', 'x')]]
from ltp import LTP
ltp = LTP()


class NlpCtr(object):
    def __init__(self):
        self.seg = None
        self.words = None
        self.dep = None

    def trans_result(self, depArr, posArr):
        tempposArr = posArr[0]
        tempdepArr = depArr[0]

        tempArr = []
        for item in tempdepArr:
            dic = {
                'dep': item[0],
                'gov': item[1],
                'type': item[2],
                # 'pos': tempposArr[item[0] - 1]
            }
            tempArr.append(dic)
        return tempArr

    def getHED(self, words):
        # find the HED (head) word of the sentence: the node whose governor is Root (gov == 0)
        root = None
        for word in words:
            if word['gov'] == 0 and word['type'] == 'HED':
                root = word['dep']
        return root
Example No. 28
def save_as_txt(data):
    from ltp import LTP
    import random
    ltp = LTP()
    for row in data:
        id = row[0]
        school_id = ("000" + str(row[1]))[-4:]
        texts = row[2]
        textlines = texts.split('\n')
        shortened_textlines = []
        for line in textlines:
            line_len = len(line)
            if line_len > 100:
                for i in range(line_len // 100):
                    shortened_textlines.append(line[i * 100:(i + 1) * 100])
            else:
                shortened_textlines.append(line)
        text = ' '.join(shortened_textlines)
        path = './data/' + str(school_id)
        os.makedirs(path, exist_ok=True)
        with open((path + '/' + str(school_id) + "-" + str(id) + ".txt"),
                  'w',
                  encoding='UTF-8') as file:
            file.write(text)
            file.close()
            print("\r已保存 " + str(school_id) + "-" + str(id) + ".txt", end="")
            # T2	报告人 68 71	曹进德
            # R2 报告人_单位 Arg1: T2 Arg2: T1
        seg, hidden = ltp.seg([text])
        ner = ltp.ner(hidden)
        ner_info = []
        entities_nh = []
        entities_ni = []
        print(type(text))
        print()
        for i in ner[0]:
            if (i[0] == 'Nh'):
                start = i[1]
                end = i[2]
                entity = "".join(seg[0][start:end + 1])
                if (len(entity) > 1):
                    entities_nh.append(entity)

            elif (i[0] == 'Ni'):
                start = i[1]
                end = i[2]
                entity = "".join(seg[0][start:end + 1])
                if entity in schoolnames:
                    entities_ni.append(entity)

        for entity in set(entities_nh):
            pattern = re.compile(entity)
            iter = pattern.finditer(text)
            count = 0
            for record in iter:
                ner_info.append("T" + str(300 + count) + "\t姓名 " +
                                str(record.span()[0]) + " " +
                                str(record.span()[1]) + "\t" +
                                str(record.group()) + "\n")
                count += 1

        for entity in set(entities_ni):
            pattern = re.compile(entity)
            iter = pattern.finditer(text)
            count = 0
            for record in iter:
                ner_info.append("T" + str(400 + count) + "\t单位 " +
                                str(record.span()[0]) + " " +
                                str(record.span()[1]) + "\t" +
                                str(record.group()) + "\n")
                count += 1

        pattern = re.compile('教授|副教授|讲师|研究员|副研究员|助理教授|助理研究员')
        iter = pattern.finditer(text)
        count = 0
        for record in iter:
            ner_info.append("T" + str(500 + count) + "\t职称 " +
                            str(record.span()[0]) + " " +
                            str(record.span()[1]) + "\t" +
                            str(record.group()) + "\n")
            count += 1

        date_1 = r"([0-9]+年[0-9]+月[0-9]+日)"  # |([0-9]+月[0-9]+日)
        date_2 = r"([零〇一二三四五六七八九]年[十]?[一二三四五六七八九]月[一二三]?[十]?[一二三四五六七八九十]日)"
        date_3 = r"([0-9]+月[0-9]+日)"
        flag = False
        count = 0
        ## pattern 1
        pattern = re.compile(date_1)
        iter = pattern.finditer(text)
        for record in iter:
            ner_info.append("T" + str(600 + count) + "\t日期 " +
                            str(record.span()[0]) + " " +
                            str(record.span()[1]) + "\t" +
                            str(record.group()) + "\n")
            count += 1
            flag = True

        if (flag is False):
            pattern = re.compile(date_3)
            iter = pattern.finditer(text)
            for record in iter:
                ner_info.append("T" + str(600 + count) + "\t日期 " +
                                str(record.span()[0]) + " " +
                                str(record.span()[1]) + "\t" +
                                str(record.group()) + "\n")
                count += 1

        ## pattern 2
        pattern = re.compile(date_2)
        iter = pattern.finditer(text)
        for record in iter:
            ner_info.append("T" + str(600 + count) + "\t日期 " +
                            str(record.span()[0]) + " " +
                            str(record.span()[1]) + "\t" +
                            str(record.group()) + "\n")
            count += 1

        with open((path + '/' + str(school_id) + "-" + str(id) + ".ann"),
                  'w',
                  encoding='UTF-8') as file:
            print([text])
            print(ner_info)
            file.writelines(ner_info)
            file.close()
            print("\r已保存 " + str(school_id) + "-" + str(id) + ".ann", end="")
Example No. 29
# import synonyms
import json

# sen1 = "程序员"
# sen2 = "软件工程师"
# r = synonyms.compare(sen1, sen2, seg=True)
# print(r)

# ddp = DDParser()
# # 单条句子
# re = ddp.parse("语文老师")
# print(re)

from ltp import LTP
ltp = LTP()  # loads the Small model by default
seg, hidden = ltp.seg(["语文老师"])
pos = ltp.pos(hidden)
ner = ltp.ner(hidden)
srl = ltp.srl(hidden)
dep = ltp.dep(hidden)
sdp = ltp.sdp(hidden)

print(seg)
# print(hidden)
print(pos)
print(ner)
print(srl)
print(dep)
print(sdp)
Example No. 30
def gen_feature_v2(raw_data, label_vocab, args):
    tokenizer = BertTokenizer.from_pretrained(args.model_name)
    ltp = LTP()
    Features = []
    for sen_ids, data in enumerate(raw_data):
        if sen_ids % 500 == 0:
            logging.info("sen_ids:{}".format(sen_ids))
        token1 = []
        token2 = []
        try:
            token1_raw, sen1_srl = get_tag(data.sen1, ltp)
        except Exception as e:
            logging.warning("sen_id:{} have some mistake.".format(sen_ids))
            logging.error(e)
            continue
        if len(sen1_srl) == 0:
            continue
        if args.max_aspect < len(sen1_srl):
            args.max_aspect = len(sen1_srl)
        token1_ids = [0]  # index 0 is reserved for [CLS]
        for ids, word in enumerate(token1_raw):
            word_token = tokenizer.tokenize(word)
            token1 += word_token
            token1_ids += [ids + 1] * len(word_token)
        try:
            token2_raw, sen2_srl = get_tag(data.sen2, ltp)
        except Exception as e:
            logging.warning("sen_id:{} have some mistake.".format(sen_ids))
            logging.error(e)
            continue
        if len(sen2_srl) == 0:
            continue
        if args.max_aspect < len(sen2_srl):
            args.max_aspect = len(sen2_srl)
        token2_ids = [0]  # index 0 is reserved for [CLS]
        for ids, word in enumerate(token2_raw):
            word_token = tokenizer.tokenize(word)
            token2 += word_token
            token2_ids += [ids + 1] * len(word_token)
        while len(token1) > args.max_length - 2:
            token1.pop()
            token1_ids.pop()
        while len(token2) > args.max_length - 2:
            token2.pop()
            token2_ids.pop()
        # logging.info("sen1 size:{}. token1_ids:{} type:{}".format(len(sen1_srl[0]), token1_ids[-1], type(token1_ids[-1])))
        for i, sen in enumerate(sen1_srl):
            sen1_srl[i] = sen[:token1_ids[-1]]
        for i, sen in enumerate(sen2_srl):
            sen2_srl[i] = sen[:token2_ids[-1]]
        # sen1_srl = sen1_srl[:, :token1_ids[-1]]
        # sen2_srl = sen2_srl[:, :token2_ids[-1]]
        # logging.info("token1_ids:{}".format(token1_ids[-1]))
        # logging.info("token2_ids:{}".format(token2_ids[-1]))
        assert len(sen1_srl[0]) <= args.max_length
        assert len(sen2_srl[0]) <= args.max_length
        inputs_token1 = ["[CLS]"] + token1 + ["[SEP]"]
        inputs_token1 = tokenizer.convert_tokens_to_ids(inputs_token1)
        inputs_token2 = ["[CLS]"] + token2 + ["[SEP]"]
        inputs_token2 = tokenizer.convert_tokens_to_ids(inputs_token2)
        token1_ids.append(0)
        token2_ids.append(0)
        inputs_mask_1 = [1] * len(inputs_token1)
        inputs_mask_2 = [1] * len(inputs_token2)
        padding_1 = [0] * (args.max_length - len(inputs_token1))
        padding_2 = [0] * (args.max_length - len(inputs_token2))
        inputs_token1 += padding_1
        inputs_token2 += padding_2
        inputs_mask_1 += padding_1
        inputs_mask_2 += padding_2
        # group consecutive sub-tokens of sentence 1 that share a word index
        # into (start, end) spans
        start = -1
        pre_word = -1
        word_start_end_1 = []
        for ids, word_ids in enumerate(token1_ids):
            end = ids
            # logging.info("{} : {}".format(pre_word, ids))
            if pre_word != word_ids:
                if start != -1:
                    word_start_end_1.append((start, end - 1))
                start = ids
            pre_word = word_ids
        if start != -1:
            word_start_end_1.append((start, end))
        word_start_end_1 += [(-1, -1)
                             ] * (args.max_length - len(word_start_end_1))
        # the same span grouping for sentence 2
        start = -1
        pre_word = -1
        word_start_end_2 = []
        for ids, word_ids in enumerate(token2_ids):
            end = ids
            # logging.info("{} : {}".format(pre_word, ids))
            if pre_word != word_ids:
                if start != -1:
                    word_start_end_2.append((start, end - 1))
                start = ids
            pre_word = word_ids
        if start != -1:
            word_start_end_2.append((start, end))
        word_start_end_2 += [(-1, -1)
                             ] * (args.max_length - len(word_start_end_2))
        label_ids = label_vocab[0][data.label]
        Features.append(
            Feature(inputs=(inputs_token1, inputs_token2),
                    inputs_word_start_end=(word_start_end_1, word_start_end_2),
                    inputs_mask=(inputs_mask_1, inputs_mask_2),
                    inputs_sen_ids=None,
                    sen1_srl=sen1_srl,
                    sen2_srl=sen2_srl,
                    inputs_srl_ids=None,
                    label_ids=label_ids))
    return Features
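The two word_start_end blocks above group consecutive sub-token positions that share a word index into (start, end) spans, with index 0 reserved for [CLS]/[SEP]. A standalone sketch of that grouping with hand-written data:

token_ids = [0, 1, 1, 2, 3, 3, 3, 0]   # hand-written sub-token -> word index map

spans, start, pre_word = [], -1, -1
for ids, word_ids in enumerate(token_ids):
    end = ids
    if pre_word != word_ids:
        if start != -1:
            spans.append((start, end - 1))
        start = ids
    pre_word = word_ids
if start != -1:
    spans.append((start, end))

print(spans)  # [(0, 0), (1, 2), (3, 3), (4, 6), (7, 7)]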