def seg_with_ltp40(in_file, out_file_path, manual_seg_file):
    # initialize the model
    ltp = LTP()
    line_list = []
    # save seg_result
    corpus = construct_corpus(in_file)
    f = open(out_file_path, "w", encoding='utf-8')
    for line in corpus:
        line_list.append(line)  # wrap each sentence in a list: ["Xxxx"]
        seg_result, hidden = ltp.seg(line_list)
        f.write("=".join(seg_result[0]) + "\n")
        line_list.clear()
    f.flush()
    f.close()
    # test qps
    corpus = construct_corpus(in_file, 1)
    start = time.time()
    for line in corpus:
        segment, hidden = ltp.seg([line])  # seg expects a list of sentences, not a list of characters
    end = time.time()
    qps = round(len(corpus) / (end - start), 2)
    # test accuracy
    p, r, f1, line_aver_length = evaluate(out_file_path, manual_seg_file)
    return qps, p, r, f1, line_aver_length
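# A self-contained sketch of the QPS measurement pattern used in seg_with_ltp40,
# assuming the LTP 4.x seg() API used throughout these snippets; the function
# name and example sentences are illustrative.
import time
from ltp import LTP

def measure_seg_qps(sentences):
    ltp = LTP()
    start = time.time()
    for sent in sentences:
        ltp.seg([sent])  # one sentence per call, mirroring the loop above
    return round(len(sentences) / (time.time() - start), 2)

# e.g. measure_seg_qps(["他叫汤姆去拿外衣。", "我爱北京天安门。"])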
def get_pos_tag(self, sentence):
    r"""
    POS tagging function.

    :param str sentence: the sentence to be POS-tagged
    :return: triples of the form (tag, start, end)
    """
    assert isinstance(sentence, (list, str))
    from ltp import LTP

    if isinstance(sentence, list):
        # join the word list back into a sentence
        sentence = ''.join(sentence)
    if not sentence:
        return []
    if self.__pos is None:
        # lazily load the POS model
        self.__pos = LTP()
    seg, hidden = self.__pos.seg([sentence])
    pos = self.__pos.pos(hidden)
    seg = seg[0]
    pos = pos[0]
    pos_tag = []
    cnt = 0
    for i in range(len(pos)):
        pos_tag.append([pos[i], cnt, cnt + len(seg[i]) - 1])
        cnt += len(seg[i])
    return pos_tag
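# A standalone sketch of the (tag, start, end) character-span convention that
# get_pos_tag builds; the helper name and example words/tags are illustrative.
def char_spans(words, tags):
    spans, cnt = [], 0
    for word, tag in zip(words, tags):
        spans.append([tag, cnt, cnt + len(word) - 1])  # inclusive character span
        cnt += len(word)
    return spans

# char_spans(['周小明', '去', '玩'], ['nh', 'v', 'v'])
# -> [['nh', 0, 2], ['v', 3, 3], ['v', 4, 4]]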
def mongo2ner(idx, ltp, offset, size):
    """
    Fetch `size` articles from MongoDB starting at `offset` and run NER on them.

    :param idx:
    :param offset:
    :param size:
    :return:
    """
    entities = []
    pid = os.getpid()
    try:
        # debug_logger.debug("{} ---pid:{} MongoDB: Skip: {}, size: {}".format(idx, pid, offset, size))
        ltp = LTP(path=LTP4_MODEL_DIR)  # note: this shadows the `ltp` argument
        db_connect = MongoClient(host=MONGODB_HOST, port=MONGODB_PORT)
        db = db_connect[MONGODB_DATABASE_NAME]
        coll = db[MONGODB_ENTMT_COLLECTION]
        # debug_logger.debug("pid: {}, connected".format(pid))
        for art in coll.find(skip=offset, limit=size):
            debug_logger.debug(art['title'])
            text = art['title'] + art['content']
            entities_of_art = get_article_entities(idx, text, ltp)
            entities += entities_of_art
        # debug_logger.debug("pid: {}, write".format(pid))
        with open(os.path.join(USER_DICT_DIR, 'ners_' + str(idx) + '.txt'), 'w') as fw:
            for item in entities:
                for word, label in item:
                    fw.write(word + '\t' + label + '\n')
    except Exception as e:
        print("ERROR mongo2ner: {}".format(e))
        # debug_logger.debug("ERROR mongo2ner: {}".format(e))
    return entities
def __init__(self, path: str = 'small', batch_size: int = 50, device: str = None, onnx: bool = False):
    # note: `onnx` is accepted but unused in this variant
    self.ltp = LTP(path=path, device=device)
    # split a list into consecutive batches of `batch_size`
    self.split = lambda a: map(lambda b: a[b:b + batch_size], range(0, len(a), batch_size))
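# A standalone sketch of the batching lambda above: it yields consecutive
# slices of `batch_size` items (the function name is hypothetical).
def make_batches(items, batch_size):
    return map(lambda b: items[b:b + batch_size], range(0, len(items), batch_size))

# list(make_batches(list(range(7)), 3)) -> [[0, 1, 2], [3, 4, 5], [6]]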
def __init__(self, seq_len=512):
    """
    Constructs the Huggingface Chinese BERT tokenizer and the LTP tokenizer.

    :param seq_len: maximum sequence length
    """
    self.tokenizer_cn = AutoTokenizer.from_pretrained("bert-base-chinese")
    self.tokenizer_ltp = LTP("small")
    self.max_seq_length = seq_len
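# A hedged, standalone sketch of how the two tokenizers above differ on the
# same input: LTP segments into words, the BERT tokenizer into wordpieces
# (the sentence is illustrative).
from transformers import AutoTokenizer
from ltp import LTP

tok_cn = AutoTokenizer.from_pretrained("bert-base-chinese")
tok_ltp = LTP("small")
words, _ = tok_ltp.seg(["语文老师"])   # word-level segmentation
pieces = tok_cn.tokenize("语文老师")   # character/wordpiece-level tokens
print(words[0], pieces)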
def ltp_func(text_list):
    ltp = LTP()
    seg, hidden = ltp.seg(text_list)
    pos = ltp.pos(hidden)
    result = []
    # only the first sentence's words and tags are joined here
    for idx, val in enumerate(seg[0]):
        pair = [val, pos[0][idx]]
        result.append('/'.join(pair))
    return result
def dependency(self):
    sentence = self.sentence
    sentences = [sentence]
    ltp = LTP()
    seg, hidden = ltp.seg(sentences)
    dep = ltp.dep(hidden)
    print(seg)
    print(dep)
def __init__(self, path: str = 'small', batch_size: int = 10, device: str = None, onnx: str = None, vocab: str = None):
    self.ltp = LTP(path=path, batch_size=batch_size, device=device, vocab=vocab)
def __init__(self, default_model_dir=LTP4_MODEL_DIR, user_dict_dir=USER_DICT_DIR):
    self.ltp = LTP(path=default_model_dir)
    # register every user dictionary file found in the directory
    for file in os.listdir(user_dict_dir):
        self.ltp.init_dict(path=os.path.join(user_dict_dir, file))
    self.sentences = []
    self.postags = []
    self.nertags = []
    self.dep = []
def work_summary_parser_ltp():
    f = csvReader("标准工作任务单")
    ltp = LTP()
    paList = []
    for i, row in enumerate(f):
        if i != 0:  # skip the header row
            val = row[1][5:].split(',')
            paList.append(val[2])
    wa, ha = ltp.seg(paList)
    pa = ltp.pos(ha)
    return wa, pa
def __init__(self, path: str = 'small', batch_size: int = 50, device: str = None, onnx: bool = False):
    if onnx:
        self.ltp = FastLTP(path=path, device=device, need_config=True)
    else:
        self.ltp = LTP(path=path, device=device, need_config=True)
    # split a list into consecutive batches of `batch_size`
    self._split = lambda a: map(lambda b: a[b:b + batch_size], range(0, len(a), batch_size))
def __init__(self, model_type='base', user_dict_dir=RESOURCE_DIR):
    self.default_user_dict_dir = user_dict_dir
    # load the LTP model
    self.ltp = LTP(model_type)
    # register the user dictionaries (a legal-instrument dictionary and the
    # Tsinghua legal lexicon); init_dict loads them into memory, which is faster
    files = os.listdir(user_dict_dir)
    for file in files:
        file_path = os.path.join(user_dict_dir, file)
        # skip directories
        if os.path.isdir(file_path):
            continue
        self.ltp.init_dict(file_path)
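# A minimal sketch of registering user dictionaries with LTP 4.x, as the
# constructor above does; the file name and words are illustrative, and
# add_words is assumed from the LTP 4.x API.
from ltp import LTP

ltp = LTP()
ltp.init_dict(path="user_dict.txt")           # one word per line, loaded into memory
ltp.add_words(words=["法律文书", "合同纠纷"])  # or add words directly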
def test_nlp_model(self):
    ltp1 = LTP(LTP4_MODEL_DIR)
    ltp2 = LTP(LTP4_MODEL_DIR)
    ltp3 = LTP(LTP4_MODEL_DIR)
    ltp4 = LTP(LTP4_MODEL_DIR)
    ltp5 = LTP(LTP4_MODEL_DIR)
    ltp6 = LTP(LTP4_MODEL_DIR)
    ltp7 = LTP(LTP4_MODEL_DIR)
    print('-------')
    import time
    time.sleep(10)
def main(args):
    # For Chinese (Ro)BERTa, the best result is from RoBERTa-wwm-ext (https://github.com/ymcui/Chinese-BERT-wwm).
    # If we want to fine-tune these models, we have to use the same tokenizer: LTP (https://github.com/HIT-SCIR/ltp)
    with open(args.file_name, "r", encoding="utf-8") as f:
        data = f.readlines()
    ltp_tokenizer = LTP(args.ltp)  # faster on a GPU device
    bert_tokenizer = BertTokenizer.from_pretrained(args.bert)
    ref_ids = prepare_ref(data, ltp_tokenizer, bert_tokenizer)
    with open(args.save_path, "w", encoding="utf-8") as f:
        data = [json.dumps(ref) + "\n" for ref in ref_ids]
        f.writelines(data)
def create():
    """Create the profession-keywords JSON file."""
    ltp = LTP()  # loads the Small model by default
    # read the professions file (text mode; json.load no longer takes an `encoding` kwarg)
    with open('./dataset/profession.json', 'r', encoding='utf-8') as jsonfile:
        profession_json = json.load(jsonfile)
    for i, profession in enumerate(profession_json['data']):
        profession_json['data'][i]['kwords'] = find_kwords_by_ltp(
            profession['name'], ltp)
    with open('./dataset/profession2.json', 'w', encoding='utf-8') as jsonfile:
        json.dump(profession_json, jsonfile, ensure_ascii=False)
def findFood(self, sentence):
    ltp = LTP()
    words, hidden = ltp.seg([sentence])
    posTags = ltp.pos(hidden)
    words = words[0]  # word segmentation result (list)
    posTags = posTags[0]  # POS tagging result (list)
    dep = ltp.dep(hidden)[0]  # dependency parsing result (list)
    relyId = [d[1] for d in dep]  # head (parent) indices
    relation = [d[2] for d in dep]  # dependency relations
    heads = ['Root' if id == 0 else words[id - 1] for id in relyId]  # head words
    string = ''
    # a noun whose head is 吃 (eat) via a verb-object (VOB) relation is a food
    for i in range(len(words)):
        if 'n' in posTags[i] and heads[i] == '吃' and relation[i] == 'VOB':
            string += words[i]
            string += ' '
    return string
def get_ner(self, sentence):
    r"""
    NER function.

    :param str sentence: the sentence to run NER on
    :return: two forms of tags.
        The first is the triple form (tag, start, end).
        The second is the list form, which marks the NER label of each
        character, e.g. 周小明去玩 -> ['Nh', 'Nh', 'Nh', 'O', 'O']
    """
    assert isinstance(sentence, (list, str))
    from ltp import LTP

    if isinstance(sentence, list):
        # join the word list back into a sentence
        sentence = ''.join(sentence)
    if not sentence:
        return [], []
    if self.__ner is None:
        self.__ner = LTP()
    seg, hidden = self.__ner.seg([sentence])
    seg = seg[0]
    ner = self.__ner.ner(hidden)
    ner = ner[0]
    ner_label = len(sentence) * ['O']
    # convert word-level spans to inclusive character-level spans
    for i in range(len(ner)):
        tag, start, end = ner[i]
        tmp = 0
        for j in range(start):
            tmp += len(seg[j])
        start = tmp
        tmp = 0
        for j in range(end + 1):
            tmp += len(seg[j])
        end = tmp
        ner[i] = (tag, start, end - 1)
        for j in range(start, end):
            ner_label[j] = tag
    return ner, ner_label
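# A standalone sketch of the word-span -> character-span conversion performed
# in get_ner; the helper name and example data are illustrative.
def to_char_level(seg_words, word_spans, sent_len):
    labels = ['O'] * sent_len
    triples = []
    for tag, w_start, w_end in word_spans:
        c_start = sum(len(w) for w in seg_words[:w_start])
        c_end = c_start + sum(len(w) for w in seg_words[w_start:w_end + 1]) - 1
        triples.append((tag, c_start, c_end))
        for j in range(c_start, c_end + 1):
            labels[j] = tag
    return triples, labels

# to_char_level(['周小明', '去', '玩'], [('Nh', 0, 0)], 5)
# -> ([('Nh', 0, 2)], ['Nh', 'Nh', 'Nh', 'O', 'O'])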
def WriteTest(readfile, savefile):
    with open(readfile, "r", encoding="utf-8") as rfp:
        ltp = LTP()
        logger.info("Processing file: %s." % (readfile))
        with open(savefile, 'w', encoding='utf-8') as wfp:
            for row in tqdm(rfp, desc="file %s process" % (readfile)):
                sent1, sent2 = row.rstrip('\n').split('\t')  # strip the trailing newline
                seg, hid = ltp.seg([sent1, sent2])
                sdp = ltp.sdp(hid, mode='tree')
                pos = ltp.pos(hid)
                tmpitem = {
                    'sentence1': [seg[0], pos[0], sdp[0]],
                    'sentence2': [seg[1], pos[1], sdp[1]]
                }
                jsonline = json.dumps(tmpitem)
                wfp.write(jsonline + "\n")
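# A minimal sketch of the seg -> sdp(mode='tree') -> pos pipeline used in
# WriteTest; the sentence pair is illustrative.
from ltp import LTP

ltp = LTP()
seg, hid = ltp.seg(["他叫汤姆去拿外衣。", "汤姆去拿外衣了。"])
print(ltp.sdp(hid, mode='tree'))  # one semantic dependency tree per sentence
print(ltp.pos(hid))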
def findFood(sentence):
    ltp = LTP()
    words, hidden = ltp.seg([sentence])
    posTags = ltp.pos(hidden)
    words = words[0]  # word segmentation result (list)
    print(words)
    posTags = posTags[0]  # POS tagging result (list)
    print(posTags)
    dep = ltp.dep(hidden)[0]  # dependency parsing result (list)
    for t in dep:
        print(t)
    relyId = [d[1] for d in dep]  # head (parent) indices
    relation = [d[2] for d in dep]  # dependency relations
    heads = ['Root' if id == 0 else words[id - 1] for id in relyId]  # head words
    for i in range(len(words)):
        if 'n' in posTags[i] and heads[i] == '吃' and relation[i] == 'VOB':
            print("Found a food: " + words[i])
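# A minimal usage sketch for the standalone findFood above; the default (Small)
# model is downloaded on first use, and the sentence is illustrative.
if __name__ == '__main__':
    findFood("我中午想吃火锅")  # prints each noun attached to 吃 by a VOB relation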
def is_word(sentence):
    r"""
    Judge whether the input is a single word.

    :param str sentence: input sentence string
    :return bool: is a word or not
    """
    from ltp import LTP

    # two identical characters (e.g. reduplication) count as a word
    if sentence[0] == sentence[1]:
        return True
    ltp = LTP()
    seg, hidden = ltp.seg([sentence])
    pos = ltp.pos(hidden)
    pos = pos[0]
    if len(pos) == 1 and pos[0] == 'n':
        return False
    return True
def main(args):
    # For Chinese (Ro)BERTa, the best result is from RoBERTa-wwm-ext (https://github.com/ymcui/Chinese-BERT-wwm).
    # To fine-tune these models, we have to use the same tokenizer: LTP (https://github.com/HIT-SCIR/ltp)
    with open(args.file_name, "r", encoding="utf-8", errors='ignore') as f:
        data = f.readlines()
    print(f'Start processing, {len(data)} lines in total')
    data = [
        line.strip() for line in data if len(line) > 0 and not line.isspace()
    ]  # avoid delimiters like '\u2029'
    print("Loading the LTP and BERT tokenizer models")
    ltp_tokenizer = LTP(path=args.ltp)  # faster on a GPU device
    bert_tokenizer = BertTokenizer.from_pretrained(args.bert)
    # build the mapping
    ref_ids = prepare_ref(data, ltp_tokenizer, bert_tokenizer)
    # save the mapping
    with open(args.save_path, "w", encoding="utf-8") as f:
        data = [json.dumps(ref) + "\n" for ref in ref_ids]
        f.writelines(data)
    print(f"Saved the mapping for all {len(data)} lines to {args.save_path}")
def thread_main(args, gpu=True):
    """
    Process the data with a pool of workers.

    Args:
        args:
        gpu: whether to use the GPU

    Returns:

    """
    from functools import partial
    from multiprocessing import Pool
    from tqdm import tqdm

    # For Chinese (Ro)BERTa, the best result is from RoBERTa-wwm-ext (https://github.com/ymcui/Chinese-BERT-wwm).
    # To fine-tune these models, we have to use the same tokenizer: LTP (https://github.com/HIT-SCIR/ltp)
    with open(args.file_name, "r", encoding="utf-8") as f:
        data = f.readlines()
    print(f'Start processing, {len(data)} lines in total')
    data = [
        line.strip() for line in data if len(line) > 0 and not line.isspace()
    ]  # avoid delimiters like '\u2029'
    print("Loading the LTP and BERT tokenizer models")
    ltp_tokenizer = LTP(path=args.ltp)  # faster on a GPU device
    bert_tokenizer = BertTokenizer.from_pretrained(args.bert)
    newdata = [data[i:i + 1000] for i in range(0, len(data), 1000)]
    # build the mapping with parallel workers;
    # when using the GPU, the start method must be set as follows
    if gpu:
        import torch
        torch.multiprocessing.set_start_method('spawn')
    with Pool(processes=args.processes) as p:
        # partial_clean binds the two tokenizers to prepare_ref
        partial_clean = partial(prepare_ref,
                                ltp_tokenizer=ltp_tokenizer,
                                bert_tokenizer=bert_tokenizer)
        # chunksize=8: each worker receives 8 batches at a time
        ref_ids_nest = list(
            tqdm(p.imap(partial_clean, newdata, chunksize=8),
                 desc="processing data"))
    ref_ids = [ref for nest in ref_ids_nest for ref in nest]
    # save the mapping
    with open(args.save_path, "w", encoding="utf-8") as f:
        data = [json.dumps(ref) + "\n" for ref in ref_ids]
        f.writelines(data)
    print(f"Saved the mapping for all {len(data)} lines to {args.save_path}")
def load_word_segmentation_tool():
    """
    Load the word segmentation tools.

    :return: HanLP: hanlp, ltp: LTP
    """
    logger.info("loading word segmentation tool")
    # HanLP = HanLPClient(url='https://www.hanlp.com/api', auth='MTE4QGJicy5oYW5scC5jb206MXFFOHhWUkJNQXBNdlh0NA==')
    HanLP = hanlp.load(
        hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH,
        verbose=True)
    # drop every pipeline task except those listed in TASK
    tasks = list(HanLP.tasks.keys())
    for task in tasks:
        if task not in TASK:
            del HanLP[task]
    tok = HanLP[TASK[0]]
    tok.dict_combine = {'新冠', '新冠病毒', '新冠肺炎'}
    ltp = LTP()
    logger.info("loaded word segmentation tool")
    return HanLP, ltp
def work_detail_parser_ltp():
    f = csvReader("标准工作任务单")
    ltp = LTP()
    paList = []
    pbList = []
    for i, row in enumerate(f):
        if i != 0:  # skip the header row
            val = row[1][5:].split(',')
            paList.append(val[2])
            temp = val[3:]
            for v in temp:
                pbList.append(v)
    # print(paList)
    # print(pbList)
    sa, ha = ltp.seg(paList)
    sb, hb = ltp.seg(pbList)
    pa = ltp.pos(ha)
    pb = ltp.pos(hb)
    return sa, sb, pa, pb
def load_ltp_weights(weights_type):
    '''
    Load an LTP weight file and instantiate the LTP model.

    :param weights_type: which weights to load; must be 'base', 'small' or 'tiny'
    :return: the LTP model with the weights loaded
    '''
    # validate the model type
    assert weights_type in ['base', 'small', 'tiny'], \
        "the LTP model type must be 'base', 'small' or 'tiny'"
    # resolve the weight file path
    if LtpModelPath is None:
        file_path = os.path.abspath(
            os.path.join(os.path.dirname('.'), 'weights', weights_type))
    else:
        file_path = os.path.abspath(os.path.join(LtpModelPath, weights_type))
    # load the weights
    ltp = LTP(path=file_path)
    return ltp
def new_generate_ltp_results():
    # load the model
    ltp_model = '../../ltp_models/base1'
    ltp = LTP(path=ltp_model)
    # read the raw sentences
    data = read_file_in_ltp('../data/train_base.json')
    sentences = list(map(lambda x: x['content'], data))
    segmented, pos, ner, srl, dep, sdp_tree, sdp_graph = [], [], [], [], [], [], []
    for sent in tqdm(sentences):
        # word segmentation
        segmented0, hidden = ltp.seg([sent])
        # POS tagging
        cur_pos = ltp.pos(hidden)
        # named entity recognition
        cur_ner = ltp.ner(hidden)
        # semantic role labeling
        cur_srl = ltp.srl(hidden)
        # dependency parsing
        cur_dep = ltp.dep(hidden)
        # semantic dependency parsing (tree)
        cur_sdp_tree = ltp.sdp(hidden, mode='tree')
        # semantic dependency parsing (graph)
        cur_sdp_graph = ltp.sdp(hidden, mode='graph')
        segmented.append(segmented0[0])
        pos.append(cur_pos[0])
        ner.append(cur_ner[0])
        srl.append(cur_srl[0])
        dep.append(cur_dep[0])
        sdp_tree.append(cur_sdp_tree[0])
        sdp_graph.append(cur_sdp_graph[0])
    # build the mapping between sentences and their segmentations
    sent_seg_matches = sentence_segment_match(data, segmented)
    pickle.dump(
        [segmented, pos, ner, srl, dep, sdp_tree, sdp_graph, sent_seg_matches],
        open('new_ltp_results.pk', 'wb'))
    return segmented, pos, ner, srl, dep, sdp_tree, sdp_graph, sent_seg_matches
# [[('every', 5)], [('自然数', 'x'), 'and', ('奇数', 'x')]]
from ltp import LTP

ltp = LTP()


class NlpCtr(object):
    def __init__(self):
        self.seg = None
        self.words = None
        self.dep = None

    def trans_result(self, depArr, posArr):
        # flatten the first sentence's dependency triples into dicts
        tempposArr = posArr[0]
        tempdepArr = depArr[0]
        tempArr = []
        for item in tempdepArr:
            dic = {
                'dep': item[0],
                'gov': item[1],
                'type': item[2],
                # 'pos': tempposArr[item[0] - 1]
            }
            tempArr.append(dic)
        return tempArr

    def getHED(self, words):
        # the HED word is the one governed by ROOT (index 0)
        root = None
        for word in words:
            if word['gov'] == 0 and word['type'] == 'HED':
                root = word['dep']
        return root
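# A minimal usage sketch for NlpCtr above, reusing the module-level `ltp`
# instance; the example sentence is illustrative.
if __name__ == '__main__':
    ctr = NlpCtr()
    seg, hidden = ltp.seg(["每个自然数都是奇数"])
    dep = ltp.dep(hidden)
    pos = ltp.pos(hidden)
    words = ctr.trans_result(dep, pos)
    print(ctr.getHED(words))  # 1-based index of the word attached to ROOT via HED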
def save_as_txt(data):
    from ltp import LTP
    ltp = LTP()
    for row in data:
        id = row[0]
        school_id = ("000" + str(row[1]))[-4:]
        texts = row[2]
        textlines = texts.split('\n')
        # break long lines into chunks of at most 100 characters
        shortened_textlines = []
        for line in textlines:
            line_len = len(line)
            if line_len > 100:
                for i in range(line_len // 100):
                    shortened_textlines.append(line[i * 100:(i + 1) * 100])
            else:
                shortened_textlines.append(line)
        text = ' '.join(shortened_textlines)
        path = './data/' + str(school_id)
        if not os.path.exists(path):
            os.makedirs(path)
        with open(path + '/' + str(school_id) + "-" + str(id) + ".txt", 'w',
                  encoding='UTF-8') as file:
            file.write(text)
        print("\rSaved " + str(school_id) + "-" + str(id) + ".txt", end="")
        # brat annotation format, e.g.:
        # T2 报告人 68 71 曹进德
        # R2 报告人_单位 Arg1: T2 Arg2: T1
        seg, hidden = ltp.seg([text])
        ner = ltp.ner(hidden)
        ner_info = []
        entities_nh = []  # person names (Nh)
        entities_ni = []  # organization names (Ni)
        for i in ner[0]:
            if i[0] == 'Nh':
                start, end = i[1], i[2]
                entity = "".join(seg[0][start:end + 1])
                if len(entity) > 1:
                    entities_nh.append(entity)
            elif i[0] == 'Ni':
                start, end = i[1], i[2]
                entity = "".join(seg[0][start:end + 1])
                if entity in schoolnames:
                    entities_ni.append(entity)
        # locate every occurrence of each person entity in the text
        for entity in set(entities_nh):
            pattern = re.compile(entity)
            count = 0
            for record in pattern.finditer(text):
                ner_info.append("T" + str(300 + count) + "\t姓名 " +
                                str(record.span()[0]) + " " +
                                str(record.span()[1]) + "\t" +
                                str(record.group()) + "\n")
                count += 1
        # locate every occurrence of each organization entity
        for entity in set(entities_ni):
            pattern = re.compile(entity)
            count = 0
            for record in pattern.finditer(text):
                ner_info.append("T" + str(400 + count) + "\t单位 " +
                                str(record.span()[0]) + " " +
                                str(record.span()[1]) + "\t" +
                                str(record.group()) + "\n")
                count += 1
        # academic titles
        pattern = re.compile('教授|副教授|讲师|研究员|副研究员|助理教授|助理研究员')
        count = 0
        for record in pattern.finditer(text):
            ner_info.append("T" + str(500 + count) + "\t职称 " +
                            str(record.span()[0]) + " " +
                            str(record.span()[1]) + "\t" +
                            str(record.group()) + "\n")
            count += 1
        # dates
        date_1 = r"([0-9]+年[0-9]+月[0-9]+日)"  # |([0-9]+月[0-9]+日)
        date_2 = r"([零〇一二三四五六七八九]年[十]?[一二三四五六七八九]月[一二三]?[十]?[一二三四五六七八九十]日)"
        date_3 = r"([0-9]+月[0-9]+日)"
        flag = False
        count = 0
        # form 1: numeric dates with year, month and day
        pattern = re.compile(date_1)
        for record in pattern.finditer(text):
            ner_info.append("T" + str(600 + count) + "\t日期 " +
                            str(record.span()[0]) + " " +
                            str(record.span()[1]) + "\t" +
                            str(record.group()) + "\n")
            count += 1
            flag = True
        if flag is False:
            # fall back to month-day dates
            pattern = re.compile(date_3)
            for record in pattern.finditer(text):
                ner_info.append("T" + str(600 + count) + "\t日期 " +
                                str(record.span()[0]) + " " +
                                str(record.span()[1]) + "\t" +
                                str(record.group()) + "\n")
                count += 1
        # form 2: dates written with Chinese numerals
        pattern = re.compile(date_2)
        for record in pattern.finditer(text):
            ner_info.append("T" + str(600 + count) + "\t日期 " +
                            str(record.span()[0]) + " " +
                            str(record.span()[1]) + "\t" +
                            str(record.group()) + "\n")
            count += 1
        with open(path + '/' + str(school_id) + "-" + str(id) + ".ann", 'w',
                  encoding='UTF-8') as file:
            file.writelines(ner_info)
        print("\rSaved " + str(school_id) + "-" + str(id) + ".ann", end="")
# import synonyms
import json

# sen1 = "程序员"
# sen2 = "软件工程师"
# r = synonyms.compare(sen1, sen2, seg=True)
# print(r)

# ddp = DDParser()
# # single sentence
# re = ddp.parse("语文老师")
# print(re)

from ltp import LTP

ltp = LTP()  # loads the Small model by default
seg, hidden = ltp.seg(["语文老师"])
pos = ltp.pos(hidden)
ner = ltp.ner(hidden)
srl = ltp.srl(hidden)
dep = ltp.dep(hidden)
sdp = ltp.sdp(hidden)
print(seg)
# print(hidden)
print(pos)
print(ner)
print(srl)
print(dep)
print(sdp)
def gen_feature_v2(raw_data, label_vocab, args):
    tokenizer = BertTokenizer.from_pretrained(args.model_name)
    ltp = LTP()
    Features = []
    for sen_ids, data in enumerate(raw_data):
        if sen_ids % 500 == 0:
            logging.info("sen_ids:{}".format(sen_ids))
        token1 = []
        token2 = []
        try:
            token1_raw, sen1_srl = get_tag(data.sen1, ltp)
        except Exception as e:
            logging.warning("sen_id:{} has some mistake.".format(sen_ids))
            logging.error(e)
            continue
        if len(sen1_srl) == 0:
            continue
        if args.max_aspect < len(sen1_srl):
            args.max_aspect = len(sen1_srl)
        # map each wordpiece to its (1-based) word index; 0 is reserved for [CLS]
        token1_ids = [0]
        for ids, word in enumerate(token1_raw):
            word_token = tokenizer.tokenize(word)
            token1 += word_token
            token1_ids += [ids + 1] * len(word_token)
        try:
            token2_raw, sen2_srl = get_tag(data.sen2, ltp)
        except Exception as e:
            logging.warning("sen_id:{} has some mistake.".format(sen_ids))
            logging.error(e)
            continue
        if len(sen2_srl) == 0:
            continue
        if args.max_aspect < len(sen2_srl):
            args.max_aspect = len(sen2_srl)
        token2_ids = [0]
        for ids, word in enumerate(token2_raw):
            word_token = tokenizer.tokenize(word)
            token2 += word_token
            token2_ids += [ids + 1] * len(word_token)
        # truncate, leaving room for [CLS] and [SEP]
        while len(token1) > args.max_length - 2:
            token1.pop()
            token1_ids.pop()
        while len(token2) > args.max_length - 2:
            token2.pop()
            token2_ids.pop()
        # cut the SRL rows to the number of surviving words
        for i, sen in enumerate(sen1_srl):
            sen1_srl[i] = sen[:token1_ids[-1]]
        for i, sen in enumerate(sen2_srl):
            sen2_srl[i] = sen[:token2_ids[-1]]
        assert len(sen1_srl[0]) <= args.max_length
        assert len(sen2_srl[0]) <= args.max_length
        inputs_token1 = ["[CLS]"] + token1 + ["[SEP]"]
        inputs_token1 = tokenizer.convert_tokens_to_ids(inputs_token1)
        inputs_token2 = ["[CLS]"] + token2 + ["[SEP]"]
        inputs_token2 = tokenizer.convert_tokens_to_ids(inputs_token2)
        token1_ids.append(0)
        token2_ids.append(0)
        inputs_mask_1 = [1] * len(inputs_token1)
        inputs_mask_2 = [1] * len(inputs_token2)
        padding_1 = [0] * (args.max_length - len(inputs_token1))
        padding_2 = [0] * (args.max_length - len(inputs_token2))
        inputs_token1 += padding_1
        inputs_token2 += padding_2
        inputs_mask_1 += padding_1
        inputs_mask_2 += padding_2
        # collect each word's (start, end) wordpiece span
        start = -1
        pre_word = -1
        word_start_end_1 = []
        for ids, word_ids in enumerate(token1_ids):
            end = ids
            if pre_word != word_ids:
                if start != -1:
                    word_start_end_1.append((start, end - 1))
                start = ids
                pre_word = word_ids
        if start != -1:
            word_start_end_1.append((start, end))
        word_start_end_1 += [(-1, -1)] * (args.max_length - len(word_start_end_1))
        start = -1
        pre_word = -1
        word_start_end_2 = []
        for ids, word_ids in enumerate(token2_ids):
            end = ids
            if pre_word != word_ids:
                if start != -1:
                    word_start_end_2.append((start, end - 1))
                start = ids
                pre_word = word_ids
        if start != -1:
            word_start_end_2.append((start, end))
        word_start_end_2 += [(-1, -1)] * (args.max_length - len(word_start_end_2))
        label_ids = label_vocab[0][data.label]
        Features.append(
            Feature(inputs=(inputs_token1, inputs_token2),
                    inputs_word_start_end=(word_start_end_1, word_start_end_2),
                    inputs_mask=(inputs_mask_1, inputs_mask_2),
                    inputs_sen_ids=None,
                    sen1_srl=sen1_srl,
                    sen2_srl=sen2_srl,
                    inputs_srl_ids=None,
                    label_ids=label_ids))
    return Features
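# A standalone sketch of the word-span extraction in gen_feature_v2: given a
# wordpiece -> word-index map, recover each word's (start, end) wordpiece span
# (the function name and input list are illustrative).
def word_spans(token_ids):
    spans, start, pre = [], -1, None
    for i, w in enumerate(token_ids):
        if w != pre:
            if start != -1:
                spans.append((start, i - 1))
            start, pre = i, w
    if start != -1:
        spans.append((start, len(token_ids) - 1))
    return spans

# word_spans([0, 1, 1, 2, 0]) -> [(0, 0), (1, 2), (3, 3), (4, 4)]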