def generateConfusionMatrices(self):
    counter = 0
    if self.options['LOCAL']:
        PRINT_THRESHOLD = 10
    else:
        PRINT_THRESHOLD = 10000

    with open(self.options['DATA_FILE']) as f:
        for line in f:
            line = line.strip().split('\t')

            if self.options['USE_CONTEXT']:
                # TODO: fill this out
                pass
            else:
                # the label is stored as a Python literal in the 4th column
                _label = eval(line[3])
                if self.options['PR_CURVE']:
                    self.gold_labels.append(_label)
                for method in self.options['EMBEDDING_METHODS']:
                    self.updateConfusionMatrix(method, my_tokenize(line[1]),
                                               my_tokenize(line[2]), _label)

            counter += 1
            if (counter % PRINT_THRESHOLD) == 0:
                print(counter, 'DONE')
                sys.stdout.flush()

    if self.options['PR_CURVE']:
        self.generateProbabilities()
def text_tokenize(txt, sent_start):
    tokens = my_utils.my_tokenize(txt)

    offset = 0
    for token in tokens:
        offset = txt.find(token, offset)
        yield token, offset + sent_start, offset + len(token) + sent_start
        offset += len(token)
def text_tokenize_and_postagging(txt, sent_start):
    tokens = my_utils.my_tokenize(txt)
    pos_tags = nltk.pos_tag(tokens)

    offset = 0
    for token, pos_tag in pos_tags:
        offset = txt.find(token, offset)
        yield token, pos_tag, offset + sent_start, offset + len(token) + sent_start
        offset += len(token)
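# Illustrative usage sketch (not part of the original module): assuming
# my_utils.my_tokenize splits roughly like nltk.word_tokenize and the NLTK
# POS-tagger data is installed, the generators above yield
# (token, [pos_tag,] start, end) tuples whose offsets become document-level
# once sent_start is added, e.g.:
#
#     for token, pos, start, end in text_tokenize_and_postagging("Aspirin reduces fever.", sent_start=100):
#         print(token, pos, start, end)
#     # -> ('Aspirin', 'NNP', 100, 107), ('reduces', 'VBZ', 108, 115), ...
#     # (tags and offsets are illustrative only)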
def to_conll(all_news, source_filter, type_filter, out_file):
    out_f = codecs.open(out_file, 'w', 'utf-8')
    sent_num = 0

    for news in all_news:
        if news['source'] not in source_filter:
            continue
        if news['category'] not in type_filter:
            continue

        # title as one sentence
        tokens = my_tokenize(news['title'])
        for token in tokens:
            out_f.write(token + "\n")
        out_f.write("\n")
        sent_num += 1

        # sentence-split the description, then tokenize each sentence
        all_sents_inds = []
        generator = sent_tokenizer.span_tokenize(news['description'])
        for t in generator:
            all_sents_inds.append(t)

        for ind in range(len(all_sents_inds)):
            t_start = all_sents_inds[ind][0]
            t_end = all_sents_inds[ind][1]
            sent_text = news['description'][t_start:t_end]

            tokens = my_tokenize(sent_text)
            for token in tokens:
                out_f.write(token + "\n")
            out_f.write("\n")
            sent_num += 1

    out_f.close()
    print("write {} into {}".format(sent_num, out_file))
def pubmed_to_conll(dir, out_file):
    sent_num = 0
    out_f = codecs.open(out_file, 'w', 'utf-8')

    for input_file_name in os.listdir(dir):
        if input_file_name.find(".txt") == -1:
            continue

        # concatenate the title (|t|) and abstract (|a|) fields of each PubTator file
        with codecs.open(os.path.join(dir, input_file_name), 'r', 'utf-8') as f:
            text = ''
            for line in f:
                line = line.strip()
                if line.find("|t|") != -1:
                    p = line.find("|t|")
                    text += line[p + len("|t|"):] + " "
                elif line.find("|a|") != -1:
                    p = line.find("|a|")
                    text += line[p + len("|a|"):]

        all_sents_inds = []
        generator = sent_tokenizer.span_tokenize(text)
        for t in generator:
            all_sents_inds.append(t)

        for ind in range(len(all_sents_inds)):
            t_start = all_sents_inds[ind][0]
            t_end = all_sents_inds[ind][1]
            sent_text = text[t_start:t_end]

            tokens = my_tokenize(sent_text)
            for token in tokens:
                out_f.write(token + "\n")
            out_f.write("\n")
            sent_num += 1

    out_f.close()
    print("write {} into {}".format(sent_num, out_file))
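# Both converters above emit the same minimal CoNLL-style layout: one token per
# line, with a blank line terminating each sentence, e.g. (illustrative):
#
#     Aspirin
#     reduces
#     fever
#     .
#
#     <next sentence ...>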
def getRelationInstanceForOneDoc(doc_token, entities, doc_name, data):
    X = []
    other = []
    row_num = len(entities)

    for latter_idx in range(row_num):
        for former_idx in range(row_num):
            if former_idx < latter_idx:
                former = entities[former_idx]
                latter = entities[latter_idx]

                if math.fabs(latter.sent_idx - former.sent_idx) >= data.sent_window:
                    continue

                # for double annotation, we don't generate instances
                if former.start == latter.start and former.end == latter.end:
                    continue

                # type_constraint = relationConstraint(former['type'], latter['type'])
                type_constraint = relationConstraint_chapman(former.type, latter.type)
                if type_constraint == 0:
                    continue

                # here we retrieve all the sentences in between the two entities:
                # the sentence of former, sentence ..., the sentence of latter
                sent_idx = former.sent_idx
                context_token = pd.DataFrame(columns=doc_token.columns)
                base = 0
                former_tf_start, former_tf_end = -1, -1
                latter_tf_start, latter_tf_end = -1, -1
                while sent_idx <= latter.sent_idx:
                    sentence = doc_token[(doc_token['sent_idx'] == sent_idx)]

                    if former.sent_idx == sent_idx:
                        former_tf_start, former_tf_end = base + former.tf_start, base + former.tf_end
                    if latter.sent_idx == sent_idx:
                        latter_tf_start, latter_tf_end = base + latter.tf_start, base + latter.tf_end

                    context_token = context_token.append(sentence, ignore_index=True)
                    base += len(sentence['text'])
                    sent_idx += 1

                if context_token.shape[0] > data.max_seq_len:
                    # truncate
                    logging.debug("exceed max_seq_len {} {}".format(doc_name, context_token.shape[0]))
                    context_token = context_token.iloc[:data.max_seq_len]

                words = []
                postags = []
                cap = []
                chars = []
                positions1 = []
                positions2 = []
                former_token = []
                latter_token = []
                i = 0
                for _, token in context_token.iterrows():
                    if data.number_normalized:
                        word = utils.functions.normalize_word(token['text'])
                    else:
                        word = token['text']
                    entity_word = my_utils1.normalizeWord(token['text'])
                    words.append(data.word_alphabet.get_index(word))
                    postags.append(data.feature_alphabets[data.feature_name2id['[POS]']].get_index(token['postag']))
                    cap.append(data.feature_alphabets[data.feature_name2id['[Cap]']].get_index(
                        str(my_utils.featureCapital(token['text']))))
                    char_for1word = []
                    for char in word:
                        char_for1word.append(data.char_alphabet.get_index(char))
                    chars.append(char_for1word)

                    # relative position w.r.t. the former entity; tokens inside it also feed e1_token
                    if i < former_tf_start:
                        positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(
                            former_tf_start - i))
                    elif i > former_tf_end:
                        positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(
                            former_tf_end - i))
                    else:
                        positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(0))
                        former_token.append(
                            data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(entity_word))

                    # relative position w.r.t. the latter entity; tokens inside it also feed e2_token
                    if i < latter_tf_start:
                        positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(
                            latter_tf_start - i))
                    elif i > latter_tf_end:
                        positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(
                            latter_tf_end - i))
                    else:
                        positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(0))
                        latter_token.append(
                            data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(entity_word))

                    i += 1

                if len(former_token) == 0:
                    # the truncated part contains the entity, so we have to use the text in doc_entity
                    # splitted = re.split(r"\s+| +|[\(\)\[\]\-_,]+", former['text'])
                    splitted = my_utils.my_tokenize(former.text)
                    for s in splitted:
                        s = s.strip()
                        if s != "":
                            former_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(
                                my_utils1.normalizeWord(s)))
                if len(latter_token) == 0:
                    # splitted = re.split(r"\s+| +|[\(\)\[\]\-_,]+", latter['text'])
                    splitted = my_utils.my_tokenize(latter.text)
                    for s in splitted:
                        s = s.strip()
                        if s != "":
                            latter_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(
                                my_utils1.normalizeWord(s)))

                assert len(former_token) > 0
                assert len(latter_token) > 0

                features = {'tokens': words, 'postag': postags, 'cap': cap, 'char': chars,
                            'positions1': positions1, 'positions2': positions2}
                if type_constraint == 1:
                    features['e1_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(former.type)
                    features['e2_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(latter.type)
                    features['e1_token'] = former_token
                    features['e2_token'] = latter_token
                else:
                    features['e1_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(latter.type)
                    features['e2_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(former.type)
                    features['e1_token'] = latter_token
                    features['e2_token'] = former_token

                features['tok_num_betw'] = data.re_feature_alphabets[data.re_feature_name2id['[TOKEN_NUM]']].get_index(
                    latter.tf_start - former.tf_end)

                entity_between = getEntitiesBetween(former, latter, entities)
                features['et_num'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_NUM]']].get_index(len(entity_between))

                X.append(features)
                other.append((former, latter))

    return X, other
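# For reference, each element of X produced above is a feature dict of alphabet
# indices with keys 'tokens', 'postag', 'cap', 'char', 'positions1',
# 'positions2', 'e1_type', 'e2_type', 'e1_token', 'e2_token', 'tok_num_betw'
# and 'et_num', while `other` keeps the corresponding (former, latter) entity
# pair so predictions can be mapped back to the document.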
def getRelationInstance2(tokens, entities, relations, names, data):
    X = []
    Y = []
    cnt_neg = 0

    for i in tqdm(range(len(relations))):
        doc_relation = relations[i]
        doc_token = tokens[i]
        doc_entity = entities[i]  # entities are sorted by start offset
        doc_name = names[i]

        row_num = doc_entity.shape[0]

        for latter_idx in range(row_num):
            for former_idx in range(row_num):
                if former_idx < latter_idx:
                    former = doc_entity.iloc[former_idx]
                    latter = doc_entity.iloc[latter_idx]

                    if math.fabs(latter['sent_idx'] - former['sent_idx']) >= data.sent_window:
                        continue

                    # for double annotation, we don't generate instances
                    if former['start'] == latter['start'] and former['end'] == latter['end']:
                        continue

                    # type_constraint = relationConstraint(former['type'], latter['type'])
                    type_constraint = relationConstraint_chapman(former['type'], latter['type'])
                    if type_constraint == 0:
                        continue

                    gold_relations = doc_relation[
                        (
                            ((doc_relation['entity1_id'] == former['id']) & (doc_relation['entity2_id'] == latter['id']))
                            | ((doc_relation['entity1_id'] == latter['id']) & (doc_relation['entity2_id'] == former['id']))
                        )
                    ]
                    if gold_relations.shape[0] > 1:
                        # raise RuntimeError("the same entity pair has more than one relation")
                        logging.debug("entity {} and {} has more than one relation".format(former['id'], latter['id']))
                        continue

                    # here we retrieve all the sentences in between the two entities:
                    # the sentence of former, sentence ..., the sentence of latter
                    sent_idx = former['sent_idx']
                    context_token = pd.DataFrame(columns=doc_token.columns)
                    base = 0
                    former_tf_start, former_tf_end = -1, -1
                    latter_tf_start, latter_tf_end = -1, -1
                    while sent_idx <= latter['sent_idx']:
                        sentence = doc_token[(doc_token['sent_idx'] == sent_idx)]

                        if former['sent_idx'] == sent_idx:
                            former_tf_start, former_tf_end = base + former['tf_start'], base + former['tf_end']
                        if latter['sent_idx'] == sent_idx:
                            latter_tf_start, latter_tf_end = base + latter['tf_start'], base + latter['tf_end']

                        context_token = context_token.append(sentence, ignore_index=True)
                        base += len(sentence['text'])
                        sent_idx += 1

                    if context_token.shape[0] > data.max_seq_len:
                        # truncate
                        logging.debug("exceed max_seq_len {} {}".format(doc_name, context_token.shape[0]))
                        context_token = context_token.iloc[:data.max_seq_len]

                    words = []
                    postags = []
                    cap = []
                    chars = []
                    positions1 = []
                    positions2 = []
                    former_token = []
                    latter_token = []
                    i = 0
                    for _, token in context_token.iterrows():
                        if data.number_normalized:
                            word = utils.functions.normalize_word(token['text'])
                        else:
                            word = token['text']
                        entity_word = my_utils1.normalizeWord(token['text'])
                        words.append(data.word_alphabet.get_index(word))
                        postags.append(data.feature_alphabets[data.feature_name2id['[POS]']].get_index(token['postag']))
                        cap.append(data.feature_alphabets[data.feature_name2id['[Cap]']].get_index(
                            str(my_utils.featureCapital(token['text']))))
                        char_for1word = []
                        for char in word:
                            char_for1word.append(data.char_alphabet.get_index(char))
                        chars.append(char_for1word)

                        if i < former_tf_start:
                            positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(
                                former_tf_start - i))
                        elif i > former_tf_end:
                            positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(
                                former_tf_end - i))
                        else:
                            positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(0))
                            former_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(entity_word))

                        if i < latter_tf_start:
                            positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(
                                latter_tf_start - i))
                        elif i > latter_tf_end:
                            positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(
                                latter_tf_end - i))
                        else:
                            positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(0))
                            latter_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(entity_word))

                        i += 1

                    if len(former_token) == 0:
                        # the truncated part contains the entity, so we have to use the text in doc_entity
                        splitted = my_utils.my_tokenize(former['text'])
                        for s in splitted:
                            s = s.strip()
                            if s != "":
                                former_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(
                                    my_utils1.normalizeWord(s)))
                    if len(latter_token) == 0:
                        splitted = my_utils.my_tokenize(latter['text'])
                        for s in splitted:
                            s = s.strip()
                            if s != "":
                                latter_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(
                                    my_utils1.normalizeWord(s)))

                    assert len(former_token) > 0
                    assert len(latter_token) > 0

                    features = {'tokens': words, 'postag': postags, 'cap': cap, 'char': chars,
                                'positions1': positions1, 'positions2': positions2}
                    if type_constraint == 1:
                        features['e1_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(former['type'])
                        features['e2_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(latter['type'])
                        features['e1_token'] = former_token
                        features['e2_token'] = latter_token
                    else:
                        features['e1_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(latter['type'])
                        features['e2_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(former['type'])
                        features['e1_token'] = latter_token
                        features['e2_token'] = former_token

                    features['tok_num_betw'] = data.re_feature_alphabets[data.re_feature_name2id['[TOKEN_NUM]']].get_index(
                        latter['tf_start'] - former['tf_end'])

                    entity_between = doc_entity[((doc_entity['start'] >= former['end']) & (doc_entity['end'] <= latter['start']))]
                    features['et_num'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_NUM]']].get_index(entity_between.shape[0])

                    X.append(features)

                    if gold_relations.shape[0] == 0:
                        Y.append(data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']].get_index('</unk>'))
                        cnt_neg += 1
                    else:
                        gold_answer = gold_relations.iloc[0]['type']
                        Y.append(data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']].get_index(gold_answer))

    neg = 100.0 * cnt_neg / len(Y)
    logging.info("positive instance {}%, negative instance {}%".format(100 - neg, neg))

    return X, Y
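# Labels built above: entity pairs with no gold relation receive the '</unk>'
# index and are counted as negatives; pairs with exactly one gold relation use
# its type; pairs with more than one gold relation were skipped earlier.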
def load_data(data_dir, mode='train'):
    doc_num = 0
    sent_num = 0
    max_sent_length = 0
    min_sent_length = 9999
    total_sent_length = 0

    documents = []
    alphabet_category = Alphabet('category', True)
    print(os.listdir(data_dir))
    for input_file_name in os.listdir(data_dir):
        alphabet_category.add(input_file_name)
        wb = load_workbook(os.path.join(data_dir, input_file_name))
        sheetnames = wb.get_sheet_names()
        ws = wb.get_sheet_by_name(sheetnames[0])

        for row_idx, row in enumerate(ws.rows):
            if row_idx == 0:
                continue  # skip the header row

            document = Document()
            document.pmid = row[0].value
            document.title = row[1].value
            document.abstract = row[2].value
            if mode == 'train':
                document.relevant_sentences = parseReleventFromExcel(row[3].value)
            document.category = input_file_name

            all_sents_inds = []
            generator = nlp_tool.span_tokenize(document.abstract)
            for t in generator:
                all_sents_inds.append(t)

            for ind in range(len(all_sents_inds)):
                sentence = Sentence()
                sentence.start = all_sents_inds[ind][0]
                sentence.end = all_sents_inds[ind][1]

                offset = 0
                sentence.text = document.abstract[sentence.start:sentence.end]

                if len(document.relevant_sentences) != 0:
                    if sentence.text in document.relevant_sentences:
                        sentence.label = 'yes'
                    else:
                        sentence.label = 'no'
                else:
                    sentence.label = 'no'

                # replace quotes because nltk transforms " into other characters,
                # see https://github.com/nltk/nltk/issues/1630
                sentence.text = sentence.text.replace('"', " ")
                sentence.text = sentence.text.replace('\'', " ")

                for token_txt in my_tokenize(sentence.text):
                    token = {}
                    offset = sentence.text.find(token_txt, offset)
                    if offset == -1:
                        raise RuntimeError("can't find {} in '{}'".format(token_txt, sentence.text))

                    token['text'] = token_txt
                    token['start'] = sentence.start + offset
                    token['end'] = sentence.start + offset + len(token_txt)
                    token['wp'] = wp_tokenizer.tokenize(token_txt)
                    if len(token['wp']) == 0:
                        # for some OOV tokens (e.g., \x99), wp_tokenizer returns an empty list
                        token['wp'] = ['[UNK]']

                    sentence.tokens.append(token)
                    offset += len(token_txt)

                document.sentences.append(sentence)
                sent_num += 1
                total_sent_length += len(sentence.tokens)
                if len(sentence.tokens) > max_sent_length:
                    max_sent_length = len(sentence.tokens)
                if len(sentence.tokens) < min_sent_length:
                    min_sent_length = len(sentence.tokens)

            documents.append(document)
            doc_num += 1

    logging.info("{} statistics".format(data_dir))
    logging.info("doc number {}, sent number {}".format(doc_num, sent_num))
    logging.info("avg sent length {}, max sent length {}, min sent length {}".format(
        total_sent_length // sent_num, max_sent_length, min_sent_length))

    return documents, alphabet_category
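# Shape of the data returned above, as far as this loader populates it:
# each Document carries pmid/title/abstract/category plus, in 'train' mode,
# relevant_sentences; each Sentence carries start/end/text, a 'yes'/'no'
# relevance label and a token list; each token dict holds 'text', document-level
# 'start'/'end' offsets and its WordPiece pieces under 'wp' ('[UNK]' if empty).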
def getRelationInstance(tokens, entities, relations, names, data):
    X = []
    Y = []
    cnt_neg = 0

    for i in tqdm(range(len(relations))):
        doc_relation = relations[i]
        doc_token = tokens[i]
        doc_entity = entities[i]  # entities are sorted by start offset
        doc_name = names[i]

        row_num = doc_entity.shape[0]

        for latter_idx in range(row_num):
            for former_idx in range(row_num):
                if former_idx < latter_idx:
                    former = doc_entity.iloc[former_idx]
                    latter = doc_entity.iloc[latter_idx]

                    if former['text'] == latter['text']:
                        continue

                    gold_relations = doc_relation[(
                        ((doc_relation['entity1_text'] == former['text']) & (doc_relation['entity2_text'] == latter['text']))
                        | ((doc_relation['entity1_text'] == latter['text']) & (doc_relation['entity2_text'] == former['text']))
                    )]
                    # if gold_relations.shape[0] == 0:
                    #     raise RuntimeError("{}: entity {} and {} has strange relations".format(doc_name, former['id'], latter['id']))

                    context_token = doc_token
                    former_tf_start, former_tf_end = former['tf_start'], former['tf_end']
                    latter_tf_start, latter_tf_end = latter['tf_start'], latter['tf_end']

                    if context_token.shape[0] > data.max_seq_len:
                        # truncate
                        logging.debug("exceed max_seq_len {} {}".format(doc_name, context_token.shape[0]))
                        context_token = context_token.iloc[:data.max_seq_len]

                    words = []
                    postags = []
                    cap = []
                    chars = []
                    positions1 = []
                    positions2 = []
                    former_token = []
                    latter_token = []
                    i = 0
                    for _, token in context_token.iterrows():
                        if data.number_normalized:
                            word = normalize_word(token['text'])
                        else:
                            word = token['text']
                        entity_word = my_utils1.normalizeWord(token['text'])
                        words.append(data.word_alphabet.get_index(word))
                        postags.append(data.feature_alphabets[data.feature_name2id['[POS]']].get_index(token['postag']))
                        cap.append(data.feature_alphabets[data.feature_name2id['[Cap]']].get_index(
                            str(my_utils.featureCapital(token['text']))))
                        char_for1word = []
                        for char in word:
                            char_for1word.append(data.char_alphabet.get_index(char))
                        chars.append(char_for1word)

                        if i < former_tf_start:
                            positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(
                                former_tf_start - i))
                        elif i > former_tf_end:
                            positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(
                                former_tf_end - i))
                        else:
                            positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(0))
                            former_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(entity_word))

                        if i < latter_tf_start:
                            positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(
                                latter_tf_start - i))
                        elif i > latter_tf_end:
                            positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(
                                latter_tf_end - i))
                        else:
                            positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(0))
                            latter_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(entity_word))

                        i += 1

                    if len(former_token) == 0:
                        # the truncated part contains the entity, so we have to use the text in doc_entity
                        splitted = my_utils.my_tokenize(former['text'])
                        for s in splitted:
                            s = s.strip()
                            if s != "":
                                former_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(
                                    my_utils1.normalizeWord(s)))
                    if len(latter_token) == 0:
                        splitted = my_utils.my_tokenize(latter['text'])
                        for s in splitted:
                            s = s.strip()
                            if s != "":
                                latter_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(
                                    my_utils1.normalizeWord(s)))

                    assert len(former_token) > 0
                    assert len(latter_token) > 0

                    features = {'tokens': words, 'postag': postags, 'cap': cap, 'char': chars,
                                'positions1': positions1, 'positions2': positions2}
                    features['e1_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(former['type'])
                    features['e2_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(latter['type'])
                    features['e1_token'] = former_token
                    features['e2_token'] = latter_token

                    features['tok_num_betw'] = data.re_feature_alphabets[data.re_feature_name2id['[TOKEN_NUM]']].get_index(
                        latter['tf_start'] - former['tf_end'])

                    entity_between = doc_entity[((doc_entity['start'] >= former['end']) & (doc_entity['end'] <= latter['start']))]
                    features['et_num'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_NUM]']].get_index(entity_between.shape[0])

                    X.append(features)

                    gold_answer = '</unk>'
                    for _, gold_relation in gold_relations.iterrows():
                        if gold_relation['type'] != 'None':
                            gold_answer = gold_relation['type']
                            break

                    Y.append(data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']].get_index(gold_answer))
                    if gold_answer == '</unk>':
                        cnt_neg += 1

                    # if gold_relations.iloc[0]['type'] == 'None' and gold_relations.iloc[1]['type'] == 'None':
                    #     Y.append(data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']].get_index('</unk>'))
                    #     cnt_neg += 1
                    # else:
                    #     gold_answer = gold_relations.iloc[0]['type'] if gold_relations.iloc[0]['type'] != 'None' else gold_relations.iloc[1]['type']
                    #     Y.append(data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']].get_index(gold_answer))

    neg = 100.0 * cnt_neg / len(Y)
    logging.info("positive instance {}%, negative instance {}%".format(100 - neg, neg))

    return X, Y
def load_data(data_dir):
    doc_num = 0
    sent_num = 0
    entity_num = 0
    max_sent_length = 0
    min_sent_length = 9999
    total_sent_length = 0

    documents = []
    for input_file_name in os.listdir(data_dir):
        if input_file_name.find(".txt") != -1:
            document = Document()
            document.name = input_file_name

            ann_file_name = input_file_name.replace(".txt", '.ann')
            if os.path.isfile(os.path.join(data_dir, ann_file_name)):
                with codecs.open(os.path.join(data_dir, ann_file_name), 'r', 'UTF-8') as fp:
                    for line in fp:
                        line = line.strip()
                        if line == '':
                            continue
                        if line[0] == '#':  # ignore annotations for task 2
                            continue
                        entity = {}
                        columns = line.split('\t')
                        entity['id'] = columns[0]
                        columns_1 = columns[1].split(" ")
                        entity['type'] = columns_1[0]
                        entity['start'] = int(columns_1[1])
                        entity['end'] = int(columns_1[2])
                        entity['text'] = columns[2]
                        document.entities.append(entity)
                        entity_num += 1

            with codecs.open(os.path.join(data_dir, input_file_name), 'r', 'UTF-8') as fp:
                document.text = fp.read()

            all_sents_inds = []
            generator = nlp_tool.span_tokenize(document.text)
            for t in generator:
                all_sents_inds.append(t)

            for ind in range(len(all_sents_inds)):
                sentence = Sentence()
                sentence.start = all_sents_inds[ind][0]
                sentence.end = all_sents_inds[ind][1]

                offset = 0
                sentence_txt = document.text[sentence.start:sentence.end]
                # replace quotes because nltk transforms " into other characters,
                # see https://github.com/nltk/nltk/issues/1630
                sentence_txt = sentence_txt.replace('"', " ")
                sentence_txt = sentence_txt.replace('\'', " ")

                for token_txt in my_tokenize(sentence_txt):
                    token = {}
                    offset = sentence_txt.find(token_txt, offset)
                    if offset == -1:
                        raise RuntimeError("can't find {} in '{}'".format(token_txt, sentence_txt))

                    token['text'] = token_txt
                    token['start'] = sentence.start + offset
                    token['end'] = sentence.start + offset + len(token_txt)
                    token['wp'] = wp_tokenizer.tokenize(token_txt)
                    if len(token['wp']) == 0:
                        # for some OOV tokens (e.g., \x99), wp_tokenizer returns an empty list
                        token['wp'] = ['[UNK]']
                    # if len(document.entities) != 0:
                    #     token['label'] = getLabel_BIO(token['start'], token['end'], document.entities)
                    token['label'] = getLabel_BIO(token['start'], token['end'], document.entities)

                    sentence.tokens.append(token)
                    offset += len(token_txt)

                document.sentences.append(sentence)
                sent_num += 1
                total_sent_length += len(sentence.tokens)
                if len(sentence.tokens) > max_sent_length:
                    max_sent_length = len(sentence.tokens)
                if len(sentence.tokens) < min_sent_length:
                    min_sent_length = len(sentence.tokens)

            documents.append(document)
            doc_num += 1

    logging.info("{} statistics".format(data_dir))
    logging.info("doc number {}, sent number {}, entity number {}".format(doc_num, sent_num, entity_num))
    logging.info("avg sent length {}, max sent length {}, min sent length {}".format(
        total_sent_length // sent_num, max_sent_length, min_sent_length))

    return documents
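# For context, the .ann lines parsed above follow the brat standoff convention,
# e.g. (illustrative): "T1\tDrug 10 17\tAspirin" -> id 'T1', type 'Drug',
# start 10, end 17, text 'Aspirin'; lines starting with '#' (notes) are skipped.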