def load_lines(mylineformat_file):
    """Load entity-annotated sentences from a tab-separated file.

    Every line must consist of exactly five tab-separated fields; field 1
    (0-based) is the entity MID and field 4 is the text.  The text is split
    with ``getsentences`` and only the first sub-sentence for which
    ``has_ent`` is true contributes to the output, so each input line yields
    at most one sentence.

    Args:
        mylineformat_file: Path of the input file.

    Returns:
        A tuple ``(sents, all_ent_ins, raw_sents)`` of three lists of equal
        length: the formatted sentence followed by an extra newline, the
        entity index records returned by ``convert_text2figer_format``, and
        the result of ``getrawsent`` for the same sentence.

    Raises:
        AssertionError: If a line does not have exactly five fields (the
            offending line is printed first).
    """
    sents = []
    all_ent_ins = []
    raw_sents = []
    sent_counter = 0
    # The context manager closes the file even when a malformed line aborts
    # the loop through the assertion below.
    with open(mylineformat_file) as f:
        for line in f:
            parts = line.split('\t')
            if len(parts) != 5:
                # Show the offending line before the assertion fires.  The
                # parenthesised form prints the same under Python 2 and 3.
                print(line)
            assert len(parts) == 5
            ent_mid = parts[1].strip()
            text = parts[4]
            subsents = getsentences(text)
            for sent in subsents:
                if not has_ent(sent, ent_mid):
                    continue
                if len(subsents) > 1:
                    # presumably restores the full stop dropped by the
                    # sentence splitter -- TODO confirm against getsentences
                    sent = sent.strip() + ' .'
                formatted_sent, ent_inds = convert_text2figer_format(
                    sent, sent_counter, ent_mid)
                sents.append(formatted_sent + '\n')
                all_ent_ins.append(ent_inds)
                raw_sents.append(getrawsent(sent))
                sent_counter += 1
                break  # only the first matching sentence of the line
    return (sents, all_ent_ins, raw_sents)
def convert_text2figer_format(sent, sent_counter, ent_mid):
    """Convert a sentence to one-token-per-line text tagged B-E / I-E / O.

    The stripped sentence is split on single spaces.  The first token for
    which ``has_ent(token, ent_mid)`` is true is expanded with
    ``getentparts``: its first entity token is tagged ``B-E`` and the
    remaining ones ``I-E``.  Any other token containing ``'/m/'`` (including
    later occurrences of the target entity) is expanded the same way but
    every part is tagged ``O``.  All remaining tokens are emitted unchanged
    with tag ``O``.

    Args:
        sent: The sentence to convert.
        sent_counter: Sentence index written into the entity record.
        ent_mid: Identifier of the target entity, passed to ``has_ent``.

    Returns:
        A tuple ``(formatted, ent_inds)``.  ``formatted`` is the
        concatenation of ``"<token>\\t<tag>\\n"`` lines.  ``ent_inds`` holds
        at most one tab-separated record: sentence index, token index, mid,
        ``str(ent_tokens)`` and notable type.
    """
    lines = []  # joined once at the end instead of repeated string +=
    ent_inds = []
    found = False
    # NOTE(review): token_counter is the index into the space-split input
    # tokens, not the index of the emitted line; the two differ once an
    # earlier '/m/' token has expanded to several lines -- confirm that
    # consumers of the record expect the former.
    for token_counter, token in enumerate(sent.strip().split(' ')):
        # has_ent is evaluated first for every token, as in the original.
        if has_ent(token, ent_mid) and not found:
            mid, ent_tokens, notabletype = getentparts(token)
            lines.append(ent_tokens[0] + '\tB-E\n')
            lines.extend(t + '\tI-E\n' for t in ent_tokens[1:])
            ent_inds.append('\t'.join([
                str(sent_counter),
                str(token_counter),
                mid,
                str(ent_tokens),
                notabletype,
            ]))
            found = True
        elif '/m/' in token:
            # Another entity mention: keep its tokens but tag them as outside.
            _, ent_tokens, _ = getentparts(token)
            lines.extend(t + '\tO\n' for t in ent_tokens)
        else:
            lines.append(token + '\tO\n')
    return (''.join(lines), ent_inds)