def convert_text2figer_format(sent, sent_counter, ent_mid):
    """Convert one annotated sentence into tab-separated token/tag lines.

    The sentence is split on single spaces.  The first token for which
    ``has_ent(token, ent_mid)`` is true is expanded with ``getentparts`` and
    tagged ``B-E`` (first entity token) / ``I-E`` (remaining entity tokens).
    Any other token containing ``'/m/'`` is expanded the same way but every
    resulting token is tagged ``O``.  All remaining tokens are tagged ``O``.

    Args:
        sent: The annotated sentence; surrounding whitespace is stripped.
        sent_counter: Identifier of the sentence, written into the index
            record via ``str()``.
        ent_mid: Passed through to ``has_ent`` to select the target entity.

    Returns:
        A ``(formatted, ent_inds)`` tuple.  ``formatted`` is the concatenated
        ``"<token>\\t<tag>\\n"`` lines.  ``ent_inds`` holds at most one
        tab-separated record: sentence counter, index of the entity token in
        the space-split *input* sentence, mid, ``str(ent_tokens)`` and the
        notable type.
    """
    # NOTE(review): this file defines convert_text2figer_format twice with the
    # same body; the later definition is the one that ends up bound.  Consider
    # removing one of them.
    rows = []
    ent_inds = []
    first_occr = True
    for token_counter, token in enumerate(sent.strip().split(' ')):
        # has_ent is evaluated first for every token, as in the original
        # condition order, in case it is not a pure predicate.
        if has_ent(token, ent_mid) and first_occr:
            mid, ent_tokens, notabletype = getentparts(token)
            rows.append(ent_tokens[0] + '\tB-E\n')
            rows.extend(part + '\tI-E\n' for part in ent_tokens[1:])
            ent_inds.append('\t'.join([
                str(sent_counter),
                str(token_counter),
                mid,
                str(ent_tokens),
                notabletype,
            ]))
            # Only the first matching mention is tagged as the entity.
            first_occr = False
        elif '/m/' in token:
            _, ent_tokens, _ = getentparts(token)
            rows.extend(part + '\tO\n' for part in ent_tokens)
        else:
            rows.append(token + '\tO\n')
    return (''.join(rows), ent_inds)
def convert_text2figer_format(sent, sent_counter, ent_mid):
    """Convert one annotated sentence into tab-separated token/tag lines.

    The sentence is split on single spaces.  The first token for which
    ``has_ent(token, ent_mid)`` is true is expanded with ``getentparts`` and
    tagged ``B-E`` (first entity token) / ``I-E`` (remaining entity tokens).
    Any other token containing ``'/m/'`` is expanded the same way but every
    resulting token is tagged ``O``.  All remaining tokens are tagged ``O``.

    Args:
        sent: The annotated sentence; surrounding whitespace is stripped.
        sent_counter: Identifier of the sentence, written into the index
            record via ``str()``.
        ent_mid: Passed through to ``has_ent`` to select the target entity.

    Returns:
        A ``(formatted, ent_inds)`` tuple.  ``formatted`` is the concatenated
        ``"<token>\\t<tag>\\n"`` lines.  ``ent_inds`` holds at most one
        tab-separated record: sentence counter, index of the entity token in
        the space-split *input* sentence, mid, ``str(ent_tokens)`` and the
        notable type.
    """
    # NOTE(review): this file defines convert_text2figer_format twice with the
    # same body; the later definition is the one that ends up bound.  Consider
    # removing one of them.
    rows = []
    ent_inds = []
    first_occr = True
    for token_counter, token in enumerate(sent.strip().split(' ')):
        # has_ent is evaluated first for every token, as in the original
        # condition order, in case it is not a pure predicate.
        if has_ent(token, ent_mid) and first_occr:
            mid, ent_tokens, notabletype = getentparts(token)
            rows.append(ent_tokens[0] + '\tB-E\n')
            rows.extend(part + '\tI-E\n' for part in ent_tokens[1:])
            ent_inds.append('\t'.join([
                str(sent_counter),
                str(token_counter),
                mid,
                str(ent_tokens),
                notabletype,
            ]))
            # Only the first matching mention is tagged as the entity.
            first_occr = False
        elif '/m/' in token:
            _, ent_tokens, _ = getentparts(token)
            rows.extend(part + '\tO\n' for part in ent_tokens)
        else:
            rows.append(token + '\tO\n')
    return (''.join(rows), ent_inds)
def getrawsent(sent):
    """Return *sent* with entity-annotated tokens replaced by plain tokens.

    The sentence is stripped and split on single spaces.  Every token
    containing ``'/m/'`` is expanded with ``getentparts`` and replaced by the
    entity tokens it yields; all other tokens are kept unchanged.

    Args:
        sent: The annotated sentence.

    Returns:
        The resulting tokens joined by single spaces, with leading and
        trailing whitespace stripped.
    """
    words = []
    for token in sent.strip().split(' '):
        if '/m/' in token:
            _, ent_tokens, _ = getentparts(token)
            words.extend(ent_tokens)
        else:
            words.append(token)
    # Joining then stripping yields the same string as appending
    # "<token> " repeatedly and stripping the result.
    return ' '.join(words).strip()
def filter_sentences(sampled_lines):
    """Extract the plain-text sentence from each tab-separated line.

    For every line, the fifth tab-separated field (index 4) is taken as the
    sentence and split on whitespace.  Words containing ``'/m/'`` are expanded
    with ``getentparts`` and replaced by their space-joined entity tokens;
    other words are kept unchanged.

    Args:
        sampled_lines: Iterable of tab-separated strings with at least five
            fields each.

    Returns:
        A list with one cleaned, space-joined sentence per input line.
    """

    def surface(word):
        # Plain words pass through; annotated ones become their entity text.
        if '/m/' not in word:
            return word
        _, tokens, _ = getentparts(word)
        return ' '.join(tokens).strip()

    return [
        ' '.join(surface(word) for word in record.split('\t')[4].split()).strip()
        for record in sampled_lines
    ]
def fillUsingLines(linespath):
    """Count how often each surface name is used for each entity mid.

    Every line of the file is split on tabs and its fifth field (index 4) is
    split on whitespace.  Each word containing ``'/m/'`` is expanded with
    ``getentparts``; the space-joined entity tokens form the name that is
    counted under the word's mid.

    Args:
        linespath: Path of the tab-separated lines file to read.

    Returns:
        A ``defaultdict`` mapping mid -> (name -> occurrence count).

    Raises:
        IndexError: If a line has fewer than five tab-separated fields.
    """
    e2name2freq = defaultdict(dict)
    # The context manager closes the file even if a malformed line or
    # getentparts raises part-way through.
    with open(linespath) as f:
        for line in f:
            sentence = line.split('\t')[4]
            for w in sentence.split():
                if '/m/' not in w:
                    continue
                mid, tokens, _ = getentparts(w)
                name = ' '.join(tokens)
                if mid not in e2name2freq:
                    # int() returns 0, so this counts exactly like the former
                    # ``lambda: 0`` factory while keeping the result picklable.
                    e2name2freq[mid] = defaultdict(int)
                e2name2freq[mid][name] += 1
    return e2name2freq