def try_extract(self, _, future):
    """Try to find an address.

    Recognised shape: street, house number, optional comma, optional
    zip code, town, then an optional comma-separated country.  On
    success the matched tokens are consumed from `future` and an
    AddressEntity is returned; otherwise returns None and leaves the
    stream untouched.
    """
    # Addresses start with a capitalised street name or a numeral.
    if not (future[0].word[0].isupper() or future[0].word[0].isdigit()):
        return None
    ent = AddressEntity('', future[0].position)
    ok, idx = self.try_street(future, 0)
    if not ok:
        self.debug('Abort: no street')
        return None
    # join_tokens returns (whitespace, text); keep only the text part.
    ent.attrs['ner:addr:street'] = join_tokens(future[:idx])[1]
    ok, idx = self.try_house_number(future, ent, idx)
    if not ok:
        self.debug('Missing house number')
        return None
    # An optional comma may separate the house number from the zip/town.
    if future[idx].word == ',':
        self.debug('Found optional comma')
        idx += 1
    if future[idx].typ == Token.ZIP_CODE:
        ent.attrs['ner:addr:zip_code'] = future[idx].word
        idx += 1
    else:
        self.debug('Missing zip code')
    old_idx = idx
    ok, idx = self.try_town(future, idx)
    if not ok:
        self.debug('Abort: no town')
        return None
    ent.attrs['ner:addr:town'] = join_tokens(future[old_idx:idx])[1]
    found_comma = False
    if future[idx].word == ',':
        self.debug('Found comma after town')
        found_comma = True
        idx += 1
    old_idx = idx
    ok, idx = self.try_country(future, idx)
    if not ok:
        self.debug('No country')
        # No country followed the comma: give the comma back so it is
        # not swallowed into the entity.
        if found_comma:
            idx -= 1
    else:
        ent.attrs['ner:addr:country'] = join_tokens(future[old_idx:idx])[1]
    # Consume everything matched and store the joined surface form.
    tokens = future.popmany(idx)
    ent.whitespace, ent.word = join_tokens(tokens)
    return ent
def try_extract(self, _, future):
    """Check if future starts with non-Czech phrase.

    Tries every known language and keeps the longest match.  Matches
    shorter than two tokens, or consisting only of words shorter than
    LIMIT characters, are ignored.  Returns a LanguageEntity covering
    the consumed tokens, or None when nothing matched.
    """
    # Phrases must start with a letter.
    if not future[0].word[0].isalpha():
        return None
    max_len = 0
    max_lang = None
    # Iterate the dict directly instead of .keys().
    for lang in self.words:
        self.debug('Trying %s starting from %s', lang, future[0].word)
        length = self.try_lang(lang, future)
        if length > max_len:
            max_len = length
            max_lang = lang
    if max_len < 2:
        self.debug('Ignore short case')
        return None
    # Generator expression avoids materializing a throwaway list;
    # max_len >= 2 here, so max() never sees an empty sequence.
    if max(len(future[i].word) for i in range(max_len)) < LIMIT:
        self.debug('Too short words')
        return None
    tokens = future.popmany(max_len)
    whitespace, text = join_tokens(tokens)
    ent = LanguageEntity(text, max_lang, tokens[0].position)
    ent.whitespace = whitespace
    return ent
def read_docs(phase='starting_spans'):
    """Load EBM-NLP documents together with their aggregated gold labels.

    Builds one classes.Doc per PMID listed in the per-group pmid files,
    attaches its group, and converts token-level label runs into
    character-level Span objects under GOLD_p / GOLD_i / GOLD_o.
    """
    # Map every PMID to the group whose pmid list mentions it.
    pmid_groups = {
        pmid: group
        for group in GROUPS
        for pmid in utils.readlines(
            os.path.join(config.EBM_NLP_DIR, 'pmids_{}.txt'.format(group)))
    }

    def get_e_fname(pmid, e):
        # Gold annotations for the test split live under test/gold.
        subdir = os.path.join('test', 'gold') if pmid_groups[pmid] == 'test' else 'train'
        fname = '{}.AGGREGATED.ann'.format(pmid)
        return os.path.join(config.EBM_NLP_DIR, 'annotations', 'aggregated',
                            phase, e, subdir, fname)

    docs = []
    for pmid, group in pmid_groups.items():
        token_path = os.path.join(config.EBM_NLP_DIR, 'documents',
                                  '{}.tokens'.format(pmid))
        text, token_offsets = utils.join_tokens(utils.readlines(token_path))
        doc = classes.Doc(pmid, text)
        doc.group = group
        for e in ['participants', 'interventions', 'outcomes']:
            label_name = 'GOLD_{}'.format(e[0])
            labels = [int(l) for l in utils.readlines(get_e_fname(pmid, e))]
            # Turn runs of identical token labels into character spans.
            for token_i, token_f, l in utils.condense_labels(labels):
                char_i = token_offsets[token_i][0]
                char_f = token_offsets[token_f - 1][1]
                doc.labels[label_name].append(
                    classes.Span(char_i, char_f, text[char_i:char_f]))
        docs.append(doc)
    return docs
def try_house_number(self, future, ent, idx):
    """
    Try to recognise house number, optionally with orientation number.
    """
    token = future[idx]
    if not is_number(token.word):
        return False, idx
    # Compound form <house>/<orientation>, e.g. "123/4": take all
    # three tokens as one attribute value.
    if future[idx + 1].word == '/' and is_number(future[idx + 2].word):
        _, num = join_tokens(future[idx:idx + 3])
        ent.attrs['ner:addr:house'] = num
        return True, idx + 3
    # Plain house number: a single numeric token.
    self.debug('Found house number <%s>', token.word)
    ent.attrs['ner:addr:house'] = token.word
    return True, idx + 1
def try_town(self, future, idx):
    """
    Try to extract town name from stream.

    It will take the longest available name.
    """
    self.debug('Finding town')
    window = []
    best_len = 0
    limit = min(TOWN_LIMIT, len(future) - idx)
    # Grow the candidate window one token at a time and remember the
    # longest prefix that is a known town name.
    for length in range(1, limit + 1):
        window.append(future[idx + length - 1])
        _, candidate = join_tokens(window)
        self.debug('Testing <%s>', candidate)
        if candidate in self.towns:
            best_len = length
            self.debug('Found town <%s>', candidate)
    return (best_len > 0, idx + best_len)
def finalize(self, is_improper):
    """All names have been added, finalize remaining attributes."""
    self.attrs['tag'] = '|'.join(self.possible_tags)
    # Trim trailing words rejected by is_improper from the name.
    keep = len(self.words)
    while is_improper(self.words[keep - 1].word):
        keep -= 1
    self.words = self.words[:keep]
    self.set_position(self.words[0].position)
    self.whitespace, self.word = join_tokens(self.words)
    # For every name part pick the most frequent lemma candidate whose
    # tag is one of the possible tags; parts with no candidate are
    # dropped from the final lemma.
    lemma_parts = []
    for name_part in self.lemmas:
        candidates = [cand[0] for cand in name_part
                      if cand[1] in self.possible_tags]
        if candidates:
            lemma_parts.append(most_frequent(candidates))
    self.attrs['lemma'] = ' '.join(lemma_parts)
def try_street(self, future, idx):
    """
    Try to extract street or town from token stream.

    The longest option will be taken.  Returns (found, new_idx) where
    new_idx points just past the matched name (including a trailing
    period, if any).
    """
    current = []
    longest = 0
    for i in range(min(STREET_LIMIT, len(future) - idx)):
        current.append(future[idx + i])
        _, word = join_tokens(current)
        self.debug('Testing <%s>', word)
        # If there is a period without any spaces around it, add one
        word = re.sub(r'(?<!\s)\.(?!\s)', '. ', word)
        if word in self.streets or word in self.towns:
            longest = i + 1
            self.debug('Found street <%s>', word)
    # Swallow a period directly after the matched name.
    # BUG FIX: index relative to `idx` (the original read
    # future[longest], which is only correct when idx == 0) and never
    # extend an empty match — previously a leading period token would
    # have been reported as a successful match of length 1.
    if longest > 0 and future[idx + longest].word == '.':
        longest += 1
    return (longest > 0, idx + longest)
def try_extract_entry(self, history, future):
    """
    Try to find a phrase in the future tokens.
    """
    if not is_possible_start(future[0].word):
        return None
    buffer = []
    best_seq = ''
    best_len = 0
    # NOTE(review): the scan bound is len(future) - 1, so the very last
    # available token is never added to the candidate — confirm this
    # lookahead reserve is intended (siblings bound by len(future) - idx).
    for pos in range(min(LIMIT, len(future) - 1)):
        buffer.append(future[pos])
        candidate = ''.join(to_join(t) for t in buffer).lstrip()
        candidate = candidate.replace(' ,', ',').replace(' .', '.')
        self.debug('current: <%s>', candidate)
        if self.lookup_sequence(candidate):
            best_len = pos + 1
            best_seq = candidate
            self.debug('Found match of len %d', best_len)
    self.debug('Longest match was: %d', best_len)
    if not self.check_length(best_len, history, future):
        return None
    matched = future.popmany(best_len)
    whitespace, text = join_tokens(matched)
    ent = PhraseEntity(text, matched[0].position)
    ent.whitespace = whitespace
    lemma = self.find_lemma(best_seq)
    if lemma:
        ent.attrs['lemma'] = lemma
    ent.add_categories(self.get_categories(best_seq))
    return ent
def __init__(self, tokens):
    """Create a COrganisation entity spanning the given tokens."""
    ws, surface = join_tokens(tokens)
    ner.NamedEntity.__init__(self, surface, None, 'COrganisation', None)
    self.set_position(tokens[0].position)
    self.set_src("OrganisationNer")
    self.whitespace = ws
def build_entity(future, n, typ):
    """Create a pattern entity of type `typ` from `n` tokens."""
    consumed = future.popmany(n)
    ws, text = join_tokens(consumed)
    return PatternEntity(text, typ, consumed[0].position, ws)
def get_word(self):
    """Return actual current name."""
    _, word = join_tokens(self.words)
    return word