def iter_mentions(doc, window=1):
    """Yield ``(target, doc id, mention text, span)`` for each link in *doc*.

    The mention context is the sentence containing the link, widened to
    ``window`` sentences total.  ``span`` holds (start, stop) character
    offsets of the link relative to the mention text.  Mentions that cover
    the whole context, or that look like list items / sentence fragments,
    are skipped.
    """
    sent_spans = list(iter_sent_spans(doc['text']))
    sent_offsets = [s.start for s in sent_spans]
    for link in doc['links']:
        # align the link span over sentence spans in the document
        # mention span may cross sentence bounds if sentence tokenisation is dodgy
        # if so, the entire span between bounding sentences will be used as context
        sent_start_idx = bisect_right(sent_offsets, link['start']) - 1
        sent_end_idx = bisect_left(sent_offsets, link['stop']) - 1

        # FIX: floor division — `window / 2` is true division under Python 3,
        # yielding a float that breaks the index arithmetic below.
        lhs_offset = window // 2
        rhs_offset = (window - lhs_offset) - 1
        sent_start_idx = max(0, sent_start_idx - lhs_offset)
        sent_end_idx = min(len(sent_spans) - 1, sent_end_idx + rhs_offset)

        sent_offset = sent_spans[sent_start_idx].start
        span = (link['start'] - sent_offset, link['stop'] - sent_offset)
        target = trim_link_subsection(link['target'])
        target = trim_link_protocol(target)
        mention = doc['text'][sent_spans[sent_start_idx].start:sent_spans[sent_end_idx].stop]

        # filter out instances where the mention span is the entire sentence
        if span == (0, len(mention)):
            continue
        # filter out list item sentences
        sm = mention.strip()
        if not sm or sm.startswith('*') or sm[-1] not in '.!?"\'':
            continue
        yield target, doc['_id'], mention, span
def iter_mentions(doc, window=1):
    """Yield ``(target, (span, mention text))`` for each link in *doc*.

    The mention context is the sentence containing the link, widened to
    ``window`` sentences total.  ``span`` holds (start, stop) character
    offsets of the link relative to the mention text.  Mentions that cover
    the whole context, or that look like list items / sentence fragments,
    are skipped.
    """
    sent_spans = list(iter_sent_spans(doc['text']))
    sent_offsets = [s.start for s in sent_spans]
    for link in doc['links']:
        # align the link span over sentence spans in the document
        # mention span may cross sentence bounds if sentence tokenisation is dodgy
        # if so, the entire span between bounding sentences will be used as context
        sent_start_idx = bisect_right(sent_offsets, link['start']) - 1
        sent_end_idx = bisect_left(sent_offsets, link['stop']) - 1

        # FIX: floor division — `window / 2` is true division under Python 3,
        # yielding a float that breaks the index arithmetic below.
        lhs_offset = window // 2
        rhs_offset = (window - lhs_offset) - 1
        sent_start_idx = max(0, sent_start_idx - lhs_offset)
        sent_end_idx = min(len(sent_spans) - 1, sent_end_idx + rhs_offset)

        sent_offset = sent_spans[sent_start_idx].start
        span = (link['start'] - sent_offset, link['stop'] - sent_offset)
        target = trim_link_subsection(link['target'])
        target = trim_link_protocol(target)
        mention = doc['text'][sent_spans[sent_start_idx].start:sent_spans[sent_end_idx].stop]

        # filter out instances where the mention span is the entire sentence
        if span == (0, len(mention)):
            continue
        # filter out list item sentences
        sm = mention.strip()
        if not sm or sm.startswith('*') or sm[-1] not in '.!?"\'':
            continue
        yield target, (span, mention)
def iter_unique_links(doc):
    """Yield each normalised link target in *doc* once, in first-seen order."""
    seen = set()
    for raw in doc['links']:
        target = trim_link_protocol(trim_link_subsection(raw['target']))
        if target in seen:
            continue
        seen.add(target)
        yield target
def iter_anchor_target_pairs(self, doc):
    """Yield ``(anchor text, normalised target)`` for each link in *doc*.

    Pairs with an empty anchor or empty target are skipped; anchors are
    lowercased when ``self.lowercase`` is set.
    """
    text = doc['text']
    for link in doc['links']:
        target = trim_link_protocol(trim_link_subsection(link['target']))
        anchor = text[link['start']:link['stop']].strip()
        if self.lowercase:
            anchor = anchor.lower()
        if anchor and target:
            yield anchor, target
def iter_mentions(doc):
    """Yield ``(normalised target, sentence context)`` for each link in *doc*."""
    text = doc["text"]
    sent_spans = list(iter_sent_spans(text))
    starts = [span.start for span in sent_spans]
    for link in doc["links"]:
        # locate the sentence(s) that cover the link's character span
        first = bisect_right(starts, link["start"]) - 1
        last = bisect_left(starts, link["stop"]) - 1
        target = trim_link_protocol(trim_link_subsection(link["target"]))
        # mention span may cross sentence bounds if sentence tokenisation is dodgy;
        # if so, the entire span between bounding sentences is used as context
        yield target, text[sent_spans[first].start : sent_spans[last].stop]
def iter_comentions(links):
    """Yield ``(link, Counter of co-mentioned links)`` for each unique link.

    *links* is an iterable of link dicts with a ``'target'`` key; targets
    are normalised (subsection and protocol trimmed) and deduplicated
    before pairing.  Because the targets are unique, each co-mention
    count in the Counter is 1.
    """
    targets = list(set(trim_link_protocol(trim_link_subsection(l['target']))
                       for l in links))
    # FIX: enumerate instead of Python-2-only `xrange`, which raises
    # NameError under Python 3; behaviour is identical on both versions.
    for i, target in enumerate(targets):
        yield target, Counter(targets[:i] + targets[i + 1:])