def apply(self, context):
    """Yield all n-gram TemporarySpans (up to self.n_max tokens) of a Sentence.

    N-grams are emitted longest-first (to facilitate longest-match
    semantics). When ``self.split_rgx`` is set, single tokens matching the
    regex are additionally split into two sub-spans around capture group 1.

    Args:
        context: a Sentence-like context with ``char_offsets``, ``words``,
            and ``text`` attributes.

    Yields:
        Distinct TemporarySpan objects over ``context``.
    """
    # These are the character offsets--**relative to the sentence start**--
    # for each _token_
    offsets = context.char_offsets

    # Loop over all n-grams in **reverse** order (to facilitate
    # longest-match semantics)
    L = len(offsets)
    seen = set()
    for l in range(1, self.n_max + 1)[::-1]:
        for i in range(L - l + 1):
            w = context.words[i + l - 1]
            start = offsets[i]
            # char_end is inclusive: last char of the last token
            end = offsets[i + l - 1] + len(w) - 1
            ts = TemporarySpan(char_start=start, char_end=end,
                               sentence=context)
            if ts not in seen:
                seen.add(ts)
                yield ts

            # Check for split
            # NOTE: For simplicity, we only split single tokens right now!
            if l == 1 and self.split_rgx is not None and end - start > 0:
                # Offsets may be absolute; subtract offsets[0] to index into
                # the sentence-local text
                m = re.search(
                    self.split_rgx,
                    context.text[start - offsets[0]:end - offsets[0] + 1])
                if m is not None and l < self.n_max + 1:
                    # Left sub-span: everything before capture group 1
                    ts1 = TemporarySpan(char_start=start,
                                        char_end=start + m.start(1) - 1,
                                        sentence=context)
                    if ts1 not in seen and ts1.get_span():
                        seen.add(ts1)
                        yield ts1
                    # Right sub-span: everything after capture group 1
                    ts2 = TemporarySpan(char_start=start + m.end(1),
                                        char_end=end, sentence=context)
                    # BUG FIX: guard on ts2.get_span() (was ts1.get_span(),
                    # a copy-paste error that validated the wrong span)
                    if ts2 not in seen and ts2.get_span():
                        seen.add(ts2)
                        yield ts2
def apply(self, context):
    # Scan the raw sentence text for '|' characters and yield a
    # TemporarySpan for each pair of consecutive delimiters found: the span
    # runs from the current scan position `i` (inclusive) through the index
    # of the *second* '|' located.
    # NOTE(review): if the text from position i+1 onward contains no '|',
    # the inner `while text[j]!='|'` scans will walk past the end of the
    # string and raise IndexError -- this appears to assume well-formed
    # '|'-delimited input; confirm with callers.
    seen = set()
    text=context.text # gets sentence as string
    #print(text)
    i=0
    while i < len(text)-1:
        # Find the first '|' at or after position i+1
        j=i+1
        while text[j]!='|':
            j=j+1 # must continue until next one
        # k marks the restart position (just past the first delimiter)
        k=j+1
        j=j+1
        if j<len(text)-1:
            #print(j,len(text))
            # Find the second '|' -- this closes the span
            while text[j]!='|':
                j=j+1
            start=i
            end=j
            #i=j+1
            # Restart the scan just past the FIRST delimiter, so spans
            # overlap (each '|' serves as both a close and a subsequent open)
            i=k
            #print(start,end)
            #print(text[start:end])
            ts = TemporarySpan(char_start=start, char_end=end, sentence=context)
            #print(ts)
            if ts not in seen:
                seen.add(ts)
                yield ts
        else:
            # First delimiter found at/near the end of text: stop scanning
            i=j
def apply(self, context):
    """Yield all n-gram TemporarySpans (up to self.n_max tokens) of a Sentence.

    N-grams are emitted longest-first (to facilitate longest-match
    semantics). When ``self.split_rgx`` is set, single tokens matching the
    regex are additionally split into two sub-spans around capture group 1.

    Args:
        context: a Sentence-like context with ``char_offsets``, ``words``,
            and ``text`` attributes.

    Yields:
        Distinct TemporarySpan objects over ``context``.
    """
    # These are the character offsets--**relative to the sentence start**--
    # for each _token_
    offsets = context.char_offsets

    # Loop over all n-grams in **reverse** order (to facilitate
    # longest-match semantics)
    L = len(offsets)
    seen = set()
    for l in range(1, self.n_max + 1)[::-1]:
        for i in range(L - l + 1):
            w = context.words[i + l - 1]
            start = offsets[i]
            # char_end is inclusive: last char of the last token
            end = offsets[i + l - 1] + len(w) - 1
            ts = TemporarySpan(char_start=start, char_end=end,
                               sentence=context)
            if ts not in seen:
                seen.add(ts)
                yield ts

            # Check for split
            # NOTE: For simplicity, we only split single tokens right now!
            if l == 1 and self.split_rgx is not None and end - start > 0:
                # Offsets may be absolute; subtract offsets[0] to index into
                # the sentence-local text
                m = re.search(
                    self.split_rgx,
                    context.text[start - offsets[0]:end - offsets[0] + 1])
                if m is not None and l < self.n_max + 1:
                    # Left sub-span: everything before capture group 1
                    ts1 = TemporarySpan(char_start=start,
                                        char_end=start + m.start(1) - 1,
                                        sentence=context)
                    if ts1 not in seen:
                        seen.add(ts1)
                        # BUG FIX: previously yielded `ts` (the full token,
                        # already emitted above) instead of the left
                        # sub-span `ts1`, so ts1 was added to `seen` but
                        # never actually yielded.
                        yield ts1
                    # Right sub-span: everything after capture group 1
                    ts2 = TemporarySpan(char_start=start + m.end(1),
                                        char_end=end, sentence=context)
                    if ts2 not in seen:
                        seen.add(ts2)
                        yield ts2
def apply(self, context, clear, split, check_for_existing=True, **kwargs):
    """Extract Candidates from a Context.

    Builds entity Spans from per-token entity-type/CID annotations on the
    Sentence, then yields one Candidate per combination of Spans (one per
    entity type in ``self.entity_types``).

    Args:
        context: the Context to extract from; only Sentence is supported.
        clear: accepted for interface compatibility; unused here.
        split: split id stored on each generated Candidate.
        check_for_existing: if True, query the session and skip candidates
            that are already persisted.

    Yields:
        Instances of ``self.candidate_class``.

    Raises:
        NotImplementedError: if ``context`` is not a Sentence.
    """
    # For now, just handle Sentences
    if not isinstance(context, Sentence):
        raise NotImplementedError(
            "%s is currently only implemented for Sentence contexts."
            % self.__name__)

    # Do a first pass to collect all mentions by entity type / cid
    entity_idxs = dict(
        (et, defaultdict(list)) for et in set(self.entity_types))
    L = len(context.words)
    for i in range(L):
        if context.entity_types[i] is not None:
            # A token may carry multiple (type, cid) annotations, joined by
            # self.entity_sep
            ets = context.entity_types[i].split(self.entity_sep)
            cids = context.entity_cids[i].split(self.entity_sep)
            for et, cid in zip(ets, cids):
                if et in entity_idxs:
                    entity_idxs[et][cid].append(i)

    # Form entity Spans by merging runs of consecutive token indices
    entity_spans = defaultdict(list)
    entity_cids = {}
    for et, cid_idxs in iteritems(entity_idxs):
        # FIX: iterate the already-bound cid_idxs instead of redundantly
        # re-looking-up entity_idxs[et] (same dict object)
        for cid, idxs in iteritems(cid_idxs):
            while len(idxs) > 0:
                i = idxs.pop(0)
                char_start = context.char_offsets[i]
                char_end = char_start + len(context.words[i]) - 1
                # Extend the span while the next token index is adjacent
                while len(idxs) > 0 and idxs[0] == i + 1:
                    i = idxs.pop(0)
                    char_end = context.char_offsets[i] + len(
                        context.words[i]) - 1

                # Insert / load temporary span, also store map to entity CID
                tc = TemporarySpan(char_start=char_start,
                                   char_end=char_end, sentence=context)
                tc.load_id_or_insert(self.session)
                entity_cids[tc.id] = cid
                entity_spans[et].append(tc)

    # Generates and persists candidates
    candidate_args = {'split': split}
    for args in product(
            *[enumerate(entity_spans[et]) for et in self.entity_types]):

        # TODO: Make this work for higher-order relations
        if self.arity == 2:
            ai, a = args[0]
            bi, b = args[1]

            # Check for self-joins, "nested" joins (joins from span to its
            # subspan), and flipped duplicate "symmetric" relations
            if not self.self_relations and a == b:
                continue
            elif not self.nested_relations and (a in b or b in a):
                continue
            elif not self.symmetric_relations and ai > bi:
                continue

        # Assemble candidate arguments
        for i, arg_name in enumerate(self.candidate_class.__argnames__):
            candidate_args[arg_name + '_id'] = args[i][1].id
            candidate_args[arg_name + '_cid'] = entity_cids[args[i][1].id]

        # Checking for existence
        if check_for_existing:
            q = select([self.candidate_class.id])
            for key, value in iteritems(candidate_args):
                q = q.where(getattr(self.candidate_class, key) == value)
            candidate_id = self.session.execute(q).first()
            if candidate_id is not None:
                continue

        # Add Candidate to session
        yield self.candidate_class(**candidate_args)
def apply(self, context, clear, split, check_for_existing=True, **kwargs):
    """Extract Candidates from a Context using pre-computed SequenceTags.

    Loads (and caches, per document) all SequenceTag annotations for the
    context's parent document, converts the tags for this Sentence into
    TemporarySpans, then yields one Candidate per combination of Spans
    (one per entity type in ``self.entity_types``).

    Args:
        context: the Context to extract from; only Sentence is supported.
        clear: accepted for interface compatibility; unused here.
        split: split id stored on each generated Candidate.
        check_for_existing: if True, query the session and skip candidates
            that are already persisted.

    Yields:
        Instances of ``self.candidate_class``.

    Raises:
        NotImplementedError: if ``context`` is not a Sentence.
    """
    # For now, just handle Sentences
    if not isinstance(context, Sentence):
        raise NotImplementedError(
            "%s is currently only implemented for Sentence contexts."
            % self.__name__)

    # Load and remap this entire parent document's tag set (cached per doc)
    if context.document.id not in self.cache:
        tags = self.session.query(SequenceTag).filter(
            SequenceTag.document_id == context.document.id).all()
        # filter to 1) target concept/entity types and 2) target sources
        # (e.g., PubTator, TaggerOne)
        target_types = set(self.entity_types)  # hoisted out of the loop
        tags = [t for t in tags if t.concept_type in target_types]
        # BUG FIX: the previous filter built
        # [rgx.search(t.source) for rgx in self.tag_sources] and tested its
        # *length* -- always len(self.tag_sources), including None entries
        # for non-matches -- so every tag passed whenever tag_sources was
        # non-empty. Use any() so only tags whose source actually matches
        # at least one regex are kept.
        tags = [
            t for t in tags
            if any(rgx.search(t.source) for rgx in self.tag_sources)
        ]
        tags = self._map_annotations(context.document, tags)
        self.cache[context.document.id] = defaultdict(list)
        for position, tag in tags:
            self.cache[context.document.id][position].append(tag)

    # no tags for this Sentence
    if context.position not in self.cache[context.document.id]:
        return
    spans = self.cache[context.document.id][context.position]
    #del self.cache[context.document.id][context.position]

    entity_spans = defaultdict(list)
    entity_cids = {}

    # create temp spans: convert each tag's absolute char offsets to
    # sentence-relative offsets using the sentence's starting offset
    offsets = [
        context.document.sentences[i].abs_char_offsets[0]
        for i in range(len(context.document.sentences))
    ]
    i = context.position
    for tag in spans:
        char_start = tag.abs_char_start - offsets[i]
        char_end = tag.abs_char_end - offsets[i]
        # char_end - 1: tag end offsets are exclusive, TemporarySpan's are
        # inclusive
        tc = TemporarySpan(char_start=char_start, char_end=char_end - 1,
                           sentence=context.document.sentences[i])
        tc.load_id_or_insert(self.session)
        entity_cids[tc.id] = tag.concept_uid
        entity_spans[tag.concept_type].append(tc)

    # Generates and persists candidates
    candidate_args = {'split': split}
    for args in product(
            *[enumerate(entity_spans[et]) for et in self.entity_types]):

        if self.arity == 2:
            ai, a = args[0]
            bi, b = args[1]

            # Check for self-joins, "nested" joins (joins from span to its
            # subspan), and flipped duplicate "symmetric" relations
            if not self.self_relations and a == b:
                continue
            elif not self.nested_relations and (a in b or b in a):
                continue
            elif not self.symmetric_relations and ai > bi:
                continue

        # Assemble candidate arguments
        for i, arg_name in enumerate(self.candidate_class.__argnames__):
            candidate_args[arg_name + '_id'] = args[i][1].id
            candidate_args[arg_name + '_cid'] = entity_cids[args[i][1].id]

        # Checking for existence
        if check_for_existing:
            q = select([self.candidate_class.id])
            for key, value in candidate_args.items():
                q = q.where(getattr(self.candidate_class, key) == value)
            candidate_id = self.session.execute(q).first()
            if candidate_id is not None:
                continue

        # Add Candidate to session
        yield self.candidate_class(**candidate_args)
def apply(self, context, clear, split, check_for_existing=True, **kwargs):
    """Extract Candidates from a Context (currently Sentence-only).

    Collects per-token entity annotations into Spans, then yields one
    Candidate per combination of Spans across self.entity_types.
    """
    if not isinstance(context, Sentence):
        raise NotImplementedError("%s is currently only implemented for Sentence contexts." % self.__name__)

    # Pass 1: bucket token indices first by entity type, then by entity CID.
    mention_index = dict((etype, defaultdict(list)) for etype in set(self.entity_types))
    for tok_i in range(len(context.words)):
        if context.entity_types[tok_i] is None:
            continue
        type_list = context.entity_types[tok_i].split(self.entity_sep)
        cid_list = context.entity_cids[tok_i].split(self.entity_sep)
        for etype, cid in zip(type_list, cid_list):
            if etype in mention_index:
                mention_index[etype][cid].append(tok_i)

    # Pass 2: merge runs of consecutive token indices into entity Spans.
    entity_spans = defaultdict(list)
    entity_cids = {}
    for etype, by_cid in iteritems(mention_index):
        for cid, tok_idxs in iteritems(by_cid):
            while len(tok_idxs) > 0:
                cur = tok_idxs.pop(0)
                span_start = context.char_offsets[cur]
                span_end = span_start + len(context.words[cur]) - 1
                # Absorb adjacent tokens into the same span.
                while len(tok_idxs) > 0 and tok_idxs[0] == cur + 1:
                    cur = tok_idxs.pop(0)
                    span_end = context.char_offsets[cur] + len(context.words[cur]) - 1
                # Insert / load the temporary span; remember its entity CID.
                tmp_span = TemporarySpan(char_start=span_start, char_end=span_end, sentence=context)
                tmp_span.load_id_or_insert(self.session)
                entity_cids[tmp_span.id] = cid
                entity_spans[etype].append(tmp_span)

    # Pass 3: generate (and persist) candidates from the cross product.
    candidate_args = {'split': split}
    for combo in product(*[enumerate(entity_spans[etype]) for etype in self.entity_types]):
        # TODO: Make this work for higher-order relations
        if self.arity == 2:
            idx_a, span_a = combo[0]
            idx_b, span_b = combo[1]
            # Drop self-joins, "nested" joins (a span paired with its own
            # subspan), and flipped duplicates of "symmetric" relations.
            if not self.self_relations and span_a == span_b:
                continue
            elif not self.nested_relations and (span_a in span_b or span_b in span_a):
                continue
            elif not self.symmetric_relations and idx_a > idx_b:
                continue

        # Assemble candidate arguments.
        for pos, arg_name in enumerate(self.candidate_class.__argnames__):
            candidate_args[arg_name + '_id'] = combo[pos][1].id
            candidate_args[arg_name + '_cid'] = entity_cids[combo[pos][1].id]

        # Skip candidates that already exist in the database.
        if check_for_existing:
            query = select([self.candidate_class.id])
            for key, value in iteritems(candidate_args):
                query = query.where(getattr(self.candidate_class, key) == value)
            if self.session.execute(query).first() is not None:
                continue

        # Add Candidate to session.
        yield self.candidate_class(**candidate_args)