Example #1
0
    def apply(self, context):
        """Generate all candidate n-gram spans over a sentence context.

        Enumerates every n-gram of ``context`` from length ``self.n_max``
        down to 1 (longest-match-first order), yielding each distinct
        ``TemporarySpan`` exactly once.  When ``self.split_rgx`` is set,
        single-token spans matching the regex are additionally split into
        two sub-spans around the regex's capture group 1.

        Args:
            context: a Sentence-like object exposing ``char_offsets``,
                ``words`` and ``text``.

        Yields:
            TemporarySpan: unique candidate spans over ``context``.
        """
        # These are the character offsets--**relative to the sentence
        # start**--for each _token_
        offsets = context.char_offsets

        # Loop over all n-grams in **reverse** order (to facilitate
        # longest-match semantics)
        L = len(offsets)
        seen = set()
        for l in range(self.n_max, 0, -1):
            for i in range(L - l + 1):
                # Last word of the n-gram determines the end offset
                w = context.words[i + l - 1]
                start = offsets[i]
                end = offsets[i + l - 1] + len(w) - 1
                ts = TemporarySpan(char_start=start, char_end=end, sentence=context)
                if ts not in seen:
                    seen.add(ts)
                    yield ts

                # Check for split
                # NOTE: For simplicity, we only split single tokens right now!
                if l == 1 and self.split_rgx is not None and end - start > 0:
                    m = re.search(self.split_rgx,
                                  context.text[start - offsets[0]:end - offsets[0] + 1])
                    # NOTE: the original extra guard `l < self.n_max + 1`
                    # was always True here (l == 1), so it has been dropped.
                    if m is not None:
                        ts1 = TemporarySpan(char_start=start,
                                            char_end=start + m.start(1) - 1,
                                            sentence=context)
                        if ts1 not in seen and ts1.get_span():
                            seen.add(ts1)
                            yield ts1
                        ts2 = TemporarySpan(char_start=start + m.end(1),
                                            char_end=end,
                                            sentence=context)
                        # BUG FIX: the original tested `ts1.get_span()` here,
                        # so an empty ts2 could be yielded (or a valid ts2
                        # skipped) depending on ts1.
                        if ts2 not in seen and ts2.get_span():
                            seen.add(ts2)
                            yield ts2
    def apply(self, context):
        """Yield spans between '|' delimiters found in the sentence text.

        Scans ``context.text`` left to right: from position ``i`` it finds
        the next '|' (position ``j``), then the one after that, and yields
        a ``TemporarySpan`` covering ``[i, second '|']``.  The scan then
        resumes from one past the *first* delimiter (``k``), so yielded
        regions overlap at the shared delimiters.  Each distinct span is
        yielded at most once.

        NOTE(review): the inner ``while text[j] != '|'`` loops have no
        bounds check -- if the text does not contain the expected '|'
        delimiters this raises IndexError.  Presumably inputs are always
        pipe-delimited; confirm against callers.
        """
        seen = set()
        text=context.text # gets sentence as string
        #print(text)
        i=0
        while i < len(text)-1:
            # Find the first '|' at or after i+1
            j=i+1
            while text[j]!='|':
                j=j+1
            # must continue until next one
            # k remembers where to resume the outer scan (just past the
            # first delimiter)
            k=j+1
            j=j+1
            if j<len(text)-1:
                #print(j,len(text))
                # Find the second '|' terminating this span
                while text[j]!='|':
                    j=j+1
                start=i
                end=j
                #i=j+1
                i=k

                #print(start,end)
                #print(text[start:end])
                ts    = TemporarySpan(char_start=start, char_end=end, sentence=context)
                #print(ts)

                # Yield each distinct span only once
                if ts not in seen:
                    seen.add(ts)
                    yield ts
            else:
                # No room for a second delimiter: jump past the first one
                # (terminates the outer loop on the next check)
                i=j
Example #3
0
    def apply(self, context):
        """Generate all candidate n-gram spans over a sentence context.

        Enumerates every n-gram of ``context`` from length ``self.n_max``
        down to 1 (longest-match-first order), yielding each distinct
        ``TemporarySpan`` exactly once.  When ``self.split_rgx`` is set,
        single-token spans matching the regex are additionally split into
        two sub-spans around the regex's capture group 1.

        Args:
            context: a Sentence-like object exposing ``char_offsets``,
                ``words`` and ``text``.

        Yields:
            TemporarySpan: unique candidate spans over ``context``.
        """
        # These are the character offsets--**relative to the sentence
        # start**--for each _token_
        offsets = context.char_offsets

        # Loop over all n-grams in **reverse** order (to facilitate
        # longest-match semantics)
        L = len(offsets)
        seen = set()
        for l in range(self.n_max, 0, -1):
            for i in range(L - l + 1):
                # Last word of the n-gram determines the end offset
                w = context.words[i + l - 1]
                start = offsets[i]
                end = offsets[i + l - 1] + len(w) - 1
                ts = TemporarySpan(char_start=start,
                                   char_end=end,
                                   sentence=context)
                if ts not in seen:
                    seen.add(ts)
                    yield ts

                # Check for split
                # NOTE: For simplicity, we only split single tokens right now!
                if l == 1 and self.split_rgx is not None and end - start > 0:
                    m = re.search(
                        self.split_rgx,
                        context.text[start - offsets[0]:end - offsets[0] + 1])
                    # NOTE: the original extra guard `l < self.n_max + 1`
                    # was always True here (l == 1), so it has been dropped.
                    if m is not None:
                        ts1 = TemporarySpan(char_start=start,
                                            char_end=start + m.start(1) - 1,
                                            sentence=context)
                        if ts1 not in seen:
                            seen.add(ts1)
                            # BUG FIX: the original yielded `ts` here,
                            # re-emitting the full token span instead of
                            # the left sub-span `ts1`.
                            yield ts1
                        ts2 = TemporarySpan(char_start=start + m.end(1),
                                            char_end=end,
                                            sentence=context)
                        if ts2 not in seen:
                            seen.add(ts2)
                            yield ts2
    def apply(self, context, clear, split, check_for_existing=True, **kwargs):
        """Extract Candidates from a Context.

        Collects entity mentions from the sentence's per-token entity
        type / CID annotations, merges runs of adjacent tokens sharing a
        CID into TemporarySpans, then yields every combination of spans
        across ``self.entity_types`` as a candidate (subject to the
        self / nested / symmetric relation filters).

        Args:
            context: the Sentence to extract from; other Context types
                raise NotImplementedError.
            clear: accepted for interface compatibility; not referenced
                in this body.
            split: stored on each candidate under the 'split' key.
            check_for_existing: when True, query the database and skip
                candidates that already exist.

        Yields:
            Instances of ``self.candidate_class``.
        """
        # For now, just handle Sentences
        if not isinstance(context, Sentence):
            raise NotImplementedError(
                "%s is currently only implemented for Sentence contexts." %
                self.__name__)

        # Do a first pass to collect all mentions by entity type / cid
        entity_idxs = dict(
            (et, defaultdict(list)) for et in set(self.entity_types))
        L = len(context.words)
        for i in range(L):
            if context.entity_types[i] is not None:
                # A token may carry several annotations separated by
                # self.entity_sep; types and CIDs are aligned pairwise.
                ets = context.entity_types[i].split(self.entity_sep)
                cids = context.entity_cids[i].split(self.entity_sep)
                for et, cid in zip(ets, cids):
                    if et in entity_idxs:
                        entity_idxs[et][cid].append(i)

        # Form entity Spans by merging runs of consecutive token indices
        entity_spans = defaultdict(list)
        entity_cids = {}
        for et, cid_idxs in iteritems(entity_idxs):
            for cid, idxs in iteritems(entity_idxs[et]):
                while len(idxs) > 0:
                    i = idxs.pop(0)
                    char_start = context.char_offsets[i]
                    char_end = char_start + len(context.words[i]) - 1
                    # Extend the span while the next token index is adjacent
                    while len(idxs) > 0 and idxs[0] == i + 1:
                        i = idxs.pop(0)
                        char_end = context.char_offsets[i] + len(
                            context.words[i]) - 1

                    # Insert / load temporary span, also store map to entity CID
                    tc = TemporarySpan(char_start=char_start,
                                       char_end=char_end,
                                       sentence=context)
                    tc.load_id_or_insert(self.session)
                    entity_cids[tc.id] = cid
                    entity_spans[et].append(tc)

        # Generates and persists candidates
        # NOTE: candidate_args is reused (mutated) across loop iterations;
        # every key is overwritten each pass, so this is safe but subtle.
        candidate_args = {'split': split}
        for args in product(
                *[enumerate(entity_spans[et]) for et in self.entity_types]):

            # TODO: Make this work for higher-order relations
            if self.arity == 2:
                ai, a = args[0]
                bi, b = args[1]

                # Check for self-joins, "nested" joins (joins from span to its subspan), and flipped duplicate
                # "symmetric" relations
                if not self.self_relations and a == b:
                    continue
                elif not self.nested_relations and (a in b or b in a):
                    continue
                elif not self.symmetric_relations and ai > bi:
                    continue

            # Assemble candidate arguments
            for i, arg_name in enumerate(self.candidate_class.__argnames__):
                candidate_args[arg_name + '_id'] = args[i][1].id
                candidate_args[arg_name + '_cid'] = entity_cids[args[i][1].id]

            # Checking for existence: match on every candidate argument
            if check_for_existing:
                q = select([self.candidate_class.id])
                for key, value in iteritems(candidate_args):
                    q = q.where(getattr(self.candidate_class, key) == value)
                candidate_id = self.session.execute(q).first()
                if candidate_id is not None:
                    continue

            # Add Candidate to session
            yield self.candidate_class(**candidate_args)
Example #5
0
    def apply(self, context, clear, split, check_for_existing=True, **kwargs):
        """Extract Candidates from a Context using document SequenceTags.

        Loads (and caches per document) all SequenceTag rows for the
        sentence's parent document, filters them to the target entity
        types and tag sources, maps them onto sentence positions, builds
        TemporarySpans for the tags on this sentence, and yields every
        combination of spans across ``self.entity_types`` as a candidate
        (subject to the self / nested / symmetric relation filters).

        Args:
            context: the Sentence to extract from; other Context types
                raise NotImplementedError.
            clear: accepted for interface compatibility; not referenced
                in this body.
            split: stored on each candidate under the 'split' key.
            check_for_existing: when True, query the database and skip
                candidates that already exist.

        Yields:
            Instances of ``self.candidate_class``.
        """
        # For now, just handle Sentences
        if not isinstance(context, Sentence):
            raise NotImplementedError(
                "%s is currently only implemented for Sentence contexts." %
                self.__name__)

        # Load and remap this entire parent document's tag set
        if context.document.id not in self.cache:
            tags = self.session.query(SequenceTag).filter(
                SequenceTag.document_id == context.document.id).all()
            # filter to 1) target concept/entity types and 2) target sources (e.g., PutTator, TaggerOne)
            tags = [
                t for t in tags if t.concept_type in set(self.entity_types)
            ]
            # NOTE(review): this list is non-empty whenever tag_sources is
            # non-empty -- re.search results are collected but not tested
            # for None, so the source filter looks like a no-op; confirm.
            tags = [
                t for t in tags
                if len([rgx.search(t.source) for rgx in self.tag_sources]) > 0
            ]

            # Index the remapped tags by sentence position for fast lookup
            tags = self._map_annotations(context.document, tags)
            self.cache[context.document.id] = defaultdict(list)
            for position, tag in tags:
                self.cache[context.document.id][position].append(tag)

        # no tags for this Sentence
        if context.position not in self.cache[context.document.id]:
            return

        spans = self.cache[context.document.id][context.position]
        #del self.cache[context.document.id][context.position]

        entity_spans = defaultdict(list)
        entity_cids = {}

        # create temp spans
        # Sentence-start absolute offsets, used to convert each tag's
        # absolute character offsets into sentence-relative ones
        offsets = [
            context.document.sentences[i].abs_char_offsets[0]
            for i in range(len(context.document.sentences))
        ]

        i = context.position
        for tag in spans:
            char_start, char_end = tag.abs_char_start - offsets[
                i], tag.abs_char_end - offsets[i]
            # char_end is exclusive in the tag; TemporarySpan is inclusive
            tc = TemporarySpan(char_start=char_start,
                               char_end=char_end - 1,
                               sentence=context.document.sentences[i])
            tc.load_id_or_insert(self.session)
            entity_cids[tc.id] = tag.concept_uid
            entity_spans[tag.concept_type].append(tc)

        # Generates and persists candidates
        # NOTE: candidate_args is reused (mutated) across loop iterations;
        # every key is overwritten each pass, so this is safe but subtle.
        candidate_args = {'split': split}
        for args in product(
                *[enumerate(entity_spans[et]) for et in self.entity_types]):
            if self.arity == 2:
                ai, a = args[0]
                bi, b = args[1]

                # Check for self-joins, "nested" joins (joins from span to its subspan),
                # and flipped duplicate "symmetric" relations
                if not self.self_relations and a == b:
                    continue
                elif not self.nested_relations and (a in b or b in a):
                    continue
                elif not self.symmetric_relations and ai > bi:
                    continue

            # Assemble candidate arguments
            for i, arg_name in enumerate(self.candidate_class.__argnames__):
                candidate_args[arg_name + '_id'] = args[i][1].id
                candidate_args[arg_name + '_cid'] = entity_cids[args[i][1].id]

            # Checking for existence: match on every candidate argument
            if check_for_existing:
                q = select([self.candidate_class.id])
                for key, value in candidate_args.items():
                    q = q.where(getattr(self.candidate_class, key) == value)
                candidate_id = self.session.execute(q).first()
                if candidate_id is not None:
                    continue

            # Add Candidate to session
            yield self.candidate_class(**candidate_args)
Example #6
0
    def apply(self, context, clear, split, check_for_existing=True, **kwargs):
        """Extract Candidates from a Context.

        Collects entity mentions from the sentence's per-token entity
        type / CID annotations, merges runs of adjacent tokens sharing a
        CID into TemporarySpans, then yields every combination of spans
        across ``self.entity_types`` as a candidate (subject to the
        self / nested / symmetric relation filters).

        Args:
            context: the Sentence to extract from; other Context types
                raise NotImplementedError.
            clear: accepted for interface compatibility; not referenced
                in this body.
            split: stored on each candidate under the 'split' key.
            check_for_existing: when True, query the database and skip
                candidates that already exist.

        Yields:
            Instances of ``self.candidate_class``.
        """
        # For now, just handle Sentences
        if not isinstance(context, Sentence):
            raise NotImplementedError("%s is currently only implemented for Sentence contexts." % self.__name__)

        # Do a first pass to collect all mentions by entity type / cid
        entity_idxs = dict((et, defaultdict(list)) for et in set(self.entity_types))
        L = len(context.words)
        for i in range(L):
            if context.entity_types[i] is not None:
                # A token may carry several annotations separated by
                # self.entity_sep; types and CIDs are aligned pairwise.
                ets  = context.entity_types[i].split(self.entity_sep)
                cids = context.entity_cids[i].split(self.entity_sep)
                for et, cid in zip(ets, cids):
                    if et in entity_idxs:
                        entity_idxs[et][cid].append(i)

        # Form entity Spans by merging runs of consecutive token indices
        entity_spans = defaultdict(list)
        entity_cids  = {}
        for et, cid_idxs in iteritems(entity_idxs):
            for cid, idxs in iteritems(entity_idxs[et]):
                while len(idxs) > 0:
                    i          = idxs.pop(0)
                    char_start = context.char_offsets[i]
                    char_end   = char_start + len(context.words[i]) - 1
                    # Extend the span while the next token index is adjacent
                    while len(idxs) > 0 and idxs[0] == i + 1:
                        i        = idxs.pop(0)
                        char_end = context.char_offsets[i] + len(context.words[i]) - 1

                    # Insert / load temporary span, also store map to entity CID
                    tc = TemporarySpan(char_start=char_start, char_end=char_end, sentence=context)
                    tc.load_id_or_insert(self.session)
                    entity_cids[tc.id] = cid
                    entity_spans[et].append(tc)

        # Generates and persists candidates
        # NOTE: candidate_args is reused (mutated) across loop iterations;
        # every key is overwritten each pass, so this is safe but subtle.
        candidate_args = {'split' : split}
        for args in product(*[enumerate(entity_spans[et]) for et in self.entity_types]):

            # TODO: Make this work for higher-order relations
            if self.arity == 2:
                ai, a = args[0]
                bi, b = args[1]

                # Check for self-joins, "nested" joins (joins from span to its subspan), and flipped duplicate
                # "symmetric" relations
                if not self.self_relations and a == b:
                    continue
                elif not self.nested_relations and (a in b or b in a):
                    continue
                elif not self.symmetric_relations and ai > bi:
                    continue

            # Assemble candidate arguments
            for i, arg_name in enumerate(self.candidate_class.__argnames__):
                candidate_args[arg_name + '_id'] = args[i][1].id
                candidate_args[arg_name + '_cid'] = entity_cids[args[i][1].id]

            # Checking for existence: match on every candidate argument
            if check_for_existing:
                q = select([self.candidate_class.id])
                for key, value in iteritems(candidate_args):
                    q = q.where(getattr(self.candidate_class, key) == value)
                candidate_id = self.session.execute(q).first()
                if candidate_id is not None:
                    continue

            # Add Candidate to session
            yield self.candidate_class(**candidate_args)