def link_entities_in_raw_input(self, input_text: str, element_id: str=None, num_candidates=-1) -> Sentence:
        """
        Link the entities mentioned in a raw input string.

        The text is wrapped into a Sentence and passed to link_entities_in_sentence_obj. The result is
        then reduced: entities without any linking are dropped, every remaining entity keeps only the
        'type', 'linkings', 'token_ids', 'poss' and 'tokens' fields, and every linking is collapsed
        to a (kbID, label) tuple.

        :param input_text: the input sentence as a string
        :param element_id: sentence id, forwarded unchanged to link_entities_in_sentence_obj
        :param num_candidates: the number of candidate entity links to store for each entity.
                                If set to more than 0 it will override the class setting.
        :return: a Sentence whose entities each carry a list of (kbID, label) tuples under 'linkings'

        NOTE(review): the examples below print as a list of tuples; presumably that is how Sentence
        renders itself - confirm against the Sentence class.

        >>> l = HeuristicsLinker(num_candidates=1)
        >>> l.link_entities_in_raw_input("Who wrote the song hotel California?")
        [('Q7366', 'song', (14, 18), [3]), ('Q780394', 'Hotel California', (19, 35), [4, 5])]
        >>> l.link_entities_in_raw_input("Donovan McNabb'strade to the Vikings is in place.")  # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
        [('Q963185', 'Donovan McNabb', (0, 14), [0, 1]), ...]
        >>> l.link_entities_in_raw_input("what was the queen album?")
        [('Q15862', 'Queen', (13, 18), [3]), ('Q482994', 'album', (20, 24), [4])]
        """
        # A tuple (not a set) keeps the key order of the reduced entity dicts identical across
        # interpreter runs; set iteration order over strings depends on hash randomisation.
        kept_fields = ('type', 'linkings', 'token_ids', 'poss', 'tokens')

        sentence = Sentence(input_text=input_text)
        sentence = self.link_entities_in_sentence_obj(sentence, element_id=element_id, num_candidates=num_candidates)

        # Drop entities that ended up without a linking and strip all other fields.
        sentence.entities = [{field: entity[field] for field in kept_fields}
                             for entity in sentence.entities
                             if len(entity['linkings']) > 0]

        # Collapse every linking dict to the (id, label) pair.
        for entity in sentence.entities:
            entity['linkings'] = [(linking.get('kbID'), linking.get('label'))
                                  for linking in entity['linkings']]
        return sentence
# Example #2
# 0
    def link_entities_in_sentence_obj(self,
                                      sentence_obj: Sentence,
                                      element_id=None,
                                      num_candidates=-1):
        """
        Attach entities to a sentence using the predictions stored in self.predictions.

        The input is copied into a new Sentence. If the copy has no tagging, it is tagged through
        utils.get_tagged_from_server. Any entities already present are discarded. When element_id is
        truthy, self.predictions[element_id] is read: every non-empty candidate list becomes one
        entity whose span starts at field 2 of the first candidate and extends by field 3 of the
        first candidate. Candidates whose field 6 exceeds self._confidence contribute one linking each.

        :param sentence_obj: input sentence; it is not modified
        :param element_id: key into self.predictions; a falsy value yields a sentence without entities
        :param num_candidates: the number of linkings to keep per entity.
                                If set to more than 0 it overrides self.num_candidates.
        :return: a new Sentence with the "entities" field filled in
        """
        sentence_obj = Sentence(input_text=sentence_obj.input_text,
                                tagged=sentence_obj.tagged,
                                mentions=sentence_obj.mentions,
                                entities=sentence_obj.entities)
        if not sentence_obj.tagged:
            text = sentence_obj.input_text
            sentence_obj.tagged = utils.get_tagged_from_server(text, caseless=text.islower())
        sentence_obj.entities = []

        if element_id:
            # Phase 1: parse all stored predictions into (ids, start, end) triples before any
            # knowledge-base lookup is made.
            spans = []
            for candidates in self.predictions[element_id].values():
                if len(candidates) == 0:
                    continue
                # The stored id uses "/" separators and a leading character that is dropped
                # (presumably a Freebase-style "/m/..." id - confirm).
                confident_ids = [candidate[4].replace("/", ".")[1:]
                                 for candidate in candidates
                                 if float(candidate[6]) > self._confidence]
                first = candidates[0]
                start = int(first[2])
                end = start + int(first[3])
                spans.append((confident_ids, start, end))

            # Phase 2: resolve every id and build the entity dicts.
            for confident_ids, start, end in spans:
                linkings = []
                for fb_id in confident_ids:
                    kb_id = queries.map_f_id(fb_id)
                    label = queries.get_main_entity_label(kb_id) if kb_id else None
                    linkings.append({'fbID': fb_id, 'kbID': kb_id, 'label': label})
                entity = {
                    "linkings": linkings,
                    'offsets': (start, end),
                    'type': 'NNP',
                    'poss': [],
                    'token_ids': _offets_to_token_ids(start, end, sentence_obj.tagged),
                    'tokens': [],
                }
                sentence_obj.entities.append(entity)

        # The linkings keep their stored order, so truncating keeps the first N of them.
        for entity in sentence_obj.entities:
            limit = num_candidates if num_candidates > 0 else self.num_candidates
            entity['linkings'] = entity['linkings'][:limit]

        return sentence_obj
    def link_entities_in_sentence_obj(self, sentence_obj: Sentence, element_id=None, num_candidates=-1):
        """
        Attach entities to a sentence by querying the HTTP service at self._spotlight_url.

        The input is copied into a new Sentence and any entities already present are discarded.
        The input text and self._confidence are sent as URL parameters and a JSON reply is
        requested. Every item of the reply's 'Resources' list becomes one entity with a single
        linking: the '@URI' value, stripped of its dbpedia.org prefix, is mapped through
        queries.map_wikipedia_id. The span is derived from '@offset' and the length of '@surfaceForm'.

        The lookup is best-effort: if the request, the JSON decoding or the mapping fails, a
        warning is logged and the sentence is returned without entities. Resources that carry no
        '@URI' are skipped instead of discarding the whole reply.

        :param sentence_obj: input sentence; it is not modified
        :param element_id: unused by this linker
        :param num_candidates: unused by this linker, every entity gets exactly one linking
        :return: a new Sentence with the "entities" field filled in
        """
        import logging  # local import: the file's top-level import block is not visible here

        sentence_obj = Sentence(input_text=sentence_obj.input_text, tagged=sentence_obj.tagged,
                                mentions=sentence_obj.mentions, entities=sentence_obj.entities)
        sentence_obj.entities = []

        params = urllib.parse.urlencode({'text': sentence_obj.input_text, 'confidence': str(self._confidence)})
        request = urllib.request.Request(self._spotlight_url + params)
        request.add_header("Accept", "application/json")
        try:
            # The context manager closes the HTTP response even if reading or decoding fails.
            with urllib.request.urlopen(request) as response:
                content = json.loads(response.read())

            entities = []
            for resource in content.get('Resources', []):
                uri = resource.get("@URI")
                if uri is None:
                    continue
                page = (uri.replace("http://dbpedia.org/resource/", "")
                           .replace("http://dbpedia.org/page/", ""))
                start = int(resource.get('@offset', '0'))
                entities.append({
                    "linkings": [{'kbID': queries.map_wikipedia_id(page)}],
                    'offsets': (start, start + len(resource.get('@surfaceForm', ""))),
                })
            # Assigned only after the whole reply was processed, so a failure never leaves a
            # partially filled entity list behind.
            sentence_obj.entities = entities
        except Exception:
            # Deliberately broad to stay best-effort, but no longer a bare except:
            # KeyboardInterrupt and SystemExit propagate, and the failure is visible in the log.
            logging.getLogger(__name__).warning(
                "Entity linking request failed; returning the sentence without entities",
                exc_info=True)
        return sentence_obj
    def link_entities_in_sentence_obj(self, sentence_obj: Sentence, element_id=None, num_candidates=-1) -> Sentence:
        """
        Link entities in a Sentence that may already carry a tagging, mentions or entity candidates.

        This is useful if tagging and mention extraction are done in bulk before the entity linking step.
        Fields of sentence_obj that are read:
            "input_text": raw input text as a string
            "tagged": a list of dict objects, one per token, with the output of the POS and NER taggers, see utils
                      for more info (optional)
            "mentions": a list of dict objects, one per mention, see mention_extraction for more info (optional)
            "entities": extracted entity candidates (optional)
        See Sentence for more info.

        Steps, in order:
            1. If element_id is found in self._precomputed_candidates, the stored sentence replaces the input.
            2. The sentence is copied into a new Sentence; a missing tagging is fetched from the tagging server.
            3. Missing entities are derived from the mentions if present, otherwise from the tagged input;
               existing entities are passed through _prefer_longer_matches when that option is set.
            4. Every entity gets the input text under 'text' and is passed through compute_candidate_scores.
            5. If no_mentions_overlap is set, overlaps are resolved by beam search, or, in one_entity_mode,
               only the entity with the lowest 'drop_score' is kept.
            6. The linkings of every entity are truncated to the requested number of candidates.

        :param sentence_obj: input sentence
        :param element_id: sentence id to retrieve precomputed candidates for certain linkers
        :param num_candidates: the number of candidate entity links to store for each entity.
                                If set to more than 0 it will override the class setting.
        :return: a new Sentence object with the field "entities" filled in

        >>> l = HeuristicsLinker()
        >>> l.link_entities_in_sentence_obj(Sentence("Where does Norway get their oil?")).entities[0]['linkings']  # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
        [('Q20', 'Norway'), ...]
        """
        precomputed = self._precomputed_candidates
        if precomputed is not None and element_id in precomputed:
            sentence_obj = precomputed[element_id]

        # Work on a copy so that the fields of the caller's (or the cached) Sentence are not reassigned.
        sentence_obj = Sentence(input_text=sentence_obj.input_text, tagged=sentence_obj.tagged,
                                mentions=sentence_obj.mentions, entities=sentence_obj.entities)

        if sentence_obj.tagged is None and sentence_obj.input_text is not None:
            is_caseless = sentence_obj.input_text.islower()
            sentence_obj.tagged = utils.get_tagged_from_server(sentence_obj.input_text, caseless=is_caseless)
            self.logger.debug([(token['word'], token['pos']) for token in sentence_obj.tagged])

        if sentence_obj.entities is not None:
            if self.prefer_longer_matches:
                sentence_obj.entities = self._prefer_longer_matches(sentence_obj.entities)
        elif sentence_obj.mentions is not None:
            sentence_obj.entities = self._link_mentions_to_entities(sentence_obj.mentions)
        else:
            sentence_obj.entities = self._link_entities_in_tagged_input(sentence_obj.tagged)
            self.logger.debug([entity['linkings'][0] for entity in sentence_obj.entities])

        # All entities receive the text before any of them is scored.
        for entity in sentence_obj.entities:
            entity['text'] = sentence_obj.input_text
        scored_entities = []
        for entity in sentence_obj.entities:
            scored_entities.append(self.compute_candidate_scores(entity, tagged_text=sentence_obj.tagged))
        sentence_obj.entities = scored_entities

        if self.no_mentions_overlap:
            if self.one_entity_mode:
                # Ascending sort: the entity with the smallest 'drop_score' is the one that is kept.
                sentence_obj.entities = sorted(sentence_obj.entities,
                                               key=lambda entity: entity.get('drop_score', 0.0))
                sentence_obj.entities = sentence_obj.entities[:1]
            else:
                sentence_obj.entities = resolve_entity_overlap_beam_search(sentence_obj.entities)

        # One mention span -> one entity. Each entity can have multiple linking candidates.
        # The candidates are still ordered at this point, so truncating keeps the top N.
        for entity in sentence_obj.entities:
            limit = num_candidates if num_candidates > 0 else self.num_candidates
            entity['linkings'] = entity['linkings'][:limit]

        return sentence_obj