Example #1
    def link_entity(self, text):
        """Link tokens to entities.

        Example:
            ['i', 'work', 'at', 'apple'] =>
            ['i', 'work', 'at', ('apple', ('apple','company'))]

        """
        doc = self.nlp(unicode(text))
        entities = []
        for np in doc.noun_chunks:
            s = np.text
            candidates = self.query(s, k=1)
            if candidates:
                sorted_candidates = sorted(candidates, key=lambda x: fuzz.ratio(s.lower(), x.value.lower()), reverse=True)
                for candidate in ifilter(lambda e: e.value.lower() not in self.stopwords, sorted_candidates):
                    if fuzz.ratio(s.lower(), candidate.value.lower()) > 80:
                        entity = Entity(surface=s, canonical=candidate)
                        entities.append((entity, np.start, np.end))
                        # Take the best matched candidate
                        break

        def overlap(e, entities):
            # Spans are half-open [start, end): e overlaps an entity unless it
            # ends at or before the entity's start or starts at or after its end.
            for entity in entities:
                if not (e.start >= entity[2] or e.end <= entity[1]):
                    return True
            return False

        for ent in doc.ents:
            if not overlap(ent, entities):
                if ent.label_ == 'PERSON':
                    entity = Entity.from_elements(surface=ent.text, value=ent.text, type='person')
                    entities.append((entity, ent.start, ent.end))
                elif ent.label_ == 'WORK_OF_ART':
                    entity = Entity.from_elements(surface=ent.text, value=ent.text, type='title')
                    entities.append((entity, ent.start, ent.end))

        tokens = [tok.text for tok in doc]
        if not entities:
            entity_tokens = tokens
        else:
            last = 0
            entity_tokens = []
            entities = sorted(entities, key=lambda x: x[1])
            for entity in entities:
                entity, start, end = entity
                entity_tokens.extend(tokens[last:start])
                entity_tokens.append(entity)
                last = end
            if last < len(tokens):
                entity_tokens.extend(tokens[last:])
        return entity_tokens
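A minimal sketch of consuming the mixed output, assuming cocoa.core.entity's Entity, CanonicalEntity, and is_entity helpers (names as used elsewhere on this page):

from cocoa.core.entity import Entity, CanonicalEntity, is_entity

entity_tokens = ['i', 'work', 'at',
                 Entity(surface='apple',
                        canonical=CanonicalEntity(value='apple', type='company'))]
# Plain strings pass through; linked tokens carry surface and canonical forms.
surface_form = [tok.surface if is_entity(tok) else tok for tok in entity_tokens]
print(surface_form)  # ['i', 'work', 'at', 'apple']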
Example #2
    def detect_slots(self, tokens, kb, context=None, join=False, stem=False):
        '''Detect slot tokens in an utterance.

        join: if True, join consecutive slot words into one entity.
        '''
        if not context:
            context = self.get_context_tokens((kb, ), stem=stem)

        role = kb.facts['personal']['Role']
        category = kb.facts['item']['Category']
        labels = self._get_slot_labels(role,
                                       category,
                                       context,
                                       tokens,
                                       stem=stem)

        slot_entity = lambda s: Entity(
            surface=s, canonical=CanonicalEntity(value='', type='slot'))
        if not join:
            slots = [
                slot_entity(x) if labels[i] == 1 else x
                for i, x in enumerate(tokens)
            ]
        else:
            spans = self.label_to_span(labels)
            slots = [
                tokens[x] if not isinstance(x, tuple) else slot_entity(
                    ' '.join(tokens[x[0]:x[1]])) for x in spans
            ]
        return slots
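Example #2 defers to self.label_to_span to collapse the per-token 0/1 labels into a mix of plain indices and (start, end) tuples. A minimal sketch of what such a helper might look like (the actual implementation may differ):

def label_to_span(labels):
    # Collapse 0/1 token labels into plain indices (label 0) and
    # (start, end) tuples covering runs of consecutive 1s.
    spans, i = [], 0
    while i < len(labels):
        if labels[i] == 1:
            start = i
            while i < len(labels) and labels[i] == 1:
                i += 1
            spans.append((start, i))
        else:
            spans.append(i)
            i += 1
    return spans

print(label_to_span([0, 1, 1, 0]))  # [0, (1, 3), 3]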
Example #3
    def _realize_entity(self, entity):
        if isinstance(entity, Entity):
            entity = entity.canonical
        elif isinstance(entity, CanonicalEntity):
            pass
        else:
            raise TypeError('Unknown entity')

        s = re.sub(r',|-|&', ' ', entity.value)
        tokens = [tok for tok in s.split()
                  if tok.lower() not in ('the', 'of', 'and')]
        if entity.type == 'school':
            tokens = [tok for tok in tokens
                      if tok.lower() not in ('university', 'college', 'state', 'at')]
        elif entity.type == 'company':
            tokens = [tok for tok in tokens
                      if tok.lower() not in ('at', 'company', 'corporation', 'group')]
        surface = ' '.join(tokens[:2])
        return Entity(surface, entity)
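A worked illustration of the filtering, assuming cocoa.core.entity's CanonicalEntity:

from cocoa.core.entity import CanonicalEntity

school = CanonicalEntity(value='University of Pennsylvania', type='school')
# 'of' is dropped by the generic stopword filter and 'University' by the
# school-specific one, leaving ['Pennsylvania'], so the realized surface
# form would be 'Pennsylvania'.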
Example #4
    def _realize_entity(self, entity):
        """Return an observed surface form of entity.

        Args:
            entity (Entity or CanonicalEntity)

        """
        entity = super(InverseLexicon, self)._realize_entity(entity)

        if entity.canonical not in self.inverse_lexicon:
            return entity
        else:
            entity = entity.canonical
            items = self.inverse_lexicon[entity].items()
            variants = [item[0] for item in items]
            counts = np.array([item[1] for item in items], dtype=np.float32)
            # Make it peaky
            peaky_counts = counts**2
            normal_counts = peaky_counts / np.sum(peaky_counts)
            try:
                idx = np.random.choice(np.arange(len(counts)),
                                       1,
                                       p=normal_counts)[0]
            except ValueError:
                idx = np.argmax(counts)
            realized = variants[idx]
            return Entity(realized, entity)
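Squaring the counts before normalizing ("make it peaky") shifts probability mass toward frequently observed variants. A small numeric illustration:

import numpy as np

counts = np.array([8., 2.], dtype=np.float32)
plain = counts / counts.sum()              # [0.8, 0.2]
peaky = counts ** 2 / (counts ** 2).sum()  # ~[0.94, 0.06]
print(plain, peaky)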
Example #5
 def detect_item(self, token):
     for item in self.items:
         # Anchor with $ so tokens that merely start with the item name
         # (e.g. 'balloon' for item 'ball') do not match
         if re.match(r'{}s?$'.format(item), token) or \
             (item == 'ball' and re.match(r'(basket)?balls?$', token)):
             return Entity.from_elements(surface=token,
                                         value=item,
                                         type='item')
     return False
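The $ anchors added above matter: without them, re.match accepts any token that merely starts with the item name. A standalone check:

import re

item = 'ball'
print(bool(re.match(r'{}s?$'.format(item), 'balls')))     # True
print(bool(re.match(r'(basket)?balls?$', 'basketball')))  # True
print(bool(re.match(r'{}s?$'.format(item), 'balloon')))   # False (True without the anchor)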
Example #6
 def _init_prices(self):
     from core.price_tracker import PriceList
     self.p_list = PriceList.getPriceList().p_list
     from cocoa.core.entity import Entity, CanonicalEntity
     self.add_words([
         Entity(surface='',
                canonical=CanonicalEntity(value=p, type='price'))
         for p in self.p_list
     ])
Example #7
    def link_entity(self, raw_tokens, kb=None, scale=True, price_clip=None):
        """Convert the number tokens in a tokenized utterance into price entities.

        Args:
            raw_tokens: list of dialogue tokens
            kb: agent knowledge base
            scale (bool): if True, scale the prices between 0 and 1
            price_clip (int, optional): if given, ignore numbers whose scaled
                price exceeds price_clip in absolute value

        Returns:
            A new list of tokens, with the number tokens replaced by entities.

        """
        tokens = ['<s>'] + raw_tokens + ['</s>']
        entity_tokens = []
        if kb:
            kb_numbers = self.get_kb_numbers(kb)
            list_price = kb.facts['item']['Price']
        for i in xrange(1, len(tokens)-1):  # Ignore the start and end tokens that were just added
            token = tokens[i]
            try:
                number = float(self.process_string(token))
                has_dollar = lambda token: token[0] == '$' or token[-1] == '$'
                # Check context
                if not has_dollar(token) and \
                        not self.is_price(tokens[i-1], tokens[i+1]):
                    number = None
                # Avoid 'infinity' being recognized as a number
                elif number == float('inf') or number == float('-inf'):
                    number = None
                # Check if the price is reasonable
                elif kb:
                    if not has_dollar(token):
                        if number > 1.5 * list_price:
                            number = None
                        # Probably a spec number
                        if number != list_price and number in kb_numbers:
                            number = None
                    if number is not None and price_clip is not None:
                        scaled_price = PriceScaler._scale_price(kb, number)
                        if abs(scaled_price) > price_clip:
                            number = None
            except ValueError:
                number = None
            if number is None:
                new_token = token
            else:
                assert not math.isnan(number)
                if scale:
                    scaled_price = PriceScaler._scale_price(kb, number)
                else:
                    scaled_price = number
                new_token = Entity(surface=token, canonical=CanonicalEntity(value=scaled_price, type='price'))
            entity_tokens.append(new_token)
        return entity_tokens
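Example #7 defers the context check to self.is_price(prev_token, next_token). A minimal sketch of such a predicate, assuming a small, hypothetical vocabulary of price-context words (the actual implementation may differ):

def is_price(prev_token, next_token):
    # Treat a bare number as a price if a neighboring token suggests money.
    context_words = ('$', 'price', 'pay', 'offer', 'bucks', 'dollars')
    return prev_token in context_words or next_token in context_words

print(is_price('pay', 'for'))   # True
print(is_price('the', 'cats'))  # False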
Example #8
 def inform_title(self):
     intent = 'inform-new-title'
     title = self.choose_title()
     print 'chosen title:', title
     template = self.retrieve_response_template(intent, title=title)
     titles = [Entity.from_elements(surface=title, value=title, type='title')]
     lf = LF(intent, titles=titles)
     text = template['template']
     utterance = Utterance(raw_text=text, logical_form=lf, template=template)
     return self.message(utterance)
Example #9
 def detect_number(self, token):
     try:
         n = int(token)
     except ValueError:
         try:
             n = self.word_to_num[token]
         except KeyError:
             n = None
     if n is not None:
         return Entity.from_elements(surface=token, value=n, type='number')
     return False
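Hypothetical calls, assuming the tracker's word_to_num maps number words to integers:

# tracker is an instance with word_to_num = {'two': 2, ...}:
#   tracker.detect_number('7')      -> number entity with value 7
#   tracker.detect_number('two')    -> number entity with value 2
#   tracker.detect_number('hello')  -> False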
Example #10
 def lf_to_tokens(self, kb, lf):
     intent = lf['intent']
     if intent == 'accept':
         intent = markers.ACCEPT
     elif intent == 'reject':
         intent = markers.REJECT
     elif intent == 'quit':
         intent = markers.QUIT
     elif intent == 'offer':
         intent = markers.OFFER
     tokens = [intent]
     if lf.get('price') is not None:
         p = lf['price']
         price = Entity.from_elements(surface=p, value=p, type='price')
         tokens.append(PriceScaler.scale_price(kb, price))
     return tokens
Example #11
    def __init__(self, raw_text, tokens, action='message'):
        self.text = raw_text
        self.tokens = tokens
        self.action = action
        self.prices = []
        self.keywords = []
        self.speech_acts = []
        self.stage = -1
        self.categories = defaultdict(lambda: defaultdict(int))

        if self.action == 'message':
            self.prices = [token for token in self.tokens if is_entity(token)]
        elif self.action == 'offer':
            price = self.text
            self.prices.append(
                Entity(price, CanonicalEntity(float(price), 'price')))
Example #12
 def rewrite_candidate(self, fillers, candidate):
     rewritten = []
     tokens = candidate
     if not tokens:
         return rewritten
     for i, tok in enumerate(tokens):
         if is_entity(tok) and tok.canonical.type == 'slot':
             for filler in fillers:
                 ss = filler.split()
                 new_tokens = list(tokens)
                 del new_tokens[i]
                 for j, s in enumerate(ss):
                     new_tokens.insert(
                         i + j, Entity(s, CanonicalEntity('', 'slot')))
                 new_cand = new_tokens
                 rewritten.append(new_cand)
     return rewritten
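A worked illustration, assuming cocoa.core.entity's types: one slot token and two fillers yield one rewritten candidate per filler, with each filler word wrapped in its own slot entity.

from cocoa.core.entity import Entity, CanonicalEntity

slot = Entity('SLOT', CanonicalEntity('', 'slot'))
candidate = ['it', 'is', slot]
fillers = ['brand new', 'barely used']
# rewrite_candidate would produce:
#   ['it', 'is', Entity('brand', ...), Entity('new', ...)]
#   ['it', 'is', Entity('barely', ...), Entity('used', ...)]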
Example #14
 def load_candidates(self, paths):
     candidates = defaultdict(list)
     # When dumped to json, NamedTuple becomes list. Now convert it back.
     is_str = lambda x: isinstance(x, basestring)
     # x[0] (surface of entity): note that for prices from the offer action,
     # surface is float instead of string
     to_ent = lambda x: x.encode('utf-8') if is_str(x) else \
         Entity(x[0].encode('utf-8') if is_str(x[0]) else x[0], CanonicalEntity(*x[1]))
     for path in paths:
         print 'Load candidates from', path
         results = read_json(path)
         for r in results:
             # None for encoding turns
             if r['candidates'] is None:
                 candidates[(r['uuid'], r['role'])].append(None)
             else:
                 # Only take the response (list of tokens)
                 candidates_ = [[to_ent(x) for x in c['response']]
                                for c in ifilter(lambda x: 'response' in x,
                                                 r['candidates'])]
                 candidates[(r['uuid'], r['role'])].append(candidates_)
     return candidates
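The JSON records this expects look roughly like the following (field names taken from the code above, values illustrative):

record = {
    'uuid': 'C_0001',
    'role': 'buyer',
    'candidates': [            # None for encoding turns
        {'response': ['i', 'can', 'do',
                      # entities are dumped as [surface, [value, type]]
                      ['$50', [50.0, 'price']]]},
    ],
}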
Example #15
 def price_to_entity(cls, price):
     return Entity(price, CanonicalEntity(price, 'price'))
Example #16
    def link_entity(self,
                    raw_tokens,
                    return_entities=False,
                    agent=1,
                    uuid="NONE",
                    kb=None,
                    mentioned_entities=None,
                    known_kb=True):
        """
        Add detected entities to each token
        Example: ['i', 'work', 'at', 'apple'] => ['i', 'work', 'at', ('apple', ('apple','company'))]
        Note: Linking works differently here because we are considering intersection of lists across
        token spans so that "univ of penn" will lookup in our lexicon table for "univ" and "penn"
        (disregarding stop words and special tokens) and find their intersection
        :param return_entities: Whether to return entities found in utterance
        :param agent: Agent (0,1) whose utterance is being linked
        :param uuid: uuid of scenario being used for testing whether candidate entity is in KB
        """
        if kb is not None:
            kb_entities = kb.entity_set
            if mentioned_entities is not None:
                kb_entities = kb_entities.union(mentioned_entities)
            kb_entity_types = kb.entity_type_set
        else:
            kb_entities = None
            kb_entity_types = None

        i = 0
        found_entities = []
        linked = []
        stop_words = set(['of'])
        while i < len(raw_tokens):
            candidate_entities = None
            single_char = False
            # Find longest phrase (if any) that matches an entity
            for l in range(6, 0, -1):
                phrase = ' '.join(raw_tokens[i:i + l])
                raw = raw_tokens[i:i + l]

                for idx, token in enumerate(raw):
                    results = self.lookup(token)
                    if idx == 0: candidate_entities = results
                    if token not in stop_words:
                        candidate_entities = list(
                            set(candidate_entities).intersection(set(results)))

                # Single character token so disregard candidate entities
                if l == 1 and len(phrase) == 1:
                    single_char = True
                    break

                # Found some match
                if len(candidate_entities) > 0:
                    if kb_entities is not None:
                        best_match = self.score_and_match(
                            phrase, candidate_entities, agent, uuid,
                            kb_entities, kb_entity_types, known_kb)
                    else:
                        # TODO: Fix default system, if no kb_entities provided -- only returns random candidate now
                        best_match = random.sample(candidate_entities, 1)[0]
                    # If best_match is entity from KB add to list
                    if best_match[1] is not None:
                        # Return as (surface form, (canonical, type))
                        linked.append((phrase, best_match))
                        found_entities.append((phrase, best_match))
                        i += l
                        break
                    else:
                        candidate_entities = None
                        continue

            if not candidate_entities or single_char:
                linked.append(raw_tokens[i])
                i += 1

        linked = self.combine_repeated_entity(linked)

        # Convert to Entity
        linked = [
            Entity.from_elements(x[0], x[1][0], x[1][1])
            if not isinstance(x, basestring) else x for x in linked
        ]

        # For computing per dialogue entities found
        if return_entities:
            return linked, found_entities

        return linked
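A self-contained sketch of the span-intersection idea described in the docstring, with a toy lexicon (the real lookup tables are built elsewhere):

lexicon = {
    'univ': set(['university of pennsylvania', 'university of washington']),
    'penn': set(['university of pennsylvania', 'penn state']),
}
stop_words = set(['of'])
candidates = None
for tok in ['univ', 'of', 'penn']:
    if tok in stop_words:
        continue
    results = lexicon.get(tok, set())
    candidates = results if candidates is None else candidates & results
print(candidates)  # set(['university of pennsylvania'])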
Example #17
 def tuple_to_entity(cls, token):
     if isinstance(token, list):
         return Entity(token[0], CanonicalEntity(*token[1]))
     else:
         return token
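A hypothetical round-trip (SomeClass stands in for whichever class owns this classmethod; entities dumped to JSON come back as nested lists):

#   SomeClass.tuple_to_entity(['apple', ['apple', 'company']])
#       -> Entity('apple', CanonicalEntity('apple', 'company'))
#   SomeClass.tuple_to_entity('work')  -> 'work' (non-list tokens pass through)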