def link_entity(self, text):
    """Link tokens to entities.

    Example: ['i', 'work', 'at', 'apple'] =>
             ['i', 'work', 'at', ('apple', ('apple', 'company'))]
    """
    doc = self.nlp(unicode(text))
    entities = []
    for np in doc.noun_chunks:
        s = np.text
        candidates = self.query(s, k=1)
        if candidates:
            sorted_candidates = sorted(candidates,
                    key=lambda x: fuzz.ratio(s.lower(), x.value.lower()),
                    reverse=True)
            for candidate in ifilter(lambda e: e.value.lower() not in self.stopwords,
                    sorted_candidates):
                if fuzz.ratio(s.lower(), candidate.value.lower()) > 80:
                    entity = Entity(surface=s, canonical=candidate)
                    entities.append((entity, np.start, np.end))
                    # Take the best matched candidate
                    break

    def overlap(e, entities):
        # Spans are half-open [start, end), so e and (entity, start, end) are
        # disjoint iff e starts at/after the other's end or ends at/before its start
        for entity in entities:
            if not (e.start >= entity[2] or e.end <= entity[1]):
                return True
        return False

    for ent in doc.ents:
        if not overlap(ent, entities):
            if ent.label_ == 'PERSON':
                entity = Entity.from_elements(surface=ent.text, value=ent.text, type='person')
                entities.append((entity, ent.start, ent.end))
            elif ent.label_ == 'WORK_OF_ART':
                entity = Entity.from_elements(surface=ent.text, value=ent.text, type='title')
                entities.append((entity, ent.start, ent.end))

    tokens = [tok.text for tok in doc]
    if not entities:
        entity_tokens = tokens
    else:
        last = 0
        entity_tokens = []
        entities = sorted(entities, key=lambda x: x[1])
        for entity, start, end in entities:
            entity_tokens.extend(tokens[last:start])
            entity_tokens.append(entity)
            last = end
        if last < len(tokens):
            entity_tokens.extend(tokens[last:])
    return entity_tokens
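# A minimal, runnable illustration of the half-open span logic used by
# overlap() above: spaCy spans are [start, end), so two spans are disjoint
# exactly when one ends at or before the other's start (hence the <= check).
def spans_overlap(s1, e1, s2, e2):
    return not (s1 >= e2 or e1 <= s2)

assert spans_overlap(0, 2, 1, 3)        # [0,2) and [1,3) share token 1
assert not spans_overlap(0, 2, 2, 4)    # adjacent half-open spans do not overlap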
def detect_slots(self, tokens, kb, context=None, join=False, stem=False):
    '''
    join: join consecutive slot words into one entity
    '''
    if not context:
        context = self.get_context_tokens((kb,), stem=stem)
    role = kb.facts['personal']['Role']
    category = kb.facts['item']['Category']
    labels = self._get_slot_labels(role, category, context, tokens, stem=stem)
    slot_entity = lambda s: Entity(surface=s,
                                   canonical=CanonicalEntity(value='', type='slot'))
    if not join:
        slots = [slot_entity(x) if labels[i] == 1 else x
                 for i, x in enumerate(tokens)]
    else:
        spans = self.label_to_span(labels)
        slots = [tokens[x] if not isinstance(x, tuple)
                 else slot_entity(' '.join(tokens[x[0]:x[1]]))
                 for x in spans]
    return slots
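# Hedged sketch of the two output modes above, using the containers from
# cocoa.core.entity (that import path appears elsewhere in this code; the
# tokens and labels here are illustrative).
from cocoa.core.entity import Entity, CanonicalEntity

tokens = ['has', 'leather', 'seats']
labels = [0, 1, 1]
slot = lambda s: Entity(surface=s, canonical=CanonicalEntity(value='', type='slot'))
# join=False: wrap each labeled token separately
unjoined = [slot(x) if labels[i] == 1 else x for i, x in enumerate(tokens)]
# join=True: label_to_span would merge the consecutive labels into one span,
# producing the single entity slot('leather seats')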
def _realize_entity(self, entity):
    if isinstance(entity, Entity):
        entity = entity.canonical
    elif isinstance(entity, CanonicalEntity):
        pass
    else:
        raise TypeError('Unknown entity')
    s = re.sub(r',|-|&', ' ', entity.value)
    tokens = [tok for tok in s.split()
              if tok.lower() not in ('the', 'of', 'and')]
    if entity.type == 'school':
        tokens = [tok for tok in tokens
                  if tok.lower() not in ('university', 'college', 'state', 'at')]
    elif entity.type == 'company':
        tokens = [tok for tok in tokens
                  if tok.lower() not in ('at', 'company', 'corporation', 'group')]
    surface = ' '.join(tokens[:2])
    return Entity(surface, entity)
def _realize_entity(self, entity):
    """Return an observed surface form of entity.

    Args:
        entity (Entity or CanonicalEntity)
    """
    entity = super(InverseLexicon, self)._realize_entity(entity)
    if entity.canonical not in self.inverse_lexicon:
        return entity
    entity = entity.canonical
    items = self.inverse_lexicon[entity].items()
    variants = [item[0] for item in items]
    counts = np.array([item[1] for item in items], dtype=np.float32)
    # Make the distribution peaky so frequent surface forms dominate
    peaky_counts = counts ** 2
    normal_counts = peaky_counts / np.sum(peaky_counts)
    try:
        idx = np.random.choice(np.arange(len(counts)), 1, p=normal_counts)[0]
    except ValueError:
        idx = np.argmax(counts)
    realized = variants[idx]
    return Entity(realized, entity)
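# Standalone illustration of the "peaky" sampling above: squaring counts
# before normalizing sharpens the distribution toward frequent surface forms.
import numpy as np

counts = np.array([8., 1., 1.], dtype=np.float32)
plain = counts / counts.sum()                # [0.8, 0.1, 0.1]
peaky = counts ** 2 / (counts ** 2).sum()    # ~[0.97, 0.015, 0.015]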
def detect_item(self, token):
    for item in self.items:
        # Anchor the pattern so that e.g. 'hat' does not match 'hatch'
        if re.match(r'{}s?$'.format(item), token) or \
                (item == 'ball' and re.match(r'(basket)?balls?$', token)):
            return Entity.from_elements(surface=token, value=item, type='item')
    return False
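# Runnable check of the anchored patterns above (the item set is an
# assumption, matching the deal-or-no-deal style domain this comes from).
import re

assert re.match(r'hats?$', 'hats')
assert not re.match(r'hats?$', 'hatch')
assert re.match(r'(basket)?balls?$', 'basketballs')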
def _init_prices(self):
    from core.price_tracker import PriceList
    self.p_list = PriceList.getPriceList().p_list
    from cocoa.core.entity import Entity, CanonicalEntity
    self.add_words([Entity(surface='', canonical=CanonicalEntity(value=p, type='price'))
                    for p in self.p_list])
def link_entity(self, raw_tokens, kb=None, scale=True, price_clip=None):
    """Convert number tokens in the tokenized sentence into price entities.

    Args:
        raw_tokens: list of dialogue tokens
        kb: agent knowledge base
        scale (bool): if True, scale prices to between 0 and 1
        price_clip (int, optional): if specified, ignore all numbers greater
            than price_clip (after scaling)

    Returns:
        A new list of tokens, with the number tokens replaced by entities.
    """
    tokens = ['<s>'] + raw_tokens + ['</s>']
    entity_tokens = []
    if kb:
        kb_numbers = self.get_kb_numbers(kb)
        list_price = kb.facts['item']['Price']
    for i in xrange(1, len(tokens) - 1):
        # Ignore the start and end tokens that were just added
        token = tokens[i]
        try:
            number = float(self.process_string(token))
            has_dollar = lambda token: token[0] == '$' or token[-1] == '$'
            # Check context
            if not has_dollar(token) and \
                    not self.is_price(tokens[i-1], tokens[i+1]):
                number = None
            # Avoid 'infinity' being recognized as a number
            elif number == float('inf') or number == float('-inf'):
                number = None
            # Check if the price is reasonable
            elif kb:
                if not has_dollar(token):
                    if number > 1.5 * list_price:
                        number = None
                    # Probably a spec number
                    if number != list_price and number in kb_numbers:
                        number = None
                if number is not None and price_clip is not None:
                    scaled_price = PriceScaler._scale_price(kb, number)
                    if abs(scaled_price) > price_clip:
                        number = None
        except ValueError:
            number = None
        if number is None:
            new_token = token
        else:
            assert not math.isnan(number)
            if scale:
                scaled_price = PriceScaler._scale_price(kb, number)
            else:
                scaled_price = number
            new_token = Entity(surface=token,
                               canonical=CanonicalEntity(value=scaled_price, type='price'))
        entity_tokens.append(new_token)
    return entity_tokens
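# Runnable illustration of the '<s>'/'</s>' padding above: it guarantees every
# real token has a left and right neighbor for the is_price context check.
raw_tokens = ['i', 'can', 'do', '90']
tokens = ['<s>'] + raw_tokens + ['</s>']
windows = [(tokens[i-1], tokens[i], tokens[i+1]) for i in xrange(1, len(tokens) - 1)]
assert windows[0] == ('<s>', 'i', 'can')
assert windows[-1] == ('do', '90', '</s>')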
def inform_title(self):
    intent = 'inform-new-title'
    title = self.choose_title()
    print 'chosen title:', title
    template = self.retrieve_response_template(intent, title=title)
    titles = [Entity.from_elements(surface=title, value=title, type='title')]
    lf = LF(intent, titles=titles)
    text = template['template']
    utterance = Utterance(raw_text=text, logical_form=lf, template=template)
    return self.message(utterance)
def detect_number(self, token):
    try:
        n = int(token)
    except ValueError:
        try:
            n = self.word_to_num[token]
        except KeyError:
            n = None
    if n is not None:
        return Entity.from_elements(surface=token, value=n, type='number')
    return False
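# Standalone version of the same int-or-number-word fallback; the word_to_num
# table here is an illustrative stub.
word_to_num = {'one': 1, 'two': 2, 'three': 3}

def to_number(token):
    try:
        return int(token)
    except ValueError:
        return word_to_num.get(token)  # None when token is not a number word

assert to_number('2') == 2
assert to_number('two') == 2
assert to_number('cat') is None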
def lf_to_tokens(self, kb, lf):
    intent = lf['intent']
    if intent == 'accept':
        intent = markers.ACCEPT
    elif intent == 'reject':
        intent = markers.REJECT
    elif intent == 'quit':
        intent = markers.QUIT
    elif intent == 'offer':
        intent = markers.OFFER
    tokens = [intent]
    if lf.get('price') is not None:
        p = lf['price']
        price = Entity.from_elements(surface=p, value=p, type='price')
        tokens.append(PriceScaler.scale_price(kb, price))
    return tokens
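# The if/elif chain above is a fixed mapping; a hedged equivalent sketch
# (markers.* taken from the code above, with a pass-through default):
# INTENT_MAP = {'accept': markers.ACCEPT, 'reject': markers.REJECT,
#               'quit': markers.QUIT, 'offer': markers.OFFER}
# intent = INTENT_MAP.get(lf['intent'], lf['intent'])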
def __init__(self, raw_text, tokens, action='message'):
    self.text = raw_text
    self.tokens = tokens
    self.action = action
    self.prices = []
    self.keywords = []
    self.speech_acts = []
    self.stage = -1
    self.categories = defaultdict(lambda: defaultdict(int))
    if self.action == 'message':
        self.prices = [token for token in self.tokens if is_entity(token)]
    elif self.action == 'offer':
        price = self.text
        self.prices.append(Entity(price, CanonicalEntity(float(price), 'price')))
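# Runnable sketch of the 'message' branch (assumes cocoa.core.entity also
# exports the is_entity helper used above; token values are illustrative).
from cocoa.core.entity import Entity, CanonicalEntity, is_entity

tokens = ['i', 'can', 'do', Entity('$90', CanonicalEntity(90.0, 'price'))]
prices = [tok for tok in tokens if is_entity(tok)]
assert len(prices) == 1 and prices[0].canonical.value == 90.0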
def rewrite_candidate(self, fillers, candidate):
    rewritten = []
    tokens = candidate
    if not tokens:
        return rewritten
    for i, tok in enumerate(tokens):
        if is_entity(tok) and tok.canonical.type == 'slot':
            for filler in fillers:
                ss = filler.split()
                new_tokens = list(tokens)
                del new_tokens[i]
                for j, s in enumerate(ss):
                    new_tokens.insert(i + j, Entity(s, CanonicalEntity('', 'slot')))
                new_cand = new_tokens
                rewritten.append(new_cand)
    return rewritten
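# Runnable illustration of the splice above: delete the slot token, then
# insert one token per filler word at the same position.
tokens = ['has', '<slot>', 'interior']
filler_words = ['leather', 'seats']
new_tokens = list(tokens)
del new_tokens[1]
for j, s in enumerate(filler_words):
    new_tokens.insert(1 + j, s)
assert new_tokens == ['has', 'leather', 'seats', 'interior']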
def load_candidates(self, paths):
    candidates = defaultdict(list)
    # When dumped to json, a NamedTuple becomes a list. Convert it back here.
    is_str = lambda x: isinstance(x, basestring)
    # x[0] is the surface form of the entity; note that for prices from the
    # offer action, the surface is a float instead of a string
    to_ent = lambda x: x.encode('utf-8') if is_str(x) else \
        Entity(x[0].encode('utf-8') if is_str(x[0]) else x[0],
               CanonicalEntity(*x[1]))
    for path in paths:
        print 'Load candidates from', path
        results = read_json(path)
        for r in results:
            # None for encoding turns
            if r['candidates'] is None:
                candidates[(r['uuid'], r['role'])].append(None)
            else:
                # Only take the response (list of tokens)
                candidates_ = [[to_ent(x) for x in c['response']]
                               for c in ifilter(lambda x: 'response' in x, r['candidates'])]
                candidates[(r['uuid'], r['role'])].append(candidates_)
    return candidates
@classmethod
def price_to_entity(cls, price):
    return Entity(price, CanonicalEntity(price, 'price'))
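# Hedged usage note: unlike linked utterance tokens, the surface and the
# canonical value are the same number here (the owning class is not shown above):
# price_to_entity(90.0)
# => Entity(surface=90.0, canonical=CanonicalEntity(value=90.0, type='price'))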
def link_entity(self, raw_tokens, return_entities=False, agent=1, uuid="NONE",
                kb=None, mentioned_entities=None, known_kb=True):
    """Add detected entities to each token.

    Example: ['i', 'work', 'at', 'apple'] =>
             ['i', 'work', 'at', ('apple', ('apple', 'company'))]

    Note: Linking works differently here because we consider the intersection
    of candidate lists across token spans, so that "univ of penn" looks up
    "univ" and "penn" in our lexicon table (disregarding stop words and
    special tokens) and takes their intersection.

    :param return_entities: Whether to also return the entities found in the utterance
    :param agent: Agent (0, 1) whose utterance is being linked
    :param uuid: uuid of the scenario, used to test whether a candidate entity is in the KB
    """
    if kb is not None:
        kb_entities = kb.entity_set
        if mentioned_entities is not None:
            kb_entities = kb_entities.union(mentioned_entities)
        kb_entity_types = kb.entity_type_set
    else:
        kb_entities = None
        kb_entity_types = None

    i = 0
    found_entities = []
    linked = []
    stop_words = set(['of'])
    while i < len(raw_tokens):
        candidate_entities = None
        single_char = False
        # Find the longest phrase (if any) that matches an entity
        for l in range(6, 0, -1):
            phrase = ' '.join(raw_tokens[i:i + l])
            raw = raw_tokens[i:i + l]
            for idx, token in enumerate(raw):
                results = self.lookup(token)
                if idx == 0:
                    candidate_entities = results
                if token not in stop_words:
                    candidate_entities = list(set(candidate_entities).intersection(set(results)))

            # Single character token, so disregard candidate entities
            if l == 1 and len(phrase) == 1:
                single_char = True
                break

            # Found some match
            if len(candidate_entities) > 0:
                if kb_entities is not None:
                    best_match = self.score_and_match(phrase, candidate_entities, agent,
                                                      uuid, kb_entities, kb_entity_types, known_kb)
                else:
                    # TODO: Fix default system; with no kb_entities provided this
                    # only returns a random candidate for now
                    best_match = random.sample(candidate_entities, 1)[0]
                # If best_match is an entity from the KB, add it to the list
                if best_match[1] is not None:
                    # Return as (surface form, (canonical, type))
                    linked.append((phrase, best_match))
                    found_entities.append((phrase, best_match))
                    i += l
                    break
                else:
                    candidate_entities = None
                    continue

        if not candidate_entities or single_char:
            linked.append(raw_tokens[i])
            i += 1

    linked = self.combine_repeated_entity(linked)
    # Convert to Entity
    linked = [Entity.from_elements(x[0], x[1][0], x[1][1])
              if not isinstance(x, basestring) else x for x in linked]

    # For computing per-dialogue entities found
    if return_entities:
        return linked, found_entities
    return linked
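# Runnable illustration of the span-intersection idea above: candidates for a
# multi-token phrase are the entities shared by all non-stopword tokens
# (the lookup table contents here are illustrative).
lookup = {'univ': set([('university of pennsylvania', 'school'),
                       ('university of washington', 'school')]),
          'penn': set([('university of pennsylvania', 'school'),
                       ('penn state', 'school')])}
stop_words = set(['of'])
candidates = None
for token in ['univ', 'of', 'penn']:
    if token in stop_words:
        continue
    results = lookup.get(token, set())
    candidates = results if candidates is None else candidates & results
assert candidates == set([('university of pennsylvania', 'school')])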
@classmethod
def tuple_to_entity(cls, token):
    if isinstance(token, list):
        return Entity(token[0], CanonicalEntity(*token[1]))
    else:
        return token
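# Runnable round-trip sketch: json turns the (namedtuple-style) Entity into
# nested lists, and the conversion above restores it (assumes the cocoa repo
# is importable and that Entity/CanonicalEntity are namedtuples, as their
# use above suggests).
import json
from cocoa.core.entity import Entity, CanonicalEntity

ent = Entity('apple', CanonicalEntity('apple', 'company'))
dumped = json.loads(json.dumps(ent))              # ['apple', ['apple', 'company']]
restored = Entity(dumped[0], CanonicalEntity(*dumped[1]))
assert restored == ent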