def test(self): entities = [ CanonicalEntity("amanda", "name"), CanonicalEntity("rowan eastern college of the arts", "school"), CanonicalEntity("bible studies", "major") ] realized = self.realize_entity(entities) print realized
def from_csv(cls, path, threshold):
    """Build an instance from a comma-separated file with a ``title`` column.

    Every data row contributes one ``CanonicalEntity`` of type ``'title'``
    whose value is the row's ``title`` field.

    Args:
        path: location of the CSV file (first row is the header).
        threshold: passed through unchanged to the constructor.

    Returns:
        ``cls(entities=..., threshold=threshold)``.
    """
    with open(path, 'r') as handle:
        records = csv.DictReader(handle, delimiter=',')
        titles = [
            CanonicalEntity(value=record['title'], type='title')
            for record in records
        ]
    return cls(entities=titles, threshold=threshold)
def detect_slots(self, tokens, kb, context=None, join=False, stem=False):
    """Replace the tokens labelled as slot words with slot entities.

    Args:
        tokens: sequence of tokens to label.
        kb: knowledge base; its ``facts`` provide the role and item category.
        context: context tokens; when falsy they are obtained from
            ``self.get_context_tokens`` for this ``kb``.
        join: if True, consecutive slot words are joined into one entity
            (using the spans returned by ``self.label_to_span``).
        stem: forwarded to the context and labelling helpers.

    Returns:
        A list mixing plain tokens and slot ``Entity`` objects.
    """
    if not context:
        context = self.get_context_tokens((kb, ), stem=stem)

    role = kb.facts['personal']['Role']
    category = kb.facts['item']['Category']
    labels = self._get_slot_labels(role, category, context, tokens, stem=stem)

    def make_slot(surface):
        # Slot entities carry the surface text and an empty canonical value.
        return Entity(surface=surface,
                      canonical=CanonicalEntity(value='', type='slot'))

    if join:
        merged = []
        for span in self.label_to_span(labels):
            if isinstance(span, tuple):
                # A tuple span is a (start, end) slice of slot words.
                merged.append(make_slot(' '.join(tokens[span[0]:span[1]])))
            else:
                # Anything else is the index of a single non-slot token.
                merged.append(tokens[span])
        return merged

    return [
        make_slot(tok) if labels[idx] == 1 else tok
        for idx, tok in enumerate(tokens)
    ]
def _tokens_to_event(self, tokens, output_data, semi_event=False):
    """Convert a generated token sequence into an agent action.

    Args:
        tokens: tuple or list of tokens. ``tokens[0]`` may be a vocabulary
            index (int) and ``tokens[1]`` may be a float price; both are
            converted in place. ``tokens[1]`` is read unconditionally, so
            the sequence must have at least two elements.
        output_data: attached to the produced action's metadata under the
            ``"output_data"`` key.
        semi_event: if True, only the price in ``tokens[1]`` is resolved via
            ``self.builder.get_price_number`` and the token list itself is
            returned instead of an action.

    Returns:
        The token list when ``semi_event`` is true; otherwise the result of
        ``self.offer`` / ``self.accept`` / ``self.reject`` / ``self.quit`` /
        ``self.message``.
    """
    # Disabled debugging override (forces agent 0 to always offer $60):
    # if self.agent == 0 :
    #     try:
    #         tokens = [0, 0]
    #         tokens[0] = markers.OFFER
    #         tokens[1] = '$60'
    #     except ValueError:
    #         #return None
    #         pass
    # Work on a list so individual positions can be replaced.
    if isinstance(tokens, tuple):
        tokens = list(tokens)

    # A bare float in the second position is wrapped as a price entity.
    # NOTE(review): raises IndexError for a one-element sequence — confirm
    # callers always pass at least two tokens.
    if isinstance(tokens[1], float):
        tokens[1] = CanonicalEntity(type='price', value=tokens[1])

    if semi_event:
        # From scale to real price
        # print('semi_event: {}->'.format(tokens[1]),end='')
        if tokens[1] is not None:
            tokens[1] = self.builder.get_price_number(tokens[1], self.kb)
        # print('{}.'.format(tokens[1]))
        return tokens

    # Map a vocabulary index back to its word (e.g. an action marker).
    if isinstance(tokens[0], int):
        tokens[0] = self.env.vocab.to_word(tokens[0])

    if len(tokens) > 1 and tokens[0] == markers.OFFER and is_entity(tokens[1]):
        try:
            price = self.builder.get_price_number(tokens[1], self.kb)
            return self.offer({'price': price}, metadata={"output_data": output_data})
        except ValueError:
            # Price could not be resolved: fall through and send the tokens
            # as a regular message instead.
            # return None
            pass
    elif tokens[0] == markers.OFFER:
        # An offer marker without a price entity is treated as a bug.
        assert False

    tokens = self.builder.entity_to_str(tokens, self.kb)

    if len(tokens) > 0:
        if tokens[0] == markers.ACCEPT:
            return self.accept(metadata={"output_data": output_data})
        elif tokens[0] == markers.REJECT:
            return self.reject(metadata={"output_data": output_data})
        elif tokens[0] == markers.QUIT:
            return self.quit(metadata={"output_data": output_data})

    # Drop trailing None placeholders before joining into text.
    while len(tokens) > 0 and tokens[-1] == None:
        tokens = tokens[:-1]
    s = self.attach_punct(' '.join(tokens))
    # print 'send:', s
    # print(">>> sender's intent: ", tokens)
    role = self.kb.facts['personal']['Role']
    category = self.kb.facts['item']['Category']
    real_uttr = self.uttr_gen(tokens, role, category)
    # print(">>> sender's uttr: ", real_uttr)
    return self.message(s, metadata={"output_data": output_data, "real_uttr": real_uttr})
def _init_prices(self):
    """Store the price list on ``self.p_list`` and add one price entity per value.

    The list comes from ``PriceList.getPriceList().p_list``; each price is
    added through ``self.add_words`` as an ``Entity`` with an empty surface.
    """
    # Imports are local to this method, as in the original.
    from core.price_tracker import PriceList
    self.p_list = PriceList.getPriceList().p_list

    from cocoa.core.entity import Entity, CanonicalEntity
    price_words = []
    for price in self.p_list:
        canonical = CanonicalEntity(value=price, type='price')
        price_words.append(Entity(surface='', canonical=canonical))
    self.add_words(price_words)
def int_to_text(self, inds, stage=None, prices=None):
    """Map vocabulary indices back to tokens (inverse of text_to_int).

    Args:
        inds: iterable of vocabulary indices.
        stage: accepted for interface compatibility; not used here.
        prices: optional sequence aligned one-to-one with ``inds``. When
            given, every token for which ``price_filler`` is true is
            replaced by a price ``CanonicalEntity`` holding the price at
            the same position.

    Returns:
        List of tokens (words and, possibly, price entities).
    """
    words = [self.vocab.to_word(index) for index in inds]
    if prices is None:
        return words

    assert len(inds) == len(prices)
    filled = []
    for word, price in izip(words, prices):
        if price_filler(word):
            filled.append(CanonicalEntity(value=price, type='price'))
        else:
            filled.append(word)
    return filled
def link_entity(self, raw_tokens, kb=None, scale=True, price_clip=None):
    """Replace the price-like number tokens of a sentence with price entities.

    A token is considered a price when ``self.process_string(token)``
    parses as a non-infinite float and either the token starts or ends
    with ``$`` or ``self.is_price`` accepts its two neighbouring tokens.
    When ``kb`` is given, tokens without a dollar sign are additionally
    rejected if they exceed 1.5 times the list price, or if they differ
    from the list price but appear among the KB numbers.

    Args:
        raw_tokens: List of dialogue tokens.
        kb: Agent knowledge base; ``kb.facts['item']['Price']`` is used as
            the list price.
        scale (bool): If True, the entity value is
            ``PriceScaler._scale_price(kb, number)`` (per the original
            documentation this maps prices into the 0-1 range); otherwise
            the raw number is used.
        price_clip (optional): If specified and ``kb`` is given, numbers
            whose scaled price has an absolute value greater than
            ``price_clip`` are left as plain tokens.

    Returns:
        A new list of tokens, with the price tokens replaced by ``Entity``
        objects of canonical type ``'price'``.
    """
    # Sentinels so that every real token has a left and right neighbour.
    tokens = ['<s>'] + raw_tokens + ['</s>']
    entity_tokens = []
    if kb:
        kb_numbers = self.get_kb_numbers(kb)
        list_price = kb.facts['item']['Price']
    for i in xrange(1, len(tokens)-1):  # Ignore the start and end tokens that were just added
        token = tokens[i]
        try:
            # ValueError here means the token is not a number at all.
            number = float(self.process_string(token))
            has_dollar = lambda token: token[0] == '$' or token[-1] == '$'
            # Check context
            if not has_dollar(token) and \
                    not self.is_price(tokens[i-1], tokens[i+1]):
                number = None
            # Avoid 'infinity' being recognized as a number
            elif number == float('inf') or number == float('-inf'):
                number = None
            # Check if the price is reasonable
            elif kb:
                if not has_dollar(token):
                    if number > 1.5 * list_price:
                        number = None
                    # Probably a spec number
                    if number != list_price and number in kb_numbers:
                        number = None
                if number is not None and price_clip is not None:
                    scaled_price = PriceScaler._scale_price(kb, number)
                    if abs(scaled_price) > price_clip:
                        number = None
        except ValueError:
            number = None
        if number is None:
            # Not a price: keep the token unchanged.
            new_token = token
        else:
            assert not math.isnan(number)
            # NOTE(review): with kb=None and scale=True this passes None to
            # PriceScaler._scale_price — confirm that is supported.
            if scale:
                scaled_price = PriceScaler._scale_price(kb, number)
            else:
                scaled_price = number
            new_token = Entity(surface=token, canonical=CanonicalEntity(value=scaled_price, type='price'))
        entity_tokens.append(new_token)
    return entity_tokens
def get_entity_coords(self):
    """Map each KB entity to the list of row indices where it occurs.

    An entity is the lowercased attribute value of an item paired with the
    attribute's ``value_type``. A row index is appended once per matching
    attribute, so it can repeat if two columns hold the same entity.

    Returns:
        defaultdict(list) of {CanonicalEntity: [row, ...]}.
    """
    coords = defaultdict(list)
    for row_index, item in enumerate(self.kb.items):
        for attr in self.kb.attributes:
            cell_value = item[attr.name].lower()
            key = CanonicalEntity(value=cell_value, type=attr.value_type)
            coords[key].append(row_index)
    return coords
def count_entity(self):
    """Count the occurrences of every entity across all KB items.

    Each attribute value is lowercased and paired with the type looked up
    in ``self.attr_type`` for that attribute name.

    Returns:
        defaultdict(int) of {CanonicalEntity: count}.
    """
    counts = defaultdict(int)
    for item in self.kb.items:
        for name, raw_value in item.iteritems():
            key = CanonicalEntity(raw_value.lower(), self.attr_type[name])
            counts[key] = counts[key] + 1
    return counts
def from_file(cls, inverse_lexicon_path):
    """Build an instance from a tab-separated inverse-lexicon file.

    Every line must hold exactly three tab-separated fields:
    entity value, surface span, entity type. For each entity the number of
    times every surface form appears is counted.

    Args:
        inverse_lexicon_path: path of the file to read.

    Returns:
        ``cls`` called with a {CanonicalEntity: Counter of surfaces} mapping.
    """
    surface_counts = defaultdict(Counter)
    with open(inverse_lexicon_path, "r") as lexicon_file:
        for raw_line in lexicon_file:
            fields = raw_line.strip().split("\t")
            # Unpacking enforces the three-field format (ValueError otherwise).
            value, surface, type_ = fields
            surface_counts[CanonicalEntity(value, type_)][surface] += 1
    return cls(surface_counts)
def __init__(self, raw_text, tokens, action='message'):
    """Store an utterance and extract the prices it mentions.

    Args:
        raw_text: original text; for an ``'offer'`` action it is the price
            and must be convertible with ``float``.
        tokens: tokenized utterance, possibly containing entities.
        action: ``'message'`` (default) or ``'offer'``; for any other value
            ``prices`` stays empty.
    """
    self.text = raw_text
    self.tokens = tokens
    self.action = action

    # Annotation fields that start out empty.
    self.prices = []
    self.keywords = []
    self.speech_acts = []
    self.stage = -1
    self.categories = defaultdict(lambda: defaultdict(int))

    if action == 'message':
        # Every entity token in a message is kept as a price mention.
        mentioned = []
        for tok in tokens:
            if is_entity(tok):
                mentioned.append(tok)
        self.prices = mentioned
    elif action == 'offer':
        # The offer text itself is the price.
        offered = Entity(raw_text, CanonicalEntity(float(raw_text), 'price'))
        self.prices.append(offered)
def rewrite_candidate(self, fillers, candidate):
    """Expand the slot entities of ``candidate`` with every filler.

    For each slot token and each filler, one copy of the candidate is
    produced in which that slot token is replaced by one slot ``Entity``
    per whitespace-separated word of the filler. Other tokens, including
    other slots, are left untouched in that copy.

    Args:
        fillers: iterable of filler strings.
        candidate: list of tokens; may be empty or None.

    Returns:
        List of rewritten token lists; empty if ``candidate`` is falsy or
        contains no slot entity.
    """
    if not candidate:
        return []

    variants = []
    for position, token in enumerate(candidate):
        if not (is_entity(token) and token.canonical.type == 'slot'):
            continue
        for filler in fillers:
            replacement = [
                Entity(word, CanonicalEntity('', 'slot'))
                for word in filler.split()
            ]
            variant = list(candidate)
            # Swap the single slot token for the filler's word entities.
            variant[position:position + 1] = replacement
            variants.append(variant)
    return variants
def link_entity(self, raw_tokens, kb=None, scale=True, price_clip=None):
    """Replace the price-like number tokens of a sentence with price entities.

    Args:
        raw_tokens: list of dialogue tokens.
        kb: optional knowledge base, used to reject implausible numbers
            (more than 1.5x the list price, or matching another KB number)
            for tokens that carry no dollar sign.
        scale: if True the entity value is
            ``PriceScaler._scale_price(kb, number)``, else the raw number.
        price_clip: if given (and ``kb`` is set), numbers whose scaled
            price exceeds it in absolute value are left as plain tokens.

    Returns:
        A new list in which price tokens are ``Entity`` objects of
        canonical type ``'price'`` and all other tokens are unchanged.
    """
    # Pad so each real token has both neighbours for the context check.
    padded = ['<s>'] + raw_tokens + ['</s>']
    if kb:
        kb_numbers = self.get_kb_numbers(kb)
        list_price = kb.facts['item']['Price']

    linked = []
    for pos in xrange(1, len(padded) - 1):
        token = padded[pos]
        try:
            number = float(self.process_string(token))
            dollar_marked = token[0] == '$' or token[-1] == '$'
            if not dollar_marked and \
                    not self.is_price(padded[pos - 1], padded[pos + 1]):
                # No dollar sign and no price-like context.
                number = None
            elif number == float('inf') or number == float('-inf'):
                # Avoid 'infinity' being recognized as a number.
                number = None
            elif kb:
                # Check if the price is reasonable.
                if not dollar_marked:
                    if number > 1.5 * list_price:
                        number = None
                    # Probably a spec number.
                    if number != list_price and number in kb_numbers:
                        number = None
                if number is not None and price_clip is not None:
                    clipped = PriceScaler._scale_price(kb, number)
                    if abs(clipped) > price_clip:
                        number = None
        except ValueError:
            number = None

        if number is None:
            linked.append(token)
            continue

        assert not math.isnan(number)
        value = PriceScaler._scale_price(kb, number) if scale else number
        linked.append(Entity(surface=token,
                             canonical=CanonicalEntity(value=value, type='price')))
    return linked
def _tokens_to_event(self, tokens, output_data):
    """Convert a generated token sequence into an agent action.

    Args:
        tokens: non-empty tuple or list of tokens. ``tokens[0]`` may be a
            vocabulary index (int); a float in ``tokens[1]`` is wrapped as
            a price entity. Lists are modified in place.
        output_data: passed as ``metadata`` to the produced action.

    Returns:
        The result of ``self.offer`` / ``self.accept`` / ``self.reject`` /
        ``self.quit`` for the matching marker, otherwise the result of
        ``self.message`` with the tokens joined into text.

    Raises:
        AssertionError: if the first token is the offer marker but no price
            entity follows it.
    """
    # Work on a list so individual positions can be replaced.
    if isinstance(tokens, tuple):
        tokens = list(tokens)

    if isinstance(tokens[0], int):
        tokens[0] = self.env.vocab.to_word(tokens[0])

    # Single-token sequences (e.g. a bare accept/reject marker) have no
    # second position; only inspect it when it exists.
    has_argument = len(tokens) > 1
    if has_argument and isinstance(tokens[1], float):
        tokens[1] = CanonicalEntity(type='price', value=tokens[1])

    if has_argument and tokens[0] == markers.OFFER and is_entity(tokens[1]):
        try:
            price = self.builder.get_price_number(tokens[1], self.kb)
            return self.offer({'price': price}, metadata=output_data)
        except ValueError:
            # Price could not be resolved: deliberately fall through and
            # send the tokens as a regular message.
            pass
    elif tokens[0] == markers.OFFER:
        # Raised explicitly (not via ``assert``) so the check survives -O.
        raise AssertionError(
            'offer marker without a price entity: %r' % (tokens,))

    tokens = self.builder.entity_to_str(tokens, self.kb)

    if len(tokens) > 0:
        if tokens[0] == markers.ACCEPT:
            return self.accept(metadata=output_data)
        elif tokens[0] == markers.REJECT:
            return self.reject(metadata=output_data)
        elif tokens[0] == markers.QUIT:
            return self.quit(metadata=output_data)

    # Drop trailing None placeholders before joining into text.
    while len(tokens) > 0 and tokens[-1] is None:
        tokens = tokens[:-1]
    s = self.attach_punct(' '.join(tokens))
    return self.message(s, metadata=output_data)
def get_same_row_entities(self, entities): """Return entities in the same row as `entities`. """ print 'get same row:', entities rows = set(range(len(self.kb.items))) for entity in entities: rows = rows.intersection(set(self.entity_coords[entity])) row_entities = [] for row in rows: ents = [] if self.item_weights[row] < 0: continue item = self.kb.items[row] for col, attr in enumerate(self.kb.attributes): entity = CanonicalEntity(value=item[attr.name].lower(), type=attr.value_type) if not entity in entities: ents.append(entity) if ents: row_entities.append(ents) return row_entities
def load_candidates(self, paths): candidates = defaultdict(list) # When dumped to json, NamedTuple becomes list. Now convert it back. is_str = lambda x: isinstance(x, basestring) # x[0] (surface of entity): note that for prices from the offer action, # surface is float instead of string to_ent = lambda x: x.encode('utf-8') if is_str(x) else \ Entity(x[0].encode('utf-8') if is_str(x[0]) else x[0], CanonicalEntity(*x[1])) for path in paths: print 'Load candidates from', path results = read_json(path) for r in results: # None for encoding turns if r['candidates'] is None: candidates[(r['uuid'], r['role'])].append(None) else: # Only take the response (list of tokens) candidates_ = [[to_ent(x) for x in c['response']] for c in ifilter(lambda x: 'response' in x, r['candidates'])] candidates[(r['uuid'], r['role'])].append(candidates_) return candidates
def from_json(cls, path, threshold):
    """Build an instance from a JSON file holding a list of title records.

    Each record must have a ``'title'`` key; it becomes a
    ``CanonicalEntity`` of type ``'title'``.

    Args:
        path: location of the JSON file.
        threshold: passed through unchanged to the constructor.

    Returns:
        ``cls(entities=..., threshold=threshold)``.
    """
    # Use a context manager so the file handle is closed even if parsing
    # fails (the handle was previously left open).
    with open(path, 'r') as json_file:
        rows = json.load(json_file)
    entities = [
        CanonicalEntity(value=row['title'], type='title') for row in rows
    ]
    return cls(entities=entities, threshold=threshold)
def price_to_entity(cls, price):
    """Wrap ``price`` in an ``Entity`` whose surface and canonical value are both the price."""
    canonical = CanonicalEntity(price, 'price')
    return Entity(price, canonical)
def tuple_to_entity(cls, token):
    """Rebuild an ``Entity`` from its list form; return any other token as is.

    A list token is read as ``[surface, canonical_fields]``, where
    ``canonical_fields`` are the positional arguments of ``CanonicalEntity``.
    """
    if not isinstance(token, list):
        return token
    surface = token[0]
    canonical_fields = token[1]
    return Entity(surface, CanonicalEntity(*canonical_fields))
def to_word(self, ind):
    """Return the word for a vocabulary index, or a price entity otherwise.

    Integer inputs are looked up in ``self.ind_to_word``; any non-integer
    input is wrapped as a ``CanonicalEntity`` of type ``'price'``.
    """
    if not isinstance(ind, int):
        # Imported locally, as in the original.
        from cocoa.core.entity import CanonicalEntity
        return CanonicalEntity(value=ind, type='price')
    return self.ind_to_word[ind]