def test_basic_parse(self):
    """Check that the dictionary correctly adds and parses a short sentence.

    After observing ``'hello world'`` the dictionary must grow by exactly two
    entries, and parsing the same sentence must yield the two ids that
    directly follow the built-in entries, whatever ``vec_type`` is requested.
    """
    from parlai.core.dict import DictionaryAgent
    from parlai.core.params import ParlaiParser

    argparser = ParlaiParser()
    DictionaryAgent.add_cmdline_args(argparser)
    # Parse an explicit empty argument list: with no list given, argparse
    # falls back to sys.argv, which contains the test runner's own flags.
    opt = argparser.parse_args([], print_args=False)
    dictionary = DictionaryAgent(opt)
    num_builtin = len(dictionary)

    dictionary.observe({'text': 'hello world'})
    dictionary.act()
    assert len(dictionary) - num_builtin == 2

    # Same expectations for the default vector type, list and tuple.
    for parse_kwargs in ({}, {'vec_type': list}, {'vec_type': tuple}):
        vec = dictionary.parse('hello world', **parse_kwargs)
        assert len(vec) == 2
        assert vec[0] == num_builtin
        assert vec[1] == num_builtin + 1
def test_basic_parse(self):
    """Check that the dictionary correctly adds and parses a short sentence.

    After observing ``'hello world'`` the dictionary must grow by exactly two
    entries, and parsing the same sentence must yield the two ids that
    directly follow the built-in entries, whatever ``vec_type`` is requested.
    """
    from parlai.core.dict import DictionaryAgent
    from parlai.core.params import ParlaiParser

    argparser = ParlaiParser()
    DictionaryAgent.add_cmdline_args(argparser)
    # Parse an explicit empty argument list: with no list given, argparse
    # falls back to sys.argv, which contains the test runner's own flags.
    opt = argparser.parse_args([])
    dictionary = DictionaryAgent(opt)
    num_builtin = len(dictionary)

    dictionary.observe({'text': 'hello world'})
    dictionary.act()
    assert len(dictionary) - num_builtin == 2

    # Same expectations for the default vector type, list and tuple.
    for parse_kwargs in ({}, {'vec_type': list}, {'vec_type': tuple}):
        vec = dictionary.parse('hello world', **parse_kwargs)
        assert len(vec) == 2
        assert vec[0] == num_builtin
        assert vec[1] == num_builtin + 1
def test_basic_parse(self):
    """
    Verify that a short sentence is added to and parsed by the dictionary.

    Observing ``'hello world'`` must add exactly two entries, and parsing it
    must return the two ids that follow the built-in entries for the default
    vector type as well as for ``list`` and ``tuple``.
    """
    parser = ParlaiParser()
    DictionaryAgent.add_cmdline_args(parser, partial_opt=None)
    word_dict = DictionaryAgent(parser.parse_args([]))
    first_new_id = len(word_dict)

    word_dict.observe({'text': 'hello world'})
    word_dict.act()
    added = len(word_dict) - first_new_id
    assert added == 2

    # Default vec_type first, then the two explicit container types.
    for extra in ({}, {'vec_type': list}, {'vec_type': tuple}):
        parsed = word_dict.parse('hello world', **extra)
        assert len(parsed) == 2
        assert parsed[0] == first_new_id
        assert parsed[1] == first_new_id + 1
class NERDictionaryAgent(DictionaryAgent):
    """Named Entity Recognition dictionary agent.

    Wraps a second ``DictionaryAgent`` (``labels_dict``) next to the main
    dictionary: each observation is split so that the text is fed to this
    dictionary and the labels are fed to ``labels_dict``.
    """

    @staticmethod
    def add_cmdline_args(argparser):
        """Add command line arguments.

        Registers the base dictionary arguments and then ``--dict_class``
        with this class as the default.

        Returns:
            the argument group returned by
            ``DictionaryAgent.add_cmdline_args``.
        """
        group = DictionaryAgent.add_cmdline_args(argparser)
        group.add_argument(
            '--dict_class',
            default=class2str(NERDictionaryAgent),
            help='Sets the dictionary\'s class',
        )
        # Hand the group back like the parent does so callers can extend it.
        return group

    def __init__(self, opt, shared=None):
        """Initialize NER dictionary agent.

        Args:
            opt: options dict; ``opt['dict_file']`` is read to derive the
                label dictionary's file name.
            shared: passed through unchanged to both dictionaries.
        """
        # The label dictionary gets its own copy of the options so that the
        # modified 'dict_file' does not leak into the caller's opt.
        child_opt = copy.deepcopy(opt)
        child_opt['dict_file'] = child_opt['dict_file'] + '.labels.dict'
        self.labels_dict = DictionaryAgent(child_opt, shared)
        self.char_dict = get_char_dict()
        super().__init__(opt, shared)

    def observe(self, observation):
        """Get the data from the observation.

        The label dictionary receives a copy with ``'text'`` set to None and
        the main dictionary receives a copy with ``'labels'`` set to None;
        the caller's observation is never modified.

        Returns:
            whatever the parent ``observe`` returns for the text-only copy.
        """
        observation = copy.deepcopy(observation)
        labels_observation = copy.deepcopy(observation)
        labels_observation['text'] = None
        observation['labels'] = None
        self.labels_dict.observe(labels_observation)
        return super().observe(observation)

    def act(self):
        """Run ``act`` on the label dictionary and on this dictionary.

        Returns:
            a dict with only an ``'id'`` entry (``'NERDictionary'``).
        """
        self.labels_dict.act()
        super().act()
        return {'id': 'NERDictionary'}

    def save(self, filename=None, append=False, sort=True):
        """Save dictionary to the file.

        The label dictionary is written to ``filename + '.labels.dict'``
        before the main dictionary is saved.

        Args:
            filename: filename of the dictionary; when None,
                ``self.opt['model_file']`` is used
            append: boolean whether to append to the existing dict
            sort: boolean which determines whether to sort the dict or not

        Returns:
            the result of the parent ``save`` call
        """
        filename = self.opt['model_file'] if filename is None else filename
        self.labels_dict.save(filename + '.labels.dict')
        return super().save(filename, append, sort)

    def tokenize(self, text, building=False):
        """Tokenize given text by splitting on single spaces.

        Args:
            text: string to split; a falsy value (None or '') yields [].
            building: accepted but not used by this implementation.

        Returns:
            list of tokens.
        """
        return text.split(' ') if text else []
class NERDictionaryAgent(DictionaryAgent):
    """Dictionary agent that keeps a separate dictionary for labels.

    Wraps a second ``DictionaryAgent`` (``labels_dict``) next to the main
    dictionary: each observation is split so that the text is fed to this
    dictionary and the labels are fed to ``labels_dict``.
    """

    @staticmethod
    def add_cmdline_args(argparser):
        """Register the base dictionary arguments plus ``--dict_class``.

        ``--dict_class`` defaults to this class.

        Returns:
            the argument group returned by
            ``DictionaryAgent.add_cmdline_args``.
        """
        group = DictionaryAgent.add_cmdline_args(argparser)
        group.add_argument(
            '--dict_class',
            default=class2str(NERDictionaryAgent),
            help='Sets the dictionary\'s class',
        )
        # Hand the group back like the parent does so callers can extend it.
        return group

    def __init__(self, opt, shared=None):
        """Create the label dictionary, the char dictionary and the base dict.

        Args:
            opt: options dict; ``opt['dict_file']`` is read to derive the
                label dictionary's file name.
            shared: passed through unchanged to both dictionaries.
        """
        # The label dictionary gets its own copy of the options so that the
        # modified 'dict_file' does not leak into the caller's opt.
        child_opt = copy.deepcopy(opt)
        child_opt['dict_file'] = child_opt['dict_file'] + '.labels.dict'
        self.labels_dict = DictionaryAgent(child_opt, shared)
        self.char_dict = get_char_dict()
        super().__init__(opt, shared)

    def observe(self, observation):
        """Split an observation between the label and the main dictionary.

        The label dictionary receives a copy with ``'text'`` set to None and
        the main dictionary receives a copy with ``'labels'`` set to None;
        the caller's observation is never modified.

        Returns:
            whatever the parent ``observe`` returns for the text-only copy.
        """
        observation = copy.deepcopy(observation)
        labels_observation = copy.deepcopy(observation)
        labels_observation['text'] = None
        observation['labels'] = None
        self.labels_dict.observe(labels_observation)
        return super().observe(observation)

    def act(self):
        """Run ``act`` on the label dictionary and on this dictionary.

        Returns:
            a dict with only an ``'id'`` entry (``'NERDictionary'``).
        """
        self.labels_dict.act()
        super().act()
        return {'id': 'NERDictionary'}

    def save(self, filename=None, append=False, sort=True):
        """Save the label dictionary and then the main dictionary.

        The label dictionary is written to ``filename + '.labels.dict'``.

        Args:
            filename: target file name; when None,
                ``self.opt['model_file']`` is used.
            append: forwarded to the parent ``save``.
            sort: forwarded to the parent ``save``.

        Returns:
            the result of the parent ``save`` call.
        """
        filename = self.opt['model_file'] if filename is None else filename
        self.labels_dict.save(filename + '.labels.dict')
        return super().save(filename, append, sort)

    def tokenize(self, text, building=False):
        """Split ``text`` on single spaces.

        Args:
            text: string to split; a falsy value (None or '') yields [].
            building: accepted but not used by this implementation.

        Returns:
            list of tokens.
        """
        return text.split(' ') if text else []
class IrBaselineAgent(Agent):
    """Information Retrieval baseline.

    Keeps the recent dialogue history, turns it into a weighted bag of words
    and uses ``rank_candidates`` to order the available candidate responses.
    """

    @staticmethod
    def add_cmdline_args(parser):
        """Add command line args specific to this agent."""
        parser = parser.add_argument_group('IrBaseline Arguments')
        parser.add_argument(
            '-lp', '--length_penalty', type=float, default=0.5,
            help='length penalty for responses')
        parser.add_argument(
            '-hsz', '--history_size', type=int, default=1,
            help='number of utterances from the dialogue history to take use '
                 'as the query')
        parser.add_argument(
            '--label_candidates_file', type=str, default=None,
            help='file of candidate responses to choose from')

    def __init__(self, opt, shared=None):
        """Initialize agent.

        :param opt: options dict; reads 'length_penalty' and, if set,
            'label_candidates_file'.
        :param shared: accepted for interface compatibility; not used here.
        """
        super().__init__(opt)
        self.id = 'IRBaselineAgent'
        self.length_penalty = float(opt['length_penalty'])
        self.dictionary = DictionaryAgent(opt)
        self.opt = opt
        self.history = []
        self.episodeDone = True
        candidates_path = opt.get('label_candidates_file')
        if candidates_path:
            # Close the file deterministically instead of leaking the handle.
            with open(candidates_path) as f:
                self.label_candidates = f.read().split('\n')

    def reset(self):
        """Reset agent properties."""
        self.observation = None
        self.history = []
        self.episodeDone = True

    def observe(self, obs):
        """Store and remember incoming observation message dict.

        The history is cleared when the previous observation ended an
        episode; the observation's text, if present, is appended to it.
        """
        self.observation = obs
        self.dictionary.observe(obs)
        if self.episodeDone:
            self.history = []
        if 'text' in obs:
            self.history.append(obs.get('text', ''))
        self.episodeDone = obs.get('episode_done', False)
        return obs

    def act(self):
        """Generate a response to the previously seen observation(s).

        :returns: reply dict with 'id' and 'text'; when candidates are
            available it also carries 'text_candidates' (the output of
            ``rank_candidates``), whose first entry becomes 'text'.
        """
        # The dictionary is only updated while training.
        if self.opt.get('datatype', '').startswith('train'):
            self.dictionary.act()

        obs = self.observation
        reply = {}
        reply['id'] = self.getID()

        # Rank candidates
        cands = None
        if 'label_candidates' in obs and len(obs['label_candidates']) > 0:
            cands = obs['label_candidates']
        if hasattr(self, 'label_candidates'):
            # override label candidates with candidate file if set
            cands = self.label_candidates
        if cands:
            # The query is the last `history_size` utterances.
            hist_sz = self.opt.get('history_size', 1)
            left_idx = max(0, len(self.history) - hist_sz)
            text = ' '.join(self.history[left_idx:len(self.history)])
            rep = self.build_query_representation(text)
            reply['text_candidates'] = rank_candidates(
                rep, cands, self.length_penalty, self.dictionary)
            reply['text'] = reply['text_candidates'][0]
        else:
            reply['text'] = "I don't know."
        return reply

    def save(self, fname=None):
        """Save dictionary tokenizer if available.

        Falls back to ``opt['model_file']`` when ``fname`` is None and does
        nothing when no file name can be determined.
        """
        fname = self.opt.get('model_file', None) if fname is None else fname
        if fname:
            self.dictionary.save(fname + '.dict')

    def load(self, fname):
        """Load internal dictionary."""
        self.dictionary.load(fname + '.dict')

    def build_query_representation(self, query):
        """Build representation of query, e.g. words or n-grams.

        :param query: string to represent.

        :returns: dictionary containing 'words' dictionary (token => weight)
                  and 'norm' float (square root of the number of tokens)
        """
        words = list(self.dictionary.tokenize(query.lower()))
        # Fetch the frequency table once rather than once per token.
        freqs = self.dictionary.freqs()
        weights = {}
        if len(freqs) > 0:
            # Frequent tokens get a smaller weight.
            for w in words:
                weights[w] = 1.0 / (1.0 + math.log(1.0 + freqs[w]))
        else:
            # No frequency information: unit weight for non-stopwords.
            for w in words:
                if w not in stopwords:
                    weights[w] = 1
        return {'words': weights, 'norm': math.sqrt(len(words))}
class IrBaselineAgent(Agent):
    """Information retrieval baseline agent.

    Keeps the recent dialogue history, turns it into a weighted bag of words
    and uses ``rank_candidates`` to order the observation's label candidates.
    """

    @staticmethod
    def add_cmdline_args(parser):
        """Add the dictionary arguments and this agent's own arguments."""
        DictionaryAgent.add_cmdline_args(parser)
        parser.add_argument(
            '-lp', '--length_penalty', type=float, default=0.5,
            help='length penalty for responses')
        parser.add_argument(
            '-hsz', '--history_size', type=int, default=1,
            help='number of utterances from the dialogue history to take use '
                 'as the query')

    def __init__(self, opt, shared=None):
        """Initialize agent.

        :param opt: options dict; reads 'length_penalty'.
        :param shared: accepted for interface compatibility; not used here.
        """
        super().__init__(opt)
        self.id = 'IRBaselineAgent'
        self.length_penalty = float(opt['length_penalty'])
        self.dictionary = DictionaryAgent(opt)
        self.opt = opt
        self.history = []
        self.episodeDone = True

    def reset(self):
        """Forget the last observation and the dialogue history."""
        self.observation = None
        self.history = []
        self.episodeDone = True

    def observe(self, obs):
        """Store the observation and update the dialogue history.

        The history is cleared when the previous observation ended an
        episode; the observation's text, if present, is appended to it.
        """
        self.observation = obs
        self.dictionary.observe(obs)
        if self.episodeDone:
            self.history = []
        if 'text' in obs:
            self.history.append(obs.get('text', ''))
        self.episodeDone = obs.get('episode_done', False)
        return obs

    def act(self):
        """Generate a reply to the last observation.

        :returns: reply dict with 'id' and 'text'; when the observation has
            label candidates it also carries 'text_candidates' (the output
            of ``rank_candidates``), whose first entry becomes 'text'.
        """
        # The dictionary is only updated while training.
        if self.opt.get('datatype', '').startswith('train'):
            self.dictionary.act()

        obs = self.observation
        reply = {}
        reply['id'] = self.getID()

        # Rank candidates
        if 'label_candidates' in obs and len(obs['label_candidates']) > 0:
            # The query is the last `history_size` utterances.
            hist_sz = self.opt.get('history_size', 1)
            left_idx = max(0, len(self.history) - hist_sz)
            text = ' '.join(self.history[left_idx:len(self.history)])
            rep = self.build_query_representation(text)
            reply['text_candidates'] = rank_candidates(
                rep, obs['label_candidates'], self.length_penalty,
                self.dictionary)
            reply['text'] = reply['text_candidates'][0]
        else:
            reply['text'] = "I don't know."
        return reply

    def save(self, fname=None):
        """Save the dictionary to ``fname + '.dict'``.

        Falls back to ``opt['model_file']`` when ``fname`` is None and does
        nothing when no file name can be determined.
        """
        fname = self.opt.get('model_file', None) if fname is None else fname
        if fname:
            self.dictionary.save(fname + '.dict')

    def load(self, fname):
        """Load the dictionary from ``fname + '.dict'``."""
        self.dictionary.load(fname + '.dict')

    def build_query_representation(self, query):
        """Build representation of query, e.g. words or n-grams.

        :param query: string to represent.

        :returns: dictionary containing 'words' dictionary (token => weight)
                  and 'norm' float (square root of the number of tokens)
        """
        words = list(self.dictionary.tokenize(query.lower()))
        # Fetch the frequency table once rather than once per token.
        freqs = self.dictionary.freqs()
        weights = {}
        if len(freqs) > 0:
            # Frequent tokens get a smaller weight.
            for w in words:
                weights[w] = 1.0 / (1.0 + math.log(1.0 + freqs[w]))
        else:
            # No frequency information: unit weight for non-stopwords.
            for w in words:
                if w not in stopwords:
                    weights[w] = 1
        return {'words': weights, 'norm': math.sqrt(len(words))}
class IrBaselineAgent(Agent):
    """Information retrieval baseline agent.

    Turns the observed text into a weighted bag of words and uses
    ``rank_candidates`` to order the observation's label candidates.
    """

    @staticmethod
    def add_cmdline_args(parser):
        """Add the dictionary arguments and this agent's length penalty."""
        DictionaryAgent.add_cmdline_args(parser)
        parser.add_argument(
            '-lp', '--length_penalty', default=0.5,
            help='length penalty for responses')

    def __init__(self, opt, shared=None):
        """Initialize agent.

        :param opt: options dict; reads 'length_penalty'.
        :param shared: accepted for interface compatibility; not used here.
        """
        super().__init__(opt)
        self.id = 'IRBaselineAgent'
        # The argument is registered without a type, so convert it here.
        self.length_penalty = float(opt['length_penalty'])
        self.dictionary = DictionaryAgent(opt)
        self.opt = opt

    def observe(self, obs):
        """Store the observation and pass it on to the dictionary."""
        self.observation = obs
        self.dictionary.observe(obs)
        return obs

    def act(self):
        """Generate a reply to the last observation.

        :returns: reply dict with 'id' and 'text'; when the observation has
            label candidates it also carries 'text_candidates' (the output
            of ``rank_candidates``), whose first entry becomes 'text'.
        """
        # The dictionary is only updated while training.
        if self.opt.get('datatype', '').startswith('train'):
            self.dictionary.act()

        obs = self.observation
        reply = {}
        reply['id'] = self.getID()

        # Rank candidates
        if 'label_candidates' in obs and len(obs['label_candidates']) > 0:
            rep = self.build_query_representation(obs['text'])
            reply['text_candidates'] = rank_candidates(
                rep, obs['label_candidates'], self.length_penalty,
                self.dictionary)
            reply['text'] = reply['text_candidates'][0]
        else:
            reply['text'] = "I don't know."
        return reply

    def save(self, fname=None):
        """Save the dictionary to ``fname + '.dict'``.

        Falls back to ``opt['model_file']`` when ``fname`` is None and does
        nothing when no file name can be determined.
        """
        fname = self.opt.get('model_file', None) if fname is None else fname
        if fname:
            self.dictionary.save(fname + '.dict')

    def load(self, fname):
        """Load the dictionary from ``fname + '.dict'``."""
        self.dictionary.load(fname + '.dict')

    def build_query_representation(self, query):
        """Build representation of query, e.g. words or n-grams.

        :param query: string to represent.

        :returns: dictionary containing 'words' dictionary (token => weight)
                  and 'norm' float (square root of the number of tokens)
        """
        words = list(self.dictionary.tokenize(query.lower()))
        # Fetch the frequency table once rather than once per token.
        freqs = self.dictionary.freqs()
        weights = {}
        if len(freqs) > 0:
            # Frequent tokens get a smaller weight.
            for w in words:
                weights[w] = 1.0 / (1.0 + math.log(1.0 + freqs[w]))
        else:
            # No frequency information: unit weight for non-stopwords.
            for w in words:
                if w not in stopwords:
                    weights[w] = 1
        return {'words': weights, 'norm': math.sqrt(len(words))}
class IrBaselineAgent(Agent):
    """Information retrieval baseline agent.

    Turns the observed text into a weighted bag of words and uses
    ``rank_candidates`` to order the observation's label candidates.
    """

    @staticmethod
    def add_cmdline_args(parser):
        """Add the dictionary arguments and this agent's length penalty."""
        DictionaryAgent.add_cmdline_args(parser)
        parser.add_argument(
            '-lp', '--length_penalty', default=0.5,
            help='length penalty for responses')

    def __init__(self, opt, shared=None):
        """Initialize agent.

        :param opt: options dict; reads 'length_penalty'.
        :param shared: accepted for interface compatibility; not used here.
        """
        super().__init__(opt)
        self.id = 'IRBaselineAgent'
        # The argument is registered without a type, so convert it here.
        self.length_penalty = float(opt['length_penalty'])
        self.dictionary = DictionaryAgent(opt)
        self.opt = opt

    def observe(self, obs):
        """Store the observation and pass it on to the dictionary."""
        self.observation = obs
        self.dictionary.observe(obs)
        return obs

    def act(self):
        """Generate a reply to the last observation.

        :returns: reply dict with 'id' and 'text'; when the observation has
            label candidates it also carries 'text_candidates' (the output
            of ``rank_candidates``), whose first entry becomes 'text'.
        """
        # The dictionary is only updated while training.
        if self.opt.get('datatype', '').startswith('train'):
            self.dictionary.act()

        obs = self.observation
        reply = {}
        reply['id'] = self.getID()

        # Rank candidates
        if 'label_candidates' in obs and len(obs['label_candidates']) > 0:
            rep = self.build_query_representation(obs['text'])
            reply['text_candidates'] = rank_candidates(
                rep, obs['label_candidates'], self.length_penalty,
                self.dictionary)
            reply['text'] = reply['text_candidates'][0]
        else:
            reply['text'] = "I don't know."
        return reply

    def save(self, fname=None):
        """Save the dictionary to ``fname + '.dict'``.

        Falls back to ``opt['model_file']`` when ``fname`` is None and does
        nothing when no file name can be determined.
        """
        fname = self.opt.get('model_file', None) if fname is None else fname
        if fname:
            self.dictionary.save(fname + '.dict')

    def load(self, fname):
        """Load the dictionary from ``fname + '.dict'``."""
        self.dictionary.load(fname + '.dict')

    def build_query_representation(self, query):
        """Build representation of query, e.g. words or n-grams.

        :param query: string to represent.

        :returns: dictionary containing 'words' dictionary (token => weight)
                  and 'norm' float (square root of the number of tokens)
        """
        words = list(self.dictionary.tokenize(query.lower()))
        # Fetch the frequency table once rather than once per token.
        freqs = self.dictionary.freqs()
        weights = {}
        if len(freqs) > 0:
            # Frequent tokens get a smaller weight.
            for w in words:
                weights[w] = 1.0 / (1.0 + math.log(1.0 + freqs[w]))
        else:
            # No frequency information: unit weight for non-stopwords.
            for w in words:
                if w not in stopwords:
                    weights[w] = 1
        return {'words': weights, 'norm': math.sqrt(len(words))}