コード例 #1
0
    def test_basic_parse(self):
        """Check that the dictionary correctly adds and parses a short sentence.

        Builds a DictionaryAgent from default options, feeds it the text
        'hello world', and verifies that exactly two new entries were added
        and that parsing the same text yields their two consecutive indices
        for the default vector type as well as for ``list`` and ``tuple``.
        """
        from parlai.core.dict import DictionaryAgent
        from parlai.core.params import ParlaiParser

        argparser = ParlaiParser()
        DictionaryAgent.add_cmdline_args(argparser)
        # Parse an explicit empty argument list so the test does not pick up
        # the test runner's own command-line flags from sys.argv.
        opt = argparser.parse_args([], print_args=False)
        dictionary = DictionaryAgent(opt)
        # Entries already present before any text has been observed.
        num_builtin = len(dictionary)

        dictionary.observe({'text': 'hello world'})
        dictionary.act()
        assert len(dictionary) - num_builtin == 2

        # Default vector type.
        vec = dictionary.parse('hello world')
        assert len(vec) == 2
        assert vec[0] == num_builtin
        assert vec[1] == num_builtin + 1

        # Explicit list output.
        vec = dictionary.parse('hello world', vec_type=list)
        assert len(vec) == 2
        assert vec[0] == num_builtin
        assert vec[1] == num_builtin + 1

        # Explicit tuple output.
        vec = dictionary.parse('hello world', vec_type=tuple)
        assert len(vec) == 2
        assert vec[0] == num_builtin
        assert vec[1] == num_builtin + 1
コード例 #2
0
ファイル: test_dict.py プロジェクト: ahiroto/ParlAI
    def test_basic_parse(self):
        """Check that the dictionary correctly adds and parses a short sentence.

        After observing 'hello world' the dictionary must have grown by two
        entries, and parsing that text must return the two new indices in
        order, whichever vector type is requested.
        """
        from parlai.core.dict import DictionaryAgent
        from parlai.core.params import ParlaiParser

        argparser = ParlaiParser()
        DictionaryAgent.add_cmdline_args(argparser)
        # With no argument list the parser falls back to sys.argv, which
        # belongs to whatever launched the tests; parse an empty list instead
        # so only the defaults are used.
        opt = argparser.parse_args([])
        dictionary = DictionaryAgent(opt)
        # Size of the dictionary before anything has been observed.
        num_builtin = len(dictionary)

        dictionary.observe({'text': 'hello world'})
        dictionary.act()
        assert len(dictionary) - num_builtin == 2

        # The same expectations hold for the default vector type and for the
        # explicitly requested list / tuple types.
        for parse_kwargs in ({}, {'vec_type': list}, {'vec_type': tuple}):
            vec = dictionary.parse('hello world', **parse_kwargs)
            assert len(vec) == 2
            assert vec[0] == num_builtin
            assert vec[1] == num_builtin + 1
コード例 #3
0
ファイル: test_dict.py プロジェクト: simplecoka/cortx
    def test_basic_parse(self):
        """
        Verify that a short sentence is added to and parsed by the dictionary.

        The dictionary is built from default options (empty argument list),
        shown the text 'hello world', and then asked to parse it back with
        the default, ``list`` and ``tuple`` vector types.
        """
        arg_parser = ParlaiParser()
        DictionaryAgent.add_cmdline_args(arg_parser, partial_opt=None)
        agent = DictionaryAgent(arg_parser.parse_args([]))
        # Number of entries present before any text is observed.
        base_size = len(agent)

        agent.observe({'text': 'hello world'})
        agent.act()
        grown_by = len(agent) - base_size
        assert grown_by == 2

        # An empty kwargs dict exercises the default vector type.
        for extra in ({}, {'vec_type': list}, {'vec_type': tuple}):
            ids = agent.parse('hello world', **extra)
            assert len(ids) == 2
            assert ids[0] == base_size
            assert ids[1] == base_size + 1
コード例 #4
0
class NERDictionaryAgent(DictionaryAgent):
    """Dictionary agent for Named Entity Recognition.

    Keeps a second DictionaryAgent (``labels_dict``) for the labels next to
    the inherited dictionary, which is fed the text.
    """

    @staticmethod
    def add_cmdline_args(argparser):
        """Register the base dictionary arguments plus ``--dict_class``."""
        dict_group = DictionaryAgent.add_cmdline_args(argparser)
        dict_group.add_argument(
            '--dict_class',
            default=class2str(NERDictionaryAgent),
            help="Sets the dictionary's class",
        )

    def __init__(self, opt, shared=None):
        """Create the label dictionary, the char dictionary and the base agent.

        Args:
            opt: options; ``opt['dict_file']`` is read to derive the file name
                of the label dictionary.
            shared: forwarded unchanged to both dictionary constructors.
        """
        # Work on a copy so the caller's options keep their own 'dict_file'.
        labels_opt = copy.deepcopy(opt)
        # labels_opt['model_file'] += '.labels'
        labels_opt['dict_file'] += '.labels.dict'
        self.labels_dict = DictionaryAgent(labels_opt, shared)
        self.char_dict = get_char_dict()
        super().__init__(opt, shared)

    def observe(self, observation):
        """Route the observation to the label dictionary and the base one.

        The label dictionary receives a copy whose 'text' is None; the base
        dictionary receives a copy whose 'labels' is None. The caller's
        observation is not modified.

        Returns:
            whatever the base class ``observe`` returns.
        """
        text_only = copy.deepcopy(observation)
        labels_only = copy.deepcopy(text_only)
        text_only['labels'] = None
        labels_only['text'] = None
        self.labels_dict.observe(labels_only)
        return super().observe(text_only)

    def act(self):
        """Let both dictionaries act and return a fixed id message."""
        self.labels_dict.act()
        super().act()
        reply = {'id': 'NERDictionary'}
        return reply

    def save(self, filename=None, append=False, sort=True):
        """Write the label dictionary and then the base dictionary to disk.

        Args:
            filename: target file name; ``self.opt['model_file']`` is used
                when it is None. The label dictionary goes to
                ``filename + '.labels.dict'``.
            append: passed through to the base class ``save``.
            sort: passed through to the base class ``save``.

        Returns:
            the result of the base class ``save``.
        """
        if filename is None:
            filename = self.opt['model_file']
        self.labels_dict.save(filename + '.labels.dict')
        return super().save(filename, append, sort)

    def tokenize(self, text, building=False):
        """Split text on single spaces; empty or None text gives ``[]``."""
        if not text:
            return []
        return text.split(' ')
コード例 #5
0
class NERDictionaryAgent(DictionaryAgent):
    """Dictionary agent that tracks text and labels in separate dictionaries.

    The inherited dictionary observes the text; a nested DictionaryAgent
    stored in ``labels_dict`` observes the labels.
    """

    @staticmethod
    def add_cmdline_args(argparser):
        """Add the base dictionary options and a ``--dict_class`` option."""
        arg_group = DictionaryAgent.add_cmdline_args(argparser)
        default_class = class2str(NERDictionaryAgent)
        arg_group.add_argument(
            '--dict_class', default=default_class,
            help="Sets the dictionary's class")

    def __init__(self, opt, shared=None):
        """Build the nested label dictionary before initializing the base.

        :param opt: options; 'dict_file' is read and suffixed with
            '.labels.dict' for the label dictionary.
        :param shared: passed to both dictionary constructors as is.
        """
        # Copy first: only the label dictionary should see the new file name.
        opt_for_labels = copy.deepcopy(opt)
        # opt_for_labels['model_file'] += '.labels'
        label_file = opt_for_labels['dict_file'] + '.labels.dict'
        opt_for_labels['dict_file'] = label_file
        self.labels_dict = DictionaryAgent(opt_for_labels, shared)
        self.char_dict = get_char_dict()
        super().__init__(opt, shared)

    def observe(self, observation):
        """Feed labels to ``labels_dict`` and text to the base dictionary.

        Two deep copies are made so that blanking 'text' for one consumer and
        'labels' for the other never touches the caller's dict.

        :returns: the value returned by the base class ``observe``.
        """
        for_text = copy.deepcopy(observation)
        for_labels = copy.deepcopy(for_text)
        for_labels['text'] = None
        for_text['labels'] = None
        self.labels_dict.observe(for_labels)
        result = super().observe(for_text)
        return result

    def act(self):
        """Trigger ``act`` on both dictionaries; reply with the agent id."""
        self.labels_dict.act()
        super().act()
        return dict(id='NERDictionary')

    def save(self, filename=None, append=False, sort=True):
        """Save the label dictionary, then the base dictionary.

        :param filename: output name; falls back to ``opt['model_file']``
            when None. Labels are written to ``filename + '.labels.dict'``.
        :param append: forwarded to the base class ``save``.
        :param sort: forwarded to the base class ``save``.
        :returns: the value returned by the base class ``save``.
        """
        target = filename
        if target is None:
            target = self.opt['model_file']
        self.labels_dict.save(target + '.labels.dict')
        return super().save(target, append, sort)

    def tokenize(self, text, building=False):
        """Return the space-separated pieces of text, or ``[]`` if falsy."""
        tokens = []
        if text:
            tokens = text.split(' ')
        return tokens
コード例 #6
0
class IrBaselineAgent(Agent):
    """Information Retrieval baseline.

    Builds a weighted bag-of-words query from the most recent utterances of
    the current episode and replies with the top entry returned by
    ``rank_candidates`` over the available label candidates.
    """
    @staticmethod
    def add_cmdline_args(parser):
        """Add command line args specific to this agent."""
        parser = parser.add_argument_group('IrBaseline Arguments')
        parser.add_argument('-lp',
                            '--length_penalty',
                            type=float,
                            default=0.5,
                            help='length penalty for responses')
        parser.add_argument(
            '-hsz',
            '--history_size',
            type=int,
            default=1,
            help='number of utterances from the dialogue history to take use '
            'as the query')
        parser.add_argument('--label_candidates_file',
                            type=str,
                            default=None,
                            help='file of candidate responses to choose from')

    def __init__(self, opt, shared=None):
        """Initialize agent.

        :param opt: options; reads 'length_penalty' and, if set,
                    'label_candidates_file' (one candidate per line).
        :param shared: accepted for interface compatibility; not used here.
        """
        super().__init__(opt)
        self.id = 'IRBaselineAgent'
        self.length_penalty = float(opt['length_penalty'])
        self.dictionary = DictionaryAgent(opt)
        self.opt = opt
        self.history = []
        self.episodeDone = True
        candidates_path = opt.get('label_candidates_file')
        if candidates_path:
            # Use a context manager so the file handle is closed right after
            # reading instead of being left open.
            with open(candidates_path) as f:
                self.label_candidates = f.read().split('\n')

    def reset(self):
        """Reset agent properties."""
        self.observation = None
        self.history = []
        self.episodeDone = True

    def observe(self, obs):
        """Store and remember incoming observation message dict.

        The history is cleared when the previous observation ended an
        episode; the 'text' field, when present, is appended to it.

        :returns: the observation, unchanged.
        """
        self.observation = obs
        self.dictionary.observe(obs)
        if self.episodeDone:
            self.history = []
        if 'text' in obs:
            self.history.append(obs.get('text', ''))
        self.episodeDone = obs.get('episode_done', False)
        return obs

    def act(self):
        """Generate a response to the previously seen observation(s).

        :returns: reply dict with 'id' and 'text'; 'text_candidates' is
                  included when there were candidates to rank.
        """
        # The dictionary only acts when the datatype starts with 'train'.
        if self.opt.get('datatype', '').startswith('train'):
            self.dictionary.act()

        obs = self.observation
        reply = {}
        reply['id'] = self.getID()

        # Rank candidates
        cands = None
        if 'label_candidates' in obs and len(obs['label_candidates']) > 0:
            cands = obs['label_candidates']
        if hasattr(self, 'label_candidates'):
            # override label candidates with candidate file if set
            cands = self.label_candidates
        if cands:
            # The query is the last `history_size` utterances of the episode.
            hist_sz = self.opt.get('history_size', 1)
            left_idx = max(0, len(self.history) - hist_sz)
            text = ' '.join(self.history[left_idx:])
            rep = self.build_query_representation(text)
            reply['text_candidates'] = (rank_candidates(
                rep, cands, self.length_penalty, self.dictionary))
            # NOTE(review): presumably the ranking is best-first -- confirm
            # against rank_candidates.
            reply['text'] = reply['text_candidates'][0]
        else:
            reply['text'] = "I don't know."
        return reply

    def save(self, fname=None):
        """Save dictionary tokenizer if available.

        :param fname: base file name; defaults to ``opt['model_file']``.
                      Nothing is written when no name is available.
        """
        fname = self.opt.get('model_file', None) if fname is None else fname
        if fname:
            self.dictionary.save(fname + '.dict')

    def load(self, fname):
        """Load internal dictionary from ``fname + '.dict'``."""
        self.dictionary.load(fname + '.dict')

    def build_query_representation(self, query):
        """Build representation of query, e.g. words or n-grams.

        When the dictionary has frequency counts, each token is weighted by
        ``1 / (1 + log(1 + freq))``; otherwise every token that is not in
        ``stopwords`` gets weight 1.

        :param query: string to represent.

        :returns: dictionary containing 'words' dictionary (token => weight)
                  and 'norm' float (square root of the number of tokens)
        """
        rep = {}
        rep['words'] = {}
        words = list(self.dictionary.tokenize(query.lower()))
        rw = rep['words']
        # Fetch the frequency table once instead of on every iteration.
        # NOTE(review): assumes freqs() returns the same mapping on each call.
        freqs = self.dictionary.freqs()
        has_freqs = len(freqs) > 0
        for w in words:
            if has_freqs:
                rw[w] = 1.0 / (1.0 + math.log(1.0 + freqs[w]))
            elif w not in stopwords:
                rw[w] = 1
        rep['norm'] = math.sqrt(len(words))
        return rep
コード例 #7
0
class IrBaselineAgent(Agent):
    """Information retrieval baseline agent.

    Builds a weighted bag-of-words query from the most recent utterances of
    the current episode and replies with the top entry returned by
    ``rank_candidates`` over the observation's label candidates.
    """

    @staticmethod
    def add_cmdline_args(parser):
        """Add the dictionary arguments and this agent's own arguments."""
        DictionaryAgent.add_cmdline_args(parser)
        parser.add_argument(
            '-lp', '--length_penalty', type=float, default=0.5,
            help='length penalty for responses')
        parser.add_argument(
            '-hsz', '--history_size', type=int, default=1,
            help='number of utterances from the dialogue history to take use '
                 'as the query')

    def __init__(self, opt, shared=None):
        """Set up the agent from the options.

        :param opt: options; 'length_penalty' is read and converted to float.
        :param shared: accepted for interface compatibility; not used here.
        """
        super().__init__(opt)
        self.id = 'IRBaselineAgent'
        self.length_penalty = float(opt['length_penalty'])
        self.dictionary = DictionaryAgent(opt)
        self.opt = opt
        self.history = []
        self.episodeDone = True

    def reset(self):
        """Forget the last observation and the dialogue history."""
        self.observation = None
        self.history = []
        self.episodeDone = True

    def observe(self, obs):
        """Remember the observation and extend the episode history.

        The history is cleared when the previous observation ended an
        episode; the 'text' field, when present, is appended to it.

        :returns: the observation, unchanged.
        """
        self.observation = obs
        self.dictionary.observe(obs)
        if self.episodeDone:
            self.history = []
        if 'text' in obs:
            self.history.append(obs.get('text', ''))
        self.episodeDone = obs.get('episode_done', False)
        return obs

    def act(self):
        """Reply to the last observation.

        :returns: reply dict with 'id' and 'text'; 'text_candidates' is
                  included when the observation carried label candidates.
        """
        # The dictionary only acts when the datatype starts with 'train'.
        if self.opt.get('datatype', '').startswith('train'):
            self.dictionary.act()

        obs = self.observation
        reply = {}
        reply['id'] = self.getID()

        # Rank candidates
        if 'label_candidates' in obs and len(obs['label_candidates']) > 0:
            # The query is the last `history_size` utterances of the episode.
            hist_sz = self.opt.get('history_size', 1)
            left_idx = max(0, len(self.history) - hist_sz)
            text = ' '.join(self.history[left_idx:])
            rep = self.build_query_representation(text)
            reply['text_candidates'] = (
                rank_candidates(rep, obs['label_candidates'],
                                self.length_penalty, self.dictionary))
            # NOTE(review): presumably the ranking is best-first -- confirm
            # against rank_candidates.
            reply['text'] = reply['text_candidates'][0]
        else:
            reply['text'] = "I don't know."
        return reply

    def save(self, fname=None):
        """Save the dictionary to ``fname + '.dict'``.

        :param fname: base file name; defaults to ``opt['model_file']``.
                      Nothing is written when no name is available.
        """
        fname = self.opt.get('model_file', None) if fname is None else fname
        if fname:
            self.dictionary.save(fname + '.dict')

    def load(self, fname):
        """Load the dictionary from ``fname + '.dict'``."""
        self.dictionary.load(fname + '.dict')

    def build_query_representation(self, query):
        """Build representation of query, e.g. words or n-grams.

        When the dictionary has frequency counts, each token is weighted by
        ``1 / (1 + log(1 + freq))``; otherwise every token that is not in
        ``stopwords`` gets weight 1.

        :param query: string to represent.

        :returns: dict with 'words' (token => weight) and 'norm' (square root
                  of the number of tokens in the query).
        """
        rep = {}
        rep['words'] = {}
        words = list(self.dictionary.tokenize(query.lower()))
        rw = rep['words']
        # Fetch the frequency table once instead of on every iteration.
        # NOTE(review): assumes freqs() returns the same mapping on each call.
        freqs = self.dictionary.freqs()
        has_freqs = len(freqs) > 0
        for w in words:
            if has_freqs:
                rw[w] = 1.0 / (1.0 + math.log(1.0 + freqs[w]))
            elif w not in stopwords:
                rw[w] = 1
        rep['norm'] = math.sqrt(len(words))
        return rep
コード例 #8
0
ファイル: ir_baseline.py プロジェクト: ahiroto/ParlAI
class IrBaselineAgent(Agent):
    """Information retrieval baseline agent.

    Turns the text of the last observation into a weighted bag-of-words
    query and replies with the top entry returned by ``rank_candidates``
    over the observation's label candidates.
    """

    @staticmethod
    def add_cmdline_args(parser):
        """Add the dictionary arguments and the length penalty option."""
        DictionaryAgent.add_cmdline_args(parser)
        # type=float makes argparse validate and convert command-line values;
        # __init__ converts with float() as well, so callers see no change.
        parser.add_argument(
            '-lp', '--length_penalty', type=float, default=0.5,
            help='length penalty for responses')

    def __init__(self, opt, shared=None):
        """Set up the agent from the options.

        :param opt: options; 'length_penalty' is read and converted to float.
        :param shared: accepted for interface compatibility; not used here.
        """
        super().__init__(opt)
        self.id = 'IRBaselineAgent'
        self.length_penalty = float(opt['length_penalty'])
        self.dictionary = DictionaryAgent(opt)
        self.opt = opt

    def observe(self, obs):
        """Remember the observation and pass it to the dictionary.

        :returns: the observation, unchanged.
        """
        self.observation = obs
        self.dictionary.observe(obs)
        return obs

    def act(self):
        """Reply to the last observation.

        :returns: reply dict with 'id' and 'text'; 'text_candidates' is
                  included when the observation carried label candidates.
        """
        # The dictionary only acts when the datatype starts with 'train'.
        if self.opt.get('datatype', '').startswith('train'):
            self.dictionary.act()

        obs = self.observation
        reply = {}
        reply['id'] = self.getID()

        # Rank candidates
        if 'label_candidates' in obs and len(obs['label_candidates']) > 0:
            rep = self.build_query_representation(obs['text'])
            reply['text_candidates'] = (
                rank_candidates(rep, obs['label_candidates'],
                                self.length_penalty, self.dictionary))
            # NOTE(review): presumably the ranking is best-first -- confirm
            # against rank_candidates.
            reply['text'] = reply['text_candidates'][0]
        else:
            reply['text'] = "I don't know."
        return reply

    def save(self, fname=None):
        """Save the dictionary to ``fname + '.dict'``.

        :param fname: base file name; defaults to ``opt['model_file']``.
                      Nothing is written when no name is available.
        """
        fname = self.opt.get('model_file', None) if fname is None else fname
        if fname:
            self.dictionary.save(fname + '.dict')

    def load(self, fname):
        """Load the dictionary from ``fname + '.dict'``."""
        self.dictionary.load(fname + '.dict')

    def build_query_representation(self, query):
        """Build representation of query, e.g. words or n-grams.

        When the dictionary has frequency counts, each token is weighted by
        ``1 / (1 + log(1 + freq))``; otherwise every token that is not in
        ``stopwords`` gets weight 1.

        :param query: string to represent.

        :returns: dict with 'words' (token => weight) and 'norm' (square root
                  of the number of tokens in the query).
        """
        rep = {}
        rep['words'] = {}
        words = list(self.dictionary.tokenize(query.lower()))
        rw = rep['words']
        # Fetch the frequency table once instead of on every iteration.
        # NOTE(review): assumes freqs() returns the same mapping on each call.
        freqs = self.dictionary.freqs()
        has_freqs = len(freqs) > 0
        for w in words:
            if has_freqs:
                rw[w] = 1.0 / (1.0 + math.log(1.0 + freqs[w]))
            elif w not in stopwords:
                rw[w] = 1
        # The norm is based on the total token count (duplicates included).
        rep['norm'] = math.sqrt(len(words))
        return rep
コード例 #9
0
ファイル: ir_baseline.py プロジェクト: rikima/ParlAI
class IrBaselineAgent(Agent):
    """Information retrieval baseline agent.

    Represents the last observed text as weighted words and answers with
    the first entry that ``rank_candidates`` returns for the observation's
    label candidates.
    """
    @staticmethod
    def add_cmdline_args(parser):
        """Register the dictionary options and ``--length_penalty``."""
        DictionaryAgent.add_cmdline_args(parser)
        # Declaring the type lets argparse reject non-numeric values early;
        # __init__ still applies float(), so existing callers are unaffected.
        parser.add_argument('-lp',
                            '--length_penalty',
                            type=float,
                            default=0.5,
                            help='length penalty for responses')

    def __init__(self, opt, shared=None):
        """Create the agent and its dictionary.

        :param opt: options; 'length_penalty' is read and converted to float.
        :param shared: accepted for interface compatibility; not used here.
        """
        super().__init__(opt)
        self.id = 'IRBaselineAgent'
        self.length_penalty = float(opt['length_penalty'])
        self.dictionary = DictionaryAgent(opt)
        self.opt = opt

    def observe(self, obs):
        """Store the observation and forward it to the dictionary.

        :returns: the observation, unchanged.
        """
        self.observation = obs
        self.dictionary.observe(obs)
        return obs

    def act(self):
        """Produce a reply for the stored observation.

        :returns: reply dict with 'id' and 'text'; 'text_candidates' is
                  included when the observation carried label candidates.
        """
        # The dictionary only acts when the datatype starts with 'train'.
        if self.opt.get('datatype', '').startswith('train'):
            self.dictionary.act()

        obs = self.observation
        reply = {}
        reply['id'] = self.getID()

        # Rank candidates
        if 'label_candidates' in obs and len(obs['label_candidates']) > 0:
            rep = self.build_query_representation(obs['text'])
            reply['text_candidates'] = (rank_candidates(
                rep, obs['label_candidates'], self.length_penalty,
                self.dictionary))
            # NOTE(review): presumably the ranking is best-first -- confirm
            # against rank_candidates.
            reply['text'] = reply['text_candidates'][0]
        else:
            reply['text'] = "I don't know."
        return reply

    def save(self, fname=None):
        """Save the dictionary to ``fname + '.dict'``.

        :param fname: base file name; defaults to ``opt['model_file']``.
                      Nothing is written when no name is available.
        """
        fname = self.opt.get('model_file', None) if fname is None else fname
        if fname:
            self.dictionary.save(fname + '.dict')

    def load(self, fname):
        """Load the dictionary from ``fname + '.dict'``."""
        self.dictionary.load(fname + '.dict')

    def build_query_representation(self, query):
        """Build representation of query, e.g. words or n-grams.

        Tokens are weighted by ``1 / (1 + log(1 + freq))`` when the
        dictionary has frequency counts; without counts, each token that is
        not in ``stopwords`` gets weight 1.

        :param query: string to represent.

        :returns: dict with 'words' (token => weight) and 'norm' (square root
                  of the number of tokens in the query).
        """
        rep = {}
        rep['words'] = {}
        words = list(self.dictionary.tokenize(query.lower()))
        rw = rep['words']
        # Look the frequency table up once rather than for every token.
        # NOTE(review): assumes freqs() returns the same mapping on each call.
        freqs = self.dictionary.freqs()
        has_freqs = len(freqs) > 0
        for w in words:
            if has_freqs:
                rw[w] = 1.0 / (1.0 +
                               math.log(1.0 + freqs[w]))
            elif w not in stopwords:
                rw[w] = 1
        # The norm is based on the total token count (duplicates included).
        rep['norm'] = math.sqrt(len(words))
        return rep