Exemple #1
0
    def init_trainer(self, args):
        """Build the A2C trainer for the first agent and store it on ``self``.

        Picks the device, seeds the random generators, loads the training and
        validation scenarios, creates one system per name in ``args.agents``
        and hands everything to ``neural.a2c_trainer.RLTrainer``.

        Args:
            args: Parsed options. Fields read here: ``gpuid``,
                ``random_seed``, ``schema_path``, ``scenarios_path``,
                ``valid_scenarios_path``, ``agent_checkpoints``, ``agents``
                and ``reward``; the whole object is also forwarded to
                ``get_system``, ``build_optim`` and the trainer.

        Side effects:
            Sets ``self.args``, ``self.trainer`` and ``self.systems``; may
            select a CUDA device and reseed ``random`` / ``numpy.random``.
        """
        use_gpu = bool(args.gpuid)
        if use_gpu:
            print('Running with GPU {}.'.format(args.gpuid[0]))
            cuda.set_device(args.gpuid[0])
        else:
            print('Running with CPU.')

        # A falsy seed (None or 0) leaves the generators untouched. The pid is
        # mixed in so that concurrently started processes get different seeds.
        if args.random_seed:
            process_seed = args.random_seed + os.getpid()
            random.seed(process_seed)
            np.random.seed(process_seed)

        schema = Schema(args.schema_path)
        scenario_db = ScenarioDB.from_dict(schema,
                                           read_json(args.scenarios_path),
                                           Scenario)
        valid_scenario_db = ScenarioDB.from_dict(
            schema, read_json(args.valid_scenarios_path), Scenario)

        # When fewer checkpoints than agents are given, every agent starts
        # without a checkpoint. The placeholder list is sized by the number of
        # agents (it used to be hard-coded to 2, which raised IndexError for
        # three or more agents).
        num_agents = len(args.agents)
        if len(args.agent_checkpoints) < num_agents:
            ckpt = [None] * num_agents
        else:
            ckpt = args.agent_checkpoints

        systems = [
            get_system(name, args, schema, False, ckpt[i])
            for i, name in enumerate(args.agents)
        ]

        # The agent at index 0 is the one being trained.
        rl_agent = 0
        system = systems[rl_agent]
        model = system.env.model
        loss = None
        # Separate optimizers for the policy model and for the critic.
        optim = {
            'model': build_optim(args, model, None),
            'critic': build_optim(args, system.env.critic, None)
        }
        # The critic's rate is forced to 0.05 regardless of ``args``.
        optim['critic']._set_rate(0.05)

        scenarios = {
            'train': scenario_db.scenarios_list,
            'dev': valid_scenario_db.scenarios_list
        }
        from neural.a2c_trainer import RLTrainer as A2CTrainer
        trainer = A2CTrainer(systems,
                             scenarios,
                             loss,
                             optim,
                             rl_agent,
                             reward_func=args.reward,
                             cuda=use_gpu,
                             args=args)

        self.args = args
        self.trainer = trainer
        self.systems = systems
Exemple #2
0
 def __init__(self, chats, surveys=None, worker_ids=None):
     """Load chat transcripts plus optional survey and worker-id files.

     Args:
         chats: Iterable of JSON paths; each file is read with ``read_json``
             and its items (dicts carrying a ``'uuid'`` key) are appended
             to ``self.chats``.
         surveys: Optional iterable of JSON paths. Each file is kept as one
             element of ``self.surveys``.
         worker_ids: Optional iterable of JSON paths whose contents are
             merged into a single ``self.worker_ids`` dict (later files win
             on duplicate keys).
     """
     self.chats = []
     for f in chats:
         self.chats.extend(read_json(f))
     self.uuid_to_chat = {chat['uuid']: chat for chat in self.chats}

     # Always define the attribute, mirroring ``worker_ids`` below, so that
     # later code can test ``self.surveys`` instead of hitting AttributeError.
     if surveys:
         # This is a list because we might have multiple batches of surveys
         self.surveys = [read_json(survey) for survey in surveys]
     else:
         self.surveys = None

     if worker_ids:
         self.worker_ids = {}
         for f in worker_ids:
             self.worker_ids.update(read_json(f))
     else:
         self.worker_ids = None
Exemple #3
0
def init(path):
    """Point the module at a stats file and load whatever it already holds.

    Stores ``path`` in the module-level ``stats_path`` and fills the
    module-level ``STATS`` with the parsed JSON content. If the file cannot
    be read for any reason, ``STATS`` starts out as an empty dict.
    """
    global stats_path, STATS
    stats_path = path
    try:
        loaded = read_json(stats_path)
    except Exception:
        # Best effort by design: any failure means "start from scratch".
        STATS = {}
    else:
        STATS = loaded
Exemple #4
0
 def _read_transcripts(self, transcripts_paths, max_examples):
     """Concatenate the transcript lists stored in several JSON files.

     Args:
         transcripts_paths: Iterable of JSON paths, read in order.
         max_examples: Keep at most this many transcripts; ``None`` keeps
             all of them.

     Returns:
         list: The combined (and possibly truncated) transcripts.
     """
     loaded = []
     for json_path in transcripts_paths:
         loaded += read_json(json_path)
     if max_examples is None:
         return loaded
     return loaded[:max_examples]
Exemple #5
0
    def __init__(self, schema, price_tracker, retriever, model_path, mappings, timed_session=False):
        """Restore a trained ranker model and bundle its runtime in ``self.env``.

        Args:
            schema: Forwarded to ``build_model`` and ``Preprocessor``.
            price_tracker: Stored on ``self`` and forwarded to ``Preprocessor``.
            retriever: Stored unchanged in ``self.env.retriever``.
            model_path: Directory holding ``config.json``; the checkpoint state
                is looked up under ``model_path + '-best'``.
            mappings: Directory holding ``vocab.pkl``. The name is rebound
                below to the unpickled mappings object.
            timed_session: Stored on ``self``.

        Raises:
            AssertionError: If no checkpoint state (or no checkpoint path) is
                found under ``model_path + '-best'``.
        """
        super(NeuralRankerSystem, self).__init__()
        self.schema = schema
        self.price_tracker = price_tracker
        self.timed_session = timed_session

        # Load arguments
        args_path = os.path.join(model_path, 'config.json')
        config = read_json(args_path)
        # TODO: handle this properly
        # Saved training options are overridden for single-example inference.
        config['batch_size'] = 1
        config['pretrained_wordvec'] = None
        args = argparse.Namespace(**config)

        # From here on ``mappings`` is the loaded object, not the directory.
        mappings_path = os.path.join(mappings, 'vocab.pkl')
        mappings = read_pickle(mappings_path)
        vocab = mappings['vocab']

        logstats.add_args('model_args', args)
        model = build_model(schema, mappings, None, args)

        # Tensorflow config
        # NOTE: ``config`` is reused here; it no longer holds the JSON options.
        if args.gpu == 0:
            print 'GPU is disabled'
            config = tf.ConfigProto(device_count = {'GPU': 0})
        else:
            gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = 0.5, allow_growth=True)
            config = tf.ConfigProto(device_count = {'GPU': 1}, gpu_options=gpu_options)
        # Raises TensorFlow's C++ log threshold (3 hides INFO, WARNING and ERROR).
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

        # NOTE: need to close the session when done
        tf_session = tf.Session(config=config)
        tf_session.run(tf.global_variables_initializer())

        # Load TF model parameters
        ckpt = tf.train.get_checkpoint_state(model_path+'-best')
        assert ckpt, 'No checkpoint found'
        assert ckpt.model_checkpoint_path, 'No model path found in checkpoint'
        # The restore overwrites the freshly initialized variables.
        saver = tf.train.Saver()
        saver.restore(tf_session, ckpt.model_checkpoint_path)

        # All three entity forms are fixed to 'canonical' for the ranker.
        preprocessor = Preprocessor(schema, price_tracker, 'canonical', 'canonical', 'canonical')
        textint_map = TextIntMap(vocab, preprocessor)

        # Integer ids of the special marker symbols, in the order ``markers`` yields them.
        int_markers = SpecialSymbols(*[mappings['vocab'].to_ind(m) for m in markers])
        model_config = {'retrieve': True}
        batcher = DialogueBatcherFactory.get_dialogue_batcher(model_config, int_markers=int_markers, slot_filling=False, kb_pad=mappings['kb_vocab'].to_ind(markers.PAD))

        # Class-level attributes: every StreamingDialogue shares these values.
        StreamingDialogue.textint_map = textint_map
        StreamingDialogue.num_context = args.num_context
        StreamingDialogue.mappings = mappings

        # Immutable bundle of everything a session needs at run time.
        Env = namedtuple('Env', ['ranker', 'retriever', 'tf_session', 'preprocessor', 'mappings', 'textint_map', 'batcher'])
        self.env = Env(model, retriever, tf_session, preprocessor, mappings, textint_map, batcher)
Exemple #6
0
    def __init__(self, schema, lexicon, model_path, fact_check, decoding, timed_session=False, consecutive_entity=True, realizer=None):
        """Restore a trained neural model and bundle its runtime in ``self.env``.

        Args:
            schema: Forwarded to ``build_model`` and ``Preprocessor``.
            lexicon: Stored on ``self`` and forwarded to ``Preprocessor``.
            model_path: Directory holding ``config.json`` and ``vocab.pkl``;
                the checkpoint state is looked up under ``model_path + '-best'``.
            fact_check: When truthy, a ``FactEvaluator`` is placed in
                ``self.env.evaluator``; otherwise that field is ``None``.
            decoding: Overrides the ``decoding`` entry of the saved config.
            timed_session: Stored on ``self``.
            consecutive_entity: Stored on ``self`` and in ``self.env``.
            realizer: Stored unchanged in ``self.env.realizer``.

        Raises:
            AssertionError: If no checkpoint state (or no checkpoint path) is
                found under ``model_path + '-best'``.
        """
        super(NeuralSystem, self).__init__()
        self.schema = schema
        self.lexicon = lexicon
        self.timed_session = timed_session
        self.consecutive_entity = consecutive_entity

        # Load arguments
        args_path = os.path.join(model_path, 'config.json')
        config = read_json(args_path)
        # Saved training options are overridden for single-example inference.
        config['batch_size'] = 1
        config['gpu'] = 0  # Don't need GPU for batch_size=1
        config['decoding'] = decoding
        args = argparse.Namespace(**config)

        mappings_path = os.path.join(model_path, 'vocab.pkl')
        mappings = read_pickle(mappings_path)
        vocab = mappings['vocab']

        # TODO: different models have the same key now
        args.dropout = 0
        logstats.add_args('model_args', args)
        model = build_model(schema, mappings, args)

        # Tensorflow config
        # NOTE: ``config`` is reused here; it no longer holds the JSON options.
        # ``args.gpu`` was forced to 0 above, so the else branch is not reached.
        if args.gpu == 0:
            print 'GPU is disabled'
            config = tf.ConfigProto(device_count = {'GPU': 0})
        else:
            gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = 0.5, allow_growth=True)
            config = tf.ConfigProto(device_count = {'GPU': 1}, gpu_options=gpu_options)

        # NOTE: need to close the session when done
        tf_session = tf.Session(config=config)
        # NOTE(review): tf.initialize_all_variables is deprecated in later
        # TensorFlow 1.x releases in favour of tf.global_variables_initializer;
        # confirm against the TF version this project pins.
        tf.initialize_all_variables().run(session=tf_session)

        # Load TF model parameters
        ckpt = tf.train.get_checkpoint_state(model_path+'-best')
        assert ckpt, 'No checkpoint found'
        assert ckpt.model_checkpoint_path, 'No model path found in checkpoint'
        # The restore overwrites the freshly initialized variables.
        saver = tf.train.Saver()
        saver.restore(tf_session, ckpt.model_checkpoint_path)

        self.model_name = args.model
        # Only the copy model uses the 'graph' target form and sets ``copy``.
        if self.model_name == 'attn-copy-encdec':
            args.entity_target_form = 'graph'
            copy = True
        else:
            copy = False
        preprocessor = Preprocessor(schema, lexicon, args.entity_encoding_form, args.entity_decoding_form, args.entity_target_form, args.prepend)
        textint_map = TextIntMap(vocab, mappings['entity'], preprocessor)

        # Immutable bundle of everything a session needs at run time.
        # ``remove_symbols`` holds the ids of EOS and PAD; ``max_len`` is fixed to 20.
        Env = namedtuple('Env', ['model', 'tf_session', 'preprocessor', 'vocab', 'copy', 'textint_map', 'stop_symbol', 'remove_symbols', 'max_len', 'evaluator', 'prepend', 'consecutive_entity', 'realizer'])
        self.env = Env(model, tf_session, preprocessor, mappings['vocab'], copy, textint_map, stop_symbol=vocab.to_ind(markers.EOS), remove_symbols=map(vocab.to_ind, (markers.EOS, markers.PAD)), max_len=20, evaluator=FactEvaluator() if fact_check else None, prepend=args.prepend, consecutive_entity=self.consecutive_entity, realizer=realizer)
Exemple #7
0
 def get_evaluated_qids(cls, db_path):
     """Collect the question ids that already have answers in a HIT database.

     Args:
         db_path: JSON file read with ``read_json``. It is traversed as
             ``{hit_id: {assignment_id: {'answers': [{'qid': ...}, ...]}}}``.

     Returns:
         set: Every ``answer['qid']`` found, except the ``'comment'`` entries.
     """
     db = read_json(db_path)
     qids = set()
     # Only the values are needed; ``.values()`` also works on both Python 2
     # and Python 3, unlike ``iteritems``.
     for hit_info in db.values():
         for result in hit_info.values():
             qids.update(answer['qid'] for answer in result['answers']
                         if answer['qid'] != 'comment')
     return qids
Exemple #8
0
def init(path, verbose=False):
    """Point the module at a stats file and load whatever it already holds.

    Stores ``path`` in the module-level ``stats_path`` and fills the
    module-level ``STATS`` with the parsed JSON content, or with an empty
    dict when the file cannot be read.

    Args:
        path: Location of the stats JSON file.
        verbose: When true, print whether the file was loaded or is new.
    """
    global stats_path, STATS
    stats_path = path
    try:
        loaded = read_json(stats_path)
    except Exception:
        # Best effort by design: any failure means "start from scratch".
        STATS = {}
        message = "New stats file created, will be stored in {}"
    else:
        STATS = loaded
        message = "Stats file loaded from {}"
    if verbose:
        print(message.format(stats_path))
Exemple #9
0
def read_examples(paths, max_examples, Scenario):
    '''
    Read a maximum of |max_examples| examples from |paths|.

    paths: iterable of JSON files, each holding a list of raw example dicts.
    max_examples: cap on the number of examples returned; None or a
        negative number means "no limit".
    Scenario: scenario class forwarded to Example.from_dict.

    Returns the list of Example objects, in file order.
    '''
    # argparse leaves the option as None when it is not given, and comparing
    # None with an int raises TypeError on Python 3, so test for it first.
    limited = max_examples is not None and max_examples >= 0
    examples = []
    for path in paths:
        # Once the cap is reached there is no point opening further files.
        if limited and len(examples) >= max_examples:
            break
        print('read_examples: %s' % path)
        for raw in read_json(path):
            if limited and len(examples) >= max_examples:
                break
            examples.append(Example.from_dict(raw, Scenario))
    return examples
Exemple #10
0
 def __init__(self,
              slot_scores_path=None,
              stop_words_path=None,
              threshold=4.):
     """Set up the stop-word list, optional slot scores and the stemmer.

     Args:
         slot_scores_path: Optional JSON file loaded into ``self.slot_scores``.
         stop_words_path: Optional whitespace-separated word file; only its
             first 200 tokens are used. Without it, NLTK's English stop
             words are used.
         threshold: Stored as ``self.threshold``.
     """
     if stop_words_path:
         with open(stop_words_path, 'r') as fin:
             base_words = fin.read().split()[:200]
     else:
         base_words = stopwords.words('english')
     self.stopwords = set(base_words)
     # Punctuation and clitics are always treated as stop words.
     self.stopwords.update(
         ['.', '...', ',', '?', '!', '"', "n't", "'m", "'d", "'ll"])

     # NOTE(review): ``self.slot_scores`` is only defined when a path is
     # given; callers that read it without one will get AttributeError.
     if slot_scores_path:
         self.slot_scores = read_json(slot_scores_path)

     self.threshold = threshold
     self.stemmer = PorterStemmer()
Exemple #11
0
 def load_candidates(self, paths):
     candidates = defaultdict(list)
     # When dumped to json, NamedTuple becomes list. Now convert it back.
     is_str = lambda x: isinstance(x, basestring)
     # x[0] (surface of entity): note that for prices from the offer action,
     # surface is float instead of string
     to_ent = lambda x: x.encode('utf-8') if is_str(x) else \
         Entity(x[0].encode('utf-8') if is_str(x[0]) else x[0], CanonicalEntity(*x[1]))
     for path in paths:
         print 'Load candidates from', path
         results = read_json(path)
         for r in results:
             # None for encoding turns
             if r['candidates'] is None:
                 candidates[(r['uuid'], r['role'])].append(None)
             else:
                 # Only take the response (list of tokens)
                 candidates_ = [[to_ent(x) for x in c['response']]
                                for c in ifilter(lambda x: 'response' in x,
                                                 r['candidates'])]
                 candidates[(r['uuid'], r['role'])].append(candidates_)
     return candidates
Exemple #12
0
    def read_system_responses(cls, system, path, num_context_utterances, data):
        """Add one system's responses from a JSON file to ``data`` in place.

        Args:
            system (str): system name, used as the key under ``'responses'``.
            path (str): a JSON file containing system outputs.
                |-[]
                 |-ex_id (str): unique id that identifies a context-reference pair
                 |-prev_turns (list)
                 |-reference
                 |-response
            num_context_utterances (int): number of trailing previous turns
                kept as context.
            data (dict): see from_file. Maps ``ex_id`` to a dict with the
                keys ``'context'`` and ``'responses'``.

        Raises:
            AssertionError: if ``system`` already has a response for an id.

        """
        for ex in read_json(path):
            if not cls.valid_example(ex, num_context_utterances):
                continue
            qid = ex['ex_id']
            # NOTE(review): with num_context_utterances == 0 this slice is
            # ``[-0:]``, i.e. the whole list. Presumably valid_example rules
            # that case out; confirm.
            context_turns = ex['prev_turns'][-num_context_utterances:]
            agent_names = cls.get_agent_name(context_turns + [ex['reference']])

            processed = [cls.process_utterance(turn, role=agent_names[idx])
                         for idx, turn in enumerate(context_turns)]
            # Drop context utterances whose second element is empty.
            context = [utt for utt in processed if len(utt[1]) > 0]

            # The reference and the response are attributed to the last name.
            speaker = agent_names[-1]
            reference = cls.process_utterance(ex['reference'], role=speaker)
            response = cls.process_utterance(ex['response'], role=speaker)
            # NOTE(review): context utterances are tested via ``len(u[1])``
            # but these two via ``len(...)`` of the whole value. If
            # process_utterance returns a fixed-size pair this never filters
            # anything; verify the intended check.
            if not len(reference) or not len(response) or not len(context):
                continue

            entry = data.setdefault(qid, {'context': context, 'responses': {}})
            responses = entry['responses']
            assert system not in responses
            responses[system] = response
            responses['reference'] = reference
Exemple #13
0
def get_data_generator(args, model_args, mappings, schema):
    """Build the DataGenerator for training or for testing.

    Args:
        args: Run options; ``scenarios_path``, ``learned_lex``, ``stop_words``
            and ``test`` are read here, and the object is passed to
            ``read_dataset``.
        model_args: Model options; ``model``, the three ``entity_*_form``
            fields, ``prepend`` and ``num_items`` are read. ``dropout`` is set
            to 0 in test mode and ``entity_target_form`` to ``'graph'`` for
            the copy model.
        mappings: Forwarded to ``DataGenerator``.
        schema: Forwarded to ``ScenarioDB``, ``Lexicon``, ``Preprocessor`` and
            ``DataGenerator``.

    Returns:
        DataGenerator: Holding only the test examples when ``args.test`` is
        set; otherwise the train examples plus the test examples in the
        second positional slot.
    """
    from preprocess import DataGenerator, Preprocessor
    from cocoa.core.scenario_db import ScenarioDB
    from cocoa.core.mutualfriends.lexicon import Lexicon
    from cocoa.core.dataset import read_dataset
    from cocoa.core.util import read_json
    import time

    scenario_db = ScenarioDB.from_dict(schema, read_json(args.scenarios_path))
    dataset = read_dataset(scenario_db, args)

    print('Building lexicon...')
    started = time.time()
    lexicon = Lexicon(schema, args.learned_lex, stop_words=args.stop_words)
    print('%.2f s' % (time.time() - started))

    # Dataset
    model_name = model_args.model
    use_kb = model_name != 'encdec'
    copy = model_name == 'attn-copy-encdec'
    if copy:
        model_args.entity_target_form = 'graph'
    preprocessor = Preprocessor(schema, lexicon,
                                model_args.entity_encoding_form,
                                model_args.entity_decoding_form,
                                model_args.entity_target_form,
                                model_args.prepend)

    # The first three DataGenerator arguments are the example splits.
    if args.test:
        model_args.dropout = 0
        first, second, third = None, None, dataset.test_examples
    else:
        first, second, third = (dataset.train_examples,
                                dataset.test_examples, None)

    return DataGenerator(first, second, third,
                         preprocessor, schema,
                         model_args.num_items, mappings, use_kb,
                         copy)
Exemple #14
0
        default=10,
        help='Number of questions repeated in each HIT for estimating agreement'
    )
    parser.add_argument('--compare',
                        action='store_true',
                        help='Rate by comparing two responses')
    parser.add_argument('--title',
                        default='Dialogue response evaluation',
                        help='Title of the HIT')
    add_turk_task_arguments(parser)
    add_eval_data_arguments(parser)
    args = parser.parse_args()

    random.seed(1)

    config = read_json(args.aws_config)
    mtc = get_mturk_connection(config, debug=args.debug)

    if args.debug:
        lifetime = 0.1
        args.hit_db = 'test.json'
    else:
        lifetime = 30
    if args.compare:
        Task = HTMLCompareEvalTask
    else:
        Task = HTMLEvalTask
    task = Task(
        mtc=mtc,
        title=args.title,
        description='Rate a response in a dialogue',
Exemple #15
0
        raise ValueError(
            "Location of HTML templates should be specified in config with the key templates_dir"
        )
    if not os.path.exists(templates_dir):
        raise ValueError("Specified HTML template location doesn't exist: %s" %
                         templates_dir)

    app = create_app(debug=False, templates_dir=templates_dir)

    schema_path = args.schema_path

    if not os.path.exists(schema_path):
        raise ValueError("No schema file found at %s" % schema_path)

    schema = Schema(schema_path)
    scenarios = read_json(args.scenarios_path)
    if args.num_scenarios is not None:
        scenarios = scenarios[:args.num_scenarios]
    scenario_db = ScenarioDB.from_dict(schema, scenarios, Scenario)
    app.config['scenario_db'] = scenario_db

    if 'models' not in params.keys():
        params['models'] = {}

    if 'quit_after' not in params.keys():
        params['quit_after'] = params['status_params']['chat'][
            'num_seconds'] + 500

    if 'skip_chat_enabled' not in params.keys():
        params['skip_chat_enabled'] = False
Exemple #16
0
                        default=[],
                        help='Input test examples')
    parser.add_argument('--train-max-examples',
                        type=int,
                        help='Maximum number of training examples')
    parser.add_argument('--test-max-examples',
                        type=int,
                        help='Maximum number of test examples')
    parser.add_argument('--eval-examples-paths',
                        nargs='*',
                        default=[],
                        help='Path to multi-response evaluation files')


def read_dataset(args, Scenario):
    '''
    Build the Dataset described by |args|.

    Reads the train and test example files (each capped by its own
    *_max_examples option) and wraps both lists in a Dataset.
    '''
    train = read_examples(args.train_examples_paths,
                          args.train_max_examples,
                          Scenario)
    test = read_examples(args.test_examples_paths,
                         args.test_max_examples,
                         Scenario)
    return Dataset(train, test)


if __name__ == "__main__":
    # Manual smoke test: print Example.test_dict() for every raw example in
    # the transformed test file.
    raw = read_json("fb-negotiation/data/transformed_test.json")
    for example in raw:
        print(Example.test_dict(example))
Exemple #17
0
        raise ValueError(
            "Location of HTML templates should be specified in config with the key templates_dir"
        )
    if not os.path.exists(templates_dir):
        raise ValueError("Specified HTML template location doesn't exist: %s" %
                         templates_dir)

    app = create_app(debug=False, templates_dir=templates_dir)

    schema_path = args.schema_path

    if not os.path.exists(schema_path):
        raise ValueError("No schema file found at %s" % schema_path)

    schema = Schema(schema_path)
    scenarios = read_json(args.scenarios_path)
    if args.num_scenarios is not None:
        scenarios = scenarios[:args.num_scenarios]
    scenario_db = ScenarioDB.from_dict(schema, scenarios, Scenario)
    app.config['scenario_db'] = scenario_db

    if 'models' not in params.keys():
        params['models'] = {}

    if 'quit_after' not in params.keys():
        params[
            'quit_after'] = params['status_params']['chat']['num_seconds'] + 1

    if 'skip_chat_enabled' not in params.keys():
        params['skip_chat_enabled'] = False
Exemple #18
0
                    help='Transciprts paths',
                    nargs='*',
                    default=[])
parser.add_argument('--train-frac',
                    help='Fraction of training examples',
                    type=float,
                    default=0.6)
parser.add_argument('--test-frac',
                    help='Fraction of test examples',
                    type=float,
                    default=0.2)
parser.add_argument('--dev-frac',
                    help='Fraction of dev examples',
                    type=float,
                    default=0.2)
parser.add_argument('--output-path', help='Output path for splits')
args = parser.parse_args()

# A fixed seed keeps the random split reproducible across runs.
np.random.seed(0)

fold_names = ('train', 'dev', 'test')
fold_probs = [args.train_frac, args.dev_frac, args.test_frac]

# One bucket per fold, in the same order as ``fold_names``.
json_data = ([], [], [])
for path in args.example_paths:
    examples = read_json(path)
    # Draw a fold index for every example, independently, with the
    # requested probabilities.
    folds = np.random.choice(len(fold_names), len(examples), p=fold_probs)
    for fold, ex in izip(folds, examples):
        json_data[fold].append(ex)

# Empty folds are not written at all.
for fold, dataset in izip(fold_names, json_data):
    if dataset:
        write_json(dataset, '%s%s.json' % (args.output_path, fold))
                        required=True,
                        help="Fractions of data from different categories")
    parser.add_argument('--skip',
                        default=0,
                        type=int,
                        help="Skip the first X scenarios")
    cocoa.options.add_scenario_arguments(parser)
    args = parser.parse_args()

    random.seed(args.random_seed)
    np.random.seed(args.random_seed)

    schema = Schema(args.schema_path)

    listings = [
        read_json(
            os.path.join(args.scraped_data, 'craigslist_{}.json'.format(c)))
        for c in args.categories
    ]
    fractions = np.array([float(x) for x in args.fractions])
    fractions = fractions / np.sum(fractions)

    # Sample listings
    sampled_listings = []
    N = sum([len(l) for l in listings])
    for listing, fraction in izip(listings, fractions):
        n = int(N * fraction)
        print listing[0]['category'], len(listing), fraction, n
        sampled_listings.append(listing[:n])
    listings = [x for l in sampled_listings for x in l]
    N = len(listings)
    inds = np.random.permutation(N)
Exemple #20
0
 def _read_surveys(self, survey_paths):
     """Merge the dialogue scores of several survey files into one dict.

     Args:
         survey_paths: Iterable of JSON files; element 1 of each file's
             content must be a dict. Later files win on duplicate keys.

     Returns:
         dict: The merged scores.
     """
     merged = {}
     for survey_path in survey_paths:
         survey = read_json(survey_path)
         merged.update(survey[1])
     return merged
Exemple #21
0
            tooltip = mpld3.plugins.PointHTMLTooltip(points,
                                                     labels=labels,
                                                     voffset=10,
                                                     hoffset=10,
                                                     css=self.css)
            mpld3.plugins.connect(fig, tooltip)

        fig_dict = mpld3.fig_to_dict(fig)
        plt.close()
        return fig_dict


####### TEST #########
if __name__ == '__main__':
    # Manual smoke test: LIWC-label the first stored transcript and print
    # the text and categories of each utterance.
    from cocoa.core.negotiation.price_tracker import PriceTracker
    from cocoa.core.util import read_json
    from liwc import LIWC

    all_transcripts = read_json(
        'web_output/combined/transcripts/transcripts.json')
    tracker = PriceTracker('/scr/hehe/game-dialogue/price_tracker.pkl')
    liwc_lexicon = LIWC.from_pkl('data/liwc.pkl')

    demo = Dialogue.from_dict(all_transcripts[0], tracker)
    demo.label_liwc(liwc_lexicon)
    for utterance in demo.iter_utterances():
        print(utterance.text)
        print(utterance.categories)
    # Further steps, currently disabled:
    #demo.extract_keywords()
    #demo.label_speech_acts()
    #demo.label_stage()
    #demo.fig_dict()
Exemple #22
0
              self).__init__(timed_session,
                             configs,
                             policy=policy,
                             max_chats_per_config=max_chats_per_config,
                             db=db)
        self.lexicon = lexicon
        self.templates = templates

    def _new_session(self, agent, kb, config):
        """Create a rule-based session for ``agent`` from a raw config.

        ``config`` is an iterable of fields; it is expanded into a ``Config``
        before being handed to ``RulebasedSession.get_session`` together with
        this system's lexicon and templates.
        """
        session_config = Config(*config)
        return RulebasedSession.get_session(
            agent, kb, self.lexicon, session_config, self.templates)


############# TEST ##############

if __name__ == '__main__':
    # Manual smoke test: print the trial table before and after recording
    # one result for the second config.
    from cocoa.core.util import read_json

    # JSON stores each config as a list; the system takes tuples.
    configs = [tuple(c) for c in read_json('data/rulebased_configs.json')]
    s = BaseConfigurableRulebasedSystem(False, configs)
    # Other probes, currently disabled:
    #print s.configs
    #print s.choose_config()
    print(s.trials)
    #s.update_trials([(configs[0], '0', {'margin': 0.1, 'humanlike': 1})])
    #s.update_trials([(configs[0], '1', {'margin': 0.1, 'humanlike': 1})])
    s.update_trials([(configs[1], '3', {'margin': 0.1})])
    #s.update_trials([(configs[1], '2', {'humanlike': 0.1})])
    print(s.trials)
Exemple #23
0
def scenarios(schema):
    """Load the Craigslist negotiation scenarios into a ScenarioDB."""
    raw_scenarios = read_json('data/negotiation/craigslist-scenarios.json')
    return ScenarioDB.from_dict(schema, raw_scenarios)
Exemple #24
0
    def __init__(self,
                 schema,
                 price_tracker,
                 model_path,
                 mappings_path,
                 decoding,
                 index=None,
                 num_candidates=20,
                 retriever_context_len=2,
                 timed_session=False):
        """Restore a trained neural model and bundle its runtime in ``self.env``.

        Args:
            schema: Forwarded to ``build_model`` and ``Preprocessor``.
            price_tracker: Stored on ``self`` and forwarded to ``Preprocessor``.
            model_path: Directory holding ``config.json``; the checkpoint state
                is looked up under ``model_path + '-best'``.
            mappings_path: Directory holding ``vocab.pkl``.
            decoding: Overrides the ``decoding`` entry of the saved config.
            index: Passed to ``Retriever``; only used for the 'selector' model.
            num_candidates: Passed to ``Retriever`` (selector model only).
            retriever_context_len: Passed to ``Retriever`` as ``context_size``
                (selector model only).
            timed_session: Stored on ``self``.

        Raises:
            AssertionError: If no checkpoint state (or no checkpoint path) is
                found under ``model_path + '-best'``.
        """
        super(NeuralSystem, self).__init__()
        self.schema = schema
        self.price_tracker = price_tracker
        self.timed_session = timed_session

        # Load arguments
        args_path = os.path.join(model_path, 'config.json')
        config = read_json(args_path)
        # Saved training options are overridden for single-example inference.
        config['batch_size'] = 1
        config['gpu'] = 0  # Don't need GPU for batch_size=1
        config['decoding'] = decoding
        config['pretrained_wordvec'] = None
        args = argparse.Namespace(**config)

        vocab_path = os.path.join(mappings_path, 'vocab.pkl')
        mappings = read_pickle(vocab_path)
        vocab = mappings['vocab']

        # TODO: different models have the same key now
        args.dropout = 0
        logstats.add_args('model_args', args)
        model = build_model(schema, mappings, None, args)

        # Tensorflow config
        # NOTE: ``config`` is reused here; it no longer holds the JSON options.
        # ``args.gpu`` was forced to 0 above, so the else branch is not reached.
        if args.gpu == 0:
            print 'GPU is disabled'
            config = tf.ConfigProto(device_count={'GPU': 0})
        else:
            gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5,
                                        allow_growth=True)
            config = tf.ConfigProto(device_count={'GPU': 1},
                                    gpu_options=gpu_options)

        # NOTE: need to close the session when done
        tf_session = tf.Session(config=config)
        # NOTE(review): tf.initialize_all_variables is deprecated in later
        # TensorFlow 1.x releases in favour of tf.global_variables_initializer;
        # confirm against the TF version this project pins.
        tf.initialize_all_variables().run(session=tf_session)

        # Load TF model parameters
        ckpt = tf.train.get_checkpoint_state(model_path + '-best')
        assert ckpt, 'No checkpoint found'
        assert ckpt.model_checkpoint_path, 'No model path found in checkpoint'
        # The restore overwrites the freshly initialized variables.
        saver = tf.train.Saver()
        saver.restore(tf_session, ckpt.model_checkpoint_path)

        # Model config tells data generator which batcher to use
        model_config = {}
        if args.retrieve or args.model in ('ir', 'selector'):
            model_config['retrieve'] = True
        if args.predict_price:
            model_config['price'] = True

        self.model_name = args.model
        preprocessor = Preprocessor(schema, price_tracker,
                                    args.entity_encoding_form,
                                    args.entity_decoding_form,
                                    args.entity_target_form)
        textint_map = TextIntMap(vocab, preprocessor)
        # Integer ids of the special marker symbols, in the order ``markers`` yields them.
        int_markers = SpecialSymbols(
            *[mappings['vocab'].to_ind(m) for m in markers])
        dialogue_batcher = DialogueBatcherFactory.get_dialogue_batcher(
            model_config,
            int_markers=int_markers,
            slot_filling=False,
            kb_pad=mappings['kb_vocab'].to_ind(markers.PAD))

        # Retriever
        # Only the 'selector' model gets a retriever; every other model stores None.
        if args.model == 'selector':
            retriever = Retriever(index,
                                  context_size=retriever_context_len,
                                  num_candidates=num_candidates)
        else:
            retriever = None

        #TODO: class variable is not a good way to do this
        # These are set on the Dialogue class itself, so all instances share them.
        Dialogue.mappings = mappings
        Dialogue.textint_map = textint_map
        Dialogue.preprocessor = preprocessor
        Dialogue.num_context = args.num_context

        # Immutable bundle of everything a session needs at run time.
        # ``remove_symbols`` holds the ids of EOS and PAD; ``max_len`` is fixed to 20.
        Env = namedtuple('Env', [
            'model', 'tf_session', 'preprocessor', 'vocab', 'textint_map',
            'stop_symbol', 'remove_symbols', 'max_len', 'dialogue_batcher',
            'retriever'
        ])
        self.env = Env(model,
                       tf_session,
                       preprocessor,
                       mappings['vocab'],
                       textint_map,
                       stop_symbol=vocab.to_ind(markers.EOS),
                       remove_symbols=map(vocab.to_ind,
                                          (markers.EOS, markers.PAD)),
                       max_len=20,
                       dialogue_batcher=dialogue_batcher,
                       retriever=retriever)
Exemple #25
0
    add_model_arguments(parser)
    add_learner_arguments(parser)
    args = parser.parse_args()

    random.seed(args.random_seed)
    if not os.path.isdir(os.path.dirname(args.stats_file)):
        os.makedirs(os.path.dirname(args.stats_file))
    logstats.init(args.stats_file)
    logstats.add_args('config', args)

    # Save or load models
    if args.init_from:
        start = time.time()
        print 'Load model (config, vocab, checkpoint) from', args.init_from
        config_path = os.path.join(args.init_from, 'config.json')
        saved_config = read_json(config_path)

        # NOTE: args below can be overwritten
        # TODO: separate temperature from decoding arg
        saved_config['decoding'] = args.decoding
        saved_config['temperature'] = args.temperature
        saved_config['batch_size'] = args.batch_size
        saved_config['pretrained_wordvec'] = args.pretrained_wordvec
        saved_config['ranker'] = args.ranker

        model_args = argparse.Namespace(**saved_config)

        # Checkpoint
        if args.test and args.best:
            ckpt = tf.train.get_checkpoint_state(args.init_from + '-best')
        else:
Exemple #26
0
 def from_json(cls, data_path):
     """Alternate constructor: build an instance from a JSON dump.

     The content of ``data_path`` is parsed with ``read_json`` and passed
     as the only argument to ``cls``.
     """
     return cls(read_json(data_path))
import argparse
from cocoa.core.util import read_json, write_json
from cocoa.core.scenario_db import ScenarioDB
from cocoa.core.schema import Schema
from core.scenario import Scenario

# Extract the scenarios embedded in a chat dump and save them as a ScenarioDB.
parser = argparse.ArgumentParser()
parser.add_argument('--chats')
parser.add_argument('--scenarios')
parser.add_argument('--max', type=int)
args = parser.parse_args()

chats = read_json(args.chats)
# A missing (or zero) --max means "use every chat".
n = args.max or len(chats)
scenarios = [
    Scenario.from_dict(None, chat['scenario'])
    for chat in chats[:n]
]
scenario_db = ScenarioDB(scenarios)
write_json(scenario_db.to_dict(), args.scenarios)
Exemple #28
0
                        action='store_true',
                        help='Whether or not to have verbose prints')
    parser.add_argument('--valid-scenarios-path',
                        help='Output path for the validation scenarios')
    cocoa.options.add_scenario_arguments(parser)
    options.add_system_arguments(parser)
    options.add_rl_arguments(parser)
    options.add_model_arguments(parser)
    args = parser.parse_args()

    if args.random_seed:
        random.seed(args.random_seed)
        np.random.seed(args.random_seed)

    schema = Schema(args.schema_path)
    scenario_db = ScenarioDB.from_dict(schema, read_json(args.scenarios_path),
                                       Scenario)
    valid_scenario_db = ScenarioDB.from_dict(
        schema, read_json(args.valid_scenarios_path), Scenario)

    assert len(args.agent_checkpoints) <= len(args.agents)
    systems = [
        get_system(name, args, schema, False, args.agent_checkpoints[i])
        for i, name in enumerate(args.agents)
    ]

    rl_agent = 0
    system = systems[rl_agent]
    model = system.env.model
    loss = make_loss(args, model, system.mappings['tgt_vocab'])
    optim = build_optim(args, model, None)
Exemple #29
0
        '''
        examples: json chats
        Use "$xxx$ as ground truth, and record n-gram context before and after the price.
        '''
        context = {'left': defaultdict(int), 'right': defaultdict(int)}
        for ex in examples:
            for event in ex['events']:
                if event['action'] == 'message':
                    tokens = tokenize(event['data'])
                    tokens = ['<s>'] + tokens + ['</s>']
                    for i, token in enumerate(tokens):
                        if token[0] == '$' or token[-1] == '$':
                            context['left'][tokens[i - 1]] += 1
                            context['right'][tokens[i + 1]] += 1
        if output_path:
            write_pickle(context, output_path)
        return context


if __name__ == '__main__':
    # Command-line entry point: train the price tracker on a chat dump.
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument('--train-examples-path',
                     help='Path to training json file')
    cli.add_argument('--output', help='Path to output model')
    args = cli.parse_args()

    PriceTracker.train(read_json(args.train_examples_path), args.output)
Exemple #30
0
def get_data_generator(args, model_args, mappings, schema):
    """Build the data generator used for training, testing or evaluation.

    Args:
        args: Run-time options. Attributes read here: ``eval``,
            ``eval_examples_paths`` (eval only), ``retrieve``,
            ``predict_price``, ``eval_modes`` (retrieval-based models only),
            ``test``, ``model``, ``index``, ``retriever_context_len``,
            ``num_candidates``, ``cache``, ``ignore_cache``,
            ``candidates_path`` and ``batch_size``. It is also passed as-is
            to ``read_dataset`` when ``args.eval`` is false.
        model_args: Model options. Attributes read here:
            ``price_tracker_model``, ``slot_scores``, ``model``,
            ``entity_encoding_form``, ``entity_decoding_form``,
            ``entity_target_form``, ``slot_filling``, ``mappings`` and
            ``num_context``. NOTE: ``model_args.dropout`` is set to 0 in
            place when ``args.test`` is true (and ``args.eval`` is false).
        mappings: Passed through unchanged to the data generator.
        schema: Passed to ``EvalExample.from_dict``, ``Preprocessor`` and the
            data generator.

    Returns:
        An ``EvalDataGenerator`` when ``args.eval`` is true; otherwise a
        ``DataGenerator`` (``LMDataGenerator`` when ``args.model == 'lm'``).

    Raises:
        ValueError: For the 'selector' / 'ir' models when both "loss" and
            "generation" appear in ``args.eval_modes``.
    """
    from cocoa.core.scenario_db import ScenarioDB
    from cocoa.core.dataset import read_dataset, EvalExample
    from cocoa.core.util import read_json

    from core.scenario import Scenario
    from core.price_tracker import PriceTracker
    from core.slot_detector import SlotDetector
    from retriever import Retriever
    from preprocess import DataGenerator, LMDataGenerator, EvalDataGenerator, Preprocessor
    import os.path

    # TODO: move this to dataset
    if args.eval:
        dataset = []
        for path in args.eval_examples_paths:
            dataset.extend(
                [EvalExample.from_dict(schema, e) for e in read_json(path)])
    else:
        dataset = read_dataset(args, Scenario)
    lexicon = PriceTracker(model_args.price_tracker_model)
    slot_detector = SlotDetector(slot_scores_path=model_args.slot_scores)

    is_retrieval_model = model_args.model in ('ir', 'selector')

    # Model config tells data generator which batcher to use
    model_config = {}
    if args.retrieve or is_retrieval_model:
        model_config['retrieve'] = True
    if args.predict_price:
        model_config['price'] = True

    # For retrieval-based models only: whether to add ground truth response in the candidates
    add_ground_truth = False
    if is_retrieval_model:
        if 'loss' in args.eval_modes and 'generation' in args.eval_modes:
            # Carry the explanation in the exception itself so callers that
            # catch ValueError (or read the traceback) can see the reason.
            raise ValueError(
                '"loss" requires ground truth reponse to be added to the '
                'candidate set. Please evaluate "loss" and "generation" '
                'separately.')
        add_ground_truth = bool((not args.test) or args.eval_modes == ['loss'])
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Ground truth response {} be added to the candidate set.'.format(
            'will' if add_ground_truth else 'will not'))

    # NOTE(review): the checks below read args.model while the ones above read
    # model_args.model; kept as in the original -- confirm both always agree.
    # TODO: hacky
    generator_cls = LMDataGenerator if args.model == 'lm' else DataGenerator

    if args.retrieve or args.model in ('selector', 'ir'):
        retriever = Retriever(args.index,
                              context_size=args.retriever_context_len,
                              num_candidates=args.num_candidates)
    else:
        retriever = None

    preprocessor = Preprocessor(schema,
                                lexicon,
                                model_args.entity_encoding_form,
                                model_args.entity_decoding_form,
                                model_args.entity_target_form,
                                slot_filling=model_args.slot_filling,
                                slot_detector=slot_detector)

    trie_path = os.path.join(model_args.mappings, 'trie.pkl')

    if args.eval:
        return EvalDataGenerator(dataset, preprocessor, mappings,
                                 model_args.num_context)

    if args.test:
        # Disable dropout when only running on the test split.
        model_args.dropout = 0
        train, dev, test = None, None, dataset.test_examples
    else:
        # The dataset's test examples serve as the dev split during training.
        train, dev, test = dataset.train_examples, dataset.test_examples, None
    return generator_cls(train,
                         dev,
                         test,
                         preprocessor,
                         schema,
                         mappings,
                         retriever=retriever,
                         cache=args.cache,
                         ignore_cache=args.ignore_cache,
                         candidates_path=args.candidates_path,
                         num_context=model_args.num_context,
                         trie_path=trie_path,
                         batch_size=args.batch_size,
                         model_config=model_config,
                         add_ground_truth=add_ground_truth)