def init_trainer(self, args):
    """Build the dialogue systems and the A2C trainer described by ``args``.

    Args:
        args: parsed command-line namespace. This method reads ``gpuid``,
            ``random_seed``, ``schema_path``, ``scenarios_path``,
            ``valid_scenarios_path``, ``agents``, ``agent_checkpoints`` and
            ``reward`` from it and forwards the whole namespace to
            ``get_system``, ``build_optim`` and the trainer.

    Side effects:
        Selects the CUDA device when a GPU id is given, seeds ``random`` and
        ``numpy.random``, and stores ``self.args``, ``self.trainer`` and
        ``self.systems``.
    """
    if args.gpuid:
        print('Running with GPU {}.'.format(args.gpuid[0]))
        cuda.set_device(args.gpuid[0])
    else:
        print('Running with CPU.')

    if args.random_seed:
        # Offset by the pid so that concurrently launched processes do not
        # share the same random stream.
        seed = args.random_seed + os.getpid()
        random.seed(seed)
        np.random.seed(seed)

    schema = Schema(args.schema_path)
    scenario_db = ScenarioDB.from_dict(
        schema, read_json(args.scenarios_path), Scenario)
    valid_scenario_db = ScenarioDB.from_dict(
        schema, read_json(args.valid_scenarios_path), Scenario)

    if len(args.agent_checkpoints) < len(args.agents):
        # Not every agent has a checkpoint: start all of them without one.
        # The placeholder list is sized by the number of agents (it used to
        # be hard-coded to 2, which broke indexing with more agents).
        ckpt = [None] * len(args.agents)
    else:
        ckpt = args.agent_checkpoints

    systems = [
        get_system(name, args, schema, False, ckpt[i])
        for i, name in enumerate(args.agents)
    ]

    # The first agent is the one being trained.
    rl_agent = 0
    system = systems[rl_agent]
    model = system.env.model
    loss = None
    # Actor and critic get separate optimizers; the critic's rate is then
    # overridden explicitly.
    optim = {
        'model': build_optim(args, model, None),
        'critic': build_optim(args, system.env.critic, None),
    }
    optim['critic']._set_rate(0.05)

    scenarios = {
        'train': scenario_db.scenarios_list,
        'dev': valid_scenario_db.scenarios_list,
    }

    # Imported here, as in the original code, rather than at module level.
    from neural.a2c_trainer import RLTrainer as A2CTrainer
    trainer = A2CTrainer(systems,
                         scenarios,
                         loss,
                         optim,
                         rl_agent,
                         reward_func=args.reward,
                         # bool() matches the GPU check above and also
                         # tolerates gpuid being None.
                         cuda=bool(args.gpuid),
                         args=args)

    self.args = args
    self.trainer = trainer
    self.systems = systems
def __init__(self, chats, surveys=None, worker_ids=None):
    """Load chat transcripts and optional survey / worker-id files.

    Args:
        chats: iterable of paths to JSON files, each holding a list of chats
            (every chat is indexed by its ``'uuid'`` key).
        surveys: optional iterable of survey JSON paths.
        worker_ids: optional iterable of JSON paths whose contents are merged
            into one dictionary.

    Attributes set:
        chats, uuid_to_chat, surveys (list or None), worker_ids (dict or None).
    """
    self.chats = []
    for f in chats:
        self.chats.extend(read_json(f))
    self.uuid_to_chat = {chat['uuid']: chat for chat in self.chats}

    if surveys:
        # This is a list because we might have multiple batches of surveys
        self.surveys = [read_json(survey) for survey in surveys]
    else:
        # Always define the attribute (mirrors worker_ids below) so later
        # access does not raise AttributeError when no surveys were given.
        self.surveys = None

    if worker_ids:
        self.worker_ids = {}
        for f in worker_ids:
            self.worker_ids.update(read_json(f))
    else:
        self.worker_ids = None
def init(path):
    """Point the module at ``path`` and load the stats stored there.

    Sets the module globals ``stats_path`` and ``STATS``. If the file cannot
    be read for any reason, ``STATS`` starts out as an empty dictionary.
    """
    global stats_path, STATS
    stats_path = path
    try:
        loaded = read_json(stats_path)
    except Exception:
        # Best effort: fall back to a fresh, empty stats dictionary.
        loaded = {}
    STATS = loaded
def _read_transcripts(self, transcripts_paths, max_examples):
    """Concatenate the transcript lists stored in ``transcripts_paths``.

    Args:
        transcripts_paths: iterable of JSON file paths, each holding a list.
        max_examples: keep only the first ``max_examples`` transcripts;
            ``None`` keeps everything.

    Returns:
        list: the (possibly truncated) combined transcripts.
    """
    combined = []
    for path in transcripts_paths:
        combined += read_json(path)
    if max_examples is None:
        return combined
    return combined[:max_examples]
def __init__(self, schema, price_tracker, retriever, model_path, mappings, timed_session=False):
    """Restore a trained ranker model and bundle its runtime objects.

    Args:
        schema: passed to ``build_model`` and ``Preprocessor``.
        price_tracker: passed to ``Preprocessor``.
        retriever: stored unchanged in ``self.env``.
        model_path: directory containing ``config.json``; the checkpoint is
            looked up under ``model_path + '-best'``.
        mappings: directory containing ``vocab.pkl``. NOTE: the name is
            rebound below to the unpickled mappings object.
        timed_session: stored on the instance.

    Side effects:
        Creates a TensorFlow session that is not closed here, sets class
        attributes on ``StreamingDialogue`` and sets the
        ``TF_CPP_MIN_LOG_LEVEL`` environment variable.
    """
    super(NeuralRankerSystem, self).__init__()
    self.schema = schema
    self.price_tracker = price_tracker
    self.timed_session = timed_session

    # Load arguments
    args_path = os.path.join(model_path, 'config.json')
    config = read_json(args_path)
    # TODO: handle this properly
    # Override the saved training settings before building the namespace.
    config['batch_size'] = 1
    config['pretrained_wordvec'] = None
    args = argparse.Namespace(**config)

    # `mappings` is a directory path up to here and the loaded object after.
    mappings_path = os.path.join(mappings, 'vocab.pkl')
    mappings = read_pickle(mappings_path)
    vocab = mappings['vocab']

    logstats.add_args('model_args', args)
    model = build_model(schema, mappings, None, args)

    # Tensorflow config
    # `config` is reused from here on for the TF session configuration.
    if args.gpu == 0:
        print 'GPU is disabled'
        config = tf.ConfigProto(device_count = {'GPU': 0})
    else:
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = 0.5, allow_growth=True)
        config = tf.ConfigProto(device_count = {'GPU': 1}, gpu_options=gpu_options)
    # Presumably quiets TensorFlow's native logging.
    # NOTE(review): assumed to run for both branches above - confirm.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    # NOTE: need to close the session when done
    tf_session = tf.Session(config=config)
    tf_session.run(tf.global_variables_initializer())

    # Load TF model parameters
    ckpt = tf.train.get_checkpoint_state(model_path+'-best')
    assert ckpt, 'No checkpoint found'
    assert ckpt.model_checkpoint_path, 'No model path found in checkpoint'
    saver = tf.train.Saver()
    # Restoring after the initializer replaces the freshly initialized values.
    saver.restore(tf_session, ckpt.model_checkpoint_path)

    preprocessor = Preprocessor(schema, price_tracker, 'canonical', 'canonical', 'canonical')
    textint_map = TextIntMap(vocab, preprocessor)

    # Vocabulary indices of every entry in `markers`.
    int_markers = SpecialSymbols(*[mappings['vocab'].to_ind(m) for m in markers])
    model_config = {'retrieve': True}
    batcher = DialogueBatcherFactory.get_dialogue_batcher(model_config, int_markers=int_markers, slot_filling=False, kb_pad=mappings['kb_vocab'].to_ind(markers.PAD))

    # Shared through class attributes, so every StreamingDialogue sees them.
    StreamingDialogue.textint_map = textint_map
    StreamingDialogue.num_context = args.num_context
    StreamingDialogue.mappings = mappings

    # Immutable bundle of everything a session needs at run time.
    Env = namedtuple('Env', ['ranker', 'retriever', 'tf_session', 'preprocessor', 'mappings', 'textint_map', 'batcher'])
    self.env = Env(model, retriever, tf_session, preprocessor, mappings, textint_map, batcher)
def __init__(self, schema, lexicon, model_path, fact_check, decoding, timed_session=False, consecutive_entity=True, realizer=None):
    """Restore a trained model from ``model_path`` and bundle its runtime objects.

    Args:
        schema: passed to ``build_model`` and ``Preprocessor``.
        lexicon: passed to ``Preprocessor``.
        model_path: directory containing ``config.json`` and ``vocab.pkl``;
            the checkpoint is looked up under ``model_path + '-best'``.
        fact_check: when truthy, a ``FactEvaluator`` is placed in the env.
        decoding: overrides the ``'decoding'`` entry of the saved config.
        timed_session: stored on the instance.
        consecutive_entity: stored on the instance and in the env.
        realizer: stored unchanged in the env.

    Side effects:
        Creates a TensorFlow session that is not closed here.
    """
    super(NeuralSystem, self).__init__()
    self.schema = schema
    self.lexicon = lexicon
    self.timed_session = timed_session
    self.consecutive_entity = consecutive_entity

    # Load arguments
    args_path = os.path.join(model_path, 'config.json')
    config = read_json(args_path)
    config['batch_size'] = 1
    config['gpu'] = 0  # Don't need GPU for batch_size=1
    config['decoding'] = decoding
    args = argparse.Namespace(**config)

    mappings_path = os.path.join(model_path, 'vocab.pkl')
    mappings = read_pickle(mappings_path)
    vocab = mappings['vocab']

    # TODO: different models have the same key now
    args.dropout = 0
    logstats.add_args('model_args', args)
    model = build_model(schema, mappings, args)

    # Tensorflow config
    # `config` is reused from here on for the TF session configuration.
    # config['gpu'] was forced to 0 above, so unless build_model changes
    # args.gpu the first branch is the one taken.
    if args.gpu == 0:
        print 'GPU is disabled'
        config = tf.ConfigProto(device_count = {'GPU': 0})
    else:
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = 0.5, allow_growth=True)
        config = tf.ConfigProto(device_count = {'GPU': 1}, gpu_options=gpu_options)

    # NOTE: need to close the session when done
    tf_session = tf.Session(config=config)
    tf.initialize_all_variables().run(session=tf_session)

    # Load TF model parameters
    ckpt = tf.train.get_checkpoint_state(model_path+'-best')
    assert ckpt, 'No checkpoint found'
    assert ckpt.model_checkpoint_path, 'No model path found in checkpoint'
    saver = tf.train.Saver()
    # Restoring after the initializer replaces the freshly initialized values.
    saver.restore(tf_session, ckpt.model_checkpoint_path)

    self.model_name = args.model
    # The copy model forces the 'graph' entity target form before the
    # preprocessor is built.
    if self.model_name == 'attn-copy-encdec':
        args.entity_target_form = 'graph'
        copy = True
    else:
        copy = False
    preprocessor = Preprocessor(schema, lexicon, args.entity_encoding_form, args.entity_decoding_form, args.entity_target_form, args.prepend)
    textint_map = TextIntMap(vocab, mappings['entity'], preprocessor)

    # Immutable bundle of everything a session needs at run time.
    Env = namedtuple('Env', ['model', 'tf_session', 'preprocessor', 'vocab', 'copy', 'textint_map', 'stop_symbol', 'remove_symbols', 'max_len', 'evaluator', 'prepend', 'consecutive_entity', 'realizer'])
    # stop_symbol / remove_symbols are vocabulary indices of the EOS and PAD
    # markers.
    self.env = Env(model, tf_session, preprocessor, mappings['vocab'], copy, textint_map, stop_symbol=vocab.to_ind(markers.EOS), remove_symbols=map(vocab.to_ind, (markers.EOS, markers.PAD)), max_len=20, evaluator=FactEvaluator() if fact_check else None, prepend=args.prepend, consecutive_entity=self.consecutive_entity, realizer=realizer)
def get_evaluated_qids(cls, db_path):
    """Collect the question ids that already have answers in the HIT database.

    Args:
        db_path: path to a JSON file shaped as
            ``{hit_id: {assignment_id: {'answers': [{'qid': ...}, ...]}}}``.

    Returns:
        set: every answered ``qid`` except the special ``'comment'`` entry.
    """
    evaluated = set()
    for assignments in read_json(db_path).itervalues():
        for result in assignments.itervalues():
            evaluated.update(
                answer['qid'] for answer in result['answers']
                if answer['qid'] != 'comment')
    return evaluated
def init(path, verbose=False):
    """Point the module at ``path`` and load the stats stored there.

    Sets the module globals ``stats_path`` and ``STATS``. If the file cannot
    be read for any reason, ``STATS`` starts out as an empty dictionary.

    Args:
        path: location of the JSON stats file.
        verbose: when true, print which of the two outcomes happened.
    """
    global stats_path, STATS
    stats_path = path
    try:
        # Only the load is guarded: a failure while printing must not throw
        # away stats that were read successfully.
        STATS = read_json(stats_path)
    except Exception:
        # Best effort: fall back to a fresh, empty stats dictionary.
        STATS = {}
        if verbose:
            print("New stats file created, will be stored in {}".format(
                stats_path))
    else:
        if verbose:
            print("Stats file loaded from {}".format(stats_path))
def read_examples(paths, max_examples, Scenario):
    '''
    Read a maximum of |max_examples| examples from |paths|.

    paths: iterable of JSON files, each holding a list of raw example dicts.
    max_examples: upper bound on the number of examples returned; None or a
        negative number means no limit.
    Scenario: forwarded to Example.from_dict for every raw example.

    Returns the list of parsed examples, in file order.
    '''
    # Test for None explicitly: comparing None with an int only works by
    # accident on Python 2 and raises TypeError on Python 3.
    limited = max_examples is not None and max_examples >= 0
    examples = []
    for path in paths:
        # Once the limit is reached, stop before loading further files.
        if limited and len(examples) >= max_examples:
            break
        print('read_examples: %s' % path)
        for raw in read_json(path):
            if limited and len(examples) >= max_examples:
                break
            examples.append(Example.from_dict(raw, Scenario))
    return examples
def __init__(self, slot_scores_path=None, stop_words_path=None, threshold=4.):
    """Set up the stop-word set, optional slot scores, threshold and stemmer.

    Args:
        slot_scores_path: optional JSON file; when given, its content is
            stored as ``self.slot_scores``.
        stop_words_path: optional whitespace-separated word file, of which
            the first 200 words are used. Without it, the English stop-word
            list from ``stopwords.words`` is used instead.
        threshold: stored as ``self.threshold``.
    """
    if stop_words_path:
        with open(stop_words_path, 'r') as fin:
            words = fin.read().split()
        self.stopwords = set(words[:200])
    else:
        self.stopwords = set(stopwords.words('english'))
    # Punctuation and contraction fragments are always treated as stop words.
    self.stopwords.update(
        ['.', '...', ',', '?', '!', '"', "n't", "'m", "'d", "'ll"])

    if slot_scores_path:
        self.slot_scores = read_json(slot_scores_path)
    self.threshold = threshold
    self.stemmer = PorterStemmer()
def load_candidates(self, paths): candidates = defaultdict(list) # When dumped to json, NamedTuple becomes list. Now convert it back. is_str = lambda x: isinstance(x, basestring) # x[0] (surface of entity): note that for prices from the offer action, # surface is float instead of string to_ent = lambda x: x.encode('utf-8') if is_str(x) else \ Entity(x[0].encode('utf-8') if is_str(x[0]) else x[0], CanonicalEntity(*x[1])) for path in paths: print 'Load candidates from', path results = read_json(path) for r in results: # None for encoding turns if r['candidates'] is None: candidates[(r['uuid'], r['role'])].append(None) else: # Only take the response (list of tokens) candidates_ = [[to_ent(x) for x in c['response']] for c in ifilter(lambda x: 'response' in x, r['candidates'])] candidates[(r['uuid'], r['role'])].append(candidates_) return candidates
def read_system_responses(cls, system, path, num_context_utterances, data):
    """Read one system's responses and merge them into ``data``.

    Args:
        system (str): system name, used as the key under ``'responses'``.
        path (str): a JSON file containing system outputs.
            |-[]
             |-ex_id (str): unique id that identifies a context-reference pair
             |-prev_turns (list)
             |-reference
             |-response
        num_context_utterances (int): number of trailing turns used as context.
        data (dict): updated in place; see from_file.

    Examples that fail ``cls.valid_example``, or whose processed reference,
    response or context is empty, are skipped.
    """
    for ex in read_json(path):
        if not cls.valid_example(ex, num_context_utterances):
            continue

        qid = ex['ex_id']
        context_turns = ex['prev_turns'][-num_context_utterances:]
        agent_names = cls.get_agent_name(context_turns + [ex['reference']])

        # Process the context lazily, keeping only utterances whose second
        # element is non-empty.
        processed = (cls.process_utterance(turn, role=agent_names[i])
                     for i, turn in enumerate(context_turns))
        context = [u for u in processed if len(u[1]) > 0]

        # Reference and response are both attributed to the last agent.
        reference = cls.process_utterance(ex['reference'], role=agent_names[-1])
        response = cls.process_utterance(ex['response'], role=agent_names[-1])
        if not len(reference) or not len(response) or not len(context):
            continue

        data.setdefault(qid, {'context': context, 'responses': {}})
        responses = data[qid]['responses']
        assert system not in responses
        responses[system] = response
        responses['reference'] = reference
def get_data_generator(args, model_args, mappings, schema):
    """Build the ``DataGenerator`` for training or testing.

    Args:
        args: run-time options (``scenarios_path``, ``learned_lex``,
            ``stop_words``, ``test`` are read here; the whole namespace is
            passed to ``read_dataset``).
        model_args: model options; ``entity_target_form`` and ``dropout`` may
            be modified in place.
        mappings: forwarded to ``DataGenerator``.
        schema: forwarded to ``ScenarioDB``, ``Lexicon``, ``Preprocessor`` and
            ``DataGenerator``.

    Returns:
        DataGenerator: over the test split only when ``args.test`` is set,
        otherwise over the train split with the test split used as dev.
    """
    from preprocess import DataGenerator, Preprocessor
    from cocoa.core.scenario_db import ScenarioDB
    from cocoa.core.mutualfriends.lexicon import Lexicon
    from cocoa.core.dataset import read_dataset
    from cocoa.core.util import read_json
    import time

    scenario_db = ScenarioDB.from_dict(schema, read_json(args.scenarios_path))
    dataset = read_dataset(scenario_db, args)

    print('Building lexicon...')
    start = time.time()
    lexicon = Lexicon(schema, args.learned_lex, stop_words=args.stop_words)
    print('%.2f s' % (time.time() - start))

    # Dataset
    use_kb = model_args.model != 'encdec'
    copy = model_args.model == 'attn-copy-encdec'
    if copy:
        # The copy model always uses the graph target form.
        model_args.entity_target_form = 'graph'

    preprocessor = Preprocessor(schema, lexicon,
                                model_args.entity_encoding_form,
                                model_args.entity_decoding_form,
                                model_args.entity_target_form,
                                model_args.prepend)

    if args.test:
        model_args.dropout = 0
        train, dev, test = None, None, dataset.test_examples
    else:
        train, dev, test = dataset.train_examples, dataset.test_examples, None

    return DataGenerator(train, dev, test, preprocessor, schema,
                         model_args.num_items, mappings, use_kb, copy)
default=10, help='Number of questions repeated in each HIT for estimating agreement' ) parser.add_argument('--compare', action='store_true', help='Rate by comparing two responses') parser.add_argument('--title', default='Dialogue response evaluation', help='Title of the HIT') add_turk_task_arguments(parser) add_eval_data_arguments(parser) args = parser.parse_args() random.seed(1) config = read_json(args.aws_config) mtc = get_mturk_connection(config, debug=args.debug) if args.debug: lifetime = 0.1 args.hit_db = 'test.json' else: lifetime = 30 if args.compare: Task = HTMLCompareEvalTask else: Task = HTMLEvalTask task = Task( mtc=mtc, title=args.title, description='Rate a response in a dialogue',
raise ValueError( "Location of HTML templates should be specified in config with the key templates_dir" ) if not os.path.exists(templates_dir): raise ValueError("Specified HTML template location doesn't exist: %s" % templates_dir) app = create_app(debug=False, templates_dir=templates_dir) schema_path = args.schema_path if not os.path.exists(schema_path): raise ValueError("No schema file found at %s" % schema_path) schema = Schema(schema_path) scenarios = read_json(args.scenarios_path) if args.num_scenarios is not None: scenarios = scenarios[:args.num_scenarios] scenario_db = ScenarioDB.from_dict(schema, scenarios, Scenario) app.config['scenario_db'] = scenario_db if 'models' not in params.keys(): params['models'] = {} if 'quit_after' not in params.keys(): params['quit_after'] = params['status_params']['chat'][ 'num_seconds'] + 500 if 'skip_chat_enabled' not in params.keys(): params['skip_chat_enabled'] = False
default=[], help='Input test examples') parser.add_argument('--train-max-examples', type=int, help='Maximum number of training examples') parser.add_argument('--test-max-examples', type=int, help='Maximum number of test examples') parser.add_argument('--eval-examples-paths', nargs='*', default=[], help='Path to multi-response evaluation files') def read_dataset(args, Scenario): ''' Return the dataset specified by the given args. ''' train_examples = read_examples(args.train_examples_paths, args.train_max_examples, Scenario) test_examples = read_examples(args.test_examples_paths, args.test_max_examples, Scenario) dataset = Dataset(train_examples, test_examples) return dataset if __name__ == "__main__": raw = read_json("fb-negotiation/data/transformed_test.json") for idx, example in enumerate(raw): print(Example.test_dict(example))
raise ValueError( "Location of HTML templates should be specified in config with the key templates_dir" ) if not os.path.exists(templates_dir): raise ValueError("Specified HTML template location doesn't exist: %s" % templates_dir) app = create_app(debug=False, templates_dir=templates_dir) schema_path = args.schema_path if not os.path.exists(schema_path): raise ValueError("No schema file found at %s" % schema_path) schema = Schema(schema_path) scenarios = read_json(args.scenarios_path) if args.num_scenarios is not None: scenarios = scenarios[:args.num_scenarios] scenario_db = ScenarioDB.from_dict(schema, scenarios, Scenario) app.config['scenario_db'] = scenario_db if 'models' not in params.keys(): params['models'] = {} if 'quit_after' not in params.keys(): params[ 'quit_after'] = params['status_params']['chat']['num_seconds'] + 1 if 'skip_chat_enabled' not in params.keys(): params['skip_chat_enabled'] = False
help='Transciprts paths', nargs='*', default=[]) parser.add_argument('--train-frac', help='Fraction of training examples', type=float, default=0.6) parser.add_argument('--test-frac', help='Fraction of test examples', type=float, default=0.2) parser.add_argument('--dev-frac', help='Fraction of dev examples', type=float, default=0.2) parser.add_argument('--output-path', help='Output path for splits') args = parser.parse_args() np.random.seed(0) json_data = ([], [], []) for path in args.example_paths: examples = read_json(path) folds = np.random.choice( 3, len(examples), p=[args.train_frac, args.dev_frac, args.test_frac]) for ex, fold in izip(examples, folds): json_data[fold].append(ex) for fold, dataset in izip(('train', 'dev', 'test'), json_data): if len(dataset) > 0: write_json(dataset, '%s%s.json' % (args.output_path, fold))
required=True, help="Fractions of data from different categories") parser.add_argument('--skip', default=0, type=int, help="Skip the first X scenarios") cocoa.options.add_scenario_arguments(parser) args = parser.parse_args() random.seed(args.random_seed) np.random.seed(args.random_seed) schema = Schema(args.schema_path) listings = [ read_json( os.path.join(args.scraped_data, 'craigslist_{}.json'.format(c))) for c in args.categories ] fractions = np.array([float(x) for x in args.fractions]) fractions = fractions / np.sum(fractions) # Sample listings sampled_listings = [] N = sum([len(l) for l in listings]) for listing, fraction in izip(listings, fractions): n = int(N * fraction) print listing[0]['category'], len(listing), fraction, n sampled_listings.append(listing[:n]) listings = [x for l in sampled_listings for x in l] N = len(listings) inds = np.random.permutation(N)
def _read_surveys(self, survey_paths):
    """Merge the per-dialogue scores from several survey files.

    Each JSON file is indexed at position 1 to obtain a dictionary; later
    files override earlier ones on key collisions.

    Returns:
        dict: the merged scores.
    """
    merged = {}
    for survey_path in survey_paths:
        survey = read_json(survey_path)
        merged.update(survey[1])
    return merged
tooltip = mpld3.plugins.PointHTMLTooltip(points, labels=labels, voffset=10, hoffset=10, css=self.css) mpld3.plugins.connect(fig, tooltip) fig_dict = mpld3.fig_to_dict(fig) plt.close() return fig_dict ####### TEST ######### if __name__ == '__main__': from cocoa.core.negotiation.price_tracker import PriceTracker from cocoa.core.util import read_json from liwc import LIWC transcripts = read_json('web_output/combined/transcripts/transcripts.json') price_tracker = PriceTracker('/scr/hehe/game-dialogue/price_tracker.pkl') liwc = LIWC.from_pkl('data/liwc.pkl') dialogue = Dialogue.from_dict(transcripts[0], price_tracker) dialogue.label_liwc(liwc) for u in dialogue.iter_utterances(): print(u.text) print(u.categories) #dialogue.extract_keywords() #dialogue.label_speech_acts() #dialogue.label_stage() #dialogue.fig_dict()
self).__init__(timed_session, configs, policy=policy, max_chats_per_config=max_chats_per_config, db=db) self.lexicon = lexicon self.templates = templates def _new_session(self, agent, kb, config): config = Config(*config) return RulebasedSession.get_session(agent, kb, self.lexicon, config, self.templates) ############# TEST ############## if __name__ == '__main__': from cocoa.core.util import read_json configs = read_json('data/rulebased_configs.json') configs = [tuple(c) for c in configs] s = BaseConfigurableRulebasedSystem(False, configs) #print s.configs #print s.choose_config() #print s.choose_config() print s.trials #s.update_trials([(configs[0], '0', {'margin': 0.1, 'humanlike': 1})]) #s.update_trials([(configs[0], '1', {'margin': 0.1, 'humanlike': 1})]) s.update_trials([(configs[1], '3', {'margin': 0.1})]) #s.update_trials([(configs[1], '2', {'humanlike': 0.1})]) print s.trials
def scenarios(schema):
    """Return the Craigslist negotiation scenario DB built with ``schema``."""
    raw_scenarios = read_json('data/negotiation/craigslist-scenarios.json')
    return ScenarioDB.from_dict(schema, raw_scenarios)
def __init__(self, schema, price_tracker, model_path, mappings_path, decoding, index=None, num_candidates=20, retriever_context_len=2, timed_session=False):
    """Restore a trained model from ``model_path`` and bundle its runtime objects.

    Args:
        schema: passed to ``build_model`` and ``Preprocessor``.
        price_tracker: passed to ``Preprocessor``.
        model_path: directory containing ``config.json``; the checkpoint is
            looked up under ``model_path + '-best'``.
        mappings_path: directory containing ``vocab.pkl``.
        decoding: overrides the ``'decoding'`` entry of the saved config.
        index: passed to ``Retriever``; only used by the 'selector' model.
        num_candidates: passed to ``Retriever``.
        retriever_context_len: passed to ``Retriever`` as ``context_size``.
        timed_session: stored on the instance.

    Side effects:
        Creates a TensorFlow session that is not closed here and sets class
        attributes on ``Dialogue``.
    """
    super(NeuralSystem, self).__init__()
    self.schema = schema
    self.price_tracker = price_tracker
    self.timed_session = timed_session

    # Load arguments
    args_path = os.path.join(model_path, 'config.json')
    config = read_json(args_path)
    config['batch_size'] = 1
    config['gpu'] = 0  # Don't need GPU for batch_size=1
    config['decoding'] = decoding
    config['pretrained_wordvec'] = None
    args = argparse.Namespace(**config)

    vocab_path = os.path.join(mappings_path, 'vocab.pkl')
    mappings = read_pickle(vocab_path)
    vocab = mappings['vocab']

    # TODO: different models have the same key now
    args.dropout = 0
    logstats.add_args('model_args', args)
    model = build_model(schema, mappings, None, args)

    # Tensorflow config
    # `config` is reused from here on for the TF session configuration.
    # config['gpu'] was forced to 0 above, so unless build_model changes
    # args.gpu the first branch is the one taken.
    if args.gpu == 0:
        print 'GPU is disabled'
        config = tf.ConfigProto(device_count={'GPU': 0})
    else:
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5, allow_growth=True)
        config = tf.ConfigProto(device_count={'GPU': 1}, gpu_options=gpu_options)

    # NOTE: need to close the session when done
    tf_session = tf.Session(config=config)
    tf.initialize_all_variables().run(session=tf_session)

    # Load TF model parameters
    ckpt = tf.train.get_checkpoint_state(model_path + '-best')
    assert ckpt, 'No checkpoint found'
    assert ckpt.model_checkpoint_path, 'No model path found in checkpoint'
    saver = tf.train.Saver()
    # Restoring after the initializer replaces the freshly initialized values.
    saver.restore(tf_session, ckpt.model_checkpoint_path)

    # Model config tells data generator which batcher to use
    model_config = {}
    if args.retrieve or args.model in ('ir', 'selector'):
        model_config['retrieve'] = True
    if args.predict_price:
        model_config['price'] = True

    self.model_name = args.model
    preprocessor = Preprocessor(schema, price_tracker, args.entity_encoding_form, args.entity_decoding_form, args.entity_target_form)
    textint_map = TextIntMap(vocab, preprocessor)

    # Vocabulary indices of every entry in `markers`.
    int_markers = SpecialSymbols(*[mappings['vocab'].to_ind(m) for m in markers])
    dialogue_batcher = DialogueBatcherFactory.get_dialogue_batcher(model_config, int_markers=int_markers, slot_filling=False, kb_pad=mappings['kb_vocab'].to_ind(markers.PAD))

    # Retriever
    if args.model == 'selector':
        retriever = Retriever(index, context_size=retriever_context_len, num_candidates=num_candidates)
    else:
        retriever = None

    #TODO: class variable is not a good way to do this
    Dialogue.mappings = mappings
    Dialogue.textint_map = textint_map
    Dialogue.preprocessor = preprocessor
    Dialogue.num_context = args.num_context

    # Immutable bundle of everything a session needs at run time.
    Env = namedtuple('Env', ['model', 'tf_session', 'preprocessor', 'vocab', 'textint_map', 'stop_symbol', 'remove_symbols', 'max_len', 'dialogue_batcher', 'retriever'])
    # stop_symbol / remove_symbols are vocabulary indices of the EOS and PAD
    # markers.
    self.env = Env(model, tf_session, preprocessor, mappings['vocab'], textint_map, stop_symbol=vocab.to_ind(markers.EOS), remove_symbols=map(vocab.to_ind, (markers.EOS, markers.PAD)), max_len=20, dialogue_batcher=dialogue_batcher, retriever=retriever)
add_model_arguments(parser) add_learner_arguments(parser) args = parser.parse_args() random.seed(args.random_seed) if not os.path.isdir(os.path.dirname(args.stats_file)): os.makedirs(os.path.dirname(args.stats_file)) logstats.init(args.stats_file) logstats.add_args('config', args) # Save or load models if args.init_from: start = time.time() print 'Load model (config, vocab, checkpoint) from', args.init_from config_path = os.path.join(args.init_from, 'config.json') saved_config = read_json(config_path) # NOTE: args below can be overwritten # TODO: separate temperature from decoding arg saved_config['decoding'] = args.decoding saved_config['temperature'] = args.temperature saved_config['batch_size'] = args.batch_size saved_config['pretrained_wordvec'] = args.pretrained_wordvec saved_config['ranker'] = args.ranker model_args = argparse.Namespace(**saved_config) # Checkpoint if args.test and args.best: ckpt = tf.train.get_checkpoint_state(args.init_from + '-best') else:
def from_json(cls, data_path):
    """Alternate constructor: build an instance from a dumped JSON file.

    Args:
        data_path: path of the JSON file whose content is passed to ``cls``.
    """
    return cls(read_json(data_path))
# Extract the scenarios embedded in a chat dump and write them out as a
# scenario database.
import argparse

from cocoa.core.util import read_json, write_json
from cocoa.core.scenario_db import ScenarioDB
from cocoa.core.schema import Schema
from core.scenario import Scenario

parser = argparse.ArgumentParser()
parser.add_argument('--chats')
parser.add_argument('--scenarios')
parser.add_argument('--max', type=int)
args = parser.parse_args()

chats = read_json(args.chats)
# Without --max (or with --max 0) every chat is used.
limit = args.max or len(chats)
scenarios = [
    Scenario.from_dict(None, chat['scenario']) for chat in chats[:limit]
]
write_json(ScenarioDB(scenarios).to_dict(), args.scenarios)
action='store_true', help='Whether or not to have verbose prints') parser.add_argument('--valid-scenarios-path', help='Output path for the validation scenarios') cocoa.options.add_scenario_arguments(parser) options.add_system_arguments(parser) options.add_rl_arguments(parser) options.add_model_arguments(parser) args = parser.parse_args() if args.random_seed: random.seed(args.random_seed) np.random.seed(args.random_seed) schema = Schema(args.schema_path) scenario_db = ScenarioDB.from_dict(schema, read_json(args.scenarios_path), Scenario) valid_scenario_db = ScenarioDB.from_dict( schema, read_json(args.valid_scenarios_path), Scenario) assert len(args.agent_checkpoints) <= len(args.agents) systems = [ get_system(name, args, schema, False, args.agent_checkpoints[i]) for i, name in enumerate(args.agents) ] rl_agent = 0 system = systems[rl_agent] model = system.env.model loss = make_loss(args, model, system.mappings['tgt_vocab']) optim = build_optim(args, model, None)
''' examples: json chats Use "$xxx$ as ground truth, and record n-gram context before and after the price. ''' context = {'left': defaultdict(int), 'right': defaultdict(int)} for ex in examples: for event in ex['events']: if event['action'] == 'message': tokens = tokenize(event['data']) tokens = ['<s>'] + tokens + ['</s>'] for i, token in enumerate(tokens): if token[0] == '$' or token[-1] == '$': context['left'][tokens[i - 1]] += 1 context['right'][tokens[i + 1]] += 1 if output_path: write_pickle(context, output_path) return context if __name__ == '__main__': import argparse parser = argparse.ArgumentParser() parser.add_argument('--train-examples-path', help='Path to training json file') parser.add_argument('--output', help='Path to output model') args = parser.parse_args() examples = read_json(args.train_examples_path) PriceTracker.train(examples, args.output)
def get_data_generator(args, model_args, mappings, schema):
    """Build the data generator for training, testing or evaluation.

    Args:
        args: run-time options. Read here: ``eval``, ``eval_examples_paths``,
            ``retrieve``, ``predict_price``, ``eval_modes``, ``test``,
            ``model``, ``index``, ``retriever_context_len``,
            ``num_candidates``, ``cache``, ``ignore_cache``,
            ``candidates_path`` and ``batch_size``.
        model_args: model options; ``dropout`` is set to 0 in test mode.
        mappings: forwarded to the data generator.
        schema: forwarded to the example loader, preprocessor and generator.

    Returns:
        An ``EvalDataGenerator`` when ``args.eval`` is set, otherwise a
        ``DataGenerator`` (``LMDataGenerator`` when ``args.model == 'lm'``).

    Raises:
        ValueError: for retrieval-based models when both 'loss' and
            'generation' are requested in ``args.eval_modes``.
    """
    from cocoa.core.scenario_db import ScenarioDB
    from cocoa.core.dataset import read_dataset, EvalExample
    from cocoa.core.util import read_json
    from core.scenario import Scenario
    from core.price_tracker import PriceTracker
    from core.slot_detector import SlotDetector
    from retriever import Retriever
    from preprocess import DataGenerator, LMDataGenerator, EvalDataGenerator, Preprocessor
    import os.path

    # TODO: move this to dataset
    if args.eval:
        dataset = []
        for path in args.eval_examples_paths:
            dataset.extend(
                EvalExample.from_dict(schema, e) for e in read_json(path))
    else:
        dataset = read_dataset(args, Scenario)

    lexicon = PriceTracker(model_args.price_tracker_model)
    slot_detector = SlotDetector(slot_scores_path=model_args.slot_scores)

    # Model config tells data generator which batcher to use
    model_config = {}
    if args.retrieve or model_args.model in ('ir', 'selector'):
        model_config['retrieve'] = True
    if args.predict_price:
        model_config['price'] = True

    # For retrieval-based models only: whether to add ground truth response
    # in the candidates
    add_ground_truth = False
    if model_args.model in ('selector', 'ir'):
        if 'loss' in args.eval_modes and 'generation' in args.eval_modes:
            # Carry the explanation in the exception itself instead of
            # printing it and raising a bare ValueError.
            raise ValueError(
                '"loss" requires ground truth reponse to be added to the candidate set. Please evaluate "loss" and "generation" separately.'
            )
        add_ground_truth = (not args.test) or args.eval_modes == ['loss']
        print('Ground truth response {} be added to the candidate set.'.format(
            'will' if add_ground_truth else 'will not'))

    # TODO: hacky
    # NOTE(review): this and the retriever check read args.model while the
    # checks above read model_args.model - confirm that is intended.
    generator_cls = LMDataGenerator if args.model == 'lm' else DataGenerator

    if args.retrieve or args.model in ('selector', 'ir'):
        retriever = Retriever(args.index,
                              context_size=args.retriever_context_len,
                              num_candidates=args.num_candidates)
    else:
        retriever = None

    preprocessor = Preprocessor(schema,
                                lexicon,
                                model_args.entity_encoding_form,
                                model_args.entity_decoding_form,
                                model_args.entity_target_form,
                                slot_filling=model_args.slot_filling,
                                slot_detector=slot_detector)

    trie_path = os.path.join(model_args.mappings, 'trie.pkl')

    if args.eval:
        return EvalDataGenerator(dataset, preprocessor, mappings,
                                 model_args.num_context)

    if args.test:
        model_args.dropout = 0
        train, dev, test = None, None, dataset.test_examples
    else:
        # During training the test split doubles as the dev set.
        train, dev, test = dataset.train_examples, dataset.test_examples, None

    return generator_cls(train,
                         dev,
                         test,
                         preprocessor,
                         schema,
                         mappings,
                         retriever=retriever,
                         cache=args.cache,
                         ignore_cache=args.ignore_cache,
                         candidates_path=args.candidates_path,
                         num_context=model_args.num_context,
                         trie_path=trie_path,
                         batch_size=args.batch_size,
                         model_config=model_config,
                         add_ground_truth=add_ground_truth)