def filter_rejected_chats(transcripts):
    filtered = []
    for chat in transcripts:
        ex = Example.from_dict(None, chat, Scenario)
        if not Preprocessor.skip_example(ex):
            filtered.append(chat)
    return filtered
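
# A minimal usage sketch for filter_rejected_chats (the paths are hypothetical;
# read_json/write_json are the JSON helpers already used elsewhere in this codebase):
def _filter_rejected_chats_example():
    chats = read_json('transcripts/transcripts.json')  # hypothetical input path
    accepted = filter_rejected_chats(chats)
    write_json(accepted, 'transcripts/accepted.json')  # hypothetical output path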
def log_examples_with_templates(self, examples, log):
    for example in examples:
        if Preprocessor.skip_example(example):
            continue
        for event in example.events:
            template_id = event.template
            if template_id is not None:
                event.template = self.templates[template_id]
    write_json([ex.to_dict() for ex in examples], log)
def __init__(self, schema, price_tracker, retriever, timed_session=False):
    super(IRRankerSystem, self).__init__()
    self.schema = schema
    self.price_tracker = price_tracker
    self.timed_session = timed_session

    Env = namedtuple('Env', ['ranker', 'retriever', 'preprocessor'])
    ranker = IRRanker()
    preprocessor = Preprocessor(schema, price_tracker, 'canonical', 'canonical', 'canonical')
    self.env = Env(ranker, retriever, preprocessor)
def __init__(self, schema, lexicon, model_path, fact_check, decoding, timed_session=False, consecutive_entity=True, realizer=None):
    super(NeuralSystem, self).__init__()
    self.schema = schema
    self.lexicon = lexicon
    self.timed_session = timed_session
    self.consecutive_entity = consecutive_entity

    # Load arguments
    args_path = os.path.join(model_path, 'config.json')
    config = read_json(args_path)
    config['batch_size'] = 1
    config['gpu'] = 0  # Don't need GPU for batch_size=1
    config['decoding'] = decoding
    args = argparse.Namespace(**config)

    mappings_path = os.path.join(model_path, 'vocab.pkl')
    mappings = read_pickle(mappings_path)
    vocab = mappings['vocab']  # TODO: different models have the same key now

    args.dropout = 0
    logstats.add_args('model_args', args)
    model = build_model(schema, mappings, args)

    # Tensorflow config
    if args.gpu == 0:
        print 'GPU is disabled'
        config = tf.ConfigProto(device_count={'GPU': 0})
    else:
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5, allow_growth=True)
        config = tf.ConfigProto(device_count={'GPU': 1}, gpu_options=gpu_options)

    # NOTE: need to close the session when done
    tf_session = tf.Session(config=config)
    tf.initialize_all_variables().run(session=tf_session)

    # Load TF model parameters
    ckpt = tf.train.get_checkpoint_state(model_path + '-best')
    assert ckpt, 'No checkpoint found'
    assert ckpt.model_checkpoint_path, 'No model path found in checkpoint'
    saver = tf.train.Saver()
    saver.restore(tf_session, ckpt.model_checkpoint_path)

    self.model_name = args.model
    if self.model_name == 'attn-copy-encdec':
        args.entity_target_form = 'graph'
        copy = True
    else:
        copy = False
    preprocessor = Preprocessor(schema, lexicon, args.entity_encoding_form, args.entity_decoding_form, args.entity_target_form, args.prepend)
    textint_map = TextIntMap(vocab, mappings['entity'], preprocessor)

    Env = namedtuple('Env', ['model', 'tf_session', 'preprocessor', 'vocab', 'copy', 'textint_map', 'stop_symbol', 'remove_symbols', 'max_len', 'evaluator', 'prepend', 'consecutive_entity', 'realizer'])
    self.env = Env(model, tf_session, preprocessor, mappings['vocab'], copy, textint_map,
            stop_symbol=vocab.to_ind(markers.EOS),
            remove_symbols=map(vocab.to_ind, (markers.EOS, markers.PAD)),
            max_len=20,
            evaluator=FactEvaluator() if fact_check else None,
            prepend=args.prepend,
            consecutive_entity=self.consecutive_entity,
            realizer=realizer)
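
# The NOTE above about closing the session is easy to miss. A sketch of the
# cleanup it implies, assuming the class does not already define this elsewhere:
def close(self):
    # Release the TensorFlow session held in the Env tuple
    self.env.tf_session.close()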
def __init__(self, schema, price_tracker, retriever, model_path, mappings, timed_session=False):
    super(NeuralRankerSystem, self).__init__()
    self.schema = schema
    self.price_tracker = price_tracker
    self.timed_session = timed_session

    # Load arguments
    args_path = os.path.join(model_path, 'config.json')
    config = read_json(args_path)
    # TODO: handle this properly
    config['batch_size'] = 1
    config['pretrained_wordvec'] = None
    args = argparse.Namespace(**config)

    # NOTE: `mappings` enters as a directory path and is rebound below to the loaded dict
    mappings_path = os.path.join(mappings, 'vocab.pkl')
    mappings = read_pickle(mappings_path)
    vocab = mappings['vocab']

    logstats.add_args('model_args', args)
    model = build_model(schema, mappings, None, args)

    # Tensorflow config
    if args.gpu == 0:
        print 'GPU is disabled'
        config = tf.ConfigProto(device_count={'GPU': 0})
    else:
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5, allow_growth=True)
        config = tf.ConfigProto(device_count={'GPU': 1}, gpu_options=gpu_options)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    # NOTE: need to close the session when done
    tf_session = tf.Session(config=config)
    tf_session.run(tf.global_variables_initializer())

    # Load TF model parameters
    ckpt = tf.train.get_checkpoint_state(model_path + '-best')
    assert ckpt, 'No checkpoint found'
    assert ckpt.model_checkpoint_path, 'No model path found in checkpoint'
    saver = tf.train.Saver()
    saver.restore(tf_session, ckpt.model_checkpoint_path)

    preprocessor = Preprocessor(schema, price_tracker, 'canonical', 'canonical', 'canonical')
    textint_map = TextIntMap(vocab, preprocessor)
    int_markers = SpecialSymbols(*[mappings['vocab'].to_ind(m) for m in markers])
    model_config = {'retrieve': True}
    batcher = DialogueBatcherFactory.get_dialogue_batcher(model_config,
            int_markers=int_markers, slot_filling=False,
            kb_pad=mappings['kb_vocab'].to_ind(markers.PAD))

    StreamingDialogue.textint_map = textint_map
    StreamingDialogue.num_context = args.num_context
    StreamingDialogue.mappings = mappings

    Env = namedtuple('Env', ['ranker', 'retriever', 'tf_session', 'preprocessor', 'mappings', 'textint_map', 'batcher'])
    self.env = Env(model, retriever, tf_session, preprocessor, mappings, textint_map, batcher)
def extract_templates(self, transcripts_paths, max_examples=-1, ngram_N=4, log=None):
    examples = read_examples(transcripts_paths, max_examples, Scenario)
    for example in examples:
        if Preprocessor.skip_example(example):
            continue
        self.parse_example(example, ngram_N)
    self.add_counts(ngram_N)
    self.detokenize_templates()
    if log:
        self.log_examples_with_templates(examples, log)
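
# Usage sketch for extract_templates (the paths and the `extractor` instance
# are hypothetical; the method belongs to the template-extractor class above):
def _extract_templates_example(extractor):
    extractor.extract_templates(['transcripts/transcripts.json'],
                                max_examples=1000, ngram_N=4,
                                log='transcripts/templates_log.json')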
                  % (k, base_precision, ref_precision, base_recall, ref_recall, base_ndcg, ref_ndcg))
            self.eval_base[i, 0, instance] = base_precision
            self.eval_base[i, 1, instance] = base_recall
            self.eval_base[i, 2, instance] = base_ndcg
            self.eval_reform[i, 0, instance] = ref_precision
            self.eval_reform[i, 1, instance] = ref_recall
            self.eval_reform[i, 2, instance] = ref_ndcg


if __name__ == '__main__':
    q_reform = QueryReformulation(model_path='../../saved_model/qr_cnn_retrained_model_[p0.4104]_2020-06-14.h5')

    preprocessor = Preprocessor()
    preprocessor.load_data(path='../../query_reformulation_dataset')
    query_objs, query_sequence, terms_sequence, candidate_terms = \
        preprocessor.get_query_and_candidate_terms(sequence_length=20)

    # Keep only the held-out test split (train_test_split returns train/test pairs)
    _, query_objs, _, query_sequence, _, terms_sequence, _, candidate_terms = train_test_split(
        query_objs, query_sequence, terms_sequence, candidate_terms,
        test_size=0.3, random_state=42)

    evaluate = Evaluate(search_engine=search_engine,
                        reformulation_model=q_reform,
                        query_list=zip(query_objs, query_sequence, terms_sequence, candidate_terms),
                        sample_size=len(query_objs))
def __init__(self, schema, price_tracker, model_path, mappings_path, decoding, index=None, num_candidates=20, retriever_context_len=2, timed_session=False):
    super(NeuralSystem, self).__init__()
    self.schema = schema
    self.price_tracker = price_tracker
    self.timed_session = timed_session

    # Load arguments
    args_path = os.path.join(model_path, 'config.json')
    config = read_json(args_path)
    config['batch_size'] = 1
    config['gpu'] = 0  # Don't need GPU for batch_size=1
    config['decoding'] = decoding
    config['pretrained_wordvec'] = None
    args = argparse.Namespace(**config)

    vocab_path = os.path.join(mappings_path, 'vocab.pkl')
    mappings = read_pickle(vocab_path)
    vocab = mappings['vocab']  # TODO: different models have the same key now

    args.dropout = 0
    logstats.add_args('model_args', args)
    model = build_model(schema, mappings, None, args)

    # Tensorflow config
    if args.gpu == 0:
        print 'GPU is disabled'
        config = tf.ConfigProto(device_count={'GPU': 0})
    else:
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5, allow_growth=True)
        config = tf.ConfigProto(device_count={'GPU': 1}, gpu_options=gpu_options)

    # NOTE: need to close the session when done
    tf_session = tf.Session(config=config)
    tf.initialize_all_variables().run(session=tf_session)

    # Load TF model parameters
    ckpt = tf.train.get_checkpoint_state(model_path + '-best')
    assert ckpt, 'No checkpoint found'
    assert ckpt.model_checkpoint_path, 'No model path found in checkpoint'
    saver = tf.train.Saver()
    saver.restore(tf_session, ckpt.model_checkpoint_path)

    # Model config tells data generator which batcher to use
    model_config = {}
    if args.retrieve or args.model in ('ir', 'selector'):
        model_config['retrieve'] = True
    if args.predict_price:
        model_config['price'] = True

    self.model_name = args.model
    preprocessor = Preprocessor(schema, price_tracker, args.entity_encoding_form, args.entity_decoding_form, args.entity_target_form)
    textint_map = TextIntMap(vocab, preprocessor)
    int_markers = SpecialSymbols(*[mappings['vocab'].to_ind(m) for m in markers])
    dialogue_batcher = DialogueBatcherFactory.get_dialogue_batcher(model_config,
            int_markers=int_markers, slot_filling=False,
            kb_pad=mappings['kb_vocab'].to_ind(markers.PAD))

    # Retriever
    if args.model == 'selector':
        retriever = Retriever(index, context_size=retriever_context_len, num_candidates=num_candidates)
    else:
        retriever = None

    # TODO: class variable is not a good way to do this
    Dialogue.mappings = mappings
    Dialogue.textint_map = textint_map
    Dialogue.preprocessor = preprocessor
    Dialogue.num_context = args.num_context

    Env = namedtuple('Env', ['model', 'tf_session', 'preprocessor', 'vocab', 'textint_map', 'stop_symbol', 'remove_symbols', 'max_len', 'dialogue_batcher', 'retriever'])
    self.env = Env(model, tf_session, preprocessor, mappings['vocab'], textint_map,
            stop_symbol=vocab.to_ind(markers.EOS),
            remove_symbols=map(vocab.to_ind, (markers.EOS, markers.PAD)),
            max_len=20,
            dialogue_batcher=dialogue_batcher,
            retriever=retriever)
if initial_run:
    query_manager.search_queries()
    query_manager.save_queries(path=output_path)
else:
    query_manager.load_queries(path=output_path)
query_manager.clear_query_list(min_precision=0.2, min_recall=0.01)
query_list = query_manager.query_list

precisions = np.zeros(shape=(len(query_list),))
recalls = np.zeros(shape=(len(query_list),))
for i, query in enumerate(query_list):
    print('P:%.5f, R:%.5f < %s >' % (query.base_precision, query.base_recall, query.query))
    print('\tKeywords:', query.keywords)
    precisions[i] = query.base_precision
    recalls[i] = query.base_recall

print('Usable query number:', len(query_list))
print('Avg. baseline precision :', precisions.mean())
print('Avg. baseline recall :', recalls.mean())

# Preprocess #######################################################################################################
preprocessor = Preprocessor()
if initial_run:
    preprocessor.initialize(query_manager=query_manager, emb_path=embedding_file_path)
    preprocessor.save_data(path=output_path)
else:
    preprocessor.load_data(path=output_path)
parser.add_argument('--transcripts', nargs='*', help='JSON transcripts to extract templates')
parser.add_argument('--price-tracker-model')
parser.add_argument('--max-examples', default=-1, type=int)
# parser.add_argument('--templates', help='Path to load templates')
# parser.add_argument('--templates-output', help='Path to save templates')
# parser.add_argument('--model', help='Path to load model')
parser.add_argument('--model-output', help='Path to save the dialogue manager model')
args = parser.parse_args()

price_tracker = PriceTracker(args.price_tracker_model)
examples = read_examples(args.transcripts, args.max_examples, Scenario)
parsed_dialogues = []
templates = Templates()
for example in examples:
    if Preprocessor.skip_example(example):
        continue
    utterances = parse_example(example, price_tracker, templates)
    parsed_dialogues.append(utterances)

# Train n-gram model
sequences = []
for d in parsed_dialogues:
    sequences.append([u.lf.intent for u in d])
manager = Manager.from_train(sequences)
manager.save(args.model_output)
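
# Example invocation of this script (script name and paths are hypothetical):
#
#   python parse_dialogue.py --transcripts transcripts/transcripts.json \
#       --price-tracker-model price_tracker.pkl --model-output manager.pkl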
def preprocessor(schema, lexicon):
    return Preprocessor(schema, lexicon, 'canonical', 'canonical', 'graph')
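
# A sketch of exercising the fixture above in a pytest-style test; the
# entity_forms attribute is an assumption about the Preprocessor interface:
def test_preprocessor_entity_forms(preprocessor):
    # The fixture encodes and decodes entities canonically and targets graph form
    assert preprocessor.entity_forms['encoding'] == 'canonical'
    assert preprocessor.entity_forms['target'] == 'graph'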