Example #1
0
def filter_rejected_chats(transcripts):
    """Return the chats from ``transcripts`` that should not be skipped.

    Each raw chat is turned into an ``Example`` (via ``Example.from_dict``
    with ``Scenario``) and kept only when ``Preprocessor.skip_example``
    reports false for it. The returned list holds the original raw chats,
    in their original order, not the parsed examples.
    """
    def _is_kept(raw_chat):
        # Parse first; the skip decision is made on the parsed example.
        parsed = Example.from_dict(None, raw_chat, Scenario)
        return not Preprocessor.skip_example(parsed)

    return [raw_chat for raw_chat in transcripts if _is_kept(raw_chat)]
Example #2
0
 def log_examples_with_templates(self, examples, log):
     """Swap template ids for templates on kept examples, then dump all examples.

     For every example that ``Preprocessor.skip_example`` does not reject,
     each event whose ``template`` is not ``None`` has that value replaced
     by ``self.templates[<that value>]``. Afterwards every example, skipped
     or not, is serialised with ``to_dict()`` and written to ``log`` through
     ``write_json``.
     """
     kept = (item for item in examples if not Preprocessor.skip_example(item))
     for item in kept:
         for evt in item.events:
             key = evt.template
             if key is None:
                 continue
             evt.template = self.templates[key]
     serialised = [item.to_dict() for item in examples]
     write_json(serialised, log)
Example #3
0
    def __init__(self, schema, price_tracker, retriever, timed_session=False):
        """Set up the IR ranker system.

        Stores ``schema``, ``price_tracker`` and ``timed_session`` on the
        instance, then packs a new ``IRRanker``, the supplied ``retriever``
        and a ``Preprocessor`` (all three entity forms set to 'canonical')
        into the ``self.env`` named tuple.
        """
        super(IRRankerSystem, self).__init__()
        self.schema, self.price_tracker, self.timed_session = (
            schema, price_tracker, timed_session)

        env_type = namedtuple('Env', ['ranker', 'retriever', 'preprocessor'])
        form = 'canonical'
        # Keyword arguments are evaluated left to right, so the ranker is
        # still constructed before the preprocessor.
        self.env = env_type(
            ranker=IRRanker(),
            retriever=retriever,
            preprocessor=Preprocessor(schema, price_tracker, form, form, form))
Example #4
0
    def __init__(self, schema, lexicon, model_path, fact_check, decoding, timed_session=False, consecutive_entity=True, realizer=None):
        """Restore a trained model from ``model_path`` and assemble ``self.env``.

        Args:
            schema: Stored on the instance; also passed to ``build_model`` and
                ``Preprocessor``.
            lexicon: Stored on the instance; also passed to ``Preprocessor``.
            model_path: Directory holding ``config.json`` and ``vocab.pkl``.
                The checkpoint state is looked up at ``model_path + '-best'``.
            fact_check: When truthy, a ``FactEvaluator`` is put in the env as
                ``evaluator``; otherwise ``evaluator`` is ``None``.
            decoding: Written into the loaded config under ``'decoding'``.
            timed_session: Stored on the instance unchanged.
            consecutive_entity: Stored on the instance and forwarded to the env.
            realizer: Forwarded to the env unchanged.

        Raises:
            AssertionError: If no checkpoint state, or no checkpoint path
                inside it, is found (only when assertions are enabled).
        """
        super(NeuralSystem, self).__init__()
        self.schema = schema
        self.lexicon = lexicon
        self.timed_session = timed_session
        self.consecutive_entity = consecutive_entity

        # Load arguments
        args_path = os.path.join(model_path, 'config.json')
        config = read_json(args_path)
        # Override values from the config file for this session.
        config['batch_size'] = 1
        config['gpu'] = 0  # Don't need GPU for batch_size=1
        config['decoding'] = decoding
        # Expose the config keys as attributes (args.gpu, args.model, ...).
        args = argparse.Namespace(**config)

        # Mappings are read from vocab.pkl in the same directory as the config.
        mappings_path = os.path.join(model_path, 'vocab.pkl')
        mappings = read_pickle(mappings_path)
        vocab = mappings['vocab']

        # TODO: different models have the same key now
        args.dropout = 0
        logstats.add_args('model_args', args)
        model = build_model(schema, mappings, args)

        # Tensorflow config
        # NOTE: `config` is rebound below from the JSON dict to a tf.ConfigProto;
        # the JSON values stay reachable through `args`.
        if args.gpu == 0:
            print 'GPU is disabled'
            config = tf.ConfigProto(device_count = {'GPU': 0})
        else:
            # NOTE(review): config['gpu'] is forced to 0 above, so this branch
            # looks unreachable unless a call in between mutates args.gpu -- confirm.
            gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = 0.5, allow_growth=True)
            config = tf.ConfigProto(device_count = {'GPU': 1}, gpu_options=gpu_options)

        # NOTE: need to close the session when done
        tf_session = tf.Session(config=config)
        # Variables are initialised first and then overwritten by the restore below.
        tf.initialize_all_variables().run(session=tf_session)

        # Load TF model parameters
        ckpt = tf.train.get_checkpoint_state(model_path+'-best')
        assert ckpt, 'No checkpoint found'
        assert ckpt.model_checkpoint_path, 'No model path found in checkpoint'
        saver = tf.train.Saver()
        saver.restore(tf_session, ckpt.model_checkpoint_path)

        self.model_name = args.model
        if self.model_name == 'attn-copy-encdec':
            # For this model the configured target form is overridden with 'graph'.
            args.entity_target_form = 'graph'
            copy = True
        else:
            copy = False
        preprocessor = Preprocessor(schema, lexicon, args.entity_encoding_form, args.entity_decoding_form, args.entity_target_form, args.prepend)
        textint_map = TextIntMap(vocab, mappings['entity'], preprocessor)

        # Bundle everything into one named tuple. `vocab` and mappings['vocab']
        # refer to the same object. The print statement above implies Python 2,
        # where map() returns a list, so remove_symbols is a list of two indices.
        Env = namedtuple('Env', ['model', 'tf_session', 'preprocessor', 'vocab', 'copy', 'textint_map', 'stop_symbol', 'remove_symbols', 'max_len', 'evaluator', 'prepend', 'consecutive_entity', 'realizer'])
        self.env = Env(model, tf_session, preprocessor, mappings['vocab'], copy, textint_map, stop_symbol=vocab.to_ind(markers.EOS), remove_symbols=map(vocab.to_ind, (markers.EOS, markers.PAD)), max_len=20, evaluator=FactEvaluator() if fact_check else None, prepend=args.prepend, consecutive_entity=self.consecutive_entity, realizer=realizer)
Example #5
0
    def __init__(self, schema, price_tracker, retriever, model_path, mappings, timed_session=False):
        """Restore a trained ranker model and assemble ``self.env``.

        Args:
            schema: Stored on the instance; also passed to ``build_model`` and
                ``Preprocessor``.
            price_tracker: Stored on the instance; also passed to ``Preprocessor``.
            retriever: Placed in the env unchanged.
            model_path: Directory holding ``config.json``. The checkpoint state
                is looked up at ``model_path + '-best'``.
            mappings: Directory containing ``vocab.pkl``. The name is rebound
                inside this method to the object loaded from that file.
            timed_session: Stored on the instance unchanged.

        Raises:
            AssertionError: If no checkpoint state, or no checkpoint path
                inside it, is found (only when assertions are enabled).
        """
        super(NeuralRankerSystem, self).__init__()
        self.schema = schema
        self.price_tracker = price_tracker
        self.timed_session = timed_session

        # Load arguments
        args_path = os.path.join(model_path, 'config.json')
        config = read_json(args_path)
        # TODO: handle this properly
        config['batch_size'] = 1
        config['pretrained_wordvec'] = None
        # Expose the config keys as attributes (args.gpu, args.num_context, ...).
        args = argparse.Namespace(**config)

        # `mappings` arrives as a directory path and is rebound here to the
        # loaded object, which is indexed by 'vocab' and 'kb_vocab' below.
        mappings_path = os.path.join(mappings, 'vocab.pkl')
        mappings = read_pickle(mappings_path)
        vocab = mappings['vocab']

        logstats.add_args('model_args', args)
        model = build_model(schema, mappings, None, args)

        # Tensorflow config
        # NOTE: `config` is rebound below from the JSON dict to a tf.ConfigProto;
        # the JSON values stay reachable through `args`.
        if args.gpu == 0:
            print 'GPU is disabled'
            config = tf.ConfigProto(device_count = {'GPU': 0})
        else:
            gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = 0.5, allow_growth=True)
            config = tf.ConfigProto(device_count = {'GPU': 1}, gpu_options=gpu_options)
        # Presumably quiets TensorFlow's native logging -- confirm it still takes
        # effect when set at this point.
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

        # NOTE: need to close the session when done
        tf_session = tf.Session(config=config)
        # Variables are initialised first and then overwritten by the restore below.
        tf_session.run(tf.global_variables_initializer())

        # Load TF model parameters
        ckpt = tf.train.get_checkpoint_state(model_path+'-best')
        assert ckpt, 'No checkpoint found'
        assert ckpt.model_checkpoint_path, 'No model path found in checkpoint'
        saver = tf.train.Saver()
        saver.restore(tf_session, ckpt.model_checkpoint_path)

        preprocessor = Preprocessor(schema, price_tracker, 'canonical', 'canonical', 'canonical')
        textint_map = TextIntMap(vocab, preprocessor)

        # Map every entry of `markers` to its vocabulary index.
        int_markers = SpecialSymbols(*[mappings['vocab'].to_ind(m) for m in markers])
        model_config = {'retrieve': True}
        batcher = DialogueBatcherFactory.get_dialogue_batcher(model_config, int_markers=int_markers, slot_filling=False, kb_pad=mappings['kb_vocab'].to_ind(markers.PAD))

        # These are set on StreamingDialogue itself, not on an instance, so
        # they are shared by everything that reads them from that name.
        StreamingDialogue.textint_map = textint_map
        StreamingDialogue.num_context = args.num_context
        StreamingDialogue.mappings = mappings

        # Bundle the loaded pieces into one named tuple; the model fills the 'ranker' slot.
        Env = namedtuple('Env', ['ranker', 'retriever', 'tf_session', 'preprocessor', 'mappings', 'textint_map', 'batcher'])
        self.env = Env(model, retriever, tf_session, preprocessor, mappings, textint_map, batcher)
Example #6
0
    def extract_templates(self,
                          transcripts_paths,
                          max_examples=-1,
                          ngram_N=4,
                          log=None):
        """Read examples from transcripts and extract templates from them.

        Examples are loaded with ``read_examples``; every example that
        ``Preprocessor.skip_example`` does not reject is passed to
        ``self.parse_example`` together with ``ngram_N``. Afterwards counts
        are added and templates are detokenized. When ``log`` is truthy, all
        loaded examples are handed to ``self.log_examples_with_templates``.
        """
        loaded = read_examples(transcripts_paths, max_examples, Scenario)

        usable = (item for item in loaded if not Preprocessor.skip_example(item))
        for item in usable:
            self.parse_example(item, ngram_N)

        self.add_counts(ngram_N)
        self.detokenize_templates()

        if not log:
            return
        self.log_examples_with_templates(loaded, log)
Example #7
0
                  % (k, base_precision, ref_precision,
                     base_recall, ref_recall, base_ndcg, ref_ndcg))

            self.eval_base[i, 0, instance] = base_precision
            self.eval_base[i, 1, instance] = base_recall
            self.eval_base[i, 2, instance] = base_ndcg

            self.eval_reform[i, 0, instance] = ref_precision
            self.eval_reform[i, 1, instance] = ref_recall
            self.eval_reform[i, 2, instance] = ref_ndcg


if __name__ == '__main__':
    # Script entry point: load a saved reformulation model, load the
    # preprocessed dataset, and build an Evaluate object from part of it.
    q_reform = QueryReformulation(model_path='../../saved_model/qr_cnn_retrained_model_[p0.4104]_2020-06-14.h5')

    # Load previously saved preprocessing data and derive the four
    # collections that are split below.
    preprocessor = Preprocessor()
    preprocessor.load_data(path='../../query_reformulation_dataset')
    query_objs, query_sequence, terms_sequence, candidate_terms = \
        preprocessor.get_query_and_candidate_terms(sequence_length=20)

    # Only every second output of train_test_split is kept (the ones not bound
    # to `_`); the four names are rebound to those outputs. Presumably this is
    # the held-out 30% portion of each collection -- confirm against the
    # train_test_split in use. random_state is pinned to 42.
    _, query_objs, _, query_sequence, _, terms_sequence, _, candidate_terms = train_test_split(query_objs,
                                                                                               query_sequence,
                                                                                               terms_sequence,
                                                                                               candidate_terms,
                                                                                               test_size=0.3,
                                                                                               random_state=42)

    # NOTE(review): `search_engine` is not defined anywhere in this block;
    # presumably it is created at module level -- confirm.
    evaluate = Evaluate(search_engine=search_engine, reformulation_model=q_reform,
                        query_list=zip(query_objs, query_sequence, terms_sequence, candidate_terms),
                        sample_size=len(query_objs))
Example #8
0
    def __init__(self,
                 schema,
                 price_tracker,
                 model_path,
                 mappings_path,
                 decoding,
                 index=None,
                 num_candidates=20,
                 retriever_context_len=2,
                 timed_session=False):
        """Restore a trained model from ``model_path`` and assemble ``self.env``.

        Args:
            schema: Stored on the instance; also passed to ``build_model`` and
                ``Preprocessor``.
            price_tracker: Stored on the instance; also passed to ``Preprocessor``.
            model_path: Directory holding ``config.json``. The checkpoint state
                is looked up at ``model_path + '-best'``.
            mappings_path: Directory containing ``vocab.pkl``.
            decoding: Written into the loaded config under ``'decoding'``.
            index: Passed to ``Retriever``; only used when the configured
                model is ``'selector'``.
            num_candidates: Passed to ``Retriever`` as ``num_candidates``.
            retriever_context_len: Passed to ``Retriever`` as ``context_size``.
            timed_session: Stored on the instance unchanged.

        Raises:
            AssertionError: If no checkpoint state, or no checkpoint path
                inside it, is found (only when assertions are enabled).
        """
        super(NeuralSystem, self).__init__()
        self.schema = schema
        self.price_tracker = price_tracker
        self.timed_session = timed_session

        # Load arguments
        args_path = os.path.join(model_path, 'config.json')
        config = read_json(args_path)
        # Override values from the config file for this session.
        config['batch_size'] = 1
        config['gpu'] = 0  # Don't need GPU for batch_size=1
        config['decoding'] = decoding
        config['pretrained_wordvec'] = None
        # Expose the config keys as attributes (args.gpu, args.model, ...).
        args = argparse.Namespace(**config)

        vocab_path = os.path.join(mappings_path, 'vocab.pkl')
        mappings = read_pickle(vocab_path)
        vocab = mappings['vocab']

        # TODO: different models have the same key now
        args.dropout = 0
        logstats.add_args('model_args', args)
        model = build_model(schema, mappings, None, args)

        # Tensorflow config
        # NOTE: `config` is rebound below from the JSON dict to a tf.ConfigProto;
        # the JSON values stay reachable through `args`.
        if args.gpu == 0:
            print 'GPU is disabled'
            config = tf.ConfigProto(device_count={'GPU': 0})
        else:
            # NOTE(review): config['gpu'] is forced to 0 above, so this branch
            # looks unreachable unless a call in between mutates args.gpu -- confirm.
            gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5,
                                        allow_growth=True)
            config = tf.ConfigProto(device_count={'GPU': 1},
                                    gpu_options=gpu_options)

        # NOTE: need to close the session when done
        tf_session = tf.Session(config=config)
        # Variables are initialised first and then overwritten by the restore below.
        tf.initialize_all_variables().run(session=tf_session)

        # Load TF model parameters
        ckpt = tf.train.get_checkpoint_state(model_path + '-best')
        assert ckpt, 'No checkpoint found'
        assert ckpt.model_checkpoint_path, 'No model path found in checkpoint'
        saver = tf.train.Saver()
        saver.restore(tf_session, ckpt.model_checkpoint_path)

        # Model config tells data generator which batcher to use
        model_config = {}
        if args.retrieve or args.model in ('ir', 'selector'):
            model_config['retrieve'] = True
        if args.predict_price:
            model_config['price'] = True

        self.model_name = args.model
        preprocessor = Preprocessor(schema, price_tracker,
                                    args.entity_encoding_form,
                                    args.entity_decoding_form,
                                    args.entity_target_form)
        textint_map = TextIntMap(vocab, preprocessor)
        # Map every entry of `markers` to its vocabulary index.
        int_markers = SpecialSymbols(
            *[mappings['vocab'].to_ind(m) for m in markers])
        dialogue_batcher = DialogueBatcherFactory.get_dialogue_batcher(
            model_config,
            int_markers=int_markers,
            slot_filling=False,
            kb_pad=mappings['kb_vocab'].to_ind(markers.PAD))

        # Retriever
        # Only the 'selector' model gets a retriever; the env holds None otherwise.
        if args.model == 'selector':
            retriever = Retriever(index,
                                  context_size=retriever_context_len,
                                  num_candidates=num_candidates)
        else:
            retriever = None

        #TODO: class variable is not a good way to do this
        # These are set on Dialogue itself, not on an instance, so they are
        # shared by everything that reads them from that name.
        Dialogue.mappings = mappings
        Dialogue.textint_map = textint_map
        Dialogue.preprocessor = preprocessor
        Dialogue.num_context = args.num_context

        # Bundle everything into one named tuple. `vocab` and mappings['vocab']
        # refer to the same object. The print statement above implies Python 2,
        # where map() returns a list, so remove_symbols is a list of two indices.
        Env = namedtuple('Env', [
            'model', 'tf_session', 'preprocessor', 'vocab', 'textint_map',
            'stop_symbol', 'remove_symbols', 'max_len', 'dialogue_batcher',
            'retriever'
        ])
        self.env = Env(model,
                       tf_session,
                       preprocessor,
                       mappings['vocab'],
                       textint_map,
                       stop_symbol=vocab.to_ind(markers.EOS),
                       remove_symbols=map(vocab.to_ind,
                                          (markers.EOS, markers.PAD)),
                       max_len=20,
                       dialogue_batcher=dialogue_batcher,
                       retriever=retriever)
Example #9
0
    if initial_run:
        query_manager.search_queries()
        query_manager.save_queries(path=output_path)
    else:
        query_manager.load_queries(path=output_path)

    query_manager.clear_query_list(min_precision=0.2, min_recall=0.01)
    query_list = query_manager.query_list
    precisions = np.zeros(shape=(len(query_list), ))
    recalls = np.zeros(shape=(len(query_list), ))
    for i, query in enumerate(query_list):
        print('P:%.5f, R:%.5f  < %s >' %
              (query.base_precision, query.base_recall, query.query))
        print('\tKeywords:', query.keywords)
        precisions[i] = query.base_precision
        recalls[i] = query.base_recall

    print('Usable query number:', len(query_list))
    print('Avg. baseline precision :', precisions.mean())
    print('Avg. baseline recall    :', recalls.mean())

    # Preprocess #######################################################################################################

    preprocessor = Preprocessor()
    if initial_run:
        preprocessor.initialize(query_manager=query_manager,
                                emb_path=embedding_file_path)
        preprocessor.save_data(path=output_path)
    else:
        preprocessor.load_data(path=output_path)
Example #10
0
    parser.add_argument('--transcripts', nargs='*', help='JSON transcripts to extract templates')
    parser.add_argument('--price-tracker-model')
    parser.add_argument('--max-examples', default=-1, type=int)
    # parser.add_argument('--templates', help='Path to load templates')
    # parser.add_argument('--templates-output', help='Path to save templates')
    # parser.add_argument('--model', help='Path to load model')
    # parser.add_argument('--model-output', help='Path to save the dialogue manager model')
    args = parser.parse_args()

    price_tracker = PriceTracker(args.price_tracker_model)
    examples = read_examples(args.transcripts, args.max_examples, Scenario)
    parsed_dialogues = []
    templates = Templates()

    for example in examples:
        if Preprocessor.skip_example(example):
            continue
        utterances = parse_example(example, price_tracker, templates)
        parsed_dialogues.append(utterances)

    #for d in parsed_dialogues[:2]:
    #    for u in d:
    #        print u
    #import sys; sys.exit()

    # Train n-gram model
    sequences = []
    for d in parsed_dialogues:
        sequences.append([u.lf.intent for u in d])
    manager = Manager.from_train(sequences)
    manager.save(args.model_output)
Example #11
0
def preprocessor(schema, lexicon):
    """Build a ``Preprocessor`` for ``schema`` and ``lexicon``.

    The three trailing positional arguments are fixed to 'canonical',
    'canonical' and 'graph', in that order.
    """
    entity_forms = ('canonical', 'canonical', 'graph')
    return Preprocessor(schema, lexicon, *entity_forms)