Example #1
0
def main():
    preprocessing = PreProcessing()

    inputs, outputs = preprocessing.loadPortmanteauData(
        "./data/finalports.csv")
    train, val, test = preprocessing.prepareMTData(inputs, outputs)
    #return

    # get model
    params = {}
    params['embeddings_dim'] = config.embeddings_dim
    params['lstm_cell_size'] = config.lstm_cell_size
    params['vocab_size'] = preprocessing.vocab_size
    print "params['vocab_size'] --------> ", params['vocab_size']
    params['max_input_seq_length'] = config.max_input_seq_length
    params[
        'max_output_seq_length'] = config.max_output_seq_length - 1  #inputs are all but last element, outputs are al but first element
    params['batch_size'] = 20
    params['pretrained_embeddings'] = True

    #return
    print params
    buckets = {0: {'max_input_seq_length': 40, 'max_output_seq_length': 19}}
    #,1:{'max_input_seq_length':40,'max_output_seq_length':19}, 2:{'max_input_seq_length':40, 'max_output_seq_length':19} }
    print buckets

    # train
    lim = params['batch_size'] * (len(train[0]) / params['batch_size'])
    if lim != -1:
        train_encoder_inputs, train_decoder_inputs, train_decoder_outputs = train
        train_encoder_inputs = train_encoder_inputs[:lim]
        train_decoder_inputs = train_decoder_inputs[:lim]
        train_decoder_outputs = train_decoder_outputs[:lim]
        train = train_encoder_inputs, train_decoder_inputs, train_decoder_outputs
    if params['pretrained_embeddings']:
        pretrained_embeddings = getPretrainedEmbeddings(
            src="pretrained_embeddings.txt")
        char_to_idx = preprocessing.word_index
        encoder_embedding_matrix = np.random.rand(params['vocab_size'],
                                                  params['embeddings_dim'])
        decoder_embedding_matrix = np.random.rand(params['vocab_size'],
                                                  params['embeddings_dim'])
        for char, idx in char_to_idx.items():
            if char in pretrained_embeddings:
                encoder_embedding_matrix[idx] = pretrained_embeddings[char]
                decoder_embedding_matrix[idx] = pretrained_embeddings[char]
            else:
                print "No pretrained embedding for ", char
        params['encoder_embeddings_matrix'] = encoder_embedding_matrix
        params['decoder_embeddings_matrix'] = decoder_embedding_matrix

    mode = 'train'
    if mode == 'train':
        train_buckets = {}
        for bucket, _ in enumerate(buckets):
            train_buckets[bucket] = train

        rnn_model = solver.Solver(buckets)
        _ = rnn_model.getModel(params,
                               mode='train',
                               reuse=False,
                               buckets=buckets)
        rnn_model.trainModel(config=params,
                             train_feed_dict=train_buckets,
                             val_feed_dct=None,
                             reverse_vocab=preprocessing.index_word,
                             do_init=True)

    else:
        val_encoder_inputs, val_decoder_inputs, val_decoder_outputs = val
        print "val_encoder_inputs = ", val_encoder_inputs

        if len(val_decoder_outputs.shape) == 3:
            val_decoder_outputs = np.reshape(
                val_decoder_outputs,
                (val_decoder_outputs.shape[0], val_decoder_outputs.shape[1]))

        rnn_model = solver.Solver(buckets=None, mode='inference')
        _ = rnn_model.getModel(params,
                               mode='inference',
                               reuse=False,
                               buckets=None)
        print "----Running inference-----"
        #rnn_model.runInference(params, val_encoder_inputs[:params['batch_size']], val_decoder_outputs[:params['batch_size']], preprocessing.index_word)
        rnn_model.solveAll(params, val_encoder_inputs, val_decoder_outputs,
                           preprocessing.index_word)
Example #2
0
    def loadModel(self, saved_model_path, inference_type="greedy"):

        # params
        params = {}
        params['embeddings_dim'] = config.embeddings_dim
        params['lstm_cell_size'] = config.lstm_cell_size
        params['max_input_seq_length'] = config.max_input_seq_length
        params[
            'max_output_seq_length'] = config.max_output_seq_length - 1  #inputs are all but last element, outputs are al but first element
        params['batch_size'] = config.batch_size
        params['pretrained_embeddings'] = config.use_pretrained_embeddings
        params['pretrained_embeddings'] = True
        params[
            'share_encoder_decoder_embeddings'] = config.share_encoder_decoder_embeddings
        params['use_pointer'] = config.use_pointer
        params[
            'pretrained_embeddings_path'] = config.pretrained_embeddings_path
        params[
            'pretrained_embeddings_are_trainable'] = config.pretrained_embeddings_are_trainable
        params[
            'use_additional_info_from_pretrained_embeddings'] = config.use_additional_info_from_pretrained_embeddings
        params['max_vocab_size'] = config.max_vocab_size
        params['do_vocab_pruning'] = config.do_vocab_pruning
        params['use_reverse_encoder'] = config.use_reverse_encoder
        params['use_sentinel_loss'] = config.use_sentinel_loss
        params['lambd'] = config.lambd
        params['use_context_for_out'] = config.use_context_for_out
        params['batch_size'] = 32

        #	data = pickle.load(open(data_src + "data.obj","r") )
        preprocessing = pickle.load(open(data_src + "preprocessing.obj", "r"))

        params['vocab_size'] = preprocessing.vocab_size
        params['preprocessing'] = preprocessing
        saved_model_path = saved_model_path
        params['saved_model_path'] = saved_model_path

        if params['pretrained_embeddings']:
            pretrained_embeddings = pickle.load(
                open(params['pretrained_embeddings_path'], "r"))
            word_to_idx = preprocessing.word_to_idx
            encoder_embedding_matrix = np.random.rand(params['vocab_size'],
                                                      params['embeddings_dim'])
            decoder_embedding_matrix = np.random.rand(params['vocab_size'],
                                                      params['embeddings_dim'])
            not_found_count = 0
            for token, idx in word_to_idx.items():
                if token in pretrained_embeddings:
                    encoder_embedding_matrix[idx] = pretrained_embeddings[
                        token]
                    decoder_embedding_matrix[idx] = pretrained_embeddings[
                        token]
                else:
                    not_found_count += 1
            #print "not found count = ", not_found_count
            params['encoder_embeddings_matrix'] = encoder_embedding_matrix
            params['decoder_embeddings_matrix'] = decoder_embedding_matrix

            if params['use_additional_info_from_pretrained_embeddings']:
                additional_count = 0
                tmp = []
                for token in pretrained_embeddings:
                    if token not in preprocessing.word_to_idx:
                        preprocessing.word_to_idx[
                            token] = preprocessing.word_to_idx_ctr
                        preprocessing.idx_to_word[
                            preprocessing.word_to_idx_ctr] = token
                        preprocessing.word_to_idx_ctr += 1
                        tmp.append(pretrained_embeddings[token])
                        additional_count += 1
                #print "additional_count = ",additional_count
                params['vocab_size'] = preprocessing.word_to_idx_ctr
                tmp = np.array(tmp)
                encoder_embedding_matrix = np.vstack(
                    [encoder_embedding_matrix, tmp])
                decoder_embedding_matrix = np.vstack(
                    [decoder_embedding_matrix, tmp])
                #print "decoder_embedding_matrix.shape ",decoder_embedding_matrix.shape
                #print "New vocab size = ",params['vocab_size']

        rnn_model = solver.Solver(params, buckets=None, mode='inference')
        _ = rnn_model.getModel(params,
                               mode='inference',
                               reuse=False,
                               buckets=None)

        sess = tf.Session()
        saver = tf.train.Saver()
        saved_model_path = params['saved_model_path']
        saver.restore(sess, saved_model_path)  #"./tmp/model39.ckpt")

        self.sess = sess
        self.rnn_model = rnn_model
        self.params = params
        self.preprocessing = preprocessing
        self.inference_type = inference_type
Example #3
0
def main():
    # params
    params = {}
    params['embeddings_dim'] = config.embeddings_dim
    params['lstm_cell_size'] = config.lstm_cell_size
    params['max_input_seq_length'] = config.max_input_seq_length

    # inputs are all but last element,
    # outputs are all but first element
    params['max_output_seq_length'] = config.max_output_seq_length - 1

    params['batch_size'] = config.batch_size
    params['pretrained_embeddings'] = config.use_pretrained_embeddings
    params[
        'share_encoder_decoder_embeddings'] = config.share_encoder_decoder_embeddings
    params['use_pointer'] = config.use_pointer
    params['pretrained_embeddings_path'] = config.pretrained_embeddings_path
    params[
        'pretrained_embeddings_are_trainable'] = config.pretrained_embeddings_are_trainable
    params[
        'use_additional_info_from_pretrained_embeddings'] = config.use_additional_info_from_pretrained_embeddings
    params['max_vocab_size'] = config.max_vocab_size
    params['do_vocab_pruning'] = config.do_vocab_pruning
    params['use_reverse_encoder'] = config.use_reverse_encoder
    params['use_sentinel_loss'] = config.use_sentinel_loss
    params['lambd'] = config.lambd
    params['use_context_for_out'] = config.use_context_for_out

    print "PARAMS:"
    for key, value in params.items():
        print " -- ", key, " = ", value
    buckets = {
        0: {
            'max_input_seq_length': params['max_input_seq_length'],
            'max_output_seq_length': params['max_output_seq_length']
        }
    }

    # Preprocess data
    preprocessing, val_encoder_inputs, val_decoder_outputs = preprocess(
        "style_transfer_test.in")
    params['vocab_size'] = preprocessing.vocab_size

    # Pretrained embeddibngs
    if params['pretrained_embeddings']:
        pretrained_embeddings = pickle.load(
            open(params['pretrained_embeddings_path'], "r"))
        word_to_idx = preprocessing.word_to_idx
        encoder_embedding_matrix = np.random.rand(params['vocab_size'],
                                                  params['embeddings_dim'])
        decoder_embedding_matrix = np.random.rand(params['vocab_size'],
                                                  params['embeddings_dim'])
        not_found_count = 0
        for token, idx in word_to_idx.items():
            if token in pretrained_embeddings:
                encoder_embedding_matrix[idx] = pretrained_embeddings[token]
                decoder_embedding_matrix[idx] = pretrained_embeddings[token]
            else:
                if not_found_count < 10:
                    print "No pretrained embedding for (only first 10 such cases will be printed. other prints are suppressed) ", token
                not_found_count += 1
        print "not found count = ", not_found_count
        params['encoder_embeddings_matrix'] = encoder_embedding_matrix
        params['decoder_embeddings_matrix'] = decoder_embedding_matrix

        if params['use_additional_info_from_pretrained_embeddings']:
            additional_count = 0
            tmp = []
            for token in pretrained_embeddings:
                if token not in preprocessing.word_to_idx:
                    preprocessing.word_to_idx[
                        token] = preprocessing.word_to_idx_ctr
                    preprocessing.idx_to_word[
                        preprocessing.word_to_idx_ctr] = token
                    preprocessing.word_to_idx_ctr += 1
                    tmp.append(pretrained_embeddings[token])
                    additional_count += 1
            print "additional_count = ", additional_count
            params['vocab_size'] = preprocessing.word_to_idx_ctr
            tmp = np.array(tmp)
            encoder_embedding_matrix = np.vstack(
                [encoder_embedding_matrix, tmp])
            decoder_embedding_matrix = np.vstack(
                [decoder_embedding_matrix, tmp])
        print "decoder_embedding_matrix.shape ", decoder_embedding_matrix.shape
        print "New vocab size = ", params['vocab_size']

    # Make inference
    saved_model_path = sys.argv[2]
    print "saved_model_path = ", saved_model_path
    inference_type = "greedy"
    print "inference_type = ", inference_type
    params['saved_model_path'] = saved_model_path
    rnn_model = solver.Solver(params, buckets=None, mode='inference')
    _ = rnn_model.getModel(params, mode='inference', reuse=False, buckets=None)
    print "----Running inference-----"

    # val
    print "val_encoder_inputs = ", val_encoder_inputs
    if len(val_decoder_outputs.shape) == 3:
        val_decoder_outputs = np.reshape(
            val_decoder_outputs,
            (val_decoder_outputs.shape[0], val_decoder_outputs.shape[1]))
    decoder_outputs_inference, decoder_ground_truth_outputs = rnn_model.solveAll(
        params,
        val_encoder_inputs,
        val_decoder_outputs,
        preprocessing.idx_to_word,
        inference_type=inference_type)
    validOutFile_name = saved_model_path + ".style_transfer_test.output"
    original_data_path = data_src + "style_transfer_test.in"
    BLEUOutputFile_path = saved_model_path + ".style_transfer_test.BLEU"
    utilities.getBlue(validOutFile_name, original_data_path,
                      BLEUOutputFile_path, decoder_outputs_inference,
                      decoder_ground_truth_outputs, preprocessing)
Example #4
0
def main():

    # params
    params = {}
    params['embeddings_dim'] =  config.embeddings_dim
    params['lstm_cell_size'] = config.lstm_cell_size
    params['max_input_seq_length'] = config.max_input_seq_length
    params['max_output_seq_length'] = config.max_output_seq_length-1 #inputs are all but last element, outputs are al but first element
    params['batch_size'] = config.batch_size
    params['pretrained_embeddings'] = config.use_pretrained_embeddings
    params['share_encoder_decoder_embeddings'] = config.share_encoder_decoder_embeddings
    params['use_pointer'] = config.use_pointer
    params['pretrained_embeddings_path'] = config.pretrained_embeddings_path
    params['pretrained_embeddings_are_trainable'] = config.pretrained_embeddings_are_trainable
    params['use_additional_info_from_pretrained_embeddings'] = config.use_additional_info_from_pretrained_embeddings
    params['max_vocab_size'] = config.max_vocab_size
    params['do_vocab_pruning'] = config.do_vocab_pruning
    params['use_reverse_encoder'] = config.use_reverse_encoder
    params['use_sentinel_loss'] =config.use_sentinel_loss
    params['lambd'] = config.lambd
    params['use_context_for_out'] = config.use_context_for_out

    print("PARAMS:")
    for key,value in params.items():
        print(" -- ",key," = ",value)
    buckets = {  0:{'max_input_seq_length':params['max_input_seq_length'], 'max_output_seq_length':params['max_output_seq_length']} }
    #print "buckets = ",buckets

    # train
    mode=sys.argv[1]
    print("mode = ",mode)

    ########### PREPROCESSING
    if mode=="preprocessing":
        # preprocessing
        print("------------------------------------------------------------------------")
        preprocessing = PreProcessing()
        splits =["train","valid","test"]
        #for split in splits: preprocessing.loadVocab(split)
        preprocessing.loadVocab('train')
        if params['do_vocab_pruning']:
            preprocessing.pruneVocab(max_vocab_size=params['max_vocab_size'])
        data_seq = {split:preprocessing.loadData(split=split) for split in splits}
        data = { split:preprocessing.prepareMTData(cur_data) for split,cur_data in data_seq.items()  }
        for split,split_data in data.items():
            #print "Split: ",split
            inp,dinp,dout,dout_inp_matches = split_data
            #print inp.shape, dinp.shape, dout.shape, dout_inp_matches.shape
        #print "------------------------------------------------------------------------"
        #print ""
        pickle.dump(data,open(data_src + "data.obj",mode="wb"))
        pickle.dump(preprocessing, open(data_src + "preprocessing.obj",mode="wb") )
        return
    else:
        data = pickle.load(open(data_src + "data.obj","rb") )
        preprocessing = pickle.load(open(data_src + "preprocessing.obj","rb") )

    params['vocab_size'] = preprocessing.vocab_size
    params['preprocessing'] = preprocessing
    train = data['train']
    val = data['valid']
    test = data['test']

    # DEBUG
    if mode=="debug":
        lim = 64
    else:
        lim= int(params['batch_size'] * ( len(train[0])/params['batch_size']))
    if lim!=-1:
        train_encoder_inputs, train_decoder_inputs, train_decoder_outputs, train_decoder_outputs_matching_inputs = train
        train_encoder_inputs = train_encoder_inputs[:lim]
        train_decoder_inputs = train_decoder_inputs[:lim]
        train_decoder_outputs = train_decoder_outputs[:lim]
        train_decoder_outputs_matching_inputs = train_decoder_outputs_matching_inputs[:lim]
        train = train_encoder_inputs, train_decoder_inputs, train_decoder_outputs, train_decoder_outputs_matching_inputs

    #Pretrained embeddibngs
    if params['pretrained_embeddings']:
        pretrained_embeddings = pickle.load(open(params['pretrained_embeddings_path'],mode="rb"),encoding='bytes')
        word_to_idx = preprocessing.word_to_idx
        encoder_embedding_matrix = np.random.rand( params['vocab_size'], params['embeddings_dim'] )
        decoder_embedding_matrix = np.random.rand( params['vocab_size'], params['embeddings_dim'] )
        not_found_count = 0
        for token,idx in word_to_idx.items():
            if token in pretrained_embeddings:
                encoder_embedding_matrix[idx]=pretrained_embeddings[token]
                decoder_embedding_matrix[idx]=pretrained_embeddings[token]
            else:
                if not_found_count<10:
                    print("No pretrained embedding for (only first 10 such cases will be printed. other prints are suppressed) ",token)
                not_found_count+=1
        #print "not found count = ", not_found_count
        params['encoder_embeddings_matrix'] = encoder_embedding_matrix
        params['decoder_embeddings_matrix'] = decoder_embedding_matrix

        if params['use_additional_info_from_pretrained_embeddings']:
            additional_count=0
            tmp=[]
            for token in pretrained_embeddings:
                if token not in preprocessing.word_to_idx:
                    preprocessing.word_to_idx[token] = preprocessing.word_to_idx_ctr
                    preprocessing.idx_to_word[preprocessing.word_to_idx_ctr] = token
                    preprocessing.word_to_idx_ctr+=1
                    tmp.append(pretrained_embeddings[token])
                    additional_count+=1
            #print "additional_count = ",additional_count
            params['vocab_size'] = preprocessing.word_to_idx_ctr
            tmp = np.array(tmp)
            encoder_embedding_matrix = np.vstack([encoder_embedding_matrix,tmp])
            decoder_embedding_matrix = np.vstack([decoder_embedding_matrix,tmp])
            #print "decoder_embedding_matrix.shape ",decoder_embedding_matrix.shape
            #print "New vocab size = ",params['vocab_size']


    # TRAIN/DEBUG
    if mode=='train' or mode=="debug":
        if mode=="train":
            training_iters = int(sys.argv[2])
            model_name = sys.argv[3]
        else:
            training_iters = 5
            model_name = "test"
        params['training_iters'] = training_iters
        params['model_name'] = model_name
        train_buckets = {}
        for bucket,_ in enumerate(buckets):
            train_buckets[bucket] = train

        rnn_model = solver.Solver(params,buckets)
        _ = rnn_model.getModel(params, mode='train',reuse=False, buckets=buckets)
        rnn_model.trainModel(config=params, train_feed_dict=train_buckets, val_feed_dct=val, reverse_vocab=preprocessing.idx_to_word, do_init=True)

    # INFERENCE
    elif mode=="inference":
        saved_model_path = sys.argv[2]
        print("saved_model_path = ",saved_model_path)
        inference_type = sys.argv[3] # greedy / beam
        print("inference_type = ",inference_type)
        params['saved_model_path'] = saved_model_path
        rnn_model = solver.Solver(params, buckets=None, mode='inference')
        _ = rnn_model.getModel(params, mode='inference', reuse=False, buckets=None)
        print("----Running inference-----")

        #val
        val_encoder_inputs, val_decoder_inputs, val_decoder_outputs, val_decoder_outputs_matching_inputs = val
        #print "val_encoder_inputs = ",val_encoder_inputs
        if len(val_decoder_outputs.shape)==3:
            val_decoder_outputs=np.reshape(val_decoder_outputs, (val_decoder_outputs.shape[0], val_decoder_outputs.shape[1]))
        decoder_outputs_inference, decoder_ground_truth_outputs = rnn_model.solveAll(params, val_encoder_inputs, val_decoder_outputs, preprocessing.idx_to_word, inference_type=inference_type)
        validOutFile_name = saved_model_path+".valid.output"
        original_data_path = data_src + "valid.original.nltktok"
        BLEUOutputFile_path = saved_model_path + ".valid.BLEU"
        utilities.getBlue(validOutFile_name, original_data_path, BLEUOutputFile_path, decoder_outputs_inference, decoder_ground_truth_outputs, preprocessing)
        print("VALIDATION: ",open(BLEUOutputFile_path,"r").read())

        #test
        test_encoder_inputs, test_decoder_inputs, test_decoder_outputs, test_decoder_outputs_matching_inputs = test
        if len(test_decoder_outputs.shape)==3:
            test_decoder_outputs=np.reshape(test_decoder_outputs, (test_decoder_outputs.shape[0], test_decoder_outputs.shape[1]))
        decoder_outputs_inference, decoder_ground_truth_outputs = rnn_model.solveAll(params, test_encoder_inputs, test_decoder_outputs, preprocessing.idx_to_word, inference_type=inference_type)
        validOutFile_name = saved_model_path+".test.output"
        original_data_path = data_src + "test.original.nltktok"
        BLEUOutputFile_path = saved_model_path + ".test.BLEU"
        utilities.getBlue(validOutFile_name, original_data_path, BLEUOutputFile_path, decoder_outputs_inference, decoder_ground_truth_outputs, preprocessing)
        print("TEST: ",open(BLEUOutputFile_path,"r").read())

    else:
        print("Please see usage")