def prepare(path, suffix=''):
    data0 = load_sent(path + '.0' + suffix)
    data1 = load_sent(path + '.1' + suffix)
    x = data0 + data1
    y = [0] * len(data0) + [1] * len(data1)
    z = sorted(zip(x, y), key=lambda i: len(i[0]))
    return zip(*z)
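
Every snippet on this page calls a load_sent helper that is not reproduced here. A minimal sketch of what it is assumed to do, inferred from how the examples use it (one whitespace-tokenized sentence per line, with an optional cap on how many sentences are read); the real implementation may differ:

def load_sent(path, max_size=-1):
    # Read up to max_size sentences (all of them if max_size < 0),
    # one sentence per line, split on whitespace.
    sents = []
    with open(path) as f:
        for line in f:
            if max_size >= 0 and len(sents) >= max_size:
                break
            sents.append(line.split())
    return sents
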
Example 2
def prepare(path, suffix=''):
    data0 = load_sent(path + 'formal' + suffix)
    data1 = load_sent(path + 'informal' + suffix)
    x = data0 + data1
    y = [0] * len(data0) + [1] * len(data1)
    z = sorted(zip(x, y), key=lambda i: len(i[0]))
    return zip(*z)
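
The two prepare variants above differ only in the pair of file suffixes ('.0'/'.1' versus 'formal'/'informal'), so the same logic can be parameterized. A small sketch (the name prepare_pair is an invention, and it assumes the load_sent helper sketched earlier):

def prepare_pair(path, suffixes=('.0', '.1'), extra=''):
    data0 = load_sent(path + suffixes[0] + extra)
    data1 = load_sent(path + suffixes[1] + extra)
    x = data0 + data1
    y = [0] * len(data0) + [1] * len(data1)
    # Sort by sentence length so later batching needs as little padding as possible.
    z = sorted(zip(x, y), key=lambda i: len(i[0]))
    return zip(*z)
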
Example 3
def prepare(path, suffix='', default_data=False):
    if default_data:
        data0 = load_sent(path + '.0' + suffix)
        data1 = load_sent(path + '.1' + suffix)
    else:
        data0 = load_sent('../data/anto2/sentiment.test.anto2.1')
        data1 = load_sent('../data/anto2/sentiment.test.anto2.0') 
        # data0 = load_sent('../data/runtime/aa/epoch20.1.tsf')
        # data1 = load_sent('../data/runtime/aa/epoch20.0.tsf')   


    x = data0 + data1
    y = [0] * len(data0) + [1] * len(data1)
    z = sorted(zip(x, y), key=lambda i: len(i[0]))
    return zip(*z)
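
One thing to keep in mind with all of these prepare variants: in Python 3, zip(*z) returns a lazy, one-shot iterator rather than a pair of lists, so callers usually unpack it right away (which yields two tuples) and convert to lists if they need list operations. A quick usage sketch (the path is a placeholder):

x, y = prepare('../data/sentiment.dev')   # hypothetical path; x and y come out as tuples
x, y = list(x), list(y)
print(len(x), 'sentences,', sum(y), 'with label 1')
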
Example 4

if __name__ == '__main__':
    args = load_arguments()

    if not os.path.exists(args.model):
        os.system("mkdir -p {}".format(args.model))

    #####   data preparation   #####
    if args.train or args.latent_train:
        chosen = args.train if len(args.train) > len(args.latent_train) else \
          args.latent_train
        # train0 = load_sent(chosen + '.0', args.max_train_size)
        # train1 = load_sent(chosen + '.1', args.max_train_size)

        train0 = load_sent(chosen + 'formal', args.max_train_size)
        train1 = load_sent(chosen + 'informal', args.max_train_size)

        print('#sents of training file 0:', len(train0))
        print('#sents of training file 1:', len(train1))

        if not os.path.isfile(args.vocab):
            build_vocab(train0 + train1, args.vocab)

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    print('vocabulary size:', vocab.size)

    if args.dev or args.latent_dev:
        chosen = args.dev if len(args.dev) > len(args.latent_dev) else \
          args.latent_dev
        dev0 = load_sent(chosen + 'formal')
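
Example 4 creates the model directory by shelling out with os.system("mkdir -p ..."); the later PyTorch snippets use pathlib instead, which avoids the shell entirely. The pure-Python equivalent (the path is a placeholder):

from pathlib import Path

Path('saves/model').mkdir(parents=True, exist_ok=True)   # hypothetical path; same effect as mkdir -p
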
Example 5
def run_model(args):
    time = datetime.now().timestamp()
    train_filename = "sarc/sarc.train"
    sp_model_path = "tmp/sarc_bpe"

    sp = spm.SentencePieceProcessor()
    
    #####   data preparation   #####
    if args.train:
        
        logger = utils.init_logging(args, time)
        
        print("args: ", args)
        logger.info("args: "+str(args))
        no_of_epochs = args.max_epochs
        train0 = load_sent(args.train + '.0', args.max_train_size)
        train1 = load_sent(args.train + '.1', args.max_train_size)

        #train0, train1 = load_sent_csvgz(args.train, args.max_train_size)
        # if not os.path.isfile(train_filename):
        with open(train_filename, "w") as f:
            for sent in train0+train1:
                f.write(" ".join(sent)+"\n")
    
    
        # if not os.path.isfile(train_filename+".1"):
        #     with open(train_filename+".1", "w") as f:
        #         for sent in train1:
        #             f.write(" ".join(sent)+"\n")
        print('#sents of training file 0:', len(train0))
        print('#sents of training file 1:', len(train1))

        logger.info('#sents of training file 0: ' + str(len(train0)))
        logger.info('#sents of training file 1: ' + str(len(train1)))

        # if not os.path.isfile(args.vocab):
        #     build_vocab(train0 + train1, args.vocab)
    # if not os.path.isfile(sp_model_path+".model") or not os.path.isfile(sp_model_path+".vocab"):        
    if args.train:
        spm.SentencePieceTrainer.Train(
            '--input=' + train_filename + ' --model_prefix=' + sp_model_path +
            ' --vocab_size=10000 --hard_vocab_limit=false --bos_piece=<go> --eos_piece=<eos>'
            ' --pad_id=0 --bos_id=1 --eos_id=2 --unk_id=3'
            ' --user_defined_symbols=<url>,<at>,<hashtag>')
    
    sp.Load(sp_model_path+".model")
    # vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    
    dev0 = []
    dev1 = []
    
    if args.dev:
        dev0 = load_sent(args.dev + '.0')
        dev1 = load_sent(args.dev + '.1')
    
    if args.predict:
        if args.model_path:
            # logger.info("Predicting a sample input\n---------------------\n")
            model = torch.load(args.model_path)
            model.training = False
            output = utils.predict(model, args.predict, args.target_sentiment, sp, args.beam)
            # output = output.replace(" ","")
            # output_new = ""      
            # # output = re.sub(r"(\s\s+)", " ", output)
            # for val in output:
            #     if val == "  ":
            #         output_new += " "
            #     elif val == " ":
            #         pass
            #     else:
            #         output_new += val
            print(f"Input given: {args.predict} \nTarget sentiment: {args.target_sentiment} \nTranslated output: {output}")
            # logger.info(f"Input given: {args.predict} \nTarget sentiment: {args.target_sentiment} \nTranslated output: {output}")
    if args.test:
        file0 = open(args.test+".0", "r")
        file1 = open(args.test+".1", "r")
        saves_path = os.path.join(args.saves_path, utils.get_filename(args, time, "model"))
        Path(saves_path).mkdir(parents=True, exist_ok=True)
        out_file_0 = open(os.path.join(saves_path, "test_outputs_neg_to_pos"), "w")
        out_file_1 = open(os.path.join(saves_path, "test_outputs_pos_to_neg"), "w")
        model = torch.load(args.model_path)
        model.training = False

        test_neg = file0.readlines()
        for line in test_neg:
            output = utils.predict(model, line, 1, sp, args.beam)
            # out_file_0.write(output+"\n")
        print("second")
        test_pos = file1.readlines()
        for line in test_pos:
            output = utils.predict(model, line, 0, sp, args.beam)
            out_file_1.write(output+"\n")

        # test0 = load_sent(args.test + '.0')
        # test1 = load_sent(args.test + '.1')
        # if args.model_path:
        #     saves_path = os.path.join(args.saves_path, utils.get_filename(args, time, "model"))
        #     Path(saves_path).mkdir(parents=True, exist_ok=True)
        #     model = torch.load(args.model_path)
        #     model.training = False
        #     batches0, batches1, _, _ = utils.get_batches(test0, test1, model.vocab.word2id, model.args.batch_size)

        #     output_file_0 = open(os.path.join(saves_path, "test_outputs_neg_to_pos"), "w")
        #     output_file_1 = open(os.path.join(saves_path, "test_outputs_pos_to_neg"), "w")

        #     for batch0, batch1 in zip(batches0, batches1):
        #         batch0 = batch0["enc_inputs"]
        #         batch1 = batch1["enc_inputs"]
        #         test_outputs_0 = utils.predict_batch(model, batch0, sentiment=1, beam_size=args.beam, plain_format=True)
        #         test_outputs_1 = utils.predict_batch(model, batch1, sentiment=0, beam_size=args.beam, plain_format=True)
        #         output_file_0.write('\n'.join(test_outputs_0) + '\n')
        #         output_file_1.write('\n'.join(test_outputs_1) + '\n')
                
    if args.train:
        summ_filename = 'runs/cross-alignment/'+utils.get_filename(args, time, "summary")
        writer = SummaryWriter(summ_filename)

        model = get_model(args, logger, sp)
        model.train_max_epochs(args, train0, train1, dev0, dev1, no_of_epochs, writer, time, sp,
        save_epochs_flag=True)
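
In this example the subword encoding and decoding presumably happen inside utils.predict (which receives sp but is not shown). A minimal sketch of how the BPE model trained above would typically be used with the sentencepiece Python API (the sample sentence is made up; this is an assumption about what utils.predict does internally, not its actual code):

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("tmp/sarc_bpe.model")                 # model produced by SentencePieceTrainer.Train above

ids = sp.EncodeAsIds("this movie was surprisingly good")
print(sp.EncodeAsPieces("this movie was surprisingly good"))   # subword pieces
print(sp.DecodeIds(ids))                                       # back to plain text
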
Example 6
    model = Model(args, vocab)
    if args.load_model:
        print 'Loading model from', args.model
        model.saver.restore(sess, args.model)
    else:
        print 'Creating model with fresh parameters.'
        sess.run(tf.global_variables_initializer())
    return model


if __name__ == '__main__':
    args = load_arguments()

    #####   data preparation   #####
    if args.train:
        train0 = load_sent(args.train + '.0', args.max_train_size)
        train1 = load_sent(args.train + '.1', args.max_train_size)
        print '#sents of training file 0:', len(train0)
        print '#sents of training file 1:', len(train1)

        if not os.path.isfile(args.vocab):
            build_vocab(train0 + train1, args.vocab)

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    print 'vocabulary size:', vocab.size

    if args.dev:
        dev0 = load_sent(args.dev + '.0')
        dev1 = load_sent(args.dev + '.1')

    if args.test:
                                 model.batch_size: batch['size'],
                                 model.inputs: batch['inputs'],
                                 model.targets: batch['targets'],
                                 model.weights: batch['weights'],
                                 model.dropout: 1
                             })
        n_words += np.sum(batch['weights'])

    return np.exp(tot_loss / n_words)


if __name__ == '__main__':
    args = load_arguments()

    if args.train:
        train = load_sent(args.train)

        if not os.path.isfile(args.vocab):
            build_vocab(train, args.vocab)

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    print('vocabulary size', vocab.size)

    if args.dev:
        dev = load_sent(args.dev)

    if args.test:
        test = load_sent(args.test)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    if args.load_model:
        print 'Loading model from', args.model
        model.saver.restore(sess, args.model)
    else:
        print 'Creating model with fresh parameters.'
        sess.run(tf.global_variables_initializer())
    return model

if __name__ == '__main__':
    args = load_arguments()

    #####   data preparation   #####
    if args.train:
        # KS train0 = load_sent(args.train + '.0', args.max_train_size)
        # KS train1 = load_sent(args.train + '.1', args.max_train_size)
        train0 = load_sent('../data/st/train.original', args.max_train_size) 
        train1 = load_sent('../data/st/train.modern', args.max_train_size) 
        print '#sents of training file 0:', len(train0)
        print '#sents of training file 1:', len(train1)

        dev0 = load_sent('../data/st/dev.original')
        dev1 = load_sent('../data/st/dev.modern')
        print '#sents of dev file 0:', len(dev0)
        print '#sents of dev file 1:', len(dev1)

        test0 = load_sent('../data/st/test.original')
        test1 = load_sent('../data/st/test.modern')
        print '#sents of test file 0:', len(test0)
        print '#sents of test file 1:', len(test1)

        if not os.path.isfile(args.vocab):
def prepare_test(path):
    x = load_sent(path)
    y = [1]*len(x)
    z = sorted(zip(x, y), key=lambda i: len(i[0]))
    return zip(*z)
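
prepare and prepare_test both sort sentences by length before returning them; the point is that consecutive sentences then need very little padding when they are grouped into minibatches. The real utils.get_batches used elsewhere on this page is not shown; a minimal sketch of the idea, with made-up names and a made-up pad token:

def make_batches(sents, labels, batch_size, pad='<pad>'):
    # Assumes sents is already sorted by length, as prepare() guarantees.
    batches = []
    for i in range(0, len(sents), batch_size):
        chunk = sents[i:i + batch_size]
        max_len = max(len(s) for s in chunk)
        batches.append({'inputs': [s + [pad] * (max_len - len(s)) for s in chunk],
                        'labels': labels[i:i + batch_size],
                        'size': len(chunk)})
    return batches
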
Example 10
    model = Model(args, vocab)
    if args.load_model:
        print 'Loading model from', args.model
        model.saver.restore(sess, args.model)
    else:
        print 'Creating model with fresh parameters.'
        sess.run(tf.global_variables_initializer())
    return model


if __name__ == '__main__':
    args = load_arguments()

    #####   data preparation   #####
    if args.train:
        train0 = load_sent(args.train + '.0', args.max_train_size)
        train0_n = load_sent(args.train + '.noised1.0', args.max_train_size)
        train1 = load_sent(args.train + '.1', args.max_train_size)
        train1_n = load_sent(args.train + '.noised1.1', args.max_train_size)
        print '#sents of training file 0:', len(train0)
        print '#sents of training file 1:', len(train1)

        if not os.path.isfile(args.vocab):
            build_vocab(train0 + train1, args.vocab)

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    print 'vocabulary size:', vocab.size

    if args.dev:
        dev0 = load_sent(args.dev + '.0')
        dev0_n = load_sent(args.dev + '.noised1.0')
Example 11
def run_model(args):
    time = datetime.now().timestamp()

    #####   data preparation   #####
    if args.train:

        logger, saves_dir = utils.init_logging(args, time)

        print("args: ", args)
        logger.info("args: " + str(args))
        no_of_epochs = args.max_epochs
        train0 = load_sent(args.train + '.0', args.max_train_size,
                           args.max_seq_length, args.sentence_flag)
        train1 = load_sent(args.train + '.1', args.max_train_size,
                           args.max_seq_length, args.sentence_flag)

        print('#sents of training file 0:', len(train0))
        print('#sents of training file 1:', len(train1))

        logger.info('#sents of training file 0: ' + str(len(train0)))
        logger.info('#sents of training file 1: ' + str(len(train1)))

        # build vocab for every run
        if not os.path.isfile(args.vocab):
            build_vocab(train0 + train1, args.vocab)

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)

    dev0 = []
    dev1 = []

    if args.dev:
        dev0 = load_sent(args.dev + '.0', -1, args.max_seq_length,
                         args.sentence_flag)
        dev1 = load_sent(args.dev + '.1', -1, args.max_seq_length,
                         args.sentence_flag)

    if args.predict:
        if args.model_path:
            # logger.info("Predicting a sample input\n---------------------\n")
            device = torch.device(
                "cuda:" +
                str(args.cuda_device) if torch.cuda.is_available() else "cpu")
            model = torch.load(args.model_path, map_location=device)
            model.training = False
            output = utils.predict(model, args.predict, args.target_sentiment,
                                   args.beam)
            print(
                f"Input given: {args.predict} \nTarget sentiment: {args.target_sentiment} \nTranslated output: {output}"
            )
            # logger.info(f"Input given: {args.predict} \nTarget sentiment: {args.target_sentiment} \nTranslated output: {output}")
    if args.test:
        logger, saves_dir = utils.init_logging(args, time)

        print("args: ", args)
        logger.info("args: " + str(args))
        device = torch.device(
            "cuda:" +
            str(args.cuda_device) if torch.cuda.is_available() else "cpu")
        file0 = open(args.test + ".0", "r")
        file1 = open(args.test + ".1", "r")
        saves_path = os.path.join(args.saves_path,
                                  utils.get_filename(args, time, ""))
        Path(saves_path).mkdir(parents=True, exist_ok=True)
        out_file_0 = open(os.path.join(saves_path, "test_outputs_neg_to_pos"),
                          "w")
        out_file_1 = open(os.path.join(saves_path, "test_outputs_pos_to_neg"),
                          "w")
        model = torch.load(args.model_path, map_location=device)
        model.training = False

        for line in file0:
            line = line.strip("\n")
            output = utils.predict(model, line, 1, args.beam)
            out_file_0.write(output + "\n")

        for line in file1:
            line = line.strip("\n")
            output = utils.predict(model, line, 0, args.beam)
            out_file_1.write(output + "\n")

    if args.train:
        summ_filename = 'runs/cross-alignment/' + utils.get_filename(
            args, time, "summary")
        writer = SummaryWriter(summ_filename)

        model = get_model(args, vocab, logger)
        model.train_max_epochs(saves_dir,
                               args,
                               train0,
                               train1,
                               dev0,
                               dev1,
                               vocab,
                               no_of_epochs,
                               writer,
                               time,
                               save_epochs_flag=True)
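
A side note on the model.training = False toggle used in several of these snippets: assigning the attribute directly only flips the flag on the top-level module, while the usual PyTorch idiom, model.eval(), propagates it to every submodule (dropout, batch norm, etc.) and is typically combined with torch.no_grad() for inference. A small self-contained sketch with a toy model standing in for the loaded checkpoint:

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(8, 8), nn.Dropout(0.5))   # toy stand-in for the loaded model
model.eval()                                               # sets .training = False recursively
with torch.no_grad():                                      # disable gradient tracking for inference
    out = model(torch.zeros(1, 8))
print(model.training, model[1].training)                   # False False
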
Example 12
            f.write('{}\n'.format(' '.join(line)))
            f.write('{}\n'.format(' '.join(w for w in ori[0])))
            f.write('{}\n'.format(' '.join(w for w in tsf[0])))


if __name__ == '__main__':
    args = load_arguments()

    if not os.path.exists(args.model):
        os.system("mkdir -p {}".format(args.model))

    #####   data preparation   #####
    if args.train or args.latent_train:
        chosen = args.train if len(args.train) > len(args.latent_train) else \
          args.latent_train
        train0 = load_sent(chosen + '.0', args.max_train_size)
        train1 = load_sent(chosen + '.1', args.max_train_size)
        print('#sents of training file 0:', len(train0))
        print('#sents of training file 1:', len(train1))

        if not os.path.isfile(args.vocab):
            build_vocab(train0 + train1, args.vocab)

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    print('vocabulary size:', vocab.size)

    if args.dev or args.latent_dev:
        chosen = args.dev if len(args.dev) > len(args.latent_dev) else \
          args.latent_dev
        dev0 = load_sent(chosen + '.0')
        dev1 = load_sent(chosen + '.1')
        train0 = load_sent_lines(args.train + '.0', args.train_start,
                                 args.train_end)
        train1 = load_sent_lines(args.train + '.1', args.train_start,
                                 args.train_end)

        print '#sents of training file 0:', len(train0)
        print '#sents of training file 1:', len(train1)

        if not os.path.isfile(args.vocab):
            build_vocab(train0 + train1, args.vocab)

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    print 'vocabulary size:', vocab.size

    if args.dev:
        dev0 = load_sent(args.dev + '.0')
        dev1 = load_sent(args.dev + '.1')

    if args.test:
        test0 = load_sent(args.test + '.0')
        test1 = load_sent(args.test + '.1')

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        model = create_model(sess, args, vocab)

        if args.beam > 1:
            decoder = beam_search.Decoder(sess, args, vocab, model)
        else:
            decoder = greedy_decoding.Decoder(sess, args, vocab, model)
    for batch in batches:
        tot_loss += sess.run(model.tot_loss,
            feed_dict={model.batch_size: batch['size'],
                       model.inputs: batch['inputs'],
                       model.targets: batch['targets'],
                       model.weights: batch['weights'],
                       model.dropout: 1})
        n_words += np.sum(batch['weights'])

    return np.exp(tot_loss / n_words)

if __name__ == '__main__':
    args = load_arguments()

    if args.train:
        train = load_sent(args.train)

        if not os.path.isfile(args.vocab):
            build_vocab(train, args.vocab)

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    print 'vocabulary size', vocab.size

    if args.dev:
        dev = load_sent(args.dev)

    if args.test:
        test = load_sent(args.test)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
def create_model(sess, args, vocab):
    model = Model(args, vocab)
    if args.load_model:
        print 'Loading model from', args.model
        model.saver.restore(sess, args.model)
    else:
        print 'Creating model with fresh parameters.'
        sess.run(tf.global_variables_initializer())
    return model

if __name__ == '__main__':
    args = load_arguments()

    #####   data preparation   #####
    if args.train:
        train0 = load_sent(args.train + '.0', args.max_train_size)
        train1 = load_sent(args.train + '.1', args.max_train_size)
        print '#sents of training file 0:', len(train0)
        print '#sents of training file 1:', len(train1)

        if not os.path.isfile(args.vocab):
            build_vocab(train0 + train1, args.vocab)

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    print 'vocabulary size:', vocab.size

    if args.dev:
        dev0 = load_sent(args.dev + '.0')
        dev1 = load_sent(args.dev + '.1')

    if args.test:
Example 16
        print 'Loading model from', args.model
        model.saver.restore(sess, args.model)
    else:
        print 'Creating model with fresh parameters.'
        sess.run(tf.global_variables_initializer())
    return model


if __name__ == '__main__':
    args = load_arguments()

    #####   data preparation   #####
    if args.train:

        # 0 is the starting style !
        train0 = load_sent(args.train + '.0', args.max_train_size)
        train1 = load_sent(args.train + '.1', args.max_train_size)
        train2 = load_sent(args.train + '.2', args.max_train_size)
        train3 = load_sent(args.train + '.3', args.max_train_size)
        train4 = load_sent(args.train + '.4', args.max_train_size)
        train5 = load_sent(args.train + '.5', args.max_train_size)
        print '#sents of training file 0:', len(train0)
        print '#sents of training file 1:', len(train1)
        print '#sents of training file 2:', len(train2)
        print '#sents of training file 3:', len(train3)
        print '#sents of training file 4:', len(train4)
        print '#sents of training file 5:', len(train5)

        # loaded all three datasets here. Train once with 0-1 and once with 0-2

        print("=====got here training=====")