Example no. 1

def train_data_require(args, word2idx):
    corpus_train = SNLIDataset(train=True,
                               vocab_size=args.vocab_size,
                               path=args.data_path,
                               attack_label=args.attack_label,
                               reset_vocab=word2idx)
    corpus_test = SNLIDataset(train=False,
                              vocab_size=args.vocab_size,
                              path=args.data_path,
                              attack_label=args.attack_label,
                              reset_vocab=word2idx)
    trainloader = torch.utils.data.DataLoader(corpus_train,
                                              batch_size=args.batch_size,
                                              collate_fn=collate_snli,
                                              shuffle=True)
    testloader = torch.utils.data.DataLoader(corpus_test,
                                             batch_size=args.batch_size,
                                             collate_fn=collate_snli,
                                             shuffle=False)
    return trainloader, testloader
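# Typical usage (assuming an argparse `args` namespace and a `word2idx` dict
# are already in scope):
#   trainloader, testloader = train_data_require(args, word2idx)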
parser.add_argument('--lr', type=float, default=1e-05,
                    help='learning rate')
parser.add_argument('--seed', type=int, default=1111,
                    help='seed')
parser.add_argument('--beta1', type=float, default=0.9,
                    help='beta1 for adam. default=0.9')
parser.add_argument('--cuda', action='store_true',
                    help='use CUDA')
parser.add_argument('--save_path', type=str, required=True,
                    help='used for saving the models')
parser.add_argument('--vocab_size', type=int, default=11004,
                    help='vocabulary size')

args = parser.parse_args()

# SNLI corpora; vocab_size-4 presumably leaves room for the 4 special tokens
# the dataset adds on top of the word list
corpus_train = SNLIDataset(train=True, vocab_size=args.vocab_size-4, path=args.data_path)
corpus_test = SNLIDataset(train=False, vocab_size=args.vocab_size-4, path=args.data_path)
trainloader = torch.utils.data.DataLoader(corpus_train, batch_size=args.batch_size,
                                          collate_fn=collate_snli, shuffle=True)
train_iter = iter(trainloader)
testloader = torch.utils.data.DataLoader(corpus_test, batch_size=args.batch_size,
                                         collate_fn=collate_snli, shuffle=False)

# seed Python, NumPy, and PyTorch RNGs for reproducibility
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

if args.model_type=="lstm":
    baseline_model = Baseline_LSTM(100,300,maxlen=args.maxlen, gpu=args.cuda)
elif args.model_type=="emb":
    baseline_model = Baseline_Embeddings(100, vocab_size=args.vocab_size)
    
if args.cuda:
Example no. 3

else:
    corpus = Corpus(args.data_path,
                    maxlen=args.maxlen,
                    vocab_size=args.vocab_size,
                    lowercase=args.lowercase)

eval_batch_size = 10
if not args.convolution_enc:
    args.packed_rep = True
train_data = batchify(corpus.train,
                      args.batch_size,
                      args.maxlen,
                      packed_rep=args.packed_rep,
                      shuffle=True)
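# train_data is a pre-built list of batches rather than a DataLoader; the test
# set below goes through a DataLoader, hence the different iteration format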
corpus_test = SNLIDataset(
    train=False,
    vocab_size=41578,
    reset_vocab="/home/ddua/data/arae/output/example/1504200881/vocab.json")
testloader = torch.utils.data.DataLoader(corpus_test,
                                         batch_size=10,
                                         collate_fn=collate_snli,
                                         shuffle=False)
test_data = iter(testloader)

classifier1 = Baseline_Embeddings(100, maxlen=10, gpu=True, vocab_size=41578)
classifier1.load_state_dict(
    torch.load("/home/ddua/data/snli/baseline/model_emb.pt"))
classifier2 = Baseline_LSTM(100, 300, maxlen=10, gpu=args.cuda)
classifier2.load_state_dict(
    torch.load("/home/ddua/data/snli/baseline/model_lstm.pt"))

vocab_classifier1 = pkl.load(
    open("/home/ddua/data/snli/baseline/vocab.pkl", 'rb'))  # path assumed by analogy with the checkpoints above; the snippet is cut off here
Example no. 4

                        load_vocab=cur_dir + '/vocab.json')
    else:
        corpus = Corpus(args.data_path,
                        maxlen=args.maxlen,
                        vocab_size=args.vocab_size,
                        lowercase=args.lowercase)

    if not args.convolution_enc:
        args.packed_rep = True

    train_data = batchify(corpus.train, args.batch_size, args.maxlen,
                          packed_rep=args.packed_rep, shuffle=True)
    valid_data = batchify(corpus.test, args.batch_size, args.maxlen,
                          packed_rep=args.packed_rep, shuffle=False)

    corpus_test = SNLIDataset(train=False, vocab_size=args.vocab_size+4,
                              reset_vocab=corpus.dictionary.word2idx)
    testloader = torch.utils.data.DataLoader(corpus_test, batch_size=10,
                                             collate_fn=collate_snli, shuffle=False)
    test_data = iter(testloader)        # different format from train_data and valid_data

    classifier1 = Baseline_Embeddings(100, vocab_size=args.vocab_size+4)
    classifier1.load_state_dict(torch.load(args.classifier_path + "/baseline/model_emb.pt"))
    vocab_classifier1 = pkl.load(open(args.classifier_path + "/vocab.pkl", 'rb'))

    classifier2 = Baseline_LSTM(100, 300, maxlen=10, gpu=args.cuda)
    classifier2.load_state_dict(torch.load(args.classifier_path + "/baseline/model_lstm.pt"))
    vocab_classifier2 = pkl.load(open(args.classifier_path + "/vocab.pkl", 'rb'))

    print("Loaded data and target classifiers!")

    ###############################################################################
Example no. 5

parser.add_argument('--beta1',
                    type=float,
                    default=0.9,
                    help='beta1 for adam. default=0.9')
parser.add_argument('--cuda',
                    action='store_true',
                    help='use CUDA')
parser.add_argument('--save_path',
                    type=str,
                    required=True,
                    help='used for saving the models')

args = parser.parse_args()

corpus_train = SNLIDataset(train=True,
                           vocab_size=11004,
                           lvt=False,
                           path=args.data_path)
corpus_test = SNLIDataset(train=False,
                          vocab_size=11004,
                          lvt=False,
                          path=args.data_path)
trainloader = torch.utils.data.DataLoader(corpus_train,
                                          batch_size=args.batch_size,
                                          collate_fn=collate_snli,
                                          shuffle=True)
train_iter = iter(trainloader)
testloader = torch.utils.data.DataLoader(corpus_test,
                                         batch_size=args.batch_size,
                                         collate_fn=collate_snli,
                                         shuffle=False)
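# batches are then drawn manually from the iterator, e.g. batch = next(train_iter)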
Example no. 6

parser.add_argument('--vocab_size',
                    type=int,
                    default=11000,   # flag name and default assumed; the snippet is cut off above this line
                    help='vocabulary size')
parser.add_argument('--vocab_path',
                    type=str,
                    default='./output/1593075369/vocab.json',
                    help='path to the vocabulary file')
parser.add_argument('--hidden_size',
                    type=int,
                    default=300,
                    help='hidden size')
parser.add_argument('--dropout', type=float, default=0.5, help='dropout probability')
args = parser.parse_args()

word2idx = json.load(open(args.vocab_path, "r"))
# idx2word = {v: k for k, v in word2idx.items()}
corpus_train = SNLIDataset(train=True,
                           vocab_size=args.vocab_size,
                           path=args.data_path,
                           reset_vocab=word2idx)
corpus_test = SNLIDataset(train=False,
                          vocab_size=args.vocab_size,
                          path=args.data_path,
                          reset_vocab=word2idx)

# embed_matrix = corpus_train.build_embedding_matrix('./data/embeddings/glove.840B.300d.txt')
embeddings_file = './data/embeddings/embeddings.pkl'
with open(embeddings_file, "rb") as f:
    embeddings = torch.tensor(pickle.load(f), dtype=torch.float)
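# `embeddings` now holds the precomputed matrix (built from GloVe, per the
# commented-out line above), e.g. for initializing an embedding layer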

trainloader = torch.utils.data.DataLoader(corpus_train,
                                          batch_size=args.batch_size,
                                          collate_fn=collate_snli,
                                          shuffle=True)
Example no. 7

parser.add_argument('--vocab_size',
                    type=int,
                    default=11000,
                    help='vocabulary size')
parser.add_argument('--attack_label', type=int, default=0, help='attack_label')
parser.add_argument('--vocab_path',
                    type=str,
                    default='./output/1593075369/vocab.json',
                    help='path to the vocabulary file')
args = parser.parse_args()

word2idx = json.load(open(args.vocab_path, "r"))

corpus_test = SNLIDataset(train=False,
                          vocab_size=args.vocab_size,
                          path=args.data_path,
                          attack_label=args.attack_label,
                          reset_vocab=word2idx)
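# quick sanity check on how many test examples were loaded for this attack_label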
print(len(corpus_test.test_data))

testloader = torch.utils.data.DataLoader(corpus_test,
                                         batch_size=args.batch_size,
                                         collate_fn=collate_snli,
                                         shuffle=False)
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if args.model_type == "lstm":
    baseline_model = Baseline_LSTM(100, 300, maxlen=args.maxlen, gpu=args.cuda)