]
    # Start the combined file list from an independent copy of the training
    # names, so extending it below leaves train_filenames untouched.
    filenames = copy.deepcopy(train_filenames)
    fp_train.close()
    # Read the test file names (one per line, surrounding whitespace removed)
    # inside a context manager so the handle is closed even if reading raises.
    with open(test_file, 'r') as fp_test:
        test_filenames = [os.path.join(TEST_DIR, line.strip())
                          for line in fp_test]
    filenames.extend(test_filenames)

    # The corpus is built over the training and test files together; the
    # length of its dictionary is used as the model's vocab_size below.
    corpus = DP.Corpus(DATA_DIR, filenames)
    nlabel = 8  # handed to the classifier as label_size

    # create model
    model = LSTMC.LSTMClassifier(embedding_dim=embedding_dim,
                                 hidden_dim=hidden_dim,
                                 vocab_size=len(corpus.dictionary),
                                 label_size=nlabel,
                                 batch_size=batch_size,
                                 use_gpu=use_gpu,
                                 attn_flag=attn_flag)
    if use_gpu:
        model = model.cuda()
    # data processing
    # Training and test datasets share the same corpus and sentence_len.
    dtrain_set = DP.TxtDatasetProcessing(DATA_DIR, TRAIN_DIR, TRAIN_FILE,
                                         TRAIN_LABEL, sentence_len, corpus)

    # Training batches are shuffled; test batches (below) keep dataset order.
    train_loader = DataLoader(dtrain_set, batch_size=batch_size, shuffle=True)
    dtest_set = DP.TxtDatasetProcessing(DATA_DIR, TEST_DIR, TEST_FILE,
                                        TEST_LABEL, sentence_len, corpus)

    test_loader = DataLoader(dtest_set, batch_size=batch_size, shuffle=False)
Beispiel #2
0
    # Reverse lookup of int2char: each character maps back to its index.
    char2int = dict((symbol, index) for index, symbol in int2char.items())

    ### data processing
    # URL/label datasets by default; the 'html_*.pkl' datasets when `html`
    # is truthy.
    if not html:
        dtrain_set = URLCharDataset(int2char, char2int, input_len,
                                    TRAIN_URLS, TRAIN_LABELS)
        dtest_set = URLCharDataset(int2char, char2int, input_len,
                                   TEST_URLS, TEST_LABELS)
    else:
        dtrain_set = HTMLCharDataset(int2char, char2int, input_len,
                                     'html_trainset.pkl')
        dtest_set = HTMLCharDataset(int2char, char2int, input_len,
                                    'html_valset.pkl')

    ### create model
    # The vocabulary size is taken from the training dataset.
    model = LSTMC.LSTMClassifier(
        embedding_dim=embedding_dim,
        hidden_dim=hidden_dim,
        vocab_size=dtrain_set.vocab_size,
        label_size=nlabel,
        batch_size=batch_size,
        gpu=gpu,
    )

    # A non-negative `gpu` value is passed to .cuda() as the device argument.
    if gpu >= 0:
        model = model.cuda(gpu)

    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    loss_function = nn.CrossEntropyLoss()

    # Four separate, initially empty lists for loss and accuracy history.
    train_loss_, test_loss_ = [], []
    train_acc_, test_acc_ = [], []
    ### training procedure
    train_loader = DataLoader(dtrain_set,
                              batch_size=batch_size,
                              shuffle=True,