Example #1
def text_transformer(n_gram, window_size):
    """
    Get tweet transformer
    :param lang:
    :param n_gram:
    :return:
    """
    if n_gram == 'c1':
        return transforms.Compose([
            ltransforms.ToLower(),
            ltransforms.Character(),
            ltransforms.ToIndex(start_ix=0),
            ltransforms.ToNGram(n=window_size, overlapse=True),
            ltransforms.Reshape((-1, window_size)),
            ltransforms.MaxIndex(max_id=settings.voc_sizes[n_gram] - 1)
        ])
    else:
        return transforms.Compose([
            ltransforms.ToLower(),
            ltransforms.Character2Gram(),
            ltransforms.ToIndex(start_ix=0),
            ltransforms.ToNGram(n=window_size, overlapse=True),
            ltransforms.Reshape((-1, window_size)),
            ltransforms.MaxIndex(max_id=settings.voc_sizes[n_gram] - 1)
        ])
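
A minimal usage sketch for the transformer above (illustrative only; it assumes torchlanguage.transforms imported as ltransforms and a settings module exposing voc_sizes, as the snippet implies):

# Hypothetical usage: build a character-level transformer and apply it to a raw tweet.
transformer = text_transformer(n_gram='c1', window_size=500)
sample = transformer(u"an example tweet to encode")  # windows of shape (-1, window_size)
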
Example #2
def text_transformer_cnn(window_size, n_gram, token_to_ix):
    """
    Get text transformer for CNNSCD
    :param window_size:
    :param n_gram:
    :return:
    """
    if n_gram == 'c1':
        return ltransforms.Compose([
            ltransforms.ToLower(),
            ltransforms.Character(),
            ltransforms.ToIndex(start_ix=1, token_to_ix=token_to_ix),
            ltransforms.ToLength(length=window_size),
            ltransforms.Reshape((-1,)),
            ltransforms.MaxIndex(max_id=settings.voc_sizes[n_gram])
        ])
    else:
        return ltransforms.Compose([
            ltransforms.ToLower(),
            ltransforms.Character2Gram(),
            ltransforms.ToIndex(start_ix=1, token_to_ix=token_to_ix),
            ltransforms.ToLength(length=window_size),
            ltransforms.Reshape((-1,)),
            ltransforms.MaxIndex(max_id=settings.voc_sizes[n_gram])
        ])
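
Note that, unlike the first example, ToIndex here starts at 1 rather than 0, which presumably reserves index 0 as the padding value used by ToLength; this would also explain why MaxIndex uses voc_sizes[n_gram] here rather than voc_sizes[n_gram] - 1.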
Example #3

# CNN Glove Feature Selector
cgfs = models.cgfs(pretrained=True, n_gram=2, n_features=60)

# Remove last linear layer
cgfs.linear2 = echotorch.nn.Identity()

# Transformer: GloVe vectors -> overlapping bigrams -> CGFS features
transformer = transforms.Compose([
    transforms.GloveVector(),                                # one 300-dim GloVe vector per token
    transforms.ToNGram(n=2, overlapse=True),                 # overlapping token bigrams
    transforms.Reshape((-1, 1, 2, 300)),                     # one (1, 2, 300) input per bigram
    transforms.FeatureSelector(cgfs, 60, to_variable=True),  # 60 CGFS features per bigram
    transforms.Reshape((1, -1, 60)),                         # single batch of feature vectors
    transforms.Normalize(mean=-4.56512329954, std=0.911449706065)
])

# Reuters C50 dataset
reutersloader = torch.utils.data.DataLoader(
    datasets.ReutersC50Dataset(root=args.dataset,
                               download=True,
                               n_authors=args.n_authors,
                               transform=transformer),
    batch_size=1,
    shuffle=False)
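
A minimal consumption sketch for the loader above (illustrative only; the exact tuple layout of each batch is an assumption, modelled on the (inputs, labels, time_labels) pattern used elsewhere on this page):

# Peek at a few transformed Reuters C50 samples (assumed tuple layout).
for i, data in enumerate(reutersloader):
    inputs, labels = data[0], data[1]
    print(inputs.size(), labels)
    if i == 2:
        break
# end for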

Example #4
def train_ccsaa(fold=0,
                ccsaa_epoch=100,
                text_length=20,
                n_gram='c1',
                dataset_size=100,
                dataset_start=0,
                cuda=True,
                save=False,
                save_dir='.'):
    """
    Train CCSAA
    :param fold:
    :param ccsaa_epoch:
    :param text_length:
    :param n_gram:
    :param dataset_size:
    :param dataset_start:
    :param cuda:
    :return:
    """
    # Save path
    save_path = os.path.join(save_dir, str(int(dataset_size)),
                             str(int(dataset_start)))

    # Transforms
    if n_gram == 'c1':
        transform = transforms.Compose([
            transforms.Character(),
            transforms.ToIndex(start_ix=0),
            transforms.ToNGram(n=text_length, overlapse=True),
            transforms.Reshape((-1, text_length))
        ])
    else:
        transform = transforms.Compose([
            transforms.Character2Gram(),
            transforms.ToIndex(start_ix=0),
            transforms.ToNGram(n=text_length, overlapse=True),
            transforms.Reshape((-1, text_length))
        ])
    # end if

    # Load from directory
    reutersc50_dataset, reuters_loader_train, reuters_loader_test = dataset.load_dataset(
        dataset_size=dataset_size, dataset_start=dataset_start)
    reutersc50_dataset.transform = transform

    # Loss function
    loss_function = nn.CrossEntropyLoss()

    # Set fold
    reuters_loader_train.dataset.set_fold(fold)
    reuters_loader_test.dataset.set_fold(fold)

    # Model
    model = torchlanguage.models.CCSAA(
        text_length=text_length,
        vocab_size=settings.ccsaa_voc_size,
        embedding_dim=settings.ccsaa_embedding_dim,
        n_classes=settings.n_authors)
    if cuda:
        model.cuda()
    # end if

    # Load
    if save and os.path.exists(
            os.path.join(save_path,
                         u"ccsaa." + str(fold) + u".pth")) and os.path.exists(
                             os.path.join(save_path, u"ccsaa." + str(fold) +
                                          u".voc.pth")):
        model.load_state_dict(
            torch.load(
                open(os.path.join(save_path, u"ccsaa." + str(fold) + u".pth"),
                     'rb')))
        voc = torch.load(
            open(os.path.join(save_path, u"ccsaa." + str(fold) + u".voc.pth"),
                 'rb'))
        return model, voc
    # end if

    # Optimizer
    optimizer = optim.SGD(model.parameters(),
                          lr=settings.ccsaa_lr,
                          momentum=settings.ccsaa_momentum)

    # Best model
    best_acc = 0.0
    best_model = model.state_dict()

    # Fail count
    fail_count = 0

    # Epoch
    for epoch in range(10000):
        # Total losses
        training_loss = 0.0
        training_total = 0.0
        test_loss = 0.0
        test_total = 0.0

        # Get test data for this fold
        for i, data in enumerate(reuters_loader_train):
            # Inputs and labels
            inputs, labels, time_labels = data

            # Reshape
            inputs = inputs.view(-1, text_length)

            # Outputs
            outputs = torch.LongTensor(inputs.size(0)).fill_(labels[0])

            # To variable
            inputs, outputs = Variable(inputs), Variable(outputs)
            if cuda:
                inputs, outputs = inputs.cuda(), outputs.cuda()
            # end if

            # Zero grad
            model.zero_grad()

            # Compute output
            log_probs = model(inputs)

            # Loss
            loss = loss_function(log_probs, outputs)

            # Backward and step
            loss.backward()
            optimizer.step()

            # Add
            training_loss += loss.data[0]
            training_total += 1.0
        # end for

        # Counters
        total = 0.0
        success = 0.0

        # For each test sample
        for i, data in enumerate(reuters_loader_test):
            # Inputs and labels
            inputs, labels, time_labels = data

            # Reshape
            inputs = inputs.view(-1, text_length)

            # Outputs
            outputs = torch.LongTensor(inputs.size(0)).fill_(labels[0])

            # To variable
            inputs, outputs = Variable(inputs), Variable(outputs)
            if cuda:
                inputs, outputs = inputs.cuda(), outputs.cuda()
            # end if

            # Forward
            model_outputs = model(inputs)
            loss = loss_function(model_outputs, outputs)

            # Take the max as predicted
            _, predicted = torch.max(model_outputs.data, 1)

            # Add to correctly classified word
            success += (predicted == outputs.data).sum()
            total += predicted.size(0)

            # Add loss
            test_loss += loss.data[0]
            test_total += 1.0
        # end for

        # Accuracy
        accuracy = success / total * 100.0
        # print(u"Epoch {}, train loss {}, test loss {}, accuracy {}".format(epoch, training_loss / training_total, test_loss / test_total, accuracy))

        # Save if best
        if accuracy > best_acc and epoch > 10:
            best_acc = accuracy
            best_model = model.state_dict()
            fail_count = 0
        elif epoch > 10:
            fail_count += 1
        # end if

        if fail_count > ccsaa_epoch:
            break
        # end if
    # end for

    # Load best
    model.load_state_dict(best_model)

    # Save
    if save:
        # Create dir if not exists
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        # end if

        # Save
        torch.save(
            model.state_dict(),
            open(os.path.join(save_path, u"ccsaa." + str(fold) + u".pth"),
                 'wb'))

        # Save voc
        torch.save(
            transform.transforms[1].token_to_ix,
            open(os.path.join(save_path, u"ccsaa." + str(fold) + u".voc.pth"),
                 'wb'))
    # end if

    return model, transform.transforms[1].token_to_ix
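
A hedged invocation sketch for train_ccsaa (argument values are illustrative; the settings.* hyper-parameters come from the surrounding project):

# Hypothetical call: train fold 0 on a 100-document slice and save the result.
model, voc = train_ccsaa(fold=0,
                         ccsaa_epoch=100,
                         text_length=20,
                         n_gram='c1',
                         dataset_size=100,
                         dataset_start=0,
                         cuda=torch.cuda.is_available(),
                         save=True,
                         save_dir='.')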
Example #5

# Settings
stride = 100
window_size = 3000
security_border = 200

# Parse arguments
args = functions.argument_parser_training_model()

if args.n_gram == 'c1':
    transforms = ltransforms.Compose([
        ltransforms.ToLower(),
        ltransforms.Character(),
        ltransforms.ToIndex(start_ix=1),
        ltransforms.ToLength(length=window_size),
        ltransforms.Reshape((-1,)),
        ltransforms.MaxIndex(max_id=settings.voc_sizes[args.n_gram])
    ])
else:
    transforms = ltransforms.Compose([
        ltransforms.ToLower(),
        ltransforms.Character2Gram(),
        ltransforms.ToIndex(start_ix=1),
        ltransforms.ToLength(length=window_size),
        ltransforms.Reshape((-1,)),
        ltransforms.MaxIndex(max_id=settings.voc_sizes[args.n_gram])
    ])
# end if

# Style change detection dataset, training set
pan18loader_train, pan18loader_valid = functions.load_dataset(transforms, 1, args.root)
parser.add_argument("--no-cuda",
                    action='store_true',
                    default=False,
                    help="Enables CUDA training")
args = parser.parse_args()

# Use CUDA?
args.cuda = not args.no_cuda and torch.cuda.is_available()

# Transforms
if args.n_gram == 'c1':
    transform = transforms.Compose([
        transforms.Character(),
        transforms.ToIndex(start_ix=0),
        transforms.ToNGram(n=args.text_length, overlapse=True),
        transforms.Reshape((-1, args.text_length))
    ])
else:
    transform = transforms.Compose([
        transforms.Character2Gram(),
        transforms.ToIndex(start_ix=0),
        transforms.ToNGram(n=args.text_length, overlapse=True),
        transforms.Reshape((-1, args.text_length))
    ])
# end if

# Dataset
dataset = datasets.ReutersC50Dataset(download=True,
                                     n_authors=15,
                                     transform=transform)
Example #7

    # Eval. dataset
    reutersloader_val = torch.utils.data.DataLoader(
        torchlanguage.utils.CrossValidation(reuters_dataset, k=10,
                                            train=False),
        batch_size=1,
        shuffle=False)

    # 10-CV
    for k in np.arange(args.fold, 10):
        # Model
        if model_type == 'linear':
            # Transformer
            transformer = transforms.Compose([
                transforms.GloveVector(),
                transforms.ToNGram(n=n_gram, overlapse=True),
                transforms.Reshape((-1, n_gram * 300)),
            ])

            # Transformer
            reuters_dataset.transform = transformer

            # Linear regression
            model = etnn.RRCell(n_gram * 300, n_authors)
        elif model_type == 'cgfs':
            # CNN Glove Feature Selector
            cgfs, transformer = cgfs_selector.load_cgfs(fold=k)

            # Transformer
            reuters_dataset.transform = transformer

            # Linear regression
Example #8

input_scaling = 0.5
n_test = 10
n_samples = 2
n_epoch = 100
text_length = 20

# Argument
args = tools.functions.argument_parser_training_model()

# Transforms
transform = transforms.Compose([
    transforms.Character(),
    transforms.ToIndex(start_ix=0),
    transforms.MaxIndex(max_id=83),
    transforms.ToNGram(n=text_length, overlapse=True),
    transforms.Reshape((-1, text_length))
])
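
For intuition, ToNGram(n=text_length, overlapse=True) presumably slides a window of text_length indices one position at a time over the index sequence; a plain-Python illustration of that windowing (not the torchlanguage implementation):

def sliding_windows(indices, n):
    # Overlapping windows of length n, stepping one position at a time.
    return [indices[i:i + n] for i in range(len(indices) - n + 1)]

# sliding_windows([0, 1, 2, 3, 4], 3) -> [[0, 1, 2], [1, 2, 3], [2, 3, 4]]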

# Author identification training dataset
dataset_train = dataset.AuthorIdentificationDataset(root="./data/", download=True, transform=transform, problem=1, lang='en')

# Author identification test dataset
dataset_valid = dataset.AuthorIdentificationDataset(root="./data/", download=True, transform=transform, problem=1, train=False, lang='en')

# Cross validation
dataloader_train = torch.utils.data.DataLoader(torchlanguage.utils.CrossValidation(dataset_train), batch_size=1, shuffle=True)
dataloader_valid = torch.utils.data.DataLoader(torchlanguage.utils.CrossValidation(dataset_valid, train=False), batch_size=1, shuffle=True)

# Author to idx
author_to_ix = dict()
for idx, author in enumerate(dataset_train.authors):
    author_to_ix[author] = idx
# end for
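
With the mapping filled in, an author name converts directly to an integer class target (a usage sketch; the tensor shape depends on the loss in use):

# Illustrative: turn the first author's name into a LongTensor class target.
target = torch.LongTensor([author_to_ix[dataset_train.authors[0]]])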