Example 1
def text_transformer(n_gram, window_size):
    """
    Get tweet transformer
    :param lang:
    :param n_gram:
    :return:
    """
    if n_gram == 'c1':
        return transforms.Compose([
            ltransforms.ToLower(),
            ltransforms.Character(),
            ltransforms.ToIndex(start_ix=0),
            ltransforms.ToNGram(n=window_size, overlapse=True),
            ltransforms.Reshape((-1, window_size)),
            ltransforms.MaxIndex(max_id=settings.voc_sizes[n_gram] - 1)
        ])
    else:
        return transforms.Compose([
            ltransforms.ToLower(),
            ltransforms.Character2Gram(),
            ltransforms.ToIndex(start_ix=0),
            ltransforms.ToNGram(n=window_size, overlapse=True),
            ltransforms.Reshape((-1, window_size)),
            ltransforms.MaxIndex(max_id=settings.voc_sizes[n_gram] - 1)
        ])
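A brief usage sketch, not part of the original example: it assumes ltransforms is torchlanguage.transforms, that settings.voc_sizes maps the n-gram key to a vocabulary size, and that the composed pipeline is applied directly to a raw string.

# Hypothetical usage of the pipeline above (imports and settings assumed).
transformer = text_transformer(n_gram='c1', window_size=20)
# Yields index windows of shape (-1, 20), with indices capped by MaxIndex.
windows = transformer(u"An example document split into character windows.")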
Example 2
def tweet_transformer(lang, n_gram, voc=None):
    """
    Get tweet transformer
    :param lang:
    :param n_gram:
    :return:
    """
    if voc is None:
        token_to_ix = dict()
    else:
        token_to_ix = voc
    # end if
    if n_gram == 'c1':
        return transforms.Compose([
            ltransforms.RemoveRegex(
                regex=r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'),
            ltransforms.ToLower(),
            ltransforms.Character(),
            ltransforms.ToIndex(start_ix=1, token_to_ix=token_to_ix),
            ltransforms.ToLength(length=settings.min_length),
            ltransforms.MaxIndex(max_id=settings.voc_sizes[n_gram][lang] - 1)
        ])
    else:
        return transforms.Compose([
            ltransforms.RemoveRegex(
                regex=r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'),
            ltransforms.ToLower(),
            ltransforms.Character2Gram(),
            ltransforms.ToIndex(start_ix=1, token_to_ix=token_to_ix),
            ltransforms.ToLength(length=settings.min_length),
            ltransforms.MaxIndex(max_id=settings.voc_sizes[n_gram][lang] - 1)
        ])
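A hedged usage sketch: the language keys and the idea of sharing one vocabulary dict across languages are assumptions, not shown in the original example.

# Hypothetical usage: pass the same dict as voc so both pipelines build
# a shared token-to-index vocabulary.
shared_voc = dict()
transformer_en = tweet_transformer(lang='en', n_gram='c1', voc=shared_voc)
transformer_es = tweet_transformer(lang='es', n_gram='c1', voc=shared_voc)
# URLs are removed by RemoveRegex before lowercasing and indexing.
indices = transformer_en(u"Check this out https://example.com #nlp")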
Example 3
def text_transformer_cnn(window_size, n_gram, token_to_ix):
    """
    Get text transformer for CNNSCD
    :param window_size:
    :param n_gram:
    :return:
    """
    if n_gram == 'c1':
        return ltransforms.Compose([
            ltransforms.ToLower(),
            ltransforms.Character(),
            ltransforms.ToIndex(start_ix=1, token_to_ix=token_to_ix),
            ltransforms.ToLength(length=window_size),
            ltransforms.Reshape((-1)),
            ltransforms.MaxIndex(max_id=settings.voc_sizes[n_gram])
        ])
    else:
        return ltransforms.Compose([
            ltransforms.ToLower(),
            ltransforms.Character2Gram(),
            ltransforms.ToIndex(start_ix=1, token_to_ix=token_to_ix),
            ltransforms.ToLength(length=window_size),
            ltransforms.Reshape((-1)),
            ltransforms.MaxIndex(max_id=settings.voc_sizes[n_gram])
        ])
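A hedged sketch of how the CNNSCD transformer could reuse a vocabulary built by another pipeline; it assumes ToIndex exposes the vocabulary it builds as token_to_ix, as the training example further down also relies on.

# Hypothetical usage: after the tweet pipeline has been run over training
# texts, its ToIndex transform (fourth in the c1 pipeline above) holds the
# vocabulary, which can be reused for the CNNSCD transformer.
base = tweet_transformer(lang='en', n_gram='c1')
_ = base(u"some training tweet")  # fills base.transforms[3].token_to_ix
voc = base.transforms[3].token_to_ix
cnn_transformer = text_transformer_cnn(window_size=40, n_gram='c1', token_to_ix=voc)
inputs = cnn_transformer(u"Some text to classify")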
Example 4
# Eval
model.eval()

if args.n_gram == 'c1':
    transforms = ltransforms.Compose([
        ltransforms.ToLower(),
        ltransforms.Character(),
        ltransforms.ToIndex(start_ix=1, token_to_ix=voc),
        ltransforms.ToLength(length=window_size),
        ltransforms.MaxIndex(max_id=settings.voc_sizes[args.n_gram])
    ])
else:
    transforms = ltransforms.Compose([
        ltransforms.ToLower(),
        ltransforms.Character2Gram(),
        ltransforms.ToIndex(start_ix=1, token_to_ix=voc),
        ltransforms.ToLength(length=window_size),
        ltransforms.MaxIndex(max_id=settings.voc_sizes[args.n_gram])
    ])
# end if

# Validation losses
validation_total = 0
validation_success = np.zeros((n_levels, n_thresholds))
n_files = 0.0
n_same = 0.0
n_diff = 0.0

# Values
same_distance_values = np.array([])
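A hedged sketch of how the pipeline built above might be applied to one validation text; window_size, model, and the Variable import are assumed to come from earlier parts of the script that are not shown here.

# Hypothetical evaluation step for a single text.
sample = u"A validation document to score"
inputs = Variable(transforms(sample).view(-1, window_size))
log_probs = model(inputs)
_, predicted = torch.max(log_probs.data, 1)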
Example 5
def train_ccsaa(fold=0,
                ccsaa_epoch=100,
                text_length=20,
                n_gram='c1',
                dataset_size=100,
                dataset_start=0,
                cuda=True,
                save=False,
                save_dir='.'):
    """
    Train CCSAA
    :param fold:
    :param ccsaa_epoch:
    :param text_length:
    :param n_gram:
    :param dataset_size:
    :param dataset_start:
    :param cuda:
    :return:
    """
    # Save path
    save_path = os.path.join(save_dir, str(int(dataset_size)),
                             str(int(dataset_start)))

    # Transforms
    if n_gram == 'c1':
        transform = transforms.Compose([
            transforms.Character(),
            transforms.ToIndex(start_ix=0),
            transforms.ToNGram(n=text_length, overlapse=True),
            transforms.Reshape((-1, text_length))
        ])
    else:
        transform = transforms.Compose([
            transforms.Character2Gram(),
            transforms.ToIndex(start_ix=0),
            transforms.ToNGram(n=text_length, overlapse=True),
            transforms.Reshape((-1, text_length))
        ])
    # end if

    # Load from directory
    reutersc50_dataset, reuters_loader_train, reuters_loader_test = dataset.load_dataset(
        dataset_size=dataset_size, dataset_start=dataset_start)
    reutersc50_dataset.transform = transform

    # Loss function
    loss_function = nn.CrossEntropyLoss()

    # Set fold
    reuters_loader_train.dataset.set_fold(fold)
    reuters_loader_test.dataset.set_fold(fold)

    # Model
    model = torchlanguage.models.CCSAA(
        text_length=text_length,
        vocab_size=settings.ccsaa_voc_size,
        embedding_dim=settings.ccsaa_embedding_dim,
        n_classes=settings.n_authors)
    if cuda:
        model.cuda()
    # end if

    # Load
    if save and os.path.exists(
            os.path.join(save_path,
                         u"ccsaa." + str(fold) + u".pth")) and os.path.exists(
                             os.path.join(save_path, u"ccsaa." + str(fold) +
                                          u".voc.pth")):
        model.load_state_dict(
            torch.load(
                open(os.path.join(save_path, u"ccsaa." + str(fold) + u".pth"),
                     'rb')))
        voc = torch.load(
            open(os.path.join(save_path, u"ccsaa." + str(fold) + u".voc.pth"),
                 'rb'))
        return model, voc
    # end if

    # Optimizer
    optimizer = optim.SGD(model.parameters(),
                          lr=settings.ccsaa_lr,
                          momentum=settings.ccsaa_momentum)

    # Best model
    best_acc = 0.0
    best_model = model.state_dict()

    # Fail count
    fail_count = 0

    # Epoch
    for epoch in range(10000):
        # Total losses
        training_loss = 0.0
        training_total = 0.0
        test_loss = 0.0
        test_total = 0.0

        # Get test data for this fold
        for i, data in enumerate(reuters_loader_train):
            # Inputs and labels
            inputs, labels, time_labels = data

            # Reshape
            inputs = inputs.view(-1, text_length)

            # Outputs
            outputs = torch.LongTensor(inputs.size(0)).fill_(labels[0])

            # To variable
            inputs, outputs = Variable(inputs), Variable(outputs)
            if cuda:
                inputs, outputs = inputs.cuda(), outputs.cuda()
            # end if

            # Zero grad
            model.zero_grad()

            # Compute output
            log_probs = model(inputs)

            # Loss
            loss = loss_function(log_probs, outputs)

            # Backward and step
            loss.backward()
            optimizer.step()

            # Add
            training_loss += loss.data[0]
            training_total += 1.0
        # end for

        # Counters
        total = 0.0
        success = 0.0

        # For each test sample
        for i, data in enumerate(reuters_loader_test):
            # Inputs and labels
            inputs, labels, time_labels = data

            # Reshape
            inputs = inputs.view(-1, text_length)

            # Outputs
            outputs = torch.LongTensor(inputs.size(0)).fill_(labels[0])

            # To variable
            inputs, outputs = Variable(inputs), Variable(outputs)
            if cuda:
                inputs, outputs = inputs.cuda(), outputs.cuda()
            # end if

            # Forward
            model_outputs = model(inputs)
            loss = loss_function(model_outputs, outputs)

            # Take the max as predicted
            _, predicted = torch.max(model_outputs.data, 1)

            # Add to correctly classified word
            success += (predicted == outputs.data).sum()
            total += predicted.size(0)

            # Add loss
            test_loss += loss.data[0]
            test_total += 1.0
        # end for

        # Accuracy
        accuracy = success / total * 100.0
        # print(u"Epoch {}, train loss {}, test loss {}, accuracy {}".format(epoch, training_loss / training_total, test_loss / test_total, accuracy))

        # Save if best
        if accuracy > best_acc and epoch > 10:
            best_acc = accuracy
            best_model = model.state_dict()
            fail_count = 0
        elif epoch > 10:
            fail_count += 1
        # end if

        if fail_count > ccsaa_epoch:
            break
        # end if
    # end for

    # Load best
    model.load_state_dict(best_model)

    # Save
    if save:
        # Create dir if not exists
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        # end if

        # Save
        torch.save(
            model.state_dict(),
            open(os.path.join(save_path, u"ccsaa." + str(fold) + u".pth"),
                 'wb'))

        # Save vocabulary
        torch.save(
            transform.transforms[1].token_to_ix,
            open(os.path.join(save_path, u"ccsaa." + str(fold) + u".voc.pth"),
                 'wb'))
    # end if

    return model, transform.transforms[1].token_to_ix
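A hedged usage sketch for the training function above; the settings values and the dataset module it relies on are project-specific and assumed to be configured elsewhere.

# Hypothetical call: train on fold 0 and keep the vocabulary so the same
# character indexing can be reused at evaluation time.
ccsaa_model, ccsaa_voc = train_ccsaa(
    fold=0,
    ccsaa_epoch=100,  # patience: epochs without improvement before stopping
    text_length=20,
    n_gram='c1',
    dataset_size=100,
    dataset_start=0,
    cuda=True,
    save=True,
    save_dir='./models'
)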